Line data Source code
1 : /* Copyright (C) 1988-2026 Free Software Foundation, Inc.
2 :
3 : This file is part of GCC.
4 :
5 : GCC is free software; you can redistribute it and/or modify
6 : it under the terms of the GNU General Public License as published by
7 : the Free Software Foundation; either version 3, or (at your option)
8 : any later version.
9 :
10 : GCC is distributed in the hope that it will be useful,
11 : but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : GNU General Public License for more details.
14 :
15 : You should have received a copy of the GNU General Public License
16 : along with GCC; see the file COPYING3. If not see
17 : <http://www.gnu.org/licenses/>. */
18 :
19 : #define IN_TARGET_CODE 1
20 :
21 : #include "config.h"
22 : #include "system.h"
23 : #include "coretypes.h"
24 : #include "backend.h"
25 : #include "rtl.h"
26 : #include "tree.h"
27 : #include "memmodel.h"
28 : #include "gimple.h"
29 : #include "cfghooks.h"
30 : #include "cfgloop.h"
31 : #include "df.h"
32 : #include "tm_p.h"
33 : #include "stringpool.h"
34 : #include "expmed.h"
35 : #include "optabs.h"
36 : #include "regs.h"
37 : #include "emit-rtl.h"
38 : #include "recog.h"
39 : #include "cgraph.h"
40 : #include "diagnostic.h"
41 : #include "cfgbuild.h"
42 : #include "alias.h"
43 : #include "fold-const.h"
44 : #include "attribs.h"
45 : #include "calls.h"
46 : #include "stor-layout.h"
47 : #include "varasm.h"
48 : #include "output.h"
49 : #include "insn-attr.h"
50 : #include "flags.h"
51 : #include "except.h"
52 : #include "explow.h"
53 : #include "expr.h"
54 : #include "cfgrtl.h"
55 : #include "common/common-target.h"
56 : #include "langhooks.h"
57 : #include "reload.h"
58 : #include "gimplify.h"
59 : #include "dwarf2.h"
60 : #include "tm-constrs.h"
61 : #include "cselib.h"
62 : #include "sched-int.h"
63 : #include "opts.h"
64 : #include "tree-pass.h"
65 : #include "context.h"
66 : #include "pass_manager.h"
67 : #include "target-globals.h"
68 : #include "gimple-iterator.h"
69 : #include "shrink-wrap.h"
70 : #include "builtins.h"
71 : #include "rtl-iter.h"
72 : #include "tree-iterator.h"
73 : #include "dbgcnt.h"
74 : #include "case-cfn-macros.h"
75 : #include "dojump.h"
76 : #include "fold-const-call.h"
77 : #include "tree-vrp.h"
78 : #include "tree-ssanames.h"
79 : #include "selftest.h"
80 : #include "selftest-rtl.h"
81 : #include "print-rtl.h"
82 : #include "intl.h"
83 : #include "ifcvt.h"
84 : #include "symbol-summary.h"
85 : #include "sreal.h"
86 : #include "ipa-cp.h"
87 : #include "ipa-prop.h"
88 : #include "ipa-fnsummary.h"
89 : #include "wide-int-bitmask.h"
90 : #include "tree-vector-builder.h"
91 : #include "debug.h"
92 : #include "dwarf2out.h"
93 : #include "i386-options.h"
94 : #include "i386-builtins.h"
95 : #include "i386-expand.h"
96 : #include "asan.h"
97 :
98 : /* Split one or more double-mode RTL references into pairs of half-mode
99 : references. The RTL can be REG, offsettable MEM, integer constant, or
100 : CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
101 : split and "num" is its length. lo_half and hi_half are output arrays
102 : that parallel "operands". MODE is the double mode being split: TImode,
: DImode, P2HImode and P2QImode yield DImode, SImode, HImode and QImode
: halves respectively; any other mode reaches gcc_unreachable. */
103 :
104 : void
105 4166852 : split_double_mode (machine_mode mode, rtx operands[],
106 : int num, rtx lo_half[], rtx hi_half[])
107 : {
108 4166852 : machine_mode half_mode;
109 4166852 : unsigned int byte;
110 4166852 : rtx mem_op = NULL_RTX;
111 4166852 : int mem_num = 0;
112 :
113 4166852 : switch (mode)
114 : {
115 : case E_TImode:
116 : half_mode = DImode;
117 : break;
118 605892 : case E_DImode:
119 605892 : half_mode = SImode;
120 605892 : break;
121 6 : case E_P2HImode:
122 6 : half_mode = HImode;
123 6 : break;
124 30 : case E_P2QImode:
125 30 : half_mode = QImode;
126 30 : break;
127 0 : default:
128 0 : gcc_unreachable ();
129 : }
130 :
131 4166852 : byte = GET_MODE_SIZE (half_mode); /* Byte offset of the high half. */
132 :
133 8544254 : while (num--) /* Operands are processed from last to first. */
134 : {
135 4377402 : rtx op = operands[num];
136 :
137 : /* simplify_subreg refuses to split volatile memory addresses,
138 : but we still have to handle it. */
139 4377402 : if (MEM_P (op))
140 : {
141 1742901 : if (mem_op && rtx_equal_p (op, mem_op)) /* Same MEM as the one split last: share its halves. */
142 : {
143 2426 : lo_half[num] = lo_half[mem_num];
144 2426 : hi_half[num] = hi_half[mem_num];
145 : }
146 : else
147 : {
148 1740475 : mem_op = op;
149 1740475 : mem_num = num;
150 1740475 : lo_half[num] = adjust_address (op, half_mode, 0);
151 1740475 : hi_half[num] = adjust_address (op, half_mode, byte);
152 : }
153 : }
154 : else
155 : {
156 2634501 : lo_half[num] = simplify_gen_subreg (half_mode, op,
157 2634501 : GET_MODE (op) == VOIDmode
158 : ? mode : GET_MODE (op), 0);
159 :
160 2634501 : rtx tmp = simplify_gen_subreg (half_mode, op,
161 2634501 : GET_MODE (op) == VOIDmode
162 2634501 : ? mode : GET_MODE (op), byte);
163 : /* simplify_gen_subreg will return NULL RTX for the
164 : high half of the paradoxical subreg. In that case a fresh
: pseudo of the half mode stands in for the high half. */
165 2634501 : hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
166 : }
167 : }
168 4166852 : }
169 :
170 : /* Emit the double word assignment DST = { LO, HI }. MODE is the mode of
: DST, which is split into halves DLO and DHI with split_double_mode.
: The half moves are ordered, or replaced by a swap insn, so that neither
: LO nor HI (nor a register used in the address of one of them) is
: overwritten before it has been read. */
171 :
172 : void
173 101077 : split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
174 : {
175 101077 : rtx dlo, dhi;
176 101077 : int deleted_move_count = 0; /* Half moves skipped because source equals destination. */
177 101077 : split_double_mode (mode, &dst, 1, &dlo, &dhi);
178 : /* Constraints ensure that if both lo and hi are MEMs, then
179 : dst has early-clobber and thus addresses of MEMs don't use
180 : dlo/dhi registers. Otherwise if at least one of lo and hi are MEMs,
181 : dlo/dhi are registers. */
182 101077 : if (MEM_P (lo)
183 5558 : && rtx_equal_p (dlo, hi)
184 102054 : && reg_overlap_mentioned_p (dhi, lo))
185 : {
186 : /* If dlo is same as hi and lo's address uses dhi register,
187 : code below would first emit_move_insn (dhi, hi)
188 : and then emit_move_insn (dlo, lo). But the former
189 : would invalidate lo's address. Load into dhi first,
190 : then swap. */
191 193 : emit_move_insn (dhi, lo);
192 193 : lo = dhi;
193 : }
194 100884 : else if (MEM_P (hi)
195 9414 : && !MEM_P (lo)
196 6597 : && !rtx_equal_p (dlo, lo)
197 102176 : && reg_overlap_mentioned_p (dlo, hi))
198 : {
199 : /* In this case, code below would first emit_move_insn (dlo, lo)
200 : and then emit_move_insn (dhi, hi). But the former would
201 : invalidate hi's address. */
202 11 : if (rtx_equal_p (dhi, lo))
203 : {
204 : /* We can't load into dhi first, so load into dlo
205 : first and we'll swap. */
206 5 : emit_move_insn (dlo, hi);
207 5 : hi = dlo;
208 : }
209 : else
210 : {
211 : /* Load into dhi first. */
212 6 : emit_move_insn (dhi, hi);
213 6 : hi = dhi;
214 : }
215 : }
216 101077 : if (!rtx_equal_p (dlo, hi)) /* Writing DLO first cannot destroy HI. */
217 : {
218 87104 : if (!rtx_equal_p (dlo, lo))
219 37933 : emit_move_insn (dlo, lo);
220 : else
221 : deleted_move_count++;
222 87104 : if (!rtx_equal_p (dhi, hi))
223 81027 : emit_move_insn (dhi, hi);
224 : else
225 6077 : deleted_move_count++;
226 : }
227 13973 : else if (!rtx_equal_p (lo, dhi)) /* DLO holds HI: write DHI first, then DLO. */
228 : {
229 6999 : if (!rtx_equal_p (dhi, hi))
230 6999 : emit_move_insn (dhi, hi);
231 : else
232 : deleted_move_count++;
233 6999 : if (!rtx_equal_p (dlo, lo))
234 6899 : emit_move_insn (dlo, lo);
235 : else
236 100 : deleted_move_count++;
237 : }
238 6974 : else if (mode == TImode) /* DLO equals HI and DHI equals LO: exchange the halves. */
239 6956 : emit_insn (gen_swapdi (dlo, dhi));
240 : else
241 18 : emit_insn (gen_swapsi (dlo, dhi));
242 :
243 101077 : if (deleted_move_count == 2) /* Both half moves were no-ops: emit a NOTE_INSN_DELETED note. */
244 3085 : emit_note (NOTE_INSN_DELETED);
245 101077 : }
246 :
247 :
248 : /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
249 : for the target. DEST is the register to clear; a DEST narrower than
: 4 bytes is cleared through the SImode register with the same REGNO.
: This asserts reload_completed, so it must only be called after
: reload. */
250 :
251 : void
252 112714 : ix86_expand_clear (rtx dest)
253 : {
254 112714 : rtx tmp;
255 :
256 : /* We play register width games, which are only valid after reload. */
257 112714 : gcc_assert (reload_completed);
258 :
259 : /* Avoid HImode and its attendant prefix byte. */
260 225428 : if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
261 930 : dest = gen_rtx_REG (SImode, REGNO (dest));
262 112714 : tmp = gen_rtx_SET (dest, const0_rtx);
263 :
264 112714 : if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ()) /* Not the mov $0 form: pair the set with a FLAGS_REG clobber. */
265 : {
266 112714 : rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
267 112714 : tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
268 : }
269 :
270 112714 : emit_insn (tmp);
271 112714 : }
272 :
273 : /* Return true if V can be broadcasted from an integer of WIDTH bits
274 : which is returned in VAL_BROADCAST. Otherwise, return false.
: V is viewed as HOST_BITS_PER_WIDE_INT bits and every WIDTH-bit chunk
: must equal the lowest chunk. On success VAL_BROADCAST is that chunk
: sign-extended from WIDTH bits. On failure VAL_BROADCAST has still been
: overwritten with the lowest chunk as extracted by wi::extract_uhwi. */
275 :
276 : static bool
277 4851 : ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
278 : HOST_WIDE_INT &val_broadcast)
279 : {
280 4851 : wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
281 4851 : val_broadcast = wi::extract_uhwi (val, 0, width);
282 6543 : for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width) /* Compare each higher chunk with the lowest one. */
283 : {
284 5089 : HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
285 5089 : if (val_broadcast != each)
286 : return false;
287 : }
288 1454 : val_broadcast = sext_hwi (val_broadcast, width);
289 1454 : return true;
290 4851 : }
291 :
292 : /* Convert the CONST_WIDE_INT operand OP to broadcast in MODE.
: Return a new pseudo holding the broadcast value, viewed in MODE through
: lowpart_subreg, or nullptr when OP is not handled here: MODE is not
: 16, 32 or 64 bytes, the target checks below fail, OP is not a
: CONST_WIDE_INT covering MODE exactly, or the elements of OP are not one
: repeated QImode, HImode, SImode or DImode value. */
293 :
294 : rtx
295 32417 : ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
296 : {
297 : /* Don't use integer vector broadcast if we can't move from GPR to SSE
298 : register directly. */
299 32417 : if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
300 : return nullptr;
301 :
302 32417 : unsigned int msize = GET_MODE_SIZE (mode);
303 :
304 : /* Only optimized for vpbroadcast[bwsd]/vbroadcastss with xmm/ymm/zmm. */
305 32417 : if (msize != 16 && msize != 32 && msize != 64)
306 : return nullptr;
307 :
308 : /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
309 : broadcast only if vector broadcast is available. */
310 32417 : if (!TARGET_AVX
311 1610 : || !CONST_WIDE_INT_P (op)
312 1603 : || standard_sse_constant_p (op, mode)
313 34020 : || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
314 1603 : != GET_MODE_BITSIZE (mode)))
315 30822 : return nullptr;
316 :
317 1595 : HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0); /* Element 0; the narrowest broadcast width is tried first. */
318 1595 : HOST_WIDE_INT val_broadcast;
319 1595 : scalar_int_mode broadcast_mode;
320 : /* vpbroadcastb zmm requires TARGET_AVX512BW. */
321 712 : if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
322 2089 : && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
323 : val_broadcast))
324 : broadcast_mode = QImode;
325 654 : else if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
326 1968 : && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
327 : val_broadcast))
328 : broadcast_mode = HImode;
329 : /* vbroadcasts[sd] only support memory operand w/o AVX2.
330 : When msize == 16, pshufs is used for vec_duplicate.
331 : When msize == 64, vpbroadcastd is used, and TARGET_AVX512F must be available. */
332 412 : else if ((msize != 32 || TARGET_AVX2)
333 1768 : && ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
334 : val_broadcast))
335 : broadcast_mode = SImode;
336 1391 : else if (TARGET_64BIT && (msize != 32 || TARGET_AVX2)
337 2641 : && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
338 : val_broadcast))
339 : broadcast_mode = DImode;
340 : else
341 141 : return nullptr;
342 :
343 : /* Check if OP can be broadcasted from VAL. */
344 1776 : for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
345 1561 : if (val != CONST_WIDE_INT_ELT (op, i))
346 : return nullptr;
347 :
348 215 : unsigned int nunits = (GET_MODE_SIZE (mode)
349 215 : / GET_MODE_SIZE (broadcast_mode));
350 215 : machine_mode vector_mode;
351 215 : if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
352 0 : gcc_unreachable ();
353 215 : rtx target = gen_reg_rtx (vector_mode);
354 215 : bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
355 : target,
356 : GEN_INT (val_broadcast));
357 215 : if (!ok)
358 : return nullptr;
359 215 : target = lowpart_subreg (mode, target, vector_mode); /* View the broadcast vector in the requested MODE. */
360 215 : return target;
361 : }
362 : /* Expand a move of operands[1] into operands[0] in MODE. Symbolic
: sources (TLS symbols, symbols forced to load from the GOT, PE-COFF
: symbols and PIC references) are legitimized first, operands are forced
: into registers where the checks below require it, and the final SET is
: emitted. Some paths emit their own insns and return early. */
363 : void
364 73079271 : ix86_expand_move (machine_mode mode, rtx operands[])
365 : {
366 73079271 : rtx op0, op1;
367 73079271 : rtx tmp, addend = NULL_RTX;
368 73079271 : enum tls_model model;
369 :
370 73079271 : op0 = operands[0];
371 73079271 : op1 = operands[1];
372 :
373 : /* Avoid complex sets of likely spilled hard registers before reload.
: The source is first moved into a fresh pseudo by a recursive call,
: then that pseudo becomes the source of the move into op0. */
374 73079271 : if (!ix86_hardreg_mov_ok (op0, op1))
375 : {
376 138303 : tmp = gen_reg_rtx (mode);
377 138303 : operands[0] = tmp;
378 138303 : ix86_expand_move (mode, operands);
379 138303 : operands[0] = op0;
380 138303 : operands[1] = tmp;
381 138303 : op1 = tmp;
382 : }
383 :
384 73079271 : switch (GET_CODE (op1))
385 : {
386 348673 : case CONST:
387 348673 : tmp = XEXP (op1, 0);
388 :
389 348673 : if (GET_CODE (tmp) != PLUS
390 336991 : || !SYMBOL_REF_P (XEXP (tmp, 0)))
391 : break;
392 :
393 334329 : op1 = XEXP (tmp, 0); /* (const (plus symbol addend)): split into symbol and addend. */
394 334329 : addend = XEXP (tmp, 1);
395 : /* FALLTHRU */
396 :
397 4926591 : case SYMBOL_REF:
398 4926591 : model = SYMBOL_REF_TLS_MODEL (op1);
399 :
400 4926591 : if (model)
401 10115 : op1 = legitimize_tls_address (op1, model, true);
402 4916476 : else if (ix86_force_load_from_GOT_p (op1))
403 : {
404 : /* Load the external function address via GOT slot to avoid PLT. */
405 24 : op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
406 : (TARGET_64BIT
407 : ? UNSPEC_GOTPCREL
408 : : UNSPEC_GOT));
409 24 : op1 = gen_rtx_CONST (Pmode, op1);
410 24 : op1 = gen_const_mem (Pmode, op1);
411 20 : set_mem_alias_set (op1, GOT_ALIAS_SET);
412 : }
413 : else
414 : {
415 : #if TARGET_PECOFF
416 : tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
417 :
418 : if (tmp)
419 : {
420 : op1 = tmp;
421 : if (!addend)
422 : break;
423 : }
424 : else
425 : #endif
426 4916456 : {
427 4916456 : op1 = operands[1]; /* Nothing to legitimize here: restore the original source. */
428 4916456 : break;
429 : }
430 : }
431 :
432 10135 : if (addend)
433 : {
434 2786 : op1 = force_operand (op1, NULL_RTX);
435 2795 : op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
436 : op0, 1, OPTAB_DIRECT);
437 : }
438 : else
439 7349 : op1 = force_operand (op1, op0);
440 :
441 10135 : if (op1 == op0) /* The value already landed in op0: no move left to emit. */
442 : return;
443 :
444 1148 : op1 = convert_to_mode (mode, op1, 1);
445 : /* Execution continues into the default label, which only breaks. */
446 : default:
447 : break;
448 :
449 1490881 : case SUBREG:
450 : /* Transform TImode paradoxical SUBREG into zero_extendditi2. */
451 1490881 : if (TARGET_64BIT
452 1262849 : && mode == TImode
453 : && SUBREG_P (op1)
454 74142 : && GET_MODE (SUBREG_REG (op1)) == DImode
455 1536785 : && SUBREG_BYTE (op1) == 0)
456 45904 : op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
457 : /* As not all values in XFmode are representable in real_value,
458 : we might be called with unfoldable SUBREGs of constants. */
459 1490881 : if (mode == XFmode
460 3128 : && CONSTANT_P (SUBREG_REG (op1))
461 0 : && can_create_pseudo_p ())
462 : {
463 0 : machine_mode imode = GET_MODE (SUBREG_REG (op1));
464 0 : rtx r = force_const_mem (imode, SUBREG_REG (op1));
465 0 : if (r)
466 0 : r = validize_mem (r);
467 : else
468 0 : r = force_reg (imode, SUBREG_REG (op1));
469 0 : op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
470 : }
471 : break;
472 : }
473 :
474 73070284 : if ((flag_pic || MACHOPIC_INDIRECT)
475 73070284 : && symbolic_operand (op1, mode))
476 : {
477 : #if TARGET_MACHO
478 : if (TARGET_MACHO && !TARGET_64BIT)
479 : {
480 : /* dynamic-no-pic */
481 : if (MACHOPIC_INDIRECT)
482 : {
483 : tmp = (op0 && REG_P (op0) && mode == Pmode)
484 : ? op0 : gen_reg_rtx (Pmode);
485 : op1 = machopic_indirect_data_reference (op1, tmp);
486 : if (MACHOPIC_PURE)
487 : op1 = machopic_legitimize_pic_address (op1, mode,
488 : tmp == op1 ? 0 : tmp);
489 : }
490 : if (op0 != op1 && !MEM_P (op0))
491 : {
492 : rtx insn = gen_rtx_SET (op0, op1);
493 : emit_insn (insn);
494 : return;
495 : }
496 : }
497 : #endif
498 :
499 335349 : if (MEM_P (op0))
500 87451 : op1 = force_reg (mode, op1);
501 247898 : else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
502 : {
503 247841 : rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
504 247841 : op1 = legitimize_pic_address (op1, reg);
505 247841 : if (op0 == op1)
506 : return;
507 247841 : op1 = convert_to_mode (mode, op1, 1);
508 : }
509 : }
510 : else
511 : {
512 72734935 : if (MEM_P (op0)
513 99276068 : && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
514 10733515 : || !push_operand (op0, mode))
515 85005446 : && MEM_P (op1))
516 2167563 : op1 = force_reg (mode, op1); /* Memory to memory: load the source into a register first. */
517 :
518 72734935 : if (push_operand (op0, mode)
519 72734935 : && ! general_no_elim_operand (op1, mode))
520 1004 : op1 = copy_to_mode_reg (mode, op1);
521 :
522 : /* Force large constants in 64bit compilation into register
523 : to get them CSEed. */
524 72734935 : if (can_create_pseudo_p ()
525 67032226 : && (mode == DImode) && TARGET_64BIT
526 34904279 : && immediate_operand (op1, mode)
527 7904329 : && !x86_64_zext_immediate_operand (op1, VOIDmode)
528 716810 : && !register_operand (op0, mode)
529 72910280 : && optimize)
530 123441 : op1 = copy_to_mode_reg (mode, op1);
531 :
532 72734935 : if (can_create_pseudo_p ())
533 : {
534 67032226 : if (CONST_DOUBLE_P (op1))
535 : {
536 : /* If we are loading a floating point constant to a
537 : register, force the value to memory now, since we'll
538 : get better code out the back end. */
539 :
540 897739 : op1 = validize_mem (force_const_mem (mode, op1));
541 897739 : if (!register_operand (op0, mode))
542 : {
543 129390 : tmp = gen_reg_rtx (mode);
544 129390 : emit_insn (gen_rtx_SET (tmp, op1));
545 129390 : emit_move_insn (op0, tmp);
546 129390 : return;
547 : }
548 : }
549 : }
550 : }
551 :
552 : /* Special case inserting 64-bit values into a TImode register. */
553 72940894 : if (TARGET_64BIT
554 : /* Disable for -O0 (see PR110587) unless naked (PR110533). */
555 63224688 : && (optimize || ix86_function_naked (current_function_decl))
556 43278147 : && (mode == DImode || mode == DFmode)
557 29513097 : && SUBREG_P (op0)
558 484870 : && GET_MODE (SUBREG_REG (op0)) == TImode
559 399895 : && REG_P (SUBREG_REG (op0))
560 73340789 : && REG_P (op1))
561 : {
562 : /* Use *insvti_lowpart_1 to set lowpart. The move is rewritten as
: reg = (reg & mask) | zero_extend (op1) on the whole TImode reg. */
563 179747 : if (SUBREG_BYTE (op0) == 0)
564 : {
565 53566 : wide_int mask = wi::mask (64, true, 128);
566 53566 : tmp = immed_wide_int_const (mask, TImode);
567 53566 : op0 = SUBREG_REG (op0);
568 53566 : tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
569 53566 : if (mode == DFmode)
570 350 : op1 = gen_lowpart (DImode, op1);
571 53566 : op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
572 53566 : op1 = gen_rtx_IOR (TImode, tmp, op1);
573 53566 : }
574 : /* Use *insvti_highpart_1 to set highpart. Same shape as above, with
: the zero-extended source shifted left by 64 before the IOR. */
575 126181 : else if (SUBREG_BYTE (op0) == 8)
576 : {
577 126181 : wide_int mask = wi::mask (64, false, 128);
578 126181 : tmp = immed_wide_int_const (mask, TImode);
579 126181 : op0 = SUBREG_REG (op0);
580 126181 : tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
581 126181 : if (mode == DFmode)
582 201 : op1 = gen_lowpart (DImode, op1);
583 126181 : op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
584 126181 : op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
585 126181 : op1 = gen_rtx_IOR (TImode, tmp, op1);
586 126181 : }
587 : }
588 :
589 72940894 : emit_insn (gen_rtx_SET (op0, op1));
590 : }
591 :
592 : /* OP is a memref of a constant-pool CONST_VECTOR in MODE. If every one of
593 : the GET_MODE_NUNITS (MODE) elements equals the first, return that first
: element (the scalar rtx itself, not a MEM); else return nullptr.
: nullptr is also returned when MODE has fewer than 2 units, when an
: integral MODE cannot be moved from GPR to SSE registers directly,
: when OP is a standard SSE constant, or when the element mode is
: TImode. */
594 : rtx
595 2457355 : ix86_broadcast_from_constant (machine_mode mode, rtx op)
596 : {
597 2457355 : int nunits = GET_MODE_NUNITS (mode);
598 2457355 : if (nunits < 2)
599 : return nullptr;
600 :
601 : /* Don't use integer vector broadcast if we can't move from GPR to SSE
602 : register directly. */
603 2330910 : if (!TARGET_INTER_UNIT_MOVES_TO_VEC
604 8020 : && INTEGRAL_MODE_P (mode))
605 : return nullptr;
606 :
607 : /* Convert CONST_VECTOR to a non-standard SSE constant integer
608 : broadcast only if vector broadcast is available. */
609 2325500 : if (standard_sse_constant_p (op, mode))
610 : return nullptr;
611 :
612 4650994 : if (GET_MODE_INNER (mode) == TImode)
613 : return nullptr;
614 :
615 2325387 : rtx constant = get_pool_constant (XEXP (op, 0)); /* The address of OP is looked up in the constant pool. */
616 2325387 : if (!CONST_VECTOR_P (constant))
617 : return nullptr;
618 :
619 : /* There could be some rtx like
620 : (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
621 : but with "*.LC1" refer to V2DI constant vector. */
622 2325387 : if (GET_MODE (constant) != mode)
623 : {
624 652 : constant = simplify_subreg (mode, constant, GET_MODE (constant),
625 : 0);
626 652 : if (constant == nullptr || !CONST_VECTOR_P (constant))
627 : return nullptr;
628 : }
629 :
630 2325387 : rtx first = XVECEXP (constant, 0, 0);
631 :
632 7697007 : for (int i = 1; i < nunits; ++i)
633 : {
634 7083100 : rtx tmp = XVECEXP (constant, 0, i);
635 : /* Vector duplicate value. */
636 7083100 : if (!rtx_equal_p (tmp, first))
637 : return nullptr;
638 : }
639 :
640 : return first;
641 : }
642 : /* Expand a vector move of operands[1] into operands[0] in MODE. Handles
: push destinations, constants that must go to the constant pool or can
: be broadcast, under-aligned SSE memory operands (handed over to
: ix86_expand_vector_move_misalign), TImode SUBREG sources, and moves
: where neither operand is a register, before emitting the plain SET. */
643 : void
644 4810921 : ix86_expand_vector_move (machine_mode mode, rtx operands[])
645 : {
646 4810921 : rtx op0 = operands[0], op1 = operands[1];
647 : /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
648 : psABI since the biggest alignment is 4 byte for IA MCU psABI. */
649 4810921 : unsigned int align = (TARGET_IAMCU
650 4810921 : ? GET_MODE_BITSIZE (mode)
651 4810921 : : GET_MODE_ALIGNMENT (mode));
652 :
653 4810921 : if (push_operand (op0, VOIDmode))
654 2899 : op0 = emit_move_resolve_push (mode, op0);
655 :
656 : /* Force constants other than zero into memory. We do not know how
657 : the instructions used to build constants modify the upper 64 bits
658 : of the register, once we have that information we may be able
659 : to handle some of them more efficiently. */
660 4810921 : if (can_create_pseudo_p ()
661 4616121 : && (CONSTANT_P (op1)
662 4302536 : || (SUBREG_P (op1)
663 309306 : && CONSTANT_P (SUBREG_REG (op1))))
664 5124520 : && ((register_operand (op0, mode)
665 259869 : && !standard_sse_constant_p (op1, mode))
666 : /* ix86_expand_vector_move_misalign() does not like constants. */
667 : || (SSE_REG_MODE_P (mode)
668 256983 : && MEM_P (op0)
669 38436 : && MEM_ALIGN (op0) < align)))
670 : {
671 2249 : if (SUBREG_P (op1))
672 : {
673 14 : machine_mode imode = GET_MODE (SUBREG_REG (op1));
674 14 : rtx r = force_const_mem (imode, SUBREG_REG (op1));
675 14 : if (r)
676 14 : r = validize_mem (r);
677 : else
678 0 : r = force_reg (imode, SUBREG_REG (op1));
679 14 : op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
680 : }
681 : else
682 : {
683 2235 : machine_mode mode = GET_MODE (op0); /* NOTE: shadows the MODE parameter inside this block. */
684 2235 : rtx tmp = ix86_convert_const_wide_int_to_broadcast
685 2235 : (mode, op1);
686 2235 : if (tmp == nullptr)
687 2214 : op1 = validize_mem (force_const_mem (mode, op1));
688 : else
689 : op1 = tmp;
690 : }
691 : }
692 :
693 4810921 : if (can_create_pseudo_p ()
694 4616121 : && GET_MODE_SIZE (mode) >= 16
695 3892767 : && VECTOR_MODE_P (mode)
696 8488336 : && (MEM_P (op1)
697 877624 : && SYMBOL_REF_P (XEXP (op1, 0))
698 495269 : && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
699 : {
700 478672 : rtx first = ix86_broadcast_from_constant (mode, op1);
701 478672 : if (first != nullptr)
702 : {
703 : /* Broadcast to XMM/YMM/ZMM register from an integer
704 : constant or scalar mem. */
705 124259 : rtx tmp = gen_reg_rtx (mode);
706 124259 : if (FLOAT_MODE_P (mode))
707 29276 : first = force_const_mem (GET_MODE_INNER (mode), first);
708 124259 : bool ok = ix86_expand_vector_init_duplicate (false, mode,
709 : tmp, first);
710 124259 : if (!ok && !TARGET_64BIT && GET_MODE_INNER (mode) == DImode) /* Retry with the DImode element placed in memory. */
711 : {
712 0 : first = force_const_mem (GET_MODE_INNER (mode), first);
713 0 : ok = ix86_expand_vector_init_duplicate (false, mode,
714 : tmp, first);
715 : }
716 124259 : if (ok)
717 : {
718 124259 : emit_move_insn (op0, tmp);
719 124259 : return;
720 : }
721 : }
722 : }
723 :
724 : /* We need to check memory alignment for SSE mode since attribute
725 : can make operands unaligned. */
726 4686662 : if (can_create_pseudo_p ()
727 : && SSE_REG_MODE_P (mode)
728 9503907 : && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
729 4221004 : || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
730 : {
731 493548 : rtx tmp[2];
732 :
733 : /* ix86_expand_vector_move_misalign() does not like both
734 : arguments in memory. */
735 493548 : if (!register_operand (op0, mode)
736 493548 : && !register_operand (op1, mode))
737 : {
738 154795 : rtx scratch = gen_reg_rtx (mode);
739 154795 : emit_move_insn (scratch, op1);
740 154795 : op1 = scratch;
741 : }
742 :
743 493548 : tmp[0] = op0; tmp[1] = op1;
744 493548 : ix86_expand_vector_move_misalign (mode, tmp);
745 493548 : return;
746 : }
747 :
748 : /* Special case TImode to 128-bit vector conversions via V2DI. */
749 1139970 : if (VECTOR_MODE_P (mode)
750 4141858 : && GET_MODE_SIZE (mode) == 16
751 2925852 : && SUBREG_P (op1)
752 242160 : && GET_MODE (SUBREG_REG (op1)) == TImode
753 3276 : && TARGET_64BIT && TARGET_SSE
754 4195736 : && ix86_pre_reload_split ())
755 : {
756 2515 : rtx tmp = gen_reg_rtx (V2DImode);
757 2515 : rtx lo = gen_reg_rtx (DImode);
758 2515 : rtx hi = gen_reg_rtx (DImode);
759 2515 : emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
760 2515 : emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
761 2515 : emit_insn (gen_vec_concatv2di (tmp, lo, hi));
762 2515 : emit_move_insn (op0, gen_lowpart (mode, tmp));
763 2515 : return;
764 : }
765 :
766 : /* If operand0 is a hard register, make operand1 a pseudo. */
767 4190599 : if (can_create_pseudo_p ()
768 8186398 : && !ix86_hardreg_mov_ok (op0, op1))
769 : {
770 134 : rtx tmp = gen_reg_rtx (GET_MODE (op0));
771 134 : emit_move_insn (tmp, op1);
772 134 : emit_move_insn (op0, tmp);
773 134 : return;
774 : }
775 :
776 : /* Make operand1 a register if it isn't already. */
777 4190465 : if (can_create_pseudo_p ()
778 3995665 : && !register_operand (op0, mode)
779 5304609 : && !register_operand (op1, mode))
780 : {
781 215540 : rtx tmp = gen_reg_rtx (GET_MODE (op0));
782 215540 : emit_move_insn (tmp, op1);
783 215540 : emit_move_insn (op0, tmp);
784 215540 : return;
785 : }
786 :
787 3974925 : emit_insn (gen_rtx_SET (op0, op1));
788 : }
789 :
790 : /* Split 32-byte AVX unaligned load and store if needed. OP0 is the
: destination and OP1 the source. A plain SET is emitted when OP1 is a
: MEM and TARGET_AVX256_SPLIT_UNALIGNED_LOAD is off, or OP0 is a MEM and
: TARGET_AVX256_SPLIT_UNALIGNED_STORE is off. Otherwise a load becomes a
: load of the first 16 bytes plus a VEC_CONCAT with the MEM at offset
: 16, and a store becomes two vextractf128 insns writing offsets 0 and
: 16. Integer modes other than V32QImode are handled as V32QImode. */
791 :
792 : static void
793 12554 : ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
794 : {
795 12554 : rtx m;
796 12554 : rtx (*extract) (rtx, rtx, rtx);
797 12554 : machine_mode mode;
798 :
799 12554 : if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
800 4558 : || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
801 : {
802 12528 : emit_insn (gen_rtx_SET (op0, op1));
803 12528 : return;
804 : }
805 :
806 26 : rtx orig_op0 = NULL_RTX; /* Set only when a V32QImode pseudo stands in for a register OP0. */
807 26 : mode = GET_MODE (op0);
808 26 : switch (GET_MODE_CLASS (mode))
809 : {
810 9 : case MODE_VECTOR_INT:
811 9 : case MODE_INT:
812 9 : if (mode != V32QImode)
813 : {
814 7 : if (!MEM_P (op0))
815 : {
816 3 : orig_op0 = op0;
817 3 : op0 = gen_reg_rtx (V32QImode);
818 : }
819 : else
820 4 : op0 = gen_lowpart (V32QImode, op0);
821 7 : op1 = gen_lowpart (V32QImode, op1);
822 7 : mode = V32QImode;
823 : }
824 : break;
825 : case MODE_VECTOR_FLOAT:
826 : break;
827 0 : default:
828 0 : gcc_unreachable ();
829 : }
830 :
831 26 : switch (mode) /* Pick the extract generator and narrow MODE to the half-vector mode. */
832 : {
833 0 : default:
834 0 : gcc_unreachable ();
835 : case E_V32QImode:
836 : extract = gen_avx_vextractf128v32qi;
837 : mode = V16QImode;
838 : break;
839 1 : case E_V16BFmode:
840 1 : extract = gen_avx_vextractf128v16bf;
841 1 : mode = V8BFmode;
842 1 : break;
843 0 : case E_V16HFmode:
844 0 : extract = gen_avx_vextractf128v16hf;
845 0 : mode = V8HFmode;
846 0 : break;
847 8 : case E_V8SFmode:
848 8 : extract = gen_avx_vextractf128v8sf;
849 8 : mode = V4SFmode;
850 8 : break;
851 8 : case E_V4DFmode:
852 8 : extract = gen_avx_vextractf128v4df;
853 8 : mode = V2DFmode;
854 8 : break;
855 : }
856 :
857 26 : if (MEM_P (op1)) /* Load: low half into a register, then VEC_CONCAT with the MEM at offset 16. */
858 : {
859 9 : rtx r = gen_reg_rtx (mode);
860 9 : m = adjust_address (op1, mode, 0);
861 9 : emit_move_insn (r, m);
862 9 : m = adjust_address (op1, mode, 16);
863 9 : r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
864 9 : emit_move_insn (op0, r);
865 : }
866 17 : else if (MEM_P (op0)) /* Store: extract each half of OP1 into its own slot at offset 0 and 16. */
867 : {
868 17 : m = adjust_address (op0, mode, 0);
869 17 : emit_insn (extract (m, op1, const0_rtx));
870 17 : m = adjust_address (op0, mode, 16);
871 17 : emit_insn (extract (m, copy_rtx (op1), const1_rtx));
872 : }
873 : else
874 0 : gcc_unreachable ();
875 :
876 26 : if (orig_op0) /* Copy the V32QImode stand-in back to the real destination. */
877 3 : emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
878 : }
879 :
880 : /* Implement the movmisalign patterns for SSE. Non-SSE modes go
881 : straight to ix86_expand_vector_move. */
882 : /* Code generation for scalar reg-reg moves of single and double precision data:
883 : if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
884 : movaps reg, reg
885 : else
886 : movss reg, reg
887 : if (x86_sse_partial_reg_dependency == true)
888 : movapd reg, reg
889 : else
890 : movsd reg, reg
891 :
892 : Code generation for scalar loads of double precision data:
893 : if (x86_sse_split_regs == true)
894 : movlpd mem, reg (gas syntax)
895 : else
896 : movsd mem, reg
897 :
898 : Code generation for unaligned packed loads of single precision data
899 : (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
900 : if (x86_sse_unaligned_move_optimal)
901 : movups mem, reg
902 :
903 : if (x86_sse_partial_reg_dependency == true)
904 : {
905 : xorps reg, reg
906 : movlps mem, reg
907 : movhps mem+8, reg
908 : }
909 : else
910 : {
911 : movlps mem, reg
912 : movhps mem+8, reg
913 : }
914 :
915 : Code generation for unaligned packed loads of double precision data
916 : (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
917 : if (x86_sse_unaligned_move_optimal)
918 : movupd mem, reg
919 :
920 : if (x86_sse_split_regs == true)
921 : {
922 : movlpd mem, reg
923 : movhpd mem+8, reg
924 : }
925 : else
926 : {
927 : movsd mem, reg
928 : movhpd mem+8, reg
929 : }
930 : */
931 :
932 : void
933 822016 : ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
934 : {
935 822016 : rtx op0, op1, m;
936 :
937 822016 : op0 = operands[0]; /* Destination of the move. */
938 822016 : op1 = operands[1]; /* Source of the move. */
939 :
940 : /* Use unaligned load/store for AVX512 or when optimizing for size. */
941 1644032 : if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
942 : {
943 24075 : emit_insn (gen_rtx_SET (op0, op1));
944 24075 : return;
945 : }
946 :
947 797941 : if (TARGET_AVX)
948 : {
949 62002 : if (GET_MODE_SIZE (mode) == 32)
950 12554 : ix86_avx256_split_vector_move_misalign (op0, op1);
951 : else
952 : /* Always use 128-bit mov<mode>_internal pattern for AVX. */
953 18447 : emit_insn (gen_rtx_SET (op0, op1));
954 31001 : return;
955 : }
956 :
957 766940 : if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
958 95 : || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
959 : {
960 766845 : emit_insn (gen_rtx_SET (op0, op1));
961 766845 : return;
962 : }
963 :
964 : /* ??? If we have typed data, then it would appear that using
965 : movdqu is the only way to get unaligned data loaded with
966 : integer type. */
967 95 : if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
968 : {
969 81 : emit_insn (gen_rtx_SET (op0, op1));
970 81 : return;
971 : }
972 :
973 14 : if (MEM_P (op1)) /* Unaligned load: build the register from two 8-byte halves. */
974 : {
975 6 : if (TARGET_SSE2 && mode == V2DFmode)
976 : {
977 2 : rtx zero;
978 :
979 : /* When SSE registers are split into halves, we can avoid
980 : writing to the top half twice. */
981 2 : if (TARGET_SSE_SPLIT_REGS)
982 : {
983 2 : emit_clobber (op0);
984 2 : zero = op0;
985 : }
986 : else
987 : {
988 : /* ??? Not sure about the best option for the Intel chips.
989 : The following would seem to satisfy; the register is
990 : entirely cleared, breaking the dependency chain. We
991 : then store to the upper half, with a dependency depth
992 : of one. A rumor has it that Intel recommends two movsd
993 : followed by an unpacklpd, but this is unconfirmed. And
994 : given that the dependency depth of the unpacklpd would
995 : still be one, I'm not sure why this would be better. */
996 0 : zero = CONST0_RTX (V2DFmode);
997 : }
998 :
999 2 : m = adjust_address (op1, DFmode, 0);
1000 2 : emit_insn (gen_sse2_loadlpd (op0, zero, m));
1001 2 : m = adjust_address (op1, DFmode, 8);
1002 2 : emit_insn (gen_sse2_loadhpd (op0, op0, m));
1003 2 : }
1004 : else
1005 : {
1006 4 : rtx t;
1007 :
1008 4 : if (mode != V4SFmode) /* Other modes are loaded through a V4SFmode pseudo. */
1009 0 : t = gen_reg_rtx (V4SFmode);
1010 : else
1011 : t = op0;
1012 :
1013 4 : if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
1014 2 : emit_move_insn (t, CONST0_RTX (V4SFmode));
1015 : else
1016 2 : emit_clobber (t);
1017 :
1018 4 : m = adjust_address (op1, V2SFmode, 0);
1019 4 : emit_insn (gen_sse_loadlps (t, t, m));
1020 4 : m = adjust_address (op1, V2SFmode, 8);
1021 4 : emit_insn (gen_sse_loadhps (t, t, m));
1022 4 : if (mode != V4SFmode)
1023 0 : emit_move_insn (op0, gen_lowpart (mode, t));
1024 : }
1025 : }
1026 8 : else if (MEM_P (op0)) /* Unaligned store: write the two 8-byte halves separately. */
1027 : {
1028 8 : if (TARGET_SSE2 && mode == V2DFmode)
1029 : {
1030 2 : m = adjust_address (op0, DFmode, 0);
1031 2 : emit_insn (gen_sse2_storelpd (m, op1));
1032 2 : m = adjust_address (op0, DFmode, 8);
1033 2 : emit_insn (gen_sse2_storehpd (m, op1));
1034 : }
1035 : else
1036 : {
1037 6 : if (mode != V4SFmode) /* Other modes are stored through a V4SFmode view of the source. */
1038 0 : op1 = gen_lowpart (V4SFmode, op1);
1039 :
1040 6 : m = adjust_address (op0, V2SFmode, 0);
1041 6 : emit_insn (gen_sse_storelps (m, op1));
1042 6 : m = adjust_address (op0, V2SFmode, 8);
1043 6 : emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
1044 : }
1045 : }
1046 : else
1047 0 : gcc_unreachable (); /* One of the two operands is expected to be a MEM here. */
1048 : }
1049 :
1050 : /* Move bits 64:95 to bits 32:63.
 : OP is viewed as V4SImode and overwritten in place by a VEC_SELECT of
 : itself with element selector {0, 2, 0, 0}: element 0 is kept, element 1
 : receives the old element 2, and elements 2 and 3 receive the old
 : element 0. A single SET insn is emitted. */
1051 :
1052 : void
1053 854 : ix86_move_vector_high_sse_to_mmx (rtx op)
1054 : {
1055 854 : rtx mask = gen_rtx_PARALLEL (VOIDmode,
1056 : gen_rtvec (4, GEN_INT (0), GEN_INT (2),
1057 : GEN_INT (0), GEN_INT (0)));
1058 854 : rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
1059 854 : op = gen_rtx_VEC_SELECT (V4SImode, dest, mask); /* OP now names the source expression, no longer the operand. */
1060 854 : rtx insn = gen_rtx_SET (dest, op);
1061 854 : emit_insn (insn);
1062 854 : }
1063 :
1064 : /* Split MMX pack with signed/unsigned saturation with SSE/SSE2.
 : OPERANDS[0] is the destination and OPERANDS[1], OPERANDS[2] are the
 : two sources; CODE is the truncation rtx code. All three operands are
 : re-viewed in 16-byte vector modes that keep their element modes, one
 : move of the pack expression into the widened destination is emitted,
 : and ix86_move_vector_high_sse_to_mmx is then applied to OPERANDS[0]. */
1065 :
1066 : void
1067 764 : ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
1068 : {
1069 764 : rtx op0 = operands[0];
1070 764 : rtx op1 = operands[1];
1071 764 : rtx op2 = operands[2];
1072 764 : rtx src;
1073 :
1074 764 : machine_mode dmode = GET_MODE (op0);
1075 764 : machine_mode smode = GET_MODE (op1);
1076 764 : machine_mode inner_dmode = GET_MODE_INNER (dmode);
1077 764 : machine_mode inner_smode = GET_MODE_INNER (smode);
1078 :
1079 : /* Get the corresponding SSE mode for destination. */
1080 764 : int nunits = 16 / GET_MODE_SIZE (inner_dmode); /* Element count of a 16-byte vector. */
1081 1528 : machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1082 1528 : nunits).require ();
1083 764 : machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
1084 1528 : nunits / 2).require (); /* Mode of one truncated source in the generic form below. */
1085 :
1086 : /* Get the corresponding SSE mode for source. */
1087 764 : nunits = 16 / GET_MODE_SIZE (inner_smode);
1088 1528 : machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
1089 1528 : nunits).require ();
1090 :
1091 : /* Generate SSE pack with signed/unsigned saturation. */
1092 764 : rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
1093 764 : op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
1094 764 : op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));
1095 :
1096 : /* packusdw/packuswb does unsigned saturation of a signed source
1097 : which is different from generic us_truncate RTX, so that case is
 : represented by an UNSPEC instead of CODE applied to each source. */
1098 764 : if (code == US_TRUNCATE)
1099 662 : src = gen_rtx_UNSPEC (sse_dmode,
1100 : gen_rtvec (2, op1, op2),
1101 : UNSPEC_US_TRUNCATE);
1102 : else
1103 : {
1104 102 : op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
1105 102 : op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
1106 102 : src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2); /* First source's results form the low half. */
1107 : }
1108 :
1109 764 : emit_move_insn (dest, src);
1110 :
1111 764 : ix86_move_vector_high_sse_to_mmx (op0); /* Moves bits 64:95 of the result to bits 32:63. */
1112 764 : }
1113 :
1114 : /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. This is also used
1115 : for a full unpack of OPERANDS[1] and OPERANDS[2] into a wider
1116 : OPERANDS[0].
 : The mode of OPERANDS[1] picks the 16-byte mode and the selector that
 : interleaves the low elements of the two sources. When HIGH_P is true
 : a second insn moves the upper part of that result to the low part of
 : the destination. Any mode not listed in the switch is a compiler
 : bug (gcc_unreachable). */
1117 :
1118 : void
1119 5725 : ix86_split_mmx_punpck (rtx operands[], bool high_p)
1120 : {
1121 5725 : rtx op0 = operands[0];
1122 5725 : rtx op1 = operands[1];
1123 5725 : rtx op2 = operands[2];
1124 5725 : machine_mode mode = GET_MODE (op1);
1125 5725 : rtx mask;
1126 : /* The corresponding SSE mode. */
1127 5725 : machine_mode sse_mode, double_sse_mode;
1128 :
1129 5725 : switch (mode)
1130 : {
1131 1513 : case E_V8QImode:
1132 1513 : case E_V4QImode:
1133 1513 : case E_V2QImode:
1134 1513 : sse_mode = V16QImode;
1135 1513 : double_sse_mode = V32QImode;
1136 1513 : mask = gen_rtx_PARALLEL (VOIDmode,
1137 : gen_rtvec (16,
1138 : GEN_INT (0), GEN_INT (16),
1139 : GEN_INT (1), GEN_INT (17),
1140 : GEN_INT (2), GEN_INT (18),
1141 : GEN_INT (3), GEN_INT (19),
1142 : GEN_INT (4), GEN_INT (20),
1143 : GEN_INT (5), GEN_INT (21),
1144 : GEN_INT (6), GEN_INT (22),
1145 : GEN_INT (7), GEN_INT (23)));
1146 1513 : break;
1147 :
1148 3080 : case E_V4HImode:
1149 3080 : case E_V2HImode:
1150 3080 : sse_mode = V8HImode;
1151 3080 : double_sse_mode = V16HImode;
1152 3080 : mask = gen_rtx_PARALLEL (VOIDmode,
1153 : gen_rtvec (8,
1154 : GEN_INT (0), GEN_INT (8),
1155 : GEN_INT (1), GEN_INT (9),
1156 : GEN_INT (2), GEN_INT (10),
1157 : GEN_INT (3), GEN_INT (11)));
1158 3080 : break;
1159 :
1160 795 : case E_V2SImode:
1161 795 : sse_mode = V4SImode;
1162 795 : double_sse_mode = V8SImode;
1163 795 : mask = gen_rtx_PARALLEL (VOIDmode,
1164 : gen_rtvec (4,
1165 : GEN_INT (0), GEN_INT (4),
1166 : GEN_INT (1), GEN_INT (5)));
1167 795 : break;
1168 :
1169 337 : case E_V2SFmode:
1170 337 : sse_mode = V4SFmode;
1171 337 : double_sse_mode = V8SFmode;
1172 337 : mask = gen_rtx_PARALLEL (VOIDmode,
1173 : gen_rtvec (4,
1174 : GEN_INT (0), GEN_INT (4),
1175 : GEN_INT (1), GEN_INT (5)));
1176 337 : break;
1177 :
1178 0 : default:
1179 0 : gcc_unreachable ();
1180 : }
1181 :
1182 : /* Generate SSE punpcklXX: select, from the concatenation of the two
 : widened sources, the elements named by MASK. */
1183 5725 : rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
1184 5725 : op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
1185 5725 : op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));
1186 :
1187 5725 : op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
1188 5725 : op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
1189 5725 : rtx insn = gen_rtx_SET (dest, op2);
1190 5725 : emit_insn (insn);
1191 :
1192 : /* Move high bits to low bits. */
1193 5725 : if (high_p)
1194 : {
1195 2294 : if (sse_mode == V4SFmode)
1196 : {
1197 119 : mask = gen_rtx_PARALLEL (VOIDmode,
1198 : gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1199 : GEN_INT (4), GEN_INT (5)));
1200 119 : op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest); /* Indices 4 and 5 are elements 0 and 1 of the second DEST copy. */
1201 119 : op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
1202 : }
1203 : else
1204 : {
1205 2175 : int sz = GET_MODE_SIZE (mode); /* Size of the original narrow mode, not of SSE_MODE. */
1206 :
1207 2175 : if (sz == 4)
1208 239 : mask = gen_rtx_PARALLEL (VOIDmode,
1209 : gen_rtvec (4, GEN_INT (1), GEN_INT (0),
1210 : GEN_INT (0), GEN_INT (1)));
1211 1936 : else if (sz == 8)
1212 1936 : mask = gen_rtx_PARALLEL (VOIDmode,
1213 : gen_rtvec (4, GEN_INT (2), GEN_INT (3),
1214 : GEN_INT (0), GEN_INT (1)));
1215 : else
1216 0 : gcc_unreachable (); /* Any other size, presumably including 2-byte V2QImode, is not supported with HIGH_P. */
1217 :
1218 2175 : dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest)); /* The permutation is done on 32-bit elements. */
1219 2175 : op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1220 : }
1221 :
1222 2294 : insn = gen_rtx_SET (dest, op1);
1223 2294 : emit_insn (insn);
1224 : }
1225 5725 : }
1226 :
1227 : /* Helper function of ix86_fixup_binary_operands to canonicalize
1228 : operand order. Returns true if the operands should be swapped.
 : CODE is the operation, MODE the mode used for the immediate_operand
 : tests, and OPERANDS[0..2] are destination, source 1 and source 2.
 : Nothing is modified; the caller performs the swap. The checks are
 : ordered by priority, so the first one that applies decides. */
1229 :
1230 : static bool
1231 174113385 : ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1232 : rtx operands[])
1233 : {
1234 174113385 : rtx dst = operands[0];
1235 174113385 : rtx src1 = operands[1];
1236 174113385 : rtx src2 = operands[2];
1237 :
1238 : /* If the operation is not commutative, we can't do anything. */
1239 174113385 : if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1240 26884473 : && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1241 : return false;
1242 :
1243 : /* Highest priority is that src1 should match dst. */
1244 147241004 : if (rtx_equal_p (dst, src1))
1245 : return false;
1246 106820981 : if (rtx_equal_p (dst, src2))
1247 : return true;
1248 :
1249 : /* Next highest priority is that immediate constants come second. */
1250 106738470 : if (immediate_operand (src2, mode))
1251 : return false;
1252 25728596 : if (immediate_operand (src1, mode))
1253 : return true;
1254 :
1255 : /* Lowest priority is that memory references should come second. */
1256 25728596 : if (MEM_P (src2))
1257 : return false;
1258 24296767 : if (MEM_P (src1))
1259 : return true;
1260 :
1261 : return false; /* Already in canonical order. */
1262 : }
1263 :
1264 : /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
1265 : destination to use for the operation. If different from the true
1266 : destination in operands[0], a copy operation will be required except
1267 : under TARGET_APX_NDD.
 : OPERANDS[1] and OPERANDS[2] are rewritten in place (possibly swapped
 : and/or forced into registers, which may emit insns); OPERANDS[0] is
 : left untouched. When USE_NDD is true, source 1 may stay a memory
 : reference that does not match the destination. */
1268 :
1269 : rtx
1270 13505221 : ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
1271 : rtx operands[], bool use_ndd)
1272 : {
1273 13505221 : rtx dst = operands[0];
1274 13505221 : rtx src1 = operands[1];
1275 13505221 : rtx src2 = operands[2];
1276 :
1277 : /* Canonicalize operand order. */
1278 13505221 : if (ix86_swap_binary_operands_p (code, mode, operands))
1279 : {
1280 : /* It is invalid to swap operands of different modes. */
1281 88322 : gcc_assert (GET_MODE (src1) == GET_MODE (src2));
1282 :
1283 : std::swap (src1, src2);
1284 : }
1285 :
1286 : /* Both source operands cannot be in memory. */
1287 13505221 : if (MEM_P (src1) && MEM_P (src2))
1288 : {
1289 : /* Optimization: Only read from memory once. */
1290 110575 : if (rtx_equal_p (src1, src2))
1291 : {
1292 17 : src2 = force_reg (mode, src2);
1293 17 : src1 = src2;
1294 : }
1295 110558 : else if (rtx_equal_p (dst, src1)) /* Keep the source that matches DST in memory. */
1296 3424 : src2 = force_reg (mode, src2);
1297 : else
1298 107134 : src1 = force_reg (mode, src1);
1299 : }
1300 :
1301 : /* If the destination is memory, and we do not have matching source
1302 : operands, do things in registers. */
1303 13505221 : if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1304 485024 : dst = gen_reg_rtx (mode); /* This is the case in which the return value differs from operands[0]. */
1305 :
1306 : /* Source 1 cannot be a constant. */
1307 13505221 : if (CONSTANT_P (src1))
1308 714 : src1 = force_reg (mode, src1);
1309 :
1310 : /* Source 1 cannot be a non-matching memory. */
1311 13505221 : if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
1312 438897 : src1 = force_reg (mode, src1);
1313 :
1314 : /* Improve address combine: for an integer-mode PLUS, load a memory
 : source 2 into a register first. */
1315 13505221 : if (code == PLUS
1316 9928623 : && GET_MODE_CLASS (mode) == MODE_INT
1317 9817445 : && MEM_P (src2))
1318 177520 : src2 = force_reg (mode, src2);
1319 :
1320 13505221 : operands[1] = src1;
1321 13505221 : operands[2] = src2;
1322 13505221 : return dst;
1323 : }
1324 :
1325 : /* Similarly, but assume that the destination has already been
1326 : set up properly. Calls ix86_fixup_binary_operands with the same
 : arguments and asserts that it did not substitute a different
 : destination for OPERANDS[0]. */
1327 :
1328 : void
1329 294264 : ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1330 : machine_mode mode, rtx operands[],
1331 : bool use_ndd)
1332 : {
1333 294264 : rtx dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
1334 294264 : gcc_assert (dst == operands[0]);
1335 294264 : }
1336 :
1337 : /* Attempt to expand a binary operator. Make the expansion closer to the
1338 : actual machine, than just general_operand, which will allow 3 separate
1339 : memory references (one output, two input) in a single insn.
 : CODE is applied in MODE to OPERANDS[1] and OPERANDS[2] with the
 : result going to OPERANDS[0]. The operands are first legitimized by
 : ix86_fixup_binary_operands, to which USE_NDD is passed through. */
1340 :
1341 : void
1342 13210828 : ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
1343 : rtx operands[], bool use_ndd)
1344 : {
1345 13210828 : rtx src1, src2, dst, op, clob;
1346 :
1347 13210828 : dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
1348 13210828 : src1 = operands[1];
1349 13210828 : src2 = operands[2];
1350 :
1351 : /* Emit the instruction. */
1352 :
1353 13210828 : op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
1354 :
1355 13210828 : if (reload_completed
1356 80480 : && code == PLUS
1357 908 : && !rtx_equal_p (dst, src1)
1358 13210828 : && !use_ndd)
1359 : {
1360 : /* This is going to be an LEA; avoid splitting it later. It is
 : emitted as a bare SET, without the flags clobber. */
1361 0 : emit_insn (op);
1362 : }
1363 : else
1364 : {
1365 13210828 : clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1366 13210828 : emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1367 : }
1368 :
1369 : /* Fix up the destination if needed. */
1370 13210828 : if (dst != operands[0])
1371 485015 : emit_move_insn (operands[0], dst);
1372 13210828 : }
1373 :
1374 : /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1375 : the given OPERANDS. OPERANDS[0] is the destination and OPERANDS[1],
 : OPERANDS[2] are the sources. When a source is a SUBREG of a float
 : vector of the same size, the operation is emitted in that float mode
 : into a fresh pseudo and copied back; otherwise it is emitted directly
 : in MODE. */
1376 :
1377 : void
1378 83306 : ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
1379 : rtx operands[])
1380 : {
1381 83306 : rtx op1 = NULL_RTX, op2 = NULL_RTX;
1382 83306 : if (SUBREG_P (operands[1])) /* OP1 is always the SUBREG operand when there is one. */
1383 : {
1384 312 : op1 = operands[1];
1385 312 : op2 = operands[2];
1386 : }
1387 82994 : else if (SUBREG_P (operands[2]))
1388 : {
1389 : op1 = operands[2];
1390 : op2 = operands[1];
1391 : }
1392 : /* Optimize (__m128i) d | (__m128i) e and similar code
1393 : when d and e are float vectors into float vector logical
1394 : insn. In C/C++ without using intrinsics there is no other way
1395 : to express vector logical operation on float vectors than
1396 : to cast them temporarily to integer vectors. */
1397 3145 : if (op1
1398 3145 : && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
1399 3145 : && (SUBREG_P (op2) || CONST_VECTOR_P (op2))
1400 298 : && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
1401 303 : && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
1402 101 : && SUBREG_BYTE (op1) == 0
1403 101 : && (CONST_VECTOR_P (op2)
1404 1 : || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
1405 1 : && SUBREG_BYTE (op2) == 0))
1406 101 : && can_create_pseudo_p ())
1407 : {
1408 101 : rtx dst;
1409 101 : switch (GET_MODE (SUBREG_REG (op1)))
1410 : {
1411 17 : case E_V4SFmode:
1412 17 : case E_V8SFmode:
1413 17 : case E_V16SFmode:
1414 17 : case E_V2DFmode:
1415 17 : case E_V4DFmode:
1416 17 : case E_V8DFmode:
1417 17 : dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
1418 17 : if (CONST_VECTOR_P (op2))
1419 : {
1420 16 : op2 = gen_lowpart (GET_MODE (dst), op2); /* Re-view the constant in the float mode. */
1421 16 : op2 = force_reg (GET_MODE (dst), op2);
1422 : }
1423 : else
1424 : {
1425 1 : op1 = operands[1]; /* OP2 being a SUBREG here means the first branch above was taken. */
1426 1 : op2 = SUBREG_REG (operands[2]);
1427 1 : if (!vector_operand (op2, GET_MODE (dst)))
1428 0 : op2 = force_reg (GET_MODE (dst), op2);
1429 : }
1430 17 : op1 = SUBREG_REG (op1);
1431 17 : if (!vector_operand (op1, GET_MODE (dst)))
1432 0 : op1 = force_reg (GET_MODE (dst), op1);
1433 17 : emit_insn (gen_rtx_SET (dst,
1434 : gen_rtx_fmt_ee (code, GET_MODE (dst),
1435 : op1, op2)));
1436 17 : emit_move_insn (operands[0], gen_lowpart (mode, dst));
1437 17 : return;
1438 : default:
1439 : break; /* Other float vector modes fall through to the generic path. */
1440 : }
1441 : }
1442 83289 : if (!vector_operand (operands[1], mode))
1443 0 : operands[1] = force_reg (mode, operands[1]);
1444 83289 : if (!vector_operand (operands[2], mode))
1445 11277 : operands[2] = force_reg (mode, operands[2]);
1446 83289 : ix86_fixup_binary_operands_no_copy (code, mode, operands); /* USE_NDD is omitted; relies on a default in the prototype, which is not visible here. */
1447 83289 : emit_insn (gen_rtx_SET (operands[0],
1448 : gen_rtx_fmt_ee (code, mode, operands[1],
1449 : operands[2])));
1450 : }
1451 :
1452 : /* Return TRUE or FALSE depending on whether the binary operator meets the
1453 : appropriate constraints. OPERANDS[0..2] are destination, source 1
 : and source 2 of CODE in MODE; nothing is modified. The rules
 : checked are the ones ix86_fixup_binary_operands enforces, and
 : USE_NDD likewise lifts the non-matching-memory restriction on
 : source 1. */
1454 :
1455 : bool
1456 161606949 : ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1457 : rtx operands[3], bool use_ndd)
1458 : {
1459 161606949 : rtx dst = operands[0];
1460 161606949 : rtx src1 = operands[1];
1461 161606949 : rtx src2 = operands[2];
1462 :
1463 : /* Both source operands cannot be in memory. A broadcast memory
 : operand counts as memory for this test. */
1464 154249898 : if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1465 161607334 : && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1466 998785 : return false;
1467 :
1468 : /* Canonicalize operand order for commutative operators. */
1469 160608164 : if (ix86_swap_binary_operands_p (code, mode, operands))
1470 534042 : std::swap (src1, src2); /* Local copies only; OPERANDS is not changed. */
1471 :
1472 : /* If the destination is memory, we must have a matching source operand. */
1473 160608164 : if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1474 : return false;
1475 :
1476 : /* Source 1 cannot be a constant. */
1477 155574993 : if (CONSTANT_P (src1))
1478 : return false;
1479 :
1480 : /* Source 1 cannot be a non-matching memory. */
1481 155571944 : if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
1482 : /* Support "andhi/andsi/anddi" as a zero-extending move. */
1483 4405557 : return (code == AND
1484 510996 : && (mode == HImode
1485 510996 : || mode == SImode
1486 306319 : || (TARGET_64BIT && mode == DImode))
1487 4705379 : && satisfies_constraint_L (src2));
1488 :
1489 : return true;
1490 : }
1491 :
1492 : /* Attempt to expand a unary operator. Make the expansion closer to the
1493 : actual machine, than just general_operand, which will allow 2 separate
1494 : memory references (one output, one input) in a single insn.
 : CODE is applied in MODE to OPERANDS[1] with the result going to
 : OPERANDS[0]. When USE_NDD is true a memory source is not forced
 : into a register. */
1495 :
1496 : void
1497 120568 : ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1498 : rtx operands[], bool use_ndd)
1499 : {
1500 120568 : bool matching_memory = false;
1501 120568 : rtx src, dst, op, clob;
1502 :
1503 120568 : dst = operands[0];
1504 120568 : src = operands[1];
1505 :
1506 : /* If the destination is memory, and we do not have matching source
1507 : operands, do things in registers. */
1508 120568 : if (MEM_P (dst))
1509 : {
1510 3350 : if (rtx_equal_p (dst, src))
1511 : matching_memory = true;
1512 : else
1513 3034 : dst = gen_reg_rtx (mode); /* Copied back to operands[0] at the end. */
1514 : }
1515 :
1516 : /* When source operand is memory, destination must match. */
1517 120568 : if (!use_ndd && MEM_P (src) && !matching_memory)
1518 4661 : src = force_reg (mode, src);
1519 :
1520 : /* Emit the instruction. */
1521 :
1522 120568 : op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1523 :
1524 120568 : if (code == NOT) /* NOT is emitted as a bare SET; every other code gets a flags clobber. */
1525 69733 : emit_insn (op);
1526 : else
1527 : {
1528 50835 : clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1529 50835 : emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1530 : }
1531 :
1532 : /* Fix up the destination if needed. */
1533 120568 : if (dst != operands[0])
1534 3034 : emit_move_insn (operands[0], dst);
1535 120568 : }
1536 :
1537 : /* Return TRUE or FALSE depending on whether the unary operator meets the
1538 : appropriate constraints. OPERANDS[0] is the destination and
 : OPERANDS[1] the source. The rtx code and mode parameters are unnamed
 : and unused. When USE_NDD is true only a memory destination, not a
 : memory source, requires the two operands to match. */
1539 :
1540 : bool
1541 1754446 : ix86_unary_operator_ok (enum rtx_code,
1542 : machine_mode,
1543 : rtx operands[2],
1544 : bool use_ndd)
1545 : {
1546 : /* If one of operands is memory, source and destination must match. */
1547 1754446 : if ((MEM_P (operands[0])
1548 1709677 : || (!use_ndd && MEM_P (operands[1])))
1549 1783295 : && ! rtx_equal_p (operands[0], operands[1]))
1550 : return false;
1551 : return true;
1552 : }
1553 :
1554 : /* Predict just emitted jump instruction to be taken with probability PROB.
 : PROB is converted with profile_probability::from_reg_br_prob_base, so
 : it is expressed on the REG_BR_PROB_BASE scale. The most recently
 : emitted insn must be a jump (asserted); the note is attached to it. */
1555 :
1556 : static void
1557 66402 : predict_jump (int prob)
1558 : {
1559 66402 : rtx_insn *insn = get_last_insn ();
1560 66402 : gcc_assert (JUMP_P (insn));
1561 66402 : add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1562 66402 : }
1563 :
1564 : /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1565 : divisor are within the range [0-255].
 : MODE is SImode or DImode. OPERANDS[0] receives the quotient,
 : OPERANDS[1] the remainder, OPERANDS[2] is the dividend and OPERANDS[3]
 : the divisor; UNSIGNED_P selects the unsigned patterns. The emitted
 : code tests at run time whether any bit above the low eight is set in
 : dividend | divisor: if so the original full-width divmod runs,
 : otherwise an 8-bit unsigned divide is used and its results are
 : extended into the two outputs. */
1566 :
1567 : void
1568 27 : ix86_split_idivmod (machine_mode mode, rtx operands[],
1569 : bool unsigned_p)
1570 : {
1571 27 : rtx_code_label *end_label, *qimode_label;
1572 27 : rtx div, mod;
1573 27 : rtx_insn *insn;
1574 27 : rtx scratch, tmp0, tmp1, tmp2;
1575 27 : rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);
1576 :
1577 27 : operands[2] = force_reg (mode, operands[2]);
1578 27 : operands[3] = force_reg (mode, operands[3]);
1579 :
1580 27 : switch (mode) /* Pick the generator for the full-width fallback. */
1581 : {
1582 20 : case E_SImode:
1583 20 : if (GET_MODE (operands[0]) == SImode)
1584 : {
1585 16 : if (GET_MODE (operands[1]) == SImode)
1586 14 : gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
1587 : else
1588 2 : gen_divmod4_1
1589 2 : = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
1590 : }
1591 : else
1592 4 : gen_divmod4_1
1593 4 : = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
1594 : break;
1595 :
1596 7 : case E_DImode:
1597 7 : gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
1598 : break;
1599 :
1600 0 : default:
1601 0 : gcc_unreachable ();
1602 : }
1603 :
1604 27 : end_label = gen_label_rtx ();
1605 27 : qimode_label = gen_label_rtx ();
1606 :
1607 27 : scratch = gen_reg_rtx (mode);
1608 :
1609 : /* Use 8bit unsigned divmod if dividend and divisor are within
1610 : the range [0-255]. */
1611 27 : emit_move_insn (scratch, operands[2]);
1612 27 : scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
1613 : scratch, 1, OPTAB_DIRECT);
1614 27 : emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100))); /* -0x100 has every bit set except the low eight. */
1615 27 : tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
1616 27 : tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
1617 27 : tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
1618 : gen_rtx_LABEL_REF (VOIDmode, qimode_label),
1619 : pc_rtx);
1620 27 : insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
1621 27 : predict_jump (REG_BR_PROB_BASE * 50 / 100); /* Both paths are predicted equally likely. */
1622 27 : JUMP_LABEL (insn) = qimode_label;
1623 :
1624 : /* Generate original signed/unsigned divmod. */
1625 27 : emit_insn (gen_divmod4_1 (operands[0], operands[1],
1626 : operands[2], operands[3]));
1627 :
1628 : /* Branch to the end. */
1629 27 : emit_jump_insn (gen_jump (end_label));
1630 27 : emit_barrier ();
1631 :
1632 : /* Generate 8bit unsigned divide. */
1633 27 : emit_label (qimode_label);
1634 : /* Don't use operands[0] for result of 8bit divide since not all
1635 : registers support QImode ZERO_EXTRACT. */
1636 27 : tmp0 = lowpart_subreg (HImode, scratch, mode);
1637 27 : tmp1 = lowpart_subreg (HImode, operands[2], mode);
1638 27 : tmp2 = lowpart_subreg (QImode, operands[3], mode);
1639 27 : emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));
1640 :
1641 27 : if (unsigned_p) /* DIV and MOD are only used as REG_EQUAL notes below. */
1642 : {
1643 12 : div = gen_rtx_UDIV (mode, operands[2], operands[3]);
1644 12 : mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
1645 : }
1646 : else
1647 : {
1648 15 : div = gen_rtx_DIV (mode, operands[2], operands[3]);
1649 15 : mod = gen_rtx_MOD (mode, operands[2], operands[3]);
1650 : }
1651 27 : if (mode == SImode)
1652 : {
1653 20 : if (GET_MODE (operands[0]) != SImode)
1654 4 : div = gen_rtx_ZERO_EXTEND (DImode, div);
1655 20 : if (GET_MODE (operands[1]) != SImode)
1656 2 : mod = gen_rtx_ZERO_EXTEND (DImode, mod);
1657 : }
1658 :
1659 : /* Extract remainder from AH: bits 8..15 of the scratch register. */
1660 27 : scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
1661 27 : tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
1662 : GEN_INT (8), GEN_INT (8));
1663 27 : insn = emit_move_insn (operands[1], tmp1);
1664 27 : set_unique_reg_note (insn, REG_EQUAL, mod);
1665 :
1666 : /* Zero extend quotient from AL. */
1667 27 : tmp1 = gen_lowpart (QImode, tmp0);
1668 27 : insn = emit_insn (gen_extend_insn
1669 27 : (operands[0], tmp1,
1670 27 : GET_MODE (operands[0]), QImode, 1));
1671 27 : set_unique_reg_note (insn, REG_EQUAL, div);
1672 :
1673 27 : emit_label (end_label);
1674 27 : }
1675 :
1676 : /* Emit x86 binary operation CODE in mode MODE, where the first operand
1677 : matches destination. RTX includes clobber of FLAGS_REG.
 : The emitted insn is DST = DST CODE SRC. */
1678 :
1679 : void
1680 7734 : ix86_emit_binop (enum rtx_code code, machine_mode mode,
1681 : rtx dst, rtx src)
1682 : {
1683 7734 : rtx op, clob;
1684 :
1685 7734 : op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1686 7734 : clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1687 :
1688 7734 : emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1689 7734 : }
1690 :
1691 : /* Return true if regno1 def is nearest to the insn.
 : Walks backwards from INSN towards the head of its basic block,
 : skipping anything that is not a non-debug insn. Returns true as
 : soon as an insn defining REGNO1 is found, and false if one defining
 : REGNO2 is found first, if neither is found, or if INSN is the block
 : head. The walk starts at INSN itself and stops before examining
 : the block head. */
1692 :
1693 : static bool
1694 15 : find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1695 : {
1696 15 : rtx_insn *prev = insn;
1697 15 : rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1698 :
1699 15 : if (insn == start)
1700 : return false;
1701 40 : while (prev && prev != start)
1702 : {
1703 30 : if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1704 : {
1705 10 : prev = PREV_INSN (prev);
1706 10 : continue;
1707 : }
1708 20 : if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1709 : return true;
1710 15 : else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1711 : return false;
1712 15 : prev = PREV_INSN (prev);
1713 : }
1714 :
1715 : /* None of the regs is defined in the bb. */
1716 : return false;
1717 : }
1718 :
1719 : /* INSN_UID of the last insn emitted by zero store peephole2s.
 : NOTE(review): none of the functions in this part of the file read or
 : write this variable; presumably it is maintained by the peephole2
 : patterns themselves -- confirm. */
1720 : int ix86_last_zero_store_uid;
1721 :
1722 : /* Split lea instructions into a sequence of instructions
1723 : which are executed on ALU to avoid AGU stalls.
1724 : It is assumed that it is allowed to clobber flags register
1725 : at lea position.
 : OPERANDS[0] is the destination and OPERANDS[1] the address expression,
 : which must decompose successfully (asserted). All parts are
 : re-viewed in MODE before use. INSN is only consulted to decide which
 : of base and index to copy into the destination first when neither
 : already lives there. */
1726 :
1727 : void
1728 5915 : ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
1729 : {
1730 5915 : unsigned int regno0, regno1, regno2;
1731 5915 : struct ix86_address parts;
1732 5915 : rtx target, tmp;
1733 5915 : int ok, adds;
1734 :
1735 5915 : ok = ix86_decompose_address (operands[1], &parts);
1736 5915 : gcc_assert (ok);
1737 :
1738 5915 : target = gen_lowpart (mode, operands[0]);
1739 :
1740 5915 : regno0 = true_regnum (target);
1741 5915 : regno1 = INVALID_REGNUM; /* Stays invalid when there is no base. */
1742 5915 : regno2 = INVALID_REGNUM; /* Stays invalid when there is no index. */
1743 :
1744 5915 : if (parts.base)
1745 : {
1746 5907 : parts.base = gen_lowpart (mode, parts.base);
1747 5907 : regno1 = true_regnum (parts.base);
1748 : }
1749 :
1750 5915 : if (parts.index)
1751 : {
1752 5912 : parts.index = gen_lowpart (mode, parts.index);
1753 5912 : regno2 = true_regnum (parts.index);
1754 : }
1755 :
1756 5915 : if (parts.disp)
1757 190 : parts.disp = gen_lowpart (mode, parts.disp);
1758 :
1759 5915 : if (parts.scale > 1)
1760 : {
1761 : /* Case r1 = r1 + ... */
1762 11 : if (regno1 == regno0)
1763 : {
1764 : /* If we have a case r1 = r1 + C * r2 then we
1765 : should use multiplication which is very
1766 : expensive. Assume cost model is wrong if we
1767 : have such case here.
 : NOTE(review): unlike the branch below, this path never adds
 : parts.disp; confirm callers cannot reach it with a nonzero
 : displacement. */
1768 0 : gcc_assert (regno2 != regno0);
1769 :
1770 0 : for (adds = parts.scale; adds > 0; adds--) /* Add the index SCALE times instead of multiplying. */
1771 0 : ix86_emit_binop (PLUS, mode, target, parts.index);
1772 : }
1773 : else
1774 : {
1775 : /* r1 = r2 + r3 * C case. Need to move r3 into r1. */
1776 11 : if (regno0 != regno2)
1777 8 : emit_insn (gen_rtx_SET (target, parts.index));
1778 :
1779 : /* Use shift for scaling, but emit it as MULT instead
1780 : to avoid it being immediately peephole2 optimized back
1781 : into lea. */
1782 11 : ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
1783 :
1784 11 : if (parts.base)
1785 3 : ix86_emit_binop (PLUS, mode, target, parts.base);
1786 :
1787 11 : if (parts.disp && parts.disp != const0_rtx)
1788 3 : ix86_emit_binop (PLUS, mode, target, parts.disp);
1789 : }
1790 : }
1791 5904 : else if (!parts.base && !parts.index) /* Displacement only: a plain move. */
1792 : {
1793 0 : gcc_assert(parts.disp);
1794 0 : emit_insn (gen_rtx_SET (target, parts.disp));
1795 : }
1796 : else
1797 : {
1798 5904 : if (!parts.base)
1799 : {
1800 0 : if (regno0 != regno2)
1801 0 : emit_insn (gen_rtx_SET (target, parts.index));
1802 : }
1803 5904 : else if (!parts.index)
1804 : {
1805 3 : if (regno0 != regno1)
1806 1 : emit_insn (gen_rtx_SET (target, parts.base));
1807 : }
1808 : else
1809 : {
1810 5901 : if (regno0 == regno1) /* Target already holds the base: add the index. */
1811 : tmp = parts.index;
1812 2972 : else if (regno0 == regno2) /* Target already holds the index: add the base. */
1813 : tmp = parts.base;
1814 : else
1815 : {
1816 15 : rtx tmp1;
1817 :
1818 : /* Find better operand for SET instruction, depending
1819 : on which definition is farther from the insn. */
1820 15 : if (find_nearest_reg_def (insn, regno1, regno2))
1821 5 : tmp = parts.index, tmp1 = parts.base;
1822 : else
1823 10 : tmp = parts.base, tmp1 = parts.index;
1824 :
1825 15 : emit_insn (gen_rtx_SET (target, tmp));
1826 :
1827 15 : if (parts.disp && parts.disp != const0_rtx)
1828 0 : ix86_emit_binop (PLUS, mode, target, parts.disp);
1829 :
1830 15 : ix86_emit_binop (PLUS, mode, target, tmp1);
1831 15 : return; /* Displacement was handled above, so skip the common tail. */
1832 : }
1833 :
1834 5886 : ix86_emit_binop (PLUS, mode, target, tmp);
1835 : }
1836 :
1837 5889 : if (parts.disp && parts.disp != const0_rtx)
1838 4 : ix86_emit_binop (PLUS, mode, target, parts.disp);
1839 : }
1840 : }
1841 :
1842 : /* Post-reload splitter for converting an SF or DFmode value in an
1843 : SSE register into an unsigned SImode.
 : OPERANDS[0] is the destination register, OPERANDS[1] (LARGE) and
 : OPERANDS[2] (ZERO_OR_TWO31) are vector temporaries, OPERANDS[3] is the
 : input and OPERANDS[4] (TWO31) a vector constant. The vector mode is
 : taken from LARGE and is expected to be V4SFmode or V2DFmode.
 : The emitted sequence compares TWO31 against the value, subtracts
 : TWO31 where the comparison holds, converts to signed integers and
 : XORs in the comparison mask shifted left by 31. */
1844 :
1845 : void
1846 0 : ix86_split_convert_uns_si_sse (rtx operands[])
1847 : {
1848 0 : machine_mode vecmode;
1849 0 : rtx value, large, zero_or_two31, input, two31, x;
1850 :
1851 0 : large = operands[1];
1852 0 : zero_or_two31 = operands[2];
1853 0 : input = operands[3];
1854 0 : two31 = operands[4];
1855 0 : vecmode = GET_MODE (large);
1856 0 : value = gen_rtx_REG (vecmode, REGNO (operands[0])); /* The destination's hard register, viewed as a vector. */
1857 :
1858 : /* Load up the value into the low element. We must ensure that the other
1859 : elements are valid floats -- zero is the easiest such value. */
1860 0 : if (MEM_P (input))
1861 : {
1862 0 : if (vecmode == V4SFmode)
1863 0 : emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
1864 : else
1865 0 : emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
1866 : }
1867 : else
1868 : {
1869 0 : input = gen_rtx_REG (vecmode, REGNO (input));
1870 0 : emit_move_insn (value, CONST0_RTX (vecmode));
1871 0 : if (vecmode == V4SFmode)
1872 0 : emit_insn (gen_sse_movss_v4sf (value, value, input));
1873 : else
1874 0 : emit_insn (gen_sse2_movsd_v2df (value, value, input));
1875 : }
1876 :
1877 0 : emit_move_insn (large, two31);
1878 0 : emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31); /* Reuse the copy just loaded into LARGE when TWO31 is in memory. */
1879 :
1880 0 : x = gen_rtx_fmt_ee (LE, vecmode, large, value); /* LARGE := (TWO31 <= VALUE), element-wise. */
1881 0 : emit_insn (gen_rtx_SET (large, x));
1882 :
1883 0 : x = gen_rtx_AND (vecmode, zero_or_two31, large); /* Keep TWO31 only where the comparison held. */
1884 0 : emit_insn (gen_rtx_SET (zero_or_two31, x));
1885 :
1886 0 : x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
1887 0 : emit_insn (gen_rtx_SET (value, x));
1888 :
1889 0 : large = gen_rtx_REG (V4SImode, REGNO (large)); /* Same register, now treated as integers. */
1890 0 : emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));
1891 :
1892 0 : x = gen_rtx_REG (V4SImode, REGNO (value));
1893 0 : if (vecmode == V4SFmode)
1894 0 : emit_insn (gen_fix_truncv4sfv4si2 (x, value));
1895 : else
1896 0 : emit_insn (gen_sse2_cvttpd2dq (x, value));
1897 0 : value = x;
1898 :
1899 0 : emit_insn (gen_xorv4si3 (value, value, large));
1900 0 : }
1901 :
1902 : /* Convert an unsigned DImode value into a DFmode, using only SSE.
1903 : Expects the 64-bit DImode to be supplied in a pair of integral
1904 : registers. Requires SSE2; will use SSE3 if available. For x86_32,
1905 : -mfpmath=sse, !optimize_size only.
 : INPUT is the DImode value and TARGET receives element 0 of the final
 : V2DFmode sum. */
1906 :
1907 : void
1908 0 : ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
1909 : {
1910 0 : REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
1911 0 : rtx int_xmm, fp_xmm;
1912 0 : rtx biases, exponents;
1913 0 : rtx x;
1914 :
1915 0 : int_xmm = gen_reg_rtx (V4SImode);
 /* Get INPUT into the low 64 bits of INT_XMM, by one of three routes. */
1916 0 : if (TARGET_INTER_UNIT_MOVES_TO_VEC)
1917 0 : emit_insn (gen_movdi_to_sse (int_xmm, input));
1918 0 : else if (TARGET_SSE_SPLIT_REGS)
1919 : {
1920 0 : emit_clobber (int_xmm);
1921 0 : emit_move_insn (gen_lowpart (DImode, int_xmm), input);
1922 : }
1923 : else
1924 : {
1925 0 : x = gen_reg_rtx (V2DImode);
1926 0 : ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
1927 0 : emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
1928 : }
1929 :
1930 0 : x = gen_rtx_CONST_VECTOR (V4SImode,
1931 : gen_rtvec (4, GEN_INT (0x43300000UL),
1932 : GEN_INT (0x45300000UL),
1933 : const0_rtx, const0_rtx));
1934 0 : exponents = validize_mem (force_const_mem (V4SImode, x));
1935 :
1936 : /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
1937 0 : emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));
1938 :
1939 : /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
1940 : yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
1941 : Similarly (0x45300000UL ## fp_value_hi_xmm) yields
1942 : (0x1.0p84 + double(fp_value_hi_xmm)).
1943 : Note these exponents differ by 32. */
1944 :
1945 0 : fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));
1946 :
1947 : /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
1948 : in [0,2**32-1] and [0]+[2**32,2**64-1] respectively. */
1949 0 : real_ldexp (&bias_lo_rvt, &dconst1, 52);
1950 0 : real_ldexp (&bias_hi_rvt, &dconst1, 84);
1951 0 : biases = const_double_from_real_value (bias_lo_rvt, DFmode);
1952 0 : x = const_double_from_real_value (bias_hi_rvt, DFmode);
1953 0 : biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
1954 0 : biases = validize_mem (force_const_mem (V2DFmode, biases));
1955 0 : emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));
1956 :
1957 : /* Add the upper and lower DFmode values together. */
1958 0 : if (TARGET_SSE3)
1959 0 : emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
1960 : else
1961 : {
1962 0 : x = copy_to_mode_reg (V2DFmode, fp_xmm); /* Save the pair before FP_XMM is overwritten. */
1963 0 : emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
1964 0 : emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
1965 : }
1966 :
1967 0 : ix86_expand_vector_extract (false, target, fp_xmm, 0);
1968 0 : }
1969 :
1970 : /* Not used, but eases macroization of patterns. Both (unnamed) arguments are ignored; any call reaches gcc_unreachable. */
1971 : void
1972 0 : ix86_expand_convert_uns_sixf_sse (rtx, rtx)
1973 : {
1974 0 : gcc_unreachable ();
1975 : }
1976 :
1977 : static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1978 :
1979 : /* Convert the unsigned SImode value INPUT into a DFmode in TARGET. Only
1980 : currently used for SSE, but applicable anywhere. INPUT is biased by -2**31 so the signed floatsidf2 pattern can be used, and 2**31 is added back in DFmode. */
1981 :
1982 : void
1983 0 : ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
1984 : {
1985 0 : REAL_VALUE_TYPE TWO31r;
1986 0 : rtx x, fp;
1987 :
/* x = input + (-2**31), computed in SImode. */
1988 0 : x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
1989 : NULL, 1, OPTAB_DIRECT);
1990 :
/* Signed SImode -> DFmode conversion of the biased value. */
1991 0 : fp = gen_reg_rtx (DFmode);
1992 0 : emit_insn (gen_floatsidf2 (fp, x));
1993 :
/* Build the DFmode constant 2**31 and add it to undo the bias. */
1994 0 : real_ldexp (&TWO31r, &dconst1, 31);
1995 0 : x = const_double_from_real_value (TWO31r, DFmode);
1996 :
1997 0 : x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
1998 :
1999 : /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
2000 0 : if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
2001 0 : x = ix86_expand_sse_fabs (x, NULL);
2002 :
/* The binop (or fabs) may have produced its result somewhere other than TARGET. */
2003 0 : if (x != target)
2004 0 : emit_move_insn (target, x);
2005 0 : }
2006 :
2007 : /* Convert the signed DImode value INPUT into a DFmode in TARGET. Only used for SSE in
2008 : 32-bit mode; otherwise we have a direct convert instruction. The result is computed as (double) signed_high_word * 2**32 + (double) unsigned_low_word. */
2009 :
2010 : void
2011 0 : ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
2012 : {
2013 0 : REAL_VALUE_TYPE TWO32r;
2014 0 : rtx fp_lo, fp_hi, x;
2015 :
2016 0 : fp_lo = gen_reg_rtx (DFmode);
2017 0 : fp_hi = gen_reg_rtx (DFmode);
2018 :
/* High SImode word: signed conversion. */
2019 0 : emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));
2020 :
/* Scale the converted high word by 2**32. */
2021 0 : real_ldexp (&TWO32r, &dconst1, 32);
2022 0 : x = const_double_from_real_value (TWO32r, DFmode);
2023 0 : fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);
2024 :
/* Low SImode word: unsigned conversion via the helper above. */
2025 0 : ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));
2026 :
/* Sum the two parts, preferring TARGET as the destination. */
2027 0 : x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
2028 : 0, OPTAB_DIRECT);
2029 0 : if (x != target)
2030 0 : emit_move_insn (target, x);
2031 0 : }
2032 :
2033 : /* Convert the unsigned SImode value INPUT into a SFmode in TARGET, using only SSE.
2034 : For x86_32, -mfpmath=sse, !optimize_size only. INPUT is split into 16-bit halves that are converted separately with the signed floatsisf2 pattern and recombined as hi * 2**16 + lo. */
2035 : void
2036 0 : ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
2037 : {
2038 0 : REAL_VALUE_TYPE ONE16r;
2039 0 : rtx fp_hi, fp_lo, int_hi, int_lo, x;
2040 :
/* x = SFmode constant 2**16. */
2041 0 : real_ldexp (&ONE16r, &dconst1, 16);
2042 0 : x = const_double_from_real_value (ONE16r, SFmode);
/* int_lo = input & 0xffff; int_hi = input >> 16 (logical shift). */
2043 0 : int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
2044 : NULL, 0, OPTAB_DIRECT);
2045 0 : int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
2046 : NULL, 0, OPTAB_DIRECT);
2047 0 : fp_hi = gen_reg_rtx (SFmode);
2048 0 : fp_lo = gen_reg_rtx (SFmode);
2049 0 : emit_insn (gen_floatsisf2 (fp_hi, int_hi));
2050 0 : emit_insn (gen_floatsisf2 (fp_lo, int_lo));
2051 0 : if (TARGET_FMA)
2052 : {
/* With FMA, emit a single FMA rtx: fp_hi * 2**16 + fp_lo, with the constant
   placed in the constant pool. */
2053 0 : x = validize_mem (force_const_mem (SFmode, x));
2054 0 : fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
2055 0 : emit_move_insn (target, fp_hi);
2056 : }
2057 : else
2058 : {
/* Otherwise do a separate multiply and add. */
2059 0 : fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
2060 : 0, OPTAB_DIRECT);
2061 0 : fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
2062 : 0, OPTAB_DIRECT);
2063 0 : if (!rtx_equal_p (target, fp_hi))
2064 0 : emit_move_insn (target, fp_hi);
2065 : }
2066 0 : }
2067 :
2068 : /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
2069 : a vector of unsigned ints VAL to vector of floats TARGET. Each element is split into 16-bit halves, converted with the signed vector pattern, and recombined as hi * 2**16 + lo. */
2070 :
2071 : void
2072 54 : ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
2073 : {
2074 54 : rtx tmp[8];
2075 54 : REAL_VALUE_TYPE TWO16r;
2076 54 : machine_mode intmode = GET_MODE (val);
2077 54 : machine_mode fltmode = GET_MODE (target);
2078 54 : rtx (*cvt) (rtx, rtx);
2079 :
/* Pick the signed int->float generator; any mode other than V4SImode
   uses the V8SI variant. */
2080 54 : if (intmode == V4SImode)
2081 : cvt = gen_floatv4siv4sf2;
2082 : else
2083 2 : cvt = gen_floatv8siv8sf2;
/* tmp[0] = 0xffff in every element; tmp[1] = low halves; tmp[2] = high halves. */
2084 54 : tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
2085 54 : tmp[0] = force_reg (intmode, tmp[0]);
2086 54 : tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
2087 : OPTAB_DIRECT);
2088 54 : tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
2089 : NULL_RTX, 1, OPTAB_DIRECT);
/* tmp[3] = float (low halves); tmp[4] = float (high halves). */
2090 54 : tmp[3] = gen_reg_rtx (fltmode);
2091 54 : emit_insn (cvt (tmp[3], tmp[1]));
2092 54 : tmp[4] = gen_reg_rtx (fltmode);
2093 54 : emit_insn (cvt (tmp[4], tmp[2]));
/* tmp[5] = vector of 2**16 in a register. */
2094 54 : real_ldexp (&TWO16r, &dconst1, 16);
2095 54 : tmp[5] = const_double_from_real_value (TWO16r, SFmode);
2096 54 : tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
2097 54 : if (TARGET_FMA)
2098 : {
/* Single FMA rtx: tmp[4] * tmp[5] + tmp[3]. */
2099 1 : tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
2100 1 : emit_move_insn (target, tmp[6]);
2101 : }
2102 : else
2103 : {
/* Separate multiply and add; copy into TARGET if the add landed elsewhere. */
2104 53 : tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
2105 : NULL_RTX, 1, OPTAB_DIRECT);
2106 53 : tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
2107 : target, 1, OPTAB_DIRECT);
2108 53 : if (tmp[7] != target)
2109 0 : emit_move_insn (target, tmp[7]);
2110 : }
2111 54 : }
2112 :
2113 : /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
2114 : pattern can be used on it instead of fixuns_trunc*.
2115 : This is done by doing just signed conversion if < 0x1p31, and otherwise by
2116 : subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards.
Returns the adjusted floating-point vector and stores in *XORP the integer
vector the caller is expected to xor into the converted result. */
2117 :
2118 : rtx
2119 286 : ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
2120 : {
2121 286 : REAL_VALUE_TYPE TWO31r;
2122 286 : rtx two31r, tmp[4];
2123 286 : machine_mode mode = GET_MODE (val);
2124 286 : machine_mode scalarmode = GET_MODE_INNER (mode);
2125 572 : machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
2126 286 : rtx (*cmp) (rtx, rtx, rtx, rtx);
2127 286 : int i;
2128 :
/* Only tmp[0..2] are pseudos; tmp[3] later holds the comparison rtx. */
2129 1144 : for (i = 0; i < 3; i++)
2130 858 : tmp[i] = gen_reg_rtx (mode);
/* two31r = vector with 0x1p31 in every element, forced into a register. */
2131 286 : real_ldexp (&TWO31r, &dconst1, 31);
2132 286 : two31r = const_double_from_real_value (TWO31r, scalarmode);
2133 286 : two31r = ix86_build_const_vector (mode, 1, two31r);
2134 286 : two31r = force_reg (mode, two31r);
/* Select the mask-compare generator matching MODE. */
2135 286 : switch (mode)
2136 : {
2137 : case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
2138 10 : case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
2139 16 : case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
2140 260 : case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
2141 0 : default: gcc_unreachable ();
2142 : }
/* tmp[0] = mask of elements where 0x1p31 <= VAL;
   tmp[1] = 0x1p31 in those elements (mask AND two31r). */
2143 286 : tmp[3] = gen_rtx_LE (mode, two31r, val);
2144 286 : emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
2145 286 : tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
2146 : 0, OPTAB_DIRECT);
/* *XORP: the mask viewed as INTMODE, reduced to bit 31 per element, either
   by shifting left by 31 or, for V8SImode without AVX2, by ANDing with a
   0x80000000 vector. */
2147 286 : if (intmode == V4SImode || TARGET_AVX2)
2148 572 : *xorp = expand_simple_binop (intmode, ASHIFT,
2149 286 : gen_lowpart (intmode, tmp[0]),
2150 : GEN_INT (31), NULL_RTX, 0,
2151 : OPTAB_DIRECT);
2152 : else
2153 : {
2154 0 : rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
2155 0 : two31 = ix86_build_const_vector (intmode, 1, two31);
2156 0 : *xorp = expand_simple_binop (intmode, AND,
2157 0 : gen_lowpart (intmode, tmp[0]),
2158 : two31, NULL_RTX, 0,
2159 : OPTAB_DIRECT);
2160 : }
/* Result: VAL - tmp[1], i.e. VAL with 0x1p31 subtracted where selected. */
2161 286 : return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
2162 286 : 0, OPTAB_DIRECT);
2163 : }
2164 :
2165 : /* Generate code for floating point ABS or NEG. CODE is the rtx code applied, MODE the operation mode, OPERANDS[0] the destination and OPERANDS[1] the source. A single PARALLEL insn is emitted. */
2166 :
2167 : void
2168 32875 : ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2169 : rtx operands[])
2170 : {
2171 32875 : rtx set, dst, src;
2172 32875 : bool use_sse = false;
2173 32875 : bool vector_mode = VECTOR_MODE_P (mode);
2174 32875 : machine_mode vmode = mode;
2175 32875 : rtvec par;
2176 :
/* Decide whether the SSE form is used and which vector mode VMODE the
   sign-bit mask is built in. */
2177 32875 : switch (mode)
2178 : {
2179 : case E_HFmode:
2180 : use_sse = true;
2181 : vmode = V8HFmode;
2182 : break;
2183 0 : case E_BFmode:
2184 0 : use_sse = true;
2185 0 : vmode = V8BFmode;
2186 0 : break;
2187 8986 : case E_SFmode:
2188 8986 : use_sse = TARGET_SSE_MATH && TARGET_SSE;
2189 : vmode = V4SFmode;
2190 : break;
2191 15427 : case E_DFmode:
2192 15427 : use_sse = TARGET_SSE_MATH && TARGET_SSE2;
2193 : vmode = V2DFmode;
2194 : break;
2195 8263 : default:
/* Vector modes and TFmode use SSE with VMODE == MODE; everything else does not. */
2196 8263 : use_sse = vector_mode || mode == TFmode;
2197 8263 : break;
2198 : }
2199 :
2200 32875 : dst = operands[0];
2201 32875 : src = operands[1];
2202 :
/* (set DST (CODE:MODE SRC)) is the first element of the PARALLEL in every case. */
2203 32875 : set = gen_rtx_fmt_e (code, mode, src);
2204 32875 : set = gen_rtx_SET (dst, set);
2205 :
2206 32875 : if (use_sse)
2207 : {
2208 27702 : rtx mask, use, clob;
2209 :
2210 : /* NEG and ABS performed with SSE use bitwise mask operations.
2211 : Create the appropriate mask now. */
2212 27702 : mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
2213 27702 : use = gen_rtx_USE (VOIDmode, mask);
/* Vector and TFmode forms carry only the USE of the mask; scalar forms
   additionally clobber the flags register. */
2214 27702 : if (vector_mode || mode == TFmode)
2215 4411 : par = gen_rtvec (2, set, use);
2216 : else
2217 : {
2218 23291 : clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2219 23291 : par = gen_rtvec (3, set, use, clob);
2220 : }
2221 : }
2222 : else
2223 : {
2224 5173 : rtx clob;
2225 :
2226 : /* Changing of sign for FP values is doable using integer unit too. */
2227 5173 : clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2228 5173 : par = gen_rtvec (2, set, clob);
2229 : }
2230 :
2231 32875 : emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2232 32875 : }
2233 :
2234 : /* Deconstruct a floating point ABS or NEG operation
2235 : with integer registers into integer operations. OPERANDS[0] and OPERANDS[1] must match (asserted below); only SFmode, DFmode and XFmode are handled. */
2236 :
2237 : void
2238 24 : ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
2239 : rtx operands[])
2240 : {
2241 24 : enum rtx_code absneg_op;
2242 24 : rtx dst, set;
2243 :
2244 24 : gcc_assert (operands_match_p (operands[0], operands[1]));
2245 :
2246 24 : switch (mode)
2247 : {
2248 0 : case E_SFmode:
/* SFmode: operate on the SImode view; ABS clears bit 31 (AND 0x7fffffff),
   NEG flips it (XOR 0x80000000). */
2249 0 : dst = gen_lowpart (SImode, operands[0]);
2250 :
2251 0 : if (code == ABS)
2252 : {
2253 0 : set = gen_int_mode (0x7fffffff, SImode);
2254 0 : absneg_op = AND;
2255 : }
2256 : else
2257 : {
2258 0 : set = gen_int_mode (0x80000000, SImode);
2259 0 : absneg_op = XOR;
2260 : }
2261 0 : set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2262 0 : break;
2263 :
2264 1 : case E_DFmode:
2265 1 : if (TARGET_64BIT)
2266 : {
/* 64-bit: address bit 63 of the DImode view through a 1-bit ZERO_EXTRACT;
   ABS stores zero into it, NEG stores its complement. */
2267 1 : dst = gen_lowpart (DImode, operands[0]);
2268 1 : dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));
2269 :
2270 1 : if (code == ABS)
2271 0 : set = const0_rtx;
2272 : else
2273 1 : set = gen_rtx_NOT (DImode, dst);
2274 : }
2275 : else
2276 : {
/* 32-bit: same masks as SFmode, applied to the high SImode word. */
2277 0 : dst = gen_highpart (SImode, operands[0]);
2278 :
2279 0 : if (code == ABS)
2280 : {
2281 0 : set = gen_int_mode (0x7fffffff, SImode);
2282 0 : absneg_op = AND;
2283 : }
2284 : else
2285 : {
2286 0 : set = gen_int_mode (0x80000000, SImode);
2287 0 : absneg_op = XOR;
2288 : }
2289 0 : set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2290 : }
2291 : break;
2292 :
2293 23 : case E_XFmode:
/* XFmode: use the SImode hard register at REGNO + 1 (64-bit) or REGNO + 2
   (32-bit); ABS clears bit 15 (AND 0x7fff), NEG flips it (XOR 0x8000). */
2294 23 : dst = gen_rtx_REG (SImode,
2295 23 : REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
2296 23 : if (code == ABS)
2297 : {
2298 1 : set = GEN_INT (0x7fff);
2299 1 : absneg_op = AND;
2300 : }
2301 : else
2302 : {
2303 22 : set = GEN_INT (0x8000);
2304 22 : absneg_op = XOR;
2305 : }
2306 23 : set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
2307 23 : break;
2308 :
2309 0 : default:
2310 0 : gcc_unreachable ();
2311 : }
2312 :
/* Emit (parallel [(set DST SET) (clobber flags)]). */
2313 24 : set = gen_rtx_SET (dst, set);
2314 :
2315 24 : rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
2316 24 : rtvec par = gen_rtvec (2, set, clob);
2317 :
2318 24 : emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
2319 24 : }
2320 :
2321 : /* Expand a copysign operation: OPERANDS[0] = copysign (OPERANDS[1], OPERANDS[2]). Special cases: OPERANDS[1] equal to OPERANDS[2] (plain move) and OPERANDS[2], the sign source, being a CONST_DOUBLE. */
2322 :
2323 : void
2324 23236 : ix86_expand_copysign (rtx operands[])
2325 : {
2326 23236 : machine_mode mode, vmode;
2327 23236 : rtx dest, vdest, op0, op1, mask, op2, op3;
2328 :
2329 23236 : mode = GET_MODE (operands[0]);
2330 :
/* VMODE is the vector mode the bitwise operations are carried out in. */
2331 23236 : switch (mode)
2332 : {
2333 : case E_HFmode:
2334 : vmode = V8HFmode;
2335 : break;
2336 0 : case E_BFmode:
2337 0 : vmode = V8BFmode;
2338 0 : break;
2339 11562 : case E_SFmode:
2340 11562 : vmode = V4SFmode;
2341 11562 : break;
2342 11535 : case E_DFmode:
2343 11535 : vmode = V2DFmode;
2344 11535 : break;
2345 127 : case E_TFmode:
2346 127 : vmode = mode;
2347 127 : break;
2348 0 : default:
2349 0 : gcc_unreachable();
2350 : }
2351 :
/* copysign (x, x) is just x. */
2352 23236 : if (rtx_equal_p (operands[1], operands[2]))
2353 : {
2354 0 : emit_move_insn (operands[0], operands[1]);
2355 0 : return;
2356 : }
2357 :
/* Try to view the destination directly in VMODE. If that fails, compute
   into a fresh VMODE pseudo and keep DEST non-null so the result is copied
   back at the end; otherwise DEST is cleared and no copy is needed. */
2358 23236 : dest = operands[0];
2359 23236 : vdest = lowpart_subreg (vmode, dest, mode);
2360 23236 : if (vdest == NULL_RTX)
2361 0 : vdest = gen_reg_rtx (vmode);
2362 : else
2363 : dest = NULL_RTX;
/* Note the naming: OP1 is the magnitude source (operands[1]); OP0, set
   further below, is the sign source (operands[2]). */
2364 23236 : op1 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
2365 46458 : mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);
2366 :
2367 23236 : if (CONST_DOUBLE_P (operands[2]))
2368 : {
2369 79 : if (real_isneg (CONST_DOUBLE_REAL_VALUE (operands[2])))
2370 : /* Simplify b = copysign (a, negative) to b = mask | a. */
2371 76 : op1 = gen_rtx_IOR (vmode, mask, op1);
2372 : else
2373 : {
2374 : /* Simplify b = copysign (a, positive) to b = invert_mask & a. */
2375 3 : rtx invert_mask
2376 3 : = ix86_build_signbit_mask (vmode,
2377 3 : TARGET_AVX512F && mode != HFmode,
2378 : true);
2379 3 : op1 = gen_rtx_AND (vmode, invert_mask, op1);
2380 : }
2381 79 : emit_move_insn (vdest, op1);
2382 79 : if (dest)
2383 0 : emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2384 79 : return;
2385 : }
2386 : else
2387 23157 : op0 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);
2388 :
/* General case: vdest = (invert_mask & op1) | (mask & op0). */
2389 23157 : op2 = gen_reg_rtx (vmode);
2390 23157 : op3 = gen_reg_rtx (vmode);
2391 23157 : rtx invert_mask;
2392 : /* NB: Generate vmovdqa, vpandn, vpand, vpor for AVX and generate pand,
2393 : pand, por for SSE. */
2394 23157 : if (TARGET_AVX)
2395 31 : invert_mask = gen_rtx_NOT (vmode, mask);
2396 : else
2397 23126 : invert_mask = ix86_build_signbit_mask (vmode,
2398 23126 : TARGET_AVX512F && mode != HFmode,
2399 : true);
2400 23157 : emit_move_insn (op2, gen_rtx_AND (vmode, invert_mask, op1));
2401 23157 : emit_move_insn (op3, gen_rtx_AND (vmode, mask, op0));
2402 23157 : emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
2403 23157 : if (dest)
2404 0 : emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2405 : }
2406 :
2407 : /* Expand an xorsign operation: OPERANDS[0] = OPERANDS[1] ^ (OPERANDS[2] & sign-bit mask), computed in the vector mode matching the scalar mode. */
2408 :
2409 : void
2410 20 : ix86_expand_xorsign (rtx operands[])
2411 : {
2412 20 : machine_mode mode, vmode;
2413 20 : rtx dest, vdest, op0, op1, mask, x, temp;
2414 :
2415 20 : dest = operands[0];
2416 20 : op0 = operands[1];
2417 20 : op1 = operands[2];
2418 :
2419 20 : mode = GET_MODE (dest);
2420 :
/* Only HFmode, BFmode, SFmode and DFmode are supported. */
2421 20 : switch (mode)
2422 : {
2423 : case E_HFmode:
2424 : vmode = V8HFmode;
2425 : break;
2426 : case E_BFmode:
2427 : vmode = V8BFmode;
2428 : break;
2429 : case E_SFmode:
2430 : vmode = V4SFmode;
2431 : break;
2432 : case E_DFmode:
2433 : vmode = V2DFmode;
2434 : break;
2435 0 : default:
2436 0 : gcc_unreachable ();
2437 20 : break;
2438 : }
2439 :
2440 20 : temp = gen_reg_rtx (vmode);
2441 20 : mask = ix86_build_signbit_mask (vmode, 0, 0);
2442 :
/* temp = op1 & mask. */
2443 20 : op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
2444 20 : x = gen_rtx_AND (vmode, op1, mask);
2445 20 : emit_insn (gen_rtx_SET (temp, x));
2446 :
/* x = temp ^ op0. */
2447 20 : op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
2448 20 : x = gen_rtx_XOR (vmode, temp, op0);
2449 :
/* Write straight into a VMODE view of DEST when one exists; otherwise use
   a fresh pseudo and copy back below (DEST stays non-null in that case). */
2450 20 : vdest = lowpart_subreg (vmode, dest, mode);
2451 20 : if (vdest == NULL_RTX)
2452 0 : vdest = gen_reg_rtx (vmode);
2453 : else
2454 : dest = NULL_RTX;
2455 20 : emit_insn (gen_rtx_SET (vdest, x));
2456 :
2457 20 : if (dest)
2458 0 : emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
2459 20 : }
2460 :
2461 : static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2462 :
/* Emit a conditional jump to LABEL taken when comparison CODE holds between
   OP0 and OP1. Three cases are distinguished on the mode of OP0: vector and
   other wide modes (EQ/NE only, via ptest or vpcmpeq + kortest), double-word
   integers that must be split into word-sized compares, and everything else,
   which goes through ix86_expand_compare. */
2463 : void
2464 6622072 : ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
2465 : {
2466 6622072 : machine_mode mode = GET_MODE (op0);
2467 6622072 : rtx tmp;
2468 :
2469 : /* Handle special case - vector comparison with boolean result, transform
2470 : it using ptest instruction or vpcmpeq + kortest. */
2471 6622072 : if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
2472 6602690 : || (mode == TImode && !TARGET_64BIT)
2473 6602690 : || mode == OImode
2474 13224762 : || GET_MODE_SIZE (mode) == 64)
2475 : {
2476 19382 : unsigned msize = GET_MODE_SIZE (mode);
/* P_MODE is the mode the comparison is carried out in, chosen by size. */
2477 19382 : machine_mode p_mode
2478 19382 : = msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
2479 : /* kortest set CF when result is 0xFFFF (op0 == op1). */
2480 19382 : rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);
2481 :
2482 19382 : gcc_assert (code == EQ || code == NE);
2483 :
2484 : /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors. */
2485 19382 : if (msize == 64)
2486 : {
2487 2435 : if (mode != V16SImode)
2488 : {
2489 2435 : op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
2490 2435 : op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
2491 : }
2492 :
2493 2435 : tmp = gen_reg_rtx (HImode);
2494 2435 : emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
2495 2435 : emit_insn (gen_kortesthi_ccc (tmp, tmp));
2496 : }
2497 : /* Using ptest for 128/256-bit vectors. */
2498 : else
2499 : {
2500 16947 : if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
2501 : {
2502 0 : op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
2503 0 : op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
2504 0 : mode = p_mode;
2505 : }
2506 :
2507 : /* Generate XOR since we can't check that one operand is zero
2508 : vector. */
2509 16947 : tmp = gen_reg_rtx (mode);
2510 16947 : rtx ops[3] = { tmp, op0, op1 };
2511 16947 : ix86_expand_vector_logical_operator (XOR, mode, ops);
2512 16947 : tmp = gen_lowpart (p_mode, tmp);
2513 16947 : emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
2514 : gen_rtx_UNSPEC (CCZmode,
2515 : gen_rtvec (2, tmp, tmp),
2516 : UNSPEC_PTEST)));
2517 : }
/* Jump on CODE applied to the flags register set above. */
2518 19382 : tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
2519 19382 : tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2520 : gen_rtx_LABEL_REF (VOIDmode, label),
2521 : pc_rtx);
2522 19382 : emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2523 19382 : return;
2524 : }
2525 :
2526 6602690 : switch (mode)
2527 : {
2528 6571988 : case E_HFmode:
2529 6571988 : case E_SFmode:
2530 6571988 : case E_DFmode:
2531 6571988 : case E_XFmode:
2532 6571988 : case E_QImode:
2533 6571988 : case E_HImode:
2534 6571988 : case E_SImode:
/* Common path, also reached by goto from the cases below: one compare
   followed by one conditional jump. */
2535 6571988 : simple:
2536 6571988 : tmp = ix86_expand_compare (code, op0, op1);
2537 6571988 : tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
2538 : gen_rtx_LABEL_REF (VOIDmode, label),
2539 : pc_rtx);
2540 6571988 : emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
2541 6571988 : return;
2542 :
2543 7 : case E_BFmode:
2544 7 : gcc_assert (TARGET_AVX10_2 && !flag_trapping_math);
2545 7 : goto simple;
2546 :
2547 2661120 : case E_DImode:
2548 2661120 : if (TARGET_64BIT)
2549 2632427 : goto simple;
2550 : /* FALLTHRU */
2551 88804 : case E_TImode:
2552 : /* DI and TI mode equality/inequality comparisons may be performed
2553 : on SSE registers. Avoid splitting them, except when optimizing
2554 : for size. */
2555 88804 : if ((code == EQ || code == NE)
2556 88804 : && !optimize_insn_for_size_p ())
2557 58102 : goto simple;
2558 :
2559 : /* Expand DImode branch into multiple compare+branch. */
2560 30702 : {
2561 30702 : rtx lo[2], hi[2];
2562 30702 : rtx_code_label *label2;
2563 30702 : enum rtx_code code1, code2, code3;
2564 30702 : machine_mode submode;
2565 :
/* Keep a constant, if any, as the second operand. */
2566 30702 : if (CONSTANT_P (op0) && !CONSTANT_P (op1))
2567 : {
2568 0 : std::swap (op0, op1);
2569 0 : code = swap_condition (code);
2570 : }
2571 :
/* Index 0 of LO/HI holds the halves of OP0, index 1 those of OP1. */
2572 30702 : split_double_mode (mode, &op0, 1, lo+0, hi+0);
2573 30702 : split_double_mode (mode, &op1, 1, lo+1, hi+1);
2574 :
2575 30702 : submode = mode == DImode ? SImode : DImode;
2576 :
2577 : /* If we are doing less-than or greater-or-equal-than,
2578 : op1 is a constant and the low word is zero, then we can just
2579 : examine the high word. Similarly for low word -1 and
2580 : less-or-equal-than or greater-than. */
2581 :
2582 30702 : if (CONST_INT_P (hi[1]))
2583 19772 : switch (code)
2584 : {
2585 10561 : case LT: case LTU: case GE: case GEU:
2586 10561 : if (lo[1] == const0_rtx)
2587 : {
2588 10150 : ix86_expand_branch (code, hi[0], hi[1], label);
2589 10150 : return;
2590 : }
2591 : break;
2592 7643 : case LE: case LEU: case GT: case GTU:
2593 7643 : if (lo[1] == constm1_rtx)
2594 : {
2595 524 : ix86_expand_branch (code, hi[0], hi[1], label);
2596 524 : return;
2597 : }
2598 : break;
2599 : default:
2600 : break;
2601 : }
2602 :
2603 : /* Emulate comparisons that do not depend on Zero flag with
2604 : double-word subtraction. Note that only Overflow, Sign
2605 : and Carry flags are valid, so swap arguments and condition
2606 : of comparisons that would otherwise test Zero flag. */
2607 :
2608 20028 : switch (code)
2609 : {
2610 12581 : case LE: case LEU: case GT: case GTU:
2611 12581 : std::swap (lo[0], lo[1]);
2612 12581 : std::swap (hi[0], hi[1]);
2613 12581 : code = swap_condition (code);
2614 : /* FALLTHRU */
2615 :
2616 16961 : case LT: case LTU: case GE: case GEU:
2617 16961 : {
/* Unsigned codes use the carry-only pattern and CCCmode; signed codes use
   the CCGZ pattern and CCGZmode. */
2618 16961 : bool uns = (code == LTU || code == GEU);
2619 3981 : rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
2620 16961 : = uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;
2621 :
/* Force operands that fail the predicates checked here into registers. */
2622 16961 : if (!nonimmediate_operand (lo[0], submode))
2623 7119 : lo[0] = force_reg (submode, lo[0]);
2624 16961 : if (!x86_64_general_operand (lo[1], submode))
2625 0 : lo[1] = force_reg (submode, lo[1]);
2626 :
2627 16961 : if (!register_operand (hi[0], submode))
2628 7941 : hi[0] = force_reg (submode, hi[0]);
2629 12980 : if ((uns && !nonimmediate_operand (hi[1], submode))
2630 16961 : || (!uns && !x86_64_general_operand (hi[1], submode)))
2631 315 : hi[1] = force_reg (submode, hi[1]);
2632 :
/* Compare the low words, then subtract the high words with borrow into a
   scratch; only the resulting flags are used. */
2633 16961 : emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));
2634 :
2635 16961 : tmp = gen_rtx_SCRATCH (submode);
2636 16961 : emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));
2637 :
/* Recurse with the flags register as OP0; its MODE_CC mode is handled by
   the default case of the outer switch. */
2638 20942 : tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
2639 16961 : ix86_expand_branch (code, tmp, const0_rtx, label);
2640 16961 : return;
2641 : }
2642 :
2643 3067 : default:
2644 3067 : break;
2645 : }
2646 :
2647 : /* Otherwise, we need two or three jumps. */
2648 :
2649 3067 : label2 = gen_label_rtx ();
2650 :
/* CODE1 branches to LABEL on the high words, CODE2 branches to LABEL2 (the
   fall-through point) on the high words, CODE3 decides on the low words
   using the unsigned form of CODE. UNKNOWN means "emit no jump". */
2651 3067 : code1 = code;
2652 3067 : code2 = swap_condition (code);
2653 3067 : code3 = unsigned_condition (code);
2654 :
2655 3067 : switch (code)
2656 : {
2657 : case LT: case GT: case LTU: case GTU:
2658 : break;
2659 :
2660 : case LE: code1 = LT; code2 = GT; break;
2661 : case GE: code1 = GT; code2 = LT; break;
2662 0 : case LEU: code1 = LTU; code2 = GTU; break;
2663 0 : case GEU: code1 = GTU; code2 = LTU; break;
2664 :
2665 : case EQ: code1 = UNKNOWN; code2 = NE; break;
2666 : case NE: code2 = UNKNOWN; break;
2667 :
2668 0 : default:
2669 0 : gcc_unreachable ();
2670 : }
2671 :
2672 : /*
2673 : * a < b =>
2674 : * if (hi(a) < hi(b)) goto true;
2675 : * if (hi(a) > hi(b)) goto false;
2676 : * if (lo(a) < lo(b)) goto true;
2677 : * false:
2678 : */
2679 :
2680 0 : if (code1 != UNKNOWN)
2681 2399 : ix86_expand_branch (code1, hi[0], hi[1], label);
2682 3067 : if (code2 != UNKNOWN)
2683 668 : ix86_expand_branch (code2, hi[0], hi[1], label2);
2684 :
2685 3067 : ix86_expand_branch (code3, lo[0], lo[1], label);
2686 :
2687 3067 : if (code2 != UNKNOWN)
2688 668 : emit_label (label2);
2689 : return;
2690 : }
2691 :
2692 17446 : default:
/* Anything else must be a condition-code mode. */
2693 17446 : gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
2694 17446 : goto simple;
2695 : }
2696 : }
2697 :
2698 : /* Figure out whether to use unordered fp comparisons. Always false without TARGET_IEEE_FP; otherwise false for LT, LE, GT, GE and LTGT, and true for EQ, NE, UNORDERED, ORDERED and the UNxx codes listed below. Any other code is unreachable. */
2699 :
2700 : static bool
2701 1149120 : ix86_unordered_fp_compare (enum rtx_code code)
2702 : {
2703 1149120 : if (!TARGET_IEEE_FP)
2704 : return false;
2705 :
2706 1144806 : switch (code)
2707 : {
2708 : case LT:
2709 : case LE:
2710 : case GT:
2711 : case GE:
2712 : case LTGT:
2713 : return false;
2714 :
2715 : case EQ:
2716 : case NE:
2717 :
2718 : case UNORDERED:
2719 : case ORDERED:
2720 : case UNLT:
2721 : case UNLE:
2722 : case UNGT:
2723 : case UNGE:
2724 : case UNEQ:
2725 : return true;
2726 :
2727 0 : default:
2728 0 : gcc_unreachable ();
2729 : }
2730 : }
2731 :
2732 : /* Return a comparison that we can do and that is equivalent to
2733 : swap_condition (code) apart possibly from orderedness.
2734 : But, never change orderedness if TARGET_IEEE_FP, returning
2735 : UNKNOWN in that case if necessary. Only GT, GE, UNLT and UNLE are treated specially; every other code is passed to swap_condition. */
2736 :
2737 : static enum rtx_code
2738 37559 : ix86_fp_swap_condition (enum rtx_code code)
2739 : {
2740 37559 : switch (code)
2741 : {
2742 1859 : case GT: /* GTU - CF=0 & ZF=0 */
2743 1859 : return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2744 533 : case GE: /* GEU - CF=0 */
2745 533 : return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2746 446 : case UNLT: /* LTU - CF=1 */
2747 446 : return TARGET_IEEE_FP ? UNKNOWN : GT;
2748 6315 : case UNLE: /* LEU - CF=1 | ZF=1 */
2749 6315 : return TARGET_IEEE_FP ? UNKNOWN : GE;
2750 28406 : default:
2751 28406 : return swap_condition (code);
2752 : }
2753 : }
2754 :
2755 : /* Return cost of comparison CODE using the best strategy for performance.
2756 : All following functions use the number of instructions as the cost metric.
2757 : In the future this should be tweaked to compute bytes for optimize_size and
2758 : take into account performance of various instructions on various CPUs. */
2759 :
2760 : static int
2761 1147986 : ix86_fp_comparison_cost (enum rtx_code code)
2762 : {
2763 1147986 : int arith_cost;
2764 :
2765 : /* The cost of code using bit-twiddling on %ah. */
2766 1147986 : switch (code)
2767 : {
2768 : case UNLE:
2769 : case UNLT:
2770 : case LTGT:
2771 : case GT:
2772 : case GE:
2773 : case UNORDERED:
2774 : case ORDERED:
2775 : case UNEQ:
2776 : arith_cost = 4;
2777 : break;
2778 85018 : case LT:
2779 85018 : case NE:
2780 85018 : case EQ:
2781 85018 : case UNGE:
2782 85018 : arith_cost = TARGET_IEEE_FP ? 5 : 4;
2783 : break;
2784 24753 : case LE:
2785 24753 : case UNGT:
2786 1063762 : arith_cost = TARGET_IEEE_FP ? 6 : 4;
2787 : break;
2788 0 : default:
2789 0 : gcc_unreachable ();
2790 : }
2791 :
/* The COMI and SAHF strategies have fixed costs that are one higher when
   the arithmetic cost exceeds 4; any other strategy returns the arithmetic
   cost computed above. */
2792 1147986 : switch (ix86_fp_comparison_strategy (code))
2793 : {
2794 1147986 : case IX86_FPCMP_COMI:
2795 1147986 : return arith_cost > 4 ? 3 : 2;
2796 0 : case IX86_FPCMP_SAHF:
2797 0 : return arith_cost > 4 ? 4 : 3;
2798 : default:
2799 : return arith_cost;
2800 : }
2801 : }
2802 :
2803 : /* Swap, force into registers, or otherwise massage the two operands
2804 : to a fp comparison. The operands are updated in place through POP0 and POP1; the new
2805 : comparison code (possibly swapped from CODE) is returned. */
2806 :
2807 : static enum rtx_code
2808 573993 : ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
2809 : {
2810 574064 : bool unordered_compare = ix86_unordered_fp_compare (code);
2811 574064 : rtx op0 = *pop0, op1 = *pop1;
2812 574064 : machine_mode op_mode = GET_MODE (op0);
2813 574064 : bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (op_mode);
2814 :
/* BFmode operands, unless TARGET_AVX10_2 is set and flag_trapping_math is
   clear, are widened to SFmode and the function recurses on the widened
   pair. Constants are extended via simplify_const_unary_operation; other
   values have their HImode bits zero-extended to SImode, shifted left by 16,
   and reinterpreted as SFmode. */
2815 571675 : if (op_mode == BFmode && (!TARGET_AVX10_2 || flag_trapping_math))
2816 : {
2817 71 : rtx op = gen_lowpart (HImode, op0);
2818 71 : if (CONST_INT_P (op))
2819 0 : op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2820 : op0, BFmode);
2821 : else
2822 : {
2823 71 : rtx t1 = gen_reg_rtx (SImode);
2824 71 : emit_insn (gen_zero_extendhisi2 (t1, op));
2825 71 : emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2826 71 : op = gen_lowpart (SFmode, t1);
2827 : }
2828 71 : *pop0 = op;
2829 71 : op = gen_lowpart (HImode, op1);
2830 71 : if (CONST_INT_P (op))
2831 6 : op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
2832 : op1, BFmode);
2833 : else
2834 : {
2835 65 : rtx t1 = gen_reg_rtx (SImode);
2836 65 : emit_insn (gen_zero_extendhisi2 (t1, op));
2837 65 : emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
2838 65 : op = gen_lowpart (SFmode, t1);
2839 : }
2840 71 : *pop1 = op;
2841 71 : return ix86_prepare_fp_compare_args (code, pop0, pop1);
2842 : }
2843 :
2844 : /* All of the unordered compare instructions only work on registers.
2845 : The same is true of the fcomi compare instructions. The XFmode
2846 : compare instructions require registers except when comparing
2847 : against zero or when converting operand 1 from fixed point to
2848 : floating point. */
2849 :
2850 573993 : if (!is_sse
2851 573993 : && (unordered_compare
2852 8247 : || (op_mode == XFmode
2853 10627 : && ! (standard_80387_constant_p (op0) == 1
2854 5311 : || standard_80387_constant_p (op1) == 1)
2855 4877 : && GET_CODE (op1) != FLOAT)
2856 3370 : || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
2857 : {
2858 147804 : op0 = force_reg (op_mode, op0);
2859 147804 : op1 = force_reg (op_mode, op1);
2860 : }
2861 : else
2862 : {
2863 : /* %%% We only allow op1 in memory; op0 must be st(0). So swap
2864 : things around if they appear profitable, otherwise force op0
2865 : into a register. */
2866 :
2867 426189 : if (standard_80387_constant_p (op0) == 0
2868 426189 : || (MEM_P (op0)
2869 56466 : && ! (standard_80387_constant_p (op1) == 0
2870 41169 : || MEM_P (op1))))
2871 : {
/* Swap only when ix86_fp_swap_condition finds a usable swapped code. */
2872 37559 : enum rtx_code new_code = ix86_fp_swap_condition (code);
2873 37559 : if (new_code != UNKNOWN)
2874 : {
2875 : std::swap (op0, op1);
2876 426189 : code = new_code;
2877 : }
2878 : }
2879 :
2880 426189 : if (!REG_P (op0))
2881 52773 : op0 = force_reg (op_mode, op0);
2882 :
/* A constant OP1 goes to the constant pool when standard_80387_constant_p
   returns 0, into a register when it returns 1 and TARGET_CMOVE is set
   (otherwise it is left as is), and into a register for any other result. */
2883 426189 : if (CONSTANT_P (op1))
2884 : {
2885 193768 : int tmp = standard_80387_constant_p (op1);
2886 193768 : if (tmp == 0)
2887 74267 : op1 = validize_mem (force_const_mem (op_mode, op1));
2888 119501 : else if (tmp == 1)
2889 : {
2890 65397 : if (TARGET_CMOVE)
2891 65397 : op1 = force_reg (op_mode, op1);
2892 : }
2893 : else
2894 54104 : op1 = force_reg (op_mode, op1);
2895 : }
2896 : }
2897 :
2898 : /* Try to rearrange the comparison to make it cheaper. */
2899 573993 : if (ix86_fp_comparison_cost (code)
2900 573993 : > ix86_fp_comparison_cost (swap_condition (code))
2901 573993 : && (REG_P (op1) || can_create_pseudo_p ()))
2902 : {
2903 0 : std::swap (op0, op1);
2904 0 : code = swap_condition (code);
2905 0 : if (!REG_P (op0))
2906 0 : op0 = force_reg (op_mode, op0);
2907 : }
2908 :
2909 573993 : *pop0 = op0;
2910 573993 : *pop1 = op1;
2911 573993 : return code;
2912 : }
2913 :
2914 : /* Generate insn patterns to do a floating point compare of OPERANDS. */
2915 :
2916 : static rtx
2917 573993 : ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
2918 : {
2919 573993 : bool unordered_compare = ix86_unordered_fp_compare (code);
2920 573993 : machine_mode cmp_mode;
2921 573993 : rtx tmp, scratch;
2922 :
2923 573993 : code = ix86_prepare_fp_compare_args (code, &op0, &op1);
2924 :
2925 573993 : tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2926 573993 : if (unordered_compare)
2927 498425 : tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2928 :
2929 : /* Do fcomi/sahf based test when profitable. */
2930 573993 : switch (ix86_fp_comparison_strategy (code))
2931 : {
2932 573993 : case IX86_FPCMP_COMI:
2933 573993 : tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
2934 : /* We only have vcomisbf16, No vcomubf16 nor vcomxbf16 */
2935 573993 : if (GET_MODE (op0) != E_BFmode)
2936 : {
2937 573965 : if (TARGET_AVX10_2 && (code == EQ || code == NE))
2938 972 : tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_OPTCOMX);
2939 573965 : if (unordered_compare)
2940 498417 : tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
2941 : }
2942 573993 : cmp_mode = CCFPmode;
2943 573993 : emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
2944 573993 : break;
2945 :
2946 0 : case IX86_FPCMP_SAHF:
2947 0 : cmp_mode = CCFPmode;
2948 0 : tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2949 0 : scratch = gen_reg_rtx (HImode);
2950 0 : emit_insn (gen_rtx_SET (scratch, tmp));
2951 0 : emit_insn (gen_x86_sahf_1 (scratch));
2952 0 : break;
2953 :
2954 0 : case IX86_FPCMP_ARITH:
2955 0 : cmp_mode = CCNOmode;
2956 0 : tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
2957 0 : scratch = gen_reg_rtx (HImode);
2958 0 : emit_insn (gen_rtx_SET (scratch, tmp));
2959 :
2960 : /* In the unordered case, we have to check C2 for NaN's, which
2961 : doesn't happen to work out to anything nice combination-wise.
2962 : So do some bit twiddling on the value we've got in AH to come
2963 : up with an appropriate set of condition codes. */
2964 :
2965 0 : switch (code)
2966 : {
2967 0 : case GT:
2968 0 : case UNGT:
2969 0 : if (code == GT || !TARGET_IEEE_FP)
2970 : {
2971 0 : emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
2972 0 : code = EQ;
2973 : }
2974 : else
2975 : {
2976 0 : emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2977 0 : emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
2978 0 : emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
2979 0 : cmp_mode = CCmode;
2980 0 : code = GEU;
2981 : }
2982 : break;
2983 0 : case LT:
2984 0 : case UNLT:
2985 0 : if (code == LT && TARGET_IEEE_FP)
2986 : {
2987 0 : emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
2988 0 : emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
2989 0 : cmp_mode = CCmode;
2990 0 : code = EQ;
2991 : }
2992 : else
2993 : {
2994 0 : emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
2995 0 : code = NE;
2996 : }
2997 : break;
2998 0 : case GE:
2999 0 : case UNGE:
3000 0 : if (code == GE || !TARGET_IEEE_FP)
3001 : {
3002 0 : emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
3003 0 : code = EQ;
3004 : }
3005 : else
3006 : {
3007 0 : emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
3008 0 : emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
3009 0 : code = NE;
3010 : }
3011 : break;
3012 0 : case LE:
3013 0 : case UNLE:
3014 0 : if (code == LE && TARGET_IEEE_FP)
3015 : {
3016 0 : emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
3017 0 : emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
3018 0 : emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
3019 0 : cmp_mode = CCmode;
3020 0 : code = LTU;
3021 : }
3022 : else
3023 : {
3024 0 : emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
3025 0 : code = NE;
3026 : }
3027 : break;
3028 0 : case EQ:
3029 0 : case UNEQ:
3030 0 : if (code == EQ && TARGET_IEEE_FP)
3031 : {
3032 0 : emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
3033 0 : emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
3034 0 : cmp_mode = CCmode;
3035 0 : code = EQ;
3036 : }
3037 : else
3038 : {
3039 0 : emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
3040 0 : code = NE;
3041 : }
3042 : break;
3043 0 : case NE:
3044 0 : case LTGT:
3045 0 : if (code == NE && TARGET_IEEE_FP)
3046 : {
3047 0 : emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
3048 0 : emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
3049 : GEN_INT (0x40)));
3050 0 : code = NE;
3051 : }
3052 : else
3053 : {
3054 0 : emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
3055 0 : code = EQ;
3056 : }
3057 : break;
3058 :
3059 0 : case UNORDERED:
3060 0 : emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
3061 0 : code = NE;
3062 0 : break;
3063 0 : case ORDERED:
3064 0 : emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
3065 0 : code = EQ;
3066 0 : break;
3067 :
3068 0 : default:
3069 0 : gcc_unreachable ();
3070 : }
3071 : break;
3072 :
3073 0 : default:
3074 0 : gcc_unreachable();
3075 : }
3076 :
3077 : /* Return the test that should be put into the flags user, i.e.
3078 : the bcc, scc, or cmov instruction. */
3079 573993 : return gen_rtx_fmt_ee (code, VOIDmode,
3080 : gen_rtx_REG (cmp_mode, FLAGS_REG),
3081 : const0_rtx);
3082 : }
3083 :
3084 : /* Generate insn patterns to do an integer compare of OPERANDS. */
3085 :
3086 : static rtx
3087 6950446 : ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
3088 : {
3089 6950446 : machine_mode cmpmode;
3090 6950446 : rtx tmp, flags;
3091 :
3092 : /* Swap operands to emit carry flag comparison. */
3093 6950446 : if ((code == GTU || code == LEU)
3094 6950446 : && nonimmediate_operand (op1, VOIDmode))
3095 : {
3096 142790 : std::swap (op0, op1);
3097 142790 : code = swap_condition (code);
3098 : }
3099 :
3100 6950446 : cmpmode = SELECT_CC_MODE (code, op0, op1);
3101 6950446 : flags = gen_rtx_REG (cmpmode, FLAGS_REG);
3102 :
3103 : /* Attempt to use PTEST, if available, when testing vector modes for
3104 : equality/inequality against zero. */
3105 6950446 : if (op1 == const0_rtx
3106 2914495 : && SUBREG_P (op0)
3107 22876 : && cmpmode == CCZmode
3108 10381 : && SUBREG_BYTE (op0) == 0
3109 8697 : && REG_P (SUBREG_REG (op0))
3110 8697 : && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
3111 8 : && TARGET_SSE4_1
3112 2 : && GET_MODE (op0) == TImode
3113 6950450 : && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
3114 : {
3115 2 : tmp = SUBREG_REG (op0);
3116 2 : if (GET_MODE (tmp) == V8HFmode || GET_MODE (tmp) == V8BFmode)
3117 1 : tmp = gen_lowpart (V8HImode, tmp);
3118 2 : tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
3119 : }
3120 : else
3121 6950444 : tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
3122 :
3123 : /* This is very simple, but making the interface the same as in the
3124 : FP case makes the rest of the code easier. */
3125 6950446 : emit_insn (gen_rtx_SET (flags, tmp));
3126 :
3127 : /* Return the test that should be put into the flags user, i.e.
3128 : the bcc, scc, or cmov instruction. */
3129 6950446 : return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
3130 : }
3131 :
3132 : static rtx
3133 7654540 : ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
3134 : {
3135 7654540 : rtx ret;
3136 :
3137 7654540 : if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
3138 132221 : ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
3139 :
3140 7522319 : else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
3141 : {
3142 571873 : gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
3143 571873 : ret = ix86_expand_fp_compare (code, op0, op1);
3144 : }
3145 : else
3146 6950446 : ret = ix86_expand_int_compare (code, op0, op1);
3147 :
3148 7654540 : return ret;
3149 : }
3150 :
3151 : void
3152 587993 : ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
3153 : {
3154 587993 : rtx ret;
3155 :
3156 587993 : gcc_assert (GET_MODE (dest) == QImode);
3157 :
3158 587993 : ret = ix86_expand_compare (code, op0, op1);
3159 587993 : PUT_MODE (ret, QImode);
3160 587993 : emit_insn (gen_rtx_SET (dest, ret));
3161 587993 : }
3162 :
3163 : /* Expand floating point op0 <=> op1, i.e.
3164 : dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : -128. */
3165 :
3166 : void
3167 244 : ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
3168 : {
3169 244 : gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
3170 244 : rtx zero = NULL_RTX;
3171 244 : if (op2 != const0_rtx
3172 52 : && (TARGET_IEEE_FP || TARGET_ZERO_EXTEND_WITH_AND)
3173 34 : && GET_MODE (dest) == SImode)
3174 34 : zero = force_reg (SImode, const0_rtx);
3175 244 : rtx gt = ix86_expand_fp_compare (GT, op0, op1);
3176 244 : rtx l0 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
3177 244 : rtx l1 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
3178 244 : rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
3179 244 : rtx lend = gen_label_rtx ();
3180 244 : rtx tmp;
3181 244 : rtx_insn *jmp;
3182 244 : if (l2)
3183 : {
3184 207 : rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
3185 : gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3186 207 : tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
3187 : gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
3188 207 : jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3189 207 : add_reg_br_prob_note (jmp, profile_probability:: very_unlikely ());
3190 : }
3191 244 : if (op2 == const0_rtx)
3192 : {
3193 192 : rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
3194 : gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
3195 192 : tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
3196 : gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
3197 192 : jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3198 192 : add_reg_br_prob_note (jmp, profile_probability::unlikely ());
3199 192 : tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
3200 : gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
3201 192 : jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
3202 192 : add_reg_br_prob_note (jmp, profile_probability::even ());
3203 192 : emit_move_insn (dest, constm1_rtx);
3204 192 : emit_jump (lend);
3205 192 : emit_label (l0);
3206 192 : emit_move_insn (dest, const0_rtx);
3207 192 : emit_jump (lend);
3208 192 : emit_label (l1);
3209 192 : emit_move_insn (dest, const1_rtx);
3210 : }
3211 : else
3212 : {
3213 52 : rtx lt_tmp = NULL_RTX;
3214 52 : if (GET_MODE (dest) != SImode || !TARGET_ZERO_EXTEND_WITH_AND)
3215 : {
3216 52 : lt_tmp = gen_reg_rtx (QImode);
3217 52 : ix86_expand_setcc (lt_tmp, UNLT, gen_rtx_REG (CCFPmode, FLAGS_REG),
3218 : const0_rtx);
3219 52 : if (GET_MODE (dest) != QImode)
3220 : {
3221 52 : tmp = gen_reg_rtx (GET_MODE (dest));
3222 52 : emit_insn (gen_rtx_SET (tmp,
3223 : gen_rtx_ZERO_EXTEND (GET_MODE (dest),
3224 : lt_tmp)));
3225 52 : lt_tmp = tmp;
3226 : }
3227 : }
3228 52 : rtx gt_tmp;
3229 52 : if (zero)
3230 : {
3231 : /* If TARGET_IEEE_FP and dest has SImode, emit SImode clear
3232 : before the floating point comparison and use setcc_si_slp
3233 : pattern to hide it from the combiner, so that it doesn't
3234 : undo it. Similarly for TARGET_ZERO_EXTEND_WITH_AND, where
3235 : the ZERO_EXTEND normally emitted would need to be AND
3236 : with flags clobber. */
3237 34 : tmp = ix86_expand_compare (GT, XEXP (gt, 0), const0_rtx);
3238 34 : PUT_MODE (tmp, QImode);
3239 34 : emit_insn (gen_setcc_si_slp (zero, tmp, zero));
3240 34 : gt_tmp = zero;
3241 : }
3242 : else
3243 : {
3244 18 : gt_tmp = gen_reg_rtx (QImode);
3245 18 : ix86_expand_setcc (gt_tmp, GT, XEXP (gt, 0), const0_rtx);
3246 18 : if (GET_MODE (dest) != QImode)
3247 : {
3248 18 : tmp = gen_reg_rtx (GET_MODE (dest));
3249 18 : emit_insn (gen_rtx_SET (tmp,
3250 : gen_rtx_ZERO_EXTEND (GET_MODE (dest),
3251 : gt_tmp)));
3252 18 : gt_tmp = tmp;
3253 : }
3254 : }
3255 52 : if (lt_tmp)
3256 : {
3257 52 : tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp,
3258 : dest, 0, OPTAB_DIRECT);
3259 52 : if (!rtx_equal_p (tmp, dest))
3260 0 : emit_move_insn (dest, tmp);
3261 : }
3262 : else
3263 : {
3264 : /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't
3265 : do ZERO_EXTEND without clobbering flags. */
3266 0 : tmp = ix86_expand_compare (UNLT, XEXP (gt, 0), const0_rtx);
3267 0 : PUT_MODE (tmp, SImode);
3268 0 : emit_insn (gen_subsi3_carry (dest, gt_tmp,
3269 0 : force_reg (GET_MODE (dest), const0_rtx),
3270 : XEXP (gt, 0), tmp));
3271 : }
3272 : }
3273 244 : emit_jump (lend);
3274 244 : if (l2)
3275 : {
3276 207 : emit_label (l2);
3277 207 : emit_move_insn (dest, op2 == const0_rtx ? GEN_INT (-128) : op2);
3278 : }
3279 244 : emit_label (lend);
3280 244 : }
3281 :
3282 : /* Expand integral op0 <=> op1, i.e.
3283 : dest = op0 == op1 ? 0 : op0 < op1 ? -1 : 1. */
3284 :
3285 : void
3286 35 : ix86_expand_int_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
3287 : {
3288 35 : gcc_assert (INTVAL (op2));
3289 35 : rtx zero1 = NULL_RTX, zero2 = NULL_RTX;
3290 35 : if (TARGET_ZERO_EXTEND_WITH_AND && GET_MODE (dest) == SImode)
3291 : {
3292 0 : zero1 = force_reg (SImode, const0_rtx);
3293 0 : if (INTVAL (op2) != 1)
3294 0 : zero2 = force_reg (SImode, const0_rtx);
3295 : }
3296 :
3297 : /* Not using ix86_expand_int_compare here, so that it doesn't swap
3298 : operands nor optimize CC mode - we need a mode usable for both
3299 : LT and GT resp. LTU and GTU comparisons with the same unswapped
3300 : operands. */
3301 51 : rtx flags = gen_rtx_REG (INTVAL (op2) != 1 ? CCGCmode : CCmode, FLAGS_REG);
3302 35 : rtx tmp = gen_rtx_COMPARE (GET_MODE (flags), op0, op1);
3303 35 : emit_insn (gen_rtx_SET (flags, tmp));
3304 35 : rtx lt_tmp = NULL_RTX;
3305 35 : if (zero2)
3306 : {
3307 : /* For TARGET_ZERO_EXTEND_WITH_AND, emit setcc_si_slp to avoid
3308 : ZERO_EXTEND. */
3309 0 : tmp = ix86_expand_compare (LT, flags, const0_rtx);
3310 0 : PUT_MODE (tmp, QImode);
3311 0 : emit_insn (gen_setcc_si_slp (zero2, tmp, zero2));
3312 0 : lt_tmp = zero2;
3313 : }
3314 35 : else if (!zero1)
3315 : {
3316 35 : lt_tmp = gen_reg_rtx (QImode);
3317 51 : ix86_expand_setcc (lt_tmp, INTVAL (op2) != 1 ? LT : LTU, flags,
3318 : const0_rtx);
3319 35 : if (GET_MODE (dest) != QImode)
3320 : {
3321 35 : tmp = gen_reg_rtx (GET_MODE (dest));
3322 35 : emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
3323 : lt_tmp)));
3324 35 : lt_tmp = tmp;
3325 : }
3326 : }
3327 35 : rtx gt_tmp;
3328 35 : if (zero1)
3329 : {
3330 : /* For TARGET_ZERO_EXTEND_WITH_AND, emit setcc_si_slp to avoid
3331 : ZERO_EXTEND. */
3332 0 : tmp = ix86_expand_compare (INTVAL (op2) != 1 ? GT : GTU, flags,
3333 : const0_rtx);
3334 0 : PUT_MODE (tmp, QImode);
3335 0 : emit_insn (gen_setcc_si_slp (zero1, tmp, zero1));
3336 0 : gt_tmp = zero1;
3337 : }
3338 : else
3339 : {
3340 35 : gt_tmp = gen_reg_rtx (QImode);
3341 51 : ix86_expand_setcc (gt_tmp, INTVAL (op2) != 1 ? GT : GTU, flags,
3342 : const0_rtx);
3343 35 : if (GET_MODE (dest) != QImode)
3344 : {
3345 35 : tmp = gen_reg_rtx (GET_MODE (dest));
3346 35 : emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
3347 : gt_tmp)));
3348 35 : gt_tmp = tmp;
3349 : }
3350 : }
3351 35 : if (lt_tmp)
3352 : {
3353 35 : tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp, dest,
3354 : 0, OPTAB_DIRECT);
3355 35 : if (!rtx_equal_p (tmp, dest))
3356 0 : emit_move_insn (dest, tmp);
3357 : }
3358 : else
3359 : {
3360 : /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't
3361 : do ZERO_EXTEND without clobbering flags. */
3362 0 : tmp = ix86_expand_compare (LTU, flags, const0_rtx);
3363 0 : PUT_MODE (tmp, SImode);
3364 0 : emit_insn (gen_subsi3_carry (dest, gt_tmp,
3365 0 : force_reg (GET_MODE (dest), const0_rtx),
3366 : flags, tmp));
3367 : }
3368 35 : }
3369 :
3370 : /* Expand comparison setting or clearing carry flag. Return true when
3371 : successful and set pop for the operation. */
3372 : static bool
3373 29144 : ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
3374 : {
3375 58288 : machine_mode mode
3376 29144 : = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
3377 :
3378 : /* Do not handle double-mode compares that go through special path. */
3379 31469 : if (mode == (TARGET_64BIT ? TImode : DImode))
3380 : return false;
3381 :
3382 29134 : if (SCALAR_FLOAT_MODE_P (mode))
3383 : {
3384 1878 : rtx compare_op;
3385 1878 : rtx_insn *compare_seq;
3386 :
3387 1878 : gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
3388 :
3389 : /* Shortcut: following common codes never translate
3390 : into carry flag compares. */
3391 1878 : if (code == EQ || code == NE || code == UNEQ || code == LTGT
3392 : || code == ORDERED || code == UNORDERED)
3393 : return false;
3394 :
3395 : /* These comparisons require zero flag; swap operands so they won't. */
3396 : if ((code == GT || code == UNLE || code == LE || code == UNGT)
3397 1813 : && !TARGET_IEEE_FP)
3398 : {
3399 2 : std::swap (op0, op1);
3400 2 : code = swap_condition (code);
3401 : }
3402 :
3403 : /* Try to expand the comparison and verify that we end up with
3404 : carry flag based comparison. This fails to be true only when
3405 : we decide to expand comparison using arithmetic that is not
3406 : too common scenario. */
3407 1876 : start_sequence ();
3408 1876 : compare_op = ix86_expand_fp_compare (code, op0, op1);
3409 1876 : compare_seq = end_sequence ();
3410 :
3411 1876 : if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
3412 1876 : code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
3413 : else
3414 0 : code = GET_CODE (compare_op);
3415 :
3416 1876 : if (code != LTU && code != GEU)
3417 : return false;
3418 :
3419 63 : emit_insn (compare_seq);
3420 63 : *pop = compare_op;
3421 63 : return true;
3422 : }
3423 :
3424 27256 : if (!INTEGRAL_MODE_P (mode))
3425 : return false;
3426 :
3427 27180 : switch (code)
3428 : {
3429 : case LTU:
3430 : case GEU:
3431 : break;
3432 :
3433 : /* Convert a==0 into (unsigned)a<1. */
3434 23652 : case EQ:
3435 23652 : case NE:
3436 23652 : if (op1 != const0_rtx)
3437 : return false;
3438 10154 : op1 = const1_rtx;
3439 10154 : code = (code == EQ ? LTU : GEU);
3440 : break;
3441 :
3442 : /* Convert a>b into b<a or a>=b-1. */
3443 699 : case GTU:
3444 699 : case LEU:
3445 699 : if (CONST_INT_P (op1))
3446 : {
3447 657 : op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
3448 : /* Bail out on overflow. We still can swap operands but that
3449 : would force loading of the constant into register. */
3450 657 : if (op1 == const0_rtx
3451 657 : || !x86_64_immediate_operand (op1, GET_MODE (op1)))
3452 0 : return false;
3453 657 : code = (code == GTU ? GEU : LTU);
3454 : }
3455 : else
3456 : {
3457 42 : std::swap (op0, op1);
3458 42 : code = (code == GTU ? LTU : GEU);
3459 : }
3460 : break;
3461 :
3462 : /* Convert a>=0 into (unsigned)a<0x80000000. */
3463 1294 : case LT:
3464 1294 : case GE:
3465 1294 : if (mode == DImode || op1 != const0_rtx)
3466 : return false;
3467 204 : op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3468 102 : code = (code == LT ? GEU : LTU);
3469 : break;
3470 842 : case LE:
3471 842 : case GT:
3472 842 : if (mode == DImode || op1 != constm1_rtx)
3473 : return false;
3474 0 : op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3475 0 : code = (code == LE ? GEU : LTU);
3476 : break;
3477 :
3478 : default:
3479 : return false;
3480 : }
3481 : /* Swapping operands may cause constant to appear as first operand. */
3482 11648 : if (!nonimmediate_operand (op0, VOIDmode))
3483 : {
3484 0 : if (!can_create_pseudo_p ())
3485 : return false;
3486 0 : op0 = force_reg (mode, op0);
3487 : }
3488 11648 : *pop = ix86_expand_compare (code, op0, op1);
3489 11648 : gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
3490 : return true;
3491 : }
3492 :
3493 : /* Expand conditional increment or decrement using adb/sbb instructions.
3494 : The default case using setcc followed by the conditional move can be
3495 : done by generic code. */
3496 : bool
3497 6806 : ix86_expand_int_addcc (rtx operands[])
3498 : {
3499 6806 : enum rtx_code code = GET_CODE (operands[1]);
3500 6806 : rtx flags;
3501 6806 : rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
3502 6806 : rtx compare_op;
3503 6806 : rtx val = const0_rtx;
3504 6806 : bool fpcmp = false;
3505 6806 : machine_mode mode;
3506 6806 : rtx op0 = XEXP (operands[1], 0);
3507 6806 : rtx op1 = XEXP (operands[1], 1);
3508 :
3509 6806 : if (operands[3] != const1_rtx
3510 2835 : && operands[3] != constm1_rtx)
3511 : return false;
3512 4695 : if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3513 : return false;
3514 1270 : code = GET_CODE (compare_op);
3515 :
3516 1270 : flags = XEXP (compare_op, 0);
3517 :
3518 1270 : if (GET_MODE (flags) == CCFPmode)
3519 : {
3520 4 : fpcmp = true;
3521 4 : code = ix86_fp_compare_code_to_integer (code);
3522 : }
3523 :
3524 1270 : if (code != LTU)
3525 : {
3526 735 : val = constm1_rtx;
3527 735 : if (fpcmp)
3528 4 : PUT_CODE (compare_op,
3529 : reverse_condition_maybe_unordered
3530 : (GET_CODE (compare_op)));
3531 : else
3532 731 : PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3533 : }
3534 :
3535 1270 : mode = GET_MODE (operands[0]);
3536 :
3537 : /* Construct either adc or sbb insn. */
3538 1270 : if ((code == LTU) == (operands[3] == constm1_rtx))
3539 : insn = gen_sub3_carry;
3540 : else
3541 515 : insn = gen_add3_carry;
3542 :
3543 1270 : emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
3544 :
3545 1270 : return true;
3546 : }
3547 :
3548 : bool
3549 436019 : ix86_expand_int_movcc (rtx operands[])
3550 : {
3551 436019 : enum rtx_code code = GET_CODE (operands[1]), compare_code;
3552 436019 : rtx_insn *compare_seq;
3553 436019 : rtx compare_op;
3554 436019 : machine_mode mode = GET_MODE (operands[0]);
3555 436019 : bool sign_bit_compare_p = false;
3556 436019 : bool negate_cc_compare_p = false;
3557 436019 : rtx op0 = XEXP (operands[1], 0);
3558 436019 : rtx op1 = XEXP (operands[1], 1);
3559 436019 : rtx op2 = operands[2];
3560 436019 : rtx op3 = operands[3];
3561 :
3562 436019 : if (GET_MODE (op0) == TImode
3563 420585 : || (GET_MODE (op0) == DImode
3564 105600 : && !TARGET_64BIT))
3565 : return false;
3566 :
3567 419489 : if (GET_MODE (op0) == BFmode
3568 419489 : && !ix86_fp_comparison_operator (operands[1], VOIDmode))
3569 : return false;
3570 :
3571 419489 : start_sequence ();
3572 419489 : compare_op = ix86_expand_compare (code, op0, op1);
3573 419489 : compare_seq = end_sequence ();
3574 :
3575 419489 : compare_code = GET_CODE (compare_op);
3576 :
3577 419489 : if ((op1 == const0_rtx && (code == GE || code == LT))
3578 377561 : || (op1 == constm1_rtx && (code == GT || code == LE)))
3579 : sign_bit_compare_p = true;
3580 :
3581 : /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3582 : but if op1 is a constant, the latter form allows more optimizations,
3583 : either through the last 2 ops being constant handling, or the one
3584 : constant and one variable cases. On the other side, for cmov the
3585 : former might be better as we don't need to load the constant into
3586 : another register. */
3587 377561 : if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3588 : op2 = op1;
3589 : /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3590 418978 : else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3591 : op3 = op1;
3592 :
3593 : /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3594 : HImode insns, we'd be swallowed in word prefix ops. */
3595 :
3596 4849 : if ((mode != HImode || TARGET_FAST_PREFIX)
3597 449949 : && (mode != (TARGET_64BIT ? TImode : DImode))
3598 419489 : && CONST_INT_P (op2)
3599 451736 : && CONST_INT_P (op3))
3600 : {
3601 25253 : rtx out = operands[0];
3602 25253 : HOST_WIDE_INT ct = INTVAL (op2);
3603 25253 : HOST_WIDE_INT cf = INTVAL (op3);
3604 25253 : HOST_WIDE_INT diff;
3605 :
3606 25253 : if ((mode == SImode
3607 11729 : || (TARGET_64BIT && mode == DImode))
3608 18297 : && (GET_MODE (op0) == SImode
3609 14299 : || (TARGET_64BIT && GET_MODE (op0) == DImode)))
3610 : {
3611 : /* Special case x != 0 ? -1 : y. */
3612 13136 : if (code == NE && op1 == const0_rtx && ct == -1)
3613 : {
3614 : negate_cc_compare_p = true;
3615 : std::swap (ct, cf);
3616 : code = EQ;
3617 : }
3618 13035 : else if (code == EQ && op1 == const0_rtx && cf == -1)
3619 25253 : negate_cc_compare_p = true;
3620 : }
3621 :
3622 25253 : diff = (unsigned HOST_WIDE_INT) ct - cf;
3623 : /* Make sure we can represent the difference between the two values. */
3624 25253 : if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
3625 436019 : return false;
3626 :
3627 : /* Sign bit compares are better done using shifts than we do by using
3628 : sbb. */
3629 25105 : if (sign_bit_compare_p
3630 25105 : || negate_cc_compare_p
3631 25105 : || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3632 : {
3633 : /* Detect overlap between destination and compare sources. */
3634 11097 : rtx tmp = out;
3635 :
3636 11097 : if (negate_cc_compare_p)
3637 : {
3638 280 : if (GET_MODE (op0) == DImode)
3639 104 : emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
3640 : else
3641 176 : emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
3642 176 : gen_lowpart (SImode, op0)));
3643 :
3644 280 : tmp = gen_reg_rtx (mode);
3645 280 : if (mode == DImode)
3646 123 : emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
3647 : else
3648 157 : emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
3649 : tmp)));
3650 : }
3651 10817 : else if (!sign_bit_compare_p)
3652 : {
3653 10441 : rtx flags;
3654 10441 : bool fpcmp = false;
3655 :
3656 10441 : compare_code = GET_CODE (compare_op);
3657 :
3658 10441 : flags = XEXP (compare_op, 0);
3659 :
3660 10441 : if (GET_MODE (flags) == CCFPmode)
3661 : {
3662 59 : fpcmp = true;
3663 59 : compare_code
3664 59 : = ix86_fp_compare_code_to_integer (compare_code);
3665 : }
3666 :
3667 : /* To simplify rest of code, restrict to the GEU case. */
3668 10441 : if (compare_code == LTU)
3669 : {
3670 6047 : std::swap (ct, cf);
3671 6047 : compare_code = reverse_condition (compare_code);
3672 6047 : code = reverse_condition (code);
3673 : }
3674 : else
3675 : {
3676 4394 : if (fpcmp)
3677 59 : PUT_CODE (compare_op,
3678 : reverse_condition_maybe_unordered
3679 : (GET_CODE (compare_op)));
3680 : else
3681 4335 : PUT_CODE (compare_op,
3682 : reverse_condition (GET_CODE (compare_op)));
3683 : }
3684 :
3685 10441 : diff = (unsigned HOST_WIDE_INT) ct - cf;
3686 : /* Make sure we can represent the difference
3687 : between the two values. */
3688 10441 : if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
3689 : return false;
3690 :
3691 10440 : if (reg_overlap_mentioned_p (out, compare_op))
3692 0 : tmp = gen_reg_rtx (mode);
3693 :
3694 10440 : if (mode == DImode)
3695 2133 : emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3696 : else
3697 8307 : emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3698 : flags, compare_op));
3699 : }
3700 : else
3701 : {
3702 376 : if (code == GT || code == GE)
3703 153 : code = reverse_condition (code);
3704 : else
3705 : {
3706 223 : std::swap (ct, cf);
3707 :
3708 223 : diff = (unsigned HOST_WIDE_INT) ct - cf;
3709 : /* Make sure we can represent the difference
3710 : between the two values. */
3711 223 : if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
3712 : return false;
3713 : }
3714 371 : tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3715 : }
3716 :
3717 11091 : if (diff == 1)
3718 : {
3719 : /*
3720 : * cmpl op0,op1
3721 : * sbbl dest,dest
3722 : * [addl dest, ct]
3723 : *
3724 : * Size 5 - 8.
3725 : */
3726 1057 : if (ct)
3727 881 : tmp = expand_simple_binop (mode, PLUS,
3728 : tmp, GEN_INT (ct),
3729 : copy_rtx (tmp), 1, OPTAB_DIRECT);
3730 : }
3731 10034 : else if (cf == -1)
3732 : {
3733 : /*
3734 : * cmpl op0,op1
3735 : * sbbl dest,dest
3736 : * orl $ct, dest
3737 : *
3738 : * Size 8.
3739 : */
3740 597 : tmp = expand_simple_binop (mode, IOR,
3741 : tmp, GEN_INT (ct),
3742 : copy_rtx (tmp), 1, OPTAB_DIRECT);
3743 : }
3744 9437 : else if (diff == -1 && ct)
3745 : {
3746 : /*
3747 : * cmpl op0,op1
3748 : * sbbl dest,dest
3749 : * notl dest
3750 : * [addl dest, cf]
3751 : *
3752 : * Size 8 - 11.
3753 : */
3754 596 : tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3755 596 : if (cf)
3756 578 : tmp = expand_simple_binop (mode, PLUS,
3757 : copy_rtx (tmp), GEN_INT (cf),
3758 : copy_rtx (tmp), 1, OPTAB_DIRECT);
3759 : }
3760 : else
3761 : {
3762 : /*
3763 : * cmpl op0,op1
3764 : * sbbl dest,dest
3765 : * [notl dest]
3766 : * andl cf - ct, dest
3767 : * [addl dest, ct]
3768 : *
3769 : * Size 8 - 11.
3770 : */
3771 :
3772 8841 : if (cf == 0)
3773 : {
3774 939 : cf = ct;
3775 939 : ct = 0;
3776 939 : tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3777 : }
3778 :
3779 8841 : HOST_WIDE_INT ival = (unsigned HOST_WIDE_INT) cf - ct;
3780 : /* Make sure we can represent the difference
3781 : between the two values. */
3782 8841 : if ((ival > 0) != ((ct < 0) != (cf < 0) ? ct < 0 : ct < cf))
3783 16684 : return false;
3784 :
3785 8841 : tmp = expand_simple_binop (mode, AND,
3786 : copy_rtx (tmp),
3787 8841 : gen_int_mode (ival, mode),
3788 : copy_rtx (tmp), 1, OPTAB_DIRECT);
3789 8841 : if (ct)
3790 7073 : tmp = expand_simple_binop (mode, PLUS,
3791 : copy_rtx (tmp), GEN_INT (ct),
3792 : copy_rtx (tmp), 1, OPTAB_DIRECT);
3793 : }
3794 :
3795 11091 : if (!rtx_equal_p (tmp, out))
3796 474 : emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3797 :
3798 11091 : return true;
3799 : }
3800 :
3801 14008 : if (diff < 0)
3802 : {
3803 4776 : machine_mode cmp_mode = GET_MODE (op0);
3804 4776 : enum rtx_code new_code;
3805 :
3806 4776 : if (SCALAR_FLOAT_MODE_P (cmp_mode))
3807 : {
3808 70 : gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3809 :
3810 : /* We may be reversing a non-trapping
3811 : comparison to a trapping comparison. */
3812 136 : if (HONOR_NANS (cmp_mode) && flag_trapping_math
3813 57 : && code != EQ && code != NE
3814 127 : && code != ORDERED && code != UNORDERED)
3815 : new_code = UNKNOWN;
3816 : else
3817 13 : new_code = reverse_condition_maybe_unordered (code);
3818 : }
3819 : else
3820 4706 : new_code = ix86_reverse_condition (code, cmp_mode);
3821 4719 : if (new_code != UNKNOWN)
3822 : {
3823 4719 : std::swap (ct, cf);
3824 :
3825 4719 : diff = (unsigned HOST_WIDE_INT) ct - cf;
3826 : /* Make sure we can represent the difference
3827 : between the two values. */
3828 4719 : if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
3829 : return false;
3830 :
3831 : code = new_code;
3832 : }
3833 : }
3834 :
3835 14008 : compare_code = UNKNOWN;
3836 14008 : if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3837 12263 : && CONST_INT_P (op1))
3838 : {
3839 6403 : if (op1 == const0_rtx
3840 214 : && (code == LT || code == GE))
3841 : compare_code = code;
3842 6403 : else if (op1 == constm1_rtx)
3843 : {
3844 90 : if (code == LE)
3845 : compare_code = LT;
3846 90 : else if (code == GT)
3847 : compare_code = GE;
3848 : }
3849 : }
3850 :
3851 : /* Optimize dest = (op0 < 0) ? -1 : cf. */
3852 : if (compare_code != UNKNOWN
3853 0 : && GET_MODE (op0) == GET_MODE (out)
3854 0 : && (cf == -1 || ct == -1))
3855 : {
3856 : /* If lea code below could be used, only optimize
3857 : if it results in a 2 insn sequence. */
3858 :
3859 0 : if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3860 0 : || diff == 3 || diff == 5 || diff == 9)
3861 0 : || (compare_code == LT && ct == -1)
3862 0 : || (compare_code == GE && cf == -1))
3863 : {
3864 : /*
3865 : * notl op1 (if necessary)
3866 : * sarl $31, op1
3867 : * orl cf, op1
3868 : */
3869 0 : if (ct != -1)
3870 : {
3871 0 : cf = ct;
3872 0 : ct = -1;
3873 0 : code = reverse_condition (code);
3874 : }
3875 :
3876 0 : out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3877 :
3878 0 : out = expand_simple_binop (mode, IOR,
3879 : out, GEN_INT (cf),
3880 : out, 1, OPTAB_DIRECT);
3881 0 : if (out != operands[0])
3882 0 : emit_move_insn (operands[0], out);
3883 :
3884 0 : return true;
3885 : }
3886 : }
3887 :
3888 :
3889 20696 : if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3890 6688 : || diff == 3 || diff == 5 || diff == 9)
3891 7667 : && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3892 21675 : && (mode != DImode
3893 1885 : || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3894 : {
3895 : /*
3896 : * xorl dest,dest
3897 : * cmpl op1,op2
3898 : * setcc dest
3899 : * lea cf(dest*(ct-cf)),dest
3900 : *
3901 : * Size 14.
3902 : *
3903 : * This also catches the degenerate setcc-only case.
3904 : */
3905 :
3906 7667 : rtx tmp;
3907 7667 : int nops;
3908 :
3909 7667 : out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3910 :
3911 7667 : nops = 0;
3912 : /* On x86_64 the lea instruction operates on Pmode, so we need
3913 : to get arithmetics done in proper mode to match. */
3914 7667 : if (diff == 1)
3915 6495 : tmp = copy_rtx (out);
3916 : else
3917 : {
3918 1172 : rtx out1;
3919 1172 : out1 = copy_rtx (out);
3920 1172 : tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3921 1172 : nops++;
3922 1172 : if (diff & 1)
3923 : {
3924 262 : tmp = gen_rtx_PLUS (mode, tmp, out1);
3925 262 : nops++;
3926 : }
3927 : }
3928 7667 : if (cf != 0)
3929 : {
3930 6901 : tmp = plus_constant (mode, tmp, cf);
3931 6901 : nops++;
3932 : }
3933 7667 : if (!rtx_equal_p (tmp, out))
3934 : {
3935 7139 : if (nops == 1)
3936 6063 : out = force_operand (tmp, copy_rtx (out));
3937 : else
3938 1076 : emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3939 : }
3940 7667 : if (!rtx_equal_p (out, operands[0]))
3941 692 : emit_move_insn (operands[0], copy_rtx (out));
3942 :
3943 7667 : return true;
3944 : }
3945 :
3946 : /*
3947 : * General case: Jumpful:
3948 : * xorl dest,dest cmpl op1, op2
3949 : * cmpl op1, op2 movl ct, dest
3950 : * setcc dest jcc 1f
3951 : * decl dest movl cf, dest
3952 : * andl (cf-ct),dest 1:
3953 : * addl ct,dest
3954 : *
3955 : * Size 20. Size 14.
3956 : *
3957 : * This is reasonably steep, but branch mispredict costs are
3958 : * high on modern cpus, so consider failing only if optimizing
3959 : * for space.
3960 : */
3961 :
3962 6341 : if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3963 6341 : && BRANCH_COST (optimize_insn_for_speed_p (),
3964 : false) >= 2)
3965 : {
3966 0 : if (cf == 0)
3967 : {
3968 0 : machine_mode cmp_mode = GET_MODE (op0);
3969 0 : enum rtx_code new_code;
3970 :
3971 0 : if (SCALAR_FLOAT_MODE_P (cmp_mode))
3972 : {
3973 0 : gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3974 :
3975 : /* We may be reversing a non-trapping
3976 : comparison to a trapping comparison. */
3977 0 : if (HONOR_NANS (cmp_mode) && flag_trapping_math
3978 0 : && code != EQ && code != NE
3979 0 : && code != ORDERED && code != UNORDERED)
3980 : new_code = UNKNOWN;
3981 : else
3982 0 : new_code = reverse_condition_maybe_unordered (code);
3983 :
3984 : }
3985 : else
3986 : {
3987 0 : new_code = ix86_reverse_condition (code, cmp_mode);
3988 0 : if (compare_code != UNKNOWN && new_code != UNKNOWN)
3989 0 : compare_code = reverse_condition (compare_code);
3990 : }
3991 :
3992 0 : if (new_code != UNKNOWN)
3993 : {
3994 0 : cf = ct;
3995 0 : ct = 0;
3996 0 : code = new_code;
3997 : }
3998 : }
3999 :
4000 0 : if (compare_code != UNKNOWN)
4001 : {
4002 : /* notl op1 (if needed)
4003 : sarl $31, op1
4004 : andl (cf-ct), op1
4005 : addl ct, op1
4006 :
4007 : For x < 0 (resp. x <= -1) there will be no notl,
4008 : so if possible swap the constants to get rid of the
4009 : complement.
4010 : True/false will be -1/0 while code below (store flag
4011 : followed by decrement) is 0/-1, so the constants need
4012 : to be exchanged once more. */
4013 :
4014 0 : if (compare_code == GE || !cf)
4015 : {
4016 0 : code = reverse_condition (code);
4017 0 : compare_code = LT;
4018 : }
4019 : else
4020 : std::swap (ct, cf);
4021 :
4022 0 : out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
4023 : }
4024 : else
4025 : {
4026 0 : out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
4027 :
4028 0 : out = expand_simple_binop (mode, PLUS, copy_rtx (out),
4029 : constm1_rtx,
4030 : copy_rtx (out), 1, OPTAB_DIRECT);
4031 : }
4032 :
4033 0 : HOST_WIDE_INT ival = (unsigned HOST_WIDE_INT) cf - ct;
4034 : /* Make sure we can represent the difference
4035 : between the two values. */
4036 0 : if ((ival > 0) != ((ct < 0) != (cf < 0) ? ct < 0 : ct < cf))
4037 : return false;
4038 :
4039 0 : out = expand_simple_binop (mode, AND, copy_rtx (out),
4040 0 : gen_int_mode (ival, mode),
4041 : copy_rtx (out), 1, OPTAB_DIRECT);
4042 0 : if (ct)
4043 0 : out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
4044 : copy_rtx (out), 1, OPTAB_DIRECT);
4045 0 : if (!rtx_equal_p (out, operands[0]))
4046 0 : emit_move_insn (operands[0], copy_rtx (out));
4047 :
4048 0 : return true;
4049 : }
4050 : }
4051 :
4052 400577 : if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
4053 : {
4054 : /* Try a few things more with specific constants and a variable. */
4055 :
4056 0 : optab op;
4057 0 : rtx var, orig_out, out, tmp;
4058 :
4059 0 : if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
4060 : return false;
4061 :
4062 0 : operands[2] = op2;
4063 0 : operands[3] = op3;
4064 :
4065 : /* If one of the two operands is an interesting constant, load a
4066 : constant with the above and mask it in with a logical operation. */
4067 :
4068 0 : if (CONST_INT_P (operands[2]))
4069 : {
4070 0 : var = operands[3];
4071 0 : if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
4072 0 : operands[3] = constm1_rtx, op = and_optab;
4073 0 : else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
4074 0 : operands[3] = const0_rtx, op = ior_optab;
4075 : else
4076 : return false;
4077 : }
4078 0 : else if (CONST_INT_P (operands[3]))
4079 : {
4080 0 : var = operands[2];
4081 0 : if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
4082 : {
4083 : /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
4084 : "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
4085 0 : if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
4086 0 : operands[1] = simplify_gen_relational (LT, VOIDmode,
4087 0 : GET_MODE (op0),
4088 : op0, const0_rtx);
4089 :
4090 0 : operands[2] = constm1_rtx;
4091 0 : op = and_optab;
4092 : }
4093 0 : else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
4094 0 : operands[2] = const0_rtx, op = ior_optab;
4095 : else
4096 : return false;
4097 : }
4098 : else
4099 : return false;
4100 :
4101 0 : orig_out = operands[0];
4102 0 : tmp = gen_reg_rtx (mode);
4103 0 : operands[0] = tmp;
4104 :
4105 : /* Recurse to get the constant loaded. */
4106 0 : if (!ix86_expand_int_movcc (operands))
4107 : return false;
4108 :
4109 : /* Mask in the interesting variable. */
4110 0 : out = expand_binop (mode, op, var, tmp, orig_out, 0,
4111 : OPTAB_WIDEN);
4112 0 : if (!rtx_equal_p (out, orig_out))
4113 0 : emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
4114 :
4115 0 : return true;
4116 : }
4117 :
4118 : /*
4119 : * For comparison with above,
4120 : *
4121 : * movl cf,dest
4122 : * movl ct,tmp
4123 : * cmpl op1,op2
4124 : * cmovcc tmp,dest
4125 : *
4126 : * Size 15.
4127 : */
4128 :
4129 400577 : if (! nonimmediate_operand (operands[2], mode))
4130 22529 : operands[2] = force_reg (mode, operands[2]);
4131 400577 : if (! nonimmediate_operand (operands[3], mode))
4132 171999 : operands[3] = force_reg (mode, operands[3]);
4133 :
4134 400577 : if (! register_operand (operands[2], VOIDmode)
4135 400577 : && (mode == QImode
4136 1093 : || ! register_operand (operands[3], VOIDmode)))
4137 1564 : operands[2] = force_reg (mode, operands[2]);
4138 :
4139 400577 : if (mode == QImode
4140 400577 : && ! register_operand (operands[3], VOIDmode))
4141 592 : operands[3] = force_reg (mode, operands[3]);
4142 :
4143 400577 : emit_insn (compare_seq);
4144 400577 : emit_insn (gen_rtx_SET (operands[0],
4145 : gen_rtx_IF_THEN_ELSE (mode,
4146 : compare_op, operands[2],
4147 : operands[3])));
4148 400577 : return true;
4149 : }
4150 :
4151 : /* Detect conditional moves that exactly match min/max operational
4152 : semantics. Note that this is IEEE safe, as long as we don't
4153 : interchange the operands.
4154 :
4155 : Returns FALSE if this conditional move doesn't match a MIN/MAX,
4156 : and TRUE if the operation is successful and instructions are emitted. */
4157 :
4158 : static bool
4159 9781 : ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
4160 : rtx cmp_op1, rtx if_true, rtx if_false)
4161 : {
4162 9781 : machine_mode mode = GET_MODE (dest);
4163 9781 : bool is_min;
4164 9781 : rtx tmp;
4165 :
4166 9781 : if (code == LT)
4167 : ;
4168 3250 : else if (code == LE && !HONOR_NANS (mode))
4169 : {
4170 : /* We can swap LE to GE and then invert to LT. */
4171 : std::swap (cmp_op0, cmp_op1);
4172 : std::swap (if_true, if_false);
4173 : }
4174 3209 : else if (code == UNGE)
4175 : std::swap (if_true, if_false);
4176 : else
4177 : return false;
4178 :
4179 8679 : if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
4180 : is_min = true;
4181 4627 : else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
4182 : is_min = false;
4183 : else
4184 1045 : return false;
4185 :
4186 7634 : if (immediate_operand (if_false, mode))
4187 8 : if_false = force_reg (mode, if_false);
4188 7634 : if (immediate_operand (if_true, mode))
4189 0 : if_true = force_reg (mode, if_true);
4190 :
4191 : /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
4192 : but MODE may be a vector mode and thus not appropriate. */
4193 7634 : if (!flag_finite_math_only || flag_signed_zeros)
4194 : {
4195 7634 : int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
4196 7634 : rtvec v;
4197 :
4198 7634 : if_true = force_reg (mode, if_true);
4199 7634 : v = gen_rtvec (2, if_true, if_false);
4200 7634 : tmp = gen_rtx_UNSPEC (mode, v, u);
4201 7634 : }
4202 : else
4203 : {
4204 0 : code = is_min ? SMIN : SMAX;
4205 0 : if (MEM_P (if_true) && MEM_P (if_false))
4206 0 : if_true = force_reg (mode, if_true);
4207 0 : tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
4208 : }
4209 :
4210 7634 : emit_insn (gen_rtx_SET (dest, tmp));
4211 7634 : return true;
4212 : }
4213 :
4214 : /* Return true if MODE is valid for vector compare to mask register,
4215 : Same result for conditionl vector move with mask register. */
4216 : static bool
4217 14930 : ix86_valid_mask_cmp_mode (machine_mode mode)
4218 : {
4219 : /* XOP has its own vector conditional movement. */
4220 14930 : if (TARGET_XOP && !TARGET_AVX512F)
4221 : return false;
4222 :
4223 : /* HFmode only supports vcmpsh whose dest is mask register. */
4224 14924 : if (TARGET_AVX512FP16 && mode == HFmode)
4225 : return true;
4226 :
4227 : /* AVX512F is needed for mask operation. */
4228 14832 : if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
4229 : return false;
4230 :
4231 : /* AVX512BW is needed for vector QI/HImode,
4232 : AVX512VL is needed for 128/256-bit vector. */
4233 182 : machine_mode inner_mode = GET_MODE_INNER (mode);
4234 182 : int vector_size = GET_MODE_SIZE (mode);
4235 182 : if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
4236 : return false;
4237 :
4238 162 : return vector_size == 64 || TARGET_AVX512VL;
4239 : }
4240 :
4241 : /* Return true if integer mask comparison should be used. */
4242 : static bool
4243 52491 : ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
4244 : rtx op_true, rtx op_false)
4245 : {
4246 52491 : int vector_size = GET_MODE_SIZE (mode);
4247 :
4248 52491 : if (cmp_mode == HFmode)
4249 : return true;
4250 52399 : else if (vector_size < 16)
4251 : return false;
4252 46155 : else if (vector_size == 64)
4253 : return true;
4254 92194 : else if (GET_MODE_INNER (cmp_mode) == HFmode)
4255 : return true;
4256 92194 : else if (GET_MODE_INNER (cmp_mode) == BFmode)
4257 : return true;
4258 :
4259 : /* When op_true is NULL, op_false must be NULL, or vice versa. */
4260 46097 : gcc_assert (!op_true == !op_false);
4261 :
4262 : /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
4263 : vector dest is required. */
4264 46097 : if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
4265 : return false;
4266 :
4267 : /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
4268 48 : if (op_false == CONST0_RTX (mode)
4269 48 : || op_true == CONST0_RTX (mode)
4270 48 : || (INTEGRAL_MODE_P (mode)
4271 40 : && (op_true == CONSTM1_RTX (mode)
4272 40 : || op_false == CONSTM1_RTX (mode))))
4273 0 : return false;
4274 :
4275 : return true;
4276 : }
4277 :
4278 : /* Expand an SSE comparison. Return the register with the result. */
4279 :
4280 : static rtx
4281 35534 : ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
4282 : rtx op_true, rtx op_false)
4283 : {
4284 35534 : machine_mode mode = GET_MODE (dest);
4285 35534 : machine_mode cmp_ops_mode = GET_MODE (cmp_op0);
4286 :
4287 : /* In general case result of comparison can differ from operands' type. */
4288 35534 : machine_mode cmp_mode;
4289 :
4290 : /* In AVX512F the result of comparison is an integer mask. */
4291 35534 : bool maskcmp = false;
4292 35534 : rtx x;
4293 :
4294 35534 : if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
4295 : {
4296 145 : unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
4297 145 : maskcmp = true;
4298 145 : cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
4299 : }
4300 : else
4301 : cmp_mode = cmp_ops_mode;
4302 :
4303 35534 : cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
4304 :
4305 71068 : bool (*op1_predicate)(rtx, machine_mode)
4306 35534 : = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
4307 :
4308 35534 : if (!op1_predicate (cmp_op1, cmp_ops_mode))
4309 0 : cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);
4310 :
4311 35534 : if (optimize
4312 506 : || (maskcmp && cmp_mode != mode)
4313 506 : || (op_true && reg_overlap_mentioned_p (dest, op_true))
4314 36040 : || (op_false && reg_overlap_mentioned_p (dest, op_false)))
4315 69911 : dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);
4316 :
4317 35534 : if (maskcmp)
4318 : {
4319 145 : bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
4320 145 : gcc_assert (ok);
4321 : return dest;
4322 : }
4323 :
4324 35389 : x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
4325 :
4326 35389 : if (cmp_mode != mode)
4327 : {
4328 7215 : x = force_reg (cmp_ops_mode, x);
4329 7215 : convert_move (dest, x, false);
4330 : }
4331 : else
4332 28174 : emit_insn (gen_rtx_SET (dest, x));
4333 :
4334 : return dest;
4335 : }
4336 :
4337 : /* Emit x86 binary operand CODE in mode MODE for SSE vector
4338 : instructions that can be performed using GP registers. */
4339 :
4340 : static void
4341 7066 : ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
4342 : rtx dst, rtx src1, rtx src2)
4343 : {
4344 7066 : rtx tmp;
4345 :
4346 7066 : tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
4347 :
4348 7066 : if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
4349 7066 : && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4350 : {
4351 102 : rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
4352 102 : tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
4353 : }
4354 :
4355 7066 : emit_insn (tmp);
4356 7066 : }
4357 :
4358 : /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
4359 : operations. This is used for both scalar and vector conditional moves.

   CMP is the already computed comparison result.  The strategies below
   are tried in order and the first that applies emits the insns and
   returns:
     - identical arms: plain move;
     - CMP in an integer mode that differs from DEST's mode: masked move
       (HFmode) or VEC_MERGE;
     - an arm that is zero or all-ones: a single AND / AND-NOT / IOR, or
       a plain copy of CMP;
     - TARGET_XOP: an IF_THEN_ELSE set;
     - a blend pattern chosen by mode, when one is enabled;
     - otherwise (OP_TRUE & CMP) | (~CMP & OP_FALSE).
   Nothing is returned. */
4360 :
4361 : void
4362 10163 : ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
4363 : {
4364 10163 : machine_mode mode = GET_MODE (dest);
4365 10163 : machine_mode cmpmode = GET_MODE (cmp);
4366 10163 : rtx x;
4367 :
4368 : /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
4369 10163 : if (rtx_equal_p (op_true, op_false))
4370 : {
4371 0 : emit_move_insn (dest, op_true);
4372 0 : return;
4373 : }
4374 :
4375 : /* If we have an integer mask and FP value then we need
4376 : to cast mask to FP mode. */
4377 10163 : if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
4378 : {
4379 1545 : cmp = force_reg (cmpmode, cmp);
4380 1545 : cmp = gen_rtx_SUBREG (mode, cmp, 0);
4381 : }
4382 :
4383 : /* In AVX512F the result of comparison is an integer mask. */
4384 10163 : if (mode != cmpmode
4385 1690 : && GET_MODE_CLASS (cmpmode) == MODE_INT)
4386 : {
4387 145 : gcc_assert (ix86_valid_mask_cmp_mode (mode));
4388 : /* Using scalar/vector move with mask register. */
4389 145 : cmp = force_reg (cmpmode, cmp);
4390 : /* Optimize for mask zero. */
       /* Both arms are forced into registers unless they are the zero
          constant of MODE, which is kept as is. */
4391 290 : op_true = (op_true != CONST0_RTX (mode)
4392 145 : ? force_reg (mode, op_true) : op_true);
4393 290 : op_false = (op_false != CONST0_RTX (mode)
4394 145 : ? force_reg (mode, op_false) : op_false);
       /* A zero OP_TRUE is moved into the OP_FALSE position: the mask is
          complemented (through gen_knotdi for a DImode mask when
          !TARGET_64BIT) and the two arms are exchanged. */
4395 145 : if (op_true == CONST0_RTX (mode))
4396 : {
4397 0 : if (cmpmode == E_DImode && !TARGET_64BIT)
4398 : {
4399 0 : x = gen_reg_rtx (cmpmode);
4400 0 : emit_insn (gen_knotdi (x, cmp));
4401 : }
4402 : else
4403 0 : x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
4404 : cmp = x;
4405 : /* Reverse op_true op_false. */
4406 : std::swap (op_true, op_false);
4407 : }
4408 :
       /* HFmode goes through gen_movhf_mask; every other mode is emitted
          as a VEC_MERGE of the two arms under CMP. */
4409 145 : if (mode == HFmode)
4410 92 : emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
4411 : else
4412 53 : emit_insn (gen_rtx_SET (dest,
4413 : gen_rtx_VEC_MERGE (mode,
4414 : op_true, op_false, cmp)));
4415 145 : return;
4416 : }
4417 :
     /* Arms that are zero or all-ones need a single operation.  The
        combined all-ones/zero test comes first; the later branches rely
        on it having been ruled out.
          CMP ? -1 : 0         -> CMP
          CMP ? OP_TRUE : 0    -> CMP & OP_TRUE
          CMP ? 0 : OP_FALSE   -> ~CMP & OP_FALSE
          CMP ? -1 : OP_FALSE  -> CMP | OP_FALSE */
4418 10018 : if (vector_all_ones_operand (op_true, mode)
4419 10018 : && op_false == CONST0_RTX (mode))
4420 : {
4421 2 : emit_move_insn (dest, cmp);
4422 2 : return;
4423 : }
4424 10016 : else if (op_false == CONST0_RTX (mode))
4425 : {
4426 903 : x = expand_simple_binop (mode, AND, cmp, op_true,
4427 : dest, 1, OPTAB_DIRECT);
4428 903 : if (x != dest)
4429 0 : emit_move_insn (dest, x);
4430 903 : return;
4431 : }
4432 9113 : else if (op_true == CONST0_RTX (mode))
4433 : {
4434 116 : op_false = force_reg (mode, op_false);
4435 116 : x = gen_rtx_NOT (mode, cmp);
4436 116 : ix86_emit_vec_binop (AND, mode, dest, x, op_false);
4437 116 : return;
4438 : }
4439 8997 : else if (vector_all_ones_operand (op_true, mode))
4440 : {
4441 2 : x = expand_simple_binop (mode, IOR, cmp, op_false,
4442 : dest, 1, OPTAB_DIRECT);
4443 2 : if (x != dest)
4444 0 : emit_move_insn (dest, x);
4445 2 : return;
4446 : }
4447 :
     /* With TARGET_XOP the selection is emitted directly as an
        IF_THEN_ELSE.  OP_TRUE is always put in a register; OP_FALSE is
        too when MODE is narrower than 16 bytes or OP_FALSE is not a
        nonimmediate operand. */
4448 8995 : if (TARGET_XOP)
4449 : {
4450 65 : op_true = force_reg (mode, op_true);
4451 :
4452 65 : if (GET_MODE_SIZE (mode) < 16
4453 65 : || !nonimmediate_operand (op_false, mode))
4454 49 : op_false = force_reg (mode, op_false);
4455 :
4456 65 : emit_insn (gen_rtx_SET (dest,
4457 : gen_rtx_IF_THEN_ELSE (mode, cmp,
4458 : op_true, op_false)));
4459 65 : return;
4460 : }
4461 :
     /* GEN is the blend generator to use, or NULL when none applies.
        BLEND_MODE is the mode the blend is carried out in; it differs
        from MODE for the cases below that reuse a QImode-element blend. */
4462 8930 : rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
4463 8930 : machine_mode blend_mode = mode;
4464 :
4465 8930 : if (GET_MODE_SIZE (mode) < 16
4466 8930 : || !vector_operand (op_true, mode))
4467 2297 : op_true = force_reg (mode, op_true);
4468 :
4469 8930 : op_false = force_reg (mode, op_false);
4470 :
     /* Pick the generator by MODE.  The cases up to V4DImode additionally
        require TARGET_SSE_MOVCC_USE_BLENDV together with TARGET_SSE4_1,
        TARGET_AVX or TARGET_AVX2; the blendm cases after them are
        selected unconditionally. */
4471 8930 : switch (mode)
4472 : {
4473 29 : case E_V2SFmode:
4474 29 : if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
4475 : gen = gen_mmx_blendvps;
4476 : break;
4477 320 : case E_V4SFmode:
4478 320 : if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
4479 : gen = gen_sse4_1_blendvps;
4480 : break;
4481 157 : case E_V2DFmode:
4482 157 : if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
4483 : gen = gen_sse4_1_blendvpd;
4484 : break;
4485 1097 : case E_SFmode:
4486 1097 : if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
4487 : gen = gen_sse4_1_blendvss;
4488 : break;
4489 818 : case E_DFmode:
4490 818 : if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
4491 : gen = gen_sse4_1_blendvsd;
4492 : break;
4493 222 : case E_V8QImode:
4494 222 : case E_V4HImode:
4495 222 : case E_V4HFmode:
4496 222 : case E_V4BFmode:
4497 222 : case E_V2SImode:
4498 222 : if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
4499 : {
4500 : gen = gen_mmx_pblendvb_v8qi;
4501 : blend_mode = V8QImode;
4502 : }
4503 : break;
4504 95 : case E_V4QImode:
4505 95 : case E_V2HImode:
4506 95 : case E_V2HFmode:
4507 95 : case E_V2BFmode:
4508 95 : if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
4509 : {
4510 : gen = gen_mmx_pblendvb_v4qi;
4511 : blend_mode = V4QImode;
4512 : }
4513 : break;
4514 36 : case E_V2QImode:
4515 36 : if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
4516 : gen = gen_mmx_pblendvb_v2qi;
4517 : break;
4518 5497 : case E_V16QImode:
4519 5497 : case E_V8HImode:
4520 5497 : case E_V8HFmode:
4521 5497 : case E_V8BFmode:
4522 5497 : case E_V4SImode:
4523 5497 : case E_V2DImode:
4524 5497 : case E_V1TImode:
4525 5497 : if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
4526 : {
4527 : gen = gen_sse4_1_pblendvb;
4528 : blend_mode = V16QImode;
4529 : }
4530 : break;
4531 99 : case E_V8SFmode:
4532 99 : if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
4533 : gen = gen_avx_blendvps256;
4534 : break;
4535 192 : case E_V4DFmode:
4536 192 : if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
4537 : gen = gen_avx_blendvpd256;
4538 : break;
4539 368 : case E_V32QImode:
4540 368 : case E_V16HImode:
4541 368 : case E_V16HFmode:
4542 368 : case E_V16BFmode:
4543 368 : case E_V8SImode:
4544 368 : case E_V4DImode:
4545 368 : if (TARGET_AVX2 && TARGET_SSE_MOVCC_USE_BLENDV)
4546 : {
4547 : gen = gen_avx2_pblendvb;
4548 : blend_mode = V32QImode;
4549 : }
4550 : break;
4551 :
4552 0 : case E_V64QImode:
4553 0 : gen = gen_avx512bw_blendmv64qi;
4554 0 : break;
4555 0 : case E_V32HImode:
4556 0 : gen = gen_avx512bw_blendmv32hi;
4557 0 : break;
4558 0 : case E_V32HFmode:
4559 0 : gen = gen_avx512bw_blendmv32hf;
4560 0 : break;
4561 0 : case E_V32BFmode:
4562 0 : gen = gen_avx512bw_blendmv32bf;
4563 0 : break;
4564 0 : case E_V16SImode:
4565 0 : gen = gen_avx512f_blendmv16si;
4566 0 : break;
4567 0 : case E_V8DImode:
4568 0 : gen = gen_avx512f_blendmv8di;
4569 0 : break;
4570 0 : case E_V8DFmode:
4571 0 : gen = gen_avx512f_blendmv8df;
4572 0 : break;
4573 : case E_V16SFmode:
4574 : gen = gen_avx512f_blendmv16sf;
4575 : break;
4576 :
4577 : default:
4578 : break;
4579 : }
4580 :
4581 0 : if (gen != NULL)
4582 : {
       /* When the blend runs in a different mode, blend into a fresh
          BLEND_MODE pseudo using lowparts of all three inputs and copy
          the lowpart of the result back into DEST afterwards. */
4583 2081 : if (blend_mode == mode)
4584 : x = dest;
4585 : else
4586 : {
4587 1016 : x = gen_reg_rtx (blend_mode);
4588 1016 : op_false = gen_lowpart (blend_mode, op_false);
4589 1016 : op_true = gen_lowpart (blend_mode, op_true);
4590 1016 : cmp = gen_lowpart (blend_mode, cmp);
4591 : }
4592 :
       /* Note the operand order: OP_FALSE is passed before OP_TRUE. */
4593 2081 : emit_insn (gen (x, op_false, op_true, cmp));
4594 :
4595 2081 : if (x != dest)
4596 1016 : emit_move_insn (dest, gen_lowpart (mode, x));
4597 : }
4598 : else
4599 : {
       /* No blend available:
          DEST = (OP_TRUE & CMP) | (~CMP & OP_FALSE). */
4600 6849 : rtx t2, t3;
4601 :
4602 6849 : t2 = expand_simple_binop (mode, AND, op_true, cmp,
4603 : NULL, 1, OPTAB_DIRECT);
4604 :
4605 6849 : t3 = gen_reg_rtx (mode);
4606 6849 : x = gen_rtx_NOT (mode, cmp);
4607 6849 : ix86_emit_vec_binop (AND, mode, t3, x, op_false);
4608 :
4609 6849 : x = expand_simple_binop (mode, IOR, t3, t2,
4610 : dest, 1, OPTAB_DIRECT);
4611 6849 : if (x != dest)
4612 0 : emit_move_insn (dest, x);
4613 : }
4614 : }
4615 :
4616 : /* Swap, force into registers, or otherwise massage the two operands
4617 : to an sse comparison with a mask result. Thus we differ a bit from
4618 : ix86_prepare_fp_compare_args which expects to produce a flags result.
4619 :
4620 : The DEST operand exists to help determine whether to commute commutative
4621 : operators. The POP0/POP1 operands are updated in place. The new
4622 : comparison code is returned, or UNKNOWN if not implementable. */
4623 :
4624 : static enum rtx_code
4625 16996 : ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
4626 : rtx *pop0, rtx *pop1)
4627 : {
4628 16996 : switch (code)
4629 : {
4630 67 : case LTGT:
4631 67 : case UNEQ:
4632 : /* AVX supports all the needed comparisons. */
4633 67 : if (TARGET_AVX)
4634 : break;
4635 : /* We have no LTGT as an operator. We could implement it with
4636 : NE & ORDERED, but this requires an extra temporary. It's
4637 : not clear that it's worth it. */
4638 : return UNKNOWN;
4639 :
4640 : case LT:
4641 : case LE:
4642 : case UNGT:
4643 : case UNGE:
4644 : /* These are supported directly. */
4645 : break;
4646 :
4647 5365 : case EQ:
4648 5365 : case NE:
4649 5365 : case UNORDERED:
4650 5365 : case ORDERED:
4651 : /* AVX has 3 operand comparisons, no need to swap anything. */
4652 5365 : if (TARGET_AVX)
4653 : break;
4654 : /* For commutative operators, try to canonicalize the destination
4655 : operand to be first in the comparison - this helps reload to
4656 : avoid extra moves. */
4657 790 : if (!dest || !rtx_equal_p (dest, *pop1))
4658 : break;
4659 : /* FALLTHRU */
4660 :
4661 10581 : case GE:
4662 10581 : case GT:
4663 10581 : case UNLE:
4664 10581 : case UNLT:
4665 : /* These are not supported directly before AVX, and furthermore
4666 : ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
4667 : comparison operands to transform into something that is
4668 : supported. */
4669 10581 : std::swap (*pop0, *pop1);
4670 10581 : code = swap_condition (code);
4671 10581 : break;
4672 :
4673 0 : default:
4674 0 : gcc_unreachable ();
4675 : }
4676 :
4677 : return code;
4678 : }
4679 :
4680 : /* Expand a floating-point conditional move. Return true if successful. */
4681 :
4682 : bool
4683 95697 : ix86_expand_fp_movcc (rtx operands[])
4684 : {
4685 95697 : machine_mode mode = GET_MODE (operands[0]);
4686 95697 : enum rtx_code code = GET_CODE (operands[1]);
4687 95697 : rtx tmp, compare_op;
4688 95697 : rtx op0 = XEXP (operands[1], 0);
4689 95697 : rtx op1 = XEXP (operands[1], 1);
4690 :
4691 95697 : if (GET_MODE (op0) == BFmode
4692 95697 : && !ix86_fp_comparison_operator (operands[1], VOIDmode))
4693 : return false;
4694 :
4695 95697 : if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
4696 : {
4697 65189 : machine_mode cmode;
4698 :
4699 : /* Since we've no cmove for sse registers, don't force bad register
4700 : allocation just to gain access to it. Deny movcc when the
4701 : comparison mode doesn't match the move mode. */
4702 65189 : cmode = GET_MODE (op0);
4703 65189 : if (cmode == VOIDmode)
4704 0 : cmode = GET_MODE (op1);
4705 65189 : if (cmode != mode)
4706 : return false;
4707 :
4708 9801 : code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
4709 9801 : if (code == UNKNOWN)
4710 : return false;
4711 :
4712 9781 : if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
4713 : operands[2], operands[3]))
4714 : return true;
4715 :
4716 2147 : tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
4717 : operands[2], operands[3]);
4718 2147 : ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
4719 2147 : return true;
4720 : }
4721 :
4722 30508 : if (GET_MODE (op0) == TImode
4723 30508 : || (GET_MODE (op0) == DImode
4724 72 : && !TARGET_64BIT))
4725 : return false;
4726 :
4727 : /* The floating point conditional move instructions don't directly
4728 : support conditions resulting from a signed integer comparison. */
4729 :
4730 30436 : compare_op = ix86_expand_compare (code, op0, op1);
4731 30436 : if (!fcmov_comparison_operator (compare_op, VOIDmode))
4732 : {
4733 146 : tmp = gen_reg_rtx (QImode);
4734 146 : ix86_expand_setcc (tmp, code, op0, op1);
4735 :
4736 146 : compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
4737 : }
4738 :
4739 30436 : operands[2] = force_reg (mode, operands[2]);
4740 30436 : operands[3] = force_reg (mode, operands[3]);
4741 30436 : emit_insn (gen_rtx_SET (operands[0],
4742 : gen_rtx_IF_THEN_ELSE (mode, compare_op,
4743 : operands[2], operands[3])));
4744 :
4745 30436 : return true;
4746 : }
4747 :
4748 : /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4749 :
4750 : static int
4751 4885 : ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4752 : {
4753 4885 : switch (code)
4754 : {
4755 : case EQ:
4756 : return 0;
4757 379 : case LT:
4758 379 : case LTU:
4759 379 : return 1;
4760 212 : case LE:
4761 212 : case LEU:
4762 212 : return 2;
4763 3072 : case NE:
4764 3072 : return 4;
4765 307 : case GE:
4766 307 : case GEU:
4767 307 : return 5;
4768 502 : case GT:
4769 502 : case GTU:
4770 502 : return 6;
4771 0 : default:
4772 0 : gcc_unreachable ();
4773 : }
4774 : }
4775 :
4776 : /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
4777 :
4778 : static int
4779 1785 : ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4780 : {
4781 1785 : switch (code)
4782 : {
4783 : case EQ:
4784 : return 0x00;
4785 354 : case NE:
4786 354 : return 0x04;
4787 514 : case GT:
4788 514 : return 0x0e;
4789 88 : case LE:
4790 88 : return 0x02;
4791 53 : case GE:
4792 53 : return 0x0d;
4793 624 : case LT:
4794 624 : return 0x01;
4795 2 : case UNLE:
4796 2 : return 0x0a;
4797 2 : case UNLT:
4798 2 : return 0x09;
4799 11 : case UNGE:
4800 11 : return 0x05;
4801 44 : case UNGT:
4802 44 : return 0x06;
4803 2 : case UNEQ:
4804 2 : return 0x18;
4805 0 : case LTGT:
4806 0 : return 0x0c;
4807 2 : case ORDERED:
4808 2 : return 0x07;
4809 2 : case UNORDERED:
4810 2 : return 0x03;
4811 0 : default:
4812 0 : gcc_unreachable ();
4813 : }
4814 : }
4815 :
4816 : /* Return immediate value to be used in UNSPEC_PCMP
4817 : for comparison CODE in MODE. */
4818 :
4819 : static int
4820 6670 : ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4821 : {
4822 6670 : if (FLOAT_MODE_P (mode))
4823 1785 : return ix86_fp_cmp_code_to_pcmp_immediate (code);
4824 4885 : return ix86_int_cmp_code_to_pcmp_immediate (code);
4825 : }
4826 :
4827 : /* Expand AVX-512 vector comparison. */
4828 :
4829 : bool
4830 6670 : ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
4831 : {
4832 6670 : machine_mode mask_mode = GET_MODE (dest);
4833 6670 : machine_mode cmp_mode = GET_MODE (cmp_op0);
4834 6670 : rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4835 6670 : int unspec_code;
4836 6670 : rtx unspec;
4837 :
4838 6670 : switch (code)
4839 : {
4840 : case LEU:
4841 : case GTU:
4842 : case GEU:
4843 : case LTU:
4844 : unspec_code = UNSPEC_UNSIGNED_PCMP;
4845 : break;
4846 :
4847 6256 : default:
4848 6256 : unspec_code = UNSPEC_PCMP;
4849 : }
4850 :
4851 6670 : unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4852 : unspec_code);
4853 6670 : emit_insn (gen_rtx_SET (dest, unspec));
4854 :
4855 6670 : return true;
4856 : }
4857 :
4858 : /* Expand fp vector comparison. */
4859 :
4860 : bool
4861 7195 : ix86_expand_fp_vec_cmp (rtx operands[])
4862 : {
4863 7195 : enum rtx_code code = GET_CODE (operands[1]);
4864 7195 : rtx cmp;
4865 :
4866 7195 : code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4867 : &operands[2], &operands[3]);
4868 7195 : if (code == UNKNOWN)
4869 : {
4870 20 : rtx temp;
4871 20 : switch (GET_CODE (operands[1]))
4872 : {
4873 2 : case LTGT:
4874 2 : temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4875 : operands[3], NULL, NULL);
4876 2 : cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4877 : operands[3], NULL, NULL);
4878 2 : code = AND;
4879 2 : break;
4880 18 : case UNEQ:
4881 18 : temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4882 : operands[3], NULL, NULL);
4883 18 : cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4884 : operands[3], NULL, NULL);
4885 18 : code = IOR;
4886 18 : break;
4887 0 : default:
4888 0 : gcc_unreachable ();
4889 : }
4890 20 : cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4891 : OPTAB_DIRECT);
4892 : }
4893 : else
4894 7175 : cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4895 : NULL, NULL);
4896 :
4897 7195 : if (operands[0] != cmp)
4898 7112 : emit_move_insn (operands[0], cmp);
4899 :
4900 7195 : return true;
4901 : }
4902 :
/* Emit an integer vector comparison of COP0 and COP1 under CODE and
   return the rtx produced by ix86_expand_sse_cmp (converted to DEST's
   mode via gen_lowpart when the comparison mode differs and the result
   came back in the comparison mode).

   DEST supplies the data mode and is the target passed to
   ix86_expand_sse_cmp when the data mode equals the comparison mode.
   OP_TRUE and OP_FALSE are the values forwarded to ix86_expand_sse_cmp;
   either may be NULL, in which case all-ones / zero constants stand in
   for them while deciding on the min-based rewrite below.

   *NEGATE is cleared on entry and left set when the comparison that was
   actually emitted is the inverse of the one requested; OP_TRUE and
   OP_FALSE are swapped locally in that case, and the caller has to
   account for the inversion as well.

   Returns NULL when MODE is V2DImode and the canonicalized comparison
   needs SSE4.1 (EQ) or SSE4.2 (GT/GTU) that is not enabled.  */

static rtx
ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
                         rtx op_true, rtx op_false, bool *negate)
{
  machine_mode data_mode = GET_MODE (dest);
  machine_mode mode = GET_MODE (cop0);
  rtx x;

  *negate = false;

  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
  if (TARGET_XOP
      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
      && GET_MODE_SIZE (mode) <= 16)
    ;
  /* AVX512F supports all of the comparisons
     on all 128/256/512-bit vector int types.  */
  else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
    ;
  else
    {
      /* Canonicalize the comparison to EQ, GT, GTU.  */
      switch (code)
        {
        case EQ:
        case GT:
        case GTU:
          break;

        case LE:
        case LEU:
          /* x <= cst can be handled as x < cst + 1 unless there is
             wrap around in cst + 1.  */
          if (CONST_VECTOR_P (cop1)
              && GET_MODE_INNER (mode) != TImode)
            {
              unsigned int n_elts = GET_MODE_NUNITS (mode), i;
              machine_mode eltmode = GET_MODE_INNER (mode);
              /* Scan for an element that blocks the rewrite; the loop
                 runs to completion (i == n_elts) only if none does.  */
              for (i = 0; i < n_elts; ++i)
                {
                  rtx elt = CONST_VECTOR_ELT (cop1, i);
                  if (!CONST_INT_P (elt))
                    break;
                  if (code == LE)
                    {
                      /* For LE punt if some element is signed maximum.
                         NOTE(review): the masked test looks like it also
                         matches an all-ones (-1) element, which would be
                         merely conservative - confirm.  */
                      if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
                          == (GET_MODE_MASK (eltmode) >> 1))
                        break;
                    }
                  /* For LEU punt if some element is unsigned maximum.  */
                  else if (elt == constm1_rtx)
                    break;
                }
              if (i == n_elts)
                {
                  /* Build cst + 1 element-wise, then swap the operands so
                     that x < cst + 1 is expressed as cst + 1 > x.  */
                  rtvec v = rtvec_alloc (n_elts);
                  for (i = 0; i < n_elts; ++i)
                    RTVEC_ELT (v, i)
                      = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
                                      eltmode);
                  cop1 = gen_rtx_CONST_VECTOR (mode, v);
                  std::swap (cop0, cop1);
                  code = code == LE ? GT : GTU;
                  break;
                }
            }
          /* FALLTHRU */
        case NE:
          /* Emit the reversed comparison and record that its result is
             the inverse of what was asked for.  */
          code = reverse_condition (code);
          *negate = true;
          break;

        case GE:
        case GEU:
          /* x >= cst can be handled as x > cst - 1 unless there is
             wrap around in cst - 1.  */
          if (CONST_VECTOR_P (cop1)
              && GET_MODE_INNER (mode) != TImode)
            {
              unsigned int n_elts = GET_MODE_NUNITS (mode), i;
              machine_mode eltmode = GET_MODE_INNER (mode);
              /* As above: i reaches n_elts only when every element
                 allows the rewrite.  */
              for (i = 0; i < n_elts; ++i)
                {
                  rtx elt = CONST_VECTOR_ELT (cop1, i);
                  if (!CONST_INT_P (elt))
                    break;
                  if (code == GE)
                    {
                      /* For GE punt if some element is signed minimum.  */
                      if (INTVAL (elt) < 0
                          && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
                              == 0))
                        break;
                    }
                  /* For GEU punt if some element is zero.  */
                  else if (elt == const0_rtx)
                    break;
                }
              if (i == n_elts)
                {
                  /* Build cst - 1 element-wise; no operand swap is needed
                     because x > cst - 1 is already in GT/GTU form.  */
                  rtvec v = rtvec_alloc (n_elts);
                  for (i = 0; i < n_elts; ++i)
                    RTVEC_ELT (v, i)
                      = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
                                      eltmode);
                  cop1 = gen_rtx_CONST_VECTOR (mode, v);
                  code = code == GE ? GT : GTU;
                  break;
                }
            }
          /* Otherwise reverse the condition, note the inversion, and let
             the LT/LTU case below swap the operands.  */
          code = reverse_condition (code);
          *negate = true;
          /* FALLTHRU */

        case LT:
        case LTU:
          /* Swap the operands and the condition to match.  */
          std::swap (cop0, cop1);
          code = swap_condition (code);
          break;

        default:
          gcc_unreachable ();
        }

      /* Only SSE4.1/SSE4.2 supports V2DImode.  */
      if (mode == V2DImode)
        {
          switch (code)
            {
            case EQ:
              /* SSE4.1 supports EQ.  */
              if (!TARGET_SSE4_1)
                return NULL;
              break;

            case GT:
            case GTU:
              /* SSE4.2 supports GT/GTU.  */
              if (!TARGET_SSE4_2)
                return NULL;
              break;

            default:
              gcc_unreachable ();
            }
        }

      /* Load a constant operand into a register; COP1 is only considered
         when COP0 is not a constant vector.  */
      if (CONST_VECTOR_P (cop0))
        cop0 = force_reg (mode, cop0);
      else if (CONST_VECTOR_P (cop1))
        cop1 = force_reg (mode, cop1);

      /* Effective select values for the test below: default to all-ones
         and zero when the caller passed none, and swap them if the
         comparison has been inverted so far.  */
      rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
      rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
      if (*negate)
        std::swap (optrue, opfalse);

      /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
         not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
         min (x, y) == x).  While we add one instruction (the minimum),
         we remove the need for two instructions in the negation, as the
         result is done this way.
         When using masks, do it for SI/DImode element types, as it is shorter
         than the two subtractions.  */
      if ((code != EQ
           && GET_MODE_SIZE (mode) != 64
           && vector_all_ones_operand (opfalse, data_mode)
           && optrue == CONST0_RTX (data_mode))
          || (code == GTU
              && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
              /* Don't do it if not using integer masks and we'd end up with
                 the right values in the registers though.  */
              && (GET_MODE_SIZE (mode) == 64
                  || !vector_all_ones_operand (optrue, data_mode)
                  || opfalse != CONST0_RTX (data_mode))))
        {
          /* Minimum generator for this mode; stays NULL when no suitable
             min pattern is selected for the enabled ISA, in which case
             the rewrite is skipped.  */
          rtx (*gen) (rtx, rtx, rtx) = NULL;

          switch (mode)
            {
            case E_V16SImode:
              gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
              break;
            case E_V8DImode:
              gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
              cop0 = force_reg (mode, cop0);
              cop1 = force_reg (mode, cop1);
              break;
            case E_V32QImode:
              if (TARGET_AVX2)
                gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
              break;
            case E_V16HImode:
              if (TARGET_AVX2)
                gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
              break;
            case E_V8SImode:
              if (TARGET_AVX2)
                gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
              break;
            case E_V4DImode:
              if (TARGET_AVX512VL)
                {
                  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
                  cop0 = force_reg (mode, cop0);
                  cop1 = force_reg (mode, cop1);
                }
              break;
            case E_V16QImode:
              if (code == GTU && TARGET_SSE2)
                gen = gen_uminv16qi3;
              else if (code == GT && TARGET_SSE4_1)
                gen = gen_sminv16qi3;
              break;
            case E_V8QImode:
              if (code == GTU && TARGET_SSE2)
                gen = gen_uminv8qi3;
              else if (code == GT && TARGET_SSE4_1)
                gen = gen_sminv8qi3;
              break;
            case E_V4QImode:
              if (code == GTU && TARGET_SSE2)
                gen = gen_uminv4qi3;
              else if (code == GT && TARGET_SSE4_1)
                gen = gen_sminv4qi3;
              break;
            case E_V2QImode:
              if (code == GTU && TARGET_SSE2)
                gen = gen_uminv2qi3;
              else if (code == GT && TARGET_SSE4_1)
                gen = gen_sminv2qi3;
              break;
            case E_V8HImode:
              if (code == GTU && TARGET_SSE4_1)
                gen = gen_uminv8hi3;
              else if (code == GT && TARGET_SSE2)
                gen = gen_sminv8hi3;
              break;
            case E_V4HImode:
              if (code == GTU && TARGET_SSE4_1)
                gen = gen_uminv4hi3;
              else if (code == GT && TARGET_SSE2)
                gen = gen_sminv4hi3;
              break;
            case E_V2HImode:
              if (code == GTU && TARGET_SSE4_1)
                gen = gen_uminv2hi3;
              else if (code == GT && TARGET_SSE2)
                gen = gen_sminv2hi3;
              break;
            case E_V4SImode:
              if (TARGET_SSE4_1)
                gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
              break;
            case E_V2SImode:
              if (TARGET_SSE4_1)
                gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
              break;
            case E_V2DImode:
              if (TARGET_AVX512VL)
                {
                  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
                  cop0 = force_reg (mode, cop0);
                  cop1 = force_reg (mode, cop1);
                }
              break;
            default:
              break;
            }

          if (gen)
            {
              /* Compute tem = min (cop0, cop1) and compare cop0 == tem
                 instead; this tests the opposite of cop0 > cop1, hence
                 the flip of *negate.  */
              rtx tem = gen_reg_rtx (mode);
              if (!vector_operand (cop0, mode))
                cop0 = force_reg (mode, cop0);
              if (!vector_operand (cop1, mode))
                cop1 = force_reg (mode, cop1);
              *negate = !*negate;
              emit_insn (gen (tem, cop0, cop1));
              cop1 = tem;
              code = EQ;
            }
        }

      /* Unsigned parallel compare is not supported by the hardware.
         Play some tricks to turn this into a signed comparison
         against 0.  */
      if (code == GTU)
        {
          cop0 = force_reg (mode, cop0);

          switch (mode)
            {
            case E_V16SImode:
            case E_V8DImode:
            case E_V8SImode:
            case E_V4DImode:
            case E_V4SImode:
            case E_V2SImode:
            case E_V2DImode:
              {
                rtx t1, t2, mask;

                /* Subtract (-(INT MAX) - 1) from both operands to make
                   them signed.  */
                mask = ix86_build_signbit_mask (mode, true, false);
                t1 = gen_reg_rtx (mode);
                emit_insn (gen_sub3_insn (t1, cop0, mask));

                t2 = gen_reg_rtx (mode);
                emit_insn (gen_sub3_insn (t2, cop1, mask));

                cop0 = t1;
                cop1 = t2;
                code = GT;
              }
              break;

            case E_V64QImode:
            case E_V32HImode:
            case E_V32QImode:
            case E_V16HImode:
            case E_V16QImode:
            case E_V8QImode:
            case E_V4QImode:
            case E_V2QImode:
            case E_V8HImode:
            case E_V4HImode:
            case E_V2HImode:
              /* Perform a parallel unsigned saturating subtraction.
                 The difference is then compared for equality with zero,
                 which is the inverse of cop0 > cop1, so *negate is
                 flipped.  */
              x = gen_reg_rtx (mode);
              emit_insn (gen_rtx_SET
                         (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
              cop0 = x;
              cop1 = CONST0_RTX (mode);
              code = EQ;
              *negate = !*negate;
              break;

            default:
              gcc_unreachable ();
            }
        }
    }

  /* With an inverted comparison the select values trade places.  */
  if (*negate)
    std::swap (op_true, op_false);

  if (CONST_VECTOR_P (cop1))
    cop1 = force_reg (mode, cop1);

  /* Allow the comparison to be done in one mode, but the movcc to
     happen in another mode.  */
  if (data_mode == mode)
    x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
  else
    {
      gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
      /* Compare into a fresh register of the comparison mode, and view
         the result in DATA_MODE if it came back in MODE.  */
      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
                               op_true, op_false);
      if (GET_MODE (x) == mode)
        x = gen_lowpart (data_mode, x);
    }

  return x;
}
5270 :
5271 : /* Expand integer vector comparison. */
5272 :
5273 : bool
5274 10393 : ix86_expand_int_vec_cmp (rtx operands[])
5275 : {
5276 10393 : rtx_code code = GET_CODE (operands[1]);
5277 10393 : bool negate = false;
5278 10393 : rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
5279 : operands[3], NULL, NULL, &negate);
5280 :
5281 10393 : if (!cmp)
5282 : return false;
5283 :
5284 10393 : if (negate)
5285 : {
5286 3630 : if (TARGET_AVX512F && GET_MODE_SIZE (GET_MODE (cmp)) >= 16)
5287 91 : cmp = gen_rtx_XOR (GET_MODE (cmp), cmp, CONSTM1_RTX (GET_MODE (cmp)));
5288 : else
5289 : {
5290 6826 : cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
5291 3413 : CONST0_RTX (GET_MODE (cmp)),
5292 : NULL, NULL, &negate);
5293 3413 : gcc_assert (!negate);
5294 : }
5295 : }
5296 :
5297 10393 : if (operands[0] != cmp)
5298 10098 : emit_move_insn (operands[0], cmp);
5299 :
5300 : return true;
5301 : }
5302 :
5303 : /* Expand a floating-point vector conditional move; a vcond operation
5304 : rather than a movcc operation. */
5305 :
5306 : bool
5307 0 : ix86_expand_fp_vcond (rtx operands[])
5308 : {
5309 0 : enum rtx_code code = GET_CODE (operands[3]);
5310 0 : rtx cmp;
5311 :
5312 0 : code = ix86_prepare_sse_fp_compare_args (operands[0], code,
5313 : &operands[4], &operands[5]);
5314 0 : if (code == UNKNOWN)
5315 : {
5316 0 : rtx temp;
5317 0 : switch (GET_CODE (operands[3]))
5318 : {
5319 0 : case LTGT:
5320 0 : temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
5321 : operands[5], operands[0], operands[0]);
5322 0 : cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
5323 : operands[5], operands[1], operands[2]);
5324 0 : code = AND;
5325 0 : break;
5326 0 : case UNEQ:
5327 0 : temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
5328 : operands[5], operands[0], operands[0]);
5329 0 : cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
5330 : operands[5], operands[1], operands[2]);
5331 0 : code = IOR;
5332 0 : break;
5333 0 : default:
5334 0 : gcc_unreachable ();
5335 : }
5336 0 : cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
5337 : OPTAB_DIRECT);
5338 0 : ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
5339 0 : return true;
5340 : }
5341 :
5342 0 : if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
5343 : operands[5], operands[1], operands[2]))
5344 : return true;
5345 :
5346 0 : cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
5347 : operands[1], operands[2]);
5348 0 : ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
5349 0 : return true;
5350 : }
5351 :
5352 : /* Expand a signed/unsigned integral vector conditional move. */
5353 :
5354 : bool
5355 3349 : ix86_expand_int_vcond (rtx operands[])
5356 : {
5357 3349 : machine_mode data_mode = GET_MODE (operands[0]);
5358 3349 : machine_mode mode = GET_MODE (operands[4]);
5359 3349 : enum rtx_code code = GET_CODE (operands[3]);
5360 3349 : bool negate = false;
5361 3349 : rtx x, cop0, cop1;
5362 :
5363 3349 : cop0 = operands[4];
5364 3349 : cop1 = operands[5];
5365 :
5366 : /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
5367 : and x < 0 ? 1 : 0 into (unsigned) x >> 31. */
5368 3349 : if ((code == LT || code == GE)
5369 0 : && data_mode == mode
5370 0 : && cop1 == CONST0_RTX (mode)
5371 0 : && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
5372 0 : && GET_MODE_UNIT_SIZE (data_mode) > 1
5373 0 : && GET_MODE_UNIT_SIZE (data_mode) <= 8
5374 3349 : && (GET_MODE_SIZE (data_mode) == 16
5375 0 : || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
5376 : {
5377 0 : rtx negop = operands[2 - (code == LT)];
5378 0 : int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
5379 0 : if (negop == CONST1_RTX (data_mode))
5380 : {
5381 0 : rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
5382 : operands[0], 1, OPTAB_DIRECT);
5383 0 : if (res != operands[0])
5384 0 : emit_move_insn (operands[0], res);
5385 0 : return true;
5386 : }
5387 0 : else if (GET_MODE_INNER (data_mode) != DImode
5388 0 : && vector_all_ones_operand (negop, data_mode))
5389 : {
5390 0 : rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
5391 : operands[0], 0, OPTAB_DIRECT);
5392 0 : if (res != operands[0])
5393 0 : emit_move_insn (operands[0], res);
5394 0 : return true;
5395 : }
5396 : }
5397 :
5398 3349 : if (!nonimmediate_operand (cop1, mode))
5399 126 : cop1 = force_reg (mode, cop1);
5400 3349 : if (!general_operand (operands[1], data_mode))
5401 0 : operands[1] = force_reg (data_mode, operands[1]);
5402 3349 : if (!general_operand (operands[2], data_mode))
5403 0 : operands[2] = force_reg (data_mode, operands[2]);
5404 :
5405 3349 : x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
5406 : operands[1], operands[2], &negate);
5407 :
5408 3349 : if (!x)
5409 : return false;
5410 :
5411 3349 : ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
5412 3349 : operands[2-negate]);
5413 3349 : return true;
5414 : }
5415 :
5416 : static bool
5417 123160 : ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
5418 : struct expand_vec_perm_d *d)
5419 : {
5420 : /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5421 : expander, so args are either in d, or in op0, op1 etc. */
5422 123160 : machine_mode mode = GET_MODE (d ? d->op0 : op0);
5423 123160 : machine_mode maskmode = mode;
5424 123160 : rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
5425 :
5426 123160 : switch (mode)
5427 : {
5428 23432 : case E_V16QImode:
5429 23432 : if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5430 : gen = gen_avx512vl_vpermt2varv16qi3;
5431 : break;
5432 521 : case E_V32QImode:
5433 521 : if (TARGET_AVX512VL && TARGET_AVX512VBMI)
5434 : gen = gen_avx512vl_vpermt2varv32qi3;
5435 : break;
5436 198 : case E_V64QImode:
5437 198 : if (TARGET_AVX512VBMI)
5438 : gen = gen_avx512bw_vpermt2varv64qi3;
5439 : break;
5440 13191 : case E_V8HImode:
5441 13191 : if (TARGET_AVX512VL && TARGET_AVX512BW)
5442 : gen = gen_avx512vl_vpermt2varv8hi3;
5443 : break;
5444 775 : case E_V16HImode:
5445 775 : if (TARGET_AVX512VL && TARGET_AVX512BW)
5446 : gen = gen_avx512vl_vpermt2varv16hi3;
5447 : break;
5448 331 : case E_V32HImode:
5449 331 : if (TARGET_AVX512BW)
5450 : gen = gen_avx512bw_vpermt2varv32hi3;
5451 : break;
5452 33346 : case E_V4SImode:
5453 33346 : if (TARGET_AVX512VL)
5454 : gen = gen_avx512vl_vpermt2varv4si3;
5455 : break;
5456 1169 : case E_V8SImode:
5457 1169 : if (TARGET_AVX512VL)
5458 : gen = gen_avx512vl_vpermt2varv8si3;
5459 : break;
5460 126 : case E_V16SImode:
5461 126 : if (TARGET_AVX512F)
5462 : gen = gen_avx512f_vpermt2varv16si3;
5463 : break;
5464 10333 : case E_V4SFmode:
5465 10333 : if (TARGET_AVX512VL)
5466 : {
5467 : gen = gen_avx512vl_vpermt2varv4sf3;
5468 : maskmode = V4SImode;
5469 : }
5470 : break;
5471 6063 : case E_V8SFmode:
5472 6063 : if (TARGET_AVX512VL)
5473 : {
5474 : gen = gen_avx512vl_vpermt2varv8sf3;
5475 : maskmode = V8SImode;
5476 : }
5477 : break;
5478 239 : case E_V16SFmode:
5479 239 : if (TARGET_AVX512F)
5480 : {
5481 : gen = gen_avx512f_vpermt2varv16sf3;
5482 : maskmode = V16SImode;
5483 : }
5484 : break;
5485 2 : case E_V2DImode:
5486 2 : if (TARGET_AVX512VL)
5487 : gen = gen_avx512vl_vpermt2varv2di3;
5488 : break;
5489 292 : case E_V4DImode:
5490 292 : if (TARGET_AVX512VL)
5491 : gen = gen_avx512vl_vpermt2varv4di3;
5492 : break;
5493 10 : case E_V8DImode:
5494 10 : if (TARGET_AVX512F)
5495 : gen = gen_avx512f_vpermt2varv8di3;
5496 : break;
5497 2 : case E_V2DFmode:
5498 2 : if (TARGET_AVX512VL)
5499 : {
5500 : gen = gen_avx512vl_vpermt2varv2df3;
5501 : maskmode = V2DImode;
5502 : }
5503 : break;
5504 1848 : case E_V4DFmode:
5505 1848 : if (TARGET_AVX512VL)
5506 : {
5507 : gen = gen_avx512vl_vpermt2varv4df3;
5508 : maskmode = V4DImode;
5509 : }
5510 : break;
5511 186 : case E_V8DFmode:
5512 186 : if (TARGET_AVX512F)
5513 : {
5514 : gen = gen_avx512f_vpermt2varv8df3;
5515 : maskmode = V8DImode;
5516 : }
5517 : break;
5518 : default:
5519 : break;
5520 : }
5521 :
5522 : if (gen == NULL)
5523 : return false;
5524 :
5525 909 : if (d && d->testing_p)
5526 : return true;
5527 :
5528 : /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
5529 : expander, so args are either in d, or in op0, op1 etc. */
5530 898 : if (d)
5531 : {
5532 898 : rtx vec[64];
5533 898 : target = d->target;
5534 898 : op0 = d->op0;
5535 898 : op1 = d->op1;
5536 15622 : for (int i = 0; i < d->nelt; ++i)
5537 14724 : vec[i] = GEN_INT (d->perm[i]);
5538 898 : mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
5539 : }
5540 :
5541 906 : emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
5542 906 : return true;
5543 : }
5544 :
5545 : /* Expand a variable vector permutation. */
5546 :
5547 : void
5548 18 : ix86_expand_vec_perm (rtx operands[])
5549 : {
5550 18 : rtx target = operands[0];
5551 18 : rtx op0 = operands[1];
5552 18 : rtx op1 = operands[2];
5553 18 : rtx mask = operands[3];
5554 18 : rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
5555 18 : machine_mode mode = GET_MODE (op0);
5556 18 : machine_mode maskmode = GET_MODE (mask);
5557 18 : int w, e, i;
5558 18 : bool one_operand_shuffle = rtx_equal_p (op0, op1);
5559 :
5560 : /* Number of elements in the vector. */
5561 18 : w = GET_MODE_NUNITS (mode);
5562 18 : e = GET_MODE_UNIT_SIZE (mode);
5563 18 : gcc_assert (w <= 64);
5564 :
5565 : /* For HF mode vector, convert it to HI using subreg. */
5566 36 : if (GET_MODE_INNER (mode) == HFmode)
5567 : {
5568 6 : machine_mode orig_mode = mode;
5569 6 : mode = mode_for_vector (HImode, w).require ();
5570 6 : target = lowpart_subreg (mode, target, orig_mode);
5571 6 : op0 = lowpart_subreg (mode, op0, orig_mode);
5572 6 : op1 = lowpart_subreg (mode, op1, orig_mode);
5573 : }
5574 :
5575 18 : if (TARGET_AVX512F && one_operand_shuffle)
5576 : {
5577 5 : rtx (*gen) (rtx, rtx, rtx) = NULL;
5578 5 : switch (mode)
5579 : {
5580 : case E_V16SImode:
5581 : gen = gen_avx512f_permvarv16si;
5582 : break;
5583 0 : case E_V16SFmode:
5584 0 : gen = gen_avx512f_permvarv16sf;
5585 0 : break;
5586 0 : case E_V8DImode:
5587 0 : gen = gen_avx512f_permvarv8di;
5588 0 : break;
5589 0 : case E_V8DFmode:
5590 0 : gen = gen_avx512f_permvarv8df;
5591 0 : break;
5592 : default:
5593 : break;
5594 : }
5595 0 : if (gen != NULL)
5596 : {
5597 0 : emit_insn (gen (target, op0, mask));
5598 16 : return;
5599 : }
5600 : }
5601 :
5602 18 : if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
5603 : return;
5604 :
5605 10 : if (TARGET_AVX2)
5606 : {
5607 5 : if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
5608 : {
5609 : /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5610 : an constant shuffle operand. With a tiny bit of effort we can
5611 : use VPERMD instead. A re-interpretation stall for V4DFmode is
5612 : unfortunate but there's no avoiding it.
5613 : Similarly for V16HImode we don't have instructions for variable
5614 : shuffling, while for V32QImode we can use after preparing suitable
5615 : masks vpshufb; vpshufb; vpermq; vpor. */
5616 :
5617 : if (mode == V16HImode)
5618 : {
5619 : maskmode = mode = V32QImode;
5620 : w = 32;
5621 : e = 1;
5622 : }
5623 : else
5624 : {
5625 : maskmode = mode = V8SImode;
5626 : w = 8;
5627 : e = 4;
5628 : }
5629 0 : t1 = gen_reg_rtx (maskmode);
5630 :
5631 : /* Replicate the low bits of the V4DImode mask into V8SImode:
5632 : mask = { A B C D }
5633 : t1 = { A A B B C C D D }. */
5634 0 : for (i = 0; i < w / 2; ++i)
5635 0 : vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
5636 0 : vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5637 0 : vt = force_reg (maskmode, vt);
5638 0 : mask = gen_lowpart (maskmode, mask);
5639 0 : if (maskmode == V8SImode)
5640 0 : emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
5641 : else
5642 0 : emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
5643 :
5644 : /* Multiply the shuffle indicies by two. */
5645 0 : t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
5646 : OPTAB_DIRECT);
5647 :
5648 : /* Add one to the odd shuffle indicies:
5649 : t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
5650 0 : for (i = 0; i < w / 2; ++i)
5651 : {
5652 0 : vec[i * 2] = const0_rtx;
5653 0 : vec[i * 2 + 1] = const1_rtx;
5654 : }
5655 0 : vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5656 0 : vt = validize_mem (force_const_mem (maskmode, vt));
5657 0 : t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
5658 : OPTAB_DIRECT);
5659 :
5660 : /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5661 0 : operands[3] = mask = t1;
5662 0 : target = gen_reg_rtx (mode);
5663 0 : op0 = gen_lowpart (mode, op0);
5664 0 : op1 = gen_lowpart (mode, op1);
5665 : }
5666 :
5667 5 : switch (mode)
5668 : {
5669 1 : case E_V8SImode:
5670 : /* The VPERMD and VPERMPS instructions already properly ignore
5671 : the high bits of the shuffle elements. No need for us to
5672 : perform an AND ourselves. */
5673 1 : if (one_operand_shuffle)
5674 : {
5675 0 : emit_insn (gen_avx2_permvarv8si (target, op0, mask));
5676 0 : if (target != operands[0])
5677 0 : emit_move_insn (operands[0],
5678 0 : gen_lowpart (GET_MODE (operands[0]), target));
5679 : }
5680 : else
5681 : {
5682 1 : t1 = gen_reg_rtx (V8SImode);
5683 1 : t2 = gen_reg_rtx (V8SImode);
5684 1 : emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
5685 1 : emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
5686 1 : goto merge_two;
5687 : }
5688 0 : return;
5689 :
5690 0 : case E_V8SFmode:
5691 0 : mask = gen_lowpart (V8SImode, mask);
5692 0 : if (one_operand_shuffle)
5693 0 : emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
5694 : else
5695 : {
5696 0 : t1 = gen_reg_rtx (V8SFmode);
5697 0 : t2 = gen_reg_rtx (V8SFmode);
5698 0 : emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
5699 0 : emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
5700 0 : goto merge_two;
5701 : }
5702 0 : return;
5703 :
5704 1 : case E_V4SImode:
5705 1 : if (one_operand_shuffle)
5706 : break; /* Handled below for TARGET_AVX. */
5707 : /* By combining the two 128-bit input vectors into one 256-bit
5708 : input vector, we can use VPERMD and VPERMPS for the full
5709 : two-operand shuffle. */
5710 0 : t1 = gen_reg_rtx (V8SImode);
5711 0 : t2 = gen_reg_rtx (V8SImode);
5712 0 : emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
5713 0 : emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5714 0 : emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
5715 0 : emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
5716 0 : return;
5717 :
5718 1 : case E_V4SFmode:
5719 1 : if (one_operand_shuffle)
5720 : break; /* Handled below for TARGET_AVX. */
5721 0 : t1 = gen_reg_rtx (V8SFmode);
5722 0 : t2 = gen_reg_rtx (V8SImode);
5723 0 : mask = gen_lowpart (V4SImode, mask);
5724 0 : emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5725 0 : emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5726 0 : emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5727 0 : emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5728 0 : return;
5729 :
5730 0 : case E_V32QImode:
5731 0 : t1 = gen_reg_rtx (V32QImode);
5732 0 : t2 = gen_reg_rtx (V32QImode);
5733 0 : t3 = gen_reg_rtx (V32QImode);
5734 0 : vt2 = GEN_INT (-128);
5735 0 : vt = gen_const_vec_duplicate (V32QImode, vt2);
5736 0 : vt = force_reg (V32QImode, vt);
5737 0 : for (i = 0; i < 32; i++)
5738 0 : vec[i] = i < 16 ? vt2 : const0_rtx;
5739 0 : vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5740 0 : vt2 = force_reg (V32QImode, vt2);
5741 : /* From mask create two adjusted masks, which contain the same
5742 : bits as mask in the low 7 bits of each vector element.
5743 : The first mask will have the most significant bit clear
5744 : if it requests element from the same 128-bit lane
5745 : and MSB set if it requests element from the other 128-bit lane.
5746 : The second mask will have the opposite values of the MSB,
5747 : and additionally will have its 128-bit lanes swapped.
5748 : E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5749 : t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5750 : t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5751 : stands for other 12 bytes. */
5752 : /* The bit whether element is from the same lane or the other
5753 : lane is bit 4, so shift it up by 3 to the MSB position. */
5754 0 : t5 = gen_reg_rtx (V4DImode);
5755 0 : emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5756 : GEN_INT (3)));
5757 : /* Clear MSB bits from the mask just in case it had them set. */
5758 0 : emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5759 : /* After this t1 will have MSB set for elements from other lane. */
5760 0 : emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5761 : /* Clear bits other than MSB. */
5762 0 : emit_insn (gen_andv32qi3 (t1, t1, vt));
5763 : /* Or in the lower bits from mask into t3. */
5764 0 : emit_insn (gen_iorv32qi3 (t3, t1, t2));
5765 : /* And invert MSB bits in t1, so MSB is set for elements from the same
5766 : lane. */
5767 0 : emit_insn (gen_xorv32qi3 (t1, t1, vt));
5768 : /* Swap 128-bit lanes in t3. */
5769 0 : t6 = gen_reg_rtx (V4DImode);
5770 0 : emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5771 : const2_rtx, GEN_INT (3),
5772 : const0_rtx, const1_rtx));
5773 : /* And or in the lower bits from mask into t1. */
5774 0 : emit_insn (gen_iorv32qi3 (t1, t1, t2));
5775 0 : if (one_operand_shuffle)
5776 : {
5777 : /* Each of these shuffles will put 0s in places where
5778 : element from the other 128-bit lane is needed, otherwise
5779 : will shuffle in the requested value. */
5780 0 : emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5781 0 : gen_lowpart (V32QImode, t6)));
5782 0 : emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5783 : /* For t3 the 128-bit lanes are swapped again. */
5784 0 : t7 = gen_reg_rtx (V4DImode);
5785 0 : emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5786 : const2_rtx, GEN_INT (3),
5787 : const0_rtx, const1_rtx));
5788 : /* And oring both together leads to the result. */
5789 0 : emit_insn (gen_iorv32qi3 (target, t1,
5790 0 : gen_lowpart (V32QImode, t7)));
5791 0 : if (target != operands[0])
5792 0 : emit_move_insn (operands[0],
5793 0 : gen_lowpart (GET_MODE (operands[0]), target));
5794 0 : return;
5795 : }
5796 :
5797 0 : t4 = gen_reg_rtx (V32QImode);
5798 : /* Similarly to the above one_operand_shuffle code,
5799 : just for repeated twice for each operand. merge_two:
5800 : code will merge the two results together. */
5801 0 : emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5802 0 : gen_lowpart (V32QImode, t6)));
5803 0 : emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5804 0 : gen_lowpart (V32QImode, t6)));
5805 0 : emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5806 0 : emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5807 0 : t7 = gen_reg_rtx (V4DImode);
5808 0 : emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5809 : const2_rtx, GEN_INT (3),
5810 : const0_rtx, const1_rtx));
5811 0 : t8 = gen_reg_rtx (V4DImode);
5812 0 : emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5813 : const2_rtx, GEN_INT (3),
5814 : const0_rtx, const1_rtx));
5815 0 : emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5816 0 : emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5817 0 : t1 = t4;
5818 0 : t2 = t3;
5819 0 : goto merge_two;
5820 :
5821 2 : default:
5822 4 : gcc_assert (GET_MODE_SIZE (mode) <= 16);
5823 : break;
5824 : }
5825 : }
5826 :
5827 9 : if (TARGET_AVX && one_operand_shuffle)
5828 8 : switch (mode)
5829 : {
5830 2 : case V4SImode:
5831 2 : op0 = gen_lowpart (V4SFmode, op0);
5832 2 : t1 = gen_reg_rtx (V4SFmode);
5833 2 : emit_insn (gen_avx_vpermilvarv4sf3 (t1, op0, mask));
5834 2 : emit_move_insn (target, gen_lowpart (mode, t1));
5835 2 : return;
5836 2 : case V4SFmode:
5837 2 : emit_insn (gen_avx_vpermilvarv4sf3 (target, op0, mask));
5838 2 : return;
5839 2 : case V2DImode:
5840 2 : op0 = gen_lowpart (V2DFmode, op0);
5841 2 : t1 = gen_reg_rtx (V2DImode);
5842 2 : t2 = gen_reg_rtx (V2DFmode);
5843 2 : emit_insn (gen_addv2di3 (t1, mask, mask));
5844 2 : emit_insn (gen_avx_vpermilvarv2df3 (t2, op0, t1));
5845 2 : emit_move_insn (target, gen_lowpart (mode, t2));
5846 2 : return;
5847 2 : case V2DFmode:
5848 2 : t1 = gen_reg_rtx (V2DImode);
5849 2 : emit_insn (gen_addv2di3 (t1, mask, mask));
5850 2 : emit_insn (gen_avx_vpermilvarv2df3 (target, op0, t1));
5851 2 : return;
5852 : default:
5853 : break;
5854 : }
5855 :
5856 1 : if (TARGET_XOP)
5857 : {
5858 : /* The XOP VPPERM insn supports three inputs. By ignoring the
5859 : one_operand_shuffle special case, we avoid creating another
5860 : set of constant vectors in memory. */
5861 0 : one_operand_shuffle = false;
5862 :
5863 : /* mask = mask & {2*w-1, ...} */
5864 0 : vt = GEN_INT (2*w - 1);
5865 : }
5866 : else
5867 : {
5868 : /* mask = mask & {w-1, ...} */
5869 1 : vt = GEN_INT (w - 1);
5870 : }
5871 :
5872 1 : vt = gen_const_vec_duplicate (maskmode, vt);
5873 1 : mask = expand_simple_binop (maskmode, AND, mask, vt,
5874 : NULL_RTX, 0, OPTAB_DIRECT);
5875 :
5876 : /* For non-QImode operations, convert the word permutation control
5877 : into a byte permutation control. */
5878 1 : if (mode != V16QImode)
5879 : {
5880 1 : mask = expand_simple_binop (maskmode, ASHIFT, mask,
5881 2 : GEN_INT (exact_log2 (e)),
5882 : NULL_RTX, 0, OPTAB_DIRECT);
5883 :
5884 : /* Convert mask to vector of chars. */
5885 1 : mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5886 :
5887 : /* Replicate each of the input bytes into byte positions:
5888 : (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5889 : (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5890 : (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5891 18 : for (i = 0; i < 16; ++i)
5892 16 : vec[i] = GEN_INT (i/e * e);
5893 1 : vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5894 1 : vt = validize_mem (force_const_mem (V16QImode, vt));
5895 1 : if (TARGET_XOP)
5896 0 : emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5897 : else
5898 1 : emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5899 :
5900 : /* Convert it into the byte positions by doing
5901 : mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5902 17 : for (i = 0; i < 16; ++i)
5903 16 : vec[i] = GEN_INT (i % e);
5904 1 : vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5905 1 : vt = validize_mem (force_const_mem (V16QImode, vt));
5906 1 : emit_insn (gen_addv16qi3 (mask, mask, vt));
5907 : }
5908 :
5909 : /* The actual shuffle operations all operate on V16QImode. */
5910 1 : op0 = gen_lowpart (V16QImode, op0);
5911 1 : op1 = gen_lowpart (V16QImode, op1);
5912 :
5913 1 : if (TARGET_XOP)
5914 : {
5915 0 : if (GET_MODE (target) != V16QImode)
5916 0 : target = gen_reg_rtx (V16QImode);
5917 0 : emit_insn (gen_xop_pperm (target, op0, op1, mask));
5918 0 : if (target != operands[0])
5919 0 : emit_move_insn (operands[0],
5920 0 : gen_lowpart (GET_MODE (operands[0]), target));
5921 : }
5922 1 : else if (one_operand_shuffle)
5923 : {
5924 1 : if (GET_MODE (target) != V16QImode)
5925 1 : target = gen_reg_rtx (V16QImode);
5926 1 : emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5927 1 : if (target != operands[0])
5928 1 : emit_move_insn (operands[0],
5929 1 : gen_lowpart (GET_MODE (operands[0]), target));
5930 : }
5931 : else
5932 : {
5933 0 : rtx xops[6];
5934 0 : bool ok;
5935 :
5936 : /* Shuffle the two input vectors independently. */
5937 0 : t1 = gen_reg_rtx (V16QImode);
5938 0 : t2 = gen_reg_rtx (V16QImode);
5939 0 : emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5940 0 : emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5941 :
5942 1 : merge_two:
5943 : /* Then merge them together. The key is whether any given control
5944 : element contained a bit set that indicates the second word. */
5945 1 : mask = operands[3];
5946 1 : vt = GEN_INT (w);
5947 1 : if (maskmode == V2DImode && !TARGET_SSE4_1)
5948 : {
5949 : /* Without SSE4.1, we don't have V2DImode EQ. Perform one
5950 : more shuffle to convert the V2DI input mask into a V4SI
5951 : input mask. At which point the masking that expand_int_vcond
5952 : will work as desired. */
5953 0 : rtx t3 = gen_reg_rtx (V4SImode);
5954 0 : emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5955 : const0_rtx, const0_rtx,
5956 : const2_rtx, const2_rtx));
5957 0 : mask = t3;
5958 0 : maskmode = V4SImode;
5959 0 : e = w = 4;
5960 : }
5961 :
5962 1 : vt = gen_const_vec_duplicate (maskmode, vt);
5963 1 : vt = force_reg (maskmode, vt);
5964 1 : mask = expand_simple_binop (maskmode, AND, mask, vt,
5965 : NULL_RTX, 0, OPTAB_DIRECT);
5966 :
5967 1 : if (GET_MODE (target) != mode)
5968 0 : target = gen_reg_rtx (mode);
5969 1 : xops[0] = target;
5970 1 : xops[1] = gen_lowpart (mode, t2);
5971 1 : xops[2] = gen_lowpart (mode, t1);
5972 1 : xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5973 1 : xops[4] = mask;
5974 1 : xops[5] = vt;
5975 1 : ok = ix86_expand_int_vcond (xops);
5976 1 : gcc_assert (ok);
5977 1 : if (target != operands[0])
5978 0 : emit_move_insn (operands[0],
5979 0 : gen_lowpart (GET_MODE (operands[0]), target));
5980 : }
5981 : }
5982 :
5983 : /* Extend SRC into next wider integer vector type. UNSIGNED_P is
5984 : true if we should do zero extension, else sign extension. */
5985 :
5986 : void
5987 354 : ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
5988 : {
5989 354 : machine_mode imode = GET_MODE (src);
5990 354 : rtx ops[3];
5991 :
5992 354 : switch (imode)
5993 : {
5994 354 : case E_V8QImode:
5995 354 : case E_V4QImode:
5996 354 : case E_V2QImode:
5997 354 : case E_V4HImode:
5998 354 : case E_V2HImode:
5999 354 : case E_V2SImode:
6000 354 : break;
6001 0 : default:
6002 0 : gcc_unreachable ();
6003 : }
6004 :
6005 354 : ops[0] = dest;
6006 :
6007 354 : ops[1] = force_reg (imode, src);
6008 :
6009 354 : if (unsigned_p)
6010 97 : ops[2] = force_reg (imode, CONST0_RTX (imode));
6011 : else
6012 257 : ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
6013 : ops[1], pc_rtx, pc_rtx);
6014 :
6015 354 : ix86_split_mmx_punpck (ops, false);
6016 354 : }
6017 :
6018 : /* Unpack SRC into the next wider integer vector type. UNSIGNED_P is
6019 : true if we should do zero extension, else sign extension. HIGH_P is
6020 : true if we want the N/2 high elements, else the low elements. */
6021 :
6022 : void
6023 18736 : ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
6024 : {
6025 18736 : machine_mode imode = GET_MODE (src);
6026 18736 : rtx tmp;
6027 :
6028 18736 : if (TARGET_SSE4_1)
6029 : {
6030 6466 : rtx (*unpack)(rtx, rtx);
6031 6466 : rtx (*extract)(rtx, rtx) = NULL;
6032 6466 : machine_mode halfmode = BLKmode;
6033 :
6034 6466 : switch (imode)
6035 : {
6036 198 : case E_V64QImode:
6037 198 : if (unsigned_p)
6038 : unpack = gen_avx512bw_zero_extendv32qiv32hi2;
6039 : else
6040 64 : unpack = gen_avx512bw_sign_extendv32qiv32hi2;
6041 198 : halfmode = V32QImode;
6042 198 : extract
6043 198 : = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
6044 : break;
6045 711 : case E_V32QImode:
6046 711 : if (unsigned_p)
6047 : unpack = gen_avx2_zero_extendv16qiv16hi2;
6048 : else
6049 142 : unpack = gen_avx2_sign_extendv16qiv16hi2;
6050 711 : halfmode = V16QImode;
6051 711 : extract
6052 711 : = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
6053 : break;
6054 104 : case E_V32HImode:
6055 104 : if (unsigned_p)
6056 : unpack = gen_avx512f_zero_extendv16hiv16si2;
6057 : else
6058 64 : unpack = gen_avx512f_sign_extendv16hiv16si2;
6059 104 : halfmode = V16HImode;
6060 104 : extract
6061 104 : = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
6062 : break;
6063 429 : case E_V16HImode:
6064 429 : if (unsigned_p)
6065 : unpack = gen_avx2_zero_extendv8hiv8si2;
6066 : else
6067 314 : unpack = gen_avx2_sign_extendv8hiv8si2;
6068 429 : halfmode = V8HImode;
6069 429 : extract
6070 429 : = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
6071 : break;
6072 104 : case E_V16SImode:
6073 104 : if (unsigned_p)
6074 : unpack = gen_avx512f_zero_extendv8siv8di2;
6075 : else
6076 86 : unpack = gen_avx512f_sign_extendv8siv8di2;
6077 104 : halfmode = V8SImode;
6078 104 : extract
6079 104 : = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
6080 : break;
6081 382 : case E_V8SImode:
6082 382 : if (unsigned_p)
6083 : unpack = gen_avx2_zero_extendv4siv4di2;
6084 : else
6085 320 : unpack = gen_avx2_sign_extendv4siv4di2;
6086 382 : halfmode = V4SImode;
6087 382 : extract
6088 382 : = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
6089 : break;
6090 2597 : case E_V16QImode:
6091 2597 : if (unsigned_p)
6092 : unpack = gen_sse4_1_zero_extendv8qiv8hi2;
6093 : else
6094 270 : unpack = gen_sse4_1_sign_extendv8qiv8hi2;
6095 : break;
6096 993 : case E_V8HImode:
6097 993 : if (unsigned_p)
6098 : unpack = gen_sse4_1_zero_extendv4hiv4si2;
6099 : else
6100 776 : unpack = gen_sse4_1_sign_extendv4hiv4si2;
6101 : break;
6102 544 : case E_V4SImode:
6103 544 : if (unsigned_p)
6104 : unpack = gen_sse4_1_zero_extendv2siv2di2;
6105 : else
6106 484 : unpack = gen_sse4_1_sign_extendv2siv2di2;
6107 : break;
6108 119 : case E_V8QImode:
6109 119 : if (unsigned_p)
6110 : unpack = gen_sse4_1_zero_extendv4qiv4hi2;
6111 : else
6112 78 : unpack = gen_sse4_1_sign_extendv4qiv4hi2;
6113 : break;
6114 279 : case E_V4HImode:
6115 279 : if (unsigned_p)
6116 : unpack = gen_sse4_1_zero_extendv2hiv2si2;
6117 : else
6118 220 : unpack = gen_sse4_1_sign_extendv2hiv2si2;
6119 : break;
6120 6 : case E_V4QImode:
6121 6 : if (unsigned_p)
6122 : unpack = gen_sse4_1_zero_extendv2qiv2hi2;
6123 : else
6124 0 : unpack = gen_sse4_1_sign_extendv2qiv2hi2;
6125 : break;
6126 0 : default:
6127 0 : gcc_unreachable ();
6128 : }
6129 :
6130 12932 : if (GET_MODE_SIZE (imode) >= 32)
6131 : {
6132 1928 : tmp = gen_reg_rtx (halfmode);
6133 1928 : emit_insn (extract (tmp, src));
6134 : }
6135 4538 : else if (high_p)
6136 : {
6137 2352 : switch (GET_MODE_SIZE (imode))
6138 : {
6139 971 : case 16:
6140 : /* Shift higher 8 bytes to lower 8 bytes. */
6141 971 : tmp = gen_reg_rtx (V1TImode);
6142 971 : emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
6143 : GEN_INT (64)));
6144 971 : break;
6145 202 : case 8:
6146 : /* Shift higher 4 bytes to lower 4 bytes. */
6147 202 : tmp = gen_reg_rtx (V1DImode);
6148 202 : emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
6149 : GEN_INT (32)));
6150 202 : break;
6151 3 : case 4:
6152 : /* Shift higher 2 bytes to lower 2 bytes. */
6153 3 : tmp = gen_reg_rtx (V1SImode);
6154 3 : emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
6155 : GEN_INT (16)));
6156 3 : break;
6157 0 : default:
6158 0 : gcc_unreachable ();
6159 : }
6160 :
6161 1176 : tmp = gen_lowpart (imode, tmp);
6162 : }
6163 : else
6164 : tmp = src;
6165 :
6166 6466 : emit_insn (unpack (dest, tmp));
6167 : }
6168 : else
6169 : {
6170 12270 : rtx (*unpack)(rtx, rtx, rtx);
6171 :
6172 12270 : switch (imode)
6173 : {
6174 3368 : case E_V16QImode:
6175 3368 : if (high_p)
6176 : unpack = gen_vec_interleave_highv16qi;
6177 : else
6178 1687 : unpack = gen_vec_interleave_lowv16qi;
6179 : break;
6180 5142 : case E_V8HImode:
6181 5142 : if (high_p)
6182 : unpack = gen_vec_interleave_highv8hi;
6183 : else
6184 2571 : unpack = gen_vec_interleave_lowv8hi;
6185 : break;
6186 2362 : case E_V4SImode:
6187 2362 : if (high_p)
6188 : unpack = gen_vec_interleave_highv4si;
6189 : else
6190 1181 : unpack = gen_vec_interleave_lowv4si;
6191 : break;
6192 478 : case E_V8QImode:
6193 478 : if (high_p)
6194 : unpack = gen_mmx_punpckhbw;
6195 : else
6196 239 : unpack = gen_mmx_punpcklbw;
6197 : break;
6198 906 : case E_V4HImode:
6199 906 : if (high_p)
6200 : unpack = gen_mmx_punpckhwd;
6201 : else
6202 453 : unpack = gen_mmx_punpcklwd;
6203 : break;
6204 14 : case E_V4QImode:
6205 14 : if (high_p)
6206 : unpack = gen_mmx_punpckhbw_low;
6207 : else
6208 7 : unpack = gen_mmx_punpcklbw_low;
6209 : break;
6210 0 : default:
6211 0 : gcc_unreachable ();
6212 : }
6213 :
6214 12270 : if (unsigned_p)
6215 4882 : tmp = force_reg (imode, CONST0_RTX (imode));
6216 : else
6217 7388 : tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
6218 : src, pc_rtx, pc_rtx);
6219 :
6220 12270 : rtx tmp2 = gen_reg_rtx (imode);
6221 12270 : emit_insn (unpack (tmp2, src, tmp));
6222 12270 : emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
6223 : }
6224 18736 : }
6225 :
6226 : /* Return true if mem is pool constant which contains a const_vector
6227 : perm index, assign the index to PERM. */
6228 : bool
6229 35 : ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
6230 : {
6231 35 : machine_mode mode = GET_MODE (mem);
6232 35 : int nelt = GET_MODE_NUNITS (mode);
6233 :
6234 35 : if (!INTEGRAL_MODE_P (mode))
6235 : return false;
6236 :
6237 : /* Needs to be constant pool. */
6238 35 : if (!(MEM_P (mem))
6239 35 : || !SYMBOL_REF_P (XEXP (mem, 0))
6240 70 : || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
6241 : return false;
6242 :
6243 35 : rtx constant = get_pool_constant (XEXP (mem, 0));
6244 :
6245 35 : if (!CONST_VECTOR_P (constant))
6246 : return false;
6247 :
6248 : /* There could be some rtx like
6249 : (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
6250 : but with "*.LC1" refer to V2DI constant vector. */
6251 35 : if (GET_MODE (constant) != mode)
6252 : {
6253 0 : constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
6254 :
6255 0 : if (constant == nullptr || !CONST_VECTOR_P (constant))
6256 : return false;
6257 : }
6258 :
6259 771 : for (int i = 0; i != nelt; i++)
6260 736 : perm[i] = UINTVAL (XVECEXP (constant, 0, i));
6261 :
6262 : return true;
6263 : }
6264 :
6265 : /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
6266 : but works for floating pointer parameters and nonoffsetable memories.
6267 : For pushes, it returns just stack offsets; the values will be saved
6268 : in the right order. Maximally three parts are generated. */
6269 :
6270 : static int
6271 4129276 : ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
6272 : {
6273 4129276 : int size;
6274 :
6275 4129276 : if (!TARGET_64BIT)
6276 1561710 : size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
6277 : else
6278 6695544 : size = (GET_MODE_SIZE (mode) + 4) / 8;
6279 :
6280 4129276 : gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
6281 4129276 : gcc_assert (size >= 2 && size <= 4);
6282 :
6283 : /* Optimize constant pool reference to immediates. This is used by fp
6284 : moves, that force all constants to memory to allow combining. */
6285 4129276 : if (MEM_P (operand) && MEM_READONLY_P (operand))
6286 38080 : operand = avoid_constant_pool_reference (operand);
6287 :
6288 4129276 : if (MEM_P (operand) && !offsettable_memref_p (operand))
6289 : {
6290 : /* The only non-offsetable memories we handle are pushes. */
6291 183996 : int ok = push_operand (operand, VOIDmode);
6292 :
6293 183996 : gcc_assert (ok);
6294 :
6295 183996 : operand = copy_rtx (operand);
6296 183996 : PUT_MODE (operand, word_mode);
6297 183996 : parts[0] = parts[1] = parts[2] = parts[3] = operand;
6298 183996 : return size;
6299 : }
6300 :
6301 3945280 : if (CONST_VECTOR_P (operand))
6302 : {
6303 41950 : scalar_int_mode imode = int_mode_for_mode (mode).require ();
6304 : /* Caution: if we looked through a constant pool memory above,
6305 : the operand may actually have a different mode now. That's
6306 : ok, since we want to pun this all the way back to an integer. */
6307 41950 : operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
6308 41950 : gcc_assert (operand != NULL);
6309 41950 : mode = imode;
6310 : }
6311 :
6312 3945280 : if (!TARGET_64BIT)
6313 : {
6314 622928 : if (mode == DImode)
6315 493889 : split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
6316 : else
6317 : {
6318 129039 : int i;
6319 :
6320 129039 : if (REG_P (operand))
6321 : {
6322 66914 : gcc_assert (reload_completed);
6323 200742 : for (i = 0; i < size; i++)
6324 133828 : parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
6325 : }
6326 62125 : else if (offsettable_memref_p (operand))
6327 : {
6328 60777 : operand = adjust_address (operand, SImode, 0);
6329 60777 : parts[0] = operand;
6330 122074 : for (i = 1; i < size; i++)
6331 61297 : parts[i] = adjust_address (operand, SImode, 4 * i);
6332 : }
6333 1348 : else if (CONST_DOUBLE_P (operand))
6334 : {
6335 1348 : const REAL_VALUE_TYPE *r;
6336 1348 : long l[4];
6337 :
6338 1348 : r = CONST_DOUBLE_REAL_VALUE (operand);
6339 1348 : switch (mode)
6340 : {
6341 0 : case E_TFmode:
6342 0 : real_to_target (l, r, mode);
6343 0 : parts[3] = gen_int_mode (l[3], SImode);
6344 0 : parts[2] = gen_int_mode (l[2], SImode);
6345 0 : break;
6346 198 : case E_XFmode:
6347 : /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
6348 : long double may not be 80-bit. */
6349 198 : real_to_target (l, r, mode);
6350 198 : parts[2] = gen_int_mode (l[2], SImode);
6351 198 : break;
6352 1150 : case E_DFmode:
6353 1150 : REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
6354 1150 : break;
6355 0 : default:
6356 0 : gcc_unreachable ();
6357 : }
6358 1348 : parts[1] = gen_int_mode (l[1], SImode);
6359 1348 : parts[0] = gen_int_mode (l[0], SImode);
6360 : }
6361 : else
6362 0 : gcc_unreachable ();
6363 : }
6364 : }
6365 : else
6366 : {
6367 3322352 : if (mode == TImode)
6368 3302249 : split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
6369 3322352 : if (mode == XFmode || mode == TFmode)
6370 : {
6371 20103 : machine_mode upper_mode = mode==XFmode ? SImode : DImode;
6372 20103 : if (REG_P (operand))
6373 : {
6374 1491 : gcc_assert (reload_completed);
6375 1491 : parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
6376 1491 : parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
6377 : }
6378 18612 : else if (offsettable_memref_p (operand))
6379 : {
6380 14492 : operand = adjust_address (operand, DImode, 0);
6381 14492 : parts[0] = operand;
6382 14492 : parts[1] = adjust_address (operand, upper_mode, 8);
6383 : }
6384 4120 : else if (CONST_DOUBLE_P (operand))
6385 : {
6386 4120 : long l[4];
6387 :
6388 4120 : real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);
6389 :
6390 : /* real_to_target puts 32-bit pieces in each long. */
6391 8240 : parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
6392 4120 : | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
6393 4120 : << 32), DImode);
6394 :
6395 4120 : if (upper_mode == SImode)
6396 2940 : parts[1] = gen_int_mode (l[2], SImode);
6397 : else
6398 1180 : parts[1]
6399 1180 : = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
6400 1180 : | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
6401 1180 : << 32), DImode);
6402 : }
6403 : else
6404 0 : gcc_unreachable ();
6405 : }
6406 : }
6407 :
6408 : return size;
6409 : }
6410 :
6411 : /* Emit insns to perform a move or push of DI, DF, XF, and TF values.
6412 : The function returns nothing; every insn needed for the move is
6413 : emitted here. operands[0] is the destination and operands[1] the
6414 : source. For a non-push move, operands[2 + i] and operands[6 + i]
   are overwritten with the destination and source parts in the order
   in which they are moved. */
6415 :
6416 : void
6417 2077552 : ix86_split_long_move (rtx operands[])
6418 : {
/* part[0][i] holds the pieces of the destination (operands[0]) and
   part[1][i] the pieces of the source (operands[1]). */
6419 2077552 : rtx part[2][4];
6420 2077552 : int nparts, i, j;
6421 2077552 : int push = 0;
6422 2077552 : int collisions = 0;
6423 2077552 : machine_mode mode = GET_MODE (operands[0]);
6424 2077552 : bool collisionparts[4];
6425 :
6426 : /* The DFmode expanders may ask us to move double.
6427 : For 64bit target this is single move. By hiding the fact
6428 : here we simplify i386.md splitters. */
6429 3764352 : if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
6430 : {
6431 : /* Optimize constant pool reference to immediates. This is used by
6432 : fp moves, that force all constants to memory to allow combining. */
6433 :
6434 12914 : if (MEM_P (operands[1])
6435 12499 : && SYMBOL_REF_P (XEXP (operands[1], 0))
6436 13520 : && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
6437 117 : operands[1] = get_pool_constant (XEXP (operands[1], 0));
/* A push destination is retyped to word_mode on a copy; any other
   destination is viewed as DImode. */
6438 12914 : if (push_operand (operands[0], VOIDmode))
6439 : {
6440 12914 : operands[0] = copy_rtx (operands[0]);
6441 12914 : PUT_MODE (operands[0], word_mode);
6442 : }
6443 : else
6444 0 : operands[0] = gen_lowpart (DImode, operands[0]);
6445 12914 : operands[1] = gen_lowpart (DImode, operands[1]);
6446 12914 : emit_move_insn (operands[0], operands[1]);
6447 12914 : return;
6448 : }
6449 :
6450 : /* The only non-offsettable memory we handle is push. */
6451 2064638 : if (push_operand (operands[0], VOIDmode))
6452 : push = 1;
6453 : else
6454 1880642 : gcc_assert (!MEM_P (operands[0])
6455 : || offsettable_memref_p (operands[0]));
6456 :
/* Both operands are split using the mode of operands[0]; only the part
   count returned for the source is kept. */
6457 2064638 : nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
6458 2064638 : ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));
6459 :
6460 : /* When emitting push, take care for source operands on the stack. */
6461 183996 : if (push && MEM_P (operands[1])
6462 2161802 : && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
6463 : {
/* Every source part is re-addressed through the address of the last
   source part. */
6464 56279 : rtx src_base = XEXP (part[1][nparts - 1], 0);
6465 :
6466 : /* Compensate for the stack decrement by 4. */
6467 56279 : if (!TARGET_64BIT && nparts == 3
6468 51619 : && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
6469 0 : src_base = plus_constant (Pmode, src_base, 4);
6470 :
6471 : /* src_base refers to the stack pointer and is
6472 : automatically decreased by emitted push. */
6473 169116 : for (i = 0; i < nparts; i++)
6474 112837 : part[1][i] = change_address (part[1][i],
6475 112837 : GET_MODE (part[1][i]), src_base);
6476 : }
6477 :
6478 : /* We need to do copy in the right order in case an address register
6479 : of the source overlaps the destination. */
6480 2064638 : if (REG_P (part[0][0]) && MEM_P (part[1][0]))
6481 : {
6482 : rtx tmp;
6483 :
/* Record which destination parts are mentioned in the source address
   and count them. */
6484 2360874 : for (i = 0; i < nparts; i++)
6485 : {
6486 1573916 : collisionparts[i]
6487 1573916 : = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
6488 1573916 : if (collisionparts[i])
6489 16896 : collisions++;
6490 : }
6491 :
6492 : /* Collision in the middle part can be handled by reordering. */
6493 786958 : if (collisions == 1 && nparts == 3 && collisionparts [1])
6494 : {
6495 0 : std::swap (part[0][1], part[0][2]);
6496 0 : std::swap (part[1][1], part[1][2]);
6497 : }
6498 786958 : else if (collisions == 1
6499 786958 : && nparts == 4
6500 0 : && (collisionparts [1] || collisionparts [2]))
6501 : {
6502 0 : if (collisionparts [1])
6503 : {
6504 0 : std::swap (part[0][1], part[0][2]);
6505 0 : std::swap (part[1][1], part[1][2]);
6506 : }
6507 : else
6508 : {
6509 0 : std::swap (part[0][2], part[0][3]);
6510 0 : std::swap (part[1][2], part[1][3]);
6511 : }
6512 : }
6513 :
6514 : /* If there are more collisions, we can't handle it by reordering.
6515 : Do an lea to the last part and use only one colliding move. */
6516 786958 : else if (collisions > 1)
6517 : {
6518 84 : rtx base, addr;
6519 :
6520 84 : collisions = 1;
6521 :
6522 84 : base = part[0][nparts - 1];
6523 :
6524 : /* Handle the case when the last part isn't valid for lea.
6525 : Happens in 64-bit mode storing the 12-byte XFmode. */
6526 126 : if (GET_MODE (base) != Pmode)
6527 0 : base = gen_rtx_REG (Pmode, REGNO (base));
6528 :
6529 84 : addr = XEXP (part[1][0], 0);
6530 84 : if (TARGET_TLS_DIRECT_SEG_REFS)
6531 : {
6532 84 : struct ix86_address parts;
6533 84 : int ok = ix86_decompose_address (addr, &parts);
6534 84 : gcc_assert (ok);
6535 : /* It is not valid to use %gs: or %fs: in lea. */
6536 84 : gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
6537 : }
/* Load the source address into BASE, then address source part I as
   BASE + UNITS_PER_WORD * I. */
6538 84 : emit_insn (gen_rtx_SET (base, addr));
6539 84 : part[1][0] = replace_equiv_address (part[1][0], base);
6540 168 : for (i = 1; i < nparts; i++)
6541 : {
6542 168 : tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
6543 84 : part[1][i] = replace_equiv_address (part[1][i], tmp);
6544 : }
6545 : }
6546 : }
6547 :
/* Pushes are emitted from the highest-numbered part down to part 0. */
6548 2064638 : if (push)
6549 : {
6550 183996 : if (!TARGET_64BIT)
6551 : {
6552 158576 : if (nparts == 3)
6553 : {
6554 580 : if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
6555 0 : emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
6556 580 : emit_move_insn (part[0][2], part[1][2]);
6557 : }
6558 157996 : else if (nparts == 4)
6559 : {
6560 0 : emit_move_insn (part[0][3], part[1][3]);
6561 0 : emit_move_insn (part[0][2], part[1][2]);
6562 : }
6563 : }
6564 : else
6565 : {
6566 : /* In 64bit mode we don't have 32bit push available. In case this is
6567 : register, it is OK - we will just use larger counterpart. We also
6568 : retype memory - these comes from attempt to avoid REX prefix on
6569 : moving of second half of TFmode value. */
6570 25420 : if (GET_MODE (part[1][1]) == SImode)
6571 : {
6572 11245 : switch (GET_CODE (part[1][1]))
6573 : {
6574 10811 : case MEM:
6575 10811 : part[1][1] = adjust_address (part[1][1], DImode, 0);
6576 10811 : break;
6577 :
6578 434 : case REG:
6579 434 : part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
6580 434 : break;
6581 :
6582 0 : default:
6583 0 : gcc_unreachable ();
6584 : }
6585 :
6586 11245 : if (GET_MODE (part[1][0]) == SImode)
6587 0 : part[1][0] = part[1][1];
6588 : }
6589 : }
6590 183996 : emit_move_insn (part[0][1], part[1][1]);
6591 183996 : emit_move_insn (part[0][0], part[1][0]);
6592 183996 : return;
6593 : }
6594 :
6595 : /* Choose correct order to not overwrite the source before it is copied. */
6596 1880642 : if ((REG_P (part[0][0])
6597 1024715 : && REG_P (part[1][1])
6598 79878 : && (REGNO (part[0][0]) == REGNO (part[1][1])
6599 64716 : || (nparts == 3
6600 0 : && REGNO (part[0][0]) == REGNO (part[1][2]))
6601 64716 : || (nparts == 4
6602 0 : && REGNO (part[0][0]) == REGNO (part[1][3]))))
6603 2890195 : || (collisions > 0
6604 16812 : && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
6605 : {
/* Queue the parts from the last one down to the first. */
6606 95106 : for (i = 0, j = nparts - 1; i < nparts; i++, j--)
6607 : {
6608 63404 : operands[2 + i] = part[0][j];
6609 63404 : operands[6 + i] = part[1][j];
6610 : }
6611 : }
6612 : else
6613 : {
/* Queue the parts from the first one up to the last. */
6614 5546889 : for (i = 0; i < nparts; i++)
6615 : {
6616 3697949 : operands[2 + i] = part[0][i];
6617 3697949 : operands[6 + i] = part[1][i];
6618 : }
6619 : }
6620 :
6621 : /* Attempt to locally unCSE nonzero constants. */
/* When a queued move loads a nonzero CONST_INT into a register, later
   queued sources with the same value are replaced by that register. */
6622 3761353 : for (j = 0; j < nparts - 1; j++)
6623 1880711 : if (CONST_INT_P (operands[6 + j])
6624 224780 : && operands[6 + j] != const0_rtx
6625 62905 : && REG_P (operands[2 + j]))
6626 111846 : for (i = j; i < nparts - 1; i++)
6627 55923 : if (CONST_INT_P (operands[7 + i])
6628 55923 : && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
6629 22579 : operands[7 + i] = operands[2 + j];
6630 :
/* Emit the part moves in the order chosen above. */
6631 5641995 : for (i = 0; i < nparts; i++)
6632 3761353 : emit_move_insn (operands[2 + i], operands[6 + i]);
6633 :
6634 : return;
6635 : }
6636 :
6637 : /* Helper function of ix86_split_ashl used to generate an SImode/DImode
6638 : left shift by a constant, either using a single shift or
6639 : a sequence of add instructions. */
6640 :
6641 : static void
6642 4304 : ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
6643 : {
6644 4304 : if (count == 1
6645 4304 : || (count * ix86_cost->add <= ix86_cost->shift_const
6646 0 : && !optimize_insn_for_size_p ()))
6647 : {
6648 16 : while (count-- > 0)
6649 8 : emit_insn (gen_add2_insn (operand, operand));
6650 : }
6651 : else
6652 : {
6653 4296 : rtx (*insn)(rtx, rtx, rtx);
6654 :
6655 4296 : insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6656 4296 : emit_insn (insn (operand, operand, GEN_INT (count)));
6657 : }
6658 4304 : }
6659 :
6660 : void
6661 10178 : ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
6662 : {
6663 10178 : rtx (*gen_ashl3)(rtx, rtx, rtx);
6664 10178 : rtx (*gen_shld)(rtx, rtx, rtx);
6665 10178 : int half_width = GET_MODE_BITSIZE (mode) >> 1;
6666 10178 : machine_mode half_mode;
6667 :
6668 10178 : rtx low[2], high[2];
6669 10178 : int count;
6670 :
6671 10178 : if (CONST_INT_P (operands[2]))
6672 : {
6673 8471 : split_double_mode (mode, operands, 2, low, high);
6674 8471 : count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6675 :
6676 8471 : if (count >= half_width)
6677 : {
6678 2464 : emit_move_insn (high[0], low[1]);
6679 2464 : ix86_expand_clear (low[0]);
6680 :
6681 2464 : if (count > half_width)
6682 141 : ix86_expand_ashl_const (high[0], count - half_width, mode);
6683 : }
6684 6007 : else if (count == 1)
6685 : {
6686 1844 : if (!rtx_equal_p (operands[0], operands[1]))
6687 0 : emit_move_insn (operands[0], operands[1]);
6688 1844 : rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
6689 1844 : rtx x4 = gen_rtx_LTU (mode, x3, const0_rtx);
6690 1844 : half_mode = mode == DImode ? SImode : DImode;
6691 1844 : emit_insn (gen_add3_cc_overflow_1 (half_mode, low[0],
6692 : low[0], low[0]));
6693 1844 : emit_insn (gen_add3_carry (half_mode, high[0], high[0], high[0],
6694 : x3, x4));
6695 : }
6696 : else
6697 : {
6698 4163 : gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6699 :
6700 4163 : if (!rtx_equal_p (operands[0], operands[1]))
6701 0 : emit_move_insn (operands[0], operands[1]);
6702 :
6703 4163 : emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
6704 4163 : ix86_expand_ashl_const (low[0], count, mode);
6705 : }
6706 8741 : return;
6707 : }
6708 :
6709 1707 : split_double_mode (mode, operands, 1, low, high);
6710 1707 : half_mode = mode == DImode ? SImode : DImode;
6711 :
6712 1707 : gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6713 :
6714 1707 : if (operands[1] == const1_rtx)
6715 : {
6716 : /* Assuming we've chosen a QImode capable registers, then 1 << N
6717 : can be done with two 32/64-bit shifts, no branches, no cmoves. */
6718 270 : if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
6719 : {
6720 162 : rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);
6721 :
6722 162 : ix86_expand_clear (low[0]);
6723 162 : ix86_expand_clear (high[0]);
6724 162 : emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));
6725 :
6726 162 : d = gen_lowpart (QImode, low[0]);
6727 162 : d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6728 162 : s = gen_rtx_EQ (QImode, flags, const0_rtx);
6729 162 : emit_insn (gen_rtx_SET (d, s));
6730 :
6731 162 : d = gen_lowpart (QImode, high[0]);
6732 162 : d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
6733 162 : s = gen_rtx_NE (QImode, flags, const0_rtx);
6734 162 : emit_insn (gen_rtx_SET (d, s));
6735 : }
6736 :
6737 : /* Otherwise, we can get the same results by manually performing
6738 : a bit extract operation on bit 5/6, and then performing the two
6739 : shifts. The two methods of getting 0/1 into low/high are exactly
6740 : the same size. Avoiding the shift in the bit extract case helps
6741 : pentium4 a bit; no one else seems to care much either way. */
6742 : else
6743 : {
6744 108 : rtx (*gen_lshr3)(rtx, rtx, rtx);
6745 108 : rtx (*gen_and3)(rtx, rtx, rtx);
6746 108 : rtx (*gen_xor3)(rtx, rtx, rtx);
6747 108 : HOST_WIDE_INT bits;
6748 108 : rtx x;
6749 :
6750 108 : if (mode == DImode)
6751 : {
6752 : gen_lshr3 = gen_lshrsi3;
6753 : gen_and3 = gen_andsi3;
6754 : gen_xor3 = gen_xorsi3;
6755 : bits = 5;
6756 : }
6757 : else
6758 : {
6759 0 : gen_lshr3 = gen_lshrdi3;
6760 0 : gen_and3 = gen_anddi3;
6761 0 : gen_xor3 = gen_xordi3;
6762 0 : bits = 6;
6763 : }
6764 :
6765 108 : if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
6766 0 : x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
6767 : else
6768 108 : x = gen_lowpart (half_mode, operands[2]);
6769 108 : emit_insn (gen_rtx_SET (high[0], x));
6770 :
6771 108 : emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
6772 108 : emit_insn (gen_and3 (high[0], high[0], const1_rtx));
6773 108 : emit_move_insn (low[0], high[0]);
6774 108 : emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
6775 : }
6776 :
6777 270 : emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6778 270 : emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
6779 270 : return;
6780 : }
6781 :
6782 1437 : if (operands[1] == constm1_rtx)
6783 : {
6784 : /* For -1 << N, we can avoid the shld instruction, because we
6785 : know that we're shifting 0...31/63 ones into a -1. */
6786 117 : emit_move_insn (low[0], constm1_rtx);
6787 117 : if (optimize_insn_for_size_p ())
6788 6 : emit_move_insn (high[0], low[0]);
6789 : else
6790 111 : emit_move_insn (high[0], constm1_rtx);
6791 : }
6792 : else
6793 : {
6794 1320 : gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;
6795 :
6796 1320 : if (!rtx_equal_p (operands[0], operands[1]))
6797 0 : emit_move_insn (operands[0], operands[1]);
6798 :
6799 1320 : split_double_mode (mode, operands, 1, low, high);
6800 1320 : emit_insn (gen_shld (high[0], low[0], operands[2]));
6801 : }
6802 :
6803 1437 : emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
6804 :
6805 1437 : if (TARGET_CMOVE && scratch)
6806 : {
6807 966 : ix86_expand_clear (scratch);
6808 966 : emit_insn (gen_x86_shift_adj_1
6809 : (half_mode, high[0], low[0], operands[2], scratch));
6810 : }
6811 : else
6812 471 : emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
6813 : }
6814 :
6815 : void
6816 6038 : ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
6817 : {
6818 4798 : rtx (*gen_ashr3)(rtx, rtx, rtx)
6819 6038 : = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
6820 6038 : rtx (*gen_shrd)(rtx, rtx, rtx);
6821 6038 : int half_width = GET_MODE_BITSIZE (mode) >> 1;
6822 :
6823 6038 : rtx low[2], high[2];
6824 6038 : int count;
6825 :
6826 6038 : if (CONST_INT_P (operands[2]))
6827 : {
6828 5861 : split_double_mode (mode, operands, 2, low, high);
6829 5861 : count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6830 :
6831 11722 : if (count == GET_MODE_BITSIZE (mode) - 1)
6832 : {
6833 85 : emit_move_insn (high[0], high[1]);
6834 85 : emit_insn (gen_ashr3 (high[0], high[0],
6835 85 : GEN_INT (half_width - 1)));
6836 85 : emit_move_insn (low[0], high[0]);
6837 :
6838 : }
6839 5776 : else if (count >= half_width)
6840 : {
6841 1619 : emit_move_insn (low[0], high[1]);
6842 1619 : emit_move_insn (high[0], low[0]);
6843 1619 : emit_insn (gen_ashr3 (high[0], high[0],
6844 1619 : GEN_INT (half_width - 1)));
6845 :
6846 1619 : if (count > half_width)
6847 38 : emit_insn (gen_ashr3 (low[0], low[0],
6848 38 : GEN_INT (count - half_width)));
6849 : }
6850 4157 : else if (count == 1
6851 766 : && (TARGET_USE_RCR || optimize_size > 1))
6852 : {
6853 1 : if (!rtx_equal_p (operands[0], operands[1]))
6854 0 : emit_move_insn (operands[0], operands[1]);
6855 1 : if (mode == DImode)
6856 : {
6857 0 : emit_insn (gen_ashrsi3_carry (high[0], high[0]));
6858 0 : emit_insn (gen_rcrsi2 (low[0], low[0]));
6859 : }
6860 : else
6861 : {
6862 1 : emit_insn (gen_ashrdi3_carry (high[0], high[0]));
6863 1 : emit_insn (gen_rcrdi2 (low[0], low[0]));
6864 : }
6865 : }
6866 : else
6867 : {
6868 4156 : gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6869 :
6870 4156 : if (!rtx_equal_p (operands[0], operands[1]))
6871 0 : emit_move_insn (operands[0], operands[1]);
6872 :
6873 4156 : emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6874 4156 : emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
6875 : }
6876 : }
6877 : else
6878 : {
6879 177 : machine_mode half_mode;
6880 :
6881 177 : gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6882 :
6883 177 : if (!rtx_equal_p (operands[0], operands[1]))
6884 0 : emit_move_insn (operands[0], operands[1]);
6885 :
6886 177 : split_double_mode (mode, operands, 1, low, high);
6887 177 : half_mode = mode == DImode ? SImode : DImode;
6888 :
6889 177 : emit_insn (gen_shrd (low[0], high[0], operands[2]));
6890 177 : emit_insn (gen_ashr3 (high[0], high[0], operands[2]));
6891 :
6892 177 : if (TARGET_CMOVE && scratch)
6893 : {
6894 139 : emit_move_insn (scratch, high[0]);
6895 139 : emit_insn (gen_ashr3 (scratch, scratch,
6896 139 : GEN_INT (half_width - 1)));
6897 139 : emit_insn (gen_x86_shift_adj_1
6898 : (half_mode, low[0], high[0], operands[2], scratch));
6899 : }
6900 : else
6901 38 : emit_insn (gen_x86_shift_adj_3
6902 : (half_mode, low[0], high[0], operands[2]));
6903 : }
6904 6038 : }
6905 :
6906 : void
6907 13235 : ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
6908 : {
6909 5892 : rtx (*gen_lshr3)(rtx, rtx, rtx)
6910 13235 : = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
6911 13235 : rtx (*gen_shrd)(rtx, rtx, rtx);
6912 13235 : int half_width = GET_MODE_BITSIZE (mode) >> 1;
6913 :
6914 13235 : rtx low[2], high[2];
6915 13235 : int count;
6916 :
6917 13235 : if (CONST_INT_P (operands[2]))
6918 : {
6919 11833 : split_double_mode (mode, operands, 2, low, high);
6920 11833 : count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);
6921 :
6922 11833 : if (count >= half_width)
6923 : {
6924 8471 : emit_move_insn (low[0], high[1]);
6925 8471 : ix86_expand_clear (high[0]);
6926 :
6927 8471 : if (count > half_width)
6928 651 : emit_insn (gen_lshr3 (low[0], low[0],
6929 651 : GEN_INT (count - half_width)));
6930 : }
6931 3362 : else if (count == 1
6932 678 : && (TARGET_USE_RCR || optimize_size > 1))
6933 : {
6934 1 : if (!rtx_equal_p (operands[0], operands[1]))
6935 0 : emit_move_insn (operands[0], operands[1]);
6936 1 : if (mode == DImode)
6937 : {
6938 0 : emit_insn (gen_lshrsi3_carry (high[0], high[0]));
6939 0 : emit_insn (gen_rcrsi2 (low[0], low[0]));
6940 : }
6941 : else
6942 : {
6943 1 : emit_insn (gen_lshrdi3_carry (high[0], high[0]));
6944 1 : emit_insn (gen_rcrdi2 (low[0], low[0]));
6945 : }
6946 : }
6947 : else
6948 : {
6949 3361 : gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6950 :
6951 3361 : if (!rtx_equal_p (operands[0], operands[1]))
6952 0 : emit_move_insn (operands[0], operands[1]);
6953 :
6954 3361 : emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
6955 3361 : emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
6956 : }
6957 : }
6958 : else
6959 : {
6960 1402 : machine_mode half_mode;
6961 :
6962 1402 : gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;
6963 :
6964 1402 : if (!rtx_equal_p (operands[0], operands[1]))
6965 0 : emit_move_insn (operands[0], operands[1]);
6966 :
6967 1402 : split_double_mode (mode, operands, 1, low, high);
6968 1402 : half_mode = mode == DImode ? SImode : DImode;
6969 :
6970 1402 : emit_insn (gen_shrd (low[0], high[0], operands[2]));
6971 1402 : emit_insn (gen_lshr3 (high[0], high[0], operands[2]));
6972 :
6973 1402 : if (TARGET_CMOVE && scratch)
6974 : {
6975 1132 : ix86_expand_clear (scratch);
6976 1132 : emit_insn (gen_x86_shift_adj_1
6977 : (half_mode, low[0], high[0], operands[2], scratch));
6978 : }
6979 : else
6980 270 : emit_insn (gen_x86_shift_adj_2
6981 : (half_mode, low[0], high[0], operands[2]));
6982 : }
6983 13235 : }
6984 :
6985 : /* Helper function to split TImode ashl under NDD. */
6986 : void
6987 1 : ix86_split_ashl_ndd (rtx *operands, rtx scratch)
6988 : {
6989 1 : gcc_assert (TARGET_APX_NDD);
6990 1 : int half_width = GET_MODE_BITSIZE (TImode) >> 1;
6991 :
6992 1 : rtx low[2], high[2];
6993 1 : int count;
6994 :
6995 1 : split_double_mode (TImode, operands, 2, low, high);
6996 1 : if (CONST_INT_P (operands[2]))
6997 : {
6998 0 : count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);
6999 :
7000 0 : if (count >= half_width)
7001 : {
7002 0 : count = count - half_width;
7003 0 : if (count == 0)
7004 : {
7005 0 : if (!rtx_equal_p (high[0], low[1]))
7006 0 : emit_move_insn (high[0], low[1]);
7007 : }
7008 0 : else if (count == 1)
7009 0 : emit_insn (gen_adddi3 (high[0], low[1], low[1]));
7010 : else
7011 0 : emit_insn (gen_ashldi3 (high[0], low[1], GEN_INT (count)));
7012 :
7013 0 : ix86_expand_clear (low[0]);
7014 : }
7015 0 : else if (count == 1)
7016 : {
7017 0 : rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
7018 0 : rtx x4 = gen_rtx_LTU (TImode, x3, const0_rtx);
7019 0 : emit_insn (gen_add3_cc_overflow_1 (DImode, low[0],
7020 : low[1], low[1]));
7021 0 : emit_insn (gen_add3_carry (DImode, high[0], high[1], high[1],
7022 : x3, x4));
7023 : }
7024 : else
7025 : {
7026 0 : emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
7027 : GEN_INT (count)));
7028 0 : emit_insn (gen_ashldi3 (low[0], low[1], GEN_INT (count)));
7029 : }
7030 : }
7031 : else
7032 : {
7033 1 : emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
7034 : operands[2]));
7035 1 : emit_insn (gen_ashldi3 (low[0], low[1], operands[2]));
7036 1 : if (TARGET_CMOVE && scratch)
7037 : {
7038 1 : ix86_expand_clear (scratch);
7039 1 : emit_insn (gen_x86_shift_adj_1
7040 : (DImode, high[0], low[0], operands[2], scratch));
7041 : }
7042 : else
7043 0 : emit_insn (gen_x86_shift_adj_2 (DImode, high[0], low[0], operands[2]));
7044 : }
7045 1 : }
7046 :
7047 : /* Helper function to split TImode l/ashr under NDD. */
7048 : void
7049 2 : ix86_split_rshift_ndd (enum rtx_code code, rtx *operands, rtx scratch)
7050 : {
7051 2 : gcc_assert (TARGET_APX_NDD);
7052 2 : int half_width = GET_MODE_BITSIZE (TImode) >> 1;
7053 2 : bool ashr_p = code == ASHIFTRT;
7054 2 : rtx (*gen_shr)(rtx, rtx, rtx) = ashr_p ? gen_ashrdi3
7055 : : gen_lshrdi3;
7056 :
7057 2 : rtx low[2], high[2];
7058 2 : int count;
7059 :
7060 2 : split_double_mode (TImode, operands, 2, low, high);
7061 2 : if (CONST_INT_P (operands[2]))
7062 : {
7063 0 : count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);
7064 :
7065 0 : if (ashr_p && (count == GET_MODE_BITSIZE (TImode) - 1))
7066 : {
7067 0 : emit_insn (gen_shr (high[0], high[1],
7068 : GEN_INT (half_width - 1)));
7069 0 : emit_move_insn (low[0], high[0]);
7070 : }
7071 0 : else if (count >= half_width)
7072 : {
7073 0 : if (ashr_p)
7074 0 : emit_insn (gen_shr (high[0], high[1],
7075 : GEN_INT (half_width - 1)));
7076 : else
7077 0 : ix86_expand_clear (high[0]);
7078 :
7079 0 : if (count > half_width)
7080 0 : emit_insn (gen_shr (low[0], high[1],
7081 0 : GEN_INT (count - half_width)));
7082 : else
7083 0 : emit_move_insn (low[0], high[1]);
7084 : }
7085 : else
7086 : {
7087 0 : emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
7088 : GEN_INT (count)));
7089 0 : emit_insn (gen_shr (high[0], high[1], GEN_INT (count)));
7090 : }
7091 : }
7092 : else
7093 : {
7094 2 : emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
7095 : operands[2]));
7096 2 : emit_insn (gen_shr (high[0], high[1], operands[2]));
7097 :
7098 2 : if (TARGET_CMOVE && scratch)
7099 : {
7100 2 : if (ashr_p)
7101 : {
7102 1 : emit_move_insn (scratch, high[0]);
7103 1 : emit_insn (gen_shr (scratch, scratch,
7104 : GEN_INT (half_width - 1)));
7105 : }
7106 : else
7107 1 : ix86_expand_clear (scratch);
7108 :
7109 2 : emit_insn (gen_x86_shift_adj_1
7110 : (DImode, low[0], high[0], operands[2], scratch));
7111 : }
7112 0 : else if (ashr_p)
7113 0 : emit_insn (gen_x86_shift_adj_3
7114 : (DImode, low[0], high[0], operands[2]));
7115 : else
7116 0 : emit_insn (gen_x86_shift_adj_2
7117 : (DImode, low[0], high[0], operands[2]));
7118 : }
7119 2 : }
7120 :
7121 : /* Expand move of V1TI mode register X to a new TI mode register. */
7122 : static rtx
7123 17 : ix86_expand_v1ti_to_ti (rtx x)
7124 : {
7125 17 : rtx result = gen_reg_rtx (TImode);
7126 17 : if (TARGET_SSE2)
7127 : {
7128 17 : rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
7129 17 : rtx lo = gen_lowpart (DImode, result);
7130 17 : emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
7131 17 : rtx hi = gen_highpart (DImode, result);
7132 17 : emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
7133 : }
7134 : else
7135 0 : emit_move_insn (result, gen_lowpart (TImode, x));
7136 17 : return result;
7137 : }
7138 :
7139 : /* Expand move of TI mode register X to a new V1TI mode register. */
7140 : static rtx
7141 17 : ix86_expand_ti_to_v1ti (rtx x)
7142 : {
7143 17 : if (TARGET_SSE2)
7144 : {
7145 17 : rtx lo = gen_lowpart (DImode, x);
7146 17 : rtx hi = gen_highpart (DImode, x);
7147 17 : rtx tmp = gen_reg_rtx (V2DImode);
7148 17 : emit_insn (gen_vec_concatv2di (tmp, lo, hi));
7149 17 : return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
7150 : }
7151 :
7152 0 : return force_reg (V1TImode, gen_lowpart (V1TImode, x));
7153 : }
7154 :
7155 : /* Expand V1TI mode shift (of rtx_code CODE) by constant. */
7156 : void
7157 42 : ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
7158 : {
7159 42 : rtx op1 = force_reg (V1TImode, operands[1]);
7160 :
7161 42 : if (!CONST_INT_P (operands[2]))
7162 : {
7163 6 : rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
7164 6 : rtx tmp2 = gen_reg_rtx (TImode);
7165 3 : rtx (*shift) (rtx, rtx, rtx)
7166 6 : = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
7167 6 : emit_insn (shift (tmp2, tmp1, operands[2]));
7168 6 : rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
7169 6 : emit_move_insn (operands[0], tmp3);
7170 6 : return;
7171 : }
7172 :
7173 36 : HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
7174 :
7175 36 : if (bits == 0)
7176 : {
7177 0 : emit_move_insn (operands[0], op1);
7178 0 : return;
7179 : }
7180 :
7181 36 : if ((bits & 7) == 0)
7182 : {
7183 0 : rtx tmp = gen_reg_rtx (V1TImode);
7184 0 : if (code == ASHIFT)
7185 0 : emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
7186 : else
7187 0 : emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
7188 0 : emit_move_insn (operands[0], tmp);
7189 0 : return;
7190 : }
7191 :
7192 36 : rtx tmp1 = gen_reg_rtx (V1TImode);
7193 36 : if (code == ASHIFT)
7194 18 : emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
7195 : else
7196 18 : emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
7197 :
7198 : /* tmp2 is operands[1] shifted by 64, in V2DImode. */
7199 36 : rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
7200 :
7201 : /* tmp3 will be the V2DImode result. */
7202 36 : rtx tmp3 = gen_reg_rtx (V2DImode);
7203 :
7204 36 : if (bits > 64)
7205 : {
7206 18 : if (code == ASHIFT)
7207 9 : emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
7208 : else
7209 9 : emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
7210 : }
7211 : else
7212 : {
7213 : /* tmp4 is operands[1], in V2DImode. */
7214 18 : rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7215 :
7216 18 : rtx tmp5 = gen_reg_rtx (V2DImode);
7217 18 : if (code == ASHIFT)
7218 9 : emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
7219 : else
7220 9 : emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
7221 :
7222 18 : rtx tmp6 = gen_reg_rtx (V2DImode);
7223 18 : if (code == ASHIFT)
7224 9 : emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
7225 : else
7226 9 : emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
7227 :
7228 18 : emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
7229 : }
7230 :
7231 : /* Convert the result back to V1TImode and store in operands[0]. */
7232 36 : rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7233 36 : emit_move_insn (operands[0], tmp7);
7234 : }
7235 :
7236 : /* Expand V1TI mode rotate (of rtx_code CODE) by constant. */
7237 : void
7238 39 : ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
7239 : {
7240 39 : rtx op1 = force_reg (V1TImode, operands[1]);
7241 :
7242 39 : if (!CONST_INT_P (operands[2]))
7243 : {
7244 8 : rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
7245 8 : rtx tmp2 = gen_reg_rtx (TImode);
7246 4 : rtx (*rotate) (rtx, rtx, rtx)
7247 8 : = (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
7248 8 : emit_insn (rotate (tmp2, tmp1, operands[2]));
7249 8 : rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
7250 8 : emit_move_insn (operands[0], tmp3);
7251 8 : return;
7252 : }
7253 :
7254 31 : HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
7255 :
7256 31 : if (bits == 0)
7257 : {
7258 0 : emit_move_insn (operands[0], op1);
7259 0 : return;
7260 : }
7261 :
7262 31 : if (code == ROTATERT)
7263 16 : bits = 128 - bits;
7264 :
7265 31 : if ((bits & 31) == 0)
7266 : {
7267 5 : rtx tmp2 = gen_reg_rtx (V4SImode);
7268 5 : rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7269 5 : if (bits == 32)
7270 1 : emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
7271 4 : else if (bits == 64)
7272 2 : emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
7273 : else
7274 2 : emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
7275 5 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
7276 5 : return;
7277 : }
7278 :
7279 26 : if ((bits & 7) == 0)
7280 : {
7281 6 : rtx tmp1 = gen_reg_rtx (V1TImode);
7282 6 : rtx tmp2 = gen_reg_rtx (V1TImode);
7283 6 : rtx tmp3 = gen_reg_rtx (V1TImode);
7284 :
7285 6 : emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
7286 6 : emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
7287 6 : emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
7288 6 : emit_move_insn (operands[0], tmp3);
7289 6 : return;
7290 : }
7291 :
7292 20 : rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7293 :
7294 20 : rtx lobits;
7295 20 : rtx hibits;
7296 :
7297 20 : switch (bits >> 5)
7298 : {
7299 7 : case 0:
7300 7 : lobits = op1_v4si;
7301 7 : hibits = gen_reg_rtx (V4SImode);
7302 7 : emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
7303 7 : break;
7304 :
7305 2 : case 1:
7306 2 : lobits = gen_reg_rtx (V4SImode);
7307 2 : hibits = gen_reg_rtx (V4SImode);
7308 2 : emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
7309 2 : emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
7310 2 : break;
7311 :
7312 2 : case 2:
7313 2 : lobits = gen_reg_rtx (V4SImode);
7314 2 : hibits = gen_reg_rtx (V4SImode);
7315 2 : emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
7316 2 : emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
7317 2 : break;
7318 :
7319 9 : default:
7320 9 : lobits = gen_reg_rtx (V4SImode);
7321 9 : emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
7322 9 : hibits = op1_v4si;
7323 9 : break;
7324 : }
7325 :
7326 20 : rtx tmp1 = gen_reg_rtx (V4SImode);
7327 20 : rtx tmp2 = gen_reg_rtx (V4SImode);
7328 20 : rtx tmp3 = gen_reg_rtx (V4SImode);
7329 :
7330 20 : emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
7331 20 : emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
7332 20 : emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));
7333 :
7334 20 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
7335 : }
7336 :
7337 : /* Expand V1TI mode ashiftrt by constant. */
7338 : void
7339 109 : ix86_expand_v1ti_ashiftrt (rtx operands[])
7340 : {
7341 109 : rtx op1 = force_reg (V1TImode, operands[1]);
7342 :
7343 109 : if (!CONST_INT_P (operands[2]))
7344 : {
7345 3 : rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
7346 3 : rtx tmp2 = gen_reg_rtx (TImode);
7347 3 : emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
7348 3 : rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
7349 3 : emit_move_insn (operands[0], tmp3);
7350 3 : return;
7351 : }
7352 :
7353 106 : HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
7354 :
7355 106 : if (bits == 0)
7356 : {
7357 0 : emit_move_insn (operands[0], op1);
7358 0 : return;
7359 : }
7360 :
7361 106 : if (bits == 127)
7362 : {
7363 : /* Two operations. */
7364 3 : rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
7365 3 : rtx tmp2 = gen_reg_rtx (V4SImode);
7366 3 : emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7367 :
7368 3 : rtx tmp3 = gen_reg_rtx (V4SImode);
7369 3 : emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7370 :
7371 3 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
7372 3 : return;
7373 : }
7374 :
7375 103 : if (bits == 64)
7376 : {
7377 : /* Three operations. */
7378 3 : rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
7379 3 : rtx tmp2 = gen_reg_rtx (V4SImode);
7380 3 : emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7381 :
7382 3 : rtx tmp3 = gen_reg_rtx (V4SImode);
7383 3 : emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7384 :
7385 3 : rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
7386 3 : rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7387 3 : rtx tmp6 = gen_reg_rtx (V2DImode);
7388 3 : emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
7389 :
7390 3 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
7391 3 : return;
7392 : }
7393 :
7394 100 : if (bits == 96)
7395 : {
7396 : /* Three operations. */
7397 3 : rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
7398 3 : rtx tmp2 = gen_reg_rtx (V4SImode);
7399 3 : emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
7400 :
7401 3 : rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
7402 3 : rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
7403 3 : rtx tmp5 = gen_reg_rtx (V2DImode);
7404 3 : emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));
7405 :
7406 3 : rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
7407 3 : rtx tmp7 = gen_reg_rtx (V4SImode);
7408 3 : emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));
7409 :
7410 3 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
7411 3 : return;
7412 : }
7413 :
7414 97 : if (bits >= 111)
7415 : {
7416 : /* Three operations. */
7417 21 : rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7418 21 : rtx tmp2 = gen_reg_rtx (V4SImode);
7419 21 : emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
7420 :
7421 21 : rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
7422 21 : rtx tmp4 = gen_reg_rtx (V8HImode);
7423 21 : emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));
7424 :
7425 21 : rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
7426 21 : rtx tmp6 = gen_reg_rtx (V4SImode);
7427 21 : emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));
7428 :
7429 21 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
7430 21 : return;
7431 : }
7432 :
7433 76 : if (TARGET_AVX2 || TARGET_SSE4_1)
7434 : {
7435 : /* Three operations. */
7436 50 : if (bits == 32)
7437 : {
7438 2 : rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7439 2 : rtx tmp2 = gen_reg_rtx (V4SImode);
7440 2 : emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));
7441 :
7442 2 : rtx tmp3 = gen_reg_rtx (V1TImode);
7443 2 : emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));
7444 :
7445 2 : if (TARGET_AVX2)
7446 : {
7447 1 : rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
7448 1 : rtx tmp5 = gen_reg_rtx (V4SImode);
7449 1 : emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
7450 : GEN_INT (7)));
7451 :
7452 1 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
7453 : }
7454 : else
7455 : {
7456 1 : rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
7457 1 : rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
7458 1 : rtx tmp6 = gen_reg_rtx (V8HImode);
7459 1 : emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
7460 : GEN_INT (0x3f)));
7461 :
7462 1 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
7463 : }
7464 2 : return;
7465 : }
7466 :
7467 : /* Three operations. */
7468 48 : if (bits == 8 || bits == 16 || bits == 24)
7469 : {
7470 6 : rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7471 6 : rtx tmp2 = gen_reg_rtx (V4SImode);
7472 6 : emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
7473 :
7474 6 : rtx tmp3 = gen_reg_rtx (V1TImode);
7475 6 : emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));
7476 :
7477 6 : if (TARGET_AVX2)
7478 : {
7479 3 : rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
7480 3 : rtx tmp5 = gen_reg_rtx (V4SImode);
7481 3 : emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
7482 : GEN_INT (7)));
7483 :
7484 3 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
7485 : }
7486 : else
7487 : {
7488 3 : rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
7489 3 : rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
7490 3 : rtx tmp6 = gen_reg_rtx (V8HImode);
7491 3 : emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
7492 : GEN_INT (0x3f)));
7493 :
7494 3 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
7495 : }
7496 6 : return;
7497 : }
7498 : }
7499 :
7500 68 : if (bits > 96)
7501 : {
7502 : /* Four operations. */
7503 3 : rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7504 3 : rtx tmp2 = gen_reg_rtx (V4SImode);
7505 3 : emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));
7506 :
7507 3 : rtx tmp3 = gen_reg_rtx (V4SImode);
7508 3 : emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));
7509 :
7510 3 : rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
7511 3 : rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7512 3 : rtx tmp6 = gen_reg_rtx (V2DImode);
7513 3 : emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));
7514 :
7515 3 : rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
7516 3 : rtx tmp8 = gen_reg_rtx (V4SImode);
7517 3 : emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));
7518 :
7519 3 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
7520 3 : return;
7521 : }
7522 :
7523 65 : if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
7524 : {
7525 : /* Four operations. */
7526 4 : rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7527 4 : rtx tmp2 = gen_reg_rtx (V4SImode);
7528 4 : emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7529 :
7530 4 : rtx tmp3 = gen_reg_rtx (V4SImode);
7531 4 : emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7532 :
7533 4 : rtx tmp4 = gen_reg_rtx (V1TImode);
7534 4 : emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
7535 :
7536 4 : rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
7537 4 : rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
7538 4 : rtx tmp7 = gen_reg_rtx (V8HImode);
7539 6 : emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
7540 : GEN_INT (bits == 48 ? 0x1f : 0x07)));
7541 :
7542 4 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
7543 4 : return;
7544 : }
7545 :
7546 61 : if ((bits & 7) == 0)
7547 : {
7548 : /* Five operations. */
7549 9 : rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7550 9 : rtx tmp2 = gen_reg_rtx (V4SImode);
7551 9 : emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7552 :
7553 9 : rtx tmp3 = gen_reg_rtx (V4SImode);
7554 9 : emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7555 :
7556 9 : rtx tmp4 = gen_reg_rtx (V1TImode);
7557 9 : emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));
7558 :
7559 9 : rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7560 9 : rtx tmp6 = gen_reg_rtx (V1TImode);
7561 9 : emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));
7562 :
7563 9 : rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7564 9 : rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
7565 9 : rtx tmp9 = gen_reg_rtx (V2DImode);
7566 9 : emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));
7567 :
7568 9 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
7569 9 : return;
7570 : }
7571 :
7572 52 : if (TARGET_AVX2 && bits < 32)
7573 : {
7574 : /* Six operations. */
7575 9 : rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7576 9 : rtx tmp2 = gen_reg_rtx (V4SImode);
7577 9 : emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
7578 :
7579 9 : rtx tmp3 = gen_reg_rtx (V1TImode);
7580 9 : emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
7581 :
7582 9 : rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7583 9 : rtx tmp5 = gen_reg_rtx (V2DImode);
7584 9 : emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
7585 :
7586 9 : rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7587 9 : rtx tmp7 = gen_reg_rtx (V2DImode);
7588 9 : emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
7589 :
7590 9 : rtx tmp8 = gen_reg_rtx (V2DImode);
7591 9 : emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
7592 :
7593 9 : rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
7594 9 : rtx tmp10 = gen_reg_rtx (V4SImode);
7595 9 : emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));
7596 :
7597 9 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
7598 9 : return;
7599 : }
7600 :
7601 43 : if (TARGET_SSE4_1 && bits < 15)
7602 : {
7603 : /* Six operations. */
7604 4 : rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7605 4 : rtx tmp2 = gen_reg_rtx (V4SImode);
7606 4 : emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));
7607 :
7608 4 : rtx tmp3 = gen_reg_rtx (V1TImode);
7609 4 : emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));
7610 :
7611 4 : rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7612 4 : rtx tmp5 = gen_reg_rtx (V2DImode);
7613 4 : emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
7614 :
7615 4 : rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7616 4 : rtx tmp7 = gen_reg_rtx (V2DImode);
7617 4 : emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));
7618 :
7619 4 : rtx tmp8 = gen_reg_rtx (V2DImode);
7620 4 : emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));
7621 :
7622 4 : rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
7623 4 : rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
7624 4 : rtx tmp11 = gen_reg_rtx (V8HImode);
7625 4 : emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));
7626 :
7627 4 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
7628 4 : return;
7629 : }
7630 :
7631 18 : if (bits == 1)
7632 : {
7633 : /* Eight operations. */
7634 1 : rtx tmp1 = gen_reg_rtx (V1TImode);
7635 1 : emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
7636 :
7637 1 : rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7638 1 : rtx tmp3 = gen_reg_rtx (V2DImode);
7639 1 : emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));
7640 :
7641 1 : rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
7642 1 : rtx tmp5 = gen_reg_rtx (V2DImode);
7643 1 : emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));
7644 :
7645 1 : rtx tmp6 = gen_reg_rtx (V2DImode);
7646 1 : emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));
7647 :
7648 1 : rtx tmp7 = gen_reg_rtx (V2DImode);
7649 1 : emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));
7650 :
7651 1 : rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
7652 1 : rtx tmp9 = gen_reg_rtx (V4SImode);
7653 1 : emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));
7654 :
7655 1 : rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
7656 1 : rtx tmp11 = gen_reg_rtx (V2DImode);
7657 1 : emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));
7658 :
7659 1 : rtx tmp12 = gen_reg_rtx (V2DImode);
7660 1 : emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));
7661 :
7662 1 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
7663 1 : return;
7664 : }
7665 :
7666 38 : if (bits > 64)
7667 : {
7668 : /* Eight operations. */
7669 12 : rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7670 12 : rtx tmp2 = gen_reg_rtx (V4SImode);
7671 12 : emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7672 :
7673 12 : rtx tmp3 = gen_reg_rtx (V4SImode);
7674 12 : emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7675 :
7676 12 : rtx tmp4 = gen_reg_rtx (V1TImode);
7677 12 : emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7678 :
7679 12 : rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7680 12 : rtx tmp6 = gen_reg_rtx (V2DImode);
7681 12 : emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));
7682 :
7683 12 : rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7684 12 : rtx tmp8 = gen_reg_rtx (V1TImode);
7685 12 : emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));
7686 :
7687 12 : rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
7688 12 : rtx tmp10 = gen_reg_rtx (V2DImode);
7689 12 : emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));
7690 :
7691 12 : rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
7692 12 : rtx tmp12 = gen_reg_rtx (V2DImode);
7693 12 : emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));
7694 :
7695 12 : rtx tmp13 = gen_reg_rtx (V2DImode);
7696 12 : emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));
7697 :
7698 12 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
7699 : }
7700 : else
7701 : {
7702 : /* Nine operations. */
7703 26 : rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
7704 26 : rtx tmp2 = gen_reg_rtx (V4SImode);
7705 26 : emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));
7706 :
7707 26 : rtx tmp3 = gen_reg_rtx (V4SImode);
7708 26 : emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));
7709 :
7710 26 : rtx tmp4 = gen_reg_rtx (V1TImode);
7711 26 : emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));
7712 :
7713 26 : rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7714 26 : rtx tmp6 = gen_reg_rtx (V2DImode);
7715 26 : emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));
7716 :
7717 26 : rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
7718 26 : rtx tmp8 = gen_reg_rtx (V2DImode);
7719 26 : emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));
7720 :
7721 26 : rtx tmp9 = gen_reg_rtx (V2DImode);
7722 26 : emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));
7723 :
7724 26 : rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7725 26 : rtx tmp11 = gen_reg_rtx (V1TImode);
7726 26 : emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));
7727 :
7728 26 : rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
7729 26 : rtx tmp13 = gen_reg_rtx (V2DImode);
7730 26 : emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));
7731 :
7732 26 : rtx tmp14 = gen_reg_rtx (V2DImode);
7733 26 : emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));
7734 :
7735 26 : emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
7736 : }
7737 : }
7738 :
7739 : /* Expand V2DI mode ashiftrt.
          OPERANDS[0] receives the result, OPERANDS[1] is the V2DImode value
          being shifted and OPERANDS[2] is the shift count.  The sequence
          emitted depends on the count and on the target:
          - a count of const0_rtx becomes a plain move;
          - with TARGET_SSE4_2, a constant count >= 63 (when not optimizing
            the insn for size) becomes a single gen_sse4_2_gtv2di3 of zero
            against the source;
          - a constant count, when TARGET_XOP is unavailable or the count is
            >= 63, is built from V4SImode/V2DImode shifts whose results are
            merged by a constant permutation;
          - a variable count without TARGET_XOP is computed as
            (x >> count) | (sign_mask << (64 - count)) with logical shifts;
          - otherwise gen_xop_shav2di3 is used with the negated count
            replicated into both vector elements.  */
7740 : void
7741 404 : ix86_expand_v2di_ashiftrt (rtx operands[])
7742 : {
7743 404 : if (operands[2] == const0_rtx) /* Shift by zero is just a copy.  */
7744 : {
7745 0 : emit_move_insn (operands[0], operands[1]);
7746 0 : return;
7747 : }
7748 :
7749 404 : if (TARGET_SSE4_2
7750 133 : && CONST_INT_P (operands[2])
7751 133 : && UINTVAL (operands[2]) >= 63
7752 412 : && !optimize_insn_for_size_p ())
7753 : {
7754 8 : rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
7755 8 : emit_insn (gen_sse4_2_gtv2di3 (operands[0], zero, operands[1])); /* Result is the compare of zero against the source.  */
7756 8 : return;
7757 : }
7758 :
7759 396 : if (CONST_INT_P (operands[2])
7760 376 : && (!TARGET_XOP || UINTVAL (operands[2]) >= 63))
7761 : {
7762 280 : vec_perm_builder sel (4, 4, 1); /* Selector for the final V4SImode permutation.  */
7763 280 : sel.quick_grow (4);
7764 280 : rtx arg0, arg1;
7765 280 : rtx op1 = lowpart_subreg (V4SImode,
7766 : force_reg (V2DImode, operands[1]),
7767 : V2DImode); /* The source viewed as four SImode elements.  */
7768 280 : rtx target = gen_reg_rtx (V4SImode);
7769 280 : if (UINTVAL (operands[2]) >= 63)
7770 : {
7771 99 : arg0 = arg1 = gen_reg_rtx (V4SImode); /* Single permutation input.  */
7772 99 : emit_insn (gen_ashrv4si3 (arg0, op1, GEN_INT (31)));
7773 99 : sel[0] = 1; /* Elements 1 and 3 (the upper halves) are each used twice.  */
7774 99 : sel[1] = 1;
7775 99 : sel[2] = 3;
7776 99 : sel[3] = 3;
7777 : }
7778 181 : else if (INTVAL (operands[2]) > 32)
7779 : {
7780 18 : arg0 = gen_reg_rtx (V4SImode);
7781 18 : arg1 = gen_reg_rtx (V4SImode);
7782 18 : emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31))); /* ARG1: each element shifted by 31.  */
7783 18 : emit_insn (gen_ashrv4si3 (arg0, op1,
7784 18 : GEN_INT (INTVAL (operands[2]) - 32))); /* ARG0: each element shifted by COUNT - 32.  */
7785 18 : sel[0] = 1; /* Indices 4..7 presumably address ARG1, the second input.  */
7786 18 : sel[1] = 5;
7787 18 : sel[2] = 3;
7788 18 : sel[3] = 7;
7789 : }
7790 163 : else if (INTVAL (operands[2]) == 32)
7791 : {
7792 5 : arg0 = op1; /* The unshifted source is used directly.  */
7793 5 : arg1 = gen_reg_rtx (V4SImode);
7794 5 : emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
7795 5 : sel[0] = 1;
7796 5 : sel[1] = 5;
7797 5 : sel[2] = 3;
7798 5 : sel[3] = 7;
7799 : }
7800 : else
7801 : {
7802 158 : arg0 = gen_reg_rtx (V2DImode);
7803 158 : arg1 = gen_reg_rtx (V4SImode);
7804 158 : emit_insn (gen_lshrv2di3 (arg0, operands[1], operands[2])); /* ARG0: 64-bit logical shift by COUNT.  */
7805 158 : emit_insn (gen_ashrv4si3 (arg1, op1, operands[2])); /* ARG1: 32-bit arithmetic shift by COUNT.  */
7806 158 : arg0 = lowpart_subreg (V4SImode, arg0, V2DImode);
7807 158 : sel[0] = 0; /* Even elements from ARG0, odd elements via indices 5 and 7.  */
7808 158 : sel[1] = 5;
7809 158 : sel[2] = 2;
7810 158 : sel[3] = 7;
7811 : }
7812 379 : vec_perm_indices indices (sel, arg0 != arg1 ? 2 : 1, 4); /* One input when ARG0 and ARG1 are the same rtx, else two.  */
7813 280 : rtx op0 = operands[0];
7814 280 : bool ok = targetm.vectorize.vec_perm_const (V4SImode, V4SImode,
7815 : target, arg0, arg1,
7816 : indices);
7817 280 : gcc_assert (ok); /* The target hook must be able to expand this permutation.  */
7818 280 : emit_move_insn (op0, lowpart_subreg (V2DImode, target, V4SImode));
7819 280 : return;
7820 280 : }
7821 116 : if (!TARGET_XOP)
7822 : {
7823 20 : rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
7824 20 : rtx zero_or_all_ones;
7825 20 : if (TARGET_SSE4_2)
7826 : {
7827 0 : zero_or_all_ones = gen_reg_rtx (V2DImode);
7828 0 : emit_insn (gen_sse4_2_gtv2di3 (zero_or_all_ones, zero,
7829 : operands[1]));
7830 : }
7831 : else
7832 : {
7833 20 : rtx temp = gen_reg_rtx (V4SImode);
7834 20 : emit_insn (gen_ashrv4si3 (temp,
7835 : lowpart_subreg (V4SImode,
7836 : force_reg (V2DImode,
7837 : operands[1]),
7838 : V2DImode),
7839 : GEN_INT (31)));
7840 20 : zero_or_all_ones = gen_reg_rtx (V4SImode);
7841 20 : emit_insn (gen_sse2_pshufd_1 (zero_or_all_ones, temp,
7842 : const1_rtx, const1_rtx,
7843 : GEN_INT (3), GEN_INT (3))); /* Shuffle selecting elements 1, 1, 3, 3 of TEMP.  */
7844 20 : zero_or_all_ones = lowpart_subreg (V2DImode, zero_or_all_ones,
7845 : V4SImode);
7846 : }
7847 20 : rtx lshr_res = gen_reg_rtx (V2DImode);
7848 20 : emit_insn (gen_lshrv2di3 (lshr_res, operands[1], operands[2]));
7849 20 : rtx ashl_res = gen_reg_rtx (V2DImode);
7850 20 : rtx amount; /* Will hold 64 - COUNT.  */
7851 20 : if (TARGET_64BIT)
7852 : {
7853 20 : amount = gen_reg_rtx (DImode);
7854 20 : emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)),
7855 : operands[2]));
7856 : }
7857 : else
7858 : {
7859 0 : rtx temp = gen_reg_rtx (SImode);
7860 0 : emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)),
7861 : lowpart_subreg (SImode, operands[2],
7862 : DImode)));
7863 0 : amount = gen_reg_rtx (V4SImode);
7864 0 : emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode),
7865 : temp)); /* Place the SImode difference in element 0 of a zero vector.  */
7866 : }
7867 20 : amount = lowpart_subreg (DImode, amount, GET_MODE (amount));
7868 20 : emit_insn (gen_ashlv2di3 (ashl_res, zero_or_all_ones, amount));
7869 20 : emit_insn (gen_iorv2di3 (operands[0], lshr_res, ashl_res)); /* Merge the logical shift with the shifted-in mask.  */
7870 20 : return;
7871 : }
7872 :
7873 96 : rtx reg = gen_reg_rtx (V2DImode); /* Per-element shift count vector for the XOP pattern.  */
7874 96 : rtx par;
7875 96 : bool negate = false;
7876 96 : int i;
7877 :
7878 96 : if (CONST_INT_P (operands[2]))
7879 96 : operands[2] = GEN_INT (-INTVAL (operands[2])); /* Negate a constant count up front.  */
7880 : else
7881 : negate = true; /* A variable count is negated after the vector is built.  */
7882 :
7883 96 : par = gen_rtx_PARALLEL (V2DImode, rtvec_alloc (2));
7884 288 : for (i = 0; i < 2; i++)
7885 192 : XVECEXP (par, 0, i) = operands[2];
7886 :
7887 96 : emit_insn (gen_vec_initv2didi (reg, par));
7888 :
7889 96 : if (negate)
7890 0 : emit_insn (gen_negv2di2 (reg, reg));
7891 :
7892 96 : emit_insn (gen_xop_shav2di3 (operands[0], operands[1], reg));
7893 : }
7894 :
7895 : /* Replace all occurrences of REG FROM with REG TO in X, including
7896 : occurrences with different modes.
          FROM and TO must both be REGs of the same mode (checked by a
          checking assert).  If FROM is not mentioned in X, X itself is
          returned; otherwise the replacement is done on a copy of X made
          with copy_rtx and that copy is returned.  */
7897 :
7898 : rtx
7899 39725 : ix86_replace_reg_with_reg (rtx x, rtx from, rtx to)
7900 : {
7901 39725 : gcc_checking_assert (REG_P (from)
7902 : && REG_P (to)
7903 : && GET_MODE (from) == GET_MODE (to));
7904 39725 : if (!reg_overlap_mentioned_p (from, x))
7905 : return x; /* Nothing to replace: hand back the original rtx.  */
7906 94 : rtx ret = copy_rtx (x);
7907 94 : subrtx_ptr_iterator::array_type array;
7908 458 : FOR_EACH_SUBRTX_PTR (iter, array, &ret, NONCONST)
7909 : {
7910 364 : rtx *loc = *iter;
7911 364 : x = *loc; /* X is reused as the current sub-rtx from here on.  */
7912 364 : if (REG_P (x) && REGNO (x) == REGNO (from))
7913 : {
7914 94 : if (x == from)
7915 94 : *loc = to; /* Same rtx object: substitute TO directly.  */
7916 : else
7917 : {
7918 0 : gcc_checking_assert (REG_NREGS (x) == 1);
7919 0 : *loc = gen_rtx_REG (GET_MODE (x), REGNO (to)); /* Keep the occurrence's own mode, use TO's register number.  */
7920 : }
7921 : }
7922 : }
7923 94 : return ret;
7924 94 : }
7925 :
7926 : /* Return mode for the memcpy/memset loop counter. Prefer SImode over
7927 : DImode for constant loop counts.
          If COUNT_EXP has a mode, that mode is returned.  A modeless
          COUNT_EXP that is not a CONST_INT gets Pmode.  A CONST_INT gets
          DImode on 64-bit targets when any bit above the low 32 is set and
          SImode otherwise.  */
7928 :
7929 : static machine_mode
7930 32362 : counter_mode (rtx count_exp)
7931 : {
7932 7507 : if (GET_MODE (count_exp) != VOIDmode)
7933 25679 : return GET_MODE (count_exp);
7934 6683 : if (!CONST_INT_P (count_exp))
7935 0 : return Pmode;
          /* The mask must be widened before it is complemented: 0xffffffff
             has type unsigned int, so a plain ~0xffffffff evaluates to 0 and
             the test below could never succeed.  */
7936 : if (TARGET_64BIT && (INTVAL (count_exp) & ~(HOST_WIDE_INT) 0xffffffff))
7937 : return DImode;
7938 : return SImode;
7939 : }
7940 :
7941 : /* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR
7942 : to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT
7943 : specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set
7944 : memory by VALUE (supposed to be in MODE).
7945 :
7946 : The size is rounded down to whole number of chunk size moved at once.
7947 : SRCMEM and DESTMEM provide MEM rtx to feed proper aliasing info.

          EXPECTED_SIZE is -1 when unknown; otherwise it is used to predict
          the loop back edge and, together with a constant COUNT, to detect a
          loop that runs exactly once, in which case no labels or back edge
          are emitted.  On exit DESTPTR (and SRCPTR when copying) has been
          advanced by the number of bytes processed.  */
7948 :
7949 :
7950 : static void
7951 18256 : expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
7952 : rtx destptr, rtx srcptr, rtx value,
7953 : rtx count, machine_mode mode, int unroll,
7954 : int expected_size, bool issetmem)
7955 : {
7956 18256 : rtx_code_label *out_label = nullptr;
7957 18256 : rtx_code_label *top_label = nullptr;
7958 18256 : rtx iter, tmp;
7959 18256 : machine_mode iter_mode = counter_mode (count);
7960 18256 : int piece_size_n = GET_MODE_SIZE (mode) * unroll; /* Bytes handled per loop iteration.  */
7961 18256 : rtx piece_size = GEN_INT (piece_size_n);
7962 36512 : rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
7963 18256 : rtx size;
7964 18256 : int i;
7965 18256 : int loop_count;
7966 :
7967 18256 : if (expected_size != -1 && CONST_INT_P (count))
7968 6604 : loop_count = INTVAL (count) / GET_MODE_SIZE (mode) / unroll;
7969 : else
7970 : loop_count = -1; /* Iteration count not known at expand time.  */
7971 :
7972 : /* Don't generate the loop if the loop count is 1. */
7973 6604 : if (loop_count != 1)
7974 : {
7975 18254 : top_label = gen_label_rtx ();
7976 18254 : out_label = gen_label_rtx ();
7977 : }
7978 18256 : iter = gen_reg_rtx (iter_mode);
7979 :
7980 18256 : size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
7981 : NULL, 1, OPTAB_DIRECT); /* COUNT rounded down to a multiple of the piece size.  */
7982 : /* Those two should combine. */
7983 18256 : if (piece_size == const1_rtx)
7984 : {
          /* NOTE(review): OUT_LABEL is still null here when LOOP_COUNT == 1;
             confirm that a one-byte piece size never coincides with a
             single-iteration loop.  */
7985 4193 : emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
7986 : true, out_label); /* Skip the loop entirely when there is nothing to do.  */
7987 4193 : predict_jump (REG_BR_PROB_BASE * 10 / 100);
7988 : }
7989 18256 : emit_move_insn (iter, const0_rtx);
7990 :
7991 18256 : if (loop_count != 1)
7992 18254 : emit_label (top_label);
7993 :
7994 21001 : tmp = convert_modes (Pmode, iter_mode, iter, true); /* ITER as a Pmode offset.  */
7995 :
7996 : /* This assert could be relaxed - in this case we'll need to compute
7997 : smallest power of two, containing in PIECE_SIZE_N and pass it to
7998 : offset_address. */
7999 18256 : gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
8000 18256 : destmem = offset_address (destmem, tmp, piece_size_n);
8001 18256 : destmem = adjust_address (destmem, mode, 0);
8002 :
8003 18256 : if (!issetmem)
8004 : {
8005 11961 : srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
8006 11961 : srcmem = adjust_address (srcmem, mode, 0);
8007 :
8008 : /* When unrolling for chips that reorder memory reads and writes,
8009 : we can save registers by using single temporary.
8010 : Also using 4 temporaries is overkill in 32bit mode. */
8011 11961 : if (!TARGET_64BIT && 0) /* Disabled by the "&& 0".  */
8012 : {
8013 : for (i = 0; i < unroll; i++)
8014 : {
8015 : if (i)
8016 : {
8017 : destmem = adjust_address (copy_rtx (destmem), mode,
8018 : GET_MODE_SIZE (mode));
8019 : srcmem = adjust_address (copy_rtx (srcmem), mode,
8020 : GET_MODE_SIZE (mode));
8021 : }
8022 : emit_move_insn (destmem, srcmem);
8023 : }
8024 : }
8025 : else
8026 : {
8027 11961 : rtx tmpreg[4];
8028 11961 : gcc_assert (unroll <= 4); /* TMPREG has only four slots.  */
8029 49514 : for (i = 0; i < unroll; i++) /* First emit all the loads...  */
8030 : {
8031 37553 : tmpreg[i] = gen_reg_rtx (mode);
8032 37553 : if (i)
8033 51184 : srcmem = adjust_address (copy_rtx (srcmem), mode,
8034 : GET_MODE_SIZE (mode));
8035 37553 : emit_move_insn (tmpreg[i], srcmem);
8036 : }
8037 49514 : for (i = 0; i < unroll; i++) /* ...then all the stores.  */
8038 : {
8039 37553 : if (i)
8040 51184 : destmem = adjust_address (copy_rtx (destmem), mode,
8041 : GET_MODE_SIZE (mode));
8042 37553 : emit_move_insn (destmem, tmpreg[i]);
8043 : }
8044 : }
8045 : }
8046 : else
8047 29018 : for (i = 0; i < unroll; i++) /* Setmem: store VALUE UNROLL times.  */
8048 : {
8049 22723 : if (i)
8050 32856 : destmem = adjust_address (copy_rtx (destmem), mode,
8051 : GET_MODE_SIZE (mode));
8052 22723 : emit_move_insn (destmem, value);
8053 : }
8054 :
8055 18256 : tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
8056 : true, OPTAB_LIB_WIDEN); /* ITER += piece size.  */
8057 18256 : if (tmp != iter)
8058 0 : emit_move_insn (iter, tmp);
8059 :
8060 18256 : if (loop_count != 1)
8061 : {
8062 18254 : emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
8063 : true, top_label); /* Loop back while ITER < SIZE.  */
8064 18254 : if (expected_size != -1)
8065 : {
8066 9105 : expected_size /= GET_MODE_SIZE (mode) * unroll; /* Now the expected iteration count.  */
8067 9105 : if (expected_size == 0)
8068 1 : predict_jump (0);
8069 9104 : else if (expected_size > REG_BR_PROB_BASE)
8070 2 : predict_jump (REG_BR_PROB_BASE - 1);
8071 : else
8072 9102 : predict_jump (REG_BR_PROB_BASE
8073 9102 : - (REG_BR_PROB_BASE + expected_size / 2)
8074 9102 : / expected_size);
8075 : }
8076 : else
8077 9149 : predict_jump (REG_BR_PROB_BASE * 80 / 100);
8078 : }
8079 18256 : iter = ix86_zero_extend_to_Pmode (iter);
8080 21001 : tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
8081 : true, OPTAB_LIB_WIDEN); /* Advance DESTPTR past the bytes handled.  */
8082 18256 : if (tmp != destptr)
8083 0 : emit_move_insn (destptr, tmp);
8084 18256 : if (!issetmem)
8085 : {
8086 13308 : tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
8087 : true, OPTAB_LIB_WIDEN); /* Likewise for SRCPTR.  */
8088 11961 : if (tmp != srcptr)
8089 0 : emit_move_insn (srcptr, tmp);
8090 : }
8091 18256 : if (loop_count != 1)
8092 18254 : emit_label (out_label);
8093 18256 : }
8094 :
8095 : /* Divide COUNTREG by SCALE.
          A SCALE of 1 returns COUNTREG unchanged and a CONST_INT is divided
          at expand time.  Otherwise COUNTREG must be a REG and the division
          is emitted as a logical right shift by exact_log2 (SCALE), so SCALE
          is presumably always a power of two.  */
8096 : static rtx
8097 16660 : scale_counter (rtx countreg, int scale)
8098 : {
8099 16660 : rtx sc;
8100 :
8101 16660 : if (scale == 1)
8102 : return countreg;
8103 10742 : if (CONST_INT_P (countreg))
8104 10725 : return GEN_INT (INTVAL (countreg) / scale);
8105 17 : gcc_assert (REG_P (countreg));
8106 :
8107 51 : sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
8108 34 : GEN_INT (exact_log2 (scale)),
8109 : NULL, 1, OPTAB_DIRECT);
8110 17 : return sc;
8111 : }
8112 :
8113 : /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
8114 : When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
8115 : When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
8116 : For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
8117 : ORIG_VALUE is the original value passed to memset to fill the memory with.
8118 : Other arguments have same meaning as for previous function.
          MODE is the element mode of the string operation; the element count
          handed to the pattern is COUNT divided by its size.  */
8119 :
8120 : static void
8121 16660 : expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
8122 : rtx destptr, rtx srcptr, rtx value, rtx orig_value,
8123 : rtx count,
8124 : machine_mode mode, bool issetmem)
8125 : {
8126 16660 : rtx destexp;
8127 16660 : rtx srcexp;
8128 16660 : rtx countreg;
8129 16660 : HOST_WIDE_INT rounded_count;
8130 :
8131 : /* If possible, it is shorter to use rep movs.
8132 : TODO: Maybe it is better to move this logic to decide_alg. */
8133 16660 : if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
8134 243 : && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8135 239 : && (!issetmem || orig_value == const0_rtx))
8136 16660 : mode = SImode; /* Constant multiple of 4 bytes, copying or zero fill: use SImode elements.  */
8137 :
8138 16660 : if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
8139 16382 : destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
8140 :
8141 33320 : countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
8142 16660 : GET_MODE_SIZE (mode))); /* Element count in Pmode.  */
8143 16660 : if (mode != QImode)
8144 : {
8145 32486 : destexp = gen_rtx_ASHIFT (Pmode, countreg,
8146 : GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
8147 11002 : destexp = gen_rtx_PLUS (Pmode, destexp, destptr); /* DESTPTR + COUNTREG * element size.  */
8148 : }
8149 : else
8150 5940 : destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
8151 16660 : if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
8152 : {
8153 11412 : rounded_count
8154 11412 : = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
8155 11412 : destmem = shallow_copy_rtx (destmem);
8156 11412 : set_mem_size (destmem, rounded_count); /* Record the exact number of bytes written.  */
8157 : }
8158 5256 : else if (MEM_SIZE_KNOWN_P (destmem))
8159 333 : clear_mem_size (destmem); /* Otherwise drop any recorded size.  */
8160 :
8161 16660 : if (issetmem)
8162 : {
8163 6067 : value = force_reg (mode, gen_lowpart (mode, value));
8164 6067 : emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
8165 : }
8166 : else
8167 : {
8168 10593 : if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
8169 10378 : srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
8170 10593 : if (mode != QImode)
8171 : {
8172 18160 : srcexp = gen_rtx_ASHIFT (Pmode, countreg,
8173 : GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
8174 6176 : srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr); /* SRCPTR + COUNTREG * element size.  */
8175 : }
8176 : else
8177 4619 : srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
8178 10593 : if (CONST_INT_P (count))
8179 : {
8180 6487 : rounded_count
8181 6487 : = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
8182 6487 : srcmem = shallow_copy_rtx (srcmem);
8183 6487 : set_mem_size (srcmem, rounded_count);
8184 : }
8185 : else
8186 : {
8187 4120 : if (MEM_SIZE_KNOWN_P (srcmem))
8188 0 : clear_mem_size (srcmem);
8189 : }
8190 10593 : emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
8191 : destexp, srcexp));
8192 : }
8193 16660 : }
8194 :
8195 : /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
8196 : DESTMEM.
8197 : SRC is passed by pointer to be updated on return.
8198 : Return value is updated DST.
          DESTPTR and SRCPTR are the address registers; each is incremented
          by the piece size after every piece copied.  */
8199 : static rtx
8200 13 : emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
8201 : HOST_WIDE_INT size_to_move)
8202 : {
8203 13 : rtx dst = destmem, src = *srcmem, tempreg;
8204 13 : enum insn_code code;
8205 13 : machine_mode move_mode;
8206 13 : int piece_size, i;
8207 :
8208 : /* Find the widest mode in which we could perform moves.
8209 : Start with the biggest power of 2 not exceeding SIZE_TO_MOVE and halve
8210 : it until move of such size is supported. */
8211 13 : piece_size = 1 << floor_log2 (size_to_move);
8212 26 : while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
8213 26 : || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
8214 : {
8215 0 : gcc_assert (piece_size > 1);
8216 0 : piece_size >>= 1;
8217 : }
8218 :
8219 : /* Find the corresponding vector mode with the same size as MOVE_MODE.
8220 : MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8221 39 : if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
8222 : {
8223 0 : int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
8224 0 : if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
8225 0 : || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
8226 : {
8227 0 : move_mode = word_mode; /* No usable vector mode: fall back to word-sized pieces.  */
8228 0 : piece_size = GET_MODE_SIZE (move_mode);
8229 0 : code = optab_handler (mov_optab, move_mode);
8230 : }
8231 : }
8232 13 : gcc_assert (code != CODE_FOR_nothing);
8233 :
8234 13 : dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
8235 13 : src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
8236 :
8237 : /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
8238 13 : gcc_assert (size_to_move % piece_size == 0);
8239 :
8240 26 : for (i = 0; i < size_to_move; i += piece_size)
8241 : {
8242 : /* We move from memory to memory, so we'll need to do it via
8243 : a temporary register. */
8244 13 : tempreg = gen_reg_rtx (move_mode);
8245 13 : emit_insn (GEN_FCN (code) (tempreg, src));
8246 13 : emit_insn (GEN_FCN (code) (dst, tempreg));
8247 :
8248 26 : emit_move_insn (destptr,
8249 13 : plus_constant (Pmode, copy_rtx (destptr), piece_size)); /* Bump the destination pointer.  */
8250 26 : emit_move_insn (srcptr,
8251 13 : plus_constant (Pmode, copy_rtx (srcptr), piece_size)); /* Bump the source pointer.  */
8252 :
8253 13 : dst = adjust_automodify_address_nv (dst, move_mode, destptr,
8254 : piece_size);
8255 13 : src = adjust_automodify_address_nv (src, move_mode, srcptr,
8256 : piece_size);
8257 : }
8258 :
8259 : /* Update DST and SRC rtx. */
8260 13 : *srcmem = src;
8261 13 : return dst;
8262 : }
8263 :
8264 : /* Helper function for the string operations below. Test VARIABLE whether
8265 : it is aligned to VALUE bytes. If true, jump to the label.
          More precisely, the emitted code computes VARIABLE & VALUE and jumps
          when the result is zero.  The label is created here and returned but
          not emitted; the caller has to place it.  EPILOGUE selects the
          predicted probability of the jump: 50% when true, 90% otherwise.  */
8266 :
8267 : static rtx_code_label *
8268 35973 : ix86_expand_aligntest (rtx variable, int value, bool epilogue)
8269 : {
8270 35973 : rtx_code_label *label = gen_label_rtx ();
8271 35973 : rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
8272 35973 : if (GET_MODE (variable) == DImode)
8273 905 : emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
8274 : else
8275 35068 : emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value))); /* Any non-DImode VARIABLE is handled as SImode.  */
8276 35973 : emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
8277 : 1, label);
8278 35973 : if (epilogue)
8279 3 : predict_jump (REG_BR_PROB_BASE * 50 / 100);
8280 : else
8281 35970 : predict_jump (REG_BR_PROB_BASE * 90 / 100);
8282 35973 : return label;
8283 : }
8284 :
8285 :
8286 : /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.
          A constant COUNT is handled by move_by_pieces on COUNT % MAX_SIZE
          bytes.  Otherwise, for MAX_SIZE > 8 a byte loop over the masked
          count is emitted, and for smaller MAX_SIZE conditional copies of 4,
          2 and 1 bytes are emitted, each guarded by a test of the matching
          bit of COUNT.  */
8287 :
8288 : static void
8289 8580 : expand_cpymem_epilogue (rtx destmem, rtx srcmem,
8290 : rtx destptr, rtx srcptr, rtx count, int max_size)
8291 : {
8292 8580 : rtx src, dest;
8293 8580 : if (CONST_INT_P (count))
8294 : {
8295 6401 : unsigned HOST_WIDE_INT countval = UINTVAL (count);
8296 6401 : unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
8297 6401 : unsigned int destalign = MEM_ALIGN (destmem);
8298 6401 : cfun->machine->by_pieces_in_use = true; /* Flag is set only for the duration of the by-pieces expansion.  */
8299 6401 : move_by_pieces (destmem, srcmem, epilogue_size, destalign,
8300 : RETURN_BEGIN);
8301 6401 : cfun->machine->by_pieces_in_use = false;
8302 6401 : return;
8303 : }
8304 2179 : if (max_size > 8)
8305 : {
8306 2179 : count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
8307 : count, 1, OPTAB_DIRECT); /* Keep only the residual byte count.  */
8308 2179 : expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
8309 : count, QImode, 1, 4, false); /* Byte loop, not unrolled, expected size 4.  */
8310 2179 : return;
8311 : }
8312 :
8313 : /* When there are stringops, we can cheaply increase dest and src pointers.
8314 : Otherwise we save code size by maintaining offset (zero is readily
8315 : available from preceding rep operation) and using x86 addressing modes.
8316 : */
8317 0 : if (TARGET_SINGLE_STRINGOP)
8318 : {
8319 0 : if (max_size > 4)
8320 : {
8321 0 : rtx_code_label *label = ix86_expand_aligntest (count, 4, true); /* Skip the 4-byte copy when bit 2 of COUNT is clear.  */
8322 0 : src = change_address (srcmem, SImode, srcptr);
8323 0 : dest = change_address (destmem, SImode, destptr);
8324 0 : emit_insn (gen_strmov (destptr, dest, srcptr, src));
8325 0 : emit_label (label);
8326 0 : LABEL_NUSES (label) = 1;
8327 : }
8328 0 : if (max_size > 2)
8329 : {
8330 0 : rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
8331 0 : src = change_address (srcmem, HImode, srcptr);
8332 0 : dest = change_address (destmem, HImode, destptr);
8333 0 : emit_insn (gen_strmov (destptr, dest, srcptr, src));
8334 0 : emit_label (label);
8335 0 : LABEL_NUSES (label) = 1;
8336 : }
8337 0 : if (max_size > 1)
8338 : {
8339 0 : rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
8340 0 : src = change_address (srcmem, QImode, srcptr);
8341 0 : dest = change_address (destmem, QImode, destptr);
8342 0 : emit_insn (gen_strmov (destptr, dest, srcptr, src));
8343 0 : emit_label (label);
8344 0 : LABEL_NUSES (label) = 1;
8345 : }
8346 : }
8347 : else
8348 : {
8349 0 : rtx offset = force_reg (Pmode, const0_rtx); /* Running byte offset from SRCPTR/DESTPTR.  */
8350 0 : rtx tmp;
8351 :
8352 0 : if (max_size > 4)
8353 : {
8354 0 : rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
8355 0 : src = change_address (srcmem, SImode, srcptr);
8356 0 : dest = change_address (destmem, SImode, destptr);
8357 0 : emit_move_insn (dest, src);
8358 0 : tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
8359 : true, OPTAB_LIB_WIDEN);
8360 0 : if (tmp != offset)
8361 0 : emit_move_insn (offset, tmp);
8362 0 : emit_label (label);
8363 0 : LABEL_NUSES (label) = 1;
8364 : }
8365 0 : if (max_size > 2)
8366 : {
8367 0 : rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
8368 0 : tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
8369 0 : src = change_address (srcmem, HImode, tmp);
8370 0 : tmp = gen_rtx_PLUS (Pmode, destptr, offset);
8371 0 : dest = change_address (destmem, HImode, tmp);
8372 0 : emit_move_insn (dest, src);
8373 0 : tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
8374 : true, OPTAB_LIB_WIDEN);
8375 0 : if (tmp != offset)
8376 0 : emit_move_insn (offset, tmp);
8377 0 : emit_label (label);
8378 0 : LABEL_NUSES (label) = 1;
8379 : }
8380 0 : if (max_size > 1)
8381 : {
8382 0 : rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
8383 0 : tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
8384 0 : src = change_address (srcmem, QImode, tmp);
8385 0 : tmp = gen_rtx_PLUS (Pmode, destptr, offset);
8386 0 : dest = change_address (destmem, QImode, tmp);
8387 0 : emit_move_insn (dest, src); /* Last piece: OFFSET needs no further update.  */
8388 0 : emit_label (label);
8389 0 : LABEL_NUSES (label) = 1;
8390 : }
8391 : }
8392 : }
8393 :
8394 : /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
8395 : with value PROMOTED_VAL.
8396 : DESTPTR is the destination address register; it is advanced as pieces
8397 : are stored.  Return value is updated DST. */
8398 : static rtx
8399 6 : emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
8400 : HOST_WIDE_INT size_to_move)
8401 : {
8402 6 : rtx dst = destmem;
8403 6 : enum insn_code code;
8404 6 : machine_mode move_mode;
8405 6 : int piece_size, i;
8406 :
8407 : /* Pick the mode in which to perform the stores.
8408 : Start with the mode of PROMOTED_VAL (QImode if it has none) and, if
8409 : SIZE_TO_MOVE is smaller, narrow to the integer mode of that size. */
8410 6 : move_mode = GET_MODE (promoted_val);
8411 6 : if (move_mode == VOIDmode)
8412 0 : move_mode = QImode;
8413 12 : if (size_to_move < GET_MODE_SIZE (move_mode))
8414 : {
8415 5 : unsigned int move_bits = size_to_move * BITS_PER_UNIT;
8416 5 : move_mode = int_mode_for_size (move_bits, 0).require ();
8417 5 : promoted_val = gen_lowpart (move_mode, promoted_val);
8418 : }
8419 6 : piece_size = GET_MODE_SIZE (move_mode);
8420 6 : code = optab_handler (mov_optab, move_mode);
8421 6 : gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
8422 :
8423 6 : dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
8424 :
8425 : /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
8426 6 : gcc_assert (size_to_move % piece_size == 0);
8427 :
8428 12 : for (i = 0; i < size_to_move; i += piece_size)
8429 : {
8430 12 : if (piece_size <= GET_MODE_SIZE (word_mode))
8431 : {
8432 4 : emit_insn (gen_strset (destptr, dst, promoted_val)); /* Word-sized or smaller: use the strset pattern; no explicit pointer bump is emitted here.  */
8433 4 : dst = adjust_automodify_address_nv (dst, move_mode, destptr,
8434 : piece_size);
8435 4 : continue;
8436 : }
8437 :
8438 2 : emit_insn (GEN_FCN (code) (dst, promoted_val)); /* Wider than a word: plain move plus explicit pointer bump.  */
8439 :
8440 4 : emit_move_insn (destptr,
8441 2 : plus_constant (Pmode, copy_rtx (destptr), piece_size));
8442 :
8443 2 : dst = adjust_automodify_address_nv (dst, move_mode, destptr,
8444 : piece_size);
8445 : }
8446 :
8447 : /* Update DST rtx. */
8448 6 : return dst;
8449 : }
8450 : /* Output code to set at most count & (max_size - 1) bytes starting by DEST.
          COUNT is masked with MAX_SIZE - 1 and the remaining bytes are stored
          one at a time by a QImode loop using the low byte of VALUE; the loop
          is given MAX_SIZE / 2 as its expected size.  */
8451 : static void
8452 311 : expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
8453 : rtx count, int max_size)
8454 : {
8455 622 : count = expand_simple_binop (counter_mode (count), AND, count,
8456 311 : GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
8457 311 : expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
8458 311 : gen_lowpart (QImode, value), count, QImode,
8459 : 1, max_size / 2, true);
8460 311 : }
8461 :
8462 : /* Callback routine for store_by_pieces. Return the RTL of a register
8463 : containing GET_MODE_SIZE (MODE) bytes in the RTL register op_p which
8464 : is an integer or a word vector register. If PREV_P isn't nullptr,
8465 : it has the RTL info from the previous iteration.
          The value from the previous iteration is reused when it already has
          MODE, or through a SUBREG when both modes are vector modes with the
          same element mode.  The unnamed HOST_WIDE_INT argument is unused.  */
8466 :
8467 : static rtx
8468 5018 : setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
8469 : fixed_size_mode mode)
8470 : {
8471 5018 : rtx target;
8472 5018 : by_pieces_prev *prev = (by_pieces_prev *) prev_p;
8473 5018 : if (prev)
8474 : {
8475 5018 : rtx prev_op = prev->data;
8476 5018 : if (prev_op)
8477 : {
8478 2908 : machine_mode prev_mode = GET_MODE (prev_op);
8479 2908 : if (prev_mode == mode)
8480 : return prev_op; /* Previous value already has the requested mode.  */
8481 54 : if (VECTOR_MODE_P (prev_mode)
8482 1098 : && VECTOR_MODE_P (mode)
8483 1152 : && GET_MODE_INNER (prev_mode) == GET_MODE_INNER (mode))
8484 : {
8485 0 : target = gen_rtx_SUBREG (mode, prev_op, 0);
8486 0 : return target;
8487 : }
8488 : }
8489 : }
8490 :
8491 3262 : rtx op = (rtx) op_p;
8492 3262 : machine_mode op_mode = GET_MODE (op);
8493 :
8494 3262 : if (VECTOR_MODE_P (mode))
8495 : {
8496 3692 : gcc_assert (GET_MODE_INNER (mode) == QImode); /* Only byte vectors are expected here.  */
8497 :
8498 1846 : unsigned int op_size = GET_MODE_SIZE (op_mode);
8499 1846 : unsigned int size = GET_MODE_SIZE (mode);
8500 1846 : unsigned int nunits;
8501 1846 : machine_mode vec_mode;
8502 1846 : if (op_size < size)
8503 : {
8504 : /* If OP size is smaller than MODE size, duplicate it. */
8505 1 : nunits = size / GET_MODE_SIZE (QImode);
8506 1 : vec_mode = mode_for_vector (QImode, nunits).require ();
8507 1 : nunits = size / op_size; /* Number of copies of OP needed to fill MODE.  */
8508 1 : gcc_assert (SCALAR_INT_MODE_P (op_mode));
8509 1 : machine_mode dup_mode
8510 1 : = mode_for_vector (as_a <scalar_mode> (op_mode),
8511 2 : nunits).require ();
8512 1 : target = gen_reg_rtx (vec_mode);
8513 1 : op = gen_vec_duplicate (dup_mode, op);
8514 1 : rtx dup_op = gen_reg_rtx (dup_mode);
8515 1 : emit_move_insn (dup_op, op);
8516 1 : op = gen_rtx_SUBREG (vec_mode, dup_op, 0); /* View the duplicated value as a byte vector.  */
8517 1 : emit_move_insn (target, op);
8518 1 : return target;
8519 : }
8520 1845 : nunits = op_size / GET_MODE_SIZE (QImode);
8521 1845 : vec_mode = mode_for_vector (QImode, nunits).require ();
8522 1845 : target = gen_reg_rtx (vec_mode);
8523 1845 : op = gen_rtx_SUBREG (vec_mode, op, 0);
8524 1845 : emit_move_insn (target, op);
8525 1845 : if (op_size == size)
8526 : return target;
8527 :
8528 0 : rtx tmp = gen_reg_rtx (mode); /* OP is wider than MODE: copy out a MODE-sized SUBREG at offset 0.  */
8529 0 : target = gen_rtx_SUBREG (mode, target, 0);
8530 0 : emit_move_insn (tmp, target);
8531 0 : return tmp;
8532 : }
8533 :
8534 1416 : if (VECTOR_MODE_P (op_mode))
8535 : {
8536 2822 : gcc_assert (GET_MODE_INNER (op_mode) == word_mode);
8537 1411 : target = gen_reg_rtx (word_mode);
8538 1411 : op = gen_rtx_SUBREG (word_mode, op, 0); /* Scalar MODE from a word vector: start from a word-sized SUBREG at offset 0.  */
8539 1411 : emit_move_insn (target, op);
8540 : }
8541 : else
8542 : target = op;
8543 :
8544 1416 : if (mode == GET_MODE (target))
8545 : return target;
8546 :
8547 241 : rtx tmp = gen_reg_rtx (mode);
8548 241 : target = gen_rtx_SUBREG (mode, target, 0);
8549 241 : emit_move_insn (tmp, target);
8550 241 : return tmp;
8551 : }
8552 :
8553 : /* Output code to set at most COUNT & (MAX_SIZE - 1) bytes starting at DESTMEM, which is addressed through DESTPTR. A constant COUNT is expanded by store_by_pieces for COUNT % MAX_SIZE bytes (using VEC_VALUE when it is non-null, VALUE otherwise); MAX_SIZE > 32 defers to expand_setmem_epilogue_via_loop; otherwise stores of 16, 8, 4, 2 and 1 bytes are emitted, each preceded by an ix86_expand_aligntest on COUNT. */
8554 : static void
8555 7916 : expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
8556 : rtx count, int max_size)
8557 : {
8558 7916 : rtx dest;
8559 :
8560 7916 : if (CONST_INT_P (count))
8561 : {
8562 7604 : unsigned HOST_WIDE_INT countval = UINTVAL (count);
8563 7604 : unsigned HOST_WIDE_INT epilogue_size = countval % max_size; /* Only the remainder modulo MAX_SIZE is stored here. */
8564 7604 : unsigned int destalign = MEM_ALIGN (destmem);
8565 7604 : cfun->machine->by_pieces_in_use = true; /* The flag is raised only around the store_by_pieces call and cleared right after it. */
8566 12303 : store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val,
8567 : vec_value ? vec_value : value, destalign, true,
8568 : RETURN_BEGIN);
8569 7604 : cfun->machine->by_pieces_in_use = false;
8570 7604 : return;
8571 : }
8572 312 : if (max_size > 32) /* COUNT is not a compile-time constant from here on. */
8573 : {
8574 311 : expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
8575 311 : return;
8576 : }
8577 1 : if (max_size > 16) /* The tests below do not return, so the cases cascade from 16 bytes down to 1. */
8578 : {
8579 0 : rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
8580 0 : if (TARGET_64BIT)
8581 : {
8582 0 : dest = change_address (destmem, DImode, destptr);
8583 0 : emit_insn (gen_strset (destptr, dest, value)); /* NOTE(review): VALUE is used without gen_lowpart here, unlike the 4/2/1-byte cases; presumably it already has the word-sized mode -- confirm against callers. */
8584 0 : dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
8585 0 : emit_insn (gen_strset (destptr, dest, value));
8586 : }
8587 : else
8588 : {
8589 0 : dest = change_address (destmem, SImode, destptr);
8590 0 : emit_insn (gen_strset (destptr, dest, value));
8591 0 : dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
8592 0 : emit_insn (gen_strset (destptr, dest, value));
8593 0 : dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
8594 0 : emit_insn (gen_strset (destptr, dest, value));
8595 0 : dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
8596 0 : emit_insn (gen_strset (destptr, dest, value));
8597 : }
8598 0 : emit_label (label); /* The label returned by the align test is placed after the guarded stores. */
8599 0 : LABEL_NUSES (label) = 1;
8600 : }
8601 1 : if (max_size > 8)
8602 : {
8603 0 : rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
8604 0 : if (TARGET_64BIT)
8605 : {
8606 0 : dest = change_address (destmem, DImode, destptr);
8607 0 : emit_insn (gen_strset (destptr, dest, value));
8608 : }
8609 : else
8610 : {
8611 0 : dest = change_address (destmem, SImode, destptr);
8612 0 : emit_insn (gen_strset (destptr, dest, value));
8613 0 : dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
8614 0 : emit_insn (gen_strset (destptr, dest, value));
8615 : }
8616 0 : emit_label (label);
8617 0 : LABEL_NUSES (label) = 1;
8618 : }
8619 1 : if (max_size > 4)
8620 : {
8621 1 : rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
8622 1 : dest = change_address (destmem, SImode, destptr);
8623 1 : emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value))); /* Narrow VALUE to the width being stored. */
8624 1 : emit_label (label);
8625 1 : LABEL_NUSES (label) = 1;
8626 : }
8627 1 : if (max_size > 2)
8628 : {
8629 1 : rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
8630 1 : dest = change_address (destmem, HImode, destptr);
8631 1 : emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
8632 1 : emit_label (label);
8633 1 : LABEL_NUSES (label) = 1;
8634 : }
8635 1 : if (max_size > 1)
8636 : {
8637 1 : rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
8638 1 : dest = change_address (destmem, QImode, destptr);
8639 1 : emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
8640 1 : emit_label (label);
8641 1 : LABEL_NUSES (label) = 1;
8642 : }
8643 : }
8644 :
8645 : /* Decrease COUNTREG by VALUE: emits an add of the constant -VALUE to COUNTREG. */
8646 : static void
8647 19 : ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
8648 : {
8649 19 : emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
8650 19 : }
8651 :
8652 : /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
8653 : DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
8654 : Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
8655 : ignored. For every power of two I below DESIRED_ALIGNMENT with ALIGN <= I, an I-byte store or copy is emitted behind an ix86_expand_aligntest on DESTPTR and COUNT is decreased by I.
8656 : Return value is updated DESTMEM. */
8657 :
8658 : static rtx
8659 7 : expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
8660 : rtx destptr, rtx srcptr, rtx value,
8661 : rtx vec_value, rtx count, int align,
8662 : int desired_alignment, bool issetmem)
8663 : {
8664 7 : int i;
8665 35 : for (i = 1; i < desired_alignment; i <<= 1)
8666 : {
8667 28 : if (align <= i) /* Steps already covered by the known alignment are skipped. */
8668 : {
8669 19 : rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
8670 19 : if (issetmem)
8671 : {
8672 12 : if (vec_value && i > GET_MODE_SIZE (GET_MODE (value))) /* Use the vector value only when the piece is wider than the scalar VALUE. */
8673 2 : destmem = emit_memset (destmem, destptr, vec_value, i);
8674 : else
8675 4 : destmem = emit_memset (destmem, destptr, value, i);
8676 : }
8677 : else
8678 13 : destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
8679 19 : ix86_adjust_counter (count, i);
8680 19 : emit_label (label);
8681 19 : LABEL_NUSES (label) = 1;
8682 19 : set_mem_align (destmem, i * 2 * BITS_PER_UNIT); /* After this step DESTMEM is recorded as aligned to 2 * I bytes. */
8683 : }
8684 : }
8685 7 : return destmem;
8686 : }
8687 :
8688 : /* Test if COUNT&SIZE is nonzero and if so, expand cpymem
8689 : or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
8690 : and jump to DONE_LABEL. The sequence handles the first SIZE bytes and then SIZE bytes placed relative to the end of the block (offset COUNT), so the two runs may overlap. For setmem, VEC_VALUE is used when non-null and VALUE otherwise. */
8691 : static void
8692 28754 : expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
8693 : rtx destptr, rtx srcptr,
8694 : rtx value, rtx vec_value,
8695 : rtx count, int size,
8696 : rtx done_label, bool issetmem)
8697 : {
8698 28754 : rtx_code_label *label = ix86_expand_aligntest (count, size, false);
8699 28754 : machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
8700 28754 : rtx modesize;
8701 28754 : rtx scalar_value = value; /* Remember the scalar: VALUE may be replaced by VEC_VALUE below. */
8702 28754 : int n;
8703 :
8704 : /* If we do not have vector value to copy, we must reduce size. */
8705 28754 : if (issetmem)
8706 : {
8707 3618 : if (!vec_value)
8708 : {
8709 9 : if (GET_MODE (value) == VOIDmode && size > 8)
8710 0 : mode = Pmode;
8711 27 : else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
8712 1 : mode = GET_MODE (value);
8713 : }
8714 : else
8715 3609 : mode = GET_MODE (vec_value), value = vec_value;
8716 : }
8717 : else
8718 : {
8719 : /* Choose appropriate vector mode. */
8720 25136 : if (size >= 32)
8721 6282 : switch (MOVE_MAX)
8722 : {
8723 0 : case 64:
8724 0 : if (size >= 64)
8725 : {
8726 : mode = V64QImode;
8727 : break;
8728 : }
8729 : /* FALLTHRU */
8730 0 : case 32:
8731 0 : mode = V32QImode;
8732 0 : break;
8733 : case 16:
8734 : mode = V16QImode;
8735 : break;
8736 : case 8:
8737 : mode = DImode;
8738 : break;
8739 0 : default:
8740 0 : gcc_unreachable ();
8741 : }
8742 18854 : else if (size >= 16)
8743 6282 : mode = TARGET_SSE ? V16QImode : DImode;
8744 25136 : srcmem = change_address (srcmem, mode, srcptr);
8745 : }
8746 32363 : if (issetmem && vec_value && GET_MODE_SIZE (mode) > size)
8747 : {
8748 : /* For memset with vector and the size is smaller than the vector
8749 : size, first try the narrower vector, otherwise, use the
8750 : original value. */
8751 1809 : machine_mode inner_mode = GET_MODE_INNER (mode);
8752 1809 : unsigned int nunits = size / GET_MODE_SIZE (inner_mode);
8753 1809 : if (nunits > 1)
8754 : {
8755 320 : mode = mode_for_vector (GET_MODE_INNER (mode),
8756 320 : nunits).require ();
8757 160 : value = gen_rtx_SUBREG (mode, value, 0);
8758 : }
8759 : else
8760 : {
8761 1649 : scalar_int_mode smode
8762 1649 : = smallest_int_mode_for_size (size * BITS_PER_UNIT).require ();
8763 4947 : gcc_assert (GET_MODE_SIZE (GET_MODE (scalar_value))
8764 : >= GET_MODE_SIZE (smode));
8765 1649 : mode = smode;
8766 1649 : if (GET_MODE (scalar_value) == mode)
8767 : value = scalar_value;
8768 : else
8769 749 : value = gen_rtx_SUBREG (mode, scalar_value, 0);
8770 : }
8771 : }
8772 28754 : destmem = change_address (destmem, mode, destptr);
8773 57508 : modesize = GEN_INT (GET_MODE_SIZE (mode));
8774 57508 : gcc_assert (GET_MODE_SIZE (mode) <= size);
8775 129382 : for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) /* First run: SIZE bytes from the start of the block. */
8776 : {
8777 35937 : if (issetmem)
8778 4519 : emit_move_insn (destmem, gen_lowpart (mode, value));
8779 : else
8780 : {
8781 31418 : emit_move_insn (destmem, srcmem);
8782 62836 : srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8783 : }
8784 71874 : destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8785 : }
8786 :
8787 28754 : destmem = offset_address (destmem, count, 1); /* Re-base on the tail: the address already advanced by the first run is moved by COUNT - 2 * SIZE. */
8788 57508 : destmem = offset_address (destmem, GEN_INT (-2 * size),
8789 28754 : GET_MODE_SIZE (mode));
8790 28754 : if (!issetmem)
8791 : {
8792 25136 : srcmem = offset_address (srcmem, count, 1);
8793 50272 : srcmem = offset_address (srcmem, GEN_INT (-2 * size),
8794 25136 : GET_MODE_SIZE (mode));
8795 : }
8796 129382 : for (n = 0; n * GET_MODE_SIZE (mode) < size; n++) /* Second run: same loop as above, from the re-based address. */
8797 : {
8798 35937 : if (issetmem)
8799 4519 : emit_move_insn (destmem, gen_lowpart (mode, value));
8800 : else
8801 : {
8802 31418 : emit_move_insn (destmem, srcmem);
8803 62836 : srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8804 : }
8805 71874 : destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8806 : }
8807 28754 : emit_jump_insn (gen_jump (done_label));
8808 28754 : emit_barrier ();
8809 :
8810 28754 : emit_label (label); /* Label returned by the COUNT & SIZE test above; emitted after the jump to DONE_LABEL. */
8811 28754 : LABEL_NUSES (label) = 1;
8812 28754 : }
8813 :
8814 : /* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
8815 : and get ready for the main memcpy loop by copying initial DESIRED_ALIGN-ALIGN
8816 : bytes and last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
8817 : proceed with a loop copying SIZE bytes at once. Do moves in MODE.
8818 : DONE_LABEL is a label after the whole copying sequence. The label is created
8819 : on demand if *DONE_LABEL is NULL.
8820 : MIN_SIZE is minimal size of block copied. This value gets adjusted for new
8821 : bounds after the initial copies.
8822 :
8823 : DESTMEM/SRCMEM are memory expressions pointing to the copied block,
8824 : DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
8825 : we will dispatch to a library call for large blocks. When ISSETMEM is true the block is filled with VALUE (or VEC_VALUE when MODE is a vector mode) instead of being copied.
8826 :
8827 : In pseudocode we do:
8828 :
8829 : if (COUNT < SIZE)
8830 : {
8831 : Assume that SIZE is 4. Bigger sizes are handled analogously
8832 : if (COUNT & 4)
8833 : {
8834 : copy 4 bytes from SRCPTR to DESTPTR
8835 : copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
8836 : goto done_label
8837 : }
8838 : if (!COUNT)
8839 : goto done_label;
8840 : copy 1 byte from SRCPTR to DESTPTR
8841 : if (COUNT & 2)
8842 : {
8843 : copy 2 bytes from SRCPTR to DESTPTR
8844 : copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
8845 : }
8846 : }
8847 : else
8848 : {
8849 : copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
8850 : copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
8851 :
8852 : OLD_DESTPTR = DESTPTR;
8853 : Align DESTPTR up to DESIRED_ALIGN
8854 : SRCPTR += DESTPTR - OLD_DESTPTR
8855 : COUNT -= DESTPTR - OLD_DESTPTR
8856 : if (DYNAMIC_CHECK)
8857 : Round COUNT down to multiple of SIZE
8858 : << optional caller supplied zero size guard is here >>
8859 : << optional caller supplied dynamic check is here >>
8860 : << caller supplied main copy loop is here >>
8861 : }
8862 : done_label:
8863 : */
8864 : static void
8865 10546 : expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
8866 : rtx *destptr, rtx *srcptr,
8867 : machine_mode mode,
8868 : rtx value, rtx vec_value,
8869 : rtx *count,
8870 : rtx_code_label **done_label,
8871 : int size,
8872 : int desired_align,
8873 : int align,
8874 : unsigned HOST_WIDE_INT *min_size,
8875 : bool dynamic_check,
8876 : bool issetmem)
8877 : {
8878 10546 : rtx_code_label *loop_label = NULL, *label;
8879 10546 : int n;
8880 10546 : rtx modesize;
8881 10546 : int prolog_size = 0; /* Bytes written by the alignment prologue below. */
8882 10546 : rtx mode_value;
8883 :
8884 : /* Choose the proper value to store. */
8885 10546 : if (issetmem && VECTOR_MODE_P (mode))
8886 : mode_value = vec_value;
8887 : else
8888 10546 : mode_value = value;
8889 21092 : gcc_assert (GET_MODE_SIZE (mode) <= size);
8890 :
8891 : /* See if block is big or small, handle small blocks. */
8892 10546 : if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
8893 : {
8894 7197 : int size2 = size;
8895 7197 : loop_label = gen_label_rtx ();
8896 :
8897 7197 : if (!*done_label)
8898 7197 : *done_label = gen_label_rtx ();
8899 :
8900 7197 : emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
8901 : 1, loop_label); /* COUNT >= SIZE goes straight to the big-block path at LOOP_LABEL. */
8902 7197 : size2 >>= 1;
8903 :
8904 : /* Handle sizes > 3. */
8905 35951 : for (;size2 > 2; size2 >>= 1)
8906 28754 : expand_small_cpymem_or_setmem (destmem, srcmem,
8907 : *destptr, *srcptr,
8908 : value, vec_value,
8909 : *count,
8910 : size2, *done_label, issetmem);
8911 : /* Nothing to copy? Jump to DONE_LABEL if so */
8912 7197 : emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
8913 : 1, *done_label);
8914 :
8915 : /* Do a byte copy. */
8916 7197 : destmem = change_address (destmem, QImode, *destptr);
8917 7197 : if (issetmem)
8918 907 : emit_move_insn (destmem, gen_lowpart (QImode, value));
8919 : else
8920 : {
8921 6290 : srcmem = change_address (srcmem, QImode, *srcptr);
8922 6290 : emit_move_insn (destmem, srcmem);
8923 : }
8924 :
8925 : /* Handle sizes 2 and 3. */
8926 7197 : label = ix86_expand_aligntest (*count, 2, false);
8927 7197 : destmem = change_address (destmem, HImode, *destptr);
8928 7197 : destmem = offset_address (destmem, *count, 1);
8929 7197 : destmem = offset_address (destmem, GEN_INT (-2), 2); /* Address the last two bytes: DESTPTR + COUNT - 2. */
8930 7197 : if (issetmem)
8931 907 : emit_move_insn (destmem, gen_lowpart (HImode, value));
8932 : else
8933 : {
8934 6290 : srcmem = change_address (srcmem, HImode, *srcptr);
8935 6290 : srcmem = offset_address (srcmem, *count, 1);
8936 6290 : srcmem = offset_address (srcmem, GEN_INT (-2), 2);
8937 6290 : emit_move_insn (destmem, srcmem);
8938 : }
8939 :
8940 7197 : emit_label (label);
8941 7197 : LABEL_NUSES (label) = 1;
8942 7197 : emit_jump_insn (gen_jump (*done_label));
8943 7197 : emit_barrier ();
8944 : }
8945 : else
8946 3349 : gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
8947 : || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
8948 :
8949 : /* Start memcpy for COUNT >= SIZE. */
8950 7197 : if (loop_label)
8951 : {
8952 7197 : emit_label (loop_label);
8953 7197 : LABEL_NUSES (loop_label) = 1;
8954 : }
8955 :
8956 : /* Copy first desired_align bytes. */
8957 10546 : if (!issetmem)
8958 7946 : srcmem = change_address (srcmem, mode, *srcptr);
8959 10546 : destmem = change_address (destmem, mode, *destptr);
8960 10546 : modesize = GEN_INT (GET_MODE_SIZE (mode));
8961 21113 : for (n = 0; prolog_size < desired_align - align; n++) /* Whole MODE-sized moves until at least DESIRED_ALIGN - ALIGN bytes are covered. */
8962 : {
8963 21 : if (issetmem)
8964 3 : emit_move_insn (destmem, mode_value);
8965 : else
8966 : {
8967 18 : emit_move_insn (destmem, srcmem);
8968 36 : srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8969 : }
8970 42 : destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8971 42 : prolog_size += GET_MODE_SIZE (mode);
8972 : }
8973 :
8974 :
8975 : /* Copy last SIZE bytes. */
8976 10546 : destmem = offset_address (destmem, *count, 1);
8977 10546 : destmem = offset_address (destmem,
8978 10546 : GEN_INT (-size - prolog_size),
8979 : 1); /* DESTMEM was advanced by PROLOG_SIZE above, so this lands on DESTPTR + COUNT - SIZE. */
8980 10546 : if (issetmem)
8981 2600 : emit_move_insn (destmem, mode_value);
8982 : else
8983 : {
8984 7946 : srcmem = offset_address (srcmem, *count, 1);
8985 7946 : srcmem = offset_address (srcmem,
8986 : GEN_INT (-size - prolog_size),
8987 : 1);
8988 7946 : emit_move_insn (destmem, srcmem);
8989 : }
8990 82628 : for (n = 1; n * GET_MODE_SIZE (mode) < size; n++) /* N starts at 1: the first tail move was emitted just above. */
8991 : {
8992 30768 : destmem = offset_address (destmem, modesize, 1);
8993 30768 : if (issetmem)
8994 7599 : emit_move_insn (destmem, mode_value);
8995 : else
8996 : {
8997 23169 : srcmem = offset_address (srcmem, modesize, 1);
8998 23169 : emit_move_insn (destmem, srcmem);
8999 : }
9000 : }
9001 :
9002 : /* Align destination. */
9003 10546 : if (desired_align > 1 && desired_align > align)
9004 : {
9005 21 : rtx saveddest = *destptr;
9006 :
9007 21 : gcc_assert (desired_align <= size);
9008 : /* Align destptr up, place it to new register. */
9009 21 : *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
9010 : GEN_INT (prolog_size),
9011 : NULL_RTX, 1, OPTAB_DIRECT);
9012 21 : if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
9013 21 : REG_POINTER (*destptr) = 1;
9014 21 : *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
9015 21 : GEN_INT (-desired_align),
9016 : *destptr, 1, OPTAB_DIRECT);
9017 : /* See how many bytes we skipped. SAVEDDEST becomes old DESTPTR minus new DESTPTR. */
9018 21 : saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
9019 : *destptr,
9020 : NULL_RTX, 1, OPTAB_DIRECT);
9021 : /* Adjust srcptr and count by that difference. */
9022 21 : if (!issetmem)
9023 18 : *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
9024 : saveddest, *srcptr, 1, OPTAB_DIRECT);
9025 21 : *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
9026 : saveddest, *count, 1, OPTAB_DIRECT);
9027 : /* We copied at most size + prolog_size. */
9028 21 : if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
9029 14 : *min_size
9030 14 : = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
9031 : else
9032 7 : *min_size = 0;
9033 :
9034 : /* Our loops always round down the block size, but for dispatch to
9035 : library we need precise value. */
9036 21 : if (dynamic_check)
9037 21 : *count = expand_simple_binop (GET_MODE (*count), AND, *count,
9038 : GEN_INT (-size), *count, 1, OPTAB_DIRECT);
9039 : }
9040 : else
9041 : {
9042 10525 : gcc_assert (prolog_size == 0);
9043 : /* Decrease count, so we won't end up copying last word twice. */
9044 10525 : if (!CONST_INT_P (*count))
9045 7197 : *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
9046 : constm1_rtx, *count, 1, OPTAB_DIRECT);
9047 : else
9048 3328 : *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
9049 : (unsigned HOST_WIDE_INT)size));
9050 10525 : if (*min_size)
9051 9351 : *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
9052 : }
9053 10546 : }
9054 :
9055 :
9056 : /* This function is like the previous one, except here we know how many bytes
9057 : need to be copied (ALIGN_BYTES). That allows us to update alignment not only of DST, which
9058 : is returned, but also of SRC, which is passed as a pointer (SRCP) for that
9059 : reason. One store or copy is emitted for each power-of-two piece whose bit is set in ALIGN_BYTES; SRCP is only dereferenced when ISSETMEM is false. */
9060 : static rtx
9061 0 : expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
9062 : rtx srcreg, rtx value, rtx vec_value,
9063 : int desired_align, int align_bytes,
9064 : bool issetmem)
9065 : {
9066 0 : rtx src = NULL;
9067 0 : rtx orig_dst = dst;
9068 0 : rtx orig_src = NULL;
9069 0 : int piece_size = 1;
9070 0 : int copied_bytes = 0;
9071 :
9072 0 : if (!issetmem)
9073 : {
9074 0 : gcc_assert (srcp != NULL);
9075 0 : src = *srcp;
9076 0 : orig_src = src;
9077 : }
9078 :
9079 0 : for (piece_size = 1;
9080 0 : piece_size <= desired_align && copied_bytes < align_bytes;
9081 0 : piece_size <<= 1)
9082 : {
9083 0 : if (align_bytes & piece_size) /* Walk the set bits of ALIGN_BYTES from the lowest one. */
9084 : {
9085 0 : if (issetmem)
9086 : {
9087 0 : if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value))) /* Vector value only for pieces wider than the scalar VALUE. */
9088 0 : dst = emit_memset (dst, destreg, vec_value, piece_size);
9089 : else
9090 0 : dst = emit_memset (dst, destreg, value, piece_size);
9091 : }
9092 : else
9093 0 : dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
9094 0 : copied_bytes += piece_size;
9095 : }
9096 : }
9097 0 : if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT) /* Only ever raise the recorded alignment of DST. */
9098 0 : set_mem_align (dst, desired_align * BITS_PER_UNIT);
9099 0 : if (MEM_SIZE_KNOWN_P (orig_dst))
9100 0 : set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
9101 :
9102 0 : if (!issetmem)
9103 : {
9104 0 : int src_align_bytes = get_mem_align_offset (src, desired_align
9105 : * BITS_PER_UNIT);
9106 0 : if (src_align_bytes >= 0) /* A negative result is left untouched, which skips the alignment update below. */
9107 0 : src_align_bytes = desired_align - src_align_bytes;
9108 0 : if (src_align_bytes >= 0)
9109 : {
9110 : unsigned int src_align;
9111 0 : for (src_align = desired_align; src_align >= 2; src_align >>= 1) /* Largest power of two at which SRC_ALIGN_BYTES and ALIGN_BYTES agree in their low bits. */
9112 : {
9113 0 : if ((src_align_bytes & (src_align - 1))
9114 0 : == (align_bytes & (src_align - 1)))
9115 : break;
9116 : }
9117 0 : if (src_align > (unsigned int) desired_align)
9118 : src_align = desired_align;
9119 0 : if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
9120 0 : set_mem_align (src, src_align * BITS_PER_UNIT);
9121 : }
9122 0 : if (MEM_SIZE_KNOWN_P (orig_src))
9123 0 : set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
9124 0 : *srcp = src; /* Hand the updated source MEM back to the caller. */
9125 : }
9126 :
9127 0 : return dst;
9128 : }
9129 :
9130 : /* Return true if ALG can be used in current context.
9131 : Assume we expand memset if MEMSET is true. DST_AS and SRC_AS are the address spaces of the destination and the source. */
9132 : static bool
9133 834735 : alg_usable_p (enum stringop_alg alg, bool memset,
9134 : addr_space_t dst_as, addr_space_t src_as)
9135 : {
9136 834735 : if (alg == no_stringop)
9137 : return false;
9138 : /* It is not possible to use a library call if we have non-default
9139 : address space. We can do better than the generic byte-at-a-time
9140 : loop, used as a fallback. */
9141 834735 : if (alg == libcall &&
9142 468505 : !(ADDR_SPACE_GENERIC_P (dst_as) && ADDR_SPACE_GENERIC_P (src_as)))
9143 : return false;
9144 834728 : if (alg == vector_loop) /* The vector loop is usable only with SSE or AVX enabled. */
9145 367949 : return TARGET_SSE || TARGET_AVX;
9146 : /* Algorithms using the rep prefix want at least edi and ecx;
9147 : additionally, memset wants eax and memcpy wants esi. Don't
9148 : consider such algorithms if the user has appropriated those
9149 : registers for their own purposes, or if we have the destination
9150 : in the non-default address space, since string insns cannot
9151 : override the destination segment. */
9152 650722 : if (alg == rep_prefix_1_byte
9153 : || alg == rep_prefix_4_byte
9154 650722 : || alg == rep_prefix_8_byte)
9155 : {
9156 33572 : if (fixed_regs[CX_REG]
9157 33568 : || fixed_regs[DI_REG]
9158 33564 : || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG])
9159 33560 : || !ADDR_SPACE_GENERIC_P (dst_as)
9160 67132 : || !(ADDR_SPACE_GENERIC_P (src_as) || Pmode == word_mode)) /* A non-generic source space is accepted only when Pmode equals word_mode. */
9161 12 : return false;
9162 : }
9163 : return true;
9164 : }
9165 :
9166 : /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. MIN_SIZE and MAX_SIZE bound the block size; MEMSET and ZERO_MEMSET describe the operation; DST_AS and SRC_AS are the address spaces involved. *DYNAMIC_CHECK is set to -1 unless a run-time size check is requested, and *NOALIGN is set to false unless the chosen strategy asks for no alignment prologue. RECUR is true only for the nested call this function makes on itself. */
9167 : static enum stringop_alg
9168 165316 : decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
9169 : unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
9170 : bool memset, bool zero_memset, addr_space_t dst_as,
9171 : addr_space_t src_as, int *dynamic_check, bool *noalign, bool recur)
9172 : {
9173 165316 : const struct stringop_algs *algs;
9174 165316 : bool optimize_for_speed;
9175 165316 : int max = 0;
9176 165316 : const struct processor_costs *cost;
9177 165316 : int i;
9178 165316 : bool any_alg_usable_p = false;
9179 :
9180 165316 : *noalign = false; /* Defaults for the two output parameters. */
9181 165316 : *dynamic_check = -1;
9182 :
9183 : /* Even if the string operation call is cold, we still might spend a lot
9184 : of time processing large blocks. */
9185 165316 : if (optimize_function_for_size_p (cfun)
9186 165316 : || (optimize_insn_for_size_p ()
9187 9925 : && (max_size < 256
9188 3703 : || (expected_size != -1 && expected_size < 256))))
9189 : optimize_for_speed = false;
9190 : else
9191 148598 : optimize_for_speed = true;
9192 :
9193 148598 : cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
9194 165316 : if (memset)
9195 49023 : algs = &cost->memset[TARGET_64BIT != 0];
9196 : else
9197 125193 : algs = &cost->memcpy[TARGET_64BIT != 0];
9198 :
9199 : /* See maximal size for user defined algorithm. MAX ends up as the bound of the last usable non-libcall table entry. */
9200 826580 : for (i = 0; i < MAX_STRINGOP_ALGS; i++)
9201 : {
9202 661264 : enum stringop_alg candidate = algs->size[i].alg;
9203 661264 : bool usable = alg_usable_p (candidate, memset, dst_as, src_as);
9204 661264 : any_alg_usable_p |= usable;
9205 :
9206 661264 : if (candidate != libcall && candidate && usable)
9207 313847 : max = algs->size[i].max;
9208 : }
9209 :
9210 : /* If expected size is not known but max size is small enough
9211 : so inline version is a win, set expected size into
9212 : the range. */
9213 165316 : if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
9214 35845 : && expected_size == -1)
9215 18415 : expected_size = min_size / 2 + max_size / 2; /* Midpoint of the known size range. */
9216 :
9217 : /* If user specified the algorithm, honor it if possible. */
9218 165316 : if (ix86_stringop_alg != no_stringop
9219 165316 : && alg_usable_p (ix86_stringop_alg, memset, dst_as, src_as))
9220 : return ix86_stringop_alg;
9221 : /* rep; movq or rep; movl is the smallest variant. */
9222 165204 : else if (!optimize_for_speed)
9223 : {
9224 16635 : *noalign = true;
9225 16635 : if (!count || (count & 3) || (memset && !zero_memset))
9226 5914 : return alg_usable_p (rep_prefix_1_byte, memset, dst_as, src_as)
9227 5914 : ? rep_prefix_1_byte : loop_1_byte;
9228 : else
9229 10721 : return alg_usable_p (rep_prefix_4_byte, memset, dst_as, src_as)
9230 10721 : ? rep_prefix_4_byte : loop;
9231 : }
9232 : /* Very tiny blocks are best handled via the loop, REP is expensive to
9233 : setup. */
9234 148569 : else if (expected_size != -1 && expected_size < 4)
9235 : return loop_1_byte;
9236 145648 : else if (expected_size != -1)
9237 : {
9238 : enum stringop_alg alg = libcall;
9239 : bool alg_noalign = false;
9240 182143 : for (i = 0; i < MAX_STRINGOP_ALGS; i++)
9241 : {
9242 : /* We get here if the algorithms that were not libcall-based
9243 : were rep-prefix based and we are unable to use rep prefixes
9244 : based on global register usage. Break out of the loop and
9245 : use the heuristic below. */
9246 179210 : if (algs->size[i].max == 0)
9247 : break;
9248 179210 : if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
9249 : {
9250 75278 : enum stringop_alg candidate = algs->size[i].alg;
9251 :
9252 75278 : if (candidate != libcall
9253 75278 : && alg_usable_p (candidate, memset, dst_as, src_as))
9254 : {
9255 20356 : alg = candidate;
9256 20356 : alg_noalign = algs->size[i].noalign;
9257 : }
9258 : /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
9259 : last non-libcall inline algorithm. */
9260 75278 : if (TARGET_INLINE_ALL_STRINGOPS)
9261 : {
9262 : /* When the current size is best to be copied by a libcall,
9263 : but we are still forced to inline, run the heuristic below
9264 : that will pick code for medium sized blocks. */
9265 10982 : if (alg != libcall)
9266 : {
9267 5107 : *noalign = alg_noalign;
9268 5107 : return alg;
9269 : }
9270 5875 : else if (!any_alg_usable_p)
9271 : break;
9272 : }
9273 64296 : else if (alg_usable_p (candidate, memset, dst_as, src_as)
9274 64296 : && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
9275 22 : && candidate == rep_prefix_1_byte
9276 : /* NB: If min_size != max_size, size is
9277 : unknown. */
9278 22 : && min_size != max_size))
9279 : {
9280 64277 : *noalign = algs->size[i].noalign;
9281 64277 : return candidate;
9282 : }
9283 : }
9284 : }
9285 : }
9286 : /* When asked to inline the call anyway, try to pick meaningful choice.
9287 : We look for maximal size of block that is faster to copy by hand and
9288 : take blocks of at most of that size guessing that average size will
9289 : be roughly half of the block.
9290 :
9291 : If this turns out to be bad, we might simply specify the preferred
9292 : choice in ix86_costs. */
9293 72055 : if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
9294 76270 : && (algs->unknown_size == libcall
9295 0 : || !alg_usable_p (algs->unknown_size, memset, dst_as, src_as)))
9296 : {
9297 4215 : enum stringop_alg alg;
9298 4215 : HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2; /* Half of MAX, or 2048 when MAX is not positive. */
9299 :
9300 : /* If there aren't any usable algorithms or if recursing already,
9301 : then recursing on smaller sizes or same size isn't going to
9302 : find anything. Just return the simple byte-at-a-time copy loop. */
9303 4215 : if (!any_alg_usable_p || recur)
9304 : {
9305 : /* Pick something reasonable. */
9306 0 : if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
9307 0 : *dynamic_check = 128;
9308 0 : return loop_1_byte;
9309 : }
9310 4215 : alg = decide_alg (count, new_expected_size, min_size, max_size,
9311 : memset, zero_memset, dst_as, src_as,
9312 : dynamic_check, noalign, true); /* RECUR is true here, so the nested call returns above instead of recursing again. */
9313 4215 : gcc_assert (*dynamic_check == -1);
9314 4215 : if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
9315 8 : *dynamic_check = max;
9316 : else
9317 4207 : gcc_assert (alg != libcall);
9318 4215 : return alg;
9319 : }
9320 :
9321 : /* Try to use some reasonable fallback algorithm. Note that for
9322 : non-default address spaces we default to a loop instead of
9323 : a libcall. */
9324 :
9325 72049 : bool have_as = !(ADDR_SPACE_GENERIC_P (dst_as)
9326 : && ADDR_SPACE_GENERIC_P (src_as));
9327 :
9328 72049 : return (alg_usable_p (algs->unknown_size, memset, dst_as, src_as)
9329 72049 : ? algs->unknown_size : have_as ? loop : libcall);
9330 : }
9331 :
9332 : /* Decide on alignment. We know that the operand is already aligned to ALIGN
9333 : (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). Returns 0 for a libcall or a VOIDmode MOVE_MODE; otherwise starts from the size of MOVE_MODE and never returns less than ALIGN. */
9334 : static int
9335 34664 : decide_alignment (int align,
9336 : enum stringop_alg alg,
9337 : int expected_size,
9338 : machine_mode move_mode)
9339 : {
9340 34664 : int desired_align = 0;
9341 :
9342 34664 : gcc_assert (alg != no_stringop);
9343 :
9344 34664 : if (alg == libcall)
9345 : return 0;
9346 34664 : if (move_mode == VOIDmode)
9347 : return 0;
9348 :
9349 34664 : desired_align = GET_MODE_SIZE (move_mode);
9350 : /* PentiumPro has special logic triggering for 8 byte aligned blocks,
9351 : copying whole cacheline at once. */
9352 34664 : if (TARGET_CPU_P (PENTIUMPRO)
9353 0 : && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
9354 34664 : desired_align = 8;
9355 :
9356 34664 : if (optimize_size) /* When optimizing for size, ask for no extra alignment beyond ALIGN. */
9357 9807 : desired_align = 1;
9358 34664 : if (desired_align < align)
9359 : desired_align = align;
9360 34664 : if (expected_size != -1 && expected_size < 4) /* Blocks expected to be under 4 bytes keep the existing alignment. */
9361 0 : desired_align = align;
9362 :
9363 : return desired_align;
9364 : }
9365 :
9366 :
9367 : /* Helper for promote_duplicated_reg_to_size. For QImode value 0xXY produce
9368 : 0xXYXYXYXY of the width specified by MODE. This is essentially
9369 : a * 0x01010101, but we can do slightly better than
9370 : synth_mult by unwinding the sequence by hand on CPUs with
9371 : slow multiply. Integer vector modes are handled by ix86_expand_vector_init_duplicate; otherwise MODE must be SImode or DImode unless VAL is const0_rtx. */
9372 : static rtx
9373 16550 : promote_duplicated_reg (machine_mode mode, rtx val)
9374 : {
9375 16550 : if (val == const0_rtx) /* Zero needs no duplication: load the zero constant of MODE into a register. */
9376 14958 : return copy_to_mode_reg (mode, CONST0_RTX (mode));
9377 :
9378 1592 : machine_mode valmode = GET_MODE (val);
9379 1592 : if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
9380 : {
9381 : /* Duplicate the scalar value for integer vector. */
9382 1271 : gcc_assert ((val == const0_rtx || val == constm1_rtx)
9383 : || GET_MODE_INNER (mode) == valmode);
9384 647 : rtx dup = gen_reg_rtx (mode);
9385 647 : bool ok = ix86_expand_vector_init_duplicate (false, mode, dup,
9386 : val);
9387 647 : gcc_assert (ok);
9388 : return dup;
9389 : }
9390 :
9391 945 : rtx tmp;
9392 945 : int nops = mode == DImode ? 3 : 2; /* Number of shift/or steps counted in the cost comparison below. */
9393 :
9394 40 : gcc_assert (mode == SImode || mode == DImode);
9395 945 : if (CONST_INT_P (val))
9396 : {
9397 656 : HOST_WIDE_INT v = INTVAL (val) & 255; /* Only the low byte of a constant is replicated. */
9398 :
9399 656 : v |= v << 8;
9400 656 : v |= v << 16;
9401 656 : if (mode == DImode)
9402 628 : v |= (v << 16) << 16;
9403 656 : return copy_to_mode_reg (mode, gen_int_mode (v, mode));
9404 : }
9405 :
9406 289 : if (valmode == VOIDmode)
9407 : valmode = QImode;
9408 289 : if (valmode != QImode)
9409 0 : val = gen_lowpart (QImode, val);
9410 289 : if (mode == QImode)
9411 : return val;
9412 289 : if (!TARGET_PARTIAL_REG_STALL)
9413 289 : nops--;
9414 289 : if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
9415 289 : + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
9416 289 : <= (ix86_cost->shift_const + ix86_cost->add) * nops
9417 289 : + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0))) /* Multiply wins when its modelled cost does not exceed the shift/or sequence. */
9418 : {
9419 289 : rtx reg = convert_modes (mode, QImode, val, true);
9420 289 : tmp = promote_duplicated_reg (mode, const1_rtx); /* The multiplier: the byte 0x01 replicated across MODE. */
9421 289 : return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
9422 289 : OPTAB_DIRECT);
9423 : }
9424 : else
9425 : {
9426 0 : rtx reg = convert_modes (mode, QImode, val, true);
9427 :
9428 0 : if (!TARGET_PARTIAL_REG_STALL)
9429 0 : emit_insn (gen_insv_1 (mode, reg, reg));
9430 : else
9431 : {
9432 0 : tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
9433 : NULL, 1, OPTAB_DIRECT);
9434 0 : reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
9435 : OPTAB_DIRECT);
9436 : }
9437 0 : tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
9438 : NULL, 1, OPTAB_DIRECT);
9439 0 : reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
9440 0 : if (mode == SImode)
9441 : return reg;
9442 0 : tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
9443 : NULL, 1, OPTAB_DIRECT);
9444 0 : reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
9445 0 : return reg;
9446 : }
9447 : }
9448 :
9449 : /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
9450 : be needed by main loop copying SIZE_NEEDED chunks and prologue getting
9451 : alignment from ALIGN to DESIRED_ALIGN. The widest case is DImode and is chosen only with TARGET_64BIT; VAL is returned unchanged when no promotion is needed. */
9452 : static rtx
9453 12327 : promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
9454 : int align)
9455 : {
9456 12327 : rtx promoted_val;
9457 :
9458 12327 : if (TARGET_64BIT
9459 10850 : && (size_needed > 4 || (desired_align > align && desired_align > 4)))
9460 4379 : promoted_val = promote_duplicated_reg (DImode, val);
9461 7948 : else if (size_needed > 2 || (desired_align > align && desired_align > 2))
9462 6137 : promoted_val = promote_duplicated_reg (SImode, val);
9463 1811 : else if (size_needed > 1 || (desired_align > align && desired_align > 1))
9464 0 : promoted_val = promote_duplicated_reg (HImode, val); /* NOTE(review): promote_duplicated_reg asserts MODE is SImode or DImode once VAL is neither const0_rtx nor headed for a vector mode, so this HImode call looks able to trip that assertion for a non-zero VAL -- confirm whether it is reachable. */
9465 : else
9466 : promoted_val = val;
9467 :
9468 12327 : return promoted_val;
9469 : }
9470 :
9471 : /* Copy the address to a Pmode register. This is used for x32 to
9472 : truncate DImode TLS address to a SImode register. */
9473 :
9474 : static rtx
9475 67637 : ix86_copy_addr_to_reg (rtx addr)
9476 : {
9477 67637 : rtx reg;
9478 72212 : if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
9479 : {
9480 67637 : reg = copy_addr_to_reg (addr);
9481 67637 : REG_POINTER (reg) = 1;
9482 67637 : return reg;
9483 : }
9484 : else
9485 : {
9486 0 : gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
9487 0 : reg = copy_to_mode_reg (DImode, addr);
9488 0 : REG_POINTER (reg) = 1;
9489 0 : return gen_rtx_SUBREG (SImode, reg, 0);
9490 : }
9491 : }
9492 :
9493 : /* Expand string move (memcpy) or store (memset) operation. Use i386 string
9494 : operations when profitable. The code depends upon architecture, block size
9495 : and alignment, but always has one of the following overall structures:
9496 :
9497 : Aligned move sequence:
9498 :
9499 : 1) Prologue guard: Conditional that jumps up to epilogues for small
9500 : blocks that can be handled by epilogue alone. This is faster
9501 : but also needed for correctness, since prologue assume the block
9502 : is larger than the desired alignment.
9503 :
9504 : Optional dynamic check for size and libcall for large
9505 : blocks is emitted here too, with -minline-stringops-dynamically.
9506 :
9507 : 2) Prologue: copy first few bytes in order to get destination
9508 : aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
9509 : than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
9510 : copied. We emit either a jump tree on power of two sized
9511 : blocks, or a byte loop.
9512 :
9513 : 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
9514 : with specified algorithm.
9515 :
9516 : 4) Epilogue: code copying tail of the block that is too small to be
9517 : handled by main body (or up to size guarded by prologue guard).
9518 :
9519 : Misaligned move sequence
9520 :
9521 : 1) Misaligned move prologue/epilogue containing:
9522 : a) Prologue handling small memory blocks and jumping to done_label
9523 : (skipped if blocks are known to be large enough)
9524 : b) Single move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
9525 : needed by single possibly misaligned move
9526 : (skipped if alignment is not needed)
9527 : c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
9528 :
9529 : 2) Zero size guard dispatching to done_label, if needed
9530 :
9531 : 3) Dispatch to library call, if needed,
9532 :
9533 : 4) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
9534 : with specified algorithm. */
       /* DST is the destination MEM and SRC the source MEM; SRC is only used
          when ISSETMEM is false.  COUNT_EXP is the block size and VAL_EXP the
          value to store when ISSETMEM is true.  ALIGN_EXP and
          EXPECTED_ALIGN_EXP are consulted only when they are CONST_INTs.
          EXPECTED_SIZE_EXP, MIN_SIZE_EXP, MAX_SIZE_EXP and
          PROBABLE_MAX_SIZE_EXP are read only when COUNT_EXP is not a
          CONST_INT; the last three may be null.

          Return true when the operation has been expanded here (including the
          case of a constant zero COUNT_EXP, where nothing is needed).  Return
          false when a constant count exceeds 1 << 30 or when decide_alg
          selects libcall.  */
9535 : bool
9536 147811 : ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
9537 : rtx align_exp, rtx expected_align_exp,
9538 : rtx expected_size_exp, rtx min_size_exp,
9539 : rtx max_size_exp, rtx probable_max_size_exp,
9540 : bool issetmem)
9541 : {
9542 147811 : rtx destreg;
9543 147811 : rtx srcreg = NULL;
9544 147811 : rtx_code_label *label = NULL;
9545 147811 : rtx tmp;
9546 147811 : rtx_code_label *jump_around_label = NULL;
9547 147811 : HOST_WIDE_INT align = 1;
9548 147811 : unsigned HOST_WIDE_INT count = 0;
9549 147811 : HOST_WIDE_INT expected_size = -1;
9550 147811 : int size_needed = 0, epilogue_size_needed;
9551 147811 : int desired_align = 0, align_bytes = 0;
9552 147811 : enum stringop_alg alg;
9553 147811 : rtx promoted_val = NULL;
9554 147811 : rtx vec_promoted_val = NULL;
9555 147811 : bool force_loopy_epilogue = false;
9556 147811 : int dynamic_check;
9557 147811 : bool need_zero_guard = false;
9558 147811 : bool noalign;
9559 147811 : machine_mode move_mode = VOIDmode;
9560 147811 : int unroll_factor = 1;
9561 : /* TODO: Once value ranges are available, fill in proper data. */
9562 147811 : unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
9563 147811 : unsigned HOST_WIDE_INT max_size = HOST_WIDE_INT_M1U;
9564 147811 : unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;
9565 147811 : bool misaligned_prologue_used = false;
9566 147811 : addr_space_t dst_as, src_as = ADDR_SPACE_GENERIC;
9567 :
9568 147811 : if (CONST_INT_P (align_exp))
9569 147811 : align = INTVAL (align_exp);
9570 : /* i386 can do misaligned access on reasonably increased cost. */
9571 147811 : if (CONST_INT_P (expected_align_exp)
9572 147811 : && INTVAL (expected_align_exp) > align)
9573 : align = INTVAL (expected_align_exp);
9574 : /* ALIGN is the minimum of destination and source alignment, but we care here
9575 : just about destination alignment. */
9576 140739 : else if (!issetmem
9577 237060 : && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
9578 3269 : align = MEM_ALIGN (dst) / BITS_PER_UNIT;
9579 :
9580 147811 : if (CONST_INT_P (count_exp))
9581 : {
9582 67525 : min_size = max_size = probable_max_size = count = expected_size
9583 67525 : = INTVAL (count_exp);
9584 : /* When COUNT is 0, there is nothing to do. */
9585 67525 : if (!count)
9586 : return true;
9587 : }
9588 : else
9589 : {
9590 80286 : if (min_size_exp)
9591 80286 : min_size = INTVAL (min_size_exp);
9592 80286 : if (max_size_exp)
9593 70035 : max_size = INTVAL (max_size_exp);
9594 80286 : if (probable_max_size_exp)
9595 72071 : probable_max_size = INTVAL (probable_max_size_exp);
9596 80286 : if (CONST_INT_P (expected_size_exp))
9597 80286 : expected_size = INTVAL (expected_size_exp);
9598 : }
9599 :
9600 : /* Make sure we don't need to care about overflow later on. */
9601 147809 : if (count > (HOST_WIDE_INT_1U << 30))
9602 : return false;
9603 :
9604 147635 : dst_as = MEM_ADDR_SPACE (dst);
9605 147635 : if (!issetmem)
9606 103279 : src_as = MEM_ADDR_SPACE (src);
9607 :
9608 : /* Step 0: Decide on preferred algorithm, desired alignment and
9609 : size of chunks to be copied by main loop. */
9610 147635 : alg = decide_alg (count, expected_size, min_size, probable_max_size,
9611 44356 : issetmem, issetmem && val_exp == const0_rtx,
9612 : dst_as, src_as, &dynamic_check, &noalign, false);
9613 :
9614 147635 : if (dump_file)
9615 7 : fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
9616 7 : stringop_alg_names[alg]);
9617 :
9618 147635 : if (alg == libcall)
9619 : return false;
9620 34664 : gcc_assert (alg != no_stringop);
9621 :
       /* COUNT is zero here only when COUNT_EXP is not a CONST_INT (a constant
          zero count returned above), so the run-time count is copied with
          copy_to_mode_reg.  */
9622 34664 : if (!count)
9623 16271 : count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
9624 34664 : destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
9625 34664 : if (!issetmem)
9626 22337 : srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
9627 :
9628 34664 : bool aligned_dstmem = false;
9629 34664 : unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
9630 34664 : bool single_insn_p = count && count <= nunits;
9631 34664 : if (single_insn_p)
9632 : {
9633 : /* If it can be done with a single instruction, use vector
9634 : instruction and don't align destination. */
9635 6 : alg = vector_loop;
9636 6 : noalign = true;
9637 6 : dynamic_check = -1;
9638 : }
9639 :
       /* Choose the mode of one move (MOVE_MODE) and the unroll factor for
          ALG; the loop-based algorithms also request a zero-size guard.  */
9640 34664 : unroll_factor = 1;
9641 34664 : move_mode = word_mode;
9642 34664 : switch (alg)
9643 : {
9644 0 : case libcall:
9645 0 : case no_stringop:
9646 0 : case last_alg:
9647 0 : gcc_unreachable ();
9648 1703 : case loop_1_byte:
9649 1703 : need_zero_guard = true;
9650 1703 : move_mode = QImode;
9651 1703 : break;
9652 51 : case loop:
9653 51 : need_zero_guard = true;
9654 51 : break;
9655 20 : case unrolled_loop:
9656 20 : need_zero_guard = true;
9657 20 : unroll_factor = (TARGET_64BIT ? 4 : 2);
9658 : break;
9659 16230 : case vector_loop:
9660 16230 : need_zero_guard = true;
9661 16230 : unroll_factor = 4;
9662 : /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes. */
9663 16230 : nunits /= GET_MODE_SIZE (word_mode);
9664 16230 : if (nunits > 1)
9665 : {
9666 16226 : move_mode = mode_for_vector (word_mode, nunits).require ();
9667 16226 : gcc_assert (optab_handler (mov_optab, move_mode)
9668 : != CODE_FOR_nothing);
9669 : }
9670 : break;
9671 25 : case rep_prefix_8_byte:
9672 25 : move_mode = DImode;
9673 25 : break;
9674 10716 : case rep_prefix_4_byte:
9675 10716 : move_mode = SImode;
9676 10716 : break;
9677 5919 : case rep_prefix_1_byte:
9678 5919 : move_mode = QImode;
9679 5919 : break;
9680 : }
9681 34664 : size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
9682 34664 : epilogue_size_needed = size_needed;
9683 :
9684 : /* If we are going to call any library calls conditionally, make sure any
9685 : pending stack adjustment happen before the first conditional branch,
9686 : otherwise they will be emitted before the library call only and won't
9687 : happen from the other branches. */
9688 34664 : if (dynamic_check != -1)
9689 7 : do_pending_stack_adjust ();
9690 :
9691 34664 : desired_align = decide_alignment (align, alg, expected_size, move_mode);
       /* When no alignment is wanted, treat the destination as already having
          the desired alignment so that every `desired_align > align' test
          below is false.  */
9692 34664 : if (!TARGET_ALIGN_STRINGOPS || noalign)
9693 32874 : align = desired_align;
9694 :
9695 : /* Step 1: Prologue guard. */
9696 :
9697 : /* Alignment code needs count to be in register. */
9698 34664 : if (CONST_INT_P (count_exp) && desired_align > align)
9699 : {
9700 20 : if (INTVAL (count_exp) > desired_align
9701 20 : && INTVAL (count_exp) > size_needed)
9702 : {
9703 20 : align_bytes
9704 20 : = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
9705 20 : if (align_bytes <= 0)
9706 : align_bytes = 0;
9707 : else
9708 0 : align_bytes = desired_align - align_bytes;
9709 : }
9710 0 : if (align_bytes == 0)
9711 40 : count_exp = force_reg (counter_mode (count_exp), count_exp);
9712 : }
9713 34664 : gcc_assert (desired_align >= 1 && align >= 1);
9714 :
9715 34664 : if (!single_insn_p)
9716 : {
9717 : /* Misaligned move sequences handle both prologue and epilogue
9718 : at once. Default code generation results in a smaller code
9719 : for large alignments and also avoids redundant job when sizes
9720 : are known precisely. */
9721 34658 : misaligned_prologue_used
9722 69316 : = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
9723 34652 : && MAX (desired_align, epilogue_size_needed) <= 32
9724 18101 : && desired_align <= epilogue_size_needed
9725 40802 : && ((desired_align > align && !align_bytes)
9726 6123 : || (!count && epilogue_size_needed > 1)));
9727 :
9728 : /* Destination is aligned after the misaligned prologue. */
9729 34658 : aligned_dstmem = misaligned_prologue_used;
9730 :
9731 34658 : if (noalign && !misaligned_prologue_used)
9732 : {
9733 : /* Also use misaligned prologue if alignment isn't needed and
9734 : destination isn't aligned. Since alignment isn't needed,
9735 : the destination after prologue won't be aligned. */
9736 32868 : aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
9737 32868 : <= MEM_ALIGN (dst));
9738 32868 : if (!aligned_dstmem)
9739 10525 : misaligned_prologue_used = true;
9740 : }
9741 : }
9742 :
9743 : /* Do the cheap promotion to allow better CSE across the
9744 : main loop and epilogue (i.e. one load of the big constant in
9745 : front of all code).
9746 : For now the misaligned move sequences do not have fast path
9747 : without broadcasting. */
9748 34664 : if (issetmem
9749 12327 : && (alg == vector_loop
9750 6582 : || CONST_INT_P (val_exp)
9751 48 : || misaligned_prologue_used))
9752 : {
9753 6534 : if (alg == vector_loop)
9754 : {
9755 5745 : promoted_val = promote_duplicated_reg_to_size (val_exp,
9756 11490 : GET_MODE_SIZE (word_mode),
9757 : desired_align, align);
9758 : /* Duplicate the promoted scalar value if not 0 nor -1. */
9759 5745 : vec_promoted_val
9760 5745 : = promote_duplicated_reg (move_mode,
9761 5745 : (val_exp == const0_rtx
9762 647 : || val_exp == constm1_rtx)
9763 : ? val_exp : promoted_val);
9764 : }
9765 : else
9766 : {
9767 6534 : promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
9768 : desired_align, align);
9769 : }
9770 : }
9771 : /* Misaligned move sequences handle both prologues and epilogues at once.
9772 : Default code generation results in smaller code for large alignments and
9773 : also avoids redundant job when sizes are known precisely. */
9774 34616 : if (misaligned_prologue_used)
9775 : {
9776 : /* Misaligned move prologue handled small blocks by itself. */
9777 10546 : expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
9778 10546 : (dst, src, &destreg, &srcreg,
9779 : move_mode, promoted_val, vec_promoted_val,
9780 : &count_exp,
9781 : &jump_around_label,
9782 10546 : desired_align < align
9783 0 : ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
9784 : desired_align, align, &min_size, dynamic_check, issetmem);
9785 10546 : if (!issetmem)
9786 7946 : src = change_address (src, BLKmode, srcreg);
9787 10546 : dst = change_address (dst, BLKmode, destreg);
9788 10546 : if (aligned_dstmem)
9789 21 : set_mem_align (dst, desired_align * BITS_PER_UNIT);
9790 10546 : epilogue_size_needed = 0;
9791 10546 : if (need_zero_guard
9792 10268 : && min_size < (unsigned HOST_WIDE_INT) size_needed)
9793 : {
9794 : /* It is possible that we copied enough so the main loop will not
9795 : execute. */
9796 7245 : gcc_assert (size_needed > 1);
9797 7245 : if (jump_around_label == NULL_RTX)
9798 50 : jump_around_label = gen_label_rtx ();
9799 14490 : emit_cmp_and_jump_insns (count_exp,
9800 : GEN_INT (size_needed),
9801 : LTU, 0, counter_mode (count_exp), 1, jump_around_label);
9802 7245 : if (expected_size == -1
9803 56 : || expected_size < (desired_align - align) / 2 + size_needed)
9804 7190 : predict_jump (REG_BR_PROB_BASE * 20 / 100);
9805 : else
9806 55 : predict_jump (REG_BR_PROB_BASE * 60 / 100);
9807 : }
9808 : }
9809 : /* Ensure that alignment prologue won't copy past end of block. */
9810 24118 : else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
9811 : {
9812 16496 : epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
9813 : /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
9814 : Make sure it is power of 2. */
9815 16496 : epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
9816 :
9817 : /* To improve performance of small blocks, we jump around the VAL
9818 : promoting mode. This means that if the promoted VAL is not constant,
9819 : we might not use it in the epilogue and have to use byte
9820 : loop variant. */
9821 16496 : if (issetmem && epilogue_size_needed > 2 && !promoted_val)
9822 16496 : force_loopy_epilogue = true;
9823 16496 : if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
9824 16488 : || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
9825 : {
9826 : /* If main algorithm works on QImode, no epilogue is needed.
9827 : For small sizes just don't align anything. */
9828 2237 : if (size_needed == 1)
9829 0 : desired_align = align;
9830 : else
       /* The whole block is known to be smaller than
          EPILOGUE_SIZE_NEEDED: skip the prologue and main loop.  */
9831 2237 : goto epilogue;
9832 : }
9833 14259 : else if (!count
9834 256 : && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
9835 : {
9836 255 : label = gen_label_rtx ();
9837 510 : emit_cmp_and_jump_insns (count_exp,
9838 : GEN_INT (epilogue_size_needed),
9839 : LTU, 0, counter_mode (count_exp), 1, label);
9840 255 : if (expected_size == -1 || expected_size < epilogue_size_needed)
9841 255 : predict_jump (REG_BR_PROB_BASE * 60 / 100);
9842 : else
9843 0 : predict_jump (REG_BR_PROB_BASE * 20 / 100);
9844 : }
9845 : }
9846 :
9847 : /* Emit code to decide at run time whether library call or inline should be
9848 : used. */
9849 32427 : if (dynamic_check != -1)
9850 : {
9851 7 : if (!issetmem && CONST_INT_P (count_exp))
9852 : {
9853 1 : if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
9854 : {
9855 1 : emit_block_copy_via_libcall (dst, src, count_exp);
       /* A const0_rtx count disables the epilogue code at the
          `epilogue' label below.  */
9856 1 : count_exp = const0_rtx;
9857 1 : goto epilogue;
9858 : }
9859 : }
9860 : else
9861 : {
9862 6 : rtx_code_label *hot_label = gen_label_rtx ();
9863 6 : if (jump_around_label == NULL_RTX)
9864 1 : jump_around_label = gen_label_rtx ();
9865 12 : emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
9866 : LEU, 0, counter_mode (count_exp),
9867 : 1, hot_label);
9868 6 : predict_jump (REG_BR_PROB_BASE * 90 / 100);
9869 6 : if (issetmem)
9870 4 : set_storage_via_libcall (dst, count_exp, val_exp);
9871 : else
9872 2 : emit_block_copy_via_libcall (dst, src, count_exp);
9873 6 : emit_jump (jump_around_label);
9874 6 : emit_label (hot_label);
9875 : }
9876 : }
9877 :
9878 : /* Step 2: Alignment prologue. */
9879 : /* Do the expensive promotion once we branched off the small blocks. */
9880 32426 : if (issetmem && !promoted_val)
9881 48 : promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
9882 : desired_align, align);
9883 :
9884 32426 : if (desired_align > align && !misaligned_prologue_used)
9885 : {
9886 7 : if (align_bytes == 0)
9887 : {
9888 : /* Except for the first move in prologue, we no longer know
9889 : constant offset in aliasing info. It doesn't seem worth
9890 : the pain to maintain it for the first move, so throw away
9891 : the info early. */
9892 7 : dst = change_address (dst, BLKmode, destreg);
9893 7 : if (!issetmem)
9894 5 : src = change_address (src, BLKmode, srcreg);
9895 7 : dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
9896 : promoted_val, vec_promoted_val,
9897 : count_exp, align, desired_align,
9898 : issetmem);
9899 : /* At most desired_align - align bytes are copied. */
9900 7 : if (min_size < (unsigned)(desired_align - align))
9901 0 : min_size = 0;
9902 : else
9903 7 : min_size -= desired_align - align;
9904 : }
9905 : else
9906 : {
9907 : /* If we know how many bytes need to be stored before dst is
9908 : sufficiently aligned, maintain aliasing info accurately. */
9909 0 : dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
9910 : srcreg,
9911 : promoted_val,
9912 : vec_promoted_val,
9913 : desired_align,
9914 : align_bytes,
9915 : issetmem);
9916 :
9917 0 : count_exp = plus_constant (counter_mode (count_exp),
9918 0 : count_exp, -align_bytes);
9919 0 : count -= align_bytes;
9920 0 : min_size -= align_bytes;
9921 0 : max_size -= align_bytes;
9922 : }
9923 7 : if (need_zero_guard
9924 7 : && min_size < (unsigned HOST_WIDE_INT) size_needed
9925 1 : && (count < (unsigned HOST_WIDE_INT) size_needed
9926 0 : || (align_bytes == 0
9927 0 : && count < ((unsigned HOST_WIDE_INT) size_needed
9928 0 : + desired_align - align))))
9929 : {
9930 : /* It is possible that we copied enough so the main loop will not
9931 : execute. */
9932 1 : gcc_assert (size_needed > 1);
9933 1 : if (label == NULL_RTX)
9934 0 : label = gen_label_rtx ();
9935 2 : emit_cmp_and_jump_insns (count_exp,
9936 : GEN_INT (size_needed),
9937 : LTU, 0, counter_mode (count_exp), 1, label);
9938 1 : if (expected_size == -1
9939 0 : || expected_size < (desired_align - align) / 2 + size_needed)
9940 1 : predict_jump (REG_BR_PROB_BASE * 20 / 100);
9941 : else
9942 0 : predict_jump (REG_BR_PROB_BASE * 60 / 100);
9943 : }
9944 : }
9945 32426 : if (label && size_needed == 1)
9946 : {
9947 0 : emit_label (label);
9948 0 : LABEL_NUSES (label) = 1;
9949 0 : label = NULL;
9950 0 : epilogue_size_needed = 1;
9951 0 : if (issetmem)
9952 0 : promoted_val = val_exp;
9953 : }
9954 32426 : else if (label == NULL_RTX && !misaligned_prologue_used)
9955 21626 : epilogue_size_needed = size_needed;
9956 :
9957 : /* Step 3: Main loop. */
9958 :
9959 32426 : switch (alg)
9960 : {
9961 0 : case libcall:
9962 0 : case no_stringop:
9963 0 : case last_alg:
9964 0 : gcc_unreachable ();
9965 1774 : case loop_1_byte:
9966 1774 : case loop:
9967 1774 : case unrolled_loop:
9968 1774 : expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
9969 : count_exp, move_mode, unroll_factor,
9970 : expected_size, issetmem);
9971 1774 : break;
9972 13992 : case vector_loop:
9973 13992 : expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
9974 : vec_promoted_val, count_exp, move_mode,
9975 : unroll_factor, expected_size, issetmem);
9976 13992 : break;
9977 16660 : case rep_prefix_8_byte:
9978 16660 : case rep_prefix_4_byte:
9979 16660 : case rep_prefix_1_byte:
9980 16660 : expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
9981 : val_exp, count_exp, move_mode, issetmem);
9982 16660 : break;
9983 : }
9984 : /* Adjust properly the offset of src and dest memory for aliasing. */
9985 32426 : if (CONST_INT_P (count_exp))
9986 : {
9987 18364 : if (!issetmem)
9988 8556 : src = adjust_automodify_address_nv (src, BLKmode, srcreg,
9989 : (count / size_needed) * size_needed);
9990 18364 : dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
9991 : (count / size_needed) * size_needed);
9992 : }
9993 : else
9994 : {
9995 14062 : if (!issetmem)
9996 11819 : src = change_address (src, BLKmode, srcreg);
9997 14062 : dst = change_address (dst, BLKmode, destreg);
9998 : }
9999 :
10000 : /* Step 4: Epilogue to copy the remaining bytes. */
10001 34664 : epilogue:
10002 34664 : if (label)
10003 : {
10004 : /* When the main loop is done, COUNT_EXP might hold original count,
10005 : while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
10006 : Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
10007 : bytes. Compensate if needed. */
10008 :
10009 255 : if (size_needed < epilogue_size_needed)
10010 : {
10011 0 : tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
10012 0 : GEN_INT (size_needed - 1), count_exp, 1,
10013 : OPTAB_DIRECT);
10014 0 : if (tmp != count_exp)
10015 0 : emit_move_insn (count_exp, tmp);
10016 : }
10017 255 : emit_label (label);
10018 255 : LABEL_NUSES (label) = 1;
10019 : }
10020 :
10021 34664 : if (count_exp != const0_rtx && epilogue_size_needed > 1)
10022 : {
10023 16496 : if (force_loopy_epilogue)
10024 0 : expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
10025 : epilogue_size_needed);
10026 : else
10027 : {
10028 16496 : if (issetmem)
10029 7916 : expand_setmem_epilogue (dst, destreg, promoted_val,
10030 : vec_promoted_val, count_exp,
10031 : epilogue_size_needed);
10032 : else
10033 8580 : expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
10034 : epilogue_size_needed);
10035 : }
10036 : }
10037 34664 : if (jump_around_label)
10038 7248 : emit_label (jump_around_label);
10039 : return true;
10040 : }
10041 :
10042 : /* Fully unroll memmove of known size with up to 8 registers. */
10043 :
10044 : static bool
10045 1873 : ix86_expand_unroll_movmem (rtx dst, rtx src, rtx destreg, rtx srcreg,
10046 : unsigned HOST_WIDE_INT count,
10047 : machine_mode mode)
10048 : {
10049 : /* If 8 registers registers can cover all memory, load them into
10050 : registers and store them together to avoid possible address
10051 : overlap between source and destination. */
10052 1873 : unsigned HOST_WIDE_INT moves = count / GET_MODE_SIZE (mode);
10053 1873 : if (moves == 0)
10054 : {
10055 0 : mode = smallest_int_mode_for_size
10056 0 : (count * BITS_PER_UNIT).require ();
10057 0 : if (count == GET_MODE_SIZE (mode))
10058 : moves = 1;
10059 : else
10060 : {
10061 : /* Reduce the smallest move size by half so that MOVES == 1. */
10062 0 : mode = smallest_int_mode_for_size
10063 0 : (GET_MODE_BITSIZE (mode) / 2).require ();
10064 0 : moves = count / GET_MODE_SIZE (mode);
10065 0 : gcc_assert (moves == 1);
10066 : }
10067 : }
10068 1873 : else if (moves > 8)
10069 : return false;
10070 :
10071 1864 : unsigned int i;
10072 1864 : rtx tmp[9];
10073 :
10074 4296 : for (i = 0; i < moves; i++)
10075 2432 : tmp[i] = gen_reg_rtx (mode);
10076 :
10077 1864 : rtx srcmem = change_address (src, mode, srcreg);
10078 6160 : for (i = 0; i < moves; i++)
10079 : {
10080 2432 : emit_move_insn (tmp[i], srcmem);
10081 4864 : srcmem = offset_address (srcmem,
10082 2432 : GEN_INT (GET_MODE_SIZE (mode)),
10083 2432 : GET_MODE_SIZE (mode));
10084 : }
10085 :
10086 1864 : unsigned int epilogue_size = count & (GET_MODE_SIZE (mode) - 1);
10087 1864 : machine_mode epilogue_mode = VOIDmode;
10088 1864 : if (epilogue_size)
10089 : {
10090 : /* Handle the remaining bytes with overlapping move. */
10091 1691 : epilogue_mode = smallest_int_mode_for_size
10092 1691 : (epilogue_size * BITS_PER_UNIT).require ();
10093 1691 : tmp[8] = gen_reg_rtx (epilogue_mode);
10094 1691 : srcmem = adjust_address (srcmem, epilogue_mode, 0);
10095 1691 : srcmem = offset_address (srcmem, GEN_INT (epilogue_size), 1);
10096 3382 : srcmem = offset_address (srcmem,
10097 1691 : GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
10098 1691 : GET_MODE_SIZE (epilogue_mode));
10099 1691 : emit_move_insn (tmp[8], srcmem);
10100 : }
10101 :
10102 1864 : rtx destmem = change_address (dst, mode, destreg);
10103 6160 : for (i = 0; i < moves; i++)
10104 : {
10105 2432 : emit_move_insn (destmem, tmp[i]);
10106 4864 : destmem = offset_address (destmem,
10107 2432 : GEN_INT (GET_MODE_SIZE (mode)),
10108 2432 : GET_MODE_SIZE (mode));
10109 : }
10110 :
10111 1864 : if (epilogue_size)
10112 : {
10113 : /* Use overlapping move. */
10114 1691 : destmem = adjust_address (destmem, epilogue_mode, 0);
10115 1691 : destmem = offset_address (destmem, GEN_INT (epilogue_size), 1);
10116 3382 : destmem = offset_address (destmem,
10117 1691 : GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
10118 1691 : GET_MODE_SIZE (epilogue_mode));
10119 1691 : emit_move_insn (destmem, tmp[8]);
10120 : }
10121 :
10122 : return true;
10123 : }
10124 :
10125 : /* Expand memmove of size with MOVES * mode size and MOVES <= 4. If
10126 : FORWARD is true, copy forward. Otherwise copy backward. */
10127 :
10128 : static void
10129 2298 : ix86_expand_n_move_movmem (rtx destmem, rtx srcmem, machine_mode mode,
10130 : unsigned int moves, bool forward)
10131 : {
10132 2298 : gcc_assert (moves <= 4);
10133 :
10134 : unsigned int i;
10135 : rtx tmp[8];
10136 :
10137 11490 : for (i = 0; i < moves; i++)
10138 9192 : tmp[i] = gen_reg_rtx (mode);
10139 :
10140 2298 : rtx step;
10141 2298 : if (forward)
10142 2298 : step = GEN_INT (GET_MODE_SIZE (mode));
10143 : else
10144 2298 : step = GEN_INT (-GET_MODE_SIZE (mode));
10145 :
10146 : /* Load MOVES. */
10147 9192 : for (i = 0; i < moves - 1; i++)
10148 : {
10149 6894 : emit_move_insn (tmp[i], srcmem);
10150 13788 : srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
10151 : }
10152 2298 : emit_move_insn (tmp[i], srcmem);
10153 :
10154 : /* Store MOVES. */
10155 11490 : for (i = 0; i < moves - 1; i++)
10156 : {
10157 6894 : emit_move_insn (destmem, tmp[i]);
10158 13788 : destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
10159 : }
10160 2298 : emit_move_insn (destmem, tmp[i]);
10161 2298 : }
10162 :
10163 : /* Load MOVES of mode size into REGS. If LAST is true, load the
10164 : last MOVES. Otherwise, load the first MOVES. */
10165 :
10166 : static void
10167 2298 : ix86_expand_load_movmem (rtx src, rtx srcreg, rtx count_exp,
10168 : machine_mode mode, unsigned int moves,
10169 : rtx regs[], bool last)
10170 : {
10171 2298 : unsigned int i;
10172 :
10173 11490 : for (i = 0; i < moves; i++)
10174 9192 : regs[i] = gen_reg_rtx (mode);
10175 :
10176 2298 : rtx srcmem = change_address (src, mode, srcreg);
10177 2298 : rtx step;
10178 2298 : if (last)
10179 : {
10180 1149 : srcmem = offset_address (srcmem, count_exp, 1);
10181 2298 : step = GEN_INT (-GET_MODE_SIZE (mode));
10182 2298 : srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
10183 : }
10184 : else
10185 2298 : step = GEN_INT (GET_MODE_SIZE (mode));
10186 :
10187 9192 : for (i = 0; i < moves - 1; i++)
10188 : {
10189 6894 : emit_move_insn (regs[i], srcmem);
10190 13788 : srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
10191 : }
10192 2298 : emit_move_insn (regs[i], srcmem);
10193 2298 : }
10194 :
10195 : /* Store MOVES of mode size into REGS. If LAST is true, store the
10196 : last MOVES. Otherwise, store the first MOVES. */
10197 :
10198 : static void
10199 2298 : ix86_expand_store_movmem (rtx dst, rtx destreg, rtx count_exp,
10200 : machine_mode mode, unsigned int moves,
10201 : rtx regs[], bool last)
10202 : {
10203 2298 : unsigned int i;
10204 :
10205 2298 : rtx destmem = change_address (dst, mode, destreg);
10206 2298 : rtx step;
10207 2298 : if (last)
10208 : {
10209 1149 : destmem = offset_address (destmem, count_exp, 1);
10210 2298 : step = GEN_INT (-GET_MODE_SIZE (mode));
10211 2298 : destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
10212 : }
10213 : else
10214 2298 : step = GEN_INT (GET_MODE_SIZE (mode));
10215 :
10216 9192 : for (i = 0; i < moves - 1; i++)
10217 : {
10218 6894 : emit_move_insn (destmem, regs[i]);
10219 13788 : destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
10220 : }
10221 2298 : emit_move_insn (destmem, regs[i]);
10222 2298 : }
10223 :
10224 : /* Expand memmove of size between (MOVES / 2) * mode size and
10225 : MOVES * mode size with overlapping load and store. MOVES is even.
10226 : MOVES >= 2 and MOVES <= 8. */
10227 :
10228 : static void
10229 12538 : ix86_expand_n_overlapping_move_movmem (rtx dst, rtx src, rtx destreg,
10230 : rtx srcreg, rtx count_exp,
10231 : machine_mode mode,
10232 : unsigned int moves)
10233 : {
10234 12538 : gcc_assert (moves >= 2 && moves <= 8 && (moves & 1) == 0);
10235 :
10236 12538 : unsigned int half_moves = moves / 2;
10237 12538 : unsigned int i, j;
10238 12538 : rtx tmp[8];
10239 :
10240 47890 : for (i = 0; i < moves; i++)
10241 35352 : tmp[i] = gen_reg_rtx (mode);
10242 :
10243 12538 : rtx base_srcmem = change_address (src, mode, srcreg);
10244 :
10245 : /* Load the first half. */
10246 12538 : rtx srcmem = base_srcmem;
10247 30214 : for (i = 0; i < half_moves - 1; i++)
10248 : {
10249 5138 : emit_move_insn (tmp[i], srcmem);
10250 10276 : srcmem = offset_address (srcmem,
10251 5138 : GEN_INT (GET_MODE_SIZE (mode)),
10252 5138 : GET_MODE_SIZE (mode));
10253 : }
10254 12538 : emit_move_insn (tmp[i], srcmem);
10255 :
10256 : /* Load the second half. */
10257 12538 : srcmem = offset_address (base_srcmem, count_exp, 1);
10258 12538 : srcmem = offset_address (srcmem,
10259 12538 : GEN_INT (-GET_MODE_SIZE (mode)),
10260 12538 : GET_MODE_SIZE (mode));
10261 30214 : for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
10262 : {
10263 5138 : emit_move_insn (tmp[j], srcmem);
10264 10276 : srcmem = offset_address (srcmem,
10265 5138 : GEN_INT (-GET_MODE_SIZE (mode)),
10266 5138 : GET_MODE_SIZE (mode));
10267 : }
10268 12538 : emit_move_insn (tmp[j], srcmem);
10269 :
10270 12538 : rtx base_destmem = change_address (dst, mode, destreg);
10271 :
10272 : /* Store the first half. */
10273 12538 : rtx destmem = base_destmem;
10274 30214 : for (i = 0; i < half_moves - 1; i++)
10275 : {
10276 5138 : emit_move_insn (destmem, tmp[i]);
10277 10276 : destmem = offset_address (destmem,
10278 5138 : GEN_INT (GET_MODE_SIZE (mode)),
10279 5138 : GET_MODE_SIZE (mode));
10280 : }
10281 12538 : emit_move_insn (destmem, tmp[i]);
10282 :
10283 : /* Store the second half. */
10284 12538 : destmem = offset_address (base_destmem, count_exp, 1);
10285 25076 : destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
10286 12538 : GET_MODE_SIZE (mode));
10287 30214 : for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
10288 : {
10289 5138 : emit_move_insn (destmem, tmp[j]);
10290 10276 : destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
10291 5138 : GET_MODE_SIZE (mode));
10292 : }
10293 12538 : emit_move_insn (destmem, tmp[j]);
10294 12538 : }
10295 :
10296 : /* Expand memmove of size < mode size which is <= 64. */
10297 :
10298 : static void
10299 2814 : ix86_expand_less_move_movmem (rtx dst, rtx src, rtx destreg,
10300 : rtx srcreg, rtx count_exp,
10301 : unsigned HOST_WIDE_INT min_size,
10302 : machine_mode mode,
10303 : rtx_code_label *done_label)
10304 : {
10305 2814 : bool skip = false;
10306 2814 : machine_mode count_mode = counter_mode (count_exp);
10307 :
10308 2814 : rtx_code_label *between_32_63_label
10309 2814 : = GET_MODE_SIZE (mode) > 32 ? gen_label_rtx () : nullptr;
10310 : /* Jump to BETWEEN_32_64_LABEL if size >= 32 and size < 64. */
10311 3 : if (between_32_63_label)
10312 : {
10313 3 : if (min_size && min_size >= 32)
10314 : {
10315 1 : emit_jump_insn (gen_jump (between_32_63_label));
10316 1 : emit_barrier ();
10317 1 : skip = true;
10318 : }
10319 : else
10320 2 : emit_cmp_and_jump_insns (count_exp, GEN_INT (32), GEU,
10321 : nullptr, count_mode, 1,
10322 : between_32_63_label);
10323 : }
10324 :
10325 3 : rtx_code_label *between_16_31_label
10326 2813 : = (!skip && GET_MODE_SIZE (mode) > 16) ? gen_label_rtx () : nullptr;
10327 : /* Jump to BETWEEN_16_31_LABEL if size >= 16 and size < 31. */
10328 4 : if (between_16_31_label)
10329 : {
10330 4 : if (min_size && min_size >= 16)
10331 : {
10332 2 : emit_jump_insn (gen_jump (between_16_31_label));
10333 2 : emit_barrier ();
10334 2 : skip = true;
10335 : }
10336 : else
10337 2 : emit_cmp_and_jump_insns (count_exp, GEN_INT (16), GEU,
10338 : nullptr, count_mode, 1,
10339 : between_16_31_label);
10340 : }
10341 :
10342 2 : rtx_code_label *between_8_15_label
10343 5623 : = (!skip && GET_MODE_SIZE (mode) > 8) ? gen_label_rtx () : nullptr;
10344 : /* Jump to BETWEEN_8_15_LABEL if size >= 8 and size < 15. */
10345 1895 : if (between_8_15_label)
10346 : {
10347 1895 : if (min_size && min_size >= 8)
10348 : {
10349 147 : emit_jump_insn (gen_jump (between_8_15_label));
10350 147 : emit_barrier ();
10351 147 : skip = true;
10352 : }
10353 : else
10354 1748 : emit_cmp_and_jump_insns (count_exp, GEN_INT (8), GEU,
10355 : nullptr, count_mode, 1,
10356 : between_8_15_label);
10357 : }
10358 :
10359 147 : rtx_code_label *between_4_7_label
10360 5331 : = (!skip && GET_MODE_SIZE (mode) > 4) ? gen_label_rtx () : nullptr;
10361 : /* Jump to BETWEEN_4_7_LABEL if size >= 4 and size < 7. */
10362 2131 : if (between_4_7_label)
10363 : {
10364 2131 : if (min_size && min_size >= 4)
10365 : {
10366 152 : emit_jump_insn (gen_jump (between_4_7_label));
10367 152 : emit_barrier ();
10368 152 : skip = true;
10369 : }
10370 : else
10371 1979 : emit_cmp_and_jump_insns (count_exp, GEN_INT (4), GEU,
10372 : nullptr, count_mode, 1,
10373 : between_4_7_label);
10374 : }
10375 :
10376 152 : rtx_code_label *between_2_3_label
10377 5174 : = (!skip && GET_MODE_SIZE (mode) > 2) ? gen_label_rtx () : nullptr;
10378 : /* Jump to BETWEEN_2_3_LABEL if size >= 2 and size < 3. */
10379 2366 : if (between_2_3_label)
10380 : {
10381 2366 : if (min_size && min_size >= 2)
10382 : {
10383 128 : emit_jump_insn (gen_jump (between_2_3_label));
10384 128 : emit_barrier ();
10385 128 : skip = true;
10386 : }
10387 : else
10388 2238 : emit_cmp_and_jump_insns (count_exp, GEN_INT (1), GT,
10389 : nullptr, count_mode, 1,
10390 : between_2_3_label);
10391 : }
10392 :
10393 2814 : if (!skip)
10394 : {
10395 2384 : rtx_code_label *zero_label
10396 2384 : = min_size == 0 ? gen_label_rtx () : nullptr;
10397 : /* Skip if size == 0. */
10398 1556 : if (zero_label)
10399 1556 : emit_cmp_and_jump_insns (count_exp, GEN_INT (1), LT,
10400 : nullptr, count_mode, 1,
10401 : zero_label,
10402 : profile_probability::unlikely ());
10403 :
10404 : /* Move 1 byte. */
10405 2384 : rtx tmp0 = gen_reg_rtx (QImode);
10406 2384 : rtx srcmem = change_address (src, QImode, srcreg);
10407 2384 : emit_move_insn (tmp0, srcmem);
10408 2384 : rtx destmem = change_address (dst, QImode, destreg);
10409 2384 : emit_move_insn (destmem, tmp0);
10410 :
10411 2384 : if (zero_label)
10412 1556 : emit_label (zero_label);
10413 :
10414 2384 : emit_jump_insn (gen_jump (done_label));
10415 2384 : emit_barrier ();
10416 : }
10417 :
10418 2814 : if (between_32_63_label)
10419 : {
10420 3 : emit_label (between_32_63_label);
10421 3 : ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
10422 : count_exp, OImode, 2);
10423 3 : emit_jump_insn (gen_jump (done_label));
10424 3 : emit_barrier ();
10425 : }
10426 :
10427 2814 : if (between_16_31_label)
10428 : {
10429 4 : emit_label (between_16_31_label);
10430 4 : ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
10431 : count_exp, TImode, 2);
10432 4 : emit_jump_insn (gen_jump (done_label));
10433 4 : emit_barrier ();
10434 : }
10435 :
10436 2814 : if (between_8_15_label)
10437 : {
10438 1895 : emit_label (between_8_15_label);
10439 1895 : ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
10440 : count_exp, DImode, 2);
10441 1895 : emit_jump_insn (gen_jump (done_label));
10442 1895 : emit_barrier ();
10443 : }
10444 :
10445 2814 : if (between_4_7_label)
10446 : {
10447 2131 : emit_label (between_4_7_label);
10448 2131 : ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
10449 : count_exp, SImode, 2);
10450 2131 : emit_jump_insn (gen_jump (done_label));
10451 2131 : emit_barrier ();
10452 : }
10453 :
10454 2814 : if (between_2_3_label)
10455 : {
10456 2366 : emit_label (between_2_3_label);
10457 2366 : ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
10458 : count_exp, HImode, 2);
10459 2366 : emit_jump_insn (gen_jump (done_label));
10460 2366 : emit_barrier ();
10461 : }
10462 2814 : }
10463 :
10464 : /* Expand movmem with overlapping unaligned loads and stores:
10465 : 1. Load all sources into registers and store them together to avoid
10466 : possible address overlap between source and destination.
10467 : 2. For known size, first try to fully unroll with 8 registers.
10468 : 3. For size <= 2 * MOVE_MAX, load all sources into 2 registers first
10469 : and then store them together.
10470 : 4. For size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX, load all sources
10471 : into 4 registers first and then store them together.
10472 : 5. For size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX, load all sources
10473 : into 8 registers first and then store them together.
10474 : 6. For size > 8 * MOVE_MAX,
10475 : a. If address of destination > address of source, copy backward
10476 : with a 4 * MOVE_MAX loop with unaligned loads and stores. Load
10477 : the first 4 * MOVE_MAX into 4 registers before the loop and
10478 : store them after the loop to support overlapping addresses.
10479 : b. Otherwise, copy forward with a 4 * MOVE_MAX loop with unaligned
10480 : loads and stores. Load the last 4 * MOVE_MAX into 4 registers
10481 : before the loop and store them after the loop to support
10482 : overlapping addresses.
10483 : */
10484 :
10485 : bool
10486 17325 : ix86_expand_movmem (rtx operands[])
10487 : {
10488 : /* Since there are much less registers available in 32-bit mode, don't
10489 : inline movmem in 32-bit mode. */
10490 17325 : if (!TARGET_64BIT || optimize_insn_for_size_p ())
10491 3828 : return false;
10492 :
10493 13497 : rtx dst = operands[0];
10494 13497 : rtx src = operands[1];
10495 13497 : rtx count_exp = operands[2];
10496 13497 : rtx expected_size_exp = operands[5];
10497 13497 : rtx min_size_exp = operands[6];
10498 13497 : rtx probable_max_size_exp = operands[8];
10499 13497 : unsigned HOST_WIDE_INT count = HOST_WIDE_INT_0U;
10500 13497 : HOST_WIDE_INT expected_size = HOST_WIDE_INT_M1U;
10501 13497 : unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
10502 13497 : unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;
10503 :
10504 13497 : if (CONST_INT_P (count_exp))
10505 : {
10506 2008 : min_size = probable_max_size = count = expected_size
10507 2008 : = INTVAL (count_exp);
10508 : /* When COUNT is 0, there is nothing to do. */
10509 2008 : if (!count)
10510 : return true;
10511 : }
10512 : else
10513 : {
10514 11489 : if (min_size_exp)
10515 11489 : min_size = INTVAL (min_size_exp);
10516 11489 : if (probable_max_size_exp)
10517 8761 : probable_max_size = INTVAL (probable_max_size_exp);
10518 11489 : if (CONST_INT_P (expected_size_exp))
10519 11489 : expected_size = INTVAL (expected_size_exp);
10520 : }
10521 :
10522 : /* Make sure we don't need to care about overflow later on. */
10523 13497 : if (count > (HOST_WIDE_INT_1U << 30))
10524 : return false;
10525 :
10526 13466 : addr_space_t dst_as = MEM_ADDR_SPACE (dst);
10527 13466 : addr_space_t src_as = MEM_ADDR_SPACE (src);
10528 13466 : int dynamic_check;
10529 13466 : bool noalign;
10530 13466 : enum stringop_alg alg = decide_alg (count, expected_size, min_size,
10531 : probable_max_size, false, false,
10532 : dst_as, src_as, &dynamic_check,
10533 : &noalign, false);
10534 13466 : if (alg == libcall)
10535 : return false;
10536 :
10537 5318 : rtx destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
10538 5318 : rtx srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
10539 :
10540 5318 : unsigned int move_max = MOVE_MAX;
10541 5318 : machine_mode mode = smallest_int_mode_for_size
10542 5318 : (move_max * BITS_PER_UNIT).require ();
10543 5318 : if (probable_max_size && probable_max_size < move_max)
10544 : {
10545 : /* Get a usable MOVE_MAX. */
10546 2899 : mode = smallest_int_mode_for_size
10547 2899 : (probable_max_size * BITS_PER_UNIT).require ();
10548 : /* Reduce MOVE_MAX by half so that MOVE_MAX can be used. */
10549 5798 : if (GET_MODE_SIZE (mode) > probable_max_size)
10550 2414 : mode = smallest_int_mode_for_size
10551 2414 : (GET_MODE_BITSIZE (mode) / 2).require ();
10552 5798 : move_max = GET_MODE_SIZE (mode);
10553 : }
10554 :
10555 : /* Try to fully unroll memmove of known size first. */
10556 5318 : if (count
10557 5318 : && ix86_expand_unroll_movmem (dst, src, destreg, srcreg, count,
10558 : mode))
10559 : return true;
10560 :
10561 3454 : rtx_code_label *done_label = gen_label_rtx ();
10562 :
10563 3454 : rtx_code_label *less_vec_label = nullptr;
10564 3454 : if (min_size == 0 || min_size < move_max)
10565 2814 : less_vec_label = gen_label_rtx ();
10566 :
10567 3454 : machine_mode count_mode = counter_mode (count_exp);
10568 :
10569 : /* Jump to LESS_VEC_LABEL if size < MOVE_MAX. */
10570 3454 : if (less_vec_label)
10571 2814 : emit_cmp_and_jump_insns (count_exp, GEN_INT (move_max), LTU,
10572 : nullptr, count_mode, 1,
10573 : less_vec_label);
10574 :
10575 3454 : rtx_code_label *more_2x_vec_label = nullptr;
10576 3454 : if (probable_max_size == 0 || probable_max_size > 2 * move_max)
10577 1501 : more_2x_vec_label = gen_label_rtx ();
10578 :
10579 : /* Jump to MORE_2X_VEC_LABEL if size > 2 * MOVE_MAX. */
10580 1501 : if (more_2x_vec_label)
10581 1501 : emit_cmp_and_jump_insns (count_exp, GEN_INT (2 * move_max), GTU,
10582 : nullptr, count_mode, 1,
10583 : more_2x_vec_label);
10584 :
10585 3454 : if (min_size == 0 || min_size <= 2 * move_max)
10586 : {
10587 : /* Size >= MOVE_MAX and size <= 2 * MOVE_MAX. */
10588 3433 : ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
10589 : count_exp, mode, 2);
10590 3433 : emit_jump_insn (gen_jump (done_label));
10591 3433 : emit_barrier ();
10592 : }
10593 :
10594 3454 : if (less_vec_label)
10595 : {
10596 : /* Size < MOVE_MAX. */
10597 2814 : emit_label (less_vec_label);
10598 2814 : ix86_expand_less_move_movmem (dst, src, destreg, srcreg,
10599 : count_exp, min_size, mode,
10600 : done_label);
10601 2814 : emit_jump_insn (gen_jump (done_label));
10602 2814 : emit_barrier ();
10603 : }
10604 :
10605 3454 : if (more_2x_vec_label)
10606 : {
10607 : /* Size > 2 * MOVE_MAX and destination may overlap with source. */
10608 1501 : emit_label (more_2x_vec_label);
10609 :
10610 1501 : rtx_code_label *more_8x_vec_label = nullptr;
10611 1501 : if (probable_max_size == 0 || probable_max_size > 8 * move_max)
10612 1149 : more_8x_vec_label = gen_label_rtx ();
10613 :
10614 : /* Jump to MORE_8X_VEC_LABEL if size > 8 * MOVE_MAX. */
10615 1149 : if (more_8x_vec_label)
10616 1149 : emit_cmp_and_jump_insns (count_exp, GEN_INT (8 * move_max), GTU,
10617 : nullptr, count_mode, 1,
10618 : more_8x_vec_label);
10619 :
10620 1501 : rtx_code_label *last_4x_vec_label = nullptr;
10621 1501 : if (min_size == 0 || min_size <= 4 * move_max)
10622 1490 : last_4x_vec_label = gen_label_rtx ();
10623 :
10624 : /* Jump to LAST_4X_VEC_LABEL if size <= 4 * MOVE_MAX. */
10625 1490 : if (last_4x_vec_label)
10626 1490 : emit_cmp_and_jump_insns (count_exp, GEN_INT (4 * move_max), LEU,
10627 : nullptr, count_mode, 1,
10628 : last_4x_vec_label);
10629 :
10630 1501 : if (probable_max_size == 0 || probable_max_size > 4 * move_max)
10631 : {
10632 : /* Size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX. */
10633 1216 : ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
10634 : srcreg, count_exp,
10635 : mode, 8);
10636 1216 : emit_jump_insn (gen_jump (done_label));
10637 1216 : emit_barrier ();
10638 : }
10639 :
10640 1501 : if (last_4x_vec_label)
10641 : {
10642 : /* Size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX. */
10643 1490 : emit_label (last_4x_vec_label);
10644 1490 : ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
10645 : srcreg, count_exp,
10646 : mode, 4);
10647 1490 : emit_jump_insn (gen_jump (done_label));
10648 1490 : emit_barrier ();
10649 : }
10650 :
10651 1501 : if (more_8x_vec_label)
10652 : {
10653 : /* Size > 8 * MOVE_MAX. */
10654 1149 : emit_label (more_8x_vec_label);
10655 :
10656 1149 : rtx loop_count = gen_reg_rtx (count_mode);
10657 1149 : emit_move_insn (loop_count, count_exp);
10658 :
10659 : /* Jump to MORE_8X_VEC_BACKWARD_LABEL if source address is
10660 : lower than destination address. */
10661 1149 : rtx_code_label *more_8x_vec_backward_label = gen_label_rtx ();
10662 1149 : emit_cmp_and_jump_insns (srcreg, destreg, LTU, nullptr,
10663 1149 : GET_MODE (destreg), 1,
10664 : more_8x_vec_backward_label);
10665 :
10666 : /* Skip if source == destination which is less common. */
10667 1149 : emit_cmp_and_jump_insns (srcreg, destreg, EQ, nullptr,
10668 1149 : GET_MODE (destreg), 1, done_label,
10669 : profile_probability::unlikely ());
10670 :
10671 1149 : rtx base_destreg = gen_reg_rtx (GET_MODE (destreg));
10672 1149 : emit_move_insn (base_destreg, destreg);
10673 :
10674 : /* Load the last 4 * MOVE_MAX. */
10675 1149 : rtx regs[4];
10676 1149 : ix86_expand_load_movmem (src, srcreg, count_exp, mode,
10677 : ARRAY_SIZE (regs), regs, true);
10678 :
10679 1149 : rtx srcmem = change_address (src, mode, srcreg);
10680 1149 : rtx destmem = change_address (dst, mode, destreg);
10681 :
10682 : /* Copy forward with a 4 * MOVE_MAX loop. */
10683 1149 : rtx_code_label *loop_4x_vec_forward_label = gen_label_rtx ();
10684 1149 : emit_label (loop_4x_vec_forward_label);
10685 :
10686 1149 : ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, true);
10687 :
10688 1149 : rtx tmp;
10689 1149 : rtx delta = GEN_INT (4 * MOVE_MAX);
10690 :
10691 : /* Decrement LOOP_COUNT by 4 * MOVE_MAX. */
10692 1149 : tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
10693 : loop_count, delta, nullptr, 1,
10694 : OPTAB_DIRECT);
10695 1149 : if (tmp != loop_count)
10696 1149 : emit_move_insn (loop_count, tmp);
10697 :
10698 : /* Increment DESTREG and SRCREG by 4 * MOVE_MAX. */
10699 1149 : tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
10700 : destreg, delta, nullptr, 1,
10701 : OPTAB_DIRECT);
10702 1149 : if (tmp != destreg)
10703 1149 : emit_move_insn (destreg, tmp);
10704 1149 : tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
10705 : delta, nullptr, 1, OPTAB_DIRECT);
10706 1149 : if (tmp != srcreg)
10707 1149 : emit_move_insn (srcreg, tmp);
10708 :
10709 : /* Stop if LOOP_EXP <= 4 * MOVE_MAX. */
10710 1149 : emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
10711 1149 : GET_MODE (loop_count), 1,
10712 : loop_4x_vec_forward_label);
10713 :
10714 : /* Store the last 4 * MOVE_MAX. */
10715 1149 : ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
10716 : ARRAY_SIZE (regs), regs, true);
10717 :
10718 1149 : emit_jump_insn (gen_jump (done_label));
10719 1149 : emit_barrier ();
10720 :
10721 : /* Copy backward with a 4 * MOVE_MAX loop. */
10722 1149 : emit_label (more_8x_vec_backward_label);
10723 :
10724 1149 : base_destreg = gen_reg_rtx (GET_MODE (destreg));
10725 1149 : emit_move_insn (base_destreg, destreg);
10726 :
10727 : /* Load the first 4 * MOVE_MAX. */
10728 1149 : ix86_expand_load_movmem (src, srcreg, count_exp, mode,
10729 : ARRAY_SIZE (regs), regs, false);
10730 :
10731 : /* Increment DESTREG and SRCREG by COUNT_EXP. */
10732 1149 : tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
10733 : destreg, count_exp, nullptr, 1,
10734 : OPTAB_DIRECT);
10735 1149 : if (tmp != destreg)
10736 1149 : emit_move_insn (destreg, tmp);
10737 1149 : tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
10738 : count_exp, nullptr, 1, OPTAB_DIRECT);
10739 1149 : if (tmp != srcreg)
10740 1149 : emit_move_insn (srcreg, tmp);
10741 :
10742 1149 : srcmem = change_address (src, mode, srcreg);
10743 1149 : destmem = change_address (dst, mode, destreg);
10744 2298 : rtx step = GEN_INT (-GET_MODE_SIZE (mode));
10745 2298 : srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
10746 2298 : destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
10747 :
10748 1149 : rtx_code_label *loop_4x_vec_backward_label = gen_label_rtx ();
10749 1149 : emit_label (loop_4x_vec_backward_label);
10750 :
10751 1149 : ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, false);
10752 :
10753 : /* Decrement LOOP_COUNT by 4 * MOVE_MAX. */
10754 1149 : tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
10755 : loop_count, delta, nullptr, 1,
10756 : OPTAB_DIRECT);
10757 1149 : if (tmp != loop_count)
10758 1149 : emit_move_insn (loop_count, tmp);
10759 :
10760 : /* Decrement DESTREG and SRCREG by 4 * MOVE_MAX. */
10761 1149 : tmp = expand_simple_binop (GET_MODE (destreg), MINUS,
10762 : destreg, delta, nullptr, 1,
10763 : OPTAB_DIRECT);
10764 1149 : if (tmp != destreg)
10765 1149 : emit_move_insn (destreg, tmp);
10766 1149 : tmp = expand_simple_binop (GET_MODE (srcreg), MINUS, srcreg,
10767 : delta, nullptr, 1, OPTAB_DIRECT);
10768 1149 : if (tmp != srcreg)
10769 1149 : emit_move_insn (srcreg, tmp);
10770 :
10771 : /* Stop if LOOP_EXP <= 4 * MOVE_MAX. */
10772 1149 : emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
10773 1149 : GET_MODE (loop_count), 1,
10774 : loop_4x_vec_backward_label);
10775 :
10776 : /* Store the first 4 * MOVE_MAX. */
10777 1149 : ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
10778 : ARRAY_SIZE (regs), regs, false);
10779 :
10780 1149 : emit_jump_insn (gen_jump (done_label));
10781 1149 : emit_barrier ();
10782 : }
10783 : }
10784 :
10785 3454 : emit_label (done_label);
10786 :
10787 3454 : return true;
10788 : }
10789 :
10790 : /* Expand cmpstrn or memcmp. */
10791 :
10792 : bool
10793 170798 : ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
10794 : rtx length, rtx align, bool is_cmpstrn)
10795 : {
10796 : /* Expand strncmp and memcmp only with -minline-all-stringops since
10797 : "repz cmpsb" can be much slower than strncmp and memcmp functions
10798 : implemented with vector instructions, see
10799 :
10800 : https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
10801 : */
10802 170798 : if (!TARGET_INLINE_ALL_STRINGOPS)
10803 : return false;
10804 :
10805 : /* Can't use this if the user has appropriated ecx, esi or edi. */
10806 5796 : if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
10807 : return false;
10808 :
10809 5796 : if (is_cmpstrn)
10810 : {
10811 : /* For strncmp, length is the maximum length, which can be larger
10812 : than actual string lengths. We can expand the cmpstrn pattern
10813 : to "repz cmpsb" only if one of the strings is a constant so
10814 : that expand_builtin_strncmp() can write the length argument to
10815 : be the minimum of the const string length and the actual length
10816 : argument. Otherwise, "repz cmpsb" may pass the 0 byte. */
10817 69 : tree t1 = MEM_EXPR (src1);
10818 69 : tree t2 = MEM_EXPR (src2);
10819 138 : if (!((t1 && TREE_CODE (t1) == MEM_REF
10820 69 : && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
10821 0 : && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
10822 : == STRING_CST))
10823 69 : || (t2 && TREE_CODE (t2) == MEM_REF
10824 69 : && TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
10825 69 : && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
10826 : == STRING_CST))))
10827 : return false;
10828 : }
10829 :
10830 5796 : rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
10831 5796 : rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
10832 5796 : if (addr1 != XEXP (src1, 0))
10833 5796 : src1 = replace_equiv_address_nv (src1, addr1);
10834 5796 : if (addr2 != XEXP (src2, 0))
10835 5796 : src2 = replace_equiv_address_nv (src2, addr2);
10836 :
10837 : /* NB: Make a copy of the data length to avoid changing the original
10838 : data length by cmpstrnqi patterns. */
10839 5796 : length = ix86_zero_extend_to_Pmode (length);
10840 8711 : rtx lengthreg = gen_reg_rtx (Pmode);
10841 5796 : emit_move_insn (lengthreg, length);
10842 :
10843 : /* If we are testing strict equality, we can use known alignment to
10844 : good advantage. This may be possible with combine, particularly
10845 : once cc0 is dead. */
10846 5796 : if (CONST_INT_P (length))
10847 : {
10848 0 : if (length == const0_rtx)
10849 : {
10850 0 : emit_move_insn (result, const0_rtx);
10851 0 : return true;
10852 : }
10853 0 : emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
10854 : src1, src2));
10855 : }
10856 : else
10857 : {
10858 8711 : emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
10859 5796 : emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
10860 : src1, src2));
10861 : }
10862 :
10863 5796 : rtx out = gen_lowpart (QImode, result);
10864 5796 : emit_insn (gen_cmpintqi (out));
10865 5796 : emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));
10866 :
10867 5796 : return true;
10868 : }
10869 :
10870 : /* Expand the appropriate insns for doing strlen if not just doing
10871 : repnz; scasb
10872 :
10873 : out = result, initialized with the start address
10874 : align_rtx = alignment of the address.
10875 : scratch = scratch register, initialized with the startaddress when
10876 : not aligned, otherwise undefined
10877 :
10878 : This is just the body. It needs the initializations mentioned above and
10879 : some address computing at the end. These things are done in i386.md. */
10880 :
10881 : static void
10882 11 : ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
10883 : {
10884 11 : int align;
10885 11 : rtx tmp;
10886 11 : rtx_code_label *align_2_label = NULL;
10887 11 : rtx_code_label *align_3_label = NULL;
10888 11 : rtx_code_label *align_4_label = gen_label_rtx ();
10889 11 : rtx_code_label *end_0_label = gen_label_rtx ();
10890 11 : rtx mem;
10891 11 : rtx tmpreg = gen_reg_rtx (SImode);
10892 11 : rtx scratch = gen_reg_rtx (SImode);
10893 11 : rtx cmp;
10894 :
10895 11 : align = 0;
10896 11 : if (CONST_INT_P (align_rtx))
10897 11 : align = INTVAL (align_rtx);
10898 :
10899 : /* Loop to check 1..3 bytes for null to get an aligned pointer. */
10900 :
10901 : /* Is there a known alignment and is it less than 4? */
10902 11 : if (align < 4)
10903 : {
10904 15 : rtx scratch1 = gen_reg_rtx (Pmode);
10905 11 : emit_move_insn (scratch1, out);
10906 : /* Is there a known alignment and is it not 2? */
10907 11 : if (align != 2)
10908 : {
10909 11 : align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
10910 11 : align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */
10911 :
10912 : /* Leave just the 3 lower bits. */
10913 15 : align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
10914 : NULL_RTX, 0, OPTAB_WIDEN);
10915 :
10916 15 : emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
10917 11 : Pmode, 1, align_4_label);
10918 15 : emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
10919 11 : Pmode, 1, align_2_label);
10920 15 : emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
10921 11 : Pmode, 1, align_3_label);
10922 : }
10923 : else
10924 : {
10925 : /* Since the alignment is 2, we have to check 2 or 0 bytes;
10926 : check if is aligned to 4 - byte. */
10927 :
10928 0 : align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
10929 : NULL_RTX, 0, OPTAB_WIDEN);
10930 :
10931 0 : emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
10932 0 : Pmode, 1, align_4_label);
10933 : }
10934 :
10935 11 : mem = change_address (src, QImode, out);
10936 :
10937 : /* Now compare the bytes. */
10938 :
10939 : /* Compare the first n unaligned byte on a byte per byte basis. */
10940 11 : emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
10941 : QImode, 1, end_0_label);
10942 :
10943 : /* Increment the address. */
10944 11 : emit_insn (gen_add2_insn (out, const1_rtx));
10945 :
10946 : /* Not needed with an alignment of 2 */
10947 11 : if (align != 2)
10948 : {
10949 11 : emit_label (align_2_label);
10950 :
10951 11 : emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
10952 : end_0_label);
10953 :
10954 11 : emit_insn (gen_add2_insn (out, const1_rtx));
10955 :
10956 11 : emit_label (align_3_label);
10957 : }
10958 :
10959 11 : emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
10960 : end_0_label);
10961 :
10962 11 : emit_insn (gen_add2_insn (out, const1_rtx));
10963 : }
10964 :
10965 : /* Generate loop to check 4 bytes at a time. It is not a good idea to
10966 : align this loop. It gives only huge programs, but does not help to
10967 : speed up. */
10968 11 : emit_label (align_4_label);
10969 :
10970 11 : mem = change_address (src, SImode, out);
10971 11 : emit_move_insn (scratch, mem);
10972 11 : emit_insn (gen_add2_insn (out, GEN_INT (4)));
10973 :
10974 : /* This formula yields a nonzero result iff one of the bytes is zero.
10975 : This saves three branches inside loop and many cycles. */
10976 :
10977 11 : emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
10978 11 : emit_insn (gen_one_cmplsi2 (scratch, scratch));
10979 11 : emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
10980 11 : emit_insn (gen_andsi3 (tmpreg, tmpreg,
10981 : gen_int_mode (0x80808080, SImode)));
10982 11 : emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
10983 : align_4_label);
10984 :
10985 11 : if (TARGET_CMOVE)
10986 : {
10987 11 : rtx reg = gen_reg_rtx (SImode);
10988 15 : rtx reg2 = gen_reg_rtx (Pmode);
10989 11 : emit_move_insn (reg, tmpreg);
10990 11 : emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));
10991 :
10992 : /* If zero is not in the first two bytes, move two bytes forward. */
10993 11 : emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
10994 11 : tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
10995 11 : tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
10996 11 : emit_insn (gen_rtx_SET (tmpreg,
10997 : gen_rtx_IF_THEN_ELSE (SImode, tmp,
10998 : reg,
10999 : tmpreg)));
11000 : /* Emit lea manually to avoid clobbering of flags. */
11001 15 : emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));
11002 :
11003 11 : tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
11004 11 : tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
11005 15 : emit_insn (gen_rtx_SET (out,
11006 : gen_rtx_IF_THEN_ELSE (Pmode, tmp,
11007 : reg2,
11008 : out)));
11009 11 : }
11010 : else
11011 : {
11012 0 : rtx_code_label *end_2_label = gen_label_rtx ();
11013 : /* Is zero in the first two bytes? */
11014 :
11015 0 : emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
11016 0 : tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
11017 0 : tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
11018 0 : tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
11019 : gen_rtx_LABEL_REF (VOIDmode, end_2_label),
11020 : pc_rtx);
11021 0 : tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
11022 0 : JUMP_LABEL (tmp) = end_2_label;
11023 :
11024 : /* Not in the first two. Move two bytes forward. */
11025 0 : emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
11026 0 : emit_insn (gen_add2_insn (out, const2_rtx));
11027 :
11028 0 : emit_label (end_2_label);
11029 :
11030 : }
11031 :
11032 : /* Avoid branch in fixing the byte. */
11033 11 : tmpreg = gen_lowpart (QImode, tmpreg);
11034 11 : emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
11035 11 : tmp = gen_rtx_REG (CCmode, FLAGS_REG);
11036 11 : cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
11037 15 : emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));
11038 :
11039 11 : emit_label (end_0_label);
11040 11 : }
11041 :
11042 : /* Expand strlen. */
11043 :
11044 : bool
11045 13880 : ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
11046 : {
11047 13880 : if (TARGET_UNROLL_STRLEN
11048 13880 : && TARGET_INLINE_ALL_STRINGOPS
11049 11 : && eoschar == const0_rtx
11050 11 : && optimize > 1)
11051 : {
11052 : /* The generic case of strlen expander is long. Avoid it's
11053 : expanding unless TARGET_INLINE_ALL_STRINGOPS. */
11054 15 : rtx addr = force_reg (Pmode, XEXP (src, 0));
11055 : /* Well it seems that some optimizer does not combine a call like
11056 : foo(strlen(bar), strlen(bar));
11057 : when the move and the subtraction is done here. It does calculate
11058 : the length just once when these instructions are done inside of
11059 : output_strlen_unroll(). But I think since &bar[strlen(bar)] is
11060 : often used and I use one fewer register for the lifetime of
11061 : output_strlen_unroll() this is better. */
11062 :
11063 11 : emit_move_insn (out, addr);
11064 :
11065 11 : ix86_expand_strlensi_unroll_1 (out, src, align);
11066 :
11067 : /* strlensi_unroll_1 returns the address of the zero at the end of
11068 : the string, like memchr(), so compute the length by subtracting
11069 : the start address. */
11070 11 : emit_insn (gen_sub2_insn (out, addr));
11071 11 : return true;
11072 : }
11073 : else
11074 : return false;
11075 : }
11076 :
11077 : /* For given symbol (function) construct code to compute address of it's PLT
11078 : entry in large x86-64 PIC model. */
11079 :
11080 : static rtx
11081 34 : construct_plt_address (rtx symbol)
11082 : {
11083 34 : rtx tmp, unspec;
11084 :
11085 34 : gcc_assert (SYMBOL_REF_P (symbol));
11086 34 : gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
11087 34 : gcc_assert (Pmode == DImode);
11088 :
11089 34 : tmp = gen_reg_rtx (Pmode);
11090 34 : unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
11091 :
11092 34 : emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
11093 34 : emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
11094 34 : return tmp;
11095 : }
11096 :
11097 : /* Additional registers that are clobbered by SYSV calls. */
11098 :
11099 : static int const x86_64_ms_sysv_extra_clobbered_registers
11100 : [NUM_X86_64_MS_CLOBBERED_REGS] =
11101 : {
11102 : SI_REG, DI_REG,
11103 : XMM6_REG, XMM7_REG,
11104 : XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
11105 : XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
11106 : };
11107 :
/* Emit a call insn and return it.

   FNADDR is the call target; its operand 0 is taken as the callee address
   and may be rewritten below (GOT/PLT forms, copy to a register, untagging).
   RETVAL, if nonnull, receives the value of the call: the CALL rtx is
   wrapped in a SET of RETVAL.
   CALLARG1 is passed through as the second operand of the CALL rtx.
   CALLARG2 is read with INTVAL.  On 64-bit targets a positive value is
   loaded into AL before the call; a zero value is loaded as well unless
   flag_skip_rax_setup is set and SSE is disabled.  The value -2 suppresses
   the extra clobbers of the TARGET_64BIT_MS_ABI branch.
   POP, if nonnull and not const0_rtx, is added to the stack pointer in a
   SET placed in parallel with the call; it must be absent on 64-bit targets
   (asserted).
   SIBCALL selects sibcall_insn_operand rather than call_insn_operand when
   the final address is validated.

   Registers used or clobbered by the call are collected in USE and stored
   in CALL_INSN_FUNCTION_USAGE of the emitted insn.  */
11108 : rtx_insn *
11109 6240966 : ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
11110 : rtx callarg2,
11111 : rtx pop, bool sibcall)
11112 : {
11113 6240966 : rtx vec[3];
11114 6240966 : rtx use = NULL, call;
11115 6240966 : unsigned int vec_len = 0;
11116 6240966 : tree fndecl;
11117 6240966 : bool call_no_callee_saved_registers = false;
11118 :
    /* Direct call through a SYMBOL_REF: consult the callee's declaration.
       Otherwise, for a MEM target, consult the type of its MEM_EXPR.  */
11119 6240966 : if (SYMBOL_REF_P (XEXP (fnaddr, 0)))
11120 : {
11121 6057196 : fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
11122 6057196 : if (fndecl)
11123 : {
11124 5797169 : if (lookup_attribute ("interrupt",
11125 5797169 : TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
11126 1 : error ("interrupt service routine cannot be called directly");
11127 5797168 : else if (ix86_type_no_callee_saved_registers_p (TREE_TYPE (fndecl)))
11128 5797169 : call_no_callee_saved_registers = true;
    /* Record a call of the current function to itself.  */
11129 5797169 : if (fndecl == current_function_decl
11130 5797169 : && decl_binds_to_current_def_p (fndecl))
11131 11281 : cfun->machine->recursive_function = true;
11132 : }
11133 : }
11134 : else
11135 : {
11136 183770 : if (MEM_P (fnaddr))
11137 : {
11138 183770 : tree mem_expr = MEM_EXPR (fnaddr);
11139 183770 : if (mem_expr != nullptr
11140 183725 : && TREE_CODE (mem_expr) == MEM_REF
11141 367495 : && ix86_type_no_callee_saved_registers_p (TREE_TYPE (mem_expr)))
11142 : call_no_callee_saved_registers = true;
11143 : }
11144 :
11145 : fndecl = NULL_TREE;
11146 : }
11147 :
    /* A zero pop amount is treated the same as no pop at all.  */
11148 6240966 : if (pop == const0_rtx)
11149 0 : pop = NULL;
11150 6240966 : gcc_assert (!TARGET_64BIT || !pop);
11151 :
    /* ADDR keeps the original callee address; FNADDR may be replaced below.  */
11152 6240966 : rtx addr = XEXP (fnaddr, 0);
11153 6240966 : if (TARGET_MACHO && !TARGET_64BIT)
11154 : {
11155 : #if TARGET_MACHO
11156 : if (flag_pic && SYMBOL_REF_P (XEXP (fnaddr, 0)))
11157 : fnaddr = machopic_indirect_call_target (fnaddr);
11158 : #endif
11159 : }
11160 : else
11161 : {
11162 : /* Static functions and indirect calls don't need the pic register. Also,
11163 : check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
11164 : it an indirect call. */
11165 6240966 : if (flag_pic
11166 527914 : && SYMBOL_REF_P (addr)
11167 6741985 : && ix86_call_use_plt_p (addr))
11168 : {
11169 400738 : if (flag_plt
11170 400738 : && (SYMBOL_REF_DECL (addr) == NULL_TREE
11171 400704 : || !lookup_attribute ("noplt",
11172 400704 : DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
11173 : {
    /* PLT call: for 32-bit, or 64-bit large PIC with a non-MS default
       ABI, mark the real PIC register as used by the call and load it
       from the pseudo PIC register when one is in use.  */
11174 400703 : if (!TARGET_64BIT
11175 221973 : || (ix86_cmodel == CM_LARGE_PIC
11176 : && DEFAULT_ABI != MS_ABI))
11177 : {
11178 536224 : use_reg (&use, gen_rtx_REG (Pmode,
11179 : REAL_PIC_OFFSET_TABLE_REGNUM));
11180 178764 : if (ix86_use_pseudo_pic_reg ())
11181 357494 : emit_move_insn (gen_rtx_REG (Pmode,
11182 178764 : REAL_PIC_OFFSET_TABLE_REGNUM),
11183 : pic_offset_table_rtx);
11184 : }
11185 : }
11186 35 : else if (!TARGET_PECOFF && !TARGET_MACHO)
11187 : {
    /* PLT avoided: call indirectly through a GOT slot.  The three arms
       build the slot address with UNSPEC_GOT plus the PIC register
       (64-bit large PIC), UNSPEC_GOTPCREL (other 64-bit), or
       UNSPEC_GOT plus the PIC register (32-bit).  */
11188 35 : if (TARGET_64BIT
11189 35 : && ix86_cmodel == CM_LARGE_PIC
11190 : && DEFAULT_ABI != MS_ABI)
11191 : {
11192 1 : fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
11193 : UNSPEC_GOT);
11194 1 : fnaddr = gen_rtx_CONST (Pmode, fnaddr);
11195 1 : fnaddr = force_reg (Pmode, fnaddr);
11196 1 : fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
11197 : }
11198 34 : else if (TARGET_64BIT)
11199 : {
11200 38 : fnaddr = gen_rtx_UNSPEC (Pmode,
11201 : gen_rtvec (1, addr),
11202 : UNSPEC_GOTPCREL);
11203 38 : fnaddr = gen_rtx_CONST (Pmode, fnaddr);
11204 : }
11205 : else
11206 : {
11207 0 : fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
11208 : UNSPEC_GOT);
11209 0 : fnaddr = gen_rtx_CONST (Pmode, fnaddr);
11210 0 : fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
11211 : fnaddr);
11212 : }
11213 39 : fnaddr = gen_const_mem (Pmode, fnaddr);
11214 : /* Pmode may not be the same as word_mode for x32, which
11215 : doesn't support indirect branch via 32-bit memory slot.
11216 : Since x32 GOT slot is 64 bit with zero upper 32 bits,
11217 : indirect branch via x32 GOT slot is OK. */
11218 35 : if (GET_MODE (fnaddr) != word_mode)
11219 4 : fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
11220 35 : fnaddr = gen_rtx_MEM (QImode, fnaddr);
11221 : }
11222 : }
11223 : }
11224 :
11225 : /* Skip setting up RAX register for -mskip-rax-setup when there are no
11226 : parameters passed in vector registers. */
11227 6240966 : if (TARGET_64BIT
11228 5400984 : && (INTVAL (callarg2) > 0
11229 5339712 : || (INTVAL (callarg2) == 0
11230 321180 : && (TARGET_SSE || !flag_skip_rax_setup))))
11231 : {
11232 382450 : rtx al = gen_rtx_REG (QImode, AX_REG);
11233 382450 : emit_move_insn (al, callarg2);
11234 382450 : use_reg (&use, al);
11235 : }
11236 :
    /* Settle the final form of the call address: a PLT address computed by
       construct_plt_address for non-local symbols under large PIC, an x32
       GOT slot left as is, or a copy into a word_mode register when the
       address does not satisfy the relevant call operand predicate.  */
11237 6240966 : if (ix86_cmodel == CM_LARGE_PIC
11238 : && !TARGET_PECOFF
11239 45 : && MEM_P (fnaddr)
11240 45 : && SYMBOL_REF_P (XEXP (fnaddr, 0))
11241 6241003 : && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
11242 34 : fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
11243 : /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
11244 : branch via x32 GOT slot is OK. */
11245 6240932 : else if (TARGET_X32
11246 74 : && MEM_P (fnaddr)
11247 74 : && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
11248 8 : && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode)
11249 6240936 : && !TARGET_INDIRECT_BRANCH_REGISTER)
11250 : ;
11251 6240932 : else if (sibcall
11252 6240932 : ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
11253 6112723 : : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
11254 : {
11255 532 : fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
11256 532 : fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
11257 : }
11258 :
11259 : /* PR100665: Hwasan may tag code pointer which is not supported by LAM,
11260 : mask off code pointers here.
11261 : TODO: also need to handle indirect jump. */
11262 6241988 : if (ix86_memtag_can_tag_addresses () && !fndecl
11263 6240990 : && sanitize_flags_p (SANITIZE_HWADDRESS))
11264 : {
11265 24 : rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
11266 : NULL_RTX);
11267 24 : fnaddr = gen_rtx_MEM (QImode, untagged_addr);
11268 : }
11269 :
    /* Build the CALL rtx; VEC collects it and, if present, the stack pop
       so that both can be wrapped in a PARALLEL further down.  */
11270 6240966 : call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
11271 :
11272 6240966 : if (retval)
11273 2462122 : call = gen_rtx_SET (retval, call);
11274 6240966 : vec[vec_len++] = call;
11275 :
11276 6240966 : if (pop)
11277 : {
11278 450736 : pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
11279 225368 : pop = gen_rtx_SET (stack_pointer_rtx, pop);
11280 225368 : vec[vec_len++] = pop;
11281 : }
11282 :
11283 6240966 : static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
11284 :
    /* The current function has no caller-saved registers and the callee is
       either unknown or is neither volatile nor itself marked
       no_caller_saved_registers: clobber the call-used registers.  */
11285 6240966 : if ((cfun->machine->call_saved_registers
11286 6240966 : == TYPE_NO_CALLER_SAVED_REGISTERS)
11287 6240966 : && (!fndecl
11288 468 : || (!TREE_THIS_VOLATILE (fndecl)
11289 186 : && !lookup_attribute ("no_caller_saved_registers",
11290 186 : TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
11291 : {
11292 182 : bool is_64bit_ms_abi = (TARGET_64BIT
11293 182 : && ix86_function_abi (fndecl) == MS_ABI);
11294 182 : char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
11295 :
11296 : /* If there are no caller-saved registers, add all registers
11297 : that are clobbered by the call which returns. */
11298 16926 : for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
11299 16744 : if (!fixed_regs[i]
11300 3242 : && (ix86_call_used_regs[i] == 1
11301 1506 : || (ix86_call_used_regs[i] & c_mask))
11302 2150 : && !STACK_REGNO_P (i)
11303 2150 : && !MMX_REGNO_P (i))
11304 2150 : clobber_reg (&use,
11305 2150 : gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
11306 : }
    /* NOTE(review): CALLARG2 was already read with INTVAL above whenever
       TARGET_64BIT holds, so the !callarg2 test below looks redundant in
       this branch -- confirm before relying on it.  */
11307 5400802 : else if (TARGET_64BIT_MS_ABI
11308 6314197 : && (!callarg2 || INTVAL (callarg2) != -2))
11309 : {
11310 : unsigned i;
11311 :
11312 861848 : for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
11313 : {
11314 795552 : int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
11315 795552 : machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
11316 :
11317 795552 : clobber_reg (&use, gen_rtx_REG (mode, regno));
11318 : }
11319 :
11320 : /* Set here, but it may get cleared later. */
11321 66296 : if (TARGET_CALL_MS2SYSV_XLOGUES)
11322 : {
11323 7046 : if (!TARGET_SSE)
11324 : ;
11325 :
11326 : /* Don't break hot-patched functions. */
11327 7046 : else if (ix86_function_ms_hook_prologue (current_function_decl))
11328 : ;
11329 :
11330 : /* TODO: Cases not yet examined. */
11331 7046 : else if (flag_split_stack)
11332 0 : warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
11333 :
11334 : else
11335 : {
11336 7046 : gcc_assert (!reload_completed);
11337 7046 : cfun->machine->call_ms2sysv = true;
11338 : }
11339 : }
11340 : }
11341 :
11342 6240966 : if (TARGET_MACHO && TARGET_64BIT && !sibcall
11343 : && ((SYMBOL_REF_P (addr) && !SYMBOL_REF_LOCAL_P (addr))
11344 : || !fndecl || TREE_PUBLIC (fndecl)))
11345 : {
11346 : /* We allow public functions defined in a TU to bind locally for PIC
11347 : code (the default) on 64bit Mach-O.
11348 : If such functions are not inlined, we cannot tell at compile-time if
11349 : they will be called via the lazy symbol resolver (this can depend on
11350 : options given at link-time). Therefore, we must assume that the lazy
11351 : resolver could be used which clobbers R11 and R10. */
11352 : clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
11353 : clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
11354 : }
11355 :
11356 6240966 : if (call_no_callee_saved_registers)
11357 : {
11358 : /* After calling a no_callee_saved_registers function, all
11359 : registers may be clobbered. Clobber all registers that are
11360 : not used by the callee. */
11361 61 : bool is_64bit_ms_abi = (TARGET_64BIT
11362 61 : && ix86_function_abi (fndecl) == MS_ABI);
11363 61 : char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
    /* This is the complement of the call-used test used earlier, and it
       additionally leaves the hard frame pointer alone.  */
11364 5673 : for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
11365 5612 : if (!fixed_regs[i]
11366 2691 : && i != HARD_FRAME_POINTER_REGNUM
11367 2630 : && !(ix86_call_used_regs[i] == 1
11368 1007 : || (ix86_call_used_regs[i] & c_mask))
11369 305 : && !STACK_REGNO_P (i)
11370 305 : && !MMX_REGNO_P (i))
11371 305 : clobber_reg (&use,
11372 305 : gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
11373 : }
11374 :
    /* Only wrap in a PARALLEL when a stack pop accompanies the call.  */
11375 6240966 : if (vec_len > 1)
11376 225368 : call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
11377 6240966 : rtx_insn *call_insn = emit_call_insn (call);
11378 6240966 : if (use)
11379 601652 : CALL_INSN_FUNCTION_USAGE (call_insn) = use;
11380 :
11381 6240966 : return call_insn;
11382 : }
11383 :
11384 : /* Split simple return with popping POPC bytes from stack to indirect
11385 : branch with stack adjustment.
   The sequence emitted is: pop the return address into ECX, add POPC to
   the stack pointer, then return indirectly through ECX.  Both
   stack-modifying insns are marked frame related and carry
   REG_CFA_ADJUST_CFA notes; the pop additionally carries a
   REG_CFA_REGISTER note recording that the return address now lives in
   ECX.  The frame state offsets in cfun->machine are updated for the pop.
   Only valid on 32-bit targets (asserted).  */
11386 :
11387 : void
11388 0 : ix86_split_simple_return_pop_internal (rtx popc)
11389 : {
11390 0 : struct machine_function *m = cfun->machine;
11391 0 : rtx ecx = gen_rtx_REG (SImode, CX_REG);
11392 0 : rtx_insn *insn;
11393 :
11394 : /* There is no "pascal" calling convention in any 64bit ABI. */
11395 0 : gcc_assert (!TARGET_64BIT);
11396 :
    /* Pop the return address into ECX and account for the word removed
       from the stack.  */
11397 0 : insn = emit_insn (gen_pop (ecx));
11398 0 : m->fs.cfa_offset -= UNITS_PER_WORD;
11399 0 : m->fs.sp_offset -= UNITS_PER_WORD;
11400 :
11401 0 : rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
11402 0 : x = gen_rtx_SET (stack_pointer_rtx, x);
11403 0 : add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
11404 0 : add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
11405 0 : RTX_FRAME_RELATED_P (insn) = 1;
11406 :
    /* Release the POPC bytes by adjusting the stack pointer.  */
11407 0 : x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
11408 0 : x = gen_rtx_SET (stack_pointer_rtx, x);
11409 0 : insn = emit_insn (x);
11410 0 : add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
11411 0 : RTX_FRAME_RELATED_P (insn) = 1;
11412 :
11413 : /* Now return address is in ECX. */
11414 0 : emit_jump_insn (gen_simple_return_indirect_internal (ecx));
11415 0 : }
11416 :
11417 : /* Errors in the source file can cause expand_expr to return const0_rtx
11418 : where we expect a vector. To avoid crashing, use one of the vector
11419 : clear instructions.
   Returns CONST0_RTX (MODE) when X is const0_rtx and X unchanged
   otherwise.  */
11420 :
11421 : static rtx
11422 197970 : safe_vector_operand (rtx x, machine_mode mode)
11423 : {
11424 0 : if (x == const0_rtx)
11425 0 : x = CONST0_RTX (mode);
11426 24 : return x;
11427 : }
11428 :
11429 : /* Subroutine of ix86_expand_builtin to take care of binop insns.
   ICODE is the insn pattern to generate, EXP the call expression whose
   first two arguments become the inputs, and TARGET a suggested place for
   the result.  A fresh pseudo is used instead of TARGET when optimizing or
   when TARGET is missing, has the wrong mode, or fails the predicate of
   operand 0.  Returns the rtx holding the result, or 0 if the generator
   function produced no pattern.  */
11430 :
11431 : static rtx
11432 8994 : ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
11433 : {
11434 8994 : rtx pat;
11435 8994 : tree arg0 = CALL_EXPR_ARG (exp, 0);
11436 8994 : tree arg1 = CALL_EXPR_ARG (exp, 1);
11437 8994 : rtx op0 = expand_normal (arg0);
11438 8994 : rtx op1 = expand_normal (arg1);
11439 8994 : machine_mode tmode = insn_data[icode].operand[0].mode;
11440 8994 : machine_mode mode0 = insn_data[icode].operand[1].mode;
11441 8994 : machine_mode mode1 = insn_data[icode].operand[2].mode;
11442 :
11443 8994 : if (VECTOR_MODE_P (mode0))
11444 8983 : op0 = safe_vector_operand (op0, mode0);
11445 8994 : if (VECTOR_MODE_P (mode1))
11446 8847 : op1 = safe_vector_operand (op1, mode1);
11447 :
11448 2848 : if (optimize || !target
11449 2848 : || GET_MODE (target) != tmode
11450 11842 : || !insn_data[icode].operand[0].predicate (target, tmode))
11451 6199 : target = gen_reg_rtx (tmode);
11452 :
    /* An SImode second argument for a TImode operand is first loaded into
       a V4SImode register and then viewed as TImode.  */
11453 8994 : if (GET_MODE (op1) == SImode && mode1 == TImode)
11454 : {
11455 0 : rtx x = gen_reg_rtx (V4SImode);
11456 0 : emit_insn (gen_sse2_loadd (x, op1));
11457 0 : op1 = gen_lowpart (TImode, x);
11458 : }
11459 :
    /* Copy any input that its operand predicate rejects into a register.  */
11460 8994 : if (!insn_data[icode].operand[1].predicate (op0, mode0))
11461 1405 : op0 = copy_to_mode_reg (mode0, op0);
11462 8994 : if (!insn_data[icode].operand[2].predicate (op1, mode1))
11463 817 : op1 = copy_to_mode_reg (mode1, op1);
11464 :
11465 8994 : pat = GEN_FCN (icode) (target, op0, op1);
11466 8994 : if (! pat)
11467 : return 0;
11468 :
11469 8994 : emit_insn (pat);
11470 :
11471 8994 : return target;
11472 : }
11473 :
11474 : /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.
   (The MULTI_ARG_1_* types handled below take a single argument.)
   ICODE is the insn pattern, EXP the call expression supplying the
   arguments, and TARGET a suggested place for the result.  M_TYPE selects
   the number of arguments and the flavour of the builtin: whether the last
   argument must be a constant, whether the pattern takes an extra
   comparison rtx built from SUB_CODE, or whether SUB_CODE is passed to the
   pattern as an integer operand (the *_TF types).
   Returns the rtx holding the result, or 0 if the generator function
   produced no pattern.  After reporting an invalid immediate, a fresh
   register is returned without emitting the insn.  */
11475 :
11476 : static rtx
11477 1815 : ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
11478 : enum ix86_builtin_func_type m_type,
11479 : enum rtx_code sub_code)
11480 : {
11481 1815 : rtx pat;
11482 1815 : unsigned int i, nargs;
11483 1815 : bool comparison_p = false;
11484 1815 : bool tf_p = false;
11485 1815 : bool last_arg_constant = false;
11486 1815 : int num_memory = 0;
11487 1815 : rtx xops[4];
11488 :
11489 1815 : machine_mode tmode = insn_data[icode].operand[0].mode;
11490 :
    /* Classify the builtin: argument count plus the flags set above.  */
11491 1815 : switch (m_type)
11492 : {
11493 : case MULTI_ARG_4_DF2_DI_I:
11494 : case MULTI_ARG_4_DF2_DI_I1:
11495 : case MULTI_ARG_4_SF2_SI_I:
11496 : case MULTI_ARG_4_SF2_SI_I1:
11497 : nargs = 4;
11498 : last_arg_constant = true;
11499 : break;
11500 :
11501 844 : case MULTI_ARG_3_SF:
11502 844 : case MULTI_ARG_3_DF:
11503 844 : case MULTI_ARG_3_SF2:
11504 844 : case MULTI_ARG_3_DF2:
11505 844 : case MULTI_ARG_3_DI:
11506 844 : case MULTI_ARG_3_SI:
11507 844 : case MULTI_ARG_3_SI_DI:
11508 844 : case MULTI_ARG_3_HI:
11509 844 : case MULTI_ARG_3_HI_SI:
11510 844 : case MULTI_ARG_3_QI:
11511 844 : case MULTI_ARG_3_DI2:
11512 844 : case MULTI_ARG_3_SI2:
11513 844 : case MULTI_ARG_3_HI2:
11514 844 : case MULTI_ARG_3_QI2:
11515 844 : nargs = 3;
11516 844 : break;
11517 :
11518 128 : case MULTI_ARG_2_SF:
11519 128 : case MULTI_ARG_2_DF:
11520 128 : case MULTI_ARG_2_DI:
11521 128 : case MULTI_ARG_2_SI:
11522 128 : case MULTI_ARG_2_HI:
11523 128 : case MULTI_ARG_2_QI:
11524 128 : nargs = 2;
11525 128 : break;
11526 :
11527 64 : case MULTI_ARG_2_DI_IMM:
11528 64 : case MULTI_ARG_2_SI_IMM:
11529 64 : case MULTI_ARG_2_HI_IMM:
11530 64 : case MULTI_ARG_2_QI_IMM:
11531 64 : nargs = 2;
11532 64 : last_arg_constant = true;
11533 64 : break;
11534 :
11535 187 : case MULTI_ARG_1_SF:
11536 187 : case MULTI_ARG_1_DF:
11537 187 : case MULTI_ARG_1_SF2:
11538 187 : case MULTI_ARG_1_DF2:
11539 187 : case MULTI_ARG_1_DI:
11540 187 : case MULTI_ARG_1_SI:
11541 187 : case MULTI_ARG_1_HI:
11542 187 : case MULTI_ARG_1_QI:
11543 187 : case MULTI_ARG_1_SI_DI:
11544 187 : case MULTI_ARG_1_HI_DI:
11545 187 : case MULTI_ARG_1_HI_SI:
11546 187 : case MULTI_ARG_1_QI_DI:
11547 187 : case MULTI_ARG_1_QI_SI:
11548 187 : case MULTI_ARG_1_QI_HI:
11549 187 : nargs = 1;
11550 187 : break;
11551 :
11552 384 : case MULTI_ARG_2_DI_CMP:
11553 384 : case MULTI_ARG_2_SI_CMP:
11554 384 : case MULTI_ARG_2_HI_CMP:
11555 384 : case MULTI_ARG_2_QI_CMP:
11556 384 : nargs = 2;
11557 384 : comparison_p = true;
11558 384 : break;
11559 :
11560 128 : case MULTI_ARG_2_SF_TF:
11561 128 : case MULTI_ARG_2_DF_TF:
11562 128 : case MULTI_ARG_2_DI_TF:
11563 128 : case MULTI_ARG_2_SI_TF:
11564 128 : case MULTI_ARG_2_HI_TF:
11565 128 : case MULTI_ARG_2_QI_TF:
11566 128 : nargs = 2;
11567 128 : tf_p = true;
11568 128 : break;
11569 :
11570 0 : default:
11571 0 : gcc_unreachable ();
11572 : }
11573 :
    /* Use a fresh pseudo unless TARGET is usable as is; a usable memory
       TARGET counts towards the one-memory-operand limit applied below.  */
11574 628 : if (optimize || !target
11575 628 : || GET_MODE (target) != tmode
11576 2419 : || !insn_data[icode].operand[0].predicate (target, tmode))
11577 1211 : target = gen_reg_rtx (tmode);
11578 604 : else if (memory_operand (target, tmode))
11579 0 : num_memory++;
11580 :
11581 1815 : gcc_assert (nargs <= ARRAY_SIZE (xops));
11582 :
11583 6254 : for (i = 0; i < nargs; i++)
11584 : {
11585 4447 : tree arg = CALL_EXPR_ARG (exp, i);
11586 4447 : rtx op = expand_normal (arg);
    /* Comparison patterns have the comparison rtx as operand 1, so the
       argument operands start one slot later.  */
11587 4447 : int adjust = (comparison_p) ? 1 : 0;
11588 4447 : machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;
11589 :
11590 4447 : if (last_arg_constant && i == nargs - 1)
11591 : {
11592 144 : if (!insn_data[icode].operand[i + 1].predicate (op, mode))
11593 : {
    /* The immediate was rejected.  vpermil2 reports an error; the XOP
       rotates either wrap a constant count to the element width or
       fall back to the generic rotate pattern for a variable count.  */
11594 30 : enum insn_code new_icode = icode;
11595 30 : switch (icode)
11596 : {
11597 8 : case CODE_FOR_xop_vpermil2v2df3:
11598 8 : case CODE_FOR_xop_vpermil2v4sf3:
11599 8 : case CODE_FOR_xop_vpermil2v4df3:
11600 8 : case CODE_FOR_xop_vpermil2v8sf3:
11601 8 : error ("the last argument must be a 2-bit immediate");
11602 8 : return gen_reg_rtx (tmode);
11603 5 : case CODE_FOR_xop_rotlv2di3:
11604 5 : new_icode = CODE_FOR_rotlv2di3;
11605 5 : goto xop_rotl;
11606 5 : case CODE_FOR_xop_rotlv4si3:
11607 5 : new_icode = CODE_FOR_rotlv4si3;
11608 5 : goto xop_rotl;
11609 6 : case CODE_FOR_xop_rotlv8hi3:
11610 6 : new_icode = CODE_FOR_rotlv8hi3;
11611 6 : goto xop_rotl;
11612 : case CODE_FOR_xop_rotlv16qi3:
11613 : new_icode = CODE_FOR_rotlv16qi3;
11614 22 : xop_rotl:
11615 22 : if (CONST_INT_P (op))
11616 : {
11617 6 : int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
11618 6 : op = GEN_INT (INTVAL (op) & mask);
11619 6 : gcc_checking_assert
11620 : (insn_data[icode].operand[i + 1].predicate (op, mode));
11621 : }
11622 : else
11623 : {
11624 16 : gcc_checking_assert
11625 : (nargs == 2
11626 : && insn_data[new_icode].operand[0].mode == tmode
11627 : && insn_data[new_icode].operand[1].mode == tmode
11628 : && insn_data[new_icode].operand[2].mode == mode
11629 : && insn_data[new_icode].operand[0].predicate
11630 : == insn_data[icode].operand[0].predicate
11631 : && insn_data[new_icode].operand[1].predicate
11632 : == insn_data[icode].operand[1].predicate);
11633 16 : icode = new_icode;
11634 16 : goto non_constant;
11635 : }
11636 : break;
11637 0 : default:
11638 0 : gcc_unreachable ();
11639 : }
11640 : }
11641 : }
11642 : else
11643 : {
11644 4303 : non_constant:
11645 4319 : if (VECTOR_MODE_P (mode))
11646 4303 : op = safe_vector_operand (op, mode);
11647 :
11648 : /* If we aren't optimizing, only allow one memory operand to be
11649 : generated. */
11650 4319 : if (memory_operand (op, mode))
11651 826 : num_memory++;
11652 :
11653 4319 : gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);
11654 :
11655 4319 : if (optimize
11656 1506 : || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
11657 5747 : || num_memory > 1)
11658 3398 : op = force_reg (mode, op);
11659 : }
11660 :
11661 4439 : xops[i] = op;
11662 : }
11663 :
    /* Invoke the generator with the operand list matching NARGS and the
       builtin flavour.  */
11664 1807 : switch (nargs)
11665 : {
11666 187 : case 1:
11667 187 : pat = GEN_FCN (icode) (target, xops[0]);
11668 187 : break;
11669 :
11670 704 : case 2:
11671 704 : if (tf_p)
11672 128 : pat = GEN_FCN (icode) (target, xops[0], xops[1],
11673 128 : GEN_INT ((int)sub_code));
11674 576 : else if (! comparison_p)
11675 192 : pat = GEN_FCN (icode) (target, xops[0], xops[1]);
11676 : else
11677 : {
11678 384 : rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
11679 : xops[0], xops[1]);
11680 :
11681 384 : pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
11682 : }
11683 : break;
11684 :
11685 844 : case 3:
11686 844 : pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
11687 844 : break;
11688 :
11689 72 : case 4:
11690 72 : pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
11691 72 : break;
11692 :
11693 : default:
11694 : gcc_unreachable ();
11695 : }
11696 :
11697 1807 : if (! pat)
11698 : return 0;
11699 :
11700 1807 : emit_insn (pat);
11701 1807 : return target;
11702 : }
11703 :
11704 : /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
11705 : insns with vec_merge.
   The single argument of EXP is supplied to the pattern twice, as
   operand 1 and as operand 2; each use is copied to a register on its own
   if the corresponding operand predicate rejects it.  TARGET is a
   suggested place for the result.  Returns the rtx holding the result, or
   0 if the generator function produced no pattern.  */
11706 :
11707 : static rtx
11708 52 : ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
11709 : rtx target)
11710 : {
11711 52 : rtx pat;
11712 52 : tree arg0 = CALL_EXPR_ARG (exp, 0);
11713 52 : rtx op1, op0 = expand_normal (arg0);
11714 52 : machine_mode tmode = insn_data[icode].operand[0].mode;
11715 52 : machine_mode mode0 = insn_data[icode].operand[1].mode;
11716 :
11717 16 : if (optimize || !target
11718 16 : || GET_MODE (target) != tmode
11719 68 : || !insn_data[icode].operand[0].predicate (target, tmode))
11720 36 : target = gen_reg_rtx (tmode);
11721 :
11722 52 : if (VECTOR_MODE_P (mode0))
11723 52 : op0 = safe_vector_operand (op0, mode0);
11724 :
11725 36 : if ((optimize && !register_operand (op0, mode0))
11726 88 : || !insn_data[icode].operand[1].predicate (op0, mode0))
11727 0 : op0 = copy_to_mode_reg (mode0, op0);
11728 :
    /* The same value also feeds operand 2, checked in MODE0.  */
11729 52 : op1 = op0;
11730 52 : if (!insn_data[icode].operand[2].predicate (op1, mode0))
11731 16 : op1 = copy_to_mode_reg (mode0, op1);
11732 :
11733 52 : pat = GEN_FCN (icode) (target, op0, op1);
11734 52 : if (! pat)
11735 : return 0;
11736 52 : emit_insn (pat);
11737 52 : return target;
11738 : }
11739 :
11740 : /* Subroutine of ix86_expand_builtin to take care of comparison insns.
   D describes the builtin (insn code and comparison code), EXP supplies
   the two arguments, and TARGET is a suggested place for the result.  When
   SWAP is true the two inputs are exchanged before the comparison rtx is
   built.  The pattern receives the inputs followed by a comparison rtx
   of code D->comparison built over them in the mode of operand 1.
   Returns the rtx holding the result, or 0 if the generator function
   produced no pattern.  */
11741 :
11742 : static rtx
11743 614 : ix86_expand_sse_compare (const struct builtin_description *d,
11744 : tree exp, rtx target, bool swap)
11745 : {
11746 614 : rtx pat;
11747 614 : tree arg0 = CALL_EXPR_ARG (exp, 0);
11748 614 : tree arg1 = CALL_EXPR_ARG (exp, 1);
11749 614 : rtx op0 = expand_normal (arg0);
11750 614 : rtx op1 = expand_normal (arg1);
11751 614 : rtx op2;
11752 614 : machine_mode tmode = insn_data[d->icode].operand[0].mode;
11753 614 : machine_mode mode0 = insn_data[d->icode].operand[1].mode;
11754 614 : machine_mode mode1 = insn_data[d->icode].operand[2].mode;
11755 614 : enum rtx_code comparison = d->comparison;
11756 :
11757 614 : if (VECTOR_MODE_P (mode0))
11758 614 : op0 = safe_vector_operand (op0, mode0);
11759 614 : if (VECTOR_MODE_P (mode1))
11760 614 : op1 = safe_vector_operand (op1, mode1);
11761 :
11762 : /* Swap operands if we have a comparison that isn't available in
11763 : hardware. */
11764 614 : if (swap)
11765 80 : std::swap (op0, op1);
11766 :
11767 202 : if (optimize || !target
11768 202 : || GET_MODE (target) != tmode
11769 816 : || !insn_data[d->icode].operand[0].predicate (target, tmode))
11770 412 : target = gen_reg_rtx (tmode);
11771 :
    /* When optimizing, insist on register inputs; otherwise only copy an
       input that its operand predicate rejects.  */
11772 412 : if ((optimize && !register_operand (op0, mode0))
11773 956 : || !insn_data[d->icode].operand[1].predicate (op0, mode0))
11774 272 : op0 = copy_to_mode_reg (mode0, op0);
11775 412 : if ((optimize && !register_operand (op1, mode1))
11776 972 : || !insn_data[d->icode].operand[2].predicate (op1, mode1))
11777 54 : op1 = copy_to_mode_reg (mode1, op1);
11778 :
11779 614 : op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
11780 614 : pat = GEN_FCN (d->icode) (target, op0, op1, op2);
11781 614 : if (! pat)
11782 : return 0;
11783 614 : emit_insn (pat);
11784 614 : return target;
11785 : }
11786 :
11787 : /* Subroutine of ix86_sse_comi and ix86_sse_comi_round to take care of
11788 : * ordered EQ or unordered NE, generate PF jump.
   Emits the insn that stores the result of COMPARISON on the flags into
   the low part of TARGET, which must be a SUBREG; the SUBREG_REG of TARGET
   is returned.  SET_DST is the flags destination of the preceding compare
   insn; if its mode differs from MODE, a FLAGS_REG rtx in MODE is tested
   instead.  When CHECK_UNORDERED is true (only allowed for EQ and NE), a
   jump taken on an unordered result skips the store, so TARGET keeps the
   value the caller placed in it beforehand.  */
11789 :
11790 : static rtx
11791 646 : ix86_ssecom_setcc (const enum rtx_code comparison,
11792 : bool check_unordered, machine_mode mode,
11793 : rtx set_dst, rtx target)
11794 : {
11795 :
11796 646 : rtx_code_label *label = NULL;
11797 :
11798 : /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
11799 : with NAN operands.
11800 : Under TARGET_AVX10_2, VCOMX/VUCOMX are generated instead of
11801 : COMI/UCOMI. VCOMX/VUCOMX will not set ZF for NAN operands. */
11802 646 : if (check_unordered)
11803 : {
11804 122 : gcc_assert (comparison == EQ || comparison == NE);
11805 :
11806 122 : rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
11807 122 : label = gen_label_rtx ();
11808 122 : rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
11809 122 : tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
11810 : gen_rtx_LABEL_REF (VOIDmode, label),
11811 : pc_rtx);
11812 122 : emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
11813 : }
11814 :
11815 : /* NB: Set CCFPmode and check a different CCmode which is in subset
11816 : of CCFPmode. */
11817 646 : if (GET_MODE (set_dst) != mode)
11818 : {
11819 200 : gcc_assert (mode == CCAmode || mode == CCCmode
11820 : || mode == CCOmode || mode == CCPmode
11821 : || mode == CCSmode || mode == CCZmode);
11822 200 : set_dst = gen_rtx_REG (mode, FLAGS_REG);
11823 : }
11824 :
11825 646 : emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
11826 : gen_rtx_fmt_ee (comparison, QImode,
11827 : set_dst,
11828 : const0_rtx)));
11829 :
    /* Landing point of the unordered jump emitted above.  */
11830 646 : if (label)
11831 122 : emit_label (label);
11832 :
11833 646 : return SUBREG_REG (target);
11834 : }
11835 :
11836 : /* Subroutine of ix86_expand_builtin to take care of comi insns.
   D describes the builtin (insn code and comparison code) and EXP supplies
   the two arguments.  The incoming TARGET is not used: the result is
   always built in a fresh SImode pseudo, which is preset to 0 (1 for NE)
   and then has its low byte written by ix86_ssecom_setcc.  LE and LT are
   handled by swapping the inputs and the condition.  For EQ and NE the
   flags are tested in CCZmode, and either an unordered check is requested
   or, when TARGET_AVX10_2 and COMX_OK both hold, the insn is replaced by
   the matching avx10_2 comx/ucomx pattern.  Returns the SImode result
   register, or 0 if the generator function produced no pattern.  */
11837 :
11838 : static rtx
11839 547 : ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
11840 : rtx target, bool comx_ok)
11841 : {
11842 547 : rtx pat, set_dst;
11843 547 : tree arg0 = CALL_EXPR_ARG (exp, 0);
11844 547 : tree arg1 = CALL_EXPR_ARG (exp, 1);
11845 547 : rtx op0 = expand_normal (arg0);
11846 547 : rtx op1 = expand_normal (arg1);
11847 547 : enum insn_code icode = d->icode;
11848 547 : const struct insn_data_d *insn_p = &insn_data[icode];
    /* The compare pattern has no output operand, so operands 0 and 1 are
       the two inputs.  */
11849 547 : machine_mode mode0 = insn_p->operand[0].mode;
11850 547 : machine_mode mode1 = insn_p->operand[1].mode;
11851 :
11852 547 : if (VECTOR_MODE_P (mode0))
11853 547 : op0 = safe_vector_operand (op0, mode0);
11854 547 : if (VECTOR_MODE_P (mode1))
11855 547 : op1 = safe_vector_operand (op1, mode1);
11856 :
11857 547 : enum rtx_code comparison = d->comparison;
11858 547 : rtx const_val = const0_rtx;
11859 :
11860 547 : bool check_unordered = false;
11861 547 : machine_mode mode = CCFPmode;
11862 547 : switch (comparison)
11863 : {
11864 194 : case LE: /* -> GE */
11865 194 : case LT: /* -> GT */
11866 194 : std::swap (op0, op1);
11867 194 : comparison = swap_condition (comparison);
11868 : /* FALLTHRU */
11869 : case GT:
11870 : case GE:
11871 : break;
11872 73 : case EQ:
11873 73 : if (!TARGET_AVX10_2 || !comx_ok)
11874 45 : check_unordered = true;
11875 : mode = CCZmode;
11876 : break;
11877 96 : case NE:
11878 96 : if (!TARGET_AVX10_2 || !comx_ok)
11879 68 : check_unordered = true;
11880 96 : mode = CCZmode;
11881 96 : const_val = const1_rtx;
11882 96 : break;
11883 0 : default:
11884 0 : gcc_unreachable ();
11885 : }
11886 :
    /* Preset the full SImode result, then expose its low byte for the
       strict_low_part store done by ix86_ssecom_setcc.  */
11887 547 : target = gen_reg_rtx (SImode);
11888 547 : emit_move_insn (target, const_val);
11889 547 : target = gen_rtx_SUBREG (QImode, target, 0);
11890 :
11891 426 : if ((optimize && !register_operand (op0, mode0))
11892 925 : || !insn_p->operand[0].predicate (op0, mode0))
11893 169 : op0 = copy_to_mode_reg (mode0, op0);
11894 426 : if ((optimize && !register_operand (op1, mode1))
11895 924 : || !insn_p->operand[1].predicate (op1, mode1))
11896 49 : op1 = copy_to_mode_reg (mode1, op1);
11897 :
11898 547 : if ((comparison == EQ || comparison == NE)
11899 169 : && TARGET_AVX10_2 && comx_ok)
11900 : {
11901 56 : switch (icode)
11902 : {
11903 : case CODE_FOR_sse_comi:
11904 : icode = CODE_FOR_avx10_2_comxsf;
11905 : break;
11906 14 : case CODE_FOR_sse_ucomi:
11907 14 : icode = CODE_FOR_avx10_2_ucomxsf;
11908 14 : break;
11909 14 : case CODE_FOR_sse2_comi:
11910 14 : icode = CODE_FOR_avx10_2_comxdf;
11911 14 : break;
11912 14 : case CODE_FOR_sse2_ucomi:
11913 14 : icode = CODE_FOR_avx10_2_ucomxdf;
11914 14 : break;
11915 :
11916 0 : default:
11917 0 : gcc_unreachable ();
11918 : }
11919 : }
11920 547 : pat = GEN_FCN (icode) (op0, op1);
11921 547 : if (! pat)
11922 : return 0;
11923 :
11924 547 : set_dst = SET_DEST (pat);
11925 547 : emit_insn (pat);
11926 547 : return ix86_ssecom_setcc (comparison, check_unordered, mode,
11927 547 : set_dst, target);
11928 : }
11929 :
11930 : /* Subroutines of ix86_expand_args_builtin to take care of round insns.
   ix86_expand_sse_round expands a one-argument builtin: the argument of
   EXP becomes operand 1 and D->comparison is passed to the pattern as an
   integer constant in operand 2 (presumably the rounding immediate --
   confirm against the builtin table).  TARGET is a suggested place for the
   result.  Returns the rtx holding the result, or 0 if the generator
   function produced no pattern.  */
11931 :
11932 : static rtx
11933 0 : ix86_expand_sse_round (const struct builtin_description *d, tree exp,
11934 : rtx target)
11935 : {
11936 0 : rtx pat;
11937 0 : tree arg0 = CALL_EXPR_ARG (exp, 0);
11938 0 : rtx op1, op0 = expand_normal (arg0);
11939 0 : machine_mode tmode = insn_data[d->icode].operand[0].mode;
11940 0 : machine_mode mode0 = insn_data[d->icode].operand[1].mode;
11941 :
11942 0 : if (optimize || target == 0
11943 0 : || GET_MODE (target) != tmode
11944 0 : || !insn_data[d->icode].operand[0].predicate (target, tmode))
11945 0 : target = gen_reg_rtx (tmode);
11946 :
11947 0 : if (VECTOR_MODE_P (mode0))
11948 0 : op0 = safe_vector_operand (op0, mode0);
11949 :
    /* NOTE(review): OP0 is the input for operand 1 (MODE0 is read from
       operand[1]), yet it is validated with the predicate of operand[0].
       The sibling expanders in this file use operand[1].predicate here;
       confirm whether the index is intentional.  */
11950 0 : if ((optimize && !register_operand (op0, mode0))
11951 0 : || !insn_data[d->icode].operand[0].predicate (op0, mode0))
11952 0 : op0 = copy_to_mode_reg (mode0, op0);
11953 :
11954 0 : op1 = GEN_INT (d->comparison);
11955 :
11956 0 : pat = GEN_FCN (d->icode) (target, op0, op1);
11957 0 : if (! pat)
11958 : return 0;
11959 0 : emit_insn (pat);
11960 0 : return target;
11961 : }
11962 :
/* Two-argument counterpart of ix86_expand_sse_round: the two arguments of
   EXP become operands 1 and 2 and D->comparison is passed to the pattern
   as an integer constant in operand 3.  Both inputs go through
   safe_vector_operand unconditionally.  TARGET is a suggested place for
   the result.  Returns the rtx holding the result, or 0 if the generator
   function produced no pattern.  */
11963 : static rtx
11964 12 : ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
11965 : tree exp, rtx target)
11966 : {
11967 12 : rtx pat;
11968 12 : tree arg0 = CALL_EXPR_ARG (exp, 0);
11969 12 : tree arg1 = CALL_EXPR_ARG (exp, 1);
11970 12 : rtx op0 = expand_normal (arg0);
11971 12 : rtx op1 = expand_normal (arg1);
11972 12 : rtx op2;
11973 12 : machine_mode tmode = insn_data[d->icode].operand[0].mode;
11974 12 : machine_mode mode0 = insn_data[d->icode].operand[1].mode;
11975 12 : machine_mode mode1 = insn_data[d->icode].operand[2].mode;
11976 :
11977 0 : if (optimize || target == 0
11978 0 : || GET_MODE (target) != tmode
11979 12 : || !insn_data[d->icode].operand[0].predicate (target, tmode))
11980 12 : target = gen_reg_rtx (tmode);
11981 :
11982 12 : op0 = safe_vector_operand (op0, mode0);
11983 12 : op1 = safe_vector_operand (op1, mode1);
11984 :
    /* NOTE(review): MODE0 and MODE1 are read from operand[1] and
       operand[2], but the inputs are validated with the predicates of
       operand[0] and operand[1].  The indices look shifted by one
       compared with ix86_expand_binop_builtin; confirm whether this is
       intentional.  */
11985 12 : if ((optimize && !register_operand (op0, mode0))
11986 12 : || !insn_data[d->icode].operand[0].predicate (op0, mode0))
11987 12 : op0 = copy_to_mode_reg (mode0, op0);
11988 12 : if ((optimize && !register_operand (op1, mode1))
11989 12 : || !insn_data[d->icode].operand[1].predicate (op1, mode1))
11990 12 : op1 = copy_to_mode_reg (mode1, op1);
11991 :
11992 12 : op2 = GEN_INT (d->comparison);
11993 :
11994 12 : pat = GEN_FCN (d->icode) (target, op0, op1, op2);
11995 12 : if (! pat)
11996 : return 0;
11997 12 : emit_insn (pat);
11998 12 : return target;
11999 : }
12000 :
12001 : /* Subroutine of ix86_expand_builtin to take care of ptest insns.
   D describes the builtin (insn code, builtin code and comparison code)
   and EXP supplies the two arguments.  For the PTESTZ, PTESTC and PTESTNZC
   builtins some operand combinations (all-zeros, all-ones, identical
   operands) have a known answer; in that case the constant is moved into
   TARGET (or a fresh SImode pseudo when TARGET is null) and no ptest is
   emitted.  Otherwise the ptest insn is emitted and the result of
   D->comparison on its flags destination is stored into the low byte of a
   fresh, zero-initialized SImode pseudo, which is returned.  Returns 0 if
   the generator function produced no pattern.  */
12002 :
12003 : static rtx
12004 239 : ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
12005 : rtx target)
12006 : {
12007 239 : rtx pat;
12008 239 : tree arg0 = CALL_EXPR_ARG (exp, 0);
12009 239 : tree arg1 = CALL_EXPR_ARG (exp, 1);
12010 239 : rtx op0 = expand_normal (arg0);
12011 239 : rtx op1 = expand_normal (arg1);
12012 239 : machine_mode mode0 = insn_data[d->icode].operand[0].mode;
12013 239 : machine_mode mode1 = insn_data[d->icode].operand[1].mode;
12014 239 : enum rtx_code comparison = d->comparison;
12015 239 : rtx result = NULL_RTX;
12016 :
12017 239 : if (VECTOR_MODE_P (mode0))
12018 239 : op0 = safe_vector_operand (op0, mode0);
12019 239 : if (VECTOR_MODE_P (mode1))
12020 239 : op1 = safe_vector_operand (op1, mode1);
12021 :
12022 239 : switch (d->code)
12023 : {
12024 49 : case IX86_BUILTIN_PTESTZ:
12025 49 : case IX86_BUILTIN_PTESTZ256:
12026 : // Returns (OP0 & OP1) == 0
    /* A zero operand gives 1.  An all-ones operand reduces the test to
       the other operand tested against itself.  */
12027 49 : if (rtx_equal_p (op0, CONST0_RTX (mode0))
12028 49 : || rtx_equal_p (op1, CONST0_RTX (mode1)))
12029 2 : result = const1_rtx;
12030 47 : else if (rtx_equal_p (op0, CONSTM1_RTX (mode0)))
12031 : {
12032 1 : op1 = force_reg (mode1, op1);
12033 1 : op0 = op1;
12034 : }
12035 46 : else if (rtx_equal_p (op1, CONSTM1_RTX (mode1)))
12036 : {
12037 1 : op0 = force_reg (mode0, op0);
12038 1 : op1 = op0;
12039 : }
12040 45 : else if (MEM_P (op0) && !MEM_P (op1))
12041 : std::swap (op0, op1);
12042 : break;
12043 :
12044 31 : case IX86_BUILTIN_PTESTC:
12045 31 : case IX86_BUILTIN_PTESTC256:
12046 : // Returns (~OP0 & OP1) == 0
12047 31 : if (rtx_equal_p (op0, CONSTM1_RTX (mode0))
12048 31 : || rtx_equal_p (op1, CONST0_RTX (mode1))
12049 62 : || rtx_equal_p (op0, op1))
12050 2 : result = const1_rtx;
12051 : break;
12052 :
12053 27 : case IX86_BUILTIN_PTESTNZC:
12054 27 : case IX86_BUILTIN_PTESTNZC256:
12055 : // Returns ((OP0 & OP1) != 0) && ((~OP0 & OP1) != 0)
12056 27 : if (rtx_equal_p (op0, CONST0_RTX (mode0))
12057 26 : || rtx_equal_p (op0, CONSTM1_RTX (mode0))
12058 26 : || rtx_equal_p (op1, CONST0_RTX (mode1))
12059 53 : || rtx_equal_p (op0, op1))
12060 1 : result = const0_rtx;
12061 : break;
12062 :
12063 : default:
12064 : break;
12065 : }
12066 :
    /* Besides the usual predicate checks, both inputs are also copied to
       registers whenever the answer was folded to a constant above.  */
12067 167 : if ((optimize && !register_operand (op0, mode0))
12068 210 : || !insn_data[d->icode].operand[0].predicate (op0, mode0)
12069 377 : || result)
12070 104 : op0 = copy_to_mode_reg (mode0, op0);
12071 167 : if ((optimize && !register_operand (op1, mode1))
12072 211 : || !insn_data[d->icode].operand[1].predicate (op1, mode1)
12073 450 : || result)
12074 31 : op1 = copy_to_mode_reg (mode1, op1);
12075 :
12076 239 : if (result)
12077 : {
12078 5 : if (!target)
12079 0 : target = gen_reg_rtx (SImode);
12080 5 : emit_move_insn (target, result);
12081 5 : return target;
12082 : }
12083 :
    /* General case: zero a fresh SImode pseudo and write only its low
       byte from the flags set by the ptest insn.  */
12084 234 : target = gen_reg_rtx (SImode);
12085 234 : emit_move_insn (target, const0_rtx);
12086 234 : target = gen_rtx_SUBREG (QImode, target, 0);
12087 :
12088 234 : pat = GEN_FCN (d->icode) (op0, op1);
12089 234 : if (! pat)
12090 : return 0;
12091 234 : emit_insn (pat);
12092 234 : emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
12093 : gen_rtx_fmt_ee (comparison, QImode,
12094 : SET_DEST (pat),
12095 : const0_rtx)));
12096 :
12097 234 : return SUBREG_REG (target);
12098 : }
12099 :
/* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns.

   D describes the builtin (insn code, builtin code and, for the
   flag-returning variants, the mode of the flags register in D->flag).
   EXP is the call expression; its five arguments map onto insn operands
   2 through 6.  TARGET is a suggested destination and is reused only
   when not optimizing and it has the right mode and satisfies the
   output operand's predicate.

   Returns the result rtx, const0_rtx after reporting an error for a
   non-immediate fifth argument, or 0 if the pattern could not be
   generated.  */

static rtx
ix86_expand_sse_pcmpestr (const struct builtin_description *d,
                          tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  tree arg4 = CALL_EXPR_ARG (exp, 4);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  rtx op4 = expand_normal (arg4);
  machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;

  /* Operands 0 and 1 are the insn's two outputs; operands 2-6 are the
     inputs taken from the call arguments.  */
  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modei3 = insn_data[d->icode].operand[3].mode;
  modev4 = insn_data[d->icode].operand[4].mode;
  modei5 = insn_data[d->icode].operand[5].mode;
  modeimm = insn_data[d->icode].operand[6].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev4))
    op2 = safe_vector_operand (op2, modev4);

  /* Copy any input the insn's predicate rejects into a register.  The
     second vector input is additionally forced into a register when
     optimizing.  */
  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
    op1 = copy_to_mode_reg (modei3, op1);
  if ((optimize && !register_operand (op2, modev4))
      || !insn_data[d->icode].operand[4].predicate (op2, modev4))
    op2 = copy_to_mode_reg (modev4, op2);
  if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
    op3 = copy_to_mode_reg (modei5, op3);

  /* The last argument cannot be fixed up by copying to a register;
     diagnose it instead.  */
  if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
    {
      error ("the fifth argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPESTRI128)
    {
      /* The result is output operand 0; operand 1 is a scratch.  */
      if (optimize || !target
          || GET_MODE (target) != tmode0
          || !insn_data[d->icode].operand[0].predicate (target, tmode0))
        target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
    }
  else if (d->code == IX86_BUILTIN_PCMPESTRM128)
    {
      /* The result is output operand 1; operand 0 is a scratch.  */
      if (optimize || !target
          || GET_MODE (target) != tmode1
          || !insn_data[d->icode].operand[1].predicate (target, tmode1))
        target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
    }
  else
    {
      /* Every other builtin handled here returns a flag, so D->flag
         must be set; both insn outputs are scratches.  */
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      /* Zero a fresh SImode pseudo and write only its low byte with the
         EQ comparison of the flags register (in the mode given by
         D->flag) against zero.  */
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
        (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
                      gen_rtx_fmt_ee (EQ, QImode,
                                      gen_rtx_REG ((machine_mode) d->flag,
                                                   FLAGS_REG),
                                      const0_rtx)));
      /* Hand back the full SImode register, not the QImode subreg.  */
      return SUBREG_REG (target);
    }
  else
    return target;
}
12203 :
12204 :
/* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns.

   D describes the builtin (insn code, builtin code and, for the
   flag-returning variants, the mode of the flags register in D->flag).
   EXP is the call expression; its three arguments map onto insn
   operands 2 through 4.  TARGET is a suggested destination and is
   reused only when not optimizing and it has the right mode and
   satisfies the output operand's predicate.

   Returns the result rtx, const0_rtx after reporting an error for a
   non-immediate third argument, or 0 if the pattern could not be
   generated.  */

static rtx
ix86_expand_sse_pcmpistr (const struct builtin_description *d,
                          tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  machine_mode tmode0, tmode1, modev2, modev3, modeimm;

  /* Operands 0 and 1 are the insn's two outputs; operands 2-4 are the
     inputs taken from the call arguments.  */
  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modev3 = insn_data[d->icode].operand[3].mode;
  modeimm = insn_data[d->icode].operand[4].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev3))
    op1 = safe_vector_operand (op1, modev3);

  /* Copy any input the insn's predicate rejects into a register.  The
     second vector input is additionally forced into a register when
     optimizing.  */
  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if ((optimize && !register_operand (op1, modev3))
      || !insn_data[d->icode].operand[3].predicate (op1, modev3))
    op1 = copy_to_mode_reg (modev3, op1);

  /* The last argument cannot be fixed up by copying to a register;
     diagnose it instead.  */
  if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
    {
      error ("the third argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPISTRI128)
    {
      /* The result is output operand 0; operand 1 is a scratch.  */
      if (optimize || !target
          || GET_MODE (target) != tmode0
          || !insn_data[d->icode].operand[0].predicate (target, tmode0))
        target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
    }
  else if (d->code == IX86_BUILTIN_PCMPISTRM128)
    {
      /* The result is output operand 1; operand 0 is a scratch.  */
      if (optimize || !target
          || GET_MODE (target) != tmode1
          || !insn_data[d->icode].operand[1].predicate (target, tmode1))
        target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
    }
  else
    {
      /* Every other builtin handled here returns a flag, so D->flag
         must be set; both insn outputs are scratches.  */
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      /* Zero a fresh SImode pseudo and write only its low byte with the
         EQ comparison of the flags register (in the mode given by
         D->flag) against zero.  */
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
        (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
                      gen_rtx_fmt_ee (EQ, QImode,
                                      gen_rtx_REG ((machine_mode) d->flag,
                                                   FLAGS_REG),
                                      const0_rtx)));
      /* Hand back the full SImode register, not the QImode subreg.  */
      return SUBREG_REG (target);
    }
  else
    return target;
}
12298 :
12299 : /* Fixup modeless constants to fit required mode. */
12300 :
12301 : static rtx
12302 260756 : fixup_modeless_constant (rtx x, machine_mode mode)
12303 : {
12304 260756 : if (GET_MODE (x) == VOIDmode)
12305 41433 : x = convert_to_mode (mode, x, 1);
12306 260756 : return x;
12307 : }
12308 :
12309 : /* Expand the outgoing argument ARG to extract unsigned char and short
12310 : integer constants suitable for the predicates and the instruction
12311 : templates which expect the unsigned expanded value. */
12312 :
12313 : static rtx
12314 281980 : ix86_expand_unsigned_small_int_cst_argument (tree arg)
12315 : {
12316 : /* When passing 0xff as an unsigned char function argument with the
12317 : C frontend promotion, expand_normal gets
12318 :
12319 : <integer_cst 0x7fffe6aa23a8 type <integer_type 0x7fffe98225e8 int> constant 255>
12320 :
12321 : and returns the rtx value using the sign-extended representation:
12322 :
12323 : (const_int 255 [0xff])
12324 :
12325 : Without the C frontend promotion, expand_normal gets
12326 :
12327 : <integer_cst 0x7fffe9824018 type <integer_type 0x7fffe9822348 unsigned char > constant 255>
12328 :
12329 : and returns
12330 :
12331 : (const_int -1 [0xffffffffffffffff])
12332 :
12333 : which doesn't work with the predicates nor the instruction templates
12334 : which expect the unsigned expanded value. Extract the unsigned char
12335 : and short integer constants to return
12336 :
12337 : (const_int 255 [0xff])
12338 :
12339 : so that the expanded value is always unsigned, without the C frontend
12340 : promotion. */
12341 :
12342 281980 : if (TREE_CODE (arg) == INTEGER_CST)
12343 : {
12344 60302 : tree type = TREE_TYPE (arg);
12345 60302 : if (INTEGRAL_TYPE_P (type)
12346 60302 : && TYPE_UNSIGNED (type)
12347 82107 : && TYPE_PRECISION (type) < TYPE_PRECISION (integer_type_node))
12348 : {
12349 18319 : HOST_WIDE_INT cst = TREE_INT_CST_LOW (arg);
12350 18319 : return GEN_INT (cst);
12351 : }
12352 : }
12353 :
12354 263661 : return expand_normal (arg);
12355 : }
12356 :
12357 : /* Subroutine of ix86_expand_builtin to take care of insns with
12358 : variable number of operands. */
12359 :
12360 : static rtx
12361 71001 : ix86_expand_args_builtin (const struct builtin_description *d,
12362 : tree exp, rtx target)
12363 : {
12364 71001 : rtx pat, real_target;
12365 71001 : unsigned int i, nargs;
12366 71001 : unsigned int nargs_constant = 0;
12367 71001 : unsigned int mask_pos = 0;
12368 71001 : int num_memory = 0;
12369 71001 : rtx xops[6];
12370 71001 : bool second_arg_count = false;
12371 71001 : enum insn_code icode = d->icode;
12372 71001 : const struct insn_data_d *insn_p = &insn_data[icode];
12373 71001 : machine_mode tmode = insn_p->operand[0].mode;
12374 71001 : machine_mode rmode = VOIDmode;
12375 71001 : bool swap = false;
12376 71001 : enum rtx_code comparison = d->comparison;
12377 :
12378 71001 : switch ((enum ix86_builtin_func_type) d->flag)
12379 : {
12380 0 : case V2DF_FTYPE_V2DF_ROUND:
12381 0 : case V4DF_FTYPE_V4DF_ROUND:
12382 0 : case V8DF_FTYPE_V8DF_ROUND:
12383 0 : case V4SF_FTYPE_V4SF_ROUND:
12384 0 : case V8SF_FTYPE_V8SF_ROUND:
12385 0 : case V16SF_FTYPE_V16SF_ROUND:
12386 0 : case V8HF_FTYPE_V8HF_ROUND:
12387 0 : case V16HF_FTYPE_V16HF_ROUND:
12388 0 : case V32HF_FTYPE_V32HF_ROUND:
12389 0 : case V4SI_FTYPE_V4SF_ROUND:
12390 0 : case V8SI_FTYPE_V8SF_ROUND:
12391 0 : case V16SI_FTYPE_V16SF_ROUND:
12392 0 : return ix86_expand_sse_round (d, exp, target);
12393 12 : case V4SI_FTYPE_V2DF_V2DF_ROUND:
12394 12 : case V8SI_FTYPE_V4DF_V4DF_ROUND:
12395 12 : case V16SI_FTYPE_V8DF_V8DF_ROUND:
12396 12 : return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
12397 239 : case INT_FTYPE_V8SF_V8SF_PTEST:
12398 239 : case INT_FTYPE_V4DI_V4DI_PTEST:
12399 239 : case INT_FTYPE_V4DF_V4DF_PTEST:
12400 239 : case INT_FTYPE_V4SF_V4SF_PTEST:
12401 239 : case INT_FTYPE_V2DI_V2DI_PTEST:
12402 239 : case INT_FTYPE_V2DF_V2DF_PTEST:
12403 239 : return ix86_expand_sse_ptest (d, exp, target);
12404 : case FLOAT128_FTYPE_FLOAT128:
12405 : case FLOAT_FTYPE_FLOAT:
12406 : case FLOAT_FTYPE_BFLOAT16:
12407 : case INT_FTYPE_INT:
12408 : case UINT_FTYPE_UINT:
12409 : case UINT16_FTYPE_UINT16:
12410 : case UINT64_FTYPE_INT:
12411 : case UINT64_FTYPE_UINT64:
12412 : case INT64_FTYPE_INT64:
12413 : case INT64_FTYPE_V4SF:
12414 : case INT64_FTYPE_V2DF:
12415 : case INT_FTYPE_V16QI:
12416 : case INT_FTYPE_V8QI:
12417 : case INT_FTYPE_V8SF:
12418 : case INT_FTYPE_V4DF:
12419 : case INT_FTYPE_V4SF:
12420 : case INT_FTYPE_V2DF:
12421 : case INT_FTYPE_V32QI:
12422 : case V16QI_FTYPE_V16QI:
12423 : case V8SI_FTYPE_V8SF:
12424 : case V8SI_FTYPE_V4SI:
12425 : case V8HI_FTYPE_V8HI:
12426 : case V8HI_FTYPE_V16QI:
12427 : case V8QI_FTYPE_V8QI:
12428 : case V8SF_FTYPE_V8SF:
12429 : case V8SF_FTYPE_V8SI:
12430 : case V8SF_FTYPE_V4SF:
12431 : case V8SF_FTYPE_V8HI:
12432 : case V4SI_FTYPE_V4SI:
12433 : case V4SI_FTYPE_V16QI:
12434 : case V4SI_FTYPE_V4SF:
12435 : case V4SI_FTYPE_V8SI:
12436 : case V4SI_FTYPE_V8HI:
12437 : case V4SI_FTYPE_V4DF:
12438 : case V4SI_FTYPE_V2DF:
12439 : case V4HI_FTYPE_V4HI:
12440 : case V4DF_FTYPE_V4DF:
12441 : case V4DF_FTYPE_V4SI:
12442 : case V4DF_FTYPE_V4SF:
12443 : case V4DF_FTYPE_V2DF:
12444 : case V4SF_FTYPE_V4SF:
12445 : case V4SF_FTYPE_V4SI:
12446 : case V4SF_FTYPE_V8SF:
12447 : case V4SF_FTYPE_V4DF:
12448 : case V4SF_FTYPE_V8HI:
12449 : case V4SF_FTYPE_V2DF:
12450 : case V2DI_FTYPE_V2DI:
12451 : case V2DI_FTYPE_V16QI:
12452 : case V2DI_FTYPE_V8HI:
12453 : case V2DI_FTYPE_V4SI:
12454 : case V2DF_FTYPE_V2DF:
12455 : case V2DF_FTYPE_V4SI:
12456 : case V2DF_FTYPE_V4DF:
12457 : case V2DF_FTYPE_V4SF:
12458 : case V2DF_FTYPE_V2SI:
12459 : case V2SI_FTYPE_V2SI:
12460 : case V2SI_FTYPE_V4SF:
12461 : case V2SI_FTYPE_V2SF:
12462 : case V2SI_FTYPE_V2DF:
12463 : case V2SF_FTYPE_V2SF:
12464 : case V2SF_FTYPE_V2SI:
12465 : case V32QI_FTYPE_V32QI:
12466 : case V32QI_FTYPE_V16QI:
12467 : case V16HI_FTYPE_V16HI:
12468 : case V16HI_FTYPE_V8HI:
12469 : case V8SI_FTYPE_V8SI:
12470 : case V16HI_FTYPE_V16QI:
12471 : case V8SI_FTYPE_V16QI:
12472 : case V4DI_FTYPE_V16QI:
12473 : case V8SI_FTYPE_V8HI:
12474 : case V4DI_FTYPE_V8HI:
12475 : case V4DI_FTYPE_V4SI:
12476 : case V4DI_FTYPE_V2DI:
12477 : case UQI_FTYPE_UQI:
12478 : case UHI_FTYPE_UHI:
12479 : case USI_FTYPE_USI:
12480 : case USI_FTYPE_UQI:
12481 : case USI_FTYPE_UHI:
12482 : case UDI_FTYPE_UDI:
12483 : case UHI_FTYPE_V16QI:
12484 : case USI_FTYPE_V32QI:
12485 : case UDI_FTYPE_V64QI:
12486 : case V16QI_FTYPE_UHI:
12487 : case V32QI_FTYPE_USI:
12488 : case V64QI_FTYPE_UDI:
12489 : case V8HI_FTYPE_UQI:
12490 : case V16HI_FTYPE_UHI:
12491 : case V32HI_FTYPE_USI:
12492 : case V4SI_FTYPE_UQI:
12493 : case V8SI_FTYPE_UQI:
12494 : case V4SI_FTYPE_UHI:
12495 : case V8SI_FTYPE_UHI:
12496 : case UQI_FTYPE_V8HI:
12497 : case UHI_FTYPE_V16HI:
12498 : case USI_FTYPE_V32HI:
12499 : case UQI_FTYPE_V4SI:
12500 : case UQI_FTYPE_V8SI:
12501 : case UHI_FTYPE_V16SI:
12502 : case UQI_FTYPE_V2DI:
12503 : case UQI_FTYPE_V4DI:
12504 : case UQI_FTYPE_V8DI:
12505 : case V16SI_FTYPE_UHI:
12506 : case V2DI_FTYPE_UQI:
12507 : case V4DI_FTYPE_UQI:
12508 : case V16SI_FTYPE_INT:
12509 : case V16SF_FTYPE_V8SF:
12510 : case V16SI_FTYPE_V8SI:
12511 : case V16SF_FTYPE_V4SF:
12512 : case V16SI_FTYPE_V4SI:
12513 : case V16SI_FTYPE_V16SF:
12514 : case V16SI_FTYPE_V16SI:
12515 : case V64QI_FTYPE_V64QI:
12516 : case V32HI_FTYPE_V32HI:
12517 : case V16SF_FTYPE_V16SF:
12518 : case V8DI_FTYPE_UQI:
12519 : case V8DI_FTYPE_V8DI:
12520 : case V8DF_FTYPE_V4DF:
12521 : case V8DF_FTYPE_V2DF:
12522 : case V8DF_FTYPE_V8DF:
12523 : case V4DI_FTYPE_V4DI:
12524 : case V16BF_FTYPE_V16SF:
12525 : case V8BF_FTYPE_V8SF:
12526 : case V8BF_FTYPE_V4SF:
12527 : nargs = 1;
12528 : break;
12529 52 : case V4SF_FTYPE_V4SF_VEC_MERGE:
12530 52 : case V2DF_FTYPE_V2DF_VEC_MERGE:
12531 52 : return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
12532 9528 : case FLOAT128_FTYPE_FLOAT128_FLOAT128:
12533 9528 : case V16QI_FTYPE_V16QI_V16QI:
12534 9528 : case V16QI_FTYPE_V8HI_V8HI:
12535 9528 : case V16HF_FTYPE_V16HF_V16HF:
12536 9528 : case V16SF_FTYPE_V16SF_V16SF:
12537 9528 : case V16SI_FTYPE_V16SI_V16SI:
12538 9528 : case V8QI_FTYPE_V8QI_V8QI:
12539 9528 : case V8QI_FTYPE_V4HI_V4HI:
12540 9528 : case V8HI_FTYPE_V8HI_V8HI:
12541 9528 : case V8HI_FTYPE_V16QI_V16QI:
12542 9528 : case V8HI_FTYPE_V4SI_V4SI:
12543 9528 : case V8HF_FTYPE_V8HF_V8HF:
12544 9528 : case V8SF_FTYPE_V8SF_V8SF:
12545 9528 : case V8SF_FTYPE_V8SF_V8SI:
12546 9528 : case V8DF_FTYPE_V8DF_V8DF:
12547 9528 : case V4SI_FTYPE_V4SI_V4SI:
12548 9528 : case V4SI_FTYPE_V8HI_V8HI:
12549 9528 : case V4SI_FTYPE_V2DF_V2DF:
12550 9528 : case V4HI_FTYPE_V4HI_V4HI:
12551 9528 : case V4HI_FTYPE_V8QI_V8QI:
12552 9528 : case V4HI_FTYPE_V2SI_V2SI:
12553 9528 : case V4DF_FTYPE_V4DF_V4DF:
12554 9528 : case V4DF_FTYPE_V4DF_V4DI:
12555 9528 : case V4SF_FTYPE_V4SF_V4SF:
12556 9528 : case V4SF_FTYPE_V4SF_V4SI:
12557 9528 : case V4SF_FTYPE_V4SF_V2SI:
12558 9528 : case V4SF_FTYPE_V4SF_V2DF:
12559 9528 : case V4SF_FTYPE_V4SF_UINT:
12560 9528 : case V4SF_FTYPE_V4SF_DI:
12561 9528 : case V4SF_FTYPE_V4SF_SI:
12562 9528 : case V4DI_FTYPE_V4DI_V2DI:
12563 9528 : case V2DI_FTYPE_V2DI_V2DI:
12564 9528 : case V2DI_FTYPE_V16QI_V16QI:
12565 9528 : case V2DI_FTYPE_V4SI_V4SI:
12566 9528 : case V2DI_FTYPE_V2DI_V16QI:
12567 9528 : case V2SI_FTYPE_V2SI_V2SI:
12568 9528 : case V2SI_FTYPE_V4HI_V4HI:
12569 9528 : case V2SI_FTYPE_V2SF_V2SF:
12570 9528 : case V2DF_FTYPE_V2DF_V2DF:
12571 9528 : case V2DF_FTYPE_V2DF_V4SF:
12572 9528 : case V2DF_FTYPE_V2DF_V2DI:
12573 9528 : case V2DF_FTYPE_V2DF_DI:
12574 9528 : case V2DF_FTYPE_V2DF_SI:
12575 9528 : case V2DF_FTYPE_V2DF_UINT:
12576 9528 : case V2SF_FTYPE_V2SF_V2SF:
12577 9528 : case V1DI_FTYPE_V1DI_V1DI:
12578 9528 : case V1DI_FTYPE_V8QI_V8QI:
12579 9528 : case V1DI_FTYPE_V2SI_V2SI:
12580 9528 : case V32QI_FTYPE_V16HI_V16HI:
12581 9528 : case V16HI_FTYPE_V8SI_V8SI:
12582 9528 : case V64QI_FTYPE_V64QI_V64QI:
12583 9528 : case V32QI_FTYPE_V32QI_V32QI:
12584 9528 : case V32BF_FTYPE_V32BF_V32BF:
12585 9528 : case V16BF_FTYPE_V16BF_V16BF:
12586 9528 : case V8BF_FTYPE_V8BF_V8BF:
12587 9528 : case V16HI_FTYPE_V32QI_V32QI:
12588 9528 : case V16HI_FTYPE_V16HI_V16HI:
12589 9528 : case V8SI_FTYPE_V4DF_V4DF:
12590 9528 : case V8SI_FTYPE_V8SI_V8SI:
12591 9528 : case V8SI_FTYPE_V16HI_V16HI:
12592 9528 : case V4DI_FTYPE_V4DI_V4DI:
12593 9528 : case V4DI_FTYPE_V8SI_V8SI:
12594 9528 : case V4DI_FTYPE_V32QI_V32QI:
12595 9528 : case V8DI_FTYPE_V64QI_V64QI:
12596 9528 : if (comparison == UNKNOWN)
12597 8994 : return ix86_expand_binop_builtin (icode, exp, target);
12598 : nargs = 2;
12599 : break;
12600 80 : case V4SF_FTYPE_V4SF_V4SF_SWAP:
12601 80 : case V2DF_FTYPE_V2DF_V2DF_SWAP:
12602 80 : gcc_assert (comparison != UNKNOWN);
12603 : nargs = 2;
12604 : swap = true;
12605 : break;
12606 1481 : case V16HI_FTYPE_V16HI_V8HI_COUNT:
12607 1481 : case V16HI_FTYPE_V16HI_SI_COUNT:
12608 1481 : case V8SI_FTYPE_V8SI_V4SI_COUNT:
12609 1481 : case V8SI_FTYPE_V8SI_SI_COUNT:
12610 1481 : case V4DI_FTYPE_V4DI_V2DI_COUNT:
12611 1481 : case V4DI_FTYPE_V4DI_INT_COUNT:
12612 1481 : case V8HI_FTYPE_V8HI_V8HI_COUNT:
12613 1481 : case V8HI_FTYPE_V8HI_SI_COUNT:
12614 1481 : case V4SI_FTYPE_V4SI_V4SI_COUNT:
12615 1481 : case V4SI_FTYPE_V4SI_SI_COUNT:
12616 1481 : case V4HI_FTYPE_V4HI_V4HI_COUNT:
12617 1481 : case V4HI_FTYPE_V4HI_SI_COUNT:
12618 1481 : case V2DI_FTYPE_V2DI_V2DI_COUNT:
12619 1481 : case V2DI_FTYPE_V2DI_SI_COUNT:
12620 1481 : case V2SI_FTYPE_V2SI_V2SI_COUNT:
12621 1481 : case V2SI_FTYPE_V2SI_SI_COUNT:
12622 1481 : case V1DI_FTYPE_V1DI_V1DI_COUNT:
12623 1481 : case V1DI_FTYPE_V1DI_SI_COUNT:
12624 1481 : nargs = 2;
12625 1481 : second_arg_count = true;
12626 1481 : break;
12627 1408 : case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
12628 1408 : case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
12629 1408 : case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
12630 1408 : case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
12631 1408 : case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
12632 1408 : case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
12633 1408 : case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
12634 1408 : case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
12635 1408 : case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
12636 1408 : case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
12637 1408 : case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
12638 1408 : case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
12639 1408 : case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
12640 1408 : case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
12641 1408 : case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
12642 1408 : case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
12643 1408 : case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
12644 1408 : case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
12645 1408 : nargs = 4;
12646 1408 : second_arg_count = true;
12647 1408 : break;
12648 967 : case UINT64_FTYPE_UINT64_UINT64:
12649 967 : case UINT_FTYPE_UINT_UINT:
12650 967 : case UINT_FTYPE_UINT_USHORT:
12651 967 : case UINT_FTYPE_UINT_UCHAR:
12652 967 : case UINT16_FTYPE_UINT16_INT:
12653 967 : case UINT8_FTYPE_UINT8_INT:
12654 967 : case UQI_FTYPE_UQI_UQI:
12655 967 : case UHI_FTYPE_UHI_UHI:
12656 967 : case USI_FTYPE_USI_USI:
12657 967 : case UDI_FTYPE_UDI_UDI:
12658 967 : case V16SI_FTYPE_V8DF_V8DF:
12659 967 : case V32BF_FTYPE_V16SF_V16SF:
12660 967 : case V16BF_FTYPE_V8SF_V8SF:
12661 967 : case V8BF_FTYPE_V4SF_V4SF:
12662 967 : case V16BF_FTYPE_V16SF_UHI:
12663 967 : case V8BF_FTYPE_V8SF_UQI:
12664 967 : case V8BF_FTYPE_V4SF_UQI:
12665 967 : case V16QI_FTYPE_V16QI_V8HF:
12666 967 : nargs = 2;
12667 967 : break;
12668 786 : case V2DI_FTYPE_V2DI_INT_CONVERT:
12669 786 : nargs = 2;
12670 786 : rmode = V1TImode;
12671 786 : nargs_constant = 1;
12672 786 : break;
12673 42 : case V4DI_FTYPE_V4DI_INT_CONVERT:
12674 42 : nargs = 2;
12675 42 : rmode = V2TImode;
12676 42 : nargs_constant = 1;
12677 42 : break;
12678 16 : case V8DI_FTYPE_V8DI_INT_CONVERT:
12679 16 : nargs = 2;
12680 16 : rmode = V4TImode;
12681 16 : nargs_constant = 1;
12682 16 : break;
12683 2424 : case V8HI_FTYPE_V8HI_INT:
12684 2424 : case V8HI_FTYPE_V8SF_INT:
12685 2424 : case V16HI_FTYPE_V16SF_INT:
12686 2424 : case V8HI_FTYPE_V4SF_INT:
12687 2424 : case V8SF_FTYPE_V8SF_INT:
12688 2424 : case V4SF_FTYPE_V16SF_INT:
12689 2424 : case V16SF_FTYPE_V16SF_INT:
12690 2424 : case V4SI_FTYPE_V4SI_INT:
12691 2424 : case V4SI_FTYPE_V8SI_INT:
12692 2424 : case V4HI_FTYPE_V4HI_INT:
12693 2424 : case V4DF_FTYPE_V4DF_INT:
12694 2424 : case V4DF_FTYPE_V8DF_INT:
12695 2424 : case V4SF_FTYPE_V4SF_INT:
12696 2424 : case V4SF_FTYPE_V8SF_INT:
12697 2424 : case V2DI_FTYPE_V2DI_INT:
12698 2424 : case V2DF_FTYPE_V2DF_INT:
12699 2424 : case V2DF_FTYPE_V4DF_INT:
12700 2424 : case V16HI_FTYPE_V16HI_INT:
12701 2424 : case V8SI_FTYPE_V8SI_INT:
12702 2424 : case V16SI_FTYPE_V16SI_INT:
12703 2424 : case V4SI_FTYPE_V16SI_INT:
12704 2424 : case V4DI_FTYPE_V4DI_INT:
12705 2424 : case V2DI_FTYPE_V4DI_INT:
12706 2424 : case V4DI_FTYPE_V8DI_INT:
12707 2424 : case UQI_FTYPE_UQI_UQI_CONST:
12708 2424 : case UHI_FTYPE_UHI_UQI:
12709 2424 : case USI_FTYPE_USI_UQI:
12710 2424 : case UDI_FTYPE_UDI_UQI:
12711 2424 : nargs = 2;
12712 2424 : nargs_constant = 1;
12713 2424 : break;
12714 18713 : case V16QI_FTYPE_V16QI_V16QI_V16QI:
12715 18713 : case V8SF_FTYPE_V8SF_V8SF_V8SF:
12716 18713 : case V4DF_FTYPE_V4DF_V4DF_V4DF:
12717 18713 : case V4SF_FTYPE_V4SF_V4SF_V4SF:
12718 18713 : case V2DF_FTYPE_V2DF_V2DF_V2DF:
12719 18713 : case V32QI_FTYPE_V32QI_V32QI_V32QI:
12720 18713 : case UHI_FTYPE_V16SI_V16SI_UHI:
12721 18713 : case UQI_FTYPE_V8DI_V8DI_UQI:
12722 18713 : case V16HI_FTYPE_V16SI_V16HI_UHI:
12723 18713 : case V16QI_FTYPE_V16SI_V16QI_UHI:
12724 18713 : case V16QI_FTYPE_V8DI_V16QI_UQI:
12725 18713 : case V32HF_FTYPE_V32HF_V32HF_USI:
12726 18713 : case V16SF_FTYPE_V16SF_V16SF_UHI:
12727 18713 : case V16SF_FTYPE_V4SF_V16SF_UHI:
12728 18713 : case V16SI_FTYPE_SI_V16SI_UHI:
12729 18713 : case V16SI_FTYPE_V16HI_V16SI_UHI:
12730 18713 : case V16SI_FTYPE_V16QI_V16SI_UHI:
12731 18713 : case V8SF_FTYPE_V4SF_V8SF_UQI:
12732 18713 : case V4DF_FTYPE_V2DF_V4DF_UQI:
12733 18713 : case V8SI_FTYPE_V4SI_V8SI_UQI:
12734 18713 : case V8SI_FTYPE_SI_V8SI_UQI:
12735 18713 : case V4SI_FTYPE_V4SI_V4SI_UQI:
12736 18713 : case V4SI_FTYPE_SI_V4SI_UQI:
12737 18713 : case V4DI_FTYPE_V2DI_V4DI_UQI:
12738 18713 : case V4DI_FTYPE_DI_V4DI_UQI:
12739 18713 : case V2DI_FTYPE_V2DI_V2DI_UQI:
12740 18713 : case V2DI_FTYPE_DI_V2DI_UQI:
12741 18713 : case V64QI_FTYPE_V64QI_V64QI_UDI:
12742 18713 : case V64QI_FTYPE_V16QI_V64QI_UDI:
12743 18713 : case V64QI_FTYPE_QI_V64QI_UDI:
12744 18713 : case V32QI_FTYPE_V32QI_V32QI_USI:
12745 18713 : case V32QI_FTYPE_V16QI_V32QI_USI:
12746 18713 : case V32QI_FTYPE_QI_V32QI_USI:
12747 18713 : case V16QI_FTYPE_V16QI_V16QI_UHI:
12748 18713 : case V16QI_FTYPE_QI_V16QI_UHI:
12749 18713 : case V32HI_FTYPE_V8HI_V32HI_USI:
12750 18713 : case V32HI_FTYPE_V32BF_V32HI_USI:
12751 18713 : case V32HI_FTYPE_HI_V32HI_USI:
12752 18713 : case V16HI_FTYPE_V8HI_V16HI_UHI:
12753 18713 : case V16HI_FTYPE_V16BF_V16HI_UHI:
12754 18713 : case V16HI_FTYPE_HI_V16HI_UHI:
12755 18713 : case V8HI_FTYPE_V8HI_V8HI_UQI:
12756 18713 : case V8HI_FTYPE_V8BF_V8HI_UQI:
12757 18713 : case V8BF_FTYPE_V8BF_V8BF_UQI:
12758 18713 : case V8HI_FTYPE_HI_V8HI_UQI:
12759 18713 : case V16HF_FTYPE_V16HF_V16HF_UHI:
12760 18713 : case V8SF_FTYPE_V8HI_V8SF_UQI:
12761 18713 : case V4SF_FTYPE_V8HI_V4SF_UQI:
12762 18713 : case V8SI_FTYPE_V8HF_V8SI_UQI:
12763 18713 : case V8SF_FTYPE_V8HF_V8SF_UQI:
12764 18713 : case V8SI_FTYPE_V8SF_V8SI_UQI:
12765 18713 : case V4SI_FTYPE_V4SF_V4SI_UQI:
12766 18713 : case V4SI_FTYPE_V8HF_V4SI_UQI:
12767 18713 : case V4SF_FTYPE_V8HF_V4SF_UQI:
12768 18713 : case V4DI_FTYPE_V8HF_V4DI_UQI:
12769 18713 : case V4DI_FTYPE_V4SF_V4DI_UQI:
12770 18713 : case V2DI_FTYPE_V8HF_V2DI_UQI:
12771 18713 : case V2DI_FTYPE_V4SF_V2DI_UQI:
12772 18713 : case V8HF_FTYPE_V8HF_V8HF_UQI:
12773 18713 : case V8HF_FTYPE_V8HF_V8HF_V8HF:
12774 18713 : case V8HF_FTYPE_V8HI_V8HF_UQI:
12775 18713 : case V8HF_FTYPE_V8SI_V8HF_UQI:
12776 18713 : case V8HF_FTYPE_V8SF_V8HF_UQI:
12777 18713 : case V8HF_FTYPE_V4SI_V8HF_UQI:
12778 18713 : case V8HF_FTYPE_V4SF_V8HF_UQI:
12779 18713 : case V8HF_FTYPE_V4DI_V8HF_UQI:
12780 18713 : case V8HF_FTYPE_V4DF_V8HF_UQI:
12781 18713 : case V8HF_FTYPE_V2DI_V8HF_UQI:
12782 18713 : case V8HF_FTYPE_V2DF_V8HF_UQI:
12783 18713 : case V4SF_FTYPE_V4DI_V4SF_UQI:
12784 18713 : case V4SF_FTYPE_V2DI_V4SF_UQI:
12785 18713 : case V4DF_FTYPE_V4DI_V4DF_UQI:
12786 18713 : case V4DF_FTYPE_V8HF_V4DF_UQI:
12787 18713 : case V2DF_FTYPE_V8HF_V2DF_UQI:
12788 18713 : case V2DF_FTYPE_V2DI_V2DF_UQI:
12789 18713 : case V16QI_FTYPE_V8HI_V16QI_UQI:
12790 18713 : case V16QI_FTYPE_V16HI_V16QI_UHI:
12791 18713 : case V16QI_FTYPE_V4SI_V16QI_UQI:
12792 18713 : case V16QI_FTYPE_V8SI_V16QI_UQI:
12793 18713 : case V8HI_FTYPE_V8HF_V8HI_UQI:
12794 18713 : case V8HI_FTYPE_V4SI_V8HI_UQI:
12795 18713 : case V8HI_FTYPE_V8SI_V8HI_UQI:
12796 18713 : case V16QI_FTYPE_V2DI_V16QI_UQI:
12797 18713 : case V16QI_FTYPE_V4DI_V16QI_UQI:
12798 18713 : case V8HI_FTYPE_V2DI_V8HI_UQI:
12799 18713 : case V8HI_FTYPE_V4DI_V8HI_UQI:
12800 18713 : case V4SI_FTYPE_V2DI_V4SI_UQI:
12801 18713 : case V4SI_FTYPE_V4DI_V4SI_UQI:
12802 18713 : case V32QI_FTYPE_V32HI_V32QI_USI:
12803 18713 : case UHI_FTYPE_V16QI_V16QI_UHI:
12804 18713 : case USI_FTYPE_V32QI_V32QI_USI:
12805 18713 : case UDI_FTYPE_V64QI_V64QI_UDI:
12806 18713 : case UQI_FTYPE_V8HI_V8HI_UQI:
12807 18713 : case UHI_FTYPE_V16HI_V16HI_UHI:
12808 18713 : case USI_FTYPE_V32HI_V32HI_USI:
12809 18713 : case UQI_FTYPE_V4SI_V4SI_UQI:
12810 18713 : case UQI_FTYPE_V8SI_V8SI_UQI:
12811 18713 : case UQI_FTYPE_V2DI_V2DI_UQI:
12812 18713 : case UQI_FTYPE_V4DI_V4DI_UQI:
12813 18713 : case V4SF_FTYPE_V2DF_V4SF_UQI:
12814 18713 : case V4SF_FTYPE_V4DF_V4SF_UQI:
12815 18713 : case V16SI_FTYPE_V16SI_V16SI_UHI:
12816 18713 : case V16SI_FTYPE_V4SI_V16SI_UHI:
12817 18713 : case V2DI_FTYPE_V4SI_V2DI_UQI:
12818 18713 : case V2DI_FTYPE_V8HI_V2DI_UQI:
12819 18713 : case V2DI_FTYPE_V16QI_V2DI_UQI:
12820 18713 : case V4DI_FTYPE_V4DI_V4DI_UQI:
12821 18713 : case V4DI_FTYPE_V4SI_V4DI_UQI:
12822 18713 : case V4DI_FTYPE_V8HI_V4DI_UQI:
12823 18713 : case V4DI_FTYPE_V16QI_V4DI_UQI:
12824 18713 : case V4DI_FTYPE_V4DF_V4DI_UQI:
12825 18713 : case V2DI_FTYPE_V2DF_V2DI_UQI:
12826 18713 : case V4SI_FTYPE_V4DF_V4SI_UQI:
12827 18713 : case V4SI_FTYPE_V2DF_V4SI_UQI:
12828 18713 : case V4SI_FTYPE_V8HI_V4SI_UQI:
12829 18713 : case V4SI_FTYPE_V16QI_V4SI_UQI:
12830 18713 : case V4DI_FTYPE_V4DI_V4DI_V4DI:
12831 18713 : case V8DF_FTYPE_V2DF_V8DF_UQI:
12832 18713 : case V8DF_FTYPE_V4DF_V8DF_UQI:
12833 18713 : case V8DF_FTYPE_V8DF_V8DF_UQI:
12834 18713 : case V8SF_FTYPE_V8SF_V8SF_UQI:
12835 18713 : case V8SF_FTYPE_V8SI_V8SF_UQI:
12836 18713 : case V4DF_FTYPE_V4DF_V4DF_UQI:
12837 18713 : case V4SF_FTYPE_V4SF_V4SF_UQI:
12838 18713 : case V2DF_FTYPE_V2DF_V2DF_UQI:
12839 18713 : case V2DF_FTYPE_V4SF_V2DF_UQI:
12840 18713 : case V2DF_FTYPE_V4SI_V2DF_UQI:
12841 18713 : case V4SF_FTYPE_V4SI_V4SF_UQI:
12842 18713 : case V4DF_FTYPE_V4SF_V4DF_UQI:
12843 18713 : case V4DF_FTYPE_V4SI_V4DF_UQI:
12844 18713 : case V8SI_FTYPE_V8SI_V8SI_UQI:
12845 18713 : case V8SI_FTYPE_V8HI_V8SI_UQI:
12846 18713 : case V8SI_FTYPE_V16QI_V8SI_UQI:
12847 18713 : case V8DF_FTYPE_V8SI_V8DF_UQI:
12848 18713 : case V8DI_FTYPE_DI_V8DI_UQI:
12849 18713 : case V16SF_FTYPE_V8SF_V16SF_UHI:
12850 18713 : case V16SI_FTYPE_V8SI_V16SI_UHI:
12851 18713 : case V16HF_FTYPE_V16HI_V16HF_UHI:
12852 18713 : case V16HF_FTYPE_V16HF_V16HF_V16HF:
12853 18713 : case V16HI_FTYPE_V16HF_V16HI_UHI:
12854 18713 : case V16HI_FTYPE_V16HI_V16HI_UHI:
12855 18713 : case V16BF_FTYPE_V16BF_V16BF_UHI:
12856 18713 : case V8HI_FTYPE_V16QI_V8HI_UQI:
12857 18713 : case V16HI_FTYPE_V16QI_V16HI_UHI:
12858 18713 : case V32HI_FTYPE_V32HI_V32HI_USI:
12859 18713 : case V32BF_FTYPE_V32BF_V32BF_USI:
12860 18713 : case V32HI_FTYPE_V32QI_V32HI_USI:
12861 18713 : case V8DI_FTYPE_V16QI_V8DI_UQI:
12862 18713 : case V8DI_FTYPE_V2DI_V8DI_UQI:
12863 18713 : case V8DI_FTYPE_V4DI_V8DI_UQI:
12864 18713 : case V8DI_FTYPE_V8DI_V8DI_UQI:
12865 18713 : case V8DI_FTYPE_V8HI_V8DI_UQI:
12866 18713 : case V8DI_FTYPE_V8SI_V8DI_UQI:
12867 18713 : case V8HI_FTYPE_V8DI_V8HI_UQI:
12868 18713 : case V8SI_FTYPE_V8DI_V8SI_UQI:
12869 18713 : case V4SI_FTYPE_V4SI_V4SI_V4SI:
12870 18713 : case V4DI_FTYPE_V4DI_V4DI_V2DI:
12871 18713 : case V16SI_FTYPE_V16SI_V16SI_V16SI:
12872 18713 : case V8DI_FTYPE_V8DI_V8DI_V8DI:
12873 18713 : case V32HI_FTYPE_V32HI_V32HI_V32HI:
12874 18713 : case V2DI_FTYPE_V2DI_V2DI_V2DI:
12875 18713 : case V16HI_FTYPE_V16HI_V16HI_V16HI:
12876 18713 : case V8SI_FTYPE_V8SI_V8SI_V8SI:
12877 18713 : case V8HI_FTYPE_V8HI_V8HI_V8HI:
12878 18713 : case V32BF_FTYPE_V16SF_V16SF_USI:
12879 18713 : case V16BF_FTYPE_V8SF_V8SF_UHI:
12880 18713 : case V8BF_FTYPE_V4SF_V4SF_UQI:
12881 18713 : case V16BF_FTYPE_V16SF_V16BF_UHI:
12882 18713 : case V8BF_FTYPE_V8SF_V8BF_UQI:
12883 18713 : case V8BF_FTYPE_V4SF_V8BF_UQI:
12884 18713 : case V16SF_FTYPE_V16SF_V32BF_V32BF:
12885 18713 : case V8SF_FTYPE_V8SF_V16BF_V16BF:
12886 18713 : case V4SF_FTYPE_V4SF_V8BF_V8BF:
12887 18713 : case V16QI_FTYPE_V16QI_V8HF_V8HF:
12888 18713 : case V32QI_FTYPE_V32QI_V16HF_V16HF:
12889 18713 : case V64QI_FTYPE_V64QI_V32HF_V32HF:
12890 18713 : case V16QI_FTYPE_V8HF_V16QI_UQI:
12891 18713 : case V16QI_FTYPE_V16HF_V16QI_UHI:
12892 18713 : case V32QI_FTYPE_V32HF_V32QI_USI:
12893 18713 : case V8HF_FTYPE_V16QI_V8HF_UQI:
12894 18713 : case V16HF_FTYPE_V16QI_V16HF_UHI:
12895 18713 : case V32HF_FTYPE_V32QI_V32HF_USI:
12896 18713 : case V16SI_FTYPE_V16SF_V16SI_UHI:
12897 18713 : case V32HI_FTYPE_V32HF_V32HI_USI:
12898 18713 : case V8DI_FTYPE_V8SF_V8DI_UQI:
12899 18713 : case V8DI_FTYPE_V8DF_V8DI_UQI:
12900 18713 : case V8SI_FTYPE_V8DF_V8SI_UQI:
12901 18713 : nargs = 3;
12902 18713 : break;
12903 1481 : case V32QI_FTYPE_V32QI_V32QI_INT:
12904 1481 : case V16HI_FTYPE_V16HI_V16HI_INT:
12905 1481 : case V16QI_FTYPE_V16QI_V16QI_INT:
12906 1481 : case V4DI_FTYPE_V4DI_V4DI_INT:
12907 1481 : case V8HI_FTYPE_V8HI_V8HI_INT:
12908 1481 : case V8SI_FTYPE_V8SI_V8SI_INT:
12909 1481 : case V8SI_FTYPE_V8SI_V4SI_INT:
12910 1481 : case V8SF_FTYPE_V8SF_V8SF_INT:
12911 1481 : case V8SF_FTYPE_V8SF_V4SF_INT:
12912 1481 : case V4SI_FTYPE_V4SI_V4SI_INT:
12913 1481 : case V4DF_FTYPE_V4DF_V4DF_INT:
12914 1481 : case V16SF_FTYPE_V16SF_V16SF_INT:
12915 1481 : case V16SF_FTYPE_V16SF_V4SF_INT:
12916 1481 : case V16SI_FTYPE_V16SI_V4SI_INT:
12917 1481 : case V4DF_FTYPE_V4DF_V2DF_INT:
12918 1481 : case V4SF_FTYPE_V4SF_V4SF_INT:
12919 1481 : case V2DI_FTYPE_V2DI_V2DI_INT:
12920 1481 : case V4DI_FTYPE_V4DI_V2DI_INT:
12921 1481 : case V2DF_FTYPE_V2DF_V2DF_INT:
12922 1481 : case UQI_FTYPE_V8DI_V8UDI_INT:
12923 1481 : case UQI_FTYPE_V8DF_V8DF_INT:
12924 1481 : case UQI_FTYPE_V2DF_V2DF_INT:
12925 1481 : case UQI_FTYPE_V4SF_V4SF_INT:
12926 1481 : case UHI_FTYPE_V16SI_V16SI_INT:
12927 1481 : case UHI_FTYPE_V16SF_V16SF_INT:
12928 1481 : case V64QI_FTYPE_V64QI_V64QI_INT:
12929 1481 : case V32HI_FTYPE_V32HI_V32HI_INT:
12930 1481 : case V16SI_FTYPE_V16SI_V16SI_INT:
12931 1481 : case V8DI_FTYPE_V8DI_V8DI_INT:
12932 1481 : nargs = 3;
12933 1481 : nargs_constant = 1;
12934 1481 : break;
12935 47 : case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
12936 47 : nargs = 3;
12937 47 : rmode = V4DImode;
12938 47 : nargs_constant = 1;
12939 47 : break;
12940 80 : case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
12941 80 : nargs = 3;
12942 80 : rmode = V2DImode;
12943 80 : nargs_constant = 1;
12944 80 : break;
12945 48 : case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
12946 48 : nargs = 3;
12947 48 : rmode = DImode;
12948 48 : nargs_constant = 1;
12949 48 : break;
12950 20 : case V2DI_FTYPE_V2DI_UINT_UINT:
12951 20 : nargs = 3;
12952 20 : nargs_constant = 2;
12953 20 : break;
12954 8 : case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
12955 8 : nargs = 3;
12956 8 : rmode = V8DImode;
12957 8 : nargs_constant = 1;
12958 8 : break;
12959 16 : case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
12960 16 : nargs = 5;
12961 16 : rmode = V8DImode;
12962 16 : mask_pos = 2;
12963 16 : nargs_constant = 1;
12964 16 : break;
12965 320 : case QI_FTYPE_V8DF_INT_UQI:
12966 320 : case QI_FTYPE_V4DF_INT_UQI:
12967 320 : case QI_FTYPE_V2DF_INT_UQI:
12968 320 : case HI_FTYPE_V16SF_INT_UHI:
12969 320 : case QI_FTYPE_V8SF_INT_UQI:
12970 320 : case QI_FTYPE_V4SF_INT_UQI:
12971 320 : case QI_FTYPE_V8HF_INT_UQI:
12972 320 : case HI_FTYPE_V16HF_INT_UHI:
12973 320 : case SI_FTYPE_V32HF_INT_USI:
12974 320 : case QI_FTYPE_V8BF_INT_UQI:
12975 320 : case HI_FTYPE_V16BF_INT_UHI:
12976 320 : case SI_FTYPE_V32BF_INT_USI:
12977 320 : case V4SI_FTYPE_V4SI_V4SI_UHI:
12978 320 : case V8SI_FTYPE_V8SI_V8SI_UHI:
12979 320 : nargs = 3;
12980 320 : mask_pos = 1;
12981 320 : nargs_constant = 1;
12982 320 : break;
12983 17 : case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
12984 17 : nargs = 5;
12985 17 : rmode = V4DImode;
12986 17 : mask_pos = 2;
12987 17 : nargs_constant = 1;
12988 17 : break;
12989 17 : case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
12990 17 : nargs = 5;
12991 17 : rmode = V2DImode;
12992 17 : mask_pos = 2;
12993 17 : nargs_constant = 1;
12994 17 : break;
12995 17264 : case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
12996 17264 : case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
12997 17264 : case V32BF_FTYPE_V32BF_V32BF_V32BF_USI:
12998 17264 : case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
12999 17264 : case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
13000 17264 : case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
13001 17264 : case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
13002 17264 : case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
13003 17264 : case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
13004 17264 : case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
13005 17264 : case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
13006 17264 : case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
13007 17264 : case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
13008 17264 : case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
13009 17264 : case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
13010 17264 : case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
13011 17264 : case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
13012 17264 : case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
13013 17264 : case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
13014 17264 : case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
13015 17264 : case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
13016 17264 : case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
13017 17264 : case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
13018 17264 : case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
13019 17264 : case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
13020 17264 : case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
13021 17264 : case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
13022 17264 : case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
13023 17264 : case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
13024 17264 : case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
13025 17264 : case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
13026 17264 : case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
13027 17264 : case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
13028 17264 : case V8BF_FTYPE_V8BF_V8BF_V8BF_UQI:
13029 17264 : case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
13030 17264 : case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
13031 17264 : case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
13032 17264 : case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
13033 17264 : case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
13034 17264 : case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
13035 17264 : case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
13036 17264 : case V16BF_FTYPE_V16BF_V16BF_V16BF_UHI:
13037 17264 : case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
13038 17264 : case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
13039 17264 : case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
13040 17264 : case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
13041 17264 : case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
13042 17264 : case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
13043 17264 : case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
13044 17264 : case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
13045 17264 : case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
13046 17264 : case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
13047 17264 : case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
13048 17264 : case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
13049 17264 : case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
13050 17264 : case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
13051 17264 : case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
13052 17264 : case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
13053 17264 : case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
13054 17264 : case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
13055 17264 : case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
13056 17264 : case V32HF_FTYPE_V16SF_V16SF_V32HF_USI:
13057 17264 : case V16HF_FTYPE_V8SF_V8SF_V16HF_UHI:
13058 17264 : case V8HF_FTYPE_V4SF_V4SF_V8HF_UQI:
13059 17264 : case V16QI_FTYPE_V8HF_V8HF_V16QI_UHI:
13060 17264 : case V32QI_FTYPE_V16HF_V16HF_V32QI_USI:
13061 17264 : case V64QI_FTYPE_V32HF_V32HF_V64QI_UDI:
13062 17264 : case V16QI_FTYPE_V16QI_V8HF_V16QI_UHI:
13063 17264 : case V16QI_FTYPE_V32QI_V16HF_V16QI_UHI:
13064 17264 : case V32QI_FTYPE_V64QI_V32HF_V32QI_USI:
13065 17264 : nargs = 4;
13066 17264 : break;
13067 11 : case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
13068 11 : case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
13069 11 : case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
13070 11 : case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
13071 11 : case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
13072 11 : case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
13073 11 : nargs = 4;
13074 11 : nargs_constant = 1;
13075 11 : break;
13076 3718 : case UQI_FTYPE_V4DI_V4DI_INT_UQI:
13077 3718 : case UQI_FTYPE_V8SI_V8SI_INT_UQI:
13078 3718 : case QI_FTYPE_V4DF_V4DF_INT_UQI:
13079 3718 : case QI_FTYPE_V8SF_V8SF_INT_UQI:
13080 3718 : case UHI_FTYPE_V16HF_V16HF_INT_UHI:
13081 3718 : case UQI_FTYPE_V2DI_V2DI_INT_UQI:
13082 3718 : case UQI_FTYPE_V4SI_V4SI_INT_UQI:
13083 3718 : case UQI_FTYPE_V2DF_V2DF_INT_UQI:
13084 3718 : case UQI_FTYPE_V4SF_V4SF_INT_UQI:
13085 3718 : case UQI_FTYPE_V8HF_V8HF_INT_UQI:
13086 3718 : case UDI_FTYPE_V64QI_V64QI_INT_UDI:
13087 3718 : case USI_FTYPE_V32QI_V32QI_INT_USI:
13088 3718 : case UHI_FTYPE_V16QI_V16QI_INT_UHI:
13089 3718 : case USI_FTYPE_V32HI_V32HI_INT_USI:
13090 3718 : case USI_FTYPE_V32BF_V32BF_INT_USI:
13091 3718 : case USI_FTYPE_V32HF_V32HF_INT_USI:
13092 3718 : case UHI_FTYPE_V16HI_V16HI_INT_UHI:
13093 3718 : case UHI_FTYPE_V16BF_V16BF_INT_UHI:
13094 3718 : case UQI_FTYPE_V8HI_V8HI_INT_UQI:
13095 3718 : case UQI_FTYPE_V8BF_V8BF_INT_UQI:
13096 3718 : nargs = 4;
13097 3718 : mask_pos = 1;
13098 3718 : nargs_constant = 1;
13099 3718 : break;
13100 23 : case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
13101 23 : nargs = 4;
13102 23 : nargs_constant = 2;
13103 23 : break;
13104 67 : case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
13105 67 : case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
13106 67 : case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
13107 67 : case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
13108 67 : case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
13109 67 : nargs = 4;
13110 67 : break;
13111 679 : case UQI_FTYPE_V8DI_V8DI_INT_UQI:
13112 679 : case UHI_FTYPE_V16SI_V16SI_INT_UHI:
13113 679 : mask_pos = 1;
13114 679 : nargs = 4;
13115 679 : nargs_constant = 1;
13116 679 : break;
13117 3948 : case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
13118 3948 : case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
13119 3948 : case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
13120 3948 : case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
13121 3948 : case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
13122 3948 : case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
13123 3948 : case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
13124 3948 : case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
13125 3948 : case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
13126 3948 : case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
13127 3948 : case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
13128 3948 : case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
13129 3948 : case V32HI_FTYPE_V32HI_INT_V32HI_USI:
13130 3948 : case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
13131 3948 : case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
13132 3948 : case V32BF_FTYPE_V32BF_INT_V32BF_USI:
13133 3948 : case V16BF_FTYPE_V16BF_INT_V16BF_UHI:
13134 3948 : case V8BF_FTYPE_V8BF_INT_V8BF_UQI:
13135 3948 : case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
13136 3948 : case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
13137 3948 : case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
13138 3948 : case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
13139 3948 : case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
13140 3948 : case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
13141 3948 : case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
13142 3948 : case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
13143 3948 : case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
13144 3948 : case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
13145 3948 : case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
13146 3948 : case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
13147 3948 : case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
13148 3948 : case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
13149 3948 : case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
13150 3948 : case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
13151 3948 : case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
13152 3948 : nargs = 4;
13153 3948 : mask_pos = 2;
13154 3948 : nargs_constant = 1;
13155 3948 : break;
13156 1726 : case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
13157 1726 : case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
13158 1726 : case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
13159 1726 : case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
13160 1726 : case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
13161 1726 : case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
13162 1726 : case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
13163 1726 : case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
13164 1726 : case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
13165 1726 : case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
13166 1726 : case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
13167 1726 : case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
13168 1726 : case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
13169 1726 : case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
13170 1726 : case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
13171 1726 : case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
13172 1726 : case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
13173 1726 : case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
13174 1726 : case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
13175 1726 : case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
13176 1726 : case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
13177 1726 : case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
13178 1726 : case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
13179 1726 : case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
13180 1726 : case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
13181 1726 : case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
13182 1726 : case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
13183 1726 : nargs = 5;
13184 1726 : mask_pos = 2;
13185 1726 : nargs_constant = 1;
13186 1726 : break;
13187 268 : case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
13188 268 : case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
13189 268 : case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
13190 268 : case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
13191 268 : case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
13192 268 : case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
13193 268 : case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
13194 268 : case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
13195 268 : case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
13196 268 : case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
13197 268 : nargs = 5;
13198 268 : mask_pos = 1;
13199 268 : nargs_constant = 1;
13200 268 : break;
13201 732 : case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
13202 732 : case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
13203 732 : case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
13204 732 : case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
13205 732 : case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
13206 732 : case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
13207 732 : case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
13208 732 : case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
13209 732 : case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
13210 732 : case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
13211 732 : case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
13212 732 : case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
13213 732 : case V8BF_FTYPE_V8BF_V8BF_INT_V8BF_UQI:
13214 732 : case V16BF_FTYPE_V16BF_V16BF_INT_V16BF_UHI:
13215 732 : case V32BF_FTYPE_V32BF_V32BF_INT_V32BF_USI:
13216 732 : case V16HF_FTYPE_V16HF_V16HF_INT_V16HF_UHI:
13217 732 : case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI:
13218 732 : nargs = 5;
13219 732 : mask_pos = 1;
13220 732 : nargs_constant = 2;
13221 732 : break;
13222 :
13223 0 : default:
13224 0 : gcc_unreachable ();
13225 : }
13226 :
13227 56327 : gcc_assert (nargs <= ARRAY_SIZE (xops));
13228 :
13229 61704 : if (comparison != UNKNOWN)
13230 : {
13231 614 : gcc_assert (nargs == 2);
13232 614 : return ix86_expand_sse_compare (d, exp, target, swap);
13233 : }
13234 :
13235 61090 : if (rmode == VOIDmode || rmode == tmode)
13236 : {
13237 60905 : if (optimize
13238 17726 : || target == 0
13239 17726 : || GET_MODE (target) != tmode
13240 78429 : || !insn_p->operand[0].predicate (target, tmode))
13241 43469 : target = gen_reg_rtx (tmode);
13242 17436 : else if (memory_operand (target, tmode))
13243 578 : num_memory++;
13244 : real_target = target;
13245 : }
13246 : else
13247 : {
13248 185 : real_target = gen_reg_rtx (tmode);
13249 185 : target = lowpart_subreg (rmode, real_target, tmode);
13250 : }
13251 :
13252 261387 : for (i = 0; i < nargs; i++)
13253 : {
13254 200530 : tree arg = CALL_EXPR_ARG (exp, i);
13255 200530 : rtx op = ix86_expand_unsigned_small_int_cst_argument (arg);
13256 200530 : machine_mode mode = insn_p->operand[i + 1].mode;
13257 : /* Need to fixup modeless constant before testing predicate. */
13258 200530 : op = fixup_modeless_constant (op, mode);
13259 200530 : bool match = insn_p->operand[i + 1].predicate (op, mode);
13260 :
13261 200530 : if (second_arg_count && i == 1)
13262 : {
13263 : /* SIMD shift insns take either an 8-bit immediate or
13264 : register as count. But builtin functions take int as
13265 : count. If count doesn't match, we put it in register.
13266 : The instructions are using 64-bit count, if op is just
13267 : 32-bit, zero-extend it, as negative shift counts
13268 : are undefined behavior and zero-extension is more
13269 : efficient. */
13270 2889 : if (!match)
13271 : {
13272 1750 : if (SCALAR_INT_MODE_P (GET_MODE (op)))
13273 489 : op = convert_modes (mode, GET_MODE (op), op, 1);
13274 : else
13275 1261 : op = lowpart_subreg (mode, op, GET_MODE (op));
13276 1750 : if (!insn_p->operand[i + 1].predicate (op, mode))
13277 190 : op = copy_to_reg (op);
13278 : }
13279 : }
13280 197641 : else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
13281 149585 : (!mask_pos && (nargs - i) <= nargs_constant))
13282 : {
13283 16466 : if (!match)
13284 233 : switch (icode)
13285 : {
13286 2 : case CODE_FOR_avx_vinsertf128v4di:
13287 2 : case CODE_FOR_avx_vextractf128v4di:
13288 2 : error ("the last argument must be an 1-bit immediate");
13289 2 : return const0_rtx;
13290 :
13291 8 : case CODE_FOR_avx512f_cmpv8di3_mask:
13292 8 : case CODE_FOR_avx512f_cmpv16si3_mask:
13293 8 : case CODE_FOR_avx512f_ucmpv8di3_mask:
13294 8 : case CODE_FOR_avx512f_ucmpv16si3_mask:
13295 8 : case CODE_FOR_avx512vl_cmpv4di3_mask:
13296 8 : case CODE_FOR_avx512vl_cmpv8si3_mask:
13297 8 : case CODE_FOR_avx512vl_ucmpv4di3_mask:
13298 8 : case CODE_FOR_avx512vl_ucmpv8si3_mask:
13299 8 : case CODE_FOR_avx512vl_cmpv2di3_mask:
13300 8 : case CODE_FOR_avx512vl_cmpv4si3_mask:
13301 8 : case CODE_FOR_avx512vl_ucmpv2di3_mask:
13302 8 : case CODE_FOR_avx512vl_ucmpv4si3_mask:
13303 8 : error ("the last argument must be a 3-bit immediate");
13304 8 : return const0_rtx;
13305 :
13306 24 : case CODE_FOR_sse4_1_roundsd:
13307 24 : case CODE_FOR_sse4_1_roundss:
13308 :
13309 24 : case CODE_FOR_sse4_1_roundpd:
13310 24 : case CODE_FOR_sse4_1_roundps:
13311 24 : case CODE_FOR_avx_roundpd256:
13312 24 : case CODE_FOR_avx_roundps256:
13313 :
13314 24 : case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
13315 24 : case CODE_FOR_sse4_1_roundps_sfix:
13316 24 : case CODE_FOR_avx_roundpd_vec_pack_sfix256:
13317 24 : case CODE_FOR_avx_roundps_sfix256:
13318 :
13319 24 : case CODE_FOR_sse4_1_blendps:
13320 24 : case CODE_FOR_avx_blendpd256:
13321 24 : case CODE_FOR_avx_vpermilv4df:
13322 24 : case CODE_FOR_avx_vpermilv4df_mask:
13323 24 : case CODE_FOR_avx512f_getmantv8df_mask:
13324 24 : case CODE_FOR_avx512f_getmantv16sf_mask:
13325 24 : case CODE_FOR_avx512vl_getmantv16hf_mask:
13326 24 : case CODE_FOR_avx512vl_getmantv8sf_mask:
13327 24 : case CODE_FOR_avx512vl_getmantv4df_mask:
13328 24 : case CODE_FOR_avx512fp16_getmantv8hf_mask:
13329 24 : case CODE_FOR_avx512vl_getmantv4sf_mask:
13330 24 : case CODE_FOR_avx512vl_getmantv2df_mask:
13331 24 : case CODE_FOR_avx512dq_rangepv8df_mask_round:
13332 24 : case CODE_FOR_avx512dq_rangepv16sf_mask_round:
13333 24 : case CODE_FOR_avx512dq_rangepv4df_mask:
13334 24 : case CODE_FOR_avx512dq_rangepv8sf_mask:
13335 24 : case CODE_FOR_avx512dq_rangepv2df_mask:
13336 24 : case CODE_FOR_avx512dq_rangepv4sf_mask:
13337 24 : case CODE_FOR_avx_shufpd256_mask:
13338 24 : error ("the last argument must be a 4-bit immediate");
13339 24 : return const0_rtx;
13340 :
13341 15 : case CODE_FOR_sha1rnds4:
13342 15 : case CODE_FOR_sse4_1_blendpd:
13343 15 : case CODE_FOR_avx_vpermilv2df:
13344 15 : case CODE_FOR_avx_vpermilv2df_mask:
13345 15 : case CODE_FOR_xop_vpermil2v2df3:
13346 15 : case CODE_FOR_xop_vpermil2v4sf3:
13347 15 : case CODE_FOR_xop_vpermil2v4df3:
13348 15 : case CODE_FOR_xop_vpermil2v8sf3:
13349 15 : case CODE_FOR_avx512f_vinsertf32x4_mask:
13350 15 : case CODE_FOR_avx512f_vinserti32x4_mask:
13351 15 : case CODE_FOR_avx512f_vextractf32x4_mask:
13352 15 : case CODE_FOR_avx512f_vextracti32x4_mask:
13353 15 : case CODE_FOR_sse2_shufpd:
13354 15 : case CODE_FOR_sse2_shufpd_mask:
13355 15 : case CODE_FOR_avx512dq_shuf_f64x2_mask:
13356 15 : case CODE_FOR_avx512dq_shuf_i64x2_mask:
13357 15 : case CODE_FOR_avx512vl_shuf_i32x4_mask:
13358 15 : case CODE_FOR_avx512vl_shuf_f32x4_mask:
13359 15 : error ("the last argument must be a 2-bit immediate");
13360 15 : return const0_rtx;
13361 :
13362 30 : case CODE_FOR_avx_vextractf128v4df:
13363 30 : case CODE_FOR_avx_vextractf128v8sf:
13364 30 : case CODE_FOR_avx_vextractf128v8si:
13365 30 : case CODE_FOR_avx_vinsertf128v4df:
13366 30 : case CODE_FOR_avx_vinsertf128v8sf:
13367 30 : case CODE_FOR_avx_vinsertf128v8si:
13368 30 : case CODE_FOR_avx512f_vinsertf64x4_mask:
13369 30 : case CODE_FOR_avx512f_vinserti64x4_mask:
13370 30 : case CODE_FOR_avx512f_vextractf64x4_mask:
13371 30 : case CODE_FOR_avx512f_vextracti64x4_mask:
13372 30 : case CODE_FOR_avx512dq_vinsertf32x8_mask:
13373 30 : case CODE_FOR_avx512dq_vinserti32x8_mask:
13374 30 : case CODE_FOR_avx512vl_vinsertv4df:
13375 30 : case CODE_FOR_avx512vl_vinsertv4di:
13376 30 : case CODE_FOR_avx512vl_vinsertv8sf:
13377 30 : case CODE_FOR_avx512vl_vinsertv8si:
13378 30 : error ("the last argument must be a 1-bit immediate");
13379 30 : return const0_rtx;
13380 :
13381 16 : case CODE_FOR_avx_vmcmpv2df3:
13382 16 : case CODE_FOR_avx_vmcmpv4sf3:
13383 16 : case CODE_FOR_avx_cmpv2df3:
13384 16 : case CODE_FOR_avx_cmpv4sf3:
13385 16 : if (CONST_INT_P (op) && IN_RANGE (INTVAL (op), 8, 31))
13386 : {
13387 4 : error ("'%s' needs isa option %s", d->name, "-mavx");
13388 4 : return const0_rtx;
13389 : }
13390 : /* FALLTHRU */
13391 18 : case CODE_FOR_avx_cmpv4df3:
13392 18 : case CODE_FOR_avx_cmpv8sf3:
13393 18 : case CODE_FOR_avx512f_cmpv8df3_mask:
13394 18 : case CODE_FOR_avx512f_cmpv16sf3_mask:
13395 18 : case CODE_FOR_avx512f_vmcmpv2df3_mask:
13396 18 : case CODE_FOR_avx512f_vmcmpv4sf3_mask:
13397 18 : case CODE_FOR_avx512bw_cmpv32hf3_mask:
13398 18 : case CODE_FOR_avx512vl_cmpv16hf3_mask:
13399 18 : case CODE_FOR_avx512fp16_cmpv8hf3_mask:
13400 18 : error ("the last argument must be a 5-bit immediate");
13401 18 : return const0_rtx;
13402 :
13403 132 : default:
13404 132 : switch (nargs_constant)
13405 : {
13406 8 : case 2:
13407 8 : if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
13408 8 : (!mask_pos && (nargs - i) == nargs_constant))
13409 : {
13410 4 : error ("the next to last argument must be an 8-bit immediate");
13411 4 : break;
13412 : }
13413 : /* FALLTHRU */
13414 128 : case 1:
13415 128 : error ("the last argument must be an 8-bit immediate");
13416 128 : break;
13417 0 : default:
13418 0 : gcc_unreachable ();
13419 : }
13420 132 : return const0_rtx;
13421 : }
13422 : }
13423 : else
13424 : {
13425 181175 : if (VECTOR_MODE_P (mode))
13426 130554 : op = safe_vector_operand (op, mode);
13427 :
13428 : /* If we aren't optimizing, only allow one memory operand to
13429 : be generated. */
13430 181175 : if (memory_operand (op, mode))
13431 : {
13432 29863 : num_memory++;
13433 29863 : if (!optimize && num_memory > 1)
13434 13602 : op = copy_to_mode_reg (mode, op);
13435 : }
13436 :
13437 181175 : if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
13438 : {
13439 178885 : if (!match)
13440 42558 : op = copy_to_mode_reg (mode, op);
13441 : }
13442 : else
13443 : {
13444 2290 : op = copy_to_reg (op);
13445 2290 : op = lowpart_subreg (mode, op, GET_MODE (op));
13446 : }
13447 : }
13448 :
13449 200297 : xops[i] = op;
13450 : }
13451 :
13452 60857 : switch (nargs)
13453 : {
13454 4763 : case 1:
13455 4763 : pat = GEN_FCN (icode) (real_target, xops[0]);
13456 4763 : break;
13457 5663 : case 2:
13458 5663 : pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
13459 5663 : break;
13460 20627 : case 3:
13461 20627 : pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
13462 20627 : break;
13463 27064 : case 4:
13464 27064 : pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
13465 27064 : xops[2], xops[3]);
13466 27064 : break;
13467 2740 : case 5:
13468 2740 : pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
13469 2740 : xops[2], xops[3], xops[4]);
13470 2740 : break;
13471 : case 6:
13472 : pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
13473 : xops[2], xops[3], xops[4], xops[5]);
13474 : break;
13475 : default:
13476 : gcc_unreachable ();
13477 : }
13478 :
13479 60857 : if (! pat)
13480 : return 0;
13481 :
13482 60857 : emit_insn (pat);
13483 60857 : return target;
13484 : }
13485 :
13486 : /* Transform pattern of following layout:
13487 : (set A
13488 : (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
13489 : )
13490 : into:
13491 : (set (A B)) */
13492 :
13493 : static rtx
13494 4944 : ix86_erase_embedded_rounding (rtx pat)
13495 : {
13496 4944 : if (NONJUMP_INSN_P (pat))
13497 694 : pat = PATTERN (pat);
13498 :
13499 4944 : gcc_assert (GET_CODE (pat) == SET);
13500 4944 : rtx src = SET_SRC (pat);
13501 4944 : gcc_assert (XVECLEN (src, 0) == 2);
13502 4944 : rtx p0 = XVECEXP (src, 0, 0);
13503 4944 : gcc_assert (GET_CODE (src) == UNSPEC
13504 : && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
13505 4944 : rtx res = gen_rtx_SET (SET_DEST (pat), p0);
13506 4944 : return res;
13507 : }
13508 :
13509 : /* Subroutine of ix86_expand_round_builtin to take care of comi insns
13510 : with rounding. */
13511 : static rtx
13512 103 : ix86_expand_sse_comi_round (const struct builtin_description *d,
13513 : tree exp, rtx target, bool comx_ok)
13514 : {
13515 103 : rtx pat, set_dst;
13516 103 : tree arg0 = CALL_EXPR_ARG (exp, 0);
13517 103 : tree arg1 = CALL_EXPR_ARG (exp, 1);
13518 103 : tree arg2 = CALL_EXPR_ARG (exp, 2);
13519 103 : tree arg3 = CALL_EXPR_ARG (exp, 3);
13520 103 : rtx op0 = expand_normal (arg0);
13521 103 : rtx op1 = expand_normal (arg1);
13522 103 : rtx op2 = expand_normal (arg2);
13523 103 : rtx op3 = expand_normal (arg3);
13524 103 : enum insn_code icode = d->icode;
13525 103 : const struct insn_data_d *insn_p = &insn_data[icode];
13526 103 : machine_mode mode0 = insn_p->operand[0].mode;
13527 103 : machine_mode mode1 = insn_p->operand[1].mode;
13528 :
13529 : /* See avxintrin.h for values. */
13530 103 : static const enum rtx_code comparisons[32] =
13531 : {
13532 : EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
13533 : UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
13534 : EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
13535 : UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
13536 : };
13537 103 : static const bool ordereds[32] =
13538 : {
13539 : true, true, true, false, false, false, false, true,
13540 : false, false, false, true, true, true, true, false,
13541 : true, true, true, false, false, false, false, true,
13542 : false, false, false, true, true, true, true, false
13543 : };
13544 103 : static const bool non_signalings[32] =
13545 : {
13546 : true, false, false, true, true, false, false, true,
13547 : true, false, false, true, true, false, false, true,
13548 : false, true, true, false, false, true, true, false,
13549 : false, true, true, false, false, true, true, false
13550 : };
13551 :
13552 103 : if (!CONST_INT_P (op2))
13553 : {
13554 0 : error ("the third argument must be comparison constant");
13555 0 : return const0_rtx;
13556 : }
13557 103 : if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
13558 : {
13559 0 : error ("incorrect comparison mode");
13560 0 : return const0_rtx;
13561 : }
13562 :
13563 103 : if (!insn_p->operand[2].predicate (op3, SImode))
13564 : {
13565 4 : error ("incorrect rounding operand");
13566 4 : return const0_rtx;
13567 : }
13568 :
13569 99 : if (VECTOR_MODE_P (mode0))
13570 99 : op0 = safe_vector_operand (op0, mode0);
13571 99 : if (VECTOR_MODE_P (mode1))
13572 99 : op1 = safe_vector_operand (op1, mode1);
13573 :
13574 99 : enum rtx_code comparison = comparisons[INTVAL (op2)];
13575 99 : enum rtx_code orig_comp = comparison;
13576 99 : bool ordered = ordereds[INTVAL (op2)];
13577 99 : bool non_signaling = non_signalings[INTVAL (op2)];
13578 99 : rtx const_val = const0_rtx;
13579 :
13580 99 : bool check_unordered = false;
13581 99 : machine_mode mode = CCFPmode;
13582 99 : switch (comparison)
13583 : {
13584 8 : case ORDERED:
13585 8 : if (!ordered)
13586 : {
13587 4 : if (TARGET_AVX10_2 && comx_ok)
13588 : {
13589 : /* Unlike VCOMI{SH,SS,SD}, VCOMX{SH,SS,SD} will set SF
13590 : differently. So directly return true here. */
13591 0 : target = gen_reg_rtx (SImode);
13592 0 : emit_move_insn (target, const1_rtx);
13593 0 : return target;
13594 : }
13595 : else
13596 : {
13597 : /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US. */
13598 : if (!non_signaling)
13599 99 : ordered = true;
13600 99 : mode = CCSmode;
13601 : }
13602 : }
13603 : else
13604 : {
13605 : /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S. */
13606 : if (non_signaling)
13607 : ordered = false;
13608 : mode = CCPmode;
13609 : }
13610 : comparison = NE;
13611 : break;
13612 8 : case UNORDERED:
13613 8 : if (ordered)
13614 : {
13615 4 : if (TARGET_AVX10_2 && comx_ok)
13616 : {
13617 : /* Unlike VCOMI{SH,SS,SD}, VCOMX{SH,SS,SD} will set SF
13618 : differently. So directly return false here. */
13619 0 : target = gen_reg_rtx (SImode);
13620 0 : emit_move_insn (target, const0_rtx);
13621 0 : return target;
13622 : }
13623 : else
13624 : {
13625 : /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS. */
13626 : if (non_signaling)
13627 99 : ordered = false;
13628 : mode = CCSmode;
13629 : }
13630 : }
13631 : else
13632 : {
13633 : /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S. */
13634 : if (!non_signaling)
13635 99 : ordered = true;
13636 99 : mode = CCPmode;
13637 : }
13638 : comparison = EQ;
13639 : break;
13640 :
13641 40 : case LE: /* -> GE */
13642 40 : case LT: /* -> GT */
13643 40 : case UNGE: /* -> UNLE */
13644 40 : case UNGT: /* -> UNLT */
13645 40 : std::swap (op0, op1);
13646 40 : comparison = swap_condition (comparison);
13647 : /* FALLTHRU */
13648 68 : case GT:
13649 68 : case GE:
13650 68 : case UNEQ:
13651 68 : case UNLT:
13652 68 : case UNLE:
13653 68 : case LTGT:
13654 : /* These are supported by CCFPmode. NB: Use ordered/signaling
13655 : COMI or unordered/non-signaling UCOMI. Both set ZF, PF, CF
13656 : with NAN operands. */
13657 68 : if (ordered == non_signaling)
13658 : ordered = !ordered;
13659 : break;
13660 : /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
13661 : _CMP_EQ_OQ/_CMP_EQ_OS.
13662 : Under TARGET_AVX10_2, VCOMX/VUCOMX are always generated instead
13663 : of COMI/UCOMI, VCOMX/VUCOMX will not set ZF with NAN. */
13664 8 : case EQ:
13665 8 : if (!TARGET_AVX10_2 || !comx_ok)
13666 5 : check_unordered = true;
13667 : mode = CCZmode;
13668 : break;
13669 7 : case NE:
13670 : /* NB: COMI/UCOMI will set ZF with NAN operands. Use CCZmode for
13671 : _CMP_NEQ_UQ/_CMP_NEQ_US.
13672 : Under TARGET_AVX10_2, VCOMX/VUCOMX are always generated instead
13673 : of COMI/UCOMI, VCOMX/VUCOMX will not set ZF with NAN. */
13674 7 : gcc_assert (!ordered);
13675 7 : if (!TARGET_AVX10_2 || !comx_ok)
13676 4 : check_unordered = true;
13677 7 : mode = CCZmode;
13678 7 : const_val = const1_rtx;
13679 7 : break;
13680 0 : default:
13681 0 : gcc_unreachable ();
13682 : }
13683 :
13684 99 : target = gen_reg_rtx (SImode);
13685 99 : emit_move_insn (target, const_val);
13686 99 : target = gen_rtx_SUBREG (QImode, target, 0);
13687 :
13688 93 : if ((optimize && !register_operand (op0, mode0))
13689 192 : || !insn_p->operand[0].predicate (op0, mode0))
13690 6 : op0 = copy_to_mode_reg (mode0, op0);
13691 93 : if ((optimize && !register_operand (op1, mode1))
13692 192 : || !insn_p->operand[1].predicate (op1, mode1))
13693 6 : op1 = copy_to_mode_reg (mode1, op1);
13694 :
13695 : /* Generate comx instead of comi when EQ/NE to avoid NAN checks.
13696 : Use orig_comp to exclude ORDERED/UNORDERED cases. */
13697 99 : if ((orig_comp == EQ || orig_comp == NE)
13698 15 : && TARGET_AVX10_2 && comx_ok)
13699 : {
13700 6 : switch (icode)
13701 : {
13702 : case CODE_FOR_avx512fp16_comi_round:
13703 99 : icode = CODE_FOR_avx10_2_comxhf_round;
13704 : break;
13705 4 : case CODE_FOR_sse_comi_round:
13706 4 : icode = CODE_FOR_avx10_2_comxsf_round;
13707 4 : break;
13708 2 : case CODE_FOR_sse2_comi_round:
13709 2 : icode = CODE_FOR_avx10_2_comxdf_round;
13710 2 : break;
13711 :
13712 : default:
13713 : break;
13714 : }
13715 : }
13716 :
13717 : /* Generate comi instead of comx when UNEQ/LTGT to avoid NAN checks. */
13718 99 : if ((comparison == UNEQ || comparison == LTGT)
13719 8 : && TARGET_AVX10_2 && comx_ok)
13720 : {
13721 0 : switch (icode)
13722 : {
13723 : case CODE_FOR_avx10_2_comxhf_round:
13724 99 : icode = CODE_FOR_avx512fp16_comi_round;
13725 : break;
13726 0 : case CODE_FOR_avx10_2_comxsf_round:
13727 0 : icode = CODE_FOR_sse_comi_round;
13728 0 : break;
13729 0 : case CODE_FOR_avx10_2_comxdf_round:
13730 0 : icode = CODE_FOR_sse2_comi_round;
13731 0 : break;
13732 :
13733 : default:
13734 : break;
13735 : }
13736 : }
13737 :
13738 : /*
13739 : 1. COMI/VCOMX: ordered and signaling.
13740 : 2. UCOMI/VUCOMX: unordered and non-signaling.
13741 : */
13742 99 : if (non_signaling)
13743 38 : switch (icode)
13744 : {
13745 : case CODE_FOR_sse_comi_round:
13746 : icode = CODE_FOR_sse_ucomi_round;
13747 : break;
13748 17 : case CODE_FOR_sse2_comi_round:
13749 17 : icode = CODE_FOR_sse2_ucomi_round;
13750 17 : break;
13751 0 : case CODE_FOR_avx512fp16_comi_round:
13752 0 : icode = CODE_FOR_avx512fp16_ucomi_round;
13753 0 : break;
13754 3 : case CODE_FOR_avx10_2_comxsf_round:
13755 3 : icode = CODE_FOR_avx10_2_ucomxsf_round;
13756 3 : break;
13757 0 : case CODE_FOR_avx10_2_comxhf_round:
13758 0 : icode = CODE_FOR_avx10_2_ucomxhf_round;
13759 0 : break;
13760 1 : case CODE_FOR_avx10_2_comxdf_round:
13761 1 : icode = CODE_FOR_avx10_2_ucomxdf_round;
13762 1 : break;
13763 0 : default:
13764 0 : gcc_unreachable ();
13765 : }
13766 :
13767 99 : pat = GEN_FCN (icode) (op0, op1, op3);
13768 99 : if (! pat)
13769 : return 0;
13770 :
13771 : /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point. */
13772 99 : if (INTVAL (op3) == NO_ROUND)
13773 : {
13774 1 : pat = ix86_erase_embedded_rounding (pat);
13775 1 : if (! pat)
13776 : return 0;
13777 :
13778 1 : set_dst = SET_DEST (pat);
13779 : }
13780 : else
13781 : {
13782 98 : gcc_assert (GET_CODE (pat) == SET);
13783 98 : set_dst = SET_DEST (pat);
13784 : }
13785 :
13786 99 : emit_insn (pat);
13787 :
13788 99 : return ix86_ssecom_setcc (comparison, check_unordered, mode,
13789 99 : set_dst, target);
13790 : }
13791 :
13792 : static rtx
13793 15589 : ix86_expand_round_builtin (const struct builtin_description *d,
13794 : tree exp, rtx target)
13795 : {
13796 15589 : rtx pat;
13797 15589 : unsigned int i, nargs;
13798 15589 : rtx xops[6];
13799 15589 : enum insn_code icode = d->icode;
13800 15589 : const struct insn_data_d *insn_p = &insn_data[icode];
13801 15589 : machine_mode tmode = insn_p->operand[0].mode;
13802 15589 : unsigned int nargs_constant = 0;
13803 15589 : unsigned int redundant_embed_rnd = 0;
13804 :
13805 15589 : switch ((enum ix86_builtin_func_type) d->flag)
13806 : {
13807 : case UINT64_FTYPE_V2DF_INT:
13808 : case UINT64_FTYPE_V4SF_INT:
13809 : case UINT64_FTYPE_V8HF_INT:
13810 : case UINT_FTYPE_V2DF_INT:
13811 : case UINT_FTYPE_V4SF_INT:
13812 : case UINT_FTYPE_V8HF_INT:
13813 : case INT64_FTYPE_V2DF_INT:
13814 : case INT64_FTYPE_V4SF_INT:
13815 : case INT64_FTYPE_V8HF_INT:
13816 : case INT_FTYPE_V2DF_INT:
13817 : case INT_FTYPE_V4SF_INT:
13818 : case INT_FTYPE_V8HF_INT:
13819 : nargs = 2;
13820 : break;
13821 651 : case V32HF_FTYPE_V32HF_V32HF_INT:
13822 651 : case V8HF_FTYPE_V8HF_V8HF_INT:
13823 651 : case V8HF_FTYPE_V8HF_INT_INT:
13824 651 : case V8HF_FTYPE_V8HF_UINT_INT:
13825 651 : case V8HF_FTYPE_V8HF_INT64_INT:
13826 651 : case V8HF_FTYPE_V8HF_UINT64_INT:
13827 651 : case V4SF_FTYPE_V4SF_UINT_INT:
13828 651 : case V4SF_FTYPE_V4SF_UINT64_INT:
13829 651 : case V2DF_FTYPE_V2DF_UINT64_INT:
13830 651 : case V4SF_FTYPE_V4SF_INT_INT:
13831 651 : case V4SF_FTYPE_V4SF_INT64_INT:
13832 651 : case V2DF_FTYPE_V2DF_INT64_INT:
13833 651 : case V4SF_FTYPE_V4SF_V4SF_INT:
13834 651 : case V2DF_FTYPE_V2DF_V2DF_INT:
13835 651 : case V4SF_FTYPE_V4SF_V2DF_INT:
13836 651 : case V2DF_FTYPE_V2DF_V4SF_INT:
13837 651 : nargs = 3;
13838 651 : break;
13839 4554 : case V8SF_FTYPE_V8DF_V8SF_QI_INT:
13840 4554 : case V8DF_FTYPE_V8DF_V8DF_QI_INT:
13841 4554 : case V32HI_FTYPE_V32HF_V32HI_USI_INT:
13842 4554 : case V32HI_FTYPE_V32BF_V32HI_USI_INT:
13843 4554 : case V8SI_FTYPE_V8DF_V8SI_QI_INT:
13844 4554 : case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
13845 4554 : case V8DI_FTYPE_V8DF_V8DI_QI_INT:
13846 4554 : case V8SF_FTYPE_V8DI_V8SF_QI_INT:
13847 4554 : case V8DF_FTYPE_V8DI_V8DF_QI_INT:
13848 4554 : case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
13849 4554 : case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
13850 4554 : case V32HF_FTYPE_V32HI_V32HF_USI_INT:
13851 4554 : case V32HF_FTYPE_V32HF_V32HF_USI_INT:
13852 4554 : case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
13853 4554 : case V16SF_FTYPE_V16SF_V16SF_HI_INT:
13854 4554 : case V8DI_FTYPE_V8SF_V8DI_QI_INT:
13855 4554 : case V16SF_FTYPE_V16SI_V16SF_HI_INT:
13856 4554 : case V16SI_FTYPE_V16SF_V16SI_HI_INT:
13857 4554 : case V16SI_FTYPE_V16SF_V16SI_UHI_INT:
13858 4554 : case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
13859 4554 : case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
13860 4554 : case V8DF_FTYPE_V8SF_V8DF_QI_INT:
13861 4554 : case V16SF_FTYPE_V16HI_V16SF_HI_INT:
13862 4554 : case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
13863 4554 : case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
13864 4554 : case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
13865 4554 : case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
13866 4554 : case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
13867 4554 : case V16HI_FTYPE_V16BF_V16HI_UHI_INT:
13868 4554 : case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
13869 4554 : nargs = 4;
13870 4554 : break;
13871 180 : case V4SF_FTYPE_V4SF_V4SF_INT_INT:
13872 180 : case V2DF_FTYPE_V2DF_V2DF_INT_INT:
13873 180 : nargs_constant = 2;
13874 180 : nargs = 4;
13875 180 : break;
13876 103 : case INT_FTYPE_V4SF_V4SF_INT_INT:
13877 103 : case INT_FTYPE_V2DF_V2DF_INT_INT:
13878 103 : return ix86_expand_sse_comi_round (d, exp, target, true);
13879 6233 : case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
13880 6233 : case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
13881 6233 : case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
13882 6233 : case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
13883 6233 : case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
13884 6233 : case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
13885 6233 : case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
13886 6233 : case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
13887 6233 : case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
13888 6233 : case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
13889 6233 : case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
13890 6233 : case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
13891 6233 : case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
13892 6233 : case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
13893 6233 : case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
13894 6233 : case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
13895 6233 : case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
13896 6233 : case V32HF_FTYPE_V16SF_V16SF_V32HF_USI_INT:
13897 6233 : nargs = 5;
13898 6233 : break;
13899 635 : case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
13900 635 : case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
13901 635 : case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
13902 635 : case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
13903 635 : case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
13904 635 : nargs_constant = 4;
13905 635 : nargs = 5;
13906 635 : break;
13907 1181 : case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
13908 1181 : case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
13909 1181 : case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
13910 1181 : case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
13911 1181 : case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
13912 1181 : case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
13913 1181 : nargs_constant = 3;
13914 1181 : nargs = 5;
13915 1181 : break;
13916 1071 : case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
13917 1071 : case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
13918 1071 : case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
13919 1071 : case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
13920 1071 : case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
13921 1071 : case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
13922 1071 : case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
13923 1071 : case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI_INT:
13924 1071 : case V32HF_FTYPE_V32HF_V32HF_INT_V32HF_USI_INT:
13925 1071 : case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI_INT:
13926 1071 : nargs = 6;
13927 1071 : nargs_constant = 4;
13928 1071 : break;
13929 252 : case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
13930 252 : case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
13931 252 : case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
13932 252 : case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
13933 252 : nargs = 6;
13934 252 : nargs_constant = 3;
13935 252 : break;
13936 0 : default:
13937 0 : gcc_unreachable ();
13938 : }
13939 14757 : gcc_assert (nargs <= ARRAY_SIZE (xops));
13940 :
13941 15486 : if (optimize
13942 4265 : || target == 0
13943 4265 : || GET_MODE (target) != tmode
13944 19751 : || !insn_p->operand[0].predicate (target, tmode))
13945 11221 : target = gen_reg_rtx (tmode);
13946 :
13947 85365 : for (i = 0; i < nargs; i++)
13948 : {
13949 70434 : tree arg = CALL_EXPR_ARG (exp, i);
13950 70434 : rtx op = ix86_expand_unsigned_small_int_cst_argument (arg);
13951 70434 : machine_mode mode = insn_p->operand[i + 1].mode;
13952 70434 : bool match = insn_p->operand[i + 1].predicate (op, mode);
13953 :
13954 70434 : if (i == nargs - nargs_constant)
13955 : {
13956 3319 : if (!match)
13957 : {
13958 40 : switch (icode)
13959 : {
13960 12 : case CODE_FOR_avx512f_getmantv8df_mask_round:
13961 12 : case CODE_FOR_avx512f_getmantv16sf_mask_round:
13962 12 : case CODE_FOR_avx512bw_getmantv32hf_mask_round:
13963 12 : case CODE_FOR_avx512f_vgetmantv2df_round:
13964 12 : case CODE_FOR_avx512f_vgetmantv2df_mask_round:
13965 12 : case CODE_FOR_avx512f_vgetmantv4sf_round:
13966 12 : case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
13967 12 : case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
13968 12 : error ("the immediate argument must be a 4-bit immediate");
13969 12 : return const0_rtx;
13970 8 : case CODE_FOR_avx512f_cmpv8df3_mask_round:
13971 8 : case CODE_FOR_avx512f_cmpv16sf3_mask_round:
13972 8 : case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
13973 8 : case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
13974 8 : case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
13975 8 : case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
13976 8 : error ("the immediate argument must be a 5-bit immediate");
13977 8 : return const0_rtx;
13978 20 : default:
13979 20 : error ("the immediate argument must be an 8-bit immediate");
13980 20 : return const0_rtx;
13981 : }
13982 : }
13983 : }
13984 67115 : else if (i == nargs-1)
13985 : {
13986 15446 : if (!insn_p->operand[nargs].predicate (op, SImode))
13987 : {
13988 515 : error ("incorrect rounding operand");
13989 515 : return const0_rtx;
13990 : }
13991 :
13992 : /* If there is no rounding use normal version of the pattern. */
13993 14931 : if (INTVAL (op) == NO_ROUND)
13994 : {
13995 : /* Skip erasing embedded rounding for below expanders who
13996 : generates multiple insns. In ix86_erase_embedded_rounding
13997 : the pattern will be transformed to a single set, and emit_insn
13998 : appends the set instead of insert it to chain. So the insns
13999 : emitted inside define_expander would be ignored. */
14000 4975 : switch (icode)
14001 : {
14002 : case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
14003 : case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
14004 : case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
14005 : case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
14006 : case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
14007 : case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
14008 : redundant_embed_rnd = 0;
14009 : break;
14010 4943 : default:
14011 4943 : redundant_embed_rnd = 1;
14012 4943 : break;
14013 : }
14014 : }
14015 : }
14016 : else
14017 : {
14018 51669 : if (VECTOR_MODE_P (mode))
14019 37752 : op = safe_vector_operand (op, mode);
14020 :
14021 51669 : op = fixup_modeless_constant (op, mode);
14022 :
14023 51669 : if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
14024 : {
14025 51669 : if (optimize || !match)
14026 45341 : op = copy_to_mode_reg (mode, op);
14027 : }
14028 : else
14029 : {
14030 0 : op = copy_to_reg (op);
14031 0 : op = lowpart_subreg (mode, op, GET_MODE (op));
14032 : }
14033 : }
14034 :
14035 69879 : xops[i] = op;
14036 : }
14037 :
14038 14931 : switch (nargs)
14039 : {
14040 : case 1:
14041 : pat = GEN_FCN (icode) (target, xops[0]);
14042 : break;
14043 696 : case 2:
14044 696 : pat = GEN_FCN (icode) (target, xops[0], xops[1]);
14045 696 : break;
14046 607 : case 3:
14047 607 : pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
14048 607 : break;
14049 4610 : case 4:
14050 4610 : pat = GEN_FCN (icode) (target, xops[0], xops[1],
14051 4610 : xops[2], xops[3]);
14052 4610 : break;
14053 7745 : case 5:
14054 7745 : pat = GEN_FCN (icode) (target, xops[0], xops[1],
14055 7745 : xops[2], xops[3], xops[4]);
14056 7745 : break;
14057 1273 : case 6:
14058 1273 : pat = GEN_FCN (icode) (target, xops[0], xops[1],
14059 1273 : xops[2], xops[3], xops[4], xops[5]);
14060 1273 : break;
14061 : default:
14062 : gcc_unreachable ();
14063 : }
14064 :
14065 14931 : if (!pat)
14066 : return 0;
14067 :
14068 14931 : if (redundant_embed_rnd)
14069 4943 : pat = ix86_erase_embedded_rounding (pat);
14070 :
14071 14931 : emit_insn (pat);
14072 14931 : return target;
14073 : }
14074 :
14075 : /* Subroutine of ix86_expand_builtin to take care of special insns
14076 : with variable number of operands. */
14077 :
14078 : static rtx
14079 27184 : ix86_expand_special_args_builtin (const struct builtin_description *d,
14080 : tree exp, rtx target)
14081 : {
14082 27184 : tree arg;
14083 27184 : rtx pat, op;
14084 27184 : unsigned int i, nargs, arg_adjust, memory;
14085 27184 : unsigned int constant = 100;
14086 27184 : bool aligned_mem = false;
14087 27184 : rtx xops[4];
14088 27184 : enum insn_code icode = d->icode;
14089 27184 : const struct insn_data_d *insn_p = &insn_data[icode];
14090 27184 : machine_mode tmode = insn_p->operand[0].mode;
14091 27184 : enum { load, store } klass;
14092 :
14093 27184 : switch ((enum ix86_builtin_func_type) d->flag)
14094 : {
14095 15371 : case VOID_FTYPE_VOID:
14096 15371 : emit_insn (GEN_FCN (icode) (target));
14097 15371 : return 0;
14098 : case VOID_FTYPE_UINT64:
14099 : case VOID_FTYPE_UNSIGNED:
14100 : nargs = 0;
14101 : klass = store;
14102 : memory = 0;
14103 : break;
14104 :
14105 7581 : case INT_FTYPE_VOID:
14106 7581 : case USHORT_FTYPE_VOID:
14107 7581 : case UINT64_FTYPE_VOID:
14108 7581 : case UINT_FTYPE_VOID:
14109 7581 : case UINT8_FTYPE_VOID:
14110 7581 : case UNSIGNED_FTYPE_VOID:
14111 7581 : nargs = 0;
14112 7581 : klass = load;
14113 7581 : memory = 0;
14114 7581 : break;
14115 359 : case CHAR_FTYPE_PCCHAR:
14116 359 : case SHORT_FTYPE_PCSHORT:
14117 359 : case INT_FTYPE_PCINT:
14118 359 : case INT64_FTYPE_PCINT64:
14119 359 : case UINT64_FTYPE_PUNSIGNED:
14120 359 : case V2DI_FTYPE_PV2DI:
14121 359 : case V4DI_FTYPE_PV4DI:
14122 359 : case V32QI_FTYPE_PCCHAR:
14123 359 : case V16QI_FTYPE_PCCHAR:
14124 359 : case V8SF_FTYPE_PCV4SF:
14125 359 : case V8SF_FTYPE_PCFLOAT:
14126 359 : case V4SF_FTYPE_PCFLOAT:
14127 359 : case V4SF_FTYPE_PCFLOAT16:
14128 359 : case V4SF_FTYPE_PCBFLOAT16:
14129 359 : case V4SF_FTYPE_PCV8BF:
14130 359 : case V4SF_FTYPE_PCV8HF:
14131 359 : case V8SF_FTYPE_PCFLOAT16:
14132 359 : case V8SF_FTYPE_PCBFLOAT16:
14133 359 : case V8SF_FTYPE_PCV16HF:
14134 359 : case V8SF_FTYPE_PCV16BF:
14135 359 : case V4DF_FTYPE_PCV2DF:
14136 359 : case V4DF_FTYPE_PCDOUBLE:
14137 359 : case V2DF_FTYPE_PCDOUBLE:
14138 359 : case VOID_FTYPE_PVOID:
14139 359 : case V8DI_FTYPE_PV8DI:
14140 359 : nargs = 1;
14141 359 : klass = load;
14142 359 : memory = 0;
14143 359 : switch (icode)
14144 : {
14145 : case CODE_FOR_sse4_1_movntdqa:
14146 : case CODE_FOR_avx2_movntdqa:
14147 : case CODE_FOR_avx512f_movntdqa:
14148 : aligned_mem = true;
14149 : break;
14150 : default:
14151 : break;
14152 : }
14153 : break;
14154 371 : case VOID_FTYPE_PV2SF_V4SF:
14155 371 : case VOID_FTYPE_PV8DI_V8DI:
14156 371 : case VOID_FTYPE_PV4DI_V4DI:
14157 371 : case VOID_FTYPE_PV2DI_V2DI:
14158 371 : case VOID_FTYPE_PCHAR_V32QI:
14159 371 : case VOID_FTYPE_PCHAR_V16QI:
14160 371 : case VOID_FTYPE_PFLOAT_V16SF:
14161 371 : case VOID_FTYPE_PFLOAT_V8SF:
14162 371 : case VOID_FTYPE_PFLOAT_V4SF:
14163 371 : case VOID_FTYPE_PDOUBLE_V8DF:
14164 371 : case VOID_FTYPE_PDOUBLE_V4DF:
14165 371 : case VOID_FTYPE_PDOUBLE_V2DF:
14166 371 : case VOID_FTYPE_PLONGLONG_LONGLONG:
14167 371 : case VOID_FTYPE_PULONGLONG_ULONGLONG:
14168 371 : case VOID_FTYPE_PUNSIGNED_UNSIGNED:
14169 371 : case VOID_FTYPE_PINT_INT:
14170 371 : nargs = 1;
14171 371 : klass = store;
14172 : /* Reserve memory operand for target. */
14173 371 : memory = ARRAY_SIZE (xops);
14174 371 : switch (icode)
14175 : {
14176 : /* These builtins and instructions require the memory
14177 : to be properly aligned. */
14178 : case CODE_FOR_avx_movntv4di:
14179 : case CODE_FOR_sse2_movntv2di:
14180 : case CODE_FOR_avx_movntv8sf:
14181 : case CODE_FOR_sse_movntv4sf:
14182 : case CODE_FOR_sse4a_vmmovntv4sf:
14183 : case CODE_FOR_avx_movntv4df:
14184 : case CODE_FOR_sse2_movntv2df:
14185 : case CODE_FOR_sse4a_vmmovntv2df:
14186 : case CODE_FOR_sse2_movntidi:
14187 : case CODE_FOR_sse_movntq:
14188 : case CODE_FOR_sse2_movntisi:
14189 : case CODE_FOR_avx512f_movntv16sf:
14190 : case CODE_FOR_avx512f_movntv8df:
14191 : case CODE_FOR_avx512f_movntv8di:
14192 : aligned_mem = true;
14193 : break;
14194 : default:
14195 : break;
14196 : }
14197 : break;
14198 0 : case VOID_FTYPE_PVOID_PCVOID:
14199 0 : nargs = 1;
14200 0 : klass = store;
14201 0 : memory = 0;
14202 :
14203 0 : break;
14204 26 : case V4SF_FTYPE_V4SF_PCV2SF:
14205 26 : case V2DF_FTYPE_V2DF_PCDOUBLE:
14206 26 : nargs = 2;
14207 26 : klass = load;
14208 26 : memory = 1;
14209 26 : break;
14210 93 : case V8SF_FTYPE_PCV8SF_V8SI:
14211 93 : case V4DF_FTYPE_PCV4DF_V4DI:
14212 93 : case V4SF_FTYPE_PCV4SF_V4SI:
14213 93 : case V2DF_FTYPE_PCV2DF_V2DI:
14214 93 : case V8SI_FTYPE_PCV8SI_V8SI:
14215 93 : case V4DI_FTYPE_PCV4DI_V4DI:
14216 93 : case V4SI_FTYPE_PCV4SI_V4SI:
14217 93 : case V2DI_FTYPE_PCV2DI_V2DI:
14218 93 : case VOID_FTYPE_INT_INT64:
14219 93 : nargs = 2;
14220 93 : klass = load;
14221 93 : memory = 0;
14222 93 : break;
14223 360 : case VOID_FTYPE_PV8DF_V8DF_UQI:
14224 360 : case VOID_FTYPE_PV4DF_V4DF_UQI:
14225 360 : case VOID_FTYPE_PV2DF_V2DF_UQI:
14226 360 : case VOID_FTYPE_PV16SF_V16SF_UHI:
14227 360 : case VOID_FTYPE_PV8SF_V8SF_UQI:
14228 360 : case VOID_FTYPE_PV4SF_V4SF_UQI:
14229 360 : case VOID_FTYPE_PV8DI_V8DI_UQI:
14230 360 : case VOID_FTYPE_PV4DI_V4DI_UQI:
14231 360 : case VOID_FTYPE_PV2DI_V2DI_UQI:
14232 360 : case VOID_FTYPE_PV16SI_V16SI_UHI:
14233 360 : case VOID_FTYPE_PV8SI_V8SI_UQI:
14234 360 : case VOID_FTYPE_PV4SI_V4SI_UQI:
14235 360 : case VOID_FTYPE_PV64QI_V64QI_UDI:
14236 360 : case VOID_FTYPE_PV32HI_V32HI_USI:
14237 360 : case VOID_FTYPE_PV32QI_V32QI_USI:
14238 360 : case VOID_FTYPE_PV16QI_V16QI_UHI:
14239 360 : case VOID_FTYPE_PV16HI_V16HI_UHI:
14240 360 : case VOID_FTYPE_PV8HI_V8HI_UQI:
14241 360 : switch (icode)
14242 : {
14243 : /* These builtins and instructions require the memory
14244 : to be properly aligned. */
14245 : case CODE_FOR_avx512f_storev16sf_mask:
14246 : case CODE_FOR_avx512f_storev16si_mask:
14247 : case CODE_FOR_avx512f_storev8df_mask:
14248 : case CODE_FOR_avx512f_storev8di_mask:
14249 : case CODE_FOR_avx512vl_storev8sf_mask:
14250 : case CODE_FOR_avx512vl_storev8si_mask:
14251 : case CODE_FOR_avx512vl_storev4df_mask:
14252 : case CODE_FOR_avx512vl_storev4di_mask:
14253 : case CODE_FOR_avx512vl_storev4sf_mask:
14254 : case CODE_FOR_avx512vl_storev4si_mask:
14255 : case CODE_FOR_avx512vl_storev2df_mask:
14256 : case CODE_FOR_avx512vl_storev2di_mask:
14257 11813 : aligned_mem = true;
14258 : break;
14259 : default:
14260 : break;
14261 : }
14262 : /* FALLTHRU */
14263 : case VOID_FTYPE_PV8SF_V8SI_V8SF:
14264 : case VOID_FTYPE_PV4DF_V4DI_V4DF:
14265 : case VOID_FTYPE_PV4SF_V4SI_V4SF:
14266 : case VOID_FTYPE_PV2DF_V2DI_V2DF:
14267 : case VOID_FTYPE_PV8SI_V8SI_V8SI:
14268 : case VOID_FTYPE_PV4DI_V4DI_V4DI:
14269 : case VOID_FTYPE_PV4SI_V4SI_V4SI:
14270 : case VOID_FTYPE_PV2DI_V2DI_V2DI:
14271 : case VOID_FTYPE_PV8SI_V8DI_UQI:
14272 : case VOID_FTYPE_PV8HI_V8DI_UQI:
14273 : case VOID_FTYPE_PV16HI_V16SI_UHI:
14274 : case VOID_FTYPE_PUDI_V8DI_UQI:
14275 : case VOID_FTYPE_PV16QI_V16SI_UHI:
14276 : case VOID_FTYPE_PV4SI_V4DI_UQI:
14277 : case VOID_FTYPE_PUDI_V2DI_UQI:
14278 : case VOID_FTYPE_PUDI_V4DI_UQI:
14279 : case VOID_FTYPE_PUSI_V2DI_UQI:
14280 : case VOID_FTYPE_PV8HI_V8SI_UQI:
14281 : case VOID_FTYPE_PUDI_V4SI_UQI:
14282 : case VOID_FTYPE_PUSI_V4DI_UQI:
14283 : case VOID_FTYPE_PUHI_V2DI_UQI:
14284 : case VOID_FTYPE_PUDI_V8SI_UQI:
14285 : case VOID_FTYPE_PUSI_V4SI_UQI:
14286 : case VOID_FTYPE_PCHAR_V64QI_UDI:
14287 : case VOID_FTYPE_PCHAR_V32QI_USI:
14288 : case VOID_FTYPE_PCHAR_V16QI_UHI:
14289 : case VOID_FTYPE_PSHORT_V32HI_USI:
14290 : case VOID_FTYPE_PSHORT_V16HI_UHI:
14291 : case VOID_FTYPE_PSHORT_V8HI_UQI:
14292 : case VOID_FTYPE_PINT_V16SI_UHI:
14293 : case VOID_FTYPE_PINT_V8SI_UQI:
14294 : case VOID_FTYPE_PINT_V4SI_UQI:
14295 : case VOID_FTYPE_PINT64_V8DI_UQI:
14296 : case VOID_FTYPE_PINT64_V4DI_UQI:
14297 : case VOID_FTYPE_PINT64_V2DI_UQI:
14298 : case VOID_FTYPE_PDOUBLE_V8DF_UQI:
14299 : case VOID_FTYPE_PDOUBLE_V4DF_UQI:
14300 : case VOID_FTYPE_PDOUBLE_V2DF_UQI:
14301 : case VOID_FTYPE_PFLOAT_V16SF_UHI:
14302 : case VOID_FTYPE_PFLOAT_V8SF_UQI:
14303 : case VOID_FTYPE_PFLOAT_V4SF_UQI:
14304 : case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
14305 : case VOID_FTYPE_PV32QI_V32HI_USI:
14306 : case VOID_FTYPE_PV16QI_V16HI_UHI:
14307 : case VOID_FTYPE_PUDI_V8HI_UQI:
14308 : nargs = 2;
14309 : klass = store;
14310 : /* Reserve memory operand for target. */
14311 : memory = ARRAY_SIZE (xops);
14312 : break;
14313 1243 : case V4SF_FTYPE_PCV4SF_V4SF_UQI:
14314 1243 : case V8SF_FTYPE_PCV8SF_V8SF_UQI:
14315 1243 : case V16SF_FTYPE_PCV16SF_V16SF_UHI:
14316 1243 : case V4SI_FTYPE_PCV4SI_V4SI_UQI:
14317 1243 : case V8SI_FTYPE_PCV8SI_V8SI_UQI:
14318 1243 : case V16SI_FTYPE_PCV16SI_V16SI_UHI:
14319 1243 : case V2DF_FTYPE_PCV2DF_V2DF_UQI:
14320 1243 : case V4DF_FTYPE_PCV4DF_V4DF_UQI:
14321 1243 : case V8DF_FTYPE_PCV8DF_V8DF_UQI:
14322 1243 : case V2DI_FTYPE_PCV2DI_V2DI_UQI:
14323 1243 : case V4DI_FTYPE_PCV4DI_V4DI_UQI:
14324 1243 : case V8DI_FTYPE_PCV8DI_V8DI_UQI:
14325 1243 : case V64QI_FTYPE_PCV64QI_V64QI_UDI:
14326 1243 : case V32HI_FTYPE_PCV32HI_V32HI_USI:
14327 1243 : case V32QI_FTYPE_PCV32QI_V32QI_USI:
14328 1243 : case V16QI_FTYPE_PCV16QI_V16QI_UHI:
14329 1243 : case V16HI_FTYPE_PCV16HI_V16HI_UHI:
14330 1243 : case V8HI_FTYPE_PCV8HI_V8HI_UQI:
14331 1243 : switch (icode)
14332 : {
14333 : /* These builtins and instructions require the memory
14334 : to be properly aligned. */
14335 : case CODE_FOR_avx512f_loadv16sf_mask:
14336 : case CODE_FOR_avx512f_loadv16si_mask:
14337 : case CODE_FOR_avx512f_loadv8df_mask:
14338 : case CODE_FOR_avx512f_loadv8di_mask:
14339 : case CODE_FOR_avx512vl_loadv8sf_mask:
14340 : case CODE_FOR_avx512vl_loadv8si_mask:
14341 : case CODE_FOR_avx512vl_loadv4df_mask:
14342 : case CODE_FOR_avx512vl_loadv4di_mask:
14343 : case CODE_FOR_avx512vl_loadv4sf_mask:
14344 : case CODE_FOR_avx512vl_loadv4si_mask:
14345 : case CODE_FOR_avx512vl_loadv2df_mask:
14346 : case CODE_FOR_avx512vl_loadv2di_mask:
14347 : case CODE_FOR_avx512bw_loadv64qi_mask:
14348 : case CODE_FOR_avx512vl_loadv32qi_mask:
14349 : case CODE_FOR_avx512vl_loadv16qi_mask:
14350 : case CODE_FOR_avx512bw_loadv32hi_mask:
14351 : case CODE_FOR_avx512vl_loadv16hi_mask:
14352 : case CODE_FOR_avx512vl_loadv8hi_mask:
14353 11813 : aligned_mem = true;
14354 : break;
14355 : default:
14356 : break;
14357 : }
14358 : /* FALLTHRU */
14359 : case V64QI_FTYPE_PCCHAR_V64QI_UDI:
14360 : case V32QI_FTYPE_PCCHAR_V32QI_USI:
14361 : case V16QI_FTYPE_PCCHAR_V16QI_UHI:
14362 : case V32HI_FTYPE_PCSHORT_V32HI_USI:
14363 : case V16HI_FTYPE_PCSHORT_V16HI_UHI:
14364 : case V8HI_FTYPE_PCSHORT_V8HI_UQI:
14365 : case V16SI_FTYPE_PCINT_V16SI_UHI:
14366 : case V8SI_FTYPE_PCINT_V8SI_UQI:
14367 : case V4SI_FTYPE_PCINT_V4SI_UQI:
14368 : case V8DI_FTYPE_PCINT64_V8DI_UQI:
14369 : case V4DI_FTYPE_PCINT64_V4DI_UQI:
14370 : case V2DI_FTYPE_PCINT64_V2DI_UQI:
14371 : case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
14372 : case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
14373 : case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
14374 : case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
14375 : case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
14376 : case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
14377 : case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
14378 : nargs = 3;
14379 : klass = load;
14380 : memory = 0;
14381 : break;
14382 105 : case INT_FTYPE_PINT_INT_INT_INT:
14383 105 : case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
14384 105 : nargs = 4;
14385 105 : klass = load;
14386 105 : memory = 0;
14387 105 : constant = 3;
14388 105 : break;
14389 0 : default:
14390 0 : gcc_unreachable ();
14391 : }
14392 :
14393 8339 : gcc_assert (nargs <= ARRAY_SIZE (xops));
14394 :
14395 11813 : if (klass == store)
14396 : {
14397 1878 : arg = CALL_EXPR_ARG (exp, 0);
14398 1878 : op = expand_normal (arg);
14399 1878 : gcc_assert (target == 0);
14400 1878 : if (memory)
14401 : {
14402 1715 : op = ix86_zero_extend_to_Pmode (op);
14403 1715 : target = gen_rtx_MEM (tmode, op);
14404 : /* target at this point has just BITS_PER_UNIT MEM_ALIGN
14405 : on it. Try to improve it using get_pointer_alignment,
14406 : and if the special builtin is one that requires strict
14407 : mode alignment, also from it's GET_MODE_ALIGNMENT.
14408 : Failure to do so could lead to ix86_legitimate_combined_insn
14409 : rejecting all changes to such insns. */
14410 1715 : unsigned int align = get_pointer_alignment (arg);
14411 1715 : if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
14412 275 : align = GET_MODE_ALIGNMENT (tmode);
14413 3430 : if (MEM_ALIGN (target) < align)
14414 422 : set_mem_align (target, align);
14415 : }
14416 : else
14417 163 : target = force_reg (tmode, op);
14418 : arg_adjust = 1;
14419 : }
14420 : else
14421 : {
14422 9935 : arg_adjust = 0;
14423 9935 : if (optimize
14424 2918 : || target == 0
14425 2918 : || !register_operand (target, tmode)
14426 12842 : || GET_MODE (target) != tmode)
14427 7028 : target = gen_reg_rtx (tmode);
14428 : }
14429 :
14430 21202 : for (i = 0; i < nargs; i++)
14431 : {
14432 9389 : machine_mode mode = insn_p->operand[i + 1].mode;
14433 :
14434 9389 : arg = CALL_EXPR_ARG (exp, i + arg_adjust);
14435 9389 : op = ix86_expand_unsigned_small_int_cst_argument (arg);
14436 :
14437 9389 : if (i == memory)
14438 : {
14439 : /* This must be the memory operand. */
14440 2354 : op = ix86_zero_extend_to_Pmode (op);
14441 2354 : op = gen_rtx_MEM (mode, op);
14442 : /* op at this point has just BITS_PER_UNIT MEM_ALIGN
14443 : on it. Try to improve it using get_pointer_alignment,
14444 : and if the special builtin is one that requires strict
14445 : mode alignment, also from it's GET_MODE_ALIGNMENT.
14446 : Failure to do so could lead to ix86_legitimate_combined_insn
14447 : rejecting all changes to such insns. */
14448 2354 : unsigned int align = get_pointer_alignment (arg);
14449 2354 : if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
14450 299 : align = GET_MODE_ALIGNMENT (mode);
14451 4708 : if (MEM_ALIGN (op) < align)
14452 523 : set_mem_align (op, align);
14453 : }
14454 7035 : else if (i == constant)
14455 : {
14456 : /* This must be the constant. */
14457 105 : if (!insn_p->operand[nargs].predicate(op, SImode))
14458 : {
14459 0 : error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
14460 0 : return const0_rtx;
14461 : }
14462 : }
14463 : else
14464 : {
14465 : /* This must be register. */
14466 6930 : if (VECTOR_MODE_P (mode))
14467 3475 : op = safe_vector_operand (op, mode);
14468 :
14469 6930 : op = fixup_modeless_constant (op, mode);
14470 :
14471 : /* NB: 3-operands load implied it's a mask load or v{p}expand*,
14472 : and that mask operand shoud be at the end.
14473 : Keep all-ones mask which would be simplified by the expander. */
14474 1771 : if (nargs == 3 && i == 2 && klass == load
14475 1771 : && constm1_operand (op, mode)
14476 7103 : && insn_p->operand[i].predicate (op, mode))
14477 : ;
14478 6930 : else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
14479 6930 : op = copy_to_mode_reg (mode, op);
14480 : else
14481 : {
14482 0 : op = copy_to_reg (op);
14483 0 : op = lowpart_subreg (mode, op, GET_MODE (op));
14484 : }
14485 : }
14486 :
14487 9389 : xops[i]= op;
14488 : }
14489 :
14490 11813 : switch (nargs)
14491 : {
14492 7744 : case 0:
14493 7744 : pat = GEN_FCN (icode) (target);
14494 7744 : break;
14495 730 : case 1:
14496 730 : pat = GEN_FCN (icode) (target, xops[0]);
14497 730 : break;
14498 1463 : case 2:
14499 1463 : pat = GEN_FCN (icode) (target, xops[0], xops[1]);
14500 1463 : break;
14501 1771 : case 3:
14502 1771 : pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
14503 1771 : break;
14504 105 : case 4:
14505 105 : pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
14506 105 : break;
14507 : default:
14508 : gcc_unreachable ();
14509 : }
14510 :
14511 11813 : if (! pat)
14512 : return 0;
14513 :
14514 11813 : emit_insn (pat);
14515 11813 : return klass == store ? 0 : target;
14516 : }
14517 :
14518 : /* Return the integer constant in ARG. Constrain it to be in the range
14519 : of the subparts of VEC_TYPE; issue an error if not. */
14520 :
14521 : static int
14522 603 : get_element_number (tree vec_type, tree arg)
14523 : {
14524 603 : unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
14525 :
14526 603 : if (!tree_fits_uhwi_p (arg)
14527 603 : || (elt = tree_to_uhwi (arg), elt > max))
14528 : {
14529 0 : error ("selector must be an integer constant in the range "
14530 : "[0, %wi]", max);
14531 0 : return 0;
14532 : }
14533 :
14534 603 : return elt;
14535 : }
14536 :
14537 : /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
14538 : ix86_expand_vector_init. We DO have language-level syntax for this, in
14539 : the form of (type){ init-list }. Except that since we can't place emms
14540 : instructions from inside the compiler, we can't allow the use of MMX
14541 : registers unless the user explicitly asks for it. So we do *not* define
14542 : vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
14543 : we have builtins invoked by mmintrin.h that gives us license to emit
14544 : these sorts of instructions. */
14545 :
14546 : static rtx
14547 229 : ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
14548 : {
14549 229 : machine_mode tmode = TYPE_MODE (type);
14550 229 : machine_mode inner_mode = GET_MODE_INNER (tmode);
14551 229 : int i, n_elt = GET_MODE_NUNITS (tmode);
14552 229 : rtvec v = rtvec_alloc (n_elt);
14553 :
14554 229 : gcc_assert (VECTOR_MODE_P (tmode));
14555 229 : gcc_assert (call_expr_nargs (exp) == n_elt);
14556 :
14557 1203 : for (i = 0; i < n_elt; ++i)
14558 : {
14559 974 : rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
14560 974 : RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
14561 : }
14562 :
14563 229 : if (!target || !register_operand (target, tmode))
14564 0 : target = gen_reg_rtx (tmode);
14565 :
14566 229 : ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
14567 229 : return target;
14568 : }
14569 :
14570 : /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
14571 : ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
14572 : had a language-level syntax for referencing vector elements. */
14573 :
14574 : static rtx
14575 399 : ix86_expand_vec_ext_builtin (tree exp, rtx target)
14576 : {
14577 399 : machine_mode tmode, mode0;
14578 399 : tree arg0, arg1;
14579 399 : int elt;
14580 399 : rtx op0;
14581 :
14582 399 : arg0 = CALL_EXPR_ARG (exp, 0);
14583 399 : arg1 = CALL_EXPR_ARG (exp, 1);
14584 :
14585 399 : op0 = expand_normal (arg0);
14586 399 : elt = get_element_number (TREE_TYPE (arg0), arg1);
14587 :
14588 399 : tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
14589 399 : mode0 = TYPE_MODE (TREE_TYPE (arg0));
14590 399 : gcc_assert (VECTOR_MODE_P (mode0));
14591 :
14592 399 : op0 = force_reg (mode0, op0);
14593 :
14594 399 : if (optimize || !target || !register_operand (target, tmode))
14595 320 : target = gen_reg_rtx (tmode);
14596 :
14597 399 : ix86_expand_vector_extract (true, target, op0, elt);
14598 :
14599 399 : return target;
14600 : }
14601 :
14602 : /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
14603 : ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
14604 : a language-level syntax for referencing vector elements. */
14605 :
14606 : static rtx
14607 204 : ix86_expand_vec_set_builtin (tree exp)
14608 : {
14609 204 : machine_mode tmode, mode1;
14610 204 : tree arg0, arg1, arg2;
14611 204 : int elt;
14612 204 : rtx op0, op1, target;
14613 :
14614 204 : arg0 = CALL_EXPR_ARG (exp, 0);
14615 204 : arg1 = CALL_EXPR_ARG (exp, 1);
14616 204 : arg2 = CALL_EXPR_ARG (exp, 2);
14617 :
14618 204 : tmode = TYPE_MODE (TREE_TYPE (arg0));
14619 204 : mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
14620 204 : gcc_assert (VECTOR_MODE_P (tmode));
14621 :
14622 204 : op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
14623 204 : op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
14624 204 : elt = get_element_number (TREE_TYPE (arg0), arg2);
14625 :
14626 204 : if (GET_MODE (op1) != mode1)
14627 82 : op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
14628 :
14629 204 : op0 = force_reg (tmode, op0);
14630 204 : op1 = force_reg (mode1, op1);
14631 :
14632 : /* OP0 is the source of these builtin functions and shouldn't be
14633 : modified. Create a copy, use it and return it as target. */
14634 204 : target = gen_reg_rtx (tmode);
14635 204 : emit_move_insn (target, op0);
14636 204 : ix86_expand_vector_set (true, target, op1, elt);
14637 :
14638 204 : return target;
14639 : }
14640 :
14641 : /* Return true if the necessary isa options for this builtin exist,
14642 : else false.
14643 : fcode = DECL_MD_FUNCTION_CODE (fndecl); */
14644 : bool
14645 1294782 : ix86_check_builtin_isa_match (unsigned int fcode,
14646 : HOST_WIDE_INT* pbisa,
14647 : HOST_WIDE_INT* pbisa2)
14648 : {
14649 1294782 : HOST_WIDE_INT isa = ix86_isa_flags;
14650 1294782 : HOST_WIDE_INT isa2 = ix86_isa_flags2;
14651 1294782 : HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
14652 1294782 : HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
14653 1294782 : HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
14654 : /* The general case is we require all the ISAs specified in bisa{,2}
14655 : to be enabled.
14656 : The exceptions are:
14657 : OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
14658 : OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
14659 : OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
14660 : (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
14661 : OPTION_MASK_ISA2_AVXVNNI
14662 : (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
14663 : OPTION_MASK_ISA2_AVXIFMA
14664 : (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
14665 : OPTION_MASK_ISA2_AVXNECONVERT
14666 : OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
14667 : OPTION_MASK_ISA2_AVX10_2 or OPTION_MASK_ISA2_AVXVNNIINT8
14668 : OPTION_MASK_ISA2_AVX10_2 or OPTION_MASK_ISA2_AVXVNNIINT16
14669 : where for each such pair it is sufficient if either of the ISAs is
14670 : enabled, plus if it is ored with other options also those others.
14671 : OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
14672 :
14673 : #define SHARE_BUILTIN(A1, A2, B1, B2) \
14674 : if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
14675 : && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
14676 : && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
14677 : || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
14678 : { \
14679 : tmp_isa |= (A1) | (B1); \
14680 : tmp_isa2 |= (A2) | (B2); \
14681 : }
14682 :
14683 1294782 : SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
14684 1294782 : SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
14685 1294782 : SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
14686 1294782 : SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
14687 1294782 : OPTION_MASK_ISA2_AVXVNNI);
14688 1294782 : SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
14689 1294782 : OPTION_MASK_ISA2_AVXIFMA);
14690 1294782 : SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
14691 1294782 : OPTION_MASK_ISA2_AVXNECONVERT);
14692 1294782 : SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
14693 1294782 : OPTION_MASK_ISA2_VAES);
14694 1294782 : SHARE_BUILTIN (0, OPTION_MASK_ISA2_AVXVNNIINT8, 0,
14695 1294782 : OPTION_MASK_ISA2_AVX10_2);
14696 1294782 : SHARE_BUILTIN (0, OPTION_MASK_ISA2_AVXVNNIINT16, 0,
14697 1294782 : OPTION_MASK_ISA2_AVX10_2);
14698 1294782 : isa = tmp_isa;
14699 1294782 : isa2 = tmp_isa2;
14700 :
14701 1294782 : if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
14702 : /* __builtin_ia32_maskmovq requires MMX registers. */
14703 4563 : && fcode != IX86_BUILTIN_MASKMOVQ)
14704 : {
14705 4554 : bisa &= ~OPTION_MASK_ISA_MMX;
14706 4554 : bisa |= OPTION_MASK_ISA_SSE2;
14707 : }
14708 :
14709 1294782 : if (pbisa)
14710 173271 : *pbisa = bisa;
14711 1294782 : if (pbisa2)
14712 173271 : *pbisa2 = bisa2;
14713 :
14714 1294782 : return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
14715 : }
14716 :
14717 : /* Emit instructions to set the carry flag from ARG. */
14718 :
14719 : void
14720 13560 : ix86_expand_carry (rtx arg)
14721 : {
14722 13560 : if (!CONST_INT_P (arg) || arg == const0_rtx)
14723 : {
14724 13554 : arg = convert_to_mode (QImode, arg, 1);
14725 13554 : arg = copy_to_mode_reg (QImode, arg);
14726 13554 : emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
14727 : }
14728 : else
14729 6 : emit_insn (gen_x86_stc ());
14730 13560 : }
14731 :
14732 : /* Expand an expression EXP that calls a built-in function,
14733 : with result going to TARGET if that's convenient
14734 : (and in mode MODE if that's convenient).
14735 : SUBTARGET may be used as the target for computing one of EXP's operands.
14736 : IGNORE is nonzero if the value is to be ignored. */
14737 :
14738 : rtx
14739 174062 : ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
14740 : machine_mode mode, int ignore)
14741 : {
14742 174062 : size_t i;
14743 174062 : enum insn_code icode, icode2;
14744 174062 : tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
14745 174062 : tree arg0, arg1, arg2, arg3, arg4;
14746 174062 : rtx op0, op1, op2, op3, op4, pat, pat2, insn;
14747 174062 : machine_mode mode0, mode1, mode2, mode3, mode4;
14748 174062 : unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
14749 174062 : HOST_WIDE_INT bisa, bisa2;
14750 :
14751 : /* For CPU builtins that can be folded, fold first and expand the fold. */
14752 174062 : switch (fcode)
14753 : {
14754 196 : case IX86_BUILTIN_CPU_INIT:
14755 196 : {
14756 : /* Make it call __cpu_indicator_init in libgcc. */
14757 196 : tree call_expr, fndecl, type;
14758 196 : type = build_function_type_list (integer_type_node, NULL_TREE);
14759 196 : fndecl = build_fn_decl ("__cpu_indicator_init", type);
14760 196 : call_expr = build_call_expr (fndecl, 0);
14761 196 : return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
14762 : }
14763 595 : case IX86_BUILTIN_CPU_IS:
14764 595 : case IX86_BUILTIN_CPU_SUPPORTS:
14765 595 : {
14766 595 : tree arg0 = CALL_EXPR_ARG (exp, 0);
14767 595 : tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
14768 595 : gcc_assert (fold_expr != NULL_TREE);
14769 595 : return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
14770 : }
14771 : }
14772 :
14773 173271 : if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
14774 : {
14775 23 : bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
14776 23 : if (TARGET_ABI_X32)
14777 0 : bisa |= OPTION_MASK_ABI_X32;
14778 : else
14779 23 : bisa |= OPTION_MASK_ABI_64;
14780 23 : char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
14781 : (enum fpmath_unit) 0,
14782 : (enum prefer_vector_width) 0,
14783 : PVW_NONE, false, add_abi_p);
14784 23 : if (!opts)
14785 0 : error ("%qE needs unknown isa option", fndecl);
14786 : else
14787 : {
14788 23 : gcc_assert (opts != NULL);
14789 23 : error ("%qE needs isa option %s", fndecl, opts);
14790 23 : free (opts);
14791 : }
14792 23 : return expand_call (exp, target, ignore);
14793 : }
14794 :
14795 173248 : switch (fcode)
14796 : {
14797 35 : case IX86_BUILTIN_MASKMOVQ:
14798 35 : case IX86_BUILTIN_MASKMOVDQU:
14799 34 : icode = (fcode == IX86_BUILTIN_MASKMOVQ
14800 35 : ? CODE_FOR_mmx_maskmovq
14801 : : CODE_FOR_sse2_maskmovdqu);
14802 : /* Note the arg order is different from the operand order. */
14803 35 : arg1 = CALL_EXPR_ARG (exp, 0);
14804 35 : arg2 = CALL_EXPR_ARG (exp, 1);
14805 35 : arg0 = CALL_EXPR_ARG (exp, 2);
14806 35 : op0 = expand_normal (arg0);
14807 35 : op1 = expand_normal (arg1);
14808 35 : op2 = expand_normal (arg2);
14809 35 : mode0 = insn_data[icode].operand[0].mode;
14810 35 : mode1 = insn_data[icode].operand[1].mode;
14811 35 : mode2 = insn_data[icode].operand[2].mode;
14812 :
14813 35 : op0 = ix86_zero_extend_to_Pmode (op0);
14814 35 : op0 = gen_rtx_MEM (mode1, op0);
14815 :
14816 35 : if (!insn_data[icode].operand[0].predicate (op0, mode0))
14817 0 : op0 = copy_to_mode_reg (mode0, op0);
14818 35 : if (!insn_data[icode].operand[1].predicate (op1, mode1))
14819 2 : op1 = copy_to_mode_reg (mode1, op1);
14820 35 : if (!insn_data[icode].operand[2].predicate (op2, mode2))
14821 2 : op2 = copy_to_mode_reg (mode2, op2);
14822 35 : pat = GEN_FCN (icode) (op0, op1, op2);
14823 35 : if (! pat)
14824 56621 : return 0;
14825 35 : emit_insn (pat);
14826 35 : return 0;
14827 :
14828 22008 : case IX86_BUILTIN_LDMXCSR:
14829 22008 : op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
14830 22008 : target = assign_stack_temp (SImode, GET_MODE_SIZE (SImode));
14831 22008 : emit_move_insn (target, op0);
14832 22008 : emit_insn (gen_sse_ldmxcsr (target));
14833 22008 : return 0;
14834 :
14835 14785 : case IX86_BUILTIN_STMXCSR:
14836 14785 : target = assign_stack_temp (SImode, GET_MODE_SIZE (SImode));
14837 14785 : emit_insn (gen_sse_stmxcsr (target));
14838 14785 : return copy_to_mode_reg (SImode, target);
14839 :
14840 11 : case IX86_BUILTIN_CLFLUSH:
14841 11 : arg0 = CALL_EXPR_ARG (exp, 0);
14842 11 : op0 = expand_normal (arg0);
14843 11 : icode = CODE_FOR_sse2_clflush;
14844 11 : if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14845 5 : op0 = ix86_zero_extend_to_Pmode (op0);
14846 :
14847 11 : emit_insn (gen_sse2_clflush (op0));
14848 11 : return 0;
14849 :
14850 19 : case IX86_BUILTIN_CLWB:
14851 19 : arg0 = CALL_EXPR_ARG (exp, 0);
14852 19 : op0 = expand_normal (arg0);
14853 19 : icode = CODE_FOR_clwb;
14854 19 : if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14855 9 : op0 = ix86_zero_extend_to_Pmode (op0);
14856 :
14857 19 : emit_insn (gen_clwb (op0));
14858 19 : return 0;
14859 :
14860 19 : case IX86_BUILTIN_CLFLUSHOPT:
14861 19 : arg0 = CALL_EXPR_ARG (exp, 0);
14862 19 : op0 = expand_normal (arg0);
14863 19 : icode = CODE_FOR_clflushopt;
14864 19 : if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14865 9 : op0 = ix86_zero_extend_to_Pmode (op0);
14866 :
14867 19 : emit_insn (gen_clflushopt (op0));
14868 19 : return 0;
14869 :
14870 47 : case IX86_BUILTIN_MONITOR:
14871 47 : case IX86_BUILTIN_MONITORX:
14872 47 : arg0 = CALL_EXPR_ARG (exp, 0);
14873 47 : arg1 = CALL_EXPR_ARG (exp, 1);
14874 47 : arg2 = CALL_EXPR_ARG (exp, 2);
14875 47 : op0 = expand_normal (arg0);
14876 47 : op1 = expand_normal (arg1);
14877 47 : op2 = expand_normal (arg2);
14878 47 : if (!REG_P (op0))
14879 19 : op0 = ix86_zero_extend_to_Pmode (op0);
14880 47 : if (!REG_P (op1))
14881 22 : op1 = copy_to_mode_reg (SImode, op1);
14882 47 : if (!REG_P (op2))
14883 25 : op2 = copy_to_mode_reg (SImode, op2);
14884 :
14885 47 : emit_insn (fcode == IX86_BUILTIN_MONITOR
14886 26 : ? gen_sse3_monitor (Pmode, op0, op1, op2)
14887 21 : : gen_monitorx (Pmode, op0, op1, op2));
14888 47 : return 0;
14889 :
14890 25 : case IX86_BUILTIN_MWAIT:
14891 25 : arg0 = CALL_EXPR_ARG (exp, 0);
14892 25 : arg1 = CALL_EXPR_ARG (exp, 1);
14893 25 : op0 = expand_normal (arg0);
14894 25 : op1 = expand_normal (arg1);
14895 25 : if (!REG_P (op0))
14896 13 : op0 = copy_to_mode_reg (SImode, op0);
14897 25 : if (!REG_P (op1))
14898 11 : op1 = copy_to_mode_reg (SImode, op1);
14899 25 : emit_insn (gen_sse3_mwait (op0, op1));
14900 25 : return 0;
14901 :
14902 21 : case IX86_BUILTIN_MWAITX:
14903 21 : arg0 = CALL_EXPR_ARG (exp, 0);
14904 21 : arg1 = CALL_EXPR_ARG (exp, 1);
14905 21 : arg2 = CALL_EXPR_ARG (exp, 2);
14906 21 : op0 = expand_normal (arg0);
14907 21 : op1 = expand_normal (arg1);
14908 21 : op2 = expand_normal (arg2);
14909 21 : if (!REG_P (op0))
14910 11 : op0 = copy_to_mode_reg (SImode, op0);
14911 21 : if (!REG_P (op1))
14912 10 : op1 = copy_to_mode_reg (SImode, op1);
14913 21 : if (!REG_P (op2))
14914 11 : op2 = copy_to_mode_reg (SImode, op2);
14915 21 : emit_insn (gen_mwaitx (op0, op1, op2));
14916 21 : return 0;
14917 :
14918 21 : case IX86_BUILTIN_UMONITOR:
14919 21 : arg0 = CALL_EXPR_ARG (exp, 0);
14920 21 : op0 = expand_normal (arg0);
14921 :
14922 21 : op0 = ix86_zero_extend_to_Pmode (op0);
14923 21 : emit_insn (gen_umonitor (Pmode, op0));
14924 21 : return 0;
14925 :
14926 42 : case IX86_BUILTIN_UMWAIT:
14927 42 : case IX86_BUILTIN_TPAUSE:
14928 42 : arg0 = CALL_EXPR_ARG (exp, 0);
14929 42 : arg1 = CALL_EXPR_ARG (exp, 1);
14930 42 : op0 = expand_normal (arg0);
14931 42 : op1 = expand_normal (arg1);
14932 :
14933 42 : if (!REG_P (op0))
14934 20 : op0 = copy_to_mode_reg (SImode, op0);
14935 :
14936 42 : op1 = force_reg (DImode, op1);
14937 :
14938 42 : if (TARGET_64BIT)
14939 : {
14940 42 : op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
14941 : NULL, 1, OPTAB_DIRECT);
14942 42 : switch (fcode)
14943 : {
14944 : case IX86_BUILTIN_UMWAIT:
14945 : icode = CODE_FOR_umwait_rex64;
14946 : break;
14947 21 : case IX86_BUILTIN_TPAUSE:
14948 21 : icode = CODE_FOR_tpause_rex64;
14949 21 : break;
14950 0 : default:
14951 0 : gcc_unreachable ();
14952 : }
14953 :
14954 42 : op2 = gen_lowpart (SImode, op2);
14955 42 : op1 = gen_lowpart (SImode, op1);
14956 42 : pat = GEN_FCN (icode) (op0, op1, op2);
14957 : }
14958 : else
14959 : {
14960 0 : switch (fcode)
14961 : {
14962 : case IX86_BUILTIN_UMWAIT:
14963 : icode = CODE_FOR_umwait;
14964 : break;
14965 0 : case IX86_BUILTIN_TPAUSE:
14966 0 : icode = CODE_FOR_tpause;
14967 0 : break;
14968 0 : default:
14969 0 : gcc_unreachable ();
14970 : }
14971 0 : pat = GEN_FCN (icode) (op0, op1);
14972 : }
14973 :
14974 42 : if (!pat)
14975 : return 0;
14976 :
14977 42 : emit_insn (pat);
14978 :
14979 42 : if (target == 0
14980 42 : || !register_operand (target, QImode))
14981 0 : target = gen_reg_rtx (QImode);
14982 :
14983 42 : pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
14984 : const0_rtx);
14985 42 : emit_insn (gen_rtx_SET (target, pat));
14986 :
14987 42 : return target;
14988 :
14989 20 : case IX86_BUILTIN_TESTUI:
14990 20 : emit_insn (gen_testui ());
14991 :
14992 20 : if (target == 0
14993 20 : || !register_operand (target, QImode))
14994 0 : target = gen_reg_rtx (QImode);
14995 :
14996 20 : pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
14997 : const0_rtx);
14998 20 : emit_insn (gen_rtx_SET (target, pat));
14999 :
15000 20 : return target;
15001 :
15002 19 : case IX86_BUILTIN_CLZERO:
15003 19 : arg0 = CALL_EXPR_ARG (exp, 0);
15004 19 : op0 = expand_normal (arg0);
15005 19 : if (!REG_P (op0))
15006 9 : op0 = ix86_zero_extend_to_Pmode (op0);
15007 19 : emit_insn (gen_clzero (Pmode, op0));
15008 19 : return 0;
15009 :
15010 19 : case IX86_BUILTIN_CLDEMOTE:
15011 19 : arg0 = CALL_EXPR_ARG (exp, 0);
15012 19 : op0 = expand_normal (arg0);
15013 19 : icode = CODE_FOR_cldemote;
15014 19 : if (!insn_data[icode].operand[0].predicate (op0, Pmode))
15015 9 : op0 = ix86_zero_extend_to_Pmode (op0);
15016 :
15017 19 : emit_insn (gen_cldemote (op0));
15018 19 : return 0;
15019 :
15020 11 : case IX86_BUILTIN_LOADIWKEY:
15021 11 : {
15022 11 : arg0 = CALL_EXPR_ARG (exp, 0);
15023 11 : arg1 = CALL_EXPR_ARG (exp, 1);
15024 11 : arg2 = CALL_EXPR_ARG (exp, 2);
15025 11 : arg3 = CALL_EXPR_ARG (exp, 3);
15026 :
15027 11 : op0 = expand_normal (arg0);
15028 11 : op1 = expand_normal (arg1);
15029 11 : op2 = expand_normal (arg2);
15030 11 : op3 = expand_normal (arg3);
15031 :
15032 11 : if (!REG_P (op0))
15033 5 : op0 = copy_to_mode_reg (V2DImode, op0);
15034 11 : if (!REG_P (op1))
15035 5 : op1 = copy_to_mode_reg (V2DImode, op1);
15036 11 : if (!REG_P (op2))
15037 5 : op2 = copy_to_mode_reg (V2DImode, op2);
15038 11 : if (!REG_P (op3))
15039 5 : op3 = copy_to_mode_reg (SImode, op3);
15040 :
15041 11 : emit_insn (gen_loadiwkey (op0, op1, op2, op3));
15042 :
15043 11 : return 0;
15044 : }
15045 :
15046 12 : case IX86_BUILTIN_AESDEC128KLU8:
15047 12 : icode = CODE_FOR_aesdec128klu8;
15048 12 : goto aesdecenc_expand;
15049 :
15050 12 : case IX86_BUILTIN_AESDEC256KLU8:
15051 12 : icode = CODE_FOR_aesdec256klu8;
15052 12 : goto aesdecenc_expand;
15053 :
15054 12 : case IX86_BUILTIN_AESENC128KLU8:
15055 12 : icode = CODE_FOR_aesenc128klu8;
15056 12 : goto aesdecenc_expand;
15057 :
15058 : case IX86_BUILTIN_AESENC256KLU8:
15059 : icode = CODE_FOR_aesenc256klu8;
15060 :
15061 48 : aesdecenc_expand:
15062 :
15063 48 : arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
15064 48 : arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
15065 48 : arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
15066 :
15067 48 : op0 = expand_normal (arg0);
15068 48 : op1 = expand_normal (arg1);
15069 48 : op2 = expand_normal (arg2);
15070 :
15071 48 : if (!address_operand (op0, V2DImode))
15072 : {
15073 16 : op0 = convert_memory_address (Pmode, op0);
15074 16 : op0 = copy_addr_to_reg (op0);
15075 : }
15076 48 : op0 = gen_rtx_MEM (V2DImode, op0);
15077 :
15078 48 : if (!REG_P (op1))
15079 20 : op1 = copy_to_mode_reg (V2DImode, op1);
15080 :
15081 48 : if (!address_operand (op2, VOIDmode))
15082 : {
15083 16 : op2 = convert_memory_address (Pmode, op2);
15084 16 : op2 = copy_addr_to_reg (op2);
15085 : }
15086 48 : op2 = gen_rtx_MEM (BLKmode, op2);
15087 :
15088 48 : emit_insn (GEN_FCN (icode) (op1, op1, op2));
15089 :
15090 48 : if (target == 0)
15091 4 : target = gen_reg_rtx (QImode);
15092 :
15093 : /* NB: For aesenc/aesdec keylocker insn, ZF will be set when runtime
15094 : error occurs. Then the output should be cleared for safety. */
15095 48 : rtx_code_label *ok_label;
15096 48 : rtx tmp;
15097 :
15098 48 : tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
15099 48 : pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
15100 48 : ok_label = gen_label_rtx ();
15101 48 : emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
15102 : true, ok_label);
15103 : /* Usually the runtime error seldom occur, so predict OK path as
15104 : hotspot to optimize it as fallthrough block. */
15105 48 : predict_jump (REG_BR_PROB_BASE * 90 / 100);
15106 :
15107 48 : emit_insn (gen_rtx_SET (op1, const0_rtx));
15108 :
15109 48 : emit_label (ok_label);
15110 48 : emit_insn (gen_rtx_SET (target, pat));
15111 48 : emit_insn (gen_rtx_SET (op0, op1));
15112 :
15113 48 : return target;
15114 :
15115 11 : case IX86_BUILTIN_AESDECWIDE128KLU8:
15116 11 : icode = CODE_FOR_aesdecwide128klu8;
15117 11 : goto wideaesdecenc_expand;
15118 :
15119 11 : case IX86_BUILTIN_AESDECWIDE256KLU8:
15120 11 : icode = CODE_FOR_aesdecwide256klu8;
15121 11 : goto wideaesdecenc_expand;
15122 :
15123 11 : case IX86_BUILTIN_AESENCWIDE128KLU8:
15124 11 : icode = CODE_FOR_aesencwide128klu8;
15125 11 : goto wideaesdecenc_expand;
15126 :
15127 : case IX86_BUILTIN_AESENCWIDE256KLU8:
15128 : icode = CODE_FOR_aesencwide256klu8;
15129 :
15130 44 : wideaesdecenc_expand:
15131 :
15132 44 : rtx xmm_regs[8];
15133 44 : rtx op;
15134 :
15135 44 : arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
15136 44 : arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
15137 44 : arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
15138 :
15139 44 : op0 = expand_normal (arg0);
15140 44 : op1 = expand_normal (arg1);
15141 44 : op2 = expand_normal (arg2);
15142 :
15143 44 : if (GET_MODE (op1) != Pmode)
15144 0 : op1 = convert_to_mode (Pmode, op1, 1);
15145 :
15146 44 : if (!address_operand (op2, VOIDmode))
15147 : {
15148 16 : op2 = convert_memory_address (Pmode, op2);
15149 16 : op2 = copy_addr_to_reg (op2);
15150 : }
15151 44 : op2 = gen_rtx_MEM (BLKmode, op2);
15152 :
15153 440 : for (i = 0; i < 8; i++)
15154 : {
15155 352 : xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
15156 :
15157 352 : op = gen_rtx_MEM (V2DImode,
15158 352 : plus_constant (Pmode, op1, (i * 16)));
15159 :
15160 352 : emit_move_insn (xmm_regs[i], op);
15161 : }
15162 :
15163 44 : emit_insn (GEN_FCN (icode) (op2));
15164 :
15165 44 : if (target == 0)
15166 0 : target = gen_reg_rtx (QImode);
15167 :
15168 44 : tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
15169 44 : pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
15170 44 : ok_label = gen_label_rtx ();
15171 44 : emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
15172 : true, ok_label);
15173 44 : predict_jump (REG_BR_PROB_BASE * 90 / 100);
15174 :
15175 440 : for (i = 0; i < 8; i++)
15176 352 : emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
15177 :
15178 44 : emit_label (ok_label);
15179 44 : emit_insn (gen_rtx_SET (target, pat));
15180 :
15181 44 : if (GET_MODE (op0) != Pmode)
15182 0 : op0 = convert_to_mode (Pmode, op0, 1);
15183 :
15184 396 : for (i = 0; i < 8; i++)
15185 : {
15186 352 : op = gen_rtx_MEM (V2DImode,
15187 352 : plus_constant (Pmode, op0, (i * 16)));
15188 352 : emit_move_insn (op, xmm_regs[i]);
15189 : }
15190 :
15191 : return target;
15192 :
15193 13 : case IX86_BUILTIN_ENCODEKEY128U32:
15194 13 : {
15195 13 : rtx op, xmm_regs[7];
15196 :
15197 13 : arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
15198 13 : arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
15199 13 : arg2 = CALL_EXPR_ARG (exp, 2); // void *h
15200 :
15201 13 : op0 = expand_normal (arg0);
15202 13 : op1 = expand_normal (arg1);
15203 13 : op2 = expand_normal (arg2);
15204 :
15205 13 : if (!REG_P (op0))
15206 7 : op0 = copy_to_mode_reg (SImode, op0);
15207 :
15208 13 : if (GET_MODE (op2) != Pmode)
15209 1 : op2 = convert_to_mode (Pmode, op2, 1);
15210 :
15211 13 : op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
15212 13 : emit_move_insn (op, op1);
15213 :
15214 65 : for (i = 0; i < 3; i++)
15215 39 : xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
15216 :
15217 13 : if (target == 0 || !register_operand (target, SImode))
15218 2 : target = gen_reg_rtx (SImode);
15219 :
15220 13 : emit_insn (gen_encodekey128u32 (target, op0));
15221 :
15222 65 : for (i = 0; i < 3; i++)
15223 : {
15224 39 : op = gen_rtx_MEM (V2DImode,
15225 39 : plus_constant (Pmode, op2, (i * 16)));
15226 39 : emit_move_insn (op, xmm_regs[i]);
15227 : }
15228 :
15229 13 : return target;
15230 : }
15231 13 : case IX86_BUILTIN_ENCODEKEY256U32:
15232 13 : {
15233 13 : rtx op, xmm_regs[7];
15234 :
15235 13 : arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
15236 13 : arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
15237 13 : arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
15238 13 : arg3 = CALL_EXPR_ARG (exp, 3); // void *h
15239 :
15240 13 : op0 = expand_normal (arg0);
15241 13 : op1 = expand_normal (arg1);
15242 13 : op2 = expand_normal (arg2);
15243 13 : op3 = expand_normal (arg3);
15244 :
15245 13 : if (!REG_P (op0))
15246 7 : op0 = copy_to_mode_reg (SImode, op0);
15247 :
15248 13 : if (GET_MODE (op3) != Pmode)
15249 1 : op3 = convert_to_mode (Pmode, op3, 1);
15250 :
15251 : /* Force to use xmm0, xmm1 for keylow, keyhi*/
15252 13 : op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
15253 13 : emit_move_insn (op, op1);
15254 13 : op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
15255 13 : emit_move_insn (op, op2);
15256 :
15257 78 : for (i = 0; i < 4; i++)
15258 52 : xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
15259 :
15260 13 : if (target == 0 || !register_operand (target, SImode))
15261 2 : target = gen_reg_rtx (SImode);
15262 :
15263 13 : emit_insn (gen_encodekey256u32 (target, op0));
15264 :
15265 78 : for (i = 0; i < 4; i++)
15266 : {
15267 52 : op = gen_rtx_MEM (V2DImode,
15268 52 : plus_constant (Pmode, op3, (i * 16)));
15269 52 : emit_move_insn (op, xmm_regs[i]);
15270 : }
15271 :
15272 13 : return target;
15273 : }
15274 :
15275 48 : case IX86_BUILTIN_PREFETCH:
15276 48 : {
15277 48 : arg0 = CALL_EXPR_ARG (exp, 0); // const void *
15278 48 : arg1 = CALL_EXPR_ARG (exp, 1); // const int
15279 48 : arg2 = CALL_EXPR_ARG (exp, 2); // const int
15280 48 : arg3 = CALL_EXPR_ARG (exp, 3); // const int
15281 :
15282 48 : op0 = expand_normal (arg0);
15283 48 : op1 = expand_normal (arg1);
15284 48 : op2 = expand_normal (arg2);
15285 48 : op3 = expand_normal (arg3);
15286 :
15287 48 : if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
15288 : {
15289 0 : error ("second, third and fourth argument must be a const");
15290 0 : return const0_rtx;
15291 : }
15292 :
15293 48 : if (!IN_RANGE (INTVAL (op1), 0, 2))
15294 : {
15295 1 : warning (0, "invalid second argument to"
15296 : " %<__builtin_ia32_prefetch%>; using zero");
15297 1 : op1 = const0_rtx;
15298 : }
15299 :
15300 48 : if (INTVAL (op3) == 1)
15301 : {
15302 4 : if (!IN_RANGE (INTVAL (op2), 2, 3))
15303 : {
15304 1 : error ("invalid third argument");
15305 1 : return const0_rtx;
15306 : }
15307 :
15308 3 : if (TARGET_64BIT && TARGET_PREFETCHI
15309 6 : && local_func_symbolic_operand (op0, GET_MODE (op0)))
15310 2 : emit_insn (gen_prefetchi (op0, op2));
15311 : else
15312 : {
15313 1 : warning (0, "instruction prefetch applies when in 64-bit mode"
15314 : " with RIP-relative addressing and"
15315 : " option %<-mprefetchi%>;"
15316 : " they stay NOPs otherwise");
15317 1 : emit_insn (gen_nop ());
15318 : }
15319 : }
15320 : else
15321 : {
15322 44 : if (INTVAL (op3) != 0)
15323 1 : warning (0, "invalid fourth argument to"
15324 : " %<__builtin_ia32_prefetch%>; using zero");
15325 :
15326 44 : if (!address_operand (op0, VOIDmode))
15327 : {
15328 10 : op0 = convert_memory_address (Pmode, op0);
15329 10 : op0 = copy_addr_to_reg (op0);
15330 : }
15331 :
15332 44 : if (!IN_RANGE (INTVAL (op2), 0, 3))
15333 : {
15334 1 : warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
15335 1 : op2 = const0_rtx;
15336 : }
15337 :
15338 44 : if (TARGET_3DNOW
15339 26 : || TARGET_PREFETCH_SSE
15340 0 : || TARGET_PRFCHW
15341 0 : || TARGET_MOVRS)
15342 44 : emit_insn (gen_prefetch (op0, op1, op2));
15343 0 : else if (!MEM_P (op0) && side_effects_p (op0))
15344 : /* Don't do anything with direct references to volatile memory,
15345 : but generate code to handle other side effects. */
15346 0 : emit_insn (op0);
15347 : }
15348 :
15349 : return 0;
15350 : }
15351 :
15352 21 : case IX86_BUILTIN_PREFETCHI:
15353 21 : {
15354 21 : arg0 = CALL_EXPR_ARG (exp, 0); // const void *
15355 21 : arg1 = CALL_EXPR_ARG (exp, 1); // const int
15356 :
15357 21 : op0 = expand_normal (arg0);
15358 21 : op1 = expand_normal (arg1);
15359 :
15360 21 : if (!CONST_INT_P (op1))
15361 : {
15362 0 : error ("second argument must be a const");
15363 0 : return const0_rtx;
15364 : }
15365 :
15366 : /* GOT/PLT_PIC should not be available for instruction prefetch.
15367 : It must be real instruction address. */
15368 21 : if (TARGET_64BIT
15369 21 : && local_func_symbolic_operand (op0, GET_MODE (op0)))
15370 4 : emit_insn (gen_prefetchi (op0, op1));
15371 : else
15372 : {
15373 : /* Ignore the hint. */
15374 17 : warning (0, "instruction prefetch applies when in 64-bit mode"
15375 : " with RIP-relative addressing and"
15376 : " option %<-mprefetchi%>;"
15377 : " they stay NOPs otherwise");
15378 17 : emit_insn (gen_nop ());
15379 : }
15380 :
15381 : return 0;
15382 : }
15383 :
15384 53 : case IX86_BUILTIN_URDMSR:
15385 53 : case IX86_BUILTIN_UWRMSR:
15386 53 : {
15387 53 : arg0 = CALL_EXPR_ARG (exp, 0);
15388 53 : op0 = expand_normal (arg0);
15389 :
15390 53 : if (CONST_INT_P (op0))
15391 : {
15392 12 : unsigned HOST_WIDE_INT val = UINTVAL (op0);
15393 12 : if (val > 0xffffffff)
15394 2 : op0 = force_reg (DImode, op0);
15395 : }
15396 : else
15397 41 : op0 = force_reg (DImode, op0);
15398 :
15399 53 : if (fcode == IX86_BUILTIN_UWRMSR)
15400 : {
15401 26 : arg1 = CALL_EXPR_ARG (exp, 1);
15402 26 : op1 = expand_normal (arg1);
15403 26 : op1 = force_reg (DImode, op1);
15404 26 : icode = CODE_FOR_uwrmsr;
15405 26 : target = 0;
15406 : }
15407 : else
15408 : {
15409 27 : if (target == 0 || !register_operand (target, DImode))
15410 1 : target = gen_reg_rtx (DImode);
15411 : icode = CODE_FOR_urdmsr;
15412 : op1 = op0;
15413 : op0 = target;
15414 : }
15415 53 : emit_insn (GEN_FCN (icode) (op0, op1));
15416 53 : return target;
15417 : }
15418 :
15419 229 : case IX86_BUILTIN_VEC_INIT_V2SI:
15420 229 : case IX86_BUILTIN_VEC_INIT_V4HI:
15421 229 : case IX86_BUILTIN_VEC_INIT_V8QI:
15422 229 : return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
15423 :
15424 399 : case IX86_BUILTIN_VEC_EXT_V2DF:
15425 399 : case IX86_BUILTIN_VEC_EXT_V2DI:
15426 399 : case IX86_BUILTIN_VEC_EXT_V4SF:
15427 399 : case IX86_BUILTIN_VEC_EXT_V4SI:
15428 399 : case IX86_BUILTIN_VEC_EXT_V8HI:
15429 399 : case IX86_BUILTIN_VEC_EXT_V2SI:
15430 399 : case IX86_BUILTIN_VEC_EXT_V4HI:
15431 399 : case IX86_BUILTIN_VEC_EXT_V16QI:
15432 399 : return ix86_expand_vec_ext_builtin (exp, target);
15433 :
15434 204 : case IX86_BUILTIN_VEC_SET_V2DI:
15435 204 : case IX86_BUILTIN_VEC_SET_V4SF:
15436 204 : case IX86_BUILTIN_VEC_SET_V4SI:
15437 204 : case IX86_BUILTIN_VEC_SET_V8HI:
15438 204 : case IX86_BUILTIN_VEC_SET_V4HI:
15439 204 : case IX86_BUILTIN_VEC_SET_V16QI:
15440 204 : return ix86_expand_vec_set_builtin (exp);
15441 :
15442 0 : case IX86_BUILTIN_NANQ:
15443 0 : case IX86_BUILTIN_NANSQ:
15444 0 : return expand_call (exp, target, ignore);
15445 :
15446 18 : case IX86_BUILTIN_RDPID:
15447 :
15448 18 : op0 = gen_reg_rtx (word_mode);
15449 :
15450 18 : if (TARGET_64BIT)
15451 : {
15452 18 : insn = gen_rdpid_rex64 (op0);
15453 18 : op0 = convert_to_mode (SImode, op0, 1);
15454 : }
15455 : else
15456 0 : insn = gen_rdpid (op0);
15457 :
15458 18 : emit_insn (insn);
15459 :
15460 18 : if (target == 0
15461 18 : || !register_operand (target, SImode))
15462 0 : target = gen_reg_rtx (SImode);
15463 :
15464 18 : emit_move_insn (target, op0);
15465 18 : return target;
15466 :
15467 75 : case IX86_BUILTIN_2INTERSECTD512:
15468 75 : case IX86_BUILTIN_2INTERSECTQ512:
15469 75 : case IX86_BUILTIN_2INTERSECTD256:
15470 75 : case IX86_BUILTIN_2INTERSECTQ256:
15471 75 : case IX86_BUILTIN_2INTERSECTD128:
15472 75 : case IX86_BUILTIN_2INTERSECTQ128:
15473 75 : arg0 = CALL_EXPR_ARG (exp, 0);
15474 75 : arg1 = CALL_EXPR_ARG (exp, 1);
15475 75 : arg2 = CALL_EXPR_ARG (exp, 2);
15476 75 : arg3 = CALL_EXPR_ARG (exp, 3);
15477 75 : op0 = expand_normal (arg0);
15478 75 : op1 = expand_normal (arg1);
15479 75 : op2 = expand_normal (arg2);
15480 75 : op3 = expand_normal (arg3);
15481 :
15482 75 : if (!address_operand (op0, VOIDmode))
15483 : {
15484 25 : op0 = convert_memory_address (Pmode, op0);
15485 25 : op0 = copy_addr_to_reg (op0);
15486 : }
15487 75 : if (!address_operand (op1, VOIDmode))
15488 : {
15489 25 : op1 = convert_memory_address (Pmode, op1);
15490 25 : op1 = copy_addr_to_reg (op1);
15491 : }
15492 :
15493 75 : switch (fcode)
15494 : {
15495 : case IX86_BUILTIN_2INTERSECTD512:
15496 : mode4 = P2HImode;
15497 : icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
15498 : break;
15499 : case IX86_BUILTIN_2INTERSECTQ512:
15500 : mode4 = P2QImode;
15501 : icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
15502 : break;
15503 : case IX86_BUILTIN_2INTERSECTD256:
15504 : mode4 = P2QImode;
15505 : icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
15506 : break;
15507 : case IX86_BUILTIN_2INTERSECTQ256:
15508 : mode4 = P2QImode;
15509 : icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
15510 : break;
15511 : case IX86_BUILTIN_2INTERSECTD128:
15512 : mode4 = P2QImode;
15513 : icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
15514 : break;
15515 : case IX86_BUILTIN_2INTERSECTQ128:
15516 : mode4 = P2QImode;
15517 : icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
15518 : break;
15519 0 : default:
15520 0 : gcc_unreachable ();
15521 : }
15522 :
15523 75 : mode2 = insn_data[icode].operand[1].mode;
15524 75 : mode3 = insn_data[icode].operand[2].mode;
15525 75 : if (!insn_data[icode].operand[1].predicate (op2, mode2))
15526 25 : op2 = copy_to_mode_reg (mode2, op2);
15527 75 : if (!insn_data[icode].operand[2].predicate (op3, mode3))
15528 6 : op3 = copy_to_mode_reg (mode3, op3);
15529 :
15530 75 : op4 = gen_reg_rtx (mode4);
15531 75 : emit_insn (GEN_FCN (icode) (op4, op2, op3));
15532 75 : mode0 = mode4 == P2HImode ? HImode : QImode;
15533 75 : emit_move_insn (gen_rtx_MEM (mode0, op0),
15534 75 : gen_lowpart (mode0, op4));
15535 75 : emit_move_insn (gen_rtx_MEM (mode0, op1),
15536 : gen_highpart (mode0, op4));
15537 :
15538 75 : return 0;
15539 :
15540 102 : case IX86_BUILTIN_RDPMC:
15541 102 : case IX86_BUILTIN_RDTSC:
15542 102 : case IX86_BUILTIN_RDTSCP:
15543 102 : case IX86_BUILTIN_XGETBV:
15544 :
15545 102 : op0 = gen_reg_rtx (DImode);
15546 102 : op1 = gen_reg_rtx (DImode);
15547 :
15548 102 : if (fcode == IX86_BUILTIN_RDPMC)
15549 : {
15550 22 : arg0 = CALL_EXPR_ARG (exp, 0);
15551 22 : op2 = expand_normal (arg0);
15552 22 : if (!register_operand (op2, SImode))
15553 11 : op2 = copy_to_mode_reg (SImode, op2);
15554 :
15555 22 : insn = (TARGET_64BIT
15556 22 : ? gen_rdpmc_rex64 (op0, op1, op2)
15557 0 : : gen_rdpmc (op0, op2));
15558 22 : emit_insn (insn);
15559 : }
15560 80 : else if (fcode == IX86_BUILTIN_XGETBV)
15561 : {
15562 22 : arg0 = CALL_EXPR_ARG (exp, 0);
15563 22 : op2 = expand_normal (arg0);
15564 22 : if (!register_operand (op2, SImode))
15565 1 : op2 = copy_to_mode_reg (SImode, op2);
15566 :
15567 22 : insn = (TARGET_64BIT
15568 22 : ? gen_xgetbv_rex64 (op0, op1, op2)
15569 0 : : gen_xgetbv (op0, op2));
15570 22 : emit_insn (insn);
15571 : }
15572 58 : else if (fcode == IX86_BUILTIN_RDTSC)
15573 : {
15574 36 : insn = (TARGET_64BIT
15575 36 : ? gen_rdtsc_rex64 (op0, op1)
15576 2 : : gen_rdtsc (op0));
15577 36 : emit_insn (insn);
15578 : }
15579 : else
15580 : {
15581 22 : op2 = gen_reg_rtx (SImode);
15582 :
15583 22 : insn = (TARGET_64BIT
15584 22 : ? gen_rdtscp_rex64 (op0, op1, op2)
15585 0 : : gen_rdtscp (op0, op2));
15586 22 : emit_insn (insn);
15587 :
15588 22 : arg0 = CALL_EXPR_ARG (exp, 0);
15589 22 : op4 = expand_normal (arg0);
15590 22 : if (!address_operand (op4, VOIDmode))
15591 : {
15592 10 : op4 = convert_memory_address (Pmode, op4);
15593 10 : op4 = copy_addr_to_reg (op4);
15594 : }
15595 22 : emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
15596 : }
15597 :
15598 102 : if (target == 0
15599 102 : || !register_operand (target, DImode))
15600 10 : target = gen_reg_rtx (DImode);
15601 :
15602 102 : if (TARGET_64BIT)
15603 : {
15604 100 : op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
15605 : op1, 1, OPTAB_DIRECT);
15606 100 : op0 = expand_simple_binop (DImode, IOR, op0, op1,
15607 : op0, 1, OPTAB_DIRECT);
15608 : }
15609 :
15610 102 : emit_move_insn (target, op0);
15611 102 : return target;
15612 :
15613 61 : case IX86_BUILTIN_ENQCMD:
15614 61 : case IX86_BUILTIN_ENQCMDS:
15615 61 : case IX86_BUILTIN_MOVDIR64B:
15616 :
15617 61 : arg0 = CALL_EXPR_ARG (exp, 0);
15618 61 : arg1 = CALL_EXPR_ARG (exp, 1);
15619 61 : op0 = expand_normal (arg0);
15620 61 : op1 = expand_normal (arg1);
15621 :
15622 61 : op0 = ix86_zero_extend_to_Pmode (op0);
15623 61 : if (!address_operand (op1, VOIDmode))
15624 : {
15625 28 : op1 = convert_memory_address (Pmode, op1);
15626 28 : op1 = copy_addr_to_reg (op1);
15627 : }
15628 61 : op1 = gen_rtx_MEM (XImode, op1);
15629 :
15630 61 : if (fcode == IX86_BUILTIN_MOVDIR64B)
15631 : {
15632 24 : emit_insn (gen_movdir64b (Pmode, op0, op1));
15633 23 : return 0;
15634 : }
15635 : else
15636 : {
15637 38 : if (target == 0
15638 38 : || !register_operand (target, SImode))
15639 0 : target = gen_reg_rtx (SImode);
15640 :
15641 38 : emit_move_insn (target, const0_rtx);
15642 38 : target = gen_rtx_SUBREG (QImode, target, 0);
15643 :
15644 19 : int unspecv = (fcode == IX86_BUILTIN_ENQCMD
15645 38 : ? UNSPECV_ENQCMD
15646 : : UNSPECV_ENQCMDS);
15647 38 : icode = code_for_enqcmd (unspecv, Pmode);
15648 38 : emit_insn (GEN_FCN (icode) (op0, op1));
15649 :
15650 38 : emit_insn
15651 38 : (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
15652 : gen_rtx_fmt_ee (EQ, QImode,
15653 : gen_rtx_REG (CCZmode, FLAGS_REG),
15654 : const0_rtx)));
15655 38 : return SUBREG_REG (target);
15656 : }
15657 :
15658 14775 : case IX86_BUILTIN_FXSAVE:
15659 14775 : case IX86_BUILTIN_FXRSTOR:
15660 14775 : case IX86_BUILTIN_FXSAVE64:
15661 14775 : case IX86_BUILTIN_FXRSTOR64:
15662 14775 : case IX86_BUILTIN_FNSTENV:
15663 14775 : case IX86_BUILTIN_FLDENV:
15664 14775 : mode0 = BLKmode;
15665 14775 : switch (fcode)
15666 : {
15667 : case IX86_BUILTIN_FXSAVE:
15668 : icode = CODE_FOR_fxsave;
15669 : break;
15670 19 : case IX86_BUILTIN_FXRSTOR:
15671 19 : icode = CODE_FOR_fxrstor;
15672 19 : break;
15673 23 : case IX86_BUILTIN_FXSAVE64:
15674 23 : icode = CODE_FOR_fxsave64;
15675 23 : break;
15676 21 : case IX86_BUILTIN_FXRSTOR64:
15677 21 : icode = CODE_FOR_fxrstor64;
15678 21 : break;
15679 7257 : case IX86_BUILTIN_FNSTENV:
15680 7257 : icode = CODE_FOR_fnstenv;
15681 7257 : break;
15682 7435 : case IX86_BUILTIN_FLDENV:
15683 7435 : icode = CODE_FOR_fldenv;
15684 7435 : break;
15685 0 : default:
15686 0 : gcc_unreachable ();
15687 : }
15688 :
15689 14775 : arg0 = CALL_EXPR_ARG (exp, 0);
15690 14775 : op0 = expand_normal (arg0);
15691 :
15692 14775 : if (!address_operand (op0, VOIDmode))
15693 : {
15694 36 : op0 = convert_memory_address (Pmode, op0);
15695 36 : op0 = copy_addr_to_reg (op0);
15696 : }
15697 14775 : op0 = gen_rtx_MEM (mode0, op0);
15698 :
15699 14775 : pat = GEN_FCN (icode) (op0);
15700 14775 : if (pat)
15701 14775 : emit_insn (pat);
15702 : return 0;
15703 :
15704 21 : case IX86_BUILTIN_XSETBV:
15705 21 : arg0 = CALL_EXPR_ARG (exp, 0);
15706 21 : arg1 = CALL_EXPR_ARG (exp, 1);
15707 21 : op0 = expand_normal (arg0);
15708 21 : op1 = expand_normal (arg1);
15709 :
15710 21 : if (!REG_P (op0))
15711 1 : op0 = copy_to_mode_reg (SImode, op0);
15712 :
15713 21 : op1 = force_reg (DImode, op1);
15714 :
15715 21 : if (TARGET_64BIT)
15716 : {
15717 21 : op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
15718 : NULL, 1, OPTAB_DIRECT);
15719 :
15720 21 : icode = CODE_FOR_xsetbv_rex64;
15721 :
15722 21 : op2 = gen_lowpart (SImode, op2);
15723 21 : op1 = gen_lowpart (SImode, op1);
15724 21 : pat = GEN_FCN (icode) (op0, op1, op2);
15725 : }
15726 : else
15727 : {
15728 0 : icode = CODE_FOR_xsetbv;
15729 :
15730 0 : pat = GEN_FCN (icode) (op0, op1);
15731 : }
15732 21 : if (pat)
15733 21 : emit_insn (pat);
15734 : return 0;
15735 :
15736 232 : case IX86_BUILTIN_XSAVE:
15737 232 : case IX86_BUILTIN_XRSTOR:
15738 232 : case IX86_BUILTIN_XSAVE64:
15739 232 : case IX86_BUILTIN_XRSTOR64:
15740 232 : case IX86_BUILTIN_XSAVEOPT:
15741 232 : case IX86_BUILTIN_XSAVEOPT64:
15742 232 : case IX86_BUILTIN_XSAVES:
15743 232 : case IX86_BUILTIN_XRSTORS:
15744 232 : case IX86_BUILTIN_XSAVES64:
15745 232 : case IX86_BUILTIN_XRSTORS64:
15746 232 : case IX86_BUILTIN_XSAVEC:
15747 232 : case IX86_BUILTIN_XSAVEC64:
15748 232 : arg0 = CALL_EXPR_ARG (exp, 0);
15749 232 : arg1 = CALL_EXPR_ARG (exp, 1);
15750 232 : op0 = expand_normal (arg0);
15751 232 : op1 = expand_normal (arg1);
15752 :
15753 232 : if (!address_operand (op0, VOIDmode))
15754 : {
15755 108 : op0 = convert_memory_address (Pmode, op0);
15756 108 : op0 = copy_addr_to_reg (op0);
15757 : }
15758 232 : op0 = gen_rtx_MEM (BLKmode, op0);
15759 :
15760 232 : op1 = force_reg (DImode, op1);
15761 :
15762 232 : if (TARGET_64BIT)
15763 : {
15764 232 : op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
15765 : NULL, 1, OPTAB_DIRECT);
15766 232 : switch (fcode)
15767 : {
15768 : case IX86_BUILTIN_XSAVE:
15769 : icode = CODE_FOR_xsave_rex64;
15770 : break;
15771 19 : case IX86_BUILTIN_XRSTOR:
15772 19 : icode = CODE_FOR_xrstor_rex64;
15773 19 : break;
15774 21 : case IX86_BUILTIN_XSAVE64:
15775 21 : icode = CODE_FOR_xsave64;
15776 21 : break;
15777 21 : case IX86_BUILTIN_XRSTOR64:
15778 21 : icode = CODE_FOR_xrstor64;
15779 21 : break;
15780 19 : case IX86_BUILTIN_XSAVEOPT:
15781 19 : icode = CODE_FOR_xsaveopt_rex64;
15782 19 : break;
15783 19 : case IX86_BUILTIN_XSAVEOPT64:
15784 19 : icode = CODE_FOR_xsaveopt64;
15785 19 : break;
15786 19 : case IX86_BUILTIN_XSAVES:
15787 19 : icode = CODE_FOR_xsaves_rex64;
15788 19 : break;
15789 19 : case IX86_BUILTIN_XRSTORS:
15790 19 : icode = CODE_FOR_xrstors_rex64;
15791 19 : break;
15792 19 : case IX86_BUILTIN_XSAVES64:
15793 19 : icode = CODE_FOR_xsaves64;
15794 19 : break;
15795 19 : case IX86_BUILTIN_XRSTORS64:
15796 19 : icode = CODE_FOR_xrstors64;
15797 19 : break;
15798 19 : case IX86_BUILTIN_XSAVEC:
15799 19 : icode = CODE_FOR_xsavec_rex64;
15800 19 : break;
15801 19 : case IX86_BUILTIN_XSAVEC64:
15802 19 : icode = CODE_FOR_xsavec64;
15803 19 : break;
15804 0 : default:
15805 0 : gcc_unreachable ();
15806 : }
15807 :
15808 232 : op2 = gen_lowpart (SImode, op2);
15809 232 : op1 = gen_lowpart (SImode, op1);
15810 232 : pat = GEN_FCN (icode) (op0, op1, op2);
15811 : }
15812 : else
15813 : {
15814 0 : switch (fcode)
15815 : {
15816 : case IX86_BUILTIN_XSAVE:
15817 : icode = CODE_FOR_xsave;
15818 : break;
15819 : case IX86_BUILTIN_XRSTOR:
15820 : icode = CODE_FOR_xrstor;
15821 : break;
15822 : case IX86_BUILTIN_XSAVEOPT:
15823 : icode = CODE_FOR_xsaveopt;
15824 : break;
15825 : case IX86_BUILTIN_XSAVES:
15826 : icode = CODE_FOR_xsaves;
15827 : break;
15828 : case IX86_BUILTIN_XRSTORS:
15829 : icode = CODE_FOR_xrstors;
15830 : break;
15831 : case IX86_BUILTIN_XSAVEC:
15832 : icode = CODE_FOR_xsavec;
15833 : break;
15834 0 : default:
15835 0 : gcc_unreachable ();
15836 : }
15837 0 : pat = GEN_FCN (icode) (op0, op1);
15838 : }
15839 :
15840 232 : if (pat)
15841 232 : emit_insn (pat);
15842 : return 0;
15843 :
15844 144 : case IX86_BUILTIN_LDTILECFG:
15845 144 : case IX86_BUILTIN_STTILECFG:
15846 144 : arg0 = CALL_EXPR_ARG (exp, 0);
15847 144 : op0 = expand_normal (arg0);
15848 :
15849 144 : if (!address_operand (op0, VOIDmode))
15850 : {
15851 8 : op0 = convert_memory_address (Pmode, op0);
15852 8 : op0 = copy_addr_to_reg (op0);
15853 : }
15854 144 : op0 = gen_rtx_MEM (BLKmode, op0);
15855 144 : if (fcode == IX86_BUILTIN_LDTILECFG)
15856 : icode = CODE_FOR_ldtilecfg;
15857 : else
15858 93 : icode = CODE_FOR_sttilecfg;
15859 144 : pat = GEN_FCN (icode) (op0);
15860 144 : emit_insn (pat);
15861 144 : return 0;
15862 :
15863 18 : case IX86_BUILTIN_LLWPCB:
15864 18 : arg0 = CALL_EXPR_ARG (exp, 0);
15865 18 : op0 = expand_normal (arg0);
15866 :
15867 18 : if (!register_operand (op0, Pmode))
15868 9 : op0 = ix86_zero_extend_to_Pmode (op0);
15869 18 : emit_insn (gen_lwp_llwpcb (Pmode, op0));
15870 18 : return 0;
15871 :
15872 18 : case IX86_BUILTIN_SLWPCB:
15873 18 : if (!target
15874 18 : || !register_operand (target, Pmode))
15875 0 : target = gen_reg_rtx (Pmode);
15876 18 : emit_insn (gen_lwp_slwpcb (Pmode, target));
15877 18 : return target;
15878 :
15879 51 : case IX86_BUILTIN_LWPVAL32:
15880 51 : case IX86_BUILTIN_LWPVAL64:
15881 51 : case IX86_BUILTIN_LWPINS32:
15882 51 : case IX86_BUILTIN_LWPINS64:
15883 51 : mode = ((fcode == IX86_BUILTIN_LWPVAL32
15884 51 : || fcode == IX86_BUILTIN_LWPINS32)
15885 51 : ? SImode : DImode);
15886 :
15887 51 : if (fcode == IX86_BUILTIN_LWPVAL32
15888 51 : || fcode == IX86_BUILTIN_LWPVAL64)
15889 26 : icode = code_for_lwp_lwpval (mode);
15890 : else
15891 25 : icode = code_for_lwp_lwpins (mode);
15892 :
15893 51 : arg0 = CALL_EXPR_ARG (exp, 0);
15894 51 : arg1 = CALL_EXPR_ARG (exp, 1);
15895 51 : arg2 = CALL_EXPR_ARG (exp, 2);
15896 51 : op0 = expand_normal (arg0);
15897 51 : op1 = expand_normal (arg1);
15898 51 : op2 = expand_normal (arg2);
15899 51 : mode0 = insn_data[icode].operand[0].mode;
15900 :
15901 51 : if (!insn_data[icode].operand[0].predicate (op0, mode0))
15902 13 : op0 = copy_to_mode_reg (mode0, op0);
15903 51 : if (!insn_data[icode].operand[1].predicate (op1, SImode))
15904 0 : op1 = copy_to_mode_reg (SImode, op1);
15905 :
15906 51 : if (!CONST_INT_P (op2))
15907 : {
15908 0 : error ("the last argument must be a 32-bit immediate");
15909 0 : return const0_rtx;
15910 : }
15911 :
15912 51 : emit_insn (GEN_FCN (icode) (op0, op1, op2));
15913 :
15914 51 : if (fcode == IX86_BUILTIN_LWPINS32
15915 51 : || fcode == IX86_BUILTIN_LWPINS64)
15916 : {
15917 25 : if (target == 0
15918 25 : || !nonimmediate_operand (target, QImode))
15919 0 : target = gen_reg_rtx (QImode);
15920 :
15921 25 : pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
15922 : const0_rtx);
15923 25 : emit_insn (gen_rtx_SET (target, pat));
15924 :
15925 25 : return target;
15926 : }
15927 : else
15928 : return 0;
15929 :
15930 18 : case IX86_BUILTIN_BEXTRI32:
15931 18 : case IX86_BUILTIN_BEXTRI64:
15932 18 : mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
15933 :
15934 18 : arg0 = CALL_EXPR_ARG (exp, 0);
15935 18 : arg1 = CALL_EXPR_ARG (exp, 1);
15936 18 : op0 = expand_normal (arg0);
15937 18 : op1 = expand_normal (arg1);
15938 :
15939 18 : if (!CONST_INT_P (op1))
15940 : {
15941 0 : error ("last argument must be an immediate");
15942 0 : return const0_rtx;
15943 : }
15944 : else
15945 : {
15946 18 : unsigned char lsb_index = UINTVAL (op1);
15947 18 : unsigned char length = UINTVAL (op1) >> 8;
15948 :
15949 18 : unsigned char bitsize = GET_MODE_BITSIZE (mode);
15950 :
15951 18 : icode = code_for_tbm_bextri (mode);
15952 :
15953 18 : mode1 = insn_data[icode].operand[1].mode;
15954 18 : if (!insn_data[icode].operand[1].predicate (op0, mode1))
15955 12 : op0 = copy_to_mode_reg (mode1, op0);
15956 :
15957 18 : mode0 = insn_data[icode].operand[0].mode;
15958 18 : if (target == 0
15959 18 : || !register_operand (target, mode0))
15960 0 : target = gen_reg_rtx (mode0);
15961 :
15962 18 : if (length == 0 || lsb_index >= bitsize)
15963 : {
15964 8 : emit_move_insn (target, const0_rtx);
15965 8 : return target;
15966 : }
15967 :
15968 10 : if (length + lsb_index > bitsize)
15969 5 : length = bitsize - lsb_index;
15970 :
15971 10 : op1 = GEN_INT (length);
15972 10 : op2 = GEN_INT (lsb_index);
15973 :
15974 10 : emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
15975 10 : return target;
15976 : }
15977 :
15978 21 : case IX86_BUILTIN_RDRAND16_STEP:
15979 21 : mode = HImode;
15980 21 : goto rdrand_step;
15981 :
15982 42 : case IX86_BUILTIN_RDRAND32_STEP:
15983 42 : mode = SImode;
15984 42 : goto rdrand_step;
15985 :
15986 : case IX86_BUILTIN_RDRAND64_STEP:
15987 : mode = DImode;
15988 :
15989 83 : rdrand_step:
15990 83 : arg0 = CALL_EXPR_ARG (exp, 0);
15991 83 : op1 = expand_normal (arg0);
15992 83 : if (!address_operand (op1, VOIDmode))
15993 : {
15994 29 : op1 = convert_memory_address (Pmode, op1);
15995 29 : op1 = copy_addr_to_reg (op1);
15996 : }
15997 :
15998 83 : op0 = gen_reg_rtx (mode);
15999 83 : emit_insn (gen_rdrand (mode, op0));
16000 :
16001 83 : emit_move_insn (gen_rtx_MEM (mode, op1), op0);
16002 :
16003 83 : op1 = force_reg (SImode, const1_rtx);
16004 :
16005 : /* Emit SImode conditional move. */
16006 83 : if (mode == HImode)
16007 : {
16008 21 : if (TARGET_ZERO_EXTEND_WITH_AND
16009 21 : && optimize_function_for_speed_p (cfun))
16010 : {
16011 0 : op2 = force_reg (SImode, const0_rtx);
16012 :
16013 0 : emit_insn (gen_movstricthi
16014 0 : (gen_lowpart (HImode, op2), op0));
16015 : }
16016 : else
16017 : {
16018 21 : op2 = gen_reg_rtx (SImode);
16019 :
16020 21 : emit_insn (gen_zero_extendhisi2 (op2, op0));
16021 : }
16022 : }
16023 62 : else if (mode == SImode)
16024 : op2 = op0;
16025 : else
16026 20 : op2 = gen_rtx_SUBREG (SImode, op0, 0);
16027 :
16028 83 : if (target == 0
16029 83 : || !register_operand (target, SImode))
16030 7 : target = gen_reg_rtx (SImode);
16031 :
16032 83 : pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
16033 : const0_rtx);
16034 83 : emit_insn (gen_rtx_SET (target,
16035 : gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
16036 83 : return target;
16037 :
16038 19 : case IX86_BUILTIN_RDSEED16_STEP:
16039 19 : mode = HImode;
16040 19 : goto rdseed_step;
16041 :
16042 28 : case IX86_BUILTIN_RDSEED32_STEP:
16043 28 : mode = SImode;
16044 28 : goto rdseed_step;
16045 :
16046 : case IX86_BUILTIN_RDSEED64_STEP:
16047 : mode = DImode;
16048 :
16049 66 : rdseed_step:
16050 66 : arg0 = CALL_EXPR_ARG (exp, 0);
16051 66 : op1 = expand_normal (arg0);
16052 66 : if (!address_operand (op1, VOIDmode))
16053 : {
16054 28 : op1 = convert_memory_address (Pmode, op1);
16055 28 : op1 = copy_addr_to_reg (op1);
16056 : }
16057 :
16058 66 : op0 = gen_reg_rtx (mode);
16059 66 : emit_insn (gen_rdseed (mode, op0));
16060 :
16061 66 : emit_move_insn (gen_rtx_MEM (mode, op1), op0);
16062 :
16063 66 : op2 = gen_reg_rtx (QImode);
16064 :
16065 66 : pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
16066 : const0_rtx);
16067 66 : emit_insn (gen_rtx_SET (op2, pat));
16068 :
16069 66 : if (target == 0
16070 66 : || !register_operand (target, SImode))
16071 1 : target = gen_reg_rtx (SImode);
16072 :
16073 66 : emit_insn (gen_zero_extendqisi2 (target, op2));
16074 66 : return target;
16075 :
16076 38 : case IX86_BUILTIN_SBB32:
16077 38 : icode = CODE_FOR_subborrowsi;
16078 38 : icode2 = CODE_FOR_subborrowsi_0;
16079 38 : mode0 = SImode;
16080 38 : mode1 = DImode;
16081 38 : mode2 = CCmode;
16082 38 : goto handlecarry;
16083 :
16084 44 : case IX86_BUILTIN_SBB64:
16085 44 : icode = CODE_FOR_subborrowdi;
16086 44 : icode2 = CODE_FOR_subborrowdi_0;
16087 44 : mode0 = DImode;
16088 44 : mode1 = TImode;
16089 44 : mode2 = CCmode;
16090 44 : goto handlecarry;
16091 :
16092 68 : case IX86_BUILTIN_ADDCARRYX32:
16093 68 : icode = CODE_FOR_addcarrysi;
16094 68 : icode2 = CODE_FOR_addcarrysi_0;
16095 68 : mode0 = SImode;
16096 68 : mode1 = DImode;
16097 68 : mode2 = CCCmode;
16098 68 : goto handlecarry;
16099 :
16100 : case IX86_BUILTIN_ADDCARRYX64:
16101 : icode = CODE_FOR_addcarrydi;
16102 : icode2 = CODE_FOR_addcarrydi_0;
16103 : mode0 = DImode;
16104 : mode1 = TImode;
16105 : mode2 = CCCmode;
16106 :
16107 212 : handlecarry:
16108 212 : arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
16109 212 : arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
16110 212 : arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
16111 212 : arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
16112 :
16113 212 : op1 = expand_normal (arg0);
16114 :
16115 212 : op2 = expand_normal (arg1);
16116 212 : if (!register_operand (op2, mode0))
16117 117 : op2 = copy_to_mode_reg (mode0, op2);
16118 :
16119 212 : op3 = expand_normal (arg2);
16120 212 : if (!register_operand (op3, mode0))
16121 120 : op3 = copy_to_mode_reg (mode0, op3);
16122 :
16123 212 : op4 = expand_normal (arg3);
16124 212 : if (!address_operand (op4, VOIDmode))
16125 : {
16126 67 : op4 = convert_memory_address (Pmode, op4);
16127 67 : op4 = copy_addr_to_reg (op4);
16128 : }
16129 :
16130 212 : op0 = gen_reg_rtx (mode0);
16131 212 : if (op1 == const0_rtx)
16132 : {
16133 : /* If arg0 is 0, optimize right away into add or sub
16134 : instruction that sets CCCmode flags. */
16135 21 : op1 = gen_rtx_REG (mode2, FLAGS_REG);
16136 21 : emit_insn (GEN_FCN (icode2) (op0, op2, op3));
16137 : }
16138 : else
16139 : {
16140 : /* Generate CF from input operand. */
16141 191 : ix86_expand_carry (op1);
16142 :
16143 : /* Generate instruction that consumes CF. */
16144 191 : op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
16145 191 : pat = gen_rtx_LTU (mode1, op1, const0_rtx);
16146 191 : pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
16147 191 : emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
16148 : }
16149 :
16150 : /* Return current CF value. */
16151 212 : if (target == 0)
16152 14 : target = gen_reg_rtx (QImode);
16153 :
16154 212 : pat = gen_rtx_LTU (QImode, op1, const0_rtx);
16155 212 : emit_insn (gen_rtx_SET (target, pat));
16156 :
16157 : /* Store the result. */
16158 212 : emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
16159 :
16160 212 : return target;
16161 :
16162 24 : case IX86_BUILTIN_READ_FLAGS:
16163 24 : if (ignore)
16164 1 : return const0_rtx;
16165 :
16166 23 : emit_insn (gen_pushfl ());
16167 :
16168 23 : if (optimize
16169 11 : || target == NULL_RTX
16170 11 : || !nonimmediate_operand (target, word_mode)
16171 34 : || GET_MODE (target) != word_mode)
16172 12 : target = gen_reg_rtx (word_mode);
16173 :
16174 23 : emit_insn (gen_pop (target));
16175 23 : return target;
16176 :
16177 21 : case IX86_BUILTIN_WRITE_FLAGS:
16178 :
16179 21 : arg0 = CALL_EXPR_ARG (exp, 0);
16180 21 : op0 = expand_normal (arg0);
16181 21 : if (!general_no_elim_operand (op0, word_mode))
16182 0 : op0 = copy_to_mode_reg (word_mode, op0);
16183 :
16184 21 : emit_insn (gen_push (op0));
16185 21 : emit_insn (gen_popfl ());
16186 21 : return 0;
16187 :
16188 22 : case IX86_BUILTIN_KTESTC8:
16189 22 : icode = CODE_FOR_ktestqi;
16190 22 : mode3 = CCCmode;
16191 22 : goto kortest;
16192 :
16193 22 : case IX86_BUILTIN_KTESTZ8:
16194 22 : icode = CODE_FOR_ktestqi;
16195 22 : mode3 = CCZmode;
16196 22 : goto kortest;
16197 :
16198 22 : case IX86_BUILTIN_KTESTC16:
16199 22 : icode = CODE_FOR_ktesthi;
16200 22 : mode3 = CCCmode;
16201 22 : goto kortest;
16202 :
16203 22 : case IX86_BUILTIN_KTESTZ16:
16204 22 : icode = CODE_FOR_ktesthi;
16205 22 : mode3 = CCZmode;
16206 22 : goto kortest;
16207 :
16208 22 : case IX86_BUILTIN_KTESTC32:
16209 22 : icode = CODE_FOR_ktestsi;
16210 22 : mode3 = CCCmode;
16211 22 : goto kortest;
16212 :
16213 22 : case IX86_BUILTIN_KTESTZ32:
16214 22 : icode = CODE_FOR_ktestsi;
16215 22 : mode3 = CCZmode;
16216 22 : goto kortest;
16217 :
16218 22 : case IX86_BUILTIN_KTESTC64:
16219 22 : icode = CODE_FOR_ktestdi;
16220 22 : mode3 = CCCmode;
16221 22 : goto kortest;
16222 :
16223 22 : case IX86_BUILTIN_KTESTZ64:
16224 22 : icode = CODE_FOR_ktestdi;
16225 22 : mode3 = CCZmode;
16226 22 : goto kortest;
16227 :
16228 22 : case IX86_BUILTIN_KORTESTC8:
16229 22 : icode = CODE_FOR_kortestqi;
16230 22 : mode3 = CCCmode;
16231 22 : goto kortest;
16232 :
16233 76 : case IX86_BUILTIN_KORTESTZ8:
16234 76 : icode = CODE_FOR_kortestqi;
16235 76 : mode3 = CCZmode;
16236 76 : goto kortest;
16237 :
16238 38 : case IX86_BUILTIN_KORTESTC16:
16239 38 : icode = CODE_FOR_kortesthi;
16240 38 : mode3 = CCCmode;
16241 38 : goto kortest;
16242 :
16243 91 : case IX86_BUILTIN_KORTESTZ16:
16244 91 : icode = CODE_FOR_kortesthi;
16245 91 : mode3 = CCZmode;
16246 91 : goto kortest;
16247 :
16248 22 : case IX86_BUILTIN_KORTESTC32:
16249 22 : icode = CODE_FOR_kortestsi;
16250 22 : mode3 = CCCmode;
16251 22 : goto kortest;
16252 :
16253 79 : case IX86_BUILTIN_KORTESTZ32:
16254 79 : icode = CODE_FOR_kortestsi;
16255 79 : mode3 = CCZmode;
16256 79 : goto kortest;
16257 :
16258 22 : case IX86_BUILTIN_KORTESTC64:
16259 22 : icode = CODE_FOR_kortestdi;
16260 22 : mode3 = CCCmode;
16261 22 : goto kortest;
16262 :
16263 : case IX86_BUILTIN_KORTESTZ64:
16264 : icode = CODE_FOR_kortestdi;
16265 : mode3 = CCZmode;
16266 :
16267 610 : kortest:
16268 610 : arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
16269 610 : arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
16270 610 : op0 = expand_normal (arg0);
16271 610 : op1 = expand_normal (arg1);
16272 :
16273 610 : mode0 = insn_data[icode].operand[0].mode;
16274 610 : mode1 = insn_data[icode].operand[1].mode;
16275 :
16276 610 : if (GET_MODE (op0) != VOIDmode)
16277 610 : op0 = force_reg (GET_MODE (op0), op0);
16278 :
16279 610 : op0 = gen_lowpart (mode0, op0);
16280 :
16281 610 : if (!insn_data[icode].operand[0].predicate (op0, mode0))
16282 0 : op0 = copy_to_mode_reg (mode0, op0);
16283 :
16284 610 : if (GET_MODE (op1) != VOIDmode)
16285 609 : op1 = force_reg (GET_MODE (op1), op1);
16286 :
16287 610 : op1 = gen_lowpart (mode1, op1);
16288 :
16289 610 : if (!insn_data[icode].operand[1].predicate (op1, mode1))
16290 1 : op1 = copy_to_mode_reg (mode1, op1);
16291 :
16292 610 : target = gen_reg_rtx (QImode);
16293 :
16294 : /* Emit kortest. */
16295 610 : emit_insn (GEN_FCN (icode) (op0, op1));
16296 : /* And use setcc to return result from flags. */
16297 610 : ix86_expand_setcc (target, EQ,
16298 : gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
16299 610 : return target;
16300 :
16301 24 : case IX86_BUILTIN_GATHERSIV2DF:
16302 24 : icode = CODE_FOR_avx2_gathersiv2df;
16303 24 : goto gather_gen;
16304 18 : case IX86_BUILTIN_GATHERSIV4DF:
16305 18 : icode = CODE_FOR_avx2_gathersiv4df;
16306 18 : goto gather_gen;
16307 21 : case IX86_BUILTIN_GATHERDIV2DF:
16308 21 : icode = CODE_FOR_avx2_gatherdiv2df;
16309 21 : goto gather_gen;
16310 32 : case IX86_BUILTIN_GATHERDIV4DF:
16311 32 : icode = CODE_FOR_avx2_gatherdiv4df;
16312 32 : goto gather_gen;
16313 30 : case IX86_BUILTIN_GATHERSIV4SF:
16314 30 : icode = CODE_FOR_avx2_gathersiv4sf;
16315 30 : goto gather_gen;
16316 37 : case IX86_BUILTIN_GATHERSIV8SF:
16317 37 : icode = CODE_FOR_avx2_gathersiv8sf;
16318 37 : goto gather_gen;
16319 24 : case IX86_BUILTIN_GATHERDIV4SF:
16320 24 : icode = CODE_FOR_avx2_gatherdiv4sf;
16321 24 : goto gather_gen;
16322 18 : case IX86_BUILTIN_GATHERDIV8SF:
16323 18 : icode = CODE_FOR_avx2_gatherdiv8sf;
16324 18 : goto gather_gen;
16325 18 : case IX86_BUILTIN_GATHERSIV2DI:
16326 18 : icode = CODE_FOR_avx2_gathersiv2di;
16327 18 : goto gather_gen;
16328 18 : case IX86_BUILTIN_GATHERSIV4DI:
16329 18 : icode = CODE_FOR_avx2_gathersiv4di;
16330 18 : goto gather_gen;
16331 27 : case IX86_BUILTIN_GATHERDIV2DI:
16332 27 : icode = CODE_FOR_avx2_gatherdiv2di;
16333 27 : goto gather_gen;
16334 29 : case IX86_BUILTIN_GATHERDIV4DI:
16335 29 : icode = CODE_FOR_avx2_gatherdiv4di;
16336 29 : goto gather_gen;
16337 20 : case IX86_BUILTIN_GATHERSIV4SI:
16338 20 : icode = CODE_FOR_avx2_gathersiv4si;
16339 20 : goto gather_gen;
16340 22 : case IX86_BUILTIN_GATHERSIV8SI:
16341 22 : icode = CODE_FOR_avx2_gathersiv8si;
16342 22 : goto gather_gen;
16343 28 : case IX86_BUILTIN_GATHERDIV4SI:
16344 28 : icode = CODE_FOR_avx2_gatherdiv4si;
16345 28 : goto gather_gen;
16346 18 : case IX86_BUILTIN_GATHERDIV8SI:
16347 18 : icode = CODE_FOR_avx2_gatherdiv8si;
16348 18 : goto gather_gen;
16349 20 : case IX86_BUILTIN_GATHERALTSIV4DF:
16350 20 : icode = CODE_FOR_avx2_gathersiv4df;
16351 20 : goto gather_gen;
16352 16 : case IX86_BUILTIN_GATHERALTDIV8SF:
16353 16 : icode = CODE_FOR_avx2_gatherdiv8sf;
16354 16 : goto gather_gen;
16355 4 : case IX86_BUILTIN_GATHERALTSIV4DI:
16356 4 : icode = CODE_FOR_avx2_gathersiv4di;
16357 4 : goto gather_gen;
16358 12 : case IX86_BUILTIN_GATHERALTDIV8SI:
16359 12 : icode = CODE_FOR_avx2_gatherdiv8si;
16360 12 : goto gather_gen;
16361 36 : case IX86_BUILTIN_GATHER3SIV16SF:
16362 36 : icode = CODE_FOR_avx512f_gathersiv16sf;
16363 36 : goto gather_gen;
16364 24 : case IX86_BUILTIN_GATHER3SIV8DF:
16365 24 : icode = CODE_FOR_avx512f_gathersiv8df;
16366 24 : goto gather_gen;
16367 24 : case IX86_BUILTIN_GATHER3DIV16SF:
16368 24 : icode = CODE_FOR_avx512f_gatherdiv16sf;
16369 24 : goto gather_gen;
16370 37 : case IX86_BUILTIN_GATHER3DIV8DF:
16371 37 : icode = CODE_FOR_avx512f_gatherdiv8df;
16372 37 : goto gather_gen;
16373 30 : case IX86_BUILTIN_GATHER3SIV16SI:
16374 30 : icode = CODE_FOR_avx512f_gathersiv16si;
16375 30 : goto gather_gen;
16376 24 : case IX86_BUILTIN_GATHER3SIV8DI:
16377 24 : icode = CODE_FOR_avx512f_gathersiv8di;
16378 24 : goto gather_gen;
16379 24 : case IX86_BUILTIN_GATHER3DIV16SI:
16380 24 : icode = CODE_FOR_avx512f_gatherdiv16si;
16381 24 : goto gather_gen;
16382 37 : case IX86_BUILTIN_GATHER3DIV8DI:
16383 37 : icode = CODE_FOR_avx512f_gatherdiv8di;
16384 37 : goto gather_gen;
16385 16 : case IX86_BUILTIN_GATHER3ALTSIV8DF:
16386 16 : icode = CODE_FOR_avx512f_gathersiv8df;
16387 16 : goto gather_gen;
16388 22 : case IX86_BUILTIN_GATHER3ALTDIV16SF:
16389 22 : icode = CODE_FOR_avx512f_gatherdiv16sf;
16390 22 : goto gather_gen;
16391 14 : case IX86_BUILTIN_GATHER3ALTSIV8DI:
16392 14 : icode = CODE_FOR_avx512f_gathersiv8di;
16393 14 : goto gather_gen;
16394 18 : case IX86_BUILTIN_GATHER3ALTDIV16SI:
16395 18 : icode = CODE_FOR_avx512f_gatherdiv16si;
16396 18 : goto gather_gen;
16397 18 : case IX86_BUILTIN_GATHER3SIV2DF:
16398 18 : icode = CODE_FOR_avx512vl_gathersiv2df;
16399 18 : goto gather_gen;
16400 10 : case IX86_BUILTIN_GATHER3SIV4DF:
16401 10 : icode = CODE_FOR_avx512vl_gathersiv4df;
16402 10 : goto gather_gen;
16403 15 : case IX86_BUILTIN_GATHER3DIV2DF:
16404 15 : icode = CODE_FOR_avx512vl_gatherdiv2df;
16405 15 : goto gather_gen;
16406 16 : case IX86_BUILTIN_GATHER3DIV4DF:
16407 16 : icode = CODE_FOR_avx512vl_gatherdiv4df;
16408 16 : goto gather_gen;
16409 14 : case IX86_BUILTIN_GATHER3SIV4SF:
16410 14 : icode = CODE_FOR_avx512vl_gathersiv4sf;
16411 14 : goto gather_gen;
16412 12 : case IX86_BUILTIN_GATHER3SIV8SF:
16413 12 : icode = CODE_FOR_avx512vl_gathersiv8sf;
16414 12 : goto gather_gen;
16415 22 : case IX86_BUILTIN_GATHER3DIV4SF:
16416 22 : icode = CODE_FOR_avx512vl_gatherdiv4sf;
16417 22 : goto gather_gen;
16418 10 : case IX86_BUILTIN_GATHER3DIV8SF:
16419 10 : icode = CODE_FOR_avx512vl_gatherdiv8sf;
16420 10 : goto gather_gen;
16421 20 : case IX86_BUILTIN_GATHER3SIV2DI:
16422 20 : icode = CODE_FOR_avx512vl_gathersiv2di;
16423 20 : goto gather_gen;
16424 10 : case IX86_BUILTIN_GATHER3SIV4DI:
16425 10 : icode = CODE_FOR_avx512vl_gathersiv4di;
16426 10 : goto gather_gen;
16427 14 : case IX86_BUILTIN_GATHER3DIV2DI:
16428 14 : icode = CODE_FOR_avx512vl_gatherdiv2di;
16429 14 : goto gather_gen;
16430 13 : case IX86_BUILTIN_GATHER3DIV4DI:
16431 13 : icode = CODE_FOR_avx512vl_gatherdiv4di;
16432 13 : goto gather_gen;
16433 14 : case IX86_BUILTIN_GATHER3SIV4SI:
16434 14 : icode = CODE_FOR_avx512vl_gathersiv4si;
16435 14 : goto gather_gen;
16436 12 : case IX86_BUILTIN_GATHER3SIV8SI:
16437 12 : icode = CODE_FOR_avx512vl_gathersiv8si;
16438 12 : goto gather_gen;
16439 24 : case IX86_BUILTIN_GATHER3DIV4SI:
16440 24 : icode = CODE_FOR_avx512vl_gatherdiv4si;
16441 24 : goto gather_gen;
16442 10 : case IX86_BUILTIN_GATHER3DIV8SI:
16443 10 : icode = CODE_FOR_avx512vl_gatherdiv8si;
16444 10 : goto gather_gen;
16445 4 : case IX86_BUILTIN_GATHER3ALTSIV4DF:
16446 4 : icode = CODE_FOR_avx512vl_gathersiv4df;
16447 4 : goto gather_gen;
16448 8 : case IX86_BUILTIN_GATHER3ALTDIV8SF:
16449 8 : icode = CODE_FOR_avx512vl_gatherdiv8sf;
16450 8 : goto gather_gen;
16451 6 : case IX86_BUILTIN_GATHER3ALTSIV4DI:
16452 6 : icode = CODE_FOR_avx512vl_gathersiv4di;
16453 6 : goto gather_gen;
16454 10 : case IX86_BUILTIN_GATHER3ALTDIV8SI:
16455 10 : icode = CODE_FOR_avx512vl_gatherdiv8si;
16456 10 : goto gather_gen;
16457 40 : case IX86_BUILTIN_SCATTERSIV16SF:
16458 40 : icode = CODE_FOR_avx512f_scattersiv16sf;
16459 40 : goto scatter_gen;
16460 27 : case IX86_BUILTIN_SCATTERSIV8DF:
16461 27 : icode = CODE_FOR_avx512f_scattersiv8df;
16462 27 : goto scatter_gen;
16463 24 : case IX86_BUILTIN_SCATTERDIV16SF:
16464 24 : icode = CODE_FOR_avx512f_scatterdiv16sf;
16465 24 : goto scatter_gen;
16466 33 : case IX86_BUILTIN_SCATTERDIV8DF:
16467 33 : icode = CODE_FOR_avx512f_scatterdiv8df;
16468 33 : goto scatter_gen;
16469 30 : case IX86_BUILTIN_SCATTERSIV16SI:
16470 30 : icode = CODE_FOR_avx512f_scattersiv16si;
16471 30 : goto scatter_gen;
16472 24 : case IX86_BUILTIN_SCATTERSIV8DI:
16473 24 : icode = CODE_FOR_avx512f_scattersiv8di;
16474 24 : goto scatter_gen;
16475 24 : case IX86_BUILTIN_SCATTERDIV16SI:
16476 24 : icode = CODE_FOR_avx512f_scatterdiv16si;
16477 24 : goto scatter_gen;
16478 29 : case IX86_BUILTIN_SCATTERDIV8DI:
16479 29 : icode = CODE_FOR_avx512f_scatterdiv8di;
16480 29 : goto scatter_gen;
16481 18 : case IX86_BUILTIN_SCATTERSIV8SF:
16482 18 : icode = CODE_FOR_avx512vl_scattersiv8sf;
16483 18 : goto scatter_gen;
16484 20 : case IX86_BUILTIN_SCATTERSIV4SF:
16485 20 : icode = CODE_FOR_avx512vl_scattersiv4sf;
16486 20 : goto scatter_gen;
16487 16 : case IX86_BUILTIN_SCATTERSIV4DF:
16488 16 : icode = CODE_FOR_avx512vl_scattersiv4df;
16489 16 : goto scatter_gen;
16490 16 : case IX86_BUILTIN_SCATTERSIV2DF:
16491 16 : icode = CODE_FOR_avx512vl_scattersiv2df;
16492 16 : goto scatter_gen;
16493 16 : case IX86_BUILTIN_SCATTERDIV8SF:
16494 16 : icode = CODE_FOR_avx512vl_scatterdiv8sf;
16495 16 : goto scatter_gen;
16496 16 : case IX86_BUILTIN_SCATTERDIV4SF:
16497 16 : icode = CODE_FOR_avx512vl_scatterdiv4sf;
16498 16 : goto scatter_gen;
16499 18 : case IX86_BUILTIN_SCATTERDIV4DF:
16500 18 : icode = CODE_FOR_avx512vl_scatterdiv4df;
16501 18 : goto scatter_gen;
16502 18 : case IX86_BUILTIN_SCATTERDIV2DF:
16503 18 : icode = CODE_FOR_avx512vl_scatterdiv2df;
16504 18 : goto scatter_gen;
16505 22 : case IX86_BUILTIN_SCATTERSIV8SI:
16506 22 : icode = CODE_FOR_avx512vl_scattersiv8si;
16507 22 : goto scatter_gen;
16508 24 : case IX86_BUILTIN_SCATTERSIV4SI:
16509 24 : icode = CODE_FOR_avx512vl_scattersiv4si;
16510 24 : goto scatter_gen;
16511 16 : case IX86_BUILTIN_SCATTERSIV4DI:
16512 16 : icode = CODE_FOR_avx512vl_scattersiv4di;
16513 16 : goto scatter_gen;
16514 16 : case IX86_BUILTIN_SCATTERSIV2DI:
16515 16 : icode = CODE_FOR_avx512vl_scattersiv2di;
16516 16 : goto scatter_gen;
16517 16 : case IX86_BUILTIN_SCATTERDIV8SI:
16518 16 : icode = CODE_FOR_avx512vl_scatterdiv8si;
16519 16 : goto scatter_gen;
16520 16 : case IX86_BUILTIN_SCATTERDIV4SI:
16521 16 : icode = CODE_FOR_avx512vl_scatterdiv4si;
16522 16 : goto scatter_gen;
16523 18 : case IX86_BUILTIN_SCATTERDIV4DI:
16524 18 : icode = CODE_FOR_avx512vl_scatterdiv4di;
16525 18 : goto scatter_gen;
16526 18 : case IX86_BUILTIN_SCATTERDIV2DI:
16527 18 : icode = CODE_FOR_avx512vl_scatterdiv2di;
16528 18 : goto scatter_gen;
16529 16 : case IX86_BUILTIN_SCATTERALTSIV8DF:
16530 16 : icode = CODE_FOR_avx512f_scattersiv8df;
16531 16 : goto scatter_gen;
16532 12 : case IX86_BUILTIN_SCATTERALTDIV16SF:
16533 12 : icode = CODE_FOR_avx512f_scatterdiv16sf;
16534 12 : goto scatter_gen;
16535 8 : case IX86_BUILTIN_SCATTERALTSIV8DI:
16536 8 : icode = CODE_FOR_avx512f_scattersiv8di;
16537 8 : goto scatter_gen;
16538 24 : case IX86_BUILTIN_SCATTERALTDIV16SI:
16539 24 : icode = CODE_FOR_avx512f_scatterdiv16si;
16540 24 : goto scatter_gen;
16541 4 : case IX86_BUILTIN_SCATTERALTSIV4DF:
16542 4 : icode = CODE_FOR_avx512vl_scattersiv4df;
16543 4 : goto scatter_gen;
16544 4 : case IX86_BUILTIN_SCATTERALTDIV8SF:
16545 4 : icode = CODE_FOR_avx512vl_scatterdiv8sf;
16546 4 : goto scatter_gen;
16547 4 : case IX86_BUILTIN_SCATTERALTSIV4DI:
16548 4 : icode = CODE_FOR_avx512vl_scattersiv4di;
16549 4 : goto scatter_gen;
16550 4 : case IX86_BUILTIN_SCATTERALTDIV8SI:
16551 4 : icode = CODE_FOR_avx512vl_scatterdiv8si;
16552 4 : goto scatter_gen;
16553 8 : case IX86_BUILTIN_SCATTERALTSIV2DF:
16554 8 : icode = CODE_FOR_avx512vl_scattersiv2df;
16555 8 : goto scatter_gen;
16556 8 : case IX86_BUILTIN_SCATTERALTDIV4SF:
16557 8 : icode = CODE_FOR_avx512vl_scatterdiv4sf;
16558 8 : goto scatter_gen;
16559 8 : case IX86_BUILTIN_SCATTERALTSIV2DI:
16560 8 : icode = CODE_FOR_avx512vl_scattersiv2di;
16561 8 : goto scatter_gen;
16562 8 : case IX86_BUILTIN_SCATTERALTDIV4SI:
16563 8 : icode = CODE_FOR_avx512vl_scatterdiv4si;
16564 8 : goto scatter_gen;
16565 :
16566 1004 : gather_gen:
16567 1004 : rtx half;
16568 1004 : rtx (*gen) (rtx, rtx);
16569 :
16570 1004 : arg0 = CALL_EXPR_ARG (exp, 0);
16571 1004 : arg1 = CALL_EXPR_ARG (exp, 1);
16572 1004 : arg2 = CALL_EXPR_ARG (exp, 2);
16573 1004 : arg3 = CALL_EXPR_ARG (exp, 3);
16574 1004 : arg4 = CALL_EXPR_ARG (exp, 4);
16575 1004 : op0 = expand_normal (arg0);
16576 1004 : op1 = expand_normal (arg1);
16577 1004 : op2 = expand_normal (arg2);
16578 1004 : op3 = ix86_expand_unsigned_small_int_cst_argument (arg3);
16579 1004 : op4 = expand_normal (arg4);
16580 : /* Note the arg order is different from the operand order. */
16581 1004 : mode0 = insn_data[icode].operand[1].mode;
16582 1004 : mode2 = insn_data[icode].operand[3].mode;
16583 1004 : mode3 = insn_data[icode].operand[4].mode;
16584 1004 : mode4 = insn_data[icode].operand[5].mode;
16585 :
16586 1004 : if (target == NULL_RTX
16587 1004 : || GET_MODE (target) != insn_data[icode].operand[0].mode
16588 1904 : || !insn_data[icode].operand[0].predicate (target,
16589 : GET_MODE (target)))
16590 105 : subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
16591 : else
16592 : subtarget = target;
16593 :
16594 1004 : switch (fcode)
16595 : {
16596 30 : case IX86_BUILTIN_GATHER3ALTSIV8DF:
16597 30 : case IX86_BUILTIN_GATHER3ALTSIV8DI:
16598 30 : half = gen_reg_rtx (V8SImode);
16599 30 : if (!nonimmediate_operand (op2, V16SImode))
16600 0 : op2 = copy_to_mode_reg (V16SImode, op2);
16601 30 : emit_insn (gen_vec_extract_lo_v16si (half, op2));
16602 30 : op2 = half;
16603 30 : break;
16604 34 : case IX86_BUILTIN_GATHER3ALTSIV4DF:
16605 34 : case IX86_BUILTIN_GATHER3ALTSIV4DI:
16606 34 : case IX86_BUILTIN_GATHERALTSIV4DF:
16607 34 : case IX86_BUILTIN_GATHERALTSIV4DI:
16608 34 : half = gen_reg_rtx (V4SImode);
16609 34 : if (!nonimmediate_operand (op2, V8SImode))
16610 0 : op2 = copy_to_mode_reg (V8SImode, op2);
16611 34 : emit_insn (gen_vec_extract_lo_v8si (half, op2));
16612 34 : op2 = half;
16613 34 : break;
16614 40 : case IX86_BUILTIN_GATHER3ALTDIV16SF:
16615 40 : case IX86_BUILTIN_GATHER3ALTDIV16SI:
16616 40 : half = gen_reg_rtx (mode0);
16617 40 : if (mode0 == V8SFmode)
16618 : gen = gen_vec_extract_lo_v16sf;
16619 : else
16620 18 : gen = gen_vec_extract_lo_v16si;
16621 40 : if (!nonimmediate_operand (op0, GET_MODE (op0)))
16622 40 : op0 = copy_to_mode_reg (GET_MODE (op0), op0);
16623 40 : emit_insn (gen (half, op0));
16624 40 : op0 = half;
16625 40 : op3 = lowpart_subreg (QImode, op3, HImode);
16626 40 : break;
16627 46 : case IX86_BUILTIN_GATHER3ALTDIV8SF:
16628 46 : case IX86_BUILTIN_GATHER3ALTDIV8SI:
16629 46 : case IX86_BUILTIN_GATHERALTDIV8SF:
16630 46 : case IX86_BUILTIN_GATHERALTDIV8SI:
16631 46 : half = gen_reg_rtx (mode0);
16632 46 : if (mode0 == V4SFmode)
16633 : gen = gen_vec_extract_lo_v8sf;
16634 : else
16635 22 : gen = gen_vec_extract_lo_v8si;
16636 46 : if (!nonimmediate_operand (op0, GET_MODE (op0)))
16637 46 : op0 = copy_to_mode_reg (GET_MODE (op0), op0);
16638 46 : emit_insn (gen (half, op0));
16639 46 : op0 = half;
16640 46 : if (VECTOR_MODE_P (GET_MODE (op3)))
16641 : {
16642 28 : half = gen_reg_rtx (mode0);
16643 28 : if (!nonimmediate_operand (op3, GET_MODE (op3)))
16644 12 : op3 = copy_to_mode_reg (GET_MODE (op3), op3);
16645 28 : emit_insn (gen (half, op3));
16646 28 : op3 = half;
16647 : }
16648 : break;
16649 : default:
16650 : break;
16651 : }
16652 :
16653 : /* Force memory operand only with base register here. But we
16654 : don't want to do it on memory operand for other builtin
16655 : functions. */
16656 1004 : op1 = ix86_zero_extend_to_Pmode (op1);
16657 :
16658 1004 : if (!insn_data[icode].operand[1].predicate (op0, mode0))
16659 403 : op0 = copy_to_mode_reg (mode0, op0);
16660 1009 : if (!insn_data[icode].operand[2].predicate (op1, Pmode))
16661 0 : op1 = copy_to_mode_reg (Pmode, op1);
16662 1004 : if (!insn_data[icode].operand[3].predicate (op2, mode2))
16663 221 : op2 = copy_to_mode_reg (mode2, op2);
16664 :
16665 1004 : op3 = fixup_modeless_constant (op3, mode3);
16666 :
16667 1004 : if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
16668 : {
16669 1004 : if (!insn_data[icode].operand[4].predicate (op3, mode3))
16670 356 : op3 = copy_to_mode_reg (mode3, op3);
16671 : }
16672 : else
16673 : {
16674 0 : op3 = copy_to_reg (op3);
16675 0 : op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
16676 : }
16677 1004 : if (!insn_data[icode].operand[5].predicate (op4, mode4))
16678 : {
16679 0 : error ("the last argument must be scale 1, 2, 4, 8");
16680 0 : return const0_rtx;
16681 : }
16682 :
16683 : /* Optimize. If mask is known to have all high bits set,
16684 : replace op0 with pc_rtx to signal that the instruction
16685 : overwrites the whole destination and doesn't use its
16686 : previous contents. */
16687 1004 : if (optimize)
16688 : {
16689 914 : if (TREE_CODE (arg3) == INTEGER_CST)
16690 : {
16691 209 : if (integer_all_onesp (arg3))
16692 201 : op0 = pc_rtx;
16693 : }
16694 705 : else if (TREE_CODE (arg3) == VECTOR_CST)
16695 : {
16696 : unsigned int negative = 0;
16697 755 : for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
16698 : {
16699 620 : tree cst = VECTOR_CST_ELT (arg3, i);
16700 620 : if (TREE_CODE (cst) == INTEGER_CST
16701 620 : && tree_int_cst_sign_bit (cst))
16702 286 : negative++;
16703 334 : else if (TREE_CODE (cst) == REAL_CST
16704 334 : && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
16705 306 : negative++;
16706 : }
16707 135 : if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
16708 121 : op0 = pc_rtx;
16709 : }
16710 570 : else if (TREE_CODE (arg3) == SSA_NAME
16711 570 : && VECTOR_TYPE_P (TREE_TYPE (arg3)))
16712 : {
16713 : /* Recognize also when mask is like:
16714 : __v2df src = _mm_setzero_pd ();
16715 : __v2df mask = _mm_cmpeq_pd (src, src);
16716 : or
16717 : __v8sf src = _mm256_setzero_ps ();
16718 : __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
16719 : as that is a cheaper way to load all ones into
16720 : a register than having to load a constant from
16721 : memory. */
16722 259 : gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
16723 259 : if (is_gimple_call (def_stmt))
16724 : {
16725 76 : tree fndecl = gimple_call_fndecl (def_stmt);
16726 76 : if (fndecl
16727 76 : && fndecl_built_in_p (fndecl, BUILT_IN_MD))
16728 67 : switch (DECL_MD_FUNCTION_CODE (fndecl))
16729 : {
16730 24 : case IX86_BUILTIN_CMPPD:
16731 24 : case IX86_BUILTIN_CMPPS:
16732 24 : case IX86_BUILTIN_CMPPD256:
16733 24 : case IX86_BUILTIN_CMPPS256:
16734 24 : if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
16735 : break;
16736 : /* FALLTHRU */
16737 49 : case IX86_BUILTIN_CMPEQPD:
16738 49 : case IX86_BUILTIN_CMPEQPS:
16739 49 : if (initializer_zerop (gimple_call_arg (def_stmt, 0))
16740 49 : && initializer_zerop (gimple_call_arg (def_stmt,
16741 : 1)))
16742 49 : op0 = pc_rtx;
16743 : break;
16744 : default:
16745 : break;
16746 : }
16747 : }
16748 : }
16749 : }
16750 :
16751 1004 : pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
16752 1004 : if (! pat)
16753 0 : return const0_rtx;
16754 1004 : emit_insn (pat);
16755 :
16756 1004 : switch (fcode)
16757 : {
16758 24 : case IX86_BUILTIN_GATHER3DIV16SF:
16759 24 : if (target == NULL_RTX)
16760 0 : target = gen_reg_rtx (V8SFmode);
16761 24 : emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
16762 24 : break;
16763 24 : case IX86_BUILTIN_GATHER3DIV16SI:
16764 24 : if (target == NULL_RTX)
16765 0 : target = gen_reg_rtx (V8SImode);
16766 24 : emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
16767 24 : break;
16768 28 : case IX86_BUILTIN_GATHER3DIV8SF:
16769 28 : case IX86_BUILTIN_GATHERDIV8SF:
16770 28 : if (target == NULL_RTX)
16771 0 : target = gen_reg_rtx (V4SFmode);
16772 28 : emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
16773 28 : break;
16774 28 : case IX86_BUILTIN_GATHER3DIV8SI:
16775 28 : case IX86_BUILTIN_GATHERDIV8SI:
16776 28 : if (target == NULL_RTX)
16777 0 : target = gen_reg_rtx (V4SImode);
16778 28 : emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
16779 28 : break;
16780 : default:
16781 : target = subtarget;
16782 : break;
16783 : }
16784 : return target;
16785 :
16786 623 : scatter_gen:
16787 623 : arg0 = CALL_EXPR_ARG (exp, 0);
16788 623 : arg1 = CALL_EXPR_ARG (exp, 1);
16789 623 : arg2 = CALL_EXPR_ARG (exp, 2);
16790 623 : arg3 = CALL_EXPR_ARG (exp, 3);
16791 623 : arg4 = CALL_EXPR_ARG (exp, 4);
16792 623 : op0 = expand_normal (arg0);
16793 623 : op1 = ix86_expand_unsigned_small_int_cst_argument (arg1);
16794 623 : op2 = expand_normal (arg2);
16795 623 : op3 = expand_normal (arg3);
16796 623 : op4 = expand_normal (arg4);
16797 623 : mode1 = insn_data[icode].operand[1].mode;
16798 623 : mode2 = insn_data[icode].operand[2].mode;
16799 623 : mode3 = insn_data[icode].operand[3].mode;
16800 623 : mode4 = insn_data[icode].operand[4].mode;
16801 :
16802 : /* Scatter instruction stores operand op3 to memory with
16803 : indices from op2 and scale from op4 under writemask op1.
16804 : If index operand op2 has more elements than source operand
16805 : op3, only its low half needs to be used. And vice versa. */
16806 623 : switch (fcode)
16807 : {
16808 24 : case IX86_BUILTIN_SCATTERALTSIV8DF:
16809 24 : case IX86_BUILTIN_SCATTERALTSIV8DI:
16810 24 : half = gen_reg_rtx (V8SImode);
16811 24 : if (!nonimmediate_operand (op2, V16SImode))
16812 0 : op2 = copy_to_mode_reg (V16SImode, op2);
16813 24 : emit_insn (gen_vec_extract_lo_v16si (half, op2));
16814 24 : op2 = half;
16815 24 : break;
16816 36 : case IX86_BUILTIN_SCATTERALTDIV16SF:
16817 36 : case IX86_BUILTIN_SCATTERALTDIV16SI:
16818 36 : half = gen_reg_rtx (mode3);
16819 36 : if (mode3 == V8SFmode)
16820 : gen = gen_vec_extract_lo_v16sf;
16821 : else
16822 24 : gen = gen_vec_extract_lo_v16si;
16823 36 : if (!nonimmediate_operand (op3, GET_MODE (op3)))
16824 0 : op3 = copy_to_mode_reg (GET_MODE (op3), op3);
16825 36 : emit_insn (gen (half, op3));
16826 36 : op3 = half;
16827 36 : break;
16828 8 : case IX86_BUILTIN_SCATTERALTSIV4DF:
16829 8 : case IX86_BUILTIN_SCATTERALTSIV4DI:
16830 8 : half = gen_reg_rtx (V4SImode);
16831 8 : if (!nonimmediate_operand (op2, V8SImode))
16832 0 : op2 = copy_to_mode_reg (V8SImode, op2);
16833 8 : emit_insn (gen_vec_extract_lo_v8si (half, op2));
16834 8 : op2 = half;
16835 8 : break;
16836 8 : case IX86_BUILTIN_SCATTERALTDIV8SF:
16837 8 : case IX86_BUILTIN_SCATTERALTDIV8SI:
16838 8 : half = gen_reg_rtx (mode3);
16839 8 : if (mode3 == V4SFmode)
16840 : gen = gen_vec_extract_lo_v8sf;
16841 : else
16842 4 : gen = gen_vec_extract_lo_v8si;
16843 8 : if (!nonimmediate_operand (op3, GET_MODE (op3)))
16844 0 : op3 = copy_to_mode_reg (GET_MODE (op3), op3);
16845 8 : emit_insn (gen (half, op3));
16846 8 : op3 = half;
16847 8 : break;
16848 16 : case IX86_BUILTIN_SCATTERALTSIV2DF:
16849 16 : case IX86_BUILTIN_SCATTERALTSIV2DI:
16850 16 : if (!nonimmediate_operand (op2, V4SImode))
16851 0 : op2 = copy_to_mode_reg (V4SImode, op2);
16852 : break;
16853 16 : case IX86_BUILTIN_SCATTERALTDIV4SF:
16854 16 : case IX86_BUILTIN_SCATTERALTDIV4SI:
16855 16 : if (!nonimmediate_operand (op3, GET_MODE (op3)))
16856 0 : op3 = copy_to_mode_reg (GET_MODE (op3), op3);
16857 : break;
16858 : default:
16859 : break;
16860 : }
16861 :
16862 : /* Force memory operand only with base register here. But we
16863 : don't want to do it on memory operand for other builtin
16864 : functions. */
16865 633 : op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
16866 :
16867 628 : if (!insn_data[icode].operand[0].predicate (op0, Pmode))
16868 0 : op0 = copy_to_mode_reg (Pmode, op0);
16869 :
16870 623 : op1 = fixup_modeless_constant (op1, mode1);
16871 :
16872 623 : if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
16873 : {
16874 607 : if (!insn_data[icode].operand[1].predicate (op1, mode1))
16875 273 : op1 = copy_to_mode_reg (mode1, op1);
16876 : }
16877 : else
16878 : {
16879 16 : op1 = copy_to_reg (op1);
16880 16 : op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
16881 : }
16882 :
16883 623 : if (!insn_data[icode].operand[2].predicate (op2, mode2))
16884 57 : op2 = copy_to_mode_reg (mode2, op2);
16885 :
16886 623 : if (!insn_data[icode].operand[3].predicate (op3, mode3))
16887 82 : op3 = copy_to_mode_reg (mode3, op3);
16888 :
16889 623 : if (!insn_data[icode].operand[4].predicate (op4, mode4))
16890 : {
16891 0 : error ("the last argument must be scale 1, 2, 4, 8");
16892 0 : return const0_rtx;
16893 : }
16894 :
16895 623 : pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
16896 623 : if (! pat)
16897 0 : return const0_rtx;
16898 :
16899 623 : emit_insn (pat);
16900 623 : return 0;
16901 :
16902 23 : case IX86_BUILTIN_XABORT:
16903 23 : icode = CODE_FOR_xabort;
16904 23 : arg0 = CALL_EXPR_ARG (exp, 0);
16905 23 : op0 = expand_normal (arg0);
16906 23 : mode0 = insn_data[icode].operand[0].mode;
16907 23 : if (!insn_data[icode].operand[0].predicate (op0, mode0))
16908 : {
16909 0 : error ("the argument to %<xabort%> intrinsic must "
16910 : "be an 8-bit immediate");
16911 0 : return const0_rtx;
16912 : }
16913 23 : emit_insn (gen_xabort (op0));
16914 23 : return 0;
16915 :
16916 55 : case IX86_BUILTIN_RDSSPD:
16917 55 : case IX86_BUILTIN_RDSSPQ:
16918 55 : mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
16919 :
16920 55 : if (target == 0
16921 55 : || !register_operand (target, mode))
16922 0 : target = gen_reg_rtx (mode);
16923 :
16924 55 : op0 = force_reg (mode, const0_rtx);
16925 :
16926 55 : emit_insn (gen_rdssp (mode, target, op0));
16927 55 : return target;
16928 :
16929 55 : case IX86_BUILTIN_INCSSPD:
16930 55 : case IX86_BUILTIN_INCSSPQ:
16931 55 : mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
16932 :
16933 55 : arg0 = CALL_EXPR_ARG (exp, 0);
16934 55 : op0 = expand_normal (arg0);
16935 :
16936 55 : op0 = force_reg (mode, op0);
16937 :
16938 55 : emit_insn (gen_incssp (mode, op0));
16939 55 : return 0;
16940 :
16941 20 : case IX86_BUILTIN_HRESET:
16942 20 : icode = CODE_FOR_hreset;
16943 20 : arg0 = CALL_EXPR_ARG (exp, 0);
16944 20 : op0 = expand_normal (arg0);
16945 20 : op0 = force_reg (SImode, op0);
16946 20 : emit_insn (gen_hreset (op0));
16947 20 : return 0;
16948 :
16949 38 : case IX86_BUILTIN_RSTORSSP:
16950 38 : case IX86_BUILTIN_CLRSSBSY:
16951 38 : arg0 = CALL_EXPR_ARG (exp, 0);
16952 38 : op0 = expand_normal (arg0);
16953 19 : icode = (fcode == IX86_BUILTIN_RSTORSSP
16954 38 : ? CODE_FOR_rstorssp
16955 : : CODE_FOR_clrssbsy);
16956 :
16957 38 : if (!address_operand (op0, VOIDmode))
16958 : {
16959 18 : op0 = convert_memory_address (Pmode, op0);
16960 18 : op0 = copy_addr_to_reg (op0);
16961 : }
16962 38 : emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
16963 38 : return 0;
16964 :
16965 80 : case IX86_BUILTIN_WRSSD:
16966 80 : case IX86_BUILTIN_WRSSQ:
16967 80 : case IX86_BUILTIN_WRUSSD:
16968 80 : case IX86_BUILTIN_WRUSSQ:
16969 80 : mode = ((fcode == IX86_BUILTIN_WRSSD
16970 80 : || fcode == IX86_BUILTIN_WRUSSD)
16971 80 : ? SImode : DImode);
16972 :
16973 80 : arg0 = CALL_EXPR_ARG (exp, 0);
16974 80 : op0 = expand_normal (arg0);
16975 80 : arg1 = CALL_EXPR_ARG (exp, 1);
16976 80 : op1 = expand_normal (arg1);
16977 :
16978 80 : op0 = force_reg (mode, op0);
16979 :
16980 80 : if (!address_operand (op1, VOIDmode))
16981 : {
16982 36 : op1 = convert_memory_address (Pmode, op1);
16983 36 : op1 = copy_addr_to_reg (op1);
16984 : }
16985 80 : op1 = gen_rtx_MEM (mode, op1);
16986 :
16987 80 : icode = ((fcode == IX86_BUILTIN_WRSSD
16988 80 : || fcode == IX86_BUILTIN_WRSSQ)
16989 80 : ? code_for_wrss (mode)
16990 40 : : code_for_wruss (mode));
16991 80 : emit_insn (GEN_FCN (icode) (op0, op1));
16992 :
16993 80 : return 0;
16994 :
16995 116627 : default:
16996 116627 : break;
16997 : }
16998 :
16999 116627 : if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
17000 116627 : && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
17001 : {
17002 27053 : i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
17003 27053 : return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
17004 27053 : target);
17005 : }
17006 :
17007 89574 : if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
17008 89574 : && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
17009 : {
17010 93 : i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
17011 93 : return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
17012 93 : target);
17013 : }
17014 :
17015 89481 : if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
17016 89481 : && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
17017 : {
17018 71075 : i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
17019 :
17020 71075 : switch (fcode)
17021 : {
17022 0 : case IX86_BUILTIN_RDPID:
17023 0 : return ix86_expand_special_args_builtin (bdesc_args + i, exp,
17024 0 : target);
17025 74 : case IX86_BUILTIN_VCOMISBF16EQ:
17026 74 : case IX86_BUILTIN_VCOMISBF16NE:
17027 74 : case IX86_BUILTIN_VCOMISBF16GT:
17028 74 : case IX86_BUILTIN_VCOMISBF16GE:
17029 74 : case IX86_BUILTIN_VCOMISBF16LT:
17030 74 : case IX86_BUILTIN_VCOMISBF16LE:
17031 74 : return ix86_expand_sse_comi (bdesc_args + i, exp, target, false);
17032 15 : case IX86_BUILTIN_FABSQ:
17033 15 : case IX86_BUILTIN_COPYSIGNQ:
17034 15 : if (!TARGET_SSE)
17035 : /* Emit a normal call if SSE isn't available. */
17036 0 : return expand_call (exp, target, ignore);
17037 : /* FALLTHRU */
17038 71001 : default:
17039 71001 : return ix86_expand_args_builtin (bdesc_args + i, exp, target);
17040 : }
17041 : }
17042 :
17043 18406 : if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
17044 18406 : && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
17045 : {
17046 473 : i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
17047 473 : return ix86_expand_sse_comi (bdesc_comi + i, exp, target, true);
17048 : }
17049 :
17050 17933 : if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
17051 17933 : && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
17052 : {
17053 15589 : i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
17054 15589 : return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
17055 : }
17056 :
17057 2344 : if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
17058 2344 : && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
17059 : {
17060 216 : i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
17061 216 : return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
17062 : }
17063 :
17064 2128 : if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
17065 2128 : && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
17066 : {
17067 275 : i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
17068 275 : return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
17069 : }
17070 :
17071 1853 : if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
17072 1853 : && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
17073 : {
17074 1815 : i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
17075 1815 : const struct builtin_description *d = bdesc_multi_arg + i;
17076 1815 : return ix86_expand_multi_arg_builtin (d->icode, exp, target,
17077 : (enum ix86_builtin_func_type)
17078 1815 : d->flag, d->comparison);
17079 : }
17080 :
17081 38 : if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
17082 38 : && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
17083 : {
17084 38 : i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
17085 38 : return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
17086 38 : target);
17087 : }
17088 :
17089 0 : gcc_unreachable ();
17090 : }
17091 :
/* Ways of materializing a vector in which every SImode element holds
   the same constant.  Each enumerator names the instruction that
   ix86_vector_duplicate_simode_const applies; see the shift handling
   there for the full explanation.  */
enum ix86_vec_bcast_alg
{
  /* Move of the all-zeros vector constant.  */
  VEC_BCAST_PXOR,

  /* Move of the all-ones vector constant.  */
  VEC_BCAST_PCMPEQ,

  /* Byte-wise absolute value of an all-ones vector.  */
  VEC_BCAST_PABSB,

  /* Byte-wise sum of an all-ones vector with itself.  */
  VEC_BCAST_PADDB,

  /* Logical right shift of an all-ones vector of HImode elements.  */
  VEC_BCAST_PSRLW,

  /* Logical right shift of an all-ones vector of SImode elements.  */
  VEC_BCAST_PSRLD,

  /* Left shift of an all-ones vector of HImode elements.  */
  VEC_BCAST_PSLLW,

  /* Presumably the SImode-element counterpart of VEC_BCAST_PSLLW
     (the table pairs it with keys such as 0x80000000, count 31).  */
  VEC_BCAST_PSLLD
};
17104 :
17105 : struct ix86_vec_bcast_map_simode_t
17106 : {
17107 : unsigned int key;
17108 : enum ix86_vec_bcast_alg alg;
17109 : unsigned int arg;
17110 : };
17111 :
17112 : /* This table must be kept sorted as values are looked-up using bsearch. */
17113 : static const ix86_vec_bcast_map_simode_t ix86_vec_bcast_map_simode[] = {
17114 : { 0x00000000, VEC_BCAST_PXOR, 0 },
17115 : { 0x00000001, VEC_BCAST_PSRLD, 31 },
17116 : { 0x00000003, VEC_BCAST_PSRLD, 30 },
17117 : { 0x00000007, VEC_BCAST_PSRLD, 29 },
17118 : { 0x0000000f, VEC_BCAST_PSRLD, 28 },
17119 : { 0x0000001f, VEC_BCAST_PSRLD, 27 },
17120 : { 0x0000003f, VEC_BCAST_PSRLD, 26 },
17121 : { 0x0000007f, VEC_BCAST_PSRLD, 25 },
17122 : { 0x000000ff, VEC_BCAST_PSRLD, 24 },
17123 : { 0x000001ff, VEC_BCAST_PSRLD, 23 },
17124 : { 0x000003ff, VEC_BCAST_PSRLD, 22 },
17125 : { 0x000007ff, VEC_BCAST_PSRLD, 21 },
17126 : { 0x00000fff, VEC_BCAST_PSRLD, 20 },
17127 : { 0x00001fff, VEC_BCAST_PSRLD, 19 },
17128 : { 0x00003fff, VEC_BCAST_PSRLD, 18 },
17129 : { 0x00007fff, VEC_BCAST_PSRLD, 17 },
17130 : { 0x0000ffff, VEC_BCAST_PSRLD, 16 },
17131 : { 0x00010001, VEC_BCAST_PSRLW, 15 },
17132 : { 0x0001ffff, VEC_BCAST_PSRLD, 15 },
17133 : { 0x00030003, VEC_BCAST_PSRLW, 14 },
17134 : { 0x0003ffff, VEC_BCAST_PSRLD, 14 },
17135 : { 0x00070007, VEC_BCAST_PSRLW, 13 },
17136 : { 0x0007ffff, VEC_BCAST_PSRLD, 13 },
17137 : { 0x000f000f, VEC_BCAST_PSRLW, 12 },
17138 : { 0x000fffff, VEC_BCAST_PSRLD, 12 },
17139 : { 0x001f001f, VEC_BCAST_PSRLW, 11 },
17140 : { 0x001fffff, VEC_BCAST_PSRLD, 11 },
17141 : { 0x003f003f, VEC_BCAST_PSRLW, 10 },
17142 : { 0x003fffff, VEC_BCAST_PSRLD, 10 },
17143 : { 0x007f007f, VEC_BCAST_PSRLW, 9 },
17144 : { 0x007fffff, VEC_BCAST_PSRLD, 9 },
17145 : { 0x00ff00ff, VEC_BCAST_PSRLW, 8 },
17146 : { 0x00ffffff, VEC_BCAST_PSRLD, 8 },
17147 : { 0x01010101, VEC_BCAST_PABSB, 0 },
17148 : { 0x01ff01ff, VEC_BCAST_PSRLW, 7 },
17149 : { 0x01ffffff, VEC_BCAST_PSRLD, 7 },
17150 : { 0x03ff03ff, VEC_BCAST_PSRLW, 6 },
17151 : { 0x03ffffff, VEC_BCAST_PSRLD, 6 },
17152 : { 0x07ff07ff, VEC_BCAST_PSRLW, 5 },
17153 : { 0x07ffffff, VEC_BCAST_PSRLD, 5 },
17154 : { 0x0fff0fff, VEC_BCAST_PSRLW, 4 },
17155 : { 0x0fffffff, VEC_BCAST_PSRLD, 4 },
17156 : { 0x1fff1fff, VEC_BCAST_PSRLW, 3 },
17157 : { 0x1fffffff, VEC_BCAST_PSRLD, 3 },
17158 : { 0x3fff3fff, VEC_BCAST_PSRLW, 2 },
17159 : { 0x3fffffff, VEC_BCAST_PSRLD, 2 },
17160 : { 0x7fff7fff, VEC_BCAST_PSRLW, 1 },
17161 : { 0x7fffffff, VEC_BCAST_PSRLD, 1 },
17162 : { 0x80000000, VEC_BCAST_PSLLD, 31 },
17163 : { 0x80008000, VEC_BCAST_PSLLW, 15 },
17164 : { 0xc0000000, VEC_BCAST_PSLLD, 30 },
17165 : { 0xc000c000, VEC_BCAST_PSLLW, 14 },
17166 : { 0xe0000000, VEC_BCAST_PSLLD, 29 },
17167 : { 0xe000e000, VEC_BCAST_PSLLW, 13 },
17168 : { 0xf0000000, VEC_BCAST_PSLLD, 28 },
17169 : { 0xf000f000, VEC_BCAST_PSLLW, 12 },
17170 : { 0xf8000000, VEC_BCAST_PSLLD, 27 },
17171 : { 0xf800f800, VEC_BCAST_PSLLW, 11 },
17172 : { 0xfc000000, VEC_BCAST_PSLLD, 26 },
17173 : { 0xfc00fc00, VEC_BCAST_PSLLW, 10 },
17174 : { 0xfe000000, VEC_BCAST_PSLLD, 25 },
17175 : { 0xfe00fe00, VEC_BCAST_PSLLW, 9 },
17176 : { 0xfefefefe, VEC_BCAST_PADDB, 0 },
17177 : { 0xff000000, VEC_BCAST_PSLLD, 24 },
17178 : { 0xff00ff00, VEC_BCAST_PSLLW, 8 },
17179 : { 0xff800000, VEC_BCAST_PSLLD, 23 },
17180 : { 0xff80ff80, VEC_BCAST_PSLLW, 7 },
17181 : { 0xffc00000, VEC_BCAST_PSLLD, 22 },
17182 : { 0xffc0ffc0, VEC_BCAST_PSLLW, 6 },
17183 : { 0xffe00000, VEC_BCAST_PSLLD, 21 },
17184 : { 0xffe0ffe0, VEC_BCAST_PSLLW, 5 },
17185 : { 0xfff00000, VEC_BCAST_PSLLD, 20 },
17186 : { 0xfff0fff0, VEC_BCAST_PSLLW, 4 },
17187 : { 0xfff80000, VEC_BCAST_PSLLD, 19 },
17188 : { 0xfff8fff8, VEC_BCAST_PSLLW, 3 },
17189 : { 0xfffc0000, VEC_BCAST_PSLLD, 18 },
17190 : { 0xfffcfffc, VEC_BCAST_PSLLW, 2 },
17191 : { 0xfffe0000, VEC_BCAST_PSLLD, 17 },
17192 : { 0xfffefffe, VEC_BCAST_PSLLW, 1 },
17193 : { 0xffff0000, VEC_BCAST_PSLLD, 16 },
17194 : { 0xffff8000, VEC_BCAST_PSLLD, 15 },
17195 : { 0xffffc000, VEC_BCAST_PSLLD, 14 },
17196 : { 0xffffe000, VEC_BCAST_PSLLD, 13 },
17197 : { 0xfffff000, VEC_BCAST_PSLLD, 12 },
17198 : { 0xfffff800, VEC_BCAST_PSLLD, 11 },
17199 : { 0xfffffc00, VEC_BCAST_PSLLD, 10 },
17200 : { 0xfffffe00, VEC_BCAST_PSLLD, 9 },
17201 : { 0xffffff00, VEC_BCAST_PSLLD, 8 },
17202 : { 0xffffff80, VEC_BCAST_PSLLD, 7 },
17203 : { 0xffffffc0, VEC_BCAST_PSLLD, 6 },
17204 : { 0xffffffe0, VEC_BCAST_PSLLD, 5 },
17205 : { 0xfffffff0, VEC_BCAST_PSLLD, 4 },
17206 : { 0xfffffff8, VEC_BCAST_PSLLD, 3 },
17207 : { 0xfffffffc, VEC_BCAST_PSLLD, 2 },
17208 : { 0xfffffffe, VEC_BCAST_PSLLD, 1 },
17209 : { 0xffffffff, VEC_BCAST_PCMPEQ, 0 }
17210 : };
17211 :
17212 : /* Comparator for bsearch on ix86_vec_bcast_map. */
17213 : static int
17214 314992 : ix86_vec_bcast_map_simode_cmp (const void *key, const void *entry)
17215 : {
17216 314992 : return (*(const unsigned int*)key)
17217 314992 : - ((const ix86_vec_bcast_map_simode_t*)entry)->key;
17218 : }
17219 :
17220 : /* A subroutine of ix86_vector_duplicate_value. Tries to efficiently
17221 : materialize V4SImode, V8SImode and V16SImode vectors from SImode
17222 : integer constants. */
17223 : static bool
17224 47770 : ix86_vector_duplicate_simode_const (machine_mode mode, rtx target,
17225 : unsigned int val)
17226 : {
17227 47770 : const ix86_vec_bcast_map_simode_t *entry;
17228 47770 : rtx tmp1, tmp2;
17229 :
17230 47770 : entry = (const ix86_vec_bcast_map_simode_t*)
17231 47770 : bsearch(&val, ix86_vec_bcast_map_simode,
17232 : ARRAY_SIZE (ix86_vec_bcast_map_simode),
17233 : sizeof (ix86_vec_bcast_map_simode_t),
17234 : ix86_vec_bcast_map_simode_cmp);
17235 47770 : if (!entry)
17236 : return false;
17237 :
17238 16085 : switch (entry->alg)
17239 : {
17240 0 : case VEC_BCAST_PXOR:
17241 0 : if ((mode == V8SImode && !TARGET_AVX2)
17242 0 : || (mode == V16SImode && !TARGET_AVX512F))
17243 : return false;
17244 0 : emit_move_insn (target, CONST0_RTX (mode));
17245 0 : return true;
17246 :
17247 155 : case VEC_BCAST_PCMPEQ:
17248 155 : if ((mode == V4SImode && !TARGET_SSE2)
17249 154 : || (mode == V8SImode && !TARGET_AVX2)
17250 127 : || (mode == V16SImode && !TARGET_AVX512F))
17251 : return false;
17252 127 : emit_move_insn (target, CONSTM1_RTX (mode));
17253 127 : return true;
17254 :
17255 585 : case VEC_BCAST_PABSB:
17256 585 : if (mode == V4SImode && TARGET_SSE2)
17257 : {
17258 460 : tmp1 = gen_reg_rtx (V16QImode);
17259 460 : emit_move_insn (tmp1, CONSTM1_RTX (V16QImode));
17260 460 : tmp2 = gen_reg_rtx (V16QImode);
17261 460 : emit_insn (gen_absv16qi2 (tmp2, tmp1));
17262 : }
17263 125 : else if (mode == V8SImode && TARGET_AVX2)
17264 : {
17265 68 : tmp1 = gen_reg_rtx (V32QImode);
17266 68 : emit_move_insn (tmp1, CONSTM1_RTX (V32QImode));
17267 68 : tmp2 = gen_reg_rtx (V32QImode);
17268 68 : emit_insn (gen_absv32qi2 (tmp2, tmp1));
17269 : }
17270 57 : else if (mode == V16SImode && TARGET_AVX512BW)
17271 : {
17272 49 : tmp1 = gen_reg_rtx (V64QImode);
17273 49 : emit_move_insn (tmp1, CONSTM1_RTX (V64QImode));
17274 49 : tmp2 = gen_reg_rtx (V64QImode);
17275 49 : emit_insn (gen_absv64qi2 (tmp2, tmp1));
17276 : }
17277 : else
17278 : return false;
17279 : break;
17280 :
17281 101 : case VEC_BCAST_PADDB:
17282 101 : if (mode == V4SImode && TARGET_SSE2)
17283 : {
17284 97 : tmp1 = gen_reg_rtx (V16QImode);
17285 97 : emit_move_insn (tmp1, CONSTM1_RTX (V16QImode));
17286 97 : tmp2 = gen_reg_rtx (V16QImode);
17287 97 : emit_insn (gen_addv16qi3 (tmp2, tmp1, tmp1));
17288 : }
17289 4 : else if (mode == V8SImode && TARGET_AVX2)
17290 : {
17291 1 : tmp1 = gen_reg_rtx (V32QImode);
17292 1 : emit_move_insn (tmp1, CONSTM1_RTX (V32QImode));
17293 1 : tmp2 = gen_reg_rtx (V32QImode);
17294 1 : emit_insn (gen_addv32qi3 (tmp2, tmp1, tmp1));
17295 : }
17296 3 : else if (mode == V16SImode && TARGET_AVX512BW)
17297 : {
17298 3 : tmp1 = gen_reg_rtx (V64QImode);
17299 3 : emit_move_insn (tmp1, CONSTM1_RTX (V64QImode));
17300 3 : tmp2 = gen_reg_rtx (V64QImode);
17301 3 : emit_insn (gen_addv64qi3 (tmp2, tmp1, tmp1));
17302 : }
17303 : else
17304 : return false;
17305 : break;
17306 :
17307 3659 : case VEC_BCAST_PSRLW:
17308 3659 : if (mode == V4SImode && TARGET_SSE2)
17309 : {
17310 3435 : tmp1 = gen_reg_rtx (V8HImode);
17311 3435 : emit_move_insn (tmp1, CONSTM1_RTX (V8HImode));
17312 3435 : tmp2 = gen_reg_rtx (V8HImode);
17313 3435 : emit_insn (gen_lshrv8hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
17314 : }
17315 224 : else if (mode == V8SImode && TARGET_AVX2)
17316 : {
17317 131 : tmp1 = gen_reg_rtx (V16HImode);
17318 131 : emit_move_insn (tmp1, CONSTM1_RTX (V16HImode));
17319 131 : tmp2 = gen_reg_rtx (V16HImode);
17320 131 : emit_insn (gen_lshrv16hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
17321 : }
17322 93 : else if (mode == V16SImode && TARGET_AVX512BW)
17323 : {
17324 90 : tmp1 = gen_reg_rtx (V32HImode);
17325 90 : emit_move_insn (tmp1, CONSTM1_RTX (V32HImode));
17326 90 : tmp2 = gen_reg_rtx (V32HImode);
17327 90 : emit_insn (gen_lshrv32hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
17328 : }
17329 : else
17330 : return false;
17331 : break;
17332 :
17333 9824 : case VEC_BCAST_PSRLD:
17334 9824 : if (mode == V4SImode && TARGET_SSE2)
17335 : {
17336 6981 : tmp1 = gen_reg_rtx (V4SImode);
17337 6981 : emit_move_insn (tmp1, CONSTM1_RTX (V4SImode));
17338 6981 : emit_insn (gen_lshrv4si3 (target, tmp1, GEN_INT (entry->arg)));
17339 6981 : return true;
17340 : }
17341 2843 : else if (mode == V8SImode && TARGET_AVX2)
17342 : {
17343 1056 : tmp1 = gen_reg_rtx (V8SImode);
17344 1056 : emit_move_insn (tmp1, CONSTM1_RTX (V8SImode));
17345 1056 : emit_insn (gen_lshrv8si3 (target, tmp1, GEN_INT (entry->arg)));
17346 1056 : return true;
17347 : }
17348 1787 : else if (mode == V16SImode && TARGET_AVX512F)
17349 : {
17350 948 : tmp1 = gen_reg_rtx (V16SImode);
17351 948 : emit_move_insn (tmp1, CONSTM1_RTX (V16SImode));
17352 948 : emit_insn (gen_lshrv16si3 (target, tmp1, GEN_INT (entry->arg)));
17353 948 : return true;
17354 : }
17355 : else
17356 : return false;
17357 132 : break;
17358 :
17359 132 : case VEC_BCAST_PSLLW:
17360 132 : if (mode == V4SImode && TARGET_SSE2)
17361 : {
17362 102 : tmp1 = gen_reg_rtx (V8HImode);
17363 102 : emit_move_insn (tmp1, CONSTM1_RTX (V8HImode));
17364 102 : tmp2 = gen_reg_rtx (V8HImode);
17365 102 : emit_insn (gen_ashlv8hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
17366 : }
17367 30 : else if (mode == V8SImode && TARGET_AVX2)
17368 : {
17369 21 : tmp1 = gen_reg_rtx (V16HImode);
17370 21 : emit_move_insn (tmp1, CONSTM1_RTX (V16HImode));
17371 21 : tmp2 = gen_reg_rtx (V16HImode);
17372 21 : emit_insn (gen_ashlv16hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
17373 : }
17374 9 : else if (mode == V16SImode && TARGET_AVX512BW)
17375 : {
17376 9 : tmp1 = gen_reg_rtx (V32HImode);
17377 9 : emit_move_insn (tmp1, CONSTM1_RTX (V32HImode));
17378 9 : tmp2 = gen_reg_rtx (V32HImode);
17379 9 : emit_insn (gen_ashlv32hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
17380 : }
17381 : else
17382 : return false;
17383 : break;
17384 :
17385 1629 : case VEC_BCAST_PSLLD:
17386 1629 : if (mode == V4SImode && TARGET_SSE2)
17387 : {
17388 1594 : tmp1 = gen_reg_rtx (V4SImode);
17389 1594 : emit_move_insn (tmp1, CONSTM1_RTX (V4SImode));
17390 1594 : emit_insn (gen_ashlv4si3 (target, tmp1, GEN_INT (entry->arg)));
17391 1594 : return true;
17392 : }
17393 35 : else if (mode == V8SImode && TARGET_AVX2)
17394 : {
17395 17 : tmp1 = gen_reg_rtx (V8SImode);
17396 17 : emit_move_insn (tmp1, CONSTM1_RTX (V8SImode));
17397 17 : emit_insn (gen_ashlv8si3 (target, tmp1, GEN_INT (entry->arg)));
17398 17 : return true;
17399 : }
17400 18 : else if (mode == V16SImode && TARGET_AVX512F)
17401 : {
17402 18 : tmp1 = gen_reg_rtx (V16SImode);
17403 18 : emit_move_insn (tmp1, CONSTM1_RTX (V16SImode));
17404 18 : emit_insn (gen_ashlv16si3 (target, tmp1, GEN_INT (entry->arg)));
17405 18 : return true;
17406 : }
17407 : else
17408 : return false;
17409 :
17410 : default:
17411 : return false;
17412 : }
17413 :
17414 4466 : emit_move_insn (target, gen_lowpart (mode, tmp2));
17415 4466 : return true;
17416 : }
17417 :
17418 : /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
17419 : fill target with val via vec_duplicate. */
17420 :
17421 : static bool
17422 147409 : ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
17423 : {
17424 147409 : bool ok;
17425 147409 : rtx_insn *insn;
17426 147409 : rtx dup;
17427 :
17428 147409 : if ((mode == V4SImode || mode == V8SImode || mode == V16SImode)
17429 55622 : && CONST_INT_P (val)
17430 47770 : && ix86_vector_duplicate_simode_const (mode, target, INTVAL (val)))
17431 : return true;
17432 :
17433 : /* Save/restore recog_data in case this is called from splitters
17434 : or other routines where recog_data needs to stay valid across
17435 : force_reg. See PR106577. */
17436 132202 : recog_data_d recog_data_save = recog_data;
17437 :
17438 : /* First attempt to recognize VAL as-is. */
17439 132202 : dup = gen_vec_duplicate (mode, val);
17440 132202 : insn = emit_insn (gen_rtx_SET (target, dup));
17441 132202 : if (recog_memoized (insn) < 0)
17442 : {
17443 94793 : rtx_insn *seq;
17444 94793 : machine_mode innermode = GET_MODE_INNER (mode);
17445 94793 : rtx reg;
17446 :
17447 : /* If that fails, force VAL into a register or mem. */
17448 :
17449 94793 : start_sequence ();
17450 :
17451 0 : if (!TARGET_PREFER_BCST_FROM_INTEGER && CONST_INT_P (val)
17452 0 : && GET_MODE_BITSIZE (innermode) <= HOST_BITS_PER_WIDE_INT
17453 94793 : && GET_MODE_BITSIZE(mode) >= 128)
17454 0 : reg = validize_mem (force_const_mem (innermode, val));
17455 : else
17456 : {
17457 94793 : reg = force_reg (innermode, val);
17458 94793 : if (GET_MODE (reg) != innermode)
17459 0 : reg = gen_lowpart (innermode, reg);
17460 : }
17461 :
17462 94793 : SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
17463 94793 : seq = end_sequence ();
17464 94793 : if (seq)
17465 94793 : emit_insn_before (seq, insn);
17466 :
17467 94793 : ok = recog_memoized (insn) >= 0;
17468 94793 : gcc_assert (ok);
17469 : }
17470 132202 : recog_data = recog_data_save;
17471 132202 : return true;
17472 : }
17473 :
17474 : /* Get a vector mode of the same size as the original but with elements
17475 : twice as wide. This is only guaranteed to apply to integral vectors. */
17476 :
17477 : static machine_mode
17478 19205 : get_mode_wider_vector (machine_mode o)
17479 : {
17480 : /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
17481 19205 : machine_mode n = GET_MODE_NEXT_MODE (o).require ();
17482 57615 : gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
17483 57615 : gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
17484 19205 : return n;
17485 : }
17486 :
17487 : static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
17488 : static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
17489 :
17490 : /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
17491 : with all elements equal to VAR. Return true if successful. */
17492 :
17493 : bool
17494 167855 : ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
17495 : rtx target, rtx val)
17496 : {
17497 167855 : bool ok;
17498 :
17499 167855 : switch (mode)
17500 : {
17501 70173 : case E_V2DImode:
17502 70173 : if (CONST_INT_P (val))
17503 : {
17504 61390 : int tmp = (int)INTVAL (val);
17505 61390 : if (tmp == (int)(INTVAL (val) >> 32))
17506 : {
17507 109 : rtx reg = gen_reg_rtx (V4SImode);
17508 109 : ok = ix86_vector_duplicate_value (V4SImode, reg,
17509 : GEN_INT (tmp));
17510 109 : if (ok)
17511 : {
17512 109 : emit_move_insn (target, gen_lowpart (V2DImode, reg));
17513 109 : return true;
17514 : }
17515 : }
17516 : }
17517 70064 : return ix86_vector_duplicate_value (mode, target, val);
17518 :
17519 995 : case E_V4DImode:
17520 995 : if (CONST_INT_P (val))
17521 : {
17522 718 : int tmp = (int)INTVAL (val);
17523 718 : if (tmp == (int)(INTVAL (val) >> 32))
17524 : {
17525 54 : rtx reg = gen_reg_rtx (V8SImode);
17526 54 : ok = ix86_vector_duplicate_value (V8SImode, reg,
17527 : GEN_INT (tmp));
17528 54 : if (ok)
17529 : {
17530 54 : emit_move_insn (target, gen_lowpart (V4DImode, reg));
17531 54 : return true;
17532 : }
17533 : }
17534 : }
17535 941 : return ix86_vector_duplicate_value (mode, target, val);
17536 :
17537 463 : case E_V8DImode:
17538 463 : if (CONST_INT_P (val))
17539 : {
17540 264 : int tmp = (int)INTVAL (val);
17541 264 : if (tmp == (int)(INTVAL (val) >> 32))
17542 : {
17543 24 : rtx reg = gen_reg_rtx (V16SImode);
17544 24 : ok = ix86_vector_duplicate_value (V16SImode, reg,
17545 : GEN_INT (tmp));
17546 24 : if (ok)
17547 : {
17548 24 : emit_move_insn (target, gen_lowpart (V8DImode, reg));
17549 24 : return true;
17550 : }
17551 : }
17552 : }
17553 439 : return ix86_vector_duplicate_value (mode, target, val);
17554 :
17555 2604 : case E_V2SImode:
17556 2604 : case E_V2SFmode:
17557 2604 : if (!mmx_ok)
17558 : return false;
17559 : /* FALLTHRU */
17560 :
17561 74792 : case E_V4DFmode:
17562 74792 : case E_V8SFmode:
17563 74792 : case E_V8SImode:
17564 74792 : case E_V2DFmode:
17565 74792 : case E_V4SFmode:
17566 74792 : case E_V4SImode:
17567 74792 : case E_V16SImode:
17568 74792 : case E_V16SFmode:
17569 74792 : case E_V8DFmode:
17570 74792 : return ix86_vector_duplicate_value (mode, target, val);
17571 :
17572 387 : case E_V4HImode:
17573 387 : if (!mmx_ok)
17574 : return false;
17575 384 : if (TARGET_SSE || TARGET_3DNOW_A)
17576 : {
17577 384 : rtx x;
17578 :
17579 384 : val = gen_lowpart (SImode, val);
17580 384 : if (CONST_INT_P (val))
17581 : return false;
17582 382 : x = gen_rtx_TRUNCATE (HImode, val);
17583 382 : x = gen_rtx_VEC_DUPLICATE (mode, x);
17584 382 : emit_insn (gen_rtx_SET (target, x));
17585 382 : return true;
17586 : }
17587 0 : goto widen;
17588 :
17589 5 : case E_V4HFmode:
17590 5 : case E_V4BFmode:
17591 5 : if (TARGET_MMX_WITH_SSE)
17592 : {
17593 10 : val = force_reg (GET_MODE_INNER (mode), val);
17594 5 : rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
17595 5 : emit_insn (gen_rtx_SET (target, x));
17596 5 : return true;
17597 : }
17598 : return false;
17599 :
17600 126 : case E_V2HImode:
17601 126 : if (TARGET_SSE2)
17602 : {
17603 126 : rtx x;
17604 :
17605 126 : val = gen_lowpart (SImode, val);
17606 126 : if (CONST_INT_P (val))
17607 : return false;
17608 126 : x = gen_rtx_TRUNCATE (HImode, val);
17609 126 : x = gen_rtx_VEC_DUPLICATE (mode, x);
17610 126 : emit_insn (gen_rtx_SET (target, x));
17611 126 : return true;
17612 : }
17613 : return false;
17614 :
17615 3 : case E_V2HFmode:
17616 3 : case E_V2BFmode:
17617 3 : if (TARGET_SSE2)
17618 : {
17619 6 : val = force_reg (GET_MODE_INNER (mode), val);
17620 3 : rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
17621 3 : emit_insn (gen_rtx_SET (target, x));
17622 3 : return true;
17623 : }
17624 : return false;
17625 :
17626 303 : case E_V8QImode:
17627 303 : case E_V4QImode:
17628 303 : if (!mmx_ok)
17629 : return false;
17630 299 : goto widen;
17631 :
17632 10285 : case E_V8HImode:
17633 10285 : if (CONST_INT_P (val))
17634 9763 : goto widen;
17635 : /* FALLTHRU */
17636 :
17637 836 : case E_V8HFmode:
17638 836 : case E_V8BFmode:
17639 836 : if (TARGET_AVX2)
17640 392 : return ix86_vector_duplicate_value (mode, target, val);
17641 :
17642 444 : if (TARGET_SSE2)
17643 : {
17644 1135 : struct expand_vec_perm_d dperm;
17645 1135 : rtx tmp1, tmp2;
17646 :
17647 444 : permute:
17648 1135 : memset (&dperm, 0, sizeof (dperm));
17649 1135 : dperm.target = target;
17650 1135 : dperm.vmode = mode;
17651 1135 : dperm.nelt = GET_MODE_NUNITS (mode);
17652 1135 : dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
17653 1135 : dperm.one_operand_p = true;
17654 :
17655 1135 : if (mode == V8HFmode || mode == V8BFmode)
17656 : {
17657 3 : tmp1 = force_reg (GET_MODE_INNER (mode), val);
17658 3 : tmp2 = gen_reg_rtx (mode);
17659 3 : emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
17660 3 : tmp1 = gen_lowpart (mode, tmp2);
17661 : }
17662 : else
17663 : {
17664 : /* Extend to SImode using a paradoxical SUBREG. */
17665 1132 : tmp1 = gen_reg_rtx (SImode);
17666 1132 : emit_move_insn (tmp1, gen_lowpart (SImode, val));
17667 :
17668 : /* Insert the SImode value as
17669 : low element of a V4SImode vector. */
17670 1132 : tmp2 = gen_reg_rtx (V4SImode);
17671 1132 : emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
17672 1132 : tmp1 = gen_lowpart (mode, tmp2);
17673 : }
17674 :
17675 1135 : emit_move_insn (dperm.op0, tmp1);
17676 1135 : ok = (expand_vec_perm_1 (&dperm)
17677 1135 : || expand_vec_perm_broadcast_1 (&dperm));
17678 0 : gcc_assert (ok);
17679 1135 : return ok;
17680 : }
17681 0 : goto widen;
17682 :
17683 6011 : case E_V16QImode:
17684 6011 : if (CONST_INT_P (val))
17685 5260 : goto widen;
17686 751 : if (TARGET_AVX2)
17687 60 : return ix86_vector_duplicate_value (mode, target, val);
17688 :
17689 691 : if (TARGET_SSE2)
17690 691 : goto permute;
17691 0 : goto widen;
17692 :
17693 17653 : widen:
17694 : /* Replicate the value once into the next wider mode and recurse. */
17695 17653 : {
17696 17653 : machine_mode smode, wsmode, wvmode;
17697 17653 : rtx x;
17698 :
17699 17653 : smode = GET_MODE_INNER (mode);
17700 17653 : wvmode = get_mode_wider_vector (mode);
17701 17653 : wsmode = GET_MODE_INNER (wvmode);
17702 :
17703 17653 : val = convert_modes (wsmode, smode, val, true);
17704 :
17705 17653 : if (CONST_INT_P (val))
17706 : {
17707 34710 : x = simplify_binary_operation (ASHIFT, wsmode, val,
17708 17355 : GEN_INT (GET_MODE_BITSIZE (smode)));
17709 17355 : val = simplify_binary_operation (IOR, wsmode, val, x);
17710 : }
17711 298 : else if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
17712 298 : emit_insn (gen_insv_1 (wsmode, val, val));
17713 : else
17714 : {
17715 0 : x = expand_simple_binop (wsmode, ASHIFT, val,
17716 0 : GEN_INT (GET_MODE_BITSIZE (smode)),
17717 : NULL_RTX, 1, OPTAB_LIB_WIDEN);
17718 0 : val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
17719 : OPTAB_LIB_WIDEN);
17720 : }
17721 :
17722 17653 : x = gen_reg_rtx (wvmode);
17723 17653 : ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
17724 17653 : if (!ok)
17725 : return false;
17726 17652 : emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
17727 17652 : return true;
17728 : }
17729 :
17730 1474 : case E_V16HImode:
17731 1474 : case E_V32QImode:
17732 1474 : if (CONST_INT_P (val))
17733 1182 : goto widen;
17734 : /* FALLTHRU */
17735 :
17736 375 : case E_V16HFmode:
17737 375 : case E_V16BFmode:
17738 375 : if (TARGET_AVX2)
17739 347 : return ix86_vector_duplicate_value (mode, target, val);
17740 : else
17741 : {
17742 28 : machine_mode hvmode;
17743 28 : switch (mode)
17744 : {
17745 : case V16HImode:
17746 : hvmode = V8HImode;
17747 : break;
17748 0 : case V16HFmode:
17749 0 : hvmode = V8HFmode;
17750 0 : break;
17751 1 : case V16BFmode:
17752 1 : hvmode = V8BFmode;
17753 1 : break;
17754 14 : case V32QImode:
17755 14 : hvmode = V16QImode;
17756 14 : break;
17757 0 : default:
17758 0 : gcc_unreachable ();
17759 : }
17760 28 : rtx x = gen_reg_rtx (hvmode);
17761 :
17762 28 : ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
17763 28 : if (!ok)
17764 : return false;
17765 :
17766 28 : x = gen_rtx_VEC_CONCAT (mode, x, x);
17767 28 : emit_insn (gen_rtx_SET (target, x));
17768 : }
17769 28 : return true;
17770 :
17771 1277 : case E_V32HImode:
17772 1277 : case E_V64QImode:
17773 1277 : if (CONST_INT_P (val))
17774 1149 : goto widen;
17775 : /* FALLTHRU */
17776 :
17777 207 : case E_V32HFmode:
17778 207 : case E_V32BFmode:
17779 207 : if (TARGET_AVX512BW)
17780 187 : return ix86_vector_duplicate_value (mode, target, val);
17781 : else
17782 : {
17783 20 : machine_mode hvmode;
17784 20 : switch (mode)
17785 : {
17786 : case V32HImode:
17787 : hvmode = V16HImode;
17788 : break;
17789 0 : case V32HFmode:
17790 0 : hvmode = V16HFmode;
17791 0 : break;
17792 1 : case V32BFmode:
17793 1 : hvmode = V16BFmode;
17794 1 : break;
17795 10 : case V64QImode:
17796 10 : hvmode = V32QImode;
17797 10 : break;
17798 0 : default:
17799 0 : gcc_unreachable ();
17800 : }
17801 20 : rtx x = gen_reg_rtx (hvmode);
17802 :
17803 20 : ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
17804 20 : if (!ok)
17805 : return false;
17806 :
17807 20 : x = gen_rtx_VEC_CONCAT (mode, x, x);
17808 20 : emit_insn (gen_rtx_SET (target, x));
17809 : }
17810 20 : return true;
17811 :
17812 : default:
17813 : return false;
17814 : }
17815 : }
17816 :
17817 : /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
17818 : whose ONE_VAR element is VAR, and other elements are zero. Return true
17819 : if successful. */
17820 :
17821 : bool
17822 10335 : ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
17823 : rtx target, rtx var, int one_var)
17824 : {
17825 10335 : machine_mode vsimode;
17826 10335 : rtx new_target;
17827 10335 : rtx x, tmp;
17828 10335 : bool use_vector_set = false;
17829 10335 : rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;
17830 :
17831 10335 : switch (mode)
17832 : {
17833 7925 : case E_V2DImode:
17834 : /* For SSE4.1, we normally use vector set. But if the second
17835 : element is zero and inter-unit moves are OK, we use movq
17836 : instead. */
17837 7916 : use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
17838 8048 : && !(TARGET_INTER_UNIT_MOVES_TO_VEC
17839 : && one_var == 0));
17840 : break;
17841 858 : case E_V16QImode:
17842 858 : case E_V4SImode:
17843 858 : case E_V4SFmode:
17844 858 : use_vector_set = TARGET_SSE4_1;
17845 858 : break;
17846 85 : case E_V8HImode:
17847 85 : use_vector_set = TARGET_SSE2;
17848 85 : gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
17849 85 : ? gen_vec_setv8hi_0 : NULL;
17850 : break;
17851 4 : case E_V8QImode:
17852 4 : use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
17853 : break;
17854 14 : case E_V4HImode:
17855 14 : case E_V4HFmode:
17856 14 : case E_V4BFmode:
17857 14 : use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
17858 : break;
17859 32 : case E_V4QImode:
17860 32 : use_vector_set = TARGET_SSE4_1;
17861 32 : break;
17862 0 : case E_V32QImode:
17863 0 : use_vector_set = TARGET_AVX;
17864 0 : break;
17865 5 : case E_V16HImode:
17866 5 : use_vector_set = TARGET_AVX;
17867 5 : gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
17868 5 : ? gen_vec_setv16hi_0 : NULL;
17869 : break;
17870 5 : case E_V8SImode:
17871 5 : use_vector_set = TARGET_AVX;
17872 5 : gen_vec_set_0 = gen_vec_setv8si_0;
17873 5 : break;
17874 22 : case E_V8SFmode:
17875 22 : use_vector_set = TARGET_AVX;
17876 22 : gen_vec_set_0 = gen_vec_setv8sf_0;
17877 22 : break;
17878 13 : case E_V4DFmode:
17879 13 : use_vector_set = TARGET_AVX;
17880 13 : gen_vec_set_0 = gen_vec_setv4df_0;
17881 13 : break;
17882 7 : case E_V4DImode:
17883 : /* Use ix86_expand_vector_set in 64bit mode only. */
17884 7 : use_vector_set = TARGET_AVX && TARGET_64BIT;
17885 : gen_vec_set_0 = gen_vec_setv4di_0;
17886 : break;
17887 17 : case E_V16SImode:
17888 17 : use_vector_set = TARGET_AVX512F && one_var == 0;
17889 : gen_vec_set_0 = gen_vec_setv16si_0;
17890 : break;
17891 22 : case E_V16SFmode:
17892 22 : use_vector_set = TARGET_AVX512F && one_var == 0;
17893 : gen_vec_set_0 = gen_vec_setv16sf_0;
17894 : break;
17895 0 : case E_V8DFmode:
17896 0 : use_vector_set = TARGET_AVX512F && one_var == 0;
17897 : gen_vec_set_0 = gen_vec_setv8df_0;
17898 : break;
17899 2 : case E_V8DImode:
17900 : /* Use ix86_expand_vector_set in 64bit mode only. */
17901 2 : use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
17902 : gen_vec_set_0 = gen_vec_setv8di_0;
17903 : break;
17904 39 : case E_V8HFmode:
17905 39 : use_vector_set = TARGET_AVX512FP16 && one_var == 0;
17906 : gen_vec_set_0 = gen_vec_setv8hf_0;
17907 : break;
17908 9 : case E_V16HFmode:
17909 9 : use_vector_set = TARGET_AVX512FP16 && one_var == 0;
17910 : gen_vec_set_0 = gen_vec_setv16hf_0;
17911 : break;
17912 6 : case E_V32HFmode:
17913 6 : use_vector_set = TARGET_AVX512FP16 && one_var == 0;
17914 : gen_vec_set_0 = gen_vec_setv32hf_0;
17915 : break;
17916 2 : case E_V8BFmode:
17917 2 : use_vector_set = TARGET_AVX512FP16 && one_var == 0;
17918 : gen_vec_set_0 = gen_vec_setv8bf_0;
17919 : break;
17920 0 : case E_V16BFmode:
17921 0 : use_vector_set = TARGET_AVX512FP16 && one_var == 0;
17922 : gen_vec_set_0 = gen_vec_setv16bf_0;
17923 : break;
17924 0 : case E_V32BFmode:
17925 0 : use_vector_set = TARGET_AVX512FP16 && one_var == 0;
17926 : gen_vec_set_0 = gen_vec_setv32bf_0;
17927 : break;
17928 4 : case E_V32HImode:
17929 4 : use_vector_set = TARGET_AVX512FP16 && one_var == 0;
17930 : gen_vec_set_0 = gen_vec_setv32hi_0;
17931 : default:
17932 : break;
17933 : }
17934 :
17935 8959 : if (use_vector_set)
17936 : {
17937 857 : if (gen_vec_set_0 && one_var == 0)
17938 : {
17939 354 : var = force_reg (GET_MODE_INNER (mode), var);
17940 177 : emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
17941 177 : return true;
17942 : }
17943 680 : emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
17944 1360 : var = force_reg (GET_MODE_INNER (mode), var);
17945 680 : ix86_expand_vector_set (mmx_ok, target, var, one_var);
17946 680 : return true;
17947 : }
17948 :
17949 9478 : switch (mode)
17950 : {
17951 1166 : case E_V2SFmode:
17952 1166 : case E_V2SImode:
17953 1166 : if (!mmx_ok)
17954 : return false;
17955 : /* FALLTHRU */
17956 :
17957 8173 : case E_V2DFmode:
17958 8173 : case E_V2DImode:
17959 8173 : if (one_var != 0)
17960 : return false;
17961 4986 : var = force_reg (GET_MODE_INNER (mode), var);
17962 4986 : x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
17963 2493 : emit_insn (gen_rtx_SET (target, x));
17964 2493 : return true;
17965 :
17966 294 : case E_V4SFmode:
17967 294 : case E_V4SImode:
17968 294 : if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
17969 0 : new_target = gen_reg_rtx (mode);
17970 : else
17971 : new_target = target;
17972 588 : var = force_reg (GET_MODE_INNER (mode), var);
17973 294 : x = gen_rtx_VEC_DUPLICATE (mode, var);
17974 294 : x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
17975 294 : emit_insn (gen_rtx_SET (new_target, x));
17976 294 : if (one_var != 0)
17977 : {
17978 : /* We need to shuffle the value to the correct position, so
17979 : create a new pseudo to store the intermediate result. */
17980 :
17981 : /* With SSE2, we can use the integer shuffle insns. */
17982 41 : if (mode != V4SFmode && TARGET_SSE2)
17983 : {
17984 28 : emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
17985 : const1_rtx,
17986 28 : GEN_INT (one_var == 1 ? 0 : 1),
17987 28 : GEN_INT (one_var == 2 ? 0 : 1),
17988 28 : GEN_INT (one_var == 3 ? 0 : 1)));
17989 28 : if (target != new_target)
17990 0 : emit_move_insn (target, new_target);
17991 28 : return true;
17992 : }
17993 :
17994 : /* Otherwise convert the intermediate result to V4SFmode and
17995 : use the SSE1 shuffle instructions. */
17996 0 : if (mode != V4SFmode)
17997 : {
17998 0 : tmp = gen_reg_rtx (V4SFmode);
17999 0 : emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
18000 : }
18001 : else
18002 : tmp = new_target;
18003 :
18004 43 : emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
18005 : const1_rtx,
18006 13 : GEN_INT (one_var == 1 ? 0 : 1),
18007 : GEN_INT (one_var == 2 ? 0+4 : 1+4),
18008 : GEN_INT (one_var == 3 ? 0+4 : 1+4)));
18009 :
18010 13 : if (mode != V4SFmode)
18011 0 : emit_move_insn (target, gen_lowpart (V4SImode, tmp));
18012 13 : else if (tmp != target)
18013 0 : emit_move_insn (target, tmp);
18014 : }
18015 253 : else if (target != new_target)
18016 0 : emit_move_insn (target, new_target);
18017 : return true;
18018 :
18019 13 : case E_V8HImode:
18020 13 : case E_V16QImode:
18021 13 : vsimode = V4SImode;
18022 13 : goto widen;
18023 3 : case E_V4HImode:
18024 3 : case E_V8QImode:
18025 3 : if (!mmx_ok)
18026 : return false;
18027 3 : vsimode = V2SImode;
18028 3 : goto widen;
18029 16 : widen:
18030 16 : if (one_var != 0)
18031 : return false;
18032 :
18033 : /* Zero extend the variable element to SImode and recurse. */
18034 16 : var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);
18035 :
18036 8 : x = gen_reg_rtx (vsimode);
18037 8 : if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
18038 : var, one_var))
18039 0 : gcc_unreachable ();
18040 :
18041 8 : emit_move_insn (target, gen_lowpart (mode, x));
18042 8 : return true;
18043 :
18044 : default:
18045 : return false;
18046 : }
18047 : }
18048 :
18049 : /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
18050 : consisting of the values in VALS. It is known that all elements
18051 : except ONE_VAR are constants. Return true if successful. */
18052 :
18053 : static bool
18054 7838 : ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
18055 : rtx target, rtx vals, int one_var)
18056 : {
18057 7838 : rtx var = XVECEXP (vals, 0, one_var);
18058 7838 : machine_mode wmode;
18059 7838 : rtx const_vec, x;
18060 :
18061 7838 : const_vec = copy_rtx (vals);
18062 7838 : XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
18063 7838 : const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));
18064 :
18065 7838 : switch (mode)
18066 : {
18067 : case E_V2DFmode:
18068 : case E_V2DImode:
18069 : case E_V2SFmode:
18070 : case E_V2SImode:
18071 : /* For the two element vectors, it's just as easy to use
18072 : the general case. */
18073 : return false;
18074 :
18075 3 : case E_V4DImode:
18076 : /* Use ix86_expand_vector_set in 64bit mode only. */
18077 3 : if (!TARGET_64BIT)
18078 : return false;
18079 : /* FALLTHRU */
18080 : case E_V8HFmode:
18081 : case E_V16HFmode:
18082 : case E_V8BFmode:
18083 : case E_V16BFmode:
18084 : case E_V4DFmode:
18085 : case E_V8SFmode:
18086 : case E_V8SImode:
18087 : case E_V16HImode:
18088 : case E_V32QImode:
18089 : case E_V4SFmode:
18090 : case E_V4SImode:
18091 : case E_V8HImode:
18092 : case E_V4HImode:
18093 : case E_V4HFmode:
18094 : case E_V4BFmode:
18095 : break;
18096 :
18097 8 : case E_V16QImode:
18098 8 : if (TARGET_SSE4_1)
18099 : break;
18100 8 : wmode = V8HImode;
18101 8 : goto widen;
18102 1 : case E_V8QImode:
18103 1 : if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
18104 : break;
18105 1 : wmode = V4HImode;
18106 1 : goto widen;
18107 38 : case E_V4QImode:
18108 38 : if (TARGET_SSE4_1)
18109 : break;
18110 : wmode = V2HImode;
18111 47 : widen:
18112 : /* There's no way to set one QImode entry easily. Combine
18113 : the variable value with its adjacent constant value, and
18114 : promote to an HImode set. */
18115 47 : x = XVECEXP (vals, 0, one_var ^ 1);
18116 47 : if (one_var & 1)
18117 : {
18118 13 : var = convert_modes (HImode, QImode, var, true);
18119 13 : var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
18120 : NULL_RTX, 1, OPTAB_LIB_WIDEN);
18121 13 : x = GEN_INT (INTVAL (x) & 0xff);
18122 : }
18123 : else
18124 : {
18125 34 : var = convert_modes (HImode, QImode, var, true);
18126 34 : x = gen_int_mode (UINTVAL (x) << 8, HImode);
18127 : }
18128 47 : if (x != const0_rtx)
18129 7 : var = expand_simple_binop (HImode, IOR, var, x, var,
18130 : 1, OPTAB_LIB_WIDEN);
18131 :
18132 47 : x = gen_reg_rtx (wmode);
18133 47 : emit_move_insn (x, gen_lowpart (wmode, const_vec));
18134 47 : ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);
18135 :
18136 47 : emit_move_insn (target, gen_lowpart (mode, x));
18137 47 : return true;
18138 :
18139 : default:
18140 : return false;
18141 : }
18142 :
18143 193 : emit_move_insn (target, const_vec);
18144 193 : ix86_expand_vector_set (mmx_ok, target, var, one_var);
18145 193 : return true;
18146 : }
18147 :
18148 : /* A subroutine of ix86_expand_vector_init_general. Use vector
18149 : concatenate to handle the most general case: all values variable,
18150 : and none identical. */
18151 :
18152 : static void
18153 118383 : ix86_expand_vector_init_concat (machine_mode mode,
18154 : rtx target, rtx *ops, int n)
18155 : {
18156 118383 : machine_mode half_mode = VOIDmode;
18157 118383 : rtx half[2];
18158 118383 : rtvec v;
18159 118383 : int i, j;
18160 :
18161 118383 : switch (n)
18162 : {
18163 110234 : case 2:
18164 110234 : switch (mode)
18165 : {
18166 : case E_V32HFmode:
18167 : half_mode = V16HFmode;
18168 : break;
18169 0 : case E_V32BFmode:
18170 0 : half_mode = V16BFmode;
18171 0 : break;
18172 79 : case E_V16SImode:
18173 79 : half_mode = V8SImode;
18174 79 : break;
18175 33 : case E_V16SFmode:
18176 33 : half_mode = V8SFmode;
18177 33 : break;
18178 92 : case E_V8DImode:
18179 92 : half_mode = V4DImode;
18180 92 : break;
18181 59 : case E_V8DFmode:
18182 59 : half_mode = V4DFmode;
18183 59 : break;
18184 0 : case E_V16HFmode:
18185 0 : half_mode = V8HFmode;
18186 0 : break;
18187 0 : case E_V16BFmode:
18188 0 : half_mode = V8BFmode;
18189 0 : break;
18190 191 : case E_V8SImode:
18191 191 : half_mode = V4SImode;
18192 191 : break;
18193 259 : case E_V8SFmode:
18194 259 : half_mode = V4SFmode;
18195 259 : break;
18196 304 : case E_V4DImode:
18197 304 : half_mode = V2DImode;
18198 304 : break;
18199 503 : case E_V4DFmode:
18200 503 : half_mode = V2DFmode;
18201 503 : break;
18202 5808 : case E_V4SImode:
18203 5808 : half_mode = V2SImode;
18204 5808 : break;
18205 2087 : case E_V4SFmode:
18206 2087 : half_mode = V2SFmode;
18207 2087 : break;
18208 65097 : case E_V2DImode:
18209 65097 : half_mode = DImode;
18210 65097 : break;
18211 26919 : case E_V2SImode:
18212 26919 : half_mode = SImode;
18213 26919 : break;
18214 3431 : case E_V2DFmode:
18215 3431 : half_mode = DFmode;
18216 3431 : break;
18217 5372 : case E_V2SFmode:
18218 5372 : half_mode = SFmode;
18219 5372 : break;
18220 0 : default:
18221 0 : gcc_unreachable ();
18222 : }
18223 :
18224 110234 : if (!register_operand (ops[1], half_mode))
18225 47912 : ops[1] = force_reg (half_mode, ops[1]);
18226 110234 : if (!register_operand (ops[0], half_mode))
18227 36205 : ops[0] = force_reg (half_mode, ops[0]);
18228 110234 : emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
18229 : ops[1])));
18230 110234 : break;
18231 :
18232 7541 : case 4:
18233 7541 : switch (mode)
18234 : {
18235 : case E_V4DImode:
18236 : half_mode = V2DImode;
18237 : break;
18238 476 : case E_V4DFmode:
18239 476 : half_mode = V2DFmode;
18240 476 : break;
18241 4875 : case E_V4SImode:
18242 4875 : half_mode = V2SImode;
18243 4875 : break;
18244 2012 : case E_V4SFmode:
18245 2012 : half_mode = V2SFmode;
18246 2012 : break;
18247 0 : default:
18248 0 : gcc_unreachable ();
18249 : }
18250 7541 : goto half;
18251 :
18252 517 : case 8:
18253 517 : switch (mode)
18254 : {
18255 : case E_V8DImode:
18256 : half_mode = V4DImode;
18257 : break;
18258 59 : case E_V8DFmode:
18259 59 : half_mode = V4DFmode;
18260 59 : break;
18261 154 : case E_V8SImode:
18262 154 : half_mode = V4SImode;
18263 154 : break;
18264 253 : case E_V8SFmode:
18265 253 : half_mode = V4SFmode;
18266 253 : break;
18267 0 : default:
18268 0 : gcc_unreachable ();
18269 : }
18270 517 : goto half;
18271 :
18272 91 : case 16:
18273 91 : switch (mode)
18274 : {
18275 : case E_V16SImode:
18276 : half_mode = V8SImode;
18277 : break;
18278 33 : case E_V16SFmode:
18279 33 : half_mode = V8SFmode;
18280 33 : break;
18281 0 : default:
18282 0 : gcc_unreachable ();
18283 : }
18284 91 : goto half;
18285 :
18286 8149 : half:
18287 : /* FIXME: We process inputs backward to help RA. PR 36222. */
18288 8149 : i = n - 1;
18289 24447 : for (j = 1; j != -1; j--)
18290 : {
18291 16298 : half[j] = gen_reg_rtx (half_mode);
18292 16298 : switch (n >> 1)
18293 : {
18294 15082 : case 2:
18295 15082 : v = gen_rtvec (2, ops[i-1], ops[i]);
18296 15082 : i -= 2;
18297 15082 : break;
18298 1034 : case 4:
18299 1034 : v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
18300 1034 : i -= 4;
18301 1034 : break;
18302 182 : case 8:
18303 364 : v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
18304 182 : ops[i-3], ops[i-2], ops[i-1], ops[i]);
18305 182 : i -= 8;
18306 182 : break;
18307 0 : default:
18308 0 : gcc_unreachable ();
18309 : }
18310 16298 : ix86_expand_vector_init (false, half[j],
18311 : gen_rtx_PARALLEL (half_mode, v));
18312 : }
18313 :
18314 8149 : ix86_expand_vector_init_concat (mode, target, half, 2);
18315 8149 : break;
18316 :
18317 0 : default:
18318 0 : gcc_unreachable ();
18319 : }
18320 118383 : }
18321 :
18322 : /* A subroutine of ix86_expand_vector_init_general. Use vector
18323 : interleave to handle the most general case: all values variable,
18324 : and none identical. MODE must be V8HF, V8BF, V8HI or V16QI. OPS holds 2*N scalar elements, consumed in pairs and overwritten as scratch; the assembled vector is stored in TARGET. */
18325 :
18326 : static void
18327 3881 : ix86_expand_vector_init_interleave (machine_mode mode,
18328 : rtx target, rtx *ops, int n)
18329 : {
18330 3881 : machine_mode first_imode, second_imode, third_imode, inner_mode;
18331 3881 : int i, j;
18332 3881 : rtx op, op0, op1;
18333 3881 : rtx (*gen_load_even) (rtx, rtx, rtx);
18334 3881 : rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
18335 3881 : rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
18336 : /* Select the pair-forming generator, the interleave generators and the integer vector modes used at each stage. */
18337 3881 : switch (mode)
18338 : {
18339 : case E_V8HFmode:
18340 : gen_load_even = gen_vec_interleave_lowv8hf;
18341 : gen_interleave_first_low = gen_vec_interleave_lowv4si;
18342 : gen_interleave_second_low = gen_vec_interleave_lowv2di;
18343 : inner_mode = HFmode;
18344 : first_imode = V4SImode;
18345 : second_imode = V2DImode;
18346 : third_imode = VOIDmode;
18347 : break;
18348 487 : case E_V8BFmode:
18349 487 : gen_load_even = gen_vec_interleave_lowv8bf;
18350 487 : gen_interleave_first_low = gen_vec_interleave_lowv4si;
18351 487 : gen_interleave_second_low = gen_vec_interleave_lowv2di;
18352 487 : inner_mode = BFmode;
18353 487 : first_imode = V4SImode;
18354 487 : second_imode = V2DImode;
18355 487 : third_imode = VOIDmode;
18356 487 : break;
18357 793 : case E_V8HImode:
18358 793 : gen_load_even = gen_vec_setv8hi;
18359 793 : gen_interleave_first_low = gen_vec_interleave_lowv4si;
18360 793 : gen_interleave_second_low = gen_vec_interleave_lowv2di;
18361 793 : inner_mode = HImode;
18362 793 : first_imode = V4SImode;
18363 793 : second_imode = V2DImode;
18364 793 : third_imode = VOIDmode;
18365 793 : break;
18366 374 : case E_V16QImode:
18367 374 : gen_load_even = gen_vec_setv16qi;
18368 374 : gen_interleave_first_low = gen_vec_interleave_lowv8hi;
18369 374 : gen_interleave_second_low = gen_vec_interleave_lowv4si;
18370 374 : inner_mode = QImode;
18371 374 : first_imode = V8HImode;
18372 374 : second_imode = V4SImode;
18373 374 : third_imode = V2DImode;
18374 374 : break;
18375 0 : default:
18376 0 : gcc_unreachable ();
18377 : }
18378 : /* Combine each pair ops[2*i], ops[2*i+1] into one vector and store it, viewed in FIRST_IMODE, in ops[i]. */
18379 20901 : for (i = 0; i < n; i++)
18380 : {
18381 17020 : op = ops [i + i];
18382 17020 : if (inner_mode == HFmode || inner_mode == BFmode)
18383 : {
18384 10856 : rtx even, odd;
18385 : /* Use vpunpcklwd to pack the pair of HFmode or BFmode elements. */
18386 1948 : machine_mode vec_mode =
18387 10856 : (inner_mode == HFmode) ? V8HFmode : V8BFmode;
18388 10856 : op0 = gen_reg_rtx (vec_mode);
18389 10856 : even = lowpart_subreg (vec_mode,
18390 : force_reg (inner_mode, op), inner_mode);
18391 10856 : odd = lowpart_subreg (vec_mode,
18392 10856 : force_reg (inner_mode, ops[i + i + 1]),
18393 : inner_mode);
18394 10856 : emit_insn (gen_load_even (op0, even, odd));
18395 : }
18396 : else
18397 : {
18398 : /* Extend the first element of the pair (the odd one, counting from 1) to SImode using a paradoxical SUBREG. */
18399 6164 : op0 = gen_reg_rtx (SImode);
18400 6164 : emit_move_insn (op0, gen_lowpart (SImode, op));
18401 :
18402 : /* Insert the SImode value as low element of V4SImode vector. */
18403 6164 : op1 = gen_reg_rtx (V4SImode);
18404 6164 : op0 = gen_rtx_VEC_MERGE (V4SImode,
18405 : gen_rtx_VEC_DUPLICATE (V4SImode,
18406 : op0),
18407 : CONST0_RTX (V4SImode),
18408 : const1_rtx);
18409 6164 : emit_insn (gen_rtx_SET (op1, op0));
18410 :
18411 : /* Cast the V4SImode vector back to a vector in original mode. */
18412 6164 : op0 = gen_reg_rtx (mode);
18413 6164 : emit_move_insn (op0, gen_lowpart (mode, op1));
18414 :
18415 : /* Load the second element of the pair (the even one, counting from 1) into the second position. */
18416 6164 : emit_insn (gen_load_even (op0,
18417 : force_reg (inner_mode,
18418 6164 : ops[i + i + 1]),
18419 : const1_rtx));
18420 : }
18421 :
18422 : /* Cast vector to FIRST_IMODE vector. */
18423 17020 : ops[i] = gen_reg_rtx (first_imode);
18424 17020 : emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
18425 : }
18426 :
18427 : /* Interleave low FIRST_IMODE vectors pairwise, leaving N/2 results in ops[0 .. N/2-1]. */
18428 12391 : for (i = j = 0; i < n; i += 2, j++)
18429 : {
18430 8510 : op0 = gen_reg_rtx (first_imode);
18431 8510 : emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
18432 :
18433 : /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
18434 8510 : ops[j] = gen_reg_rtx (second_imode);
18435 8510 : emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
18436 : }
18437 :
18438 : /* Interleave low SECOND_IMODE vectors. A V4SI second stage (the V16QI case) needs one extra round before the final V2DI interleave. */
18439 3881 : switch (second_imode)
18440 : {
18441 : case E_V4SImode:
18442 1122 : for (i = j = 0; i < n / 2; i += 2, j++)
18443 : {
18444 748 : op0 = gen_reg_rtx (second_imode);
18445 748 : emit_insn (gen_interleave_second_low (op0, ops[i],
18446 748 : ops[i + 1]));
18447 :
18448 : /* Cast the SECOND_IMODE vector to the THIRD_IMODE
18449 : vector. */
18450 748 : ops[j] = gen_reg_rtx (third_imode);
18451 748 : emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
18452 : }
18453 : second_imode = V2DImode;
18454 : gen_interleave_second_low = gen_vec_interleave_lowv2di;
18455 : /* FALLTHRU */
18456 :
18457 3881 : case E_V2DImode:
18458 3881 : op0 = gen_reg_rtx (second_imode);
18459 3881 : emit_insn (gen_interleave_second_low (op0, ops[0],
18460 : ops[1]));
18461 :
18462 : /* Cast the SECOND_IMODE vector back to a vector in the original
18463 : mode. */
18464 3881 : emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
18465 3881 : break;
18466 :
18467 : default:
18468 : gcc_unreachable ();
18469 : }
18470 3881 : }
18471 :
18472 : /* A subroutine of ix86_expand_vector_init. Handle the most general case:
18473 : all values variable, and none identical. The elements are read from VALS with XVECEXP (vals, 0, i) and the result is stored in TARGET. MMX_OK is consulted for V2SF/V2SI and passed to the two-word recursive call. */
18474 :
18475 : static void
18476 119421 : ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
18477 : rtx target, rtx vals)
18478 : {
18479 119421 : rtx ops[64], op0, op1, op2, op3, op4, op5;
18480 119421 : machine_mode half_mode = VOIDmode;
18481 119421 : machine_mode quarter_mode = VOIDmode;
18482 119421 : machine_mode int_inner_mode = VOIDmode;
18483 119421 : int n, i;
18484 : /* Cases that 'break' out of this switch are handled by the word-packing code after it; all other cases return. */
18485 119421 : switch (mode)
18486 : {
18487 32291 : case E_V2SFmode:
18488 32291 : case E_V2SImode:
18489 32291 : if (!mmx_ok && !TARGET_SSE)
18490 : break;
18491 : /* FALLTHRU */
18492 : /* Modes handled directly by ix86_expand_vector_init_concat on the list of elements. */
18493 108968 : case E_V16SImode:
18494 108968 : case E_V16SFmode:
18495 108968 : case E_V8DFmode:
18496 108968 : case E_V8DImode:
18497 108968 : case E_V8SFmode:
18498 108968 : case E_V8SImode:
18499 108968 : case E_V4DFmode:
18500 108968 : case E_V4DImode:
18501 108968 : case E_V4SFmode:
18502 108968 : case E_V4SImode:
18503 108968 : case E_V2DFmode:
18504 108968 : case E_V2DImode:
18505 108968 : n = GET_MODE_NUNITS (mode);
18506 346362 : for (i = 0; i < n; i++)
18507 237394 : ops[i] = XVECEXP (vals, 0, i);
18508 108968 : ix86_expand_vector_init_concat (mode, target, ops, n);
18509 220041 : return;
18510 : /* V2TI: view each element as V2DI, concatenate into a V4DI register and move its lowpart in TARGET's mode into TARGET. */
18511 : case E_V2TImode:
18512 135 : for (i = 0; i < 2; i++)
18513 90 : ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
18514 45 : op0 = gen_reg_rtx (V4DImode);
18515 45 : ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
18516 45 : emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
18517 45 : return;
18518 : /* V4TI: as above, but build two V4DI halves (kept in ops[4] and ops[5]) and concatenate them into a V8DI register. */
18519 : case E_V4TImode:
18520 195 : for (i = 0; i < 4; i++)
18521 156 : ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
18522 39 : ops[4] = gen_reg_rtx (V4DImode);
18523 39 : ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
18524 39 : ops[5] = gen_reg_rtx (V4DImode);
18525 39 : ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
18526 39 : op0 = gen_reg_rtx (V8DImode);
18527 39 : ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
18528 39 : emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
18529 39 : return;
18530 : /* Modes built as two HALF_MODE vectors, each produced by ix86_expand_vector_init_interleave, then joined with VEC_CONCAT. */
18531 69 : case E_V32QImode:
18532 69 : half_mode = V16QImode;
18533 69 : goto half;
18534 :
18535 64 : case E_V16HImode:
18536 64 : half_mode = V8HImode;
18537 64 : goto half;
18538 :
18539 237 : case E_V16HFmode:
18540 237 : half_mode = V8HFmode;
18541 237 : goto half;
18542 :
18543 95 : case E_V16BFmode:
18544 95 : half_mode = V8BFmode;
18545 95 : goto half;
18546 :
18547 465 : half:
18548 465 : n = GET_MODE_NUNITS (mode);
18549 9009 : for (i = 0; i < n; i++)
18550 8544 : ops[i] = XVECEXP (vals, 0, i);
18551 465 : op0 = gen_reg_rtx (half_mode);
18552 465 : op1 = gen_reg_rtx (half_mode);
18553 465 : ix86_expand_vector_init_interleave (half_mode, op0, ops,
18554 : n >> 2);
18555 465 : ix86_expand_vector_init_interleave (half_mode, op1,
18556 465 : &ops [n >> 1], n >> 2);
18557 465 : emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
18558 465 : return;
18559 : /* Modes built as four QUARTER_MODE vectors by interleaving, joined pairwise into HALF_MODE vectors and then into MODE. */
18560 56 : case E_V64QImode:
18561 56 : quarter_mode = V16QImode;
18562 56 : half_mode = V32QImode;
18563 56 : goto quarter;
18564 :
18565 71 : case E_V32HImode:
18566 71 : quarter_mode = V8HImode;
18567 71 : half_mode = V16HImode;
18568 71 : goto quarter;
18569 :
18570 287 : case E_V32HFmode:
18571 287 : quarter_mode = V8HFmode;
18572 287 : half_mode = V16HFmode;
18573 287 : goto quarter;
18574 :
18575 51 : case E_V32BFmode:
18576 51 : quarter_mode = V8BFmode;
18577 51 : half_mode = V16BFmode;
18578 51 : goto quarter;
18579 :
18580 465 : quarter:
18581 465 : n = GET_MODE_NUNITS (mode);
18582 17137 : for (i = 0; i < n; i++)
18583 16672 : ops[i] = XVECEXP (vals, 0, i);
18584 465 : op0 = gen_reg_rtx (quarter_mode);
18585 465 : op1 = gen_reg_rtx (quarter_mode);
18586 465 : op2 = gen_reg_rtx (quarter_mode);
18587 465 : op3 = gen_reg_rtx (quarter_mode);
18588 465 : op4 = gen_reg_rtx (half_mode);
18589 465 : op5 = gen_reg_rtx (half_mode);
18590 465 : ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
18591 : n >> 3);
18592 465 : ix86_expand_vector_init_interleave (quarter_mode, op1,
18593 465 : &ops [n >> 2], n >> 3);
18594 465 : ix86_expand_vector_init_interleave (quarter_mode, op2,
18595 465 : &ops [n >> 1], n >> 3);
18596 465 : ix86_expand_vector_init_interleave (quarter_mode, op3,
18597 465 : &ops [(n >> 1) | (n >> 2)], n >> 3);
18598 465 : emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
18599 465 : emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
18600 465 : emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
18601 465 : return;
18602 : /* V16QI and V8HI use the interleave expander only when the target checks below pass; otherwise they fall to the word-packing code. */
18603 323 : case E_V16QImode:
18604 323 : if (!TARGET_SSE4_1)
18605 : break;
18606 : /* FALLTHRU */
18607 :
18608 517 : case E_V8HImode:
18609 517 : if (!TARGET_SSE2)
18610 : break;
18611 :
18612 : /* Don't use ix86_expand_vector_init_interleave if we can't
18613 : move from GPR to SSE register directly. */
18614 517 : if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
18615 : break;
18616 : /* FALLTHRU */
18617 :
18618 1091 : case E_V8HFmode:
18619 1091 : case E_V8BFmode:
18620 :
18621 1091 : n = GET_MODE_NUNITS (mode);
18622 9915 : for (i = 0; i < n; i++)
18623 8824 : ops[i] = XVECEXP (vals, 0, i);
18624 1091 : ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
18625 1091 : return;
18626 : /* Small HF/BF vectors: word-packing with the elements first viewed as HImode. */
18627 : case E_V4HFmode:
18628 : case E_V4BFmode:
18629 : case E_V2HFmode:
18630 : case E_V2BFmode:
18631 8348 : int_inner_mode = HImode;
18632 : break;
18633 : /* Small integer vectors: word-packing. */
18634 : case E_V4HImode:
18635 : case E_V8QImode:
18636 :
18637 : case E_V2HImode:
18638 : case E_V4QImode:
18639 : break;
18640 :
18641 0 : default:
18642 0 : gcc_unreachable ();
18643 : }
18644 : /* Fallback for the modes that broke out of the switch: pack the elements into integer words of TMP_MODE using shifts and IORs. */
18645 8348 : {
18646 8348 : int i, j, n_elts, n_words, n_elt_per_word;
18647 8348 : machine_mode tmp_mode, inner_mode;
18648 8348 : rtx words[4], shift;
18649 :
18650 16773 : tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
18651 :
18652 8348 : inner_mode = GET_MODE_INNER (mode);
18653 8348 : n_elts = GET_MODE_NUNITS (mode);
18654 16696 : n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
18655 8348 : n_elt_per_word = n_elts / n_words;
18656 8348 : shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
18657 : /* Assemble each word starting from its highest-indexed element, so that shifting left leaves lower-indexed elements in the less significant bits. */
18658 17085 : for (i = 0; i < n_words; ++i)
18659 : {
18660 : rtx word = NULL_RTX;
18661 :
18662 46289 : for (j = 0; j < n_elt_per_word; ++j)
18663 : {
18664 37552 : rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
18665 37552 : if (int_inner_mode != E_VOIDmode) /* HF/BF element: take its bits as an HImode value in a fresh register. */
18666 : {
18667 138 : gcc_assert (TARGET_SSE2 && int_inner_mode == HImode);
18668 138 : rtx tmp = gen_reg_rtx (int_inner_mode);
18669 138 : elt = lowpart_subreg (int_inner_mode,
18670 : force_reg (inner_mode, elt),
18671 : inner_mode);
18672 138 : emit_move_insn (tmp, elt);
18673 138 : elt = tmp;
18674 : }
18675 37552 : elt = convert_modes (tmp_mode, inner_mode, elt, true);
18676 :
18677 37552 : if (j == 0)
18678 : word = elt;
18679 : else
18680 : {
18681 28815 : word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
18682 : NULL_RTX, 1, OPTAB_LIB_WIDEN);
18683 28815 : word = expand_simple_binop (tmp_mode, IOR, word, elt,
18684 : NULL_RTX, 1, OPTAB_LIB_WIDEN);
18685 : }
18686 : }
18687 :
18688 8737 : words[i] = word;
18689 : }
18690 : /* Move the packed word(s) into TARGET; several words are first combined into a V2SI, V2DI or V4SI vector by a recursive call. */
18691 8348 : if (n_words == 1)
18692 7959 : emit_move_insn (target, gen_lowpart (mode, words[0]));
18693 389 : else if (n_words == 2)
18694 : {
18695 389 : gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
18696 389 : machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
18697 389 : rtx tmp = gen_reg_rtx (concat_mode);
18698 389 : vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
18699 389 : ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
18700 389 : emit_move_insn (target, gen_lowpart (mode, tmp));
18701 : }
18702 0 : else if (n_words == 4)
18703 : {
18704 0 : rtx tmp = gen_reg_rtx (V4SImode);
18705 0 : gcc_assert (tmp_mode == SImode);
18706 0 : vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
18707 0 : ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
18708 0 : emit_move_insn (target, gen_lowpart (mode, tmp));
18709 : }
18710 : else
18711 0 : gcc_unreachable ();
18712 : }
18713 : }
18714 :
18715 : /* Initialize vector TARGET via VALS. Suppress the use of MMX
18716 : instructions unless MMX_OK is true. Special cases (all zero, all identical, all constant, a single variable element) are tried in that order before falling back to ix86_expand_vector_init_general. */
18717 :
18718 : void
18719 130475 : ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
18720 : {
18721 130475 : machine_mode mode = GET_MODE (target);
18722 130475 : machine_mode inner_mode = GET_MODE_INNER (mode);
18723 130475 : int n_elts = GET_MODE_NUNITS (mode);
18724 130475 : int n_var = 0, one_var = -1;
18725 130475 : bool all_same = true, all_const_zero = true;
18726 130475 : int i;
18727 130475 : rtx x;
18728 :
18729 : /* First handle initialization from vector elements: VALS does not hold one entry per element of TARGET. Only the case of two half-width vectors is supported. */
18730 130475 : if (n_elts != XVECLEN (vals, 0))
18731 : {
18732 1104 : rtx subtarget = target;
18733 1104 : x = XVECEXP (vals, 0, 0);
18734 2208 : gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
18735 2208 : if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
18736 : {
18737 1104 : rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
18738 1104 : if (inner_mode == QImode
18739 1104 : || inner_mode == HImode
18740 1104 : || inner_mode == TImode
18741 : || inner_mode == HFmode
18742 : || inner_mode == BFmode) /* Redo the concat in an equally sized vector mode with SImode (DImode for TImode) elements; MODE and INNER_MODE are reused for the full and half vector modes. */
18743 : {
18744 146 : unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
18745 146 : scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
18746 146 : n_bits /= GET_MODE_SIZE (elt_mode);
18747 146 : mode = mode_for_vector (elt_mode, n_bits).require ();
18748 146 : inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
18749 146 : ops[0] = gen_lowpart (inner_mode, ops[0]);
18750 146 : ops[1] = gen_lowpart (inner_mode, ops[1]);
18751 146 : subtarget = gen_reg_rtx (mode);
18752 : }
18753 1104 : ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
18754 1104 : if (subtarget != target)
18755 146 : emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
18756 1104 : return;
18757 : }
18758 0 : gcc_unreachable ();
18759 : }
18760 : /* Classify the elements: count the non-constant ones (remembering the index of the last), note whether every constant is zero, and whether all elements equal element 0. */
18761 475237 : for (i = 0; i < n_elts; ++i)
18762 : {
18763 345866 : x = XVECEXP (vals, 0, i);
18764 671502 : if (!(CONST_SCALAR_INT_P (x)
18765 329614 : || CONST_DOUBLE_P (x)
18766 : || CONST_FIXED_P (x)))
18767 325636 : n_var++, one_var = i;
18768 20230 : else if (x != CONST0_RTX (inner_mode))
18769 3260 : all_const_zero = false;
18770 345866 : if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
18771 : all_same = false;
18772 : }
18773 :
18774 : /* Handle the zero vector as special case. */
18775 129371 : if (n_var == 0 && all_const_zero)
18776 : {
18777 302 : emit_move_insn (target, CONST0_RTX (mode));
18778 302 : return;
18779 : }
18780 :
18781 : /* If all values are identical, broadcast the value. If the broadcast helper returns false, continue with the cases below. */
18782 129069 : if (all_same
18783 136275 : && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
18784 7206 : XVECEXP (vals, 0, 0)))
18785 : return;
18786 :
18787 : /* Constants are best loaded from the constant pool. */
18788 122957 : if (n_var == 0)
18789 : {
18790 41 : emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
18791 41 : return;
18792 : }
18793 :
18794 : /* Values where only one field is non-constant are best loaded from
18795 : the pool and overwritten via move later. */
18796 122916 : if (n_var == 1)
18797 : {
18798 11482 : if (all_const_zero
18799 21809 : && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
18800 10327 : XVECEXP (vals, 0, one_var),
18801 : one_var))
18802 : return;
18803 :
18804 7838 : if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
18805 : return;
18806 : }
18807 : /* No special case applied (or its helper returned false): use the general expansion. */
18808 119032 : ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
18809 : }
18810 :
18811 : /* Set the element of vector TARGET selected by the run-time index IDX to VAL. Implemented as
18812 : V setg (V v, int idx, T val)
18813 : {
18814 : V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
18815 : V valv = (V){val, val, val, val, val, val, val, val};
18816 : V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
18817 : v = (v & ~mask) | (valv & mask);
18818 : return v;
18819 : }. */
18820 : void
18821 129 : ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
18822 : {
18823 129 : rtx vec[64];
18824 129 : machine_mode mode = GET_MODE (target);
18825 129 : machine_mode cmp_mode = mode;
18826 129 : int n_elts = GET_MODE_NUNITS (mode);
18827 129 : rtx valv,idxv,constv,idx_tmp;
18828 129 : bool ok = false;
18829 :
18830 : /* 512-bits vector byte/word broadcast and comparison only available
18831 : under TARGET_AVX512BW, break 512-bits vector into two 256-bits vector
18832 : when without TARGET_AVX512BW. */
18833 129 : if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
18834 123 : || mode == V64QImode)
18835 10 : && !TARGET_AVX512BW)
18836 : {
18837 3 : gcc_assert (TARGET_AVX512F);
18838 3 : rtx vhi, vlo, idx_hi;
18839 3 : machine_mode half_mode;
18840 3 : rtx (*extract_hi)(rtx, rtx);
18841 3 : rtx (*extract_lo)(rtx, rtx);
18842 : /* Pick the half-width mode and the matching extract generators. */
18843 3 : if (mode == V32HImode)
18844 : {
18845 : half_mode = V16HImode;
18846 : extract_hi = gen_vec_extract_hi_v32hi;
18847 : extract_lo = gen_vec_extract_lo_v32hi;
18848 : }
18849 : else if (mode == V32HFmode)
18850 : {
18851 : half_mode = V16HFmode;
18852 : extract_hi = gen_vec_extract_hi_v32hf;
18853 : extract_lo = gen_vec_extract_lo_v32hf;
18854 : }
18855 : else if (mode == V32BFmode)
18856 : {
18857 : half_mode = V16BFmode;
18858 : extract_hi = gen_vec_extract_hi_v32bf;
18859 : extract_lo = gen_vec_extract_lo_v32bf;
18860 : }
18861 : else
18862 : {
18863 3 : half_mode = V32QImode;
18864 3 : extract_hi = gen_vec_extract_hi_v64qi;
18865 3 : extract_lo = gen_vec_extract_lo_v64qi;
18866 : }
18867 : /* Split TARGET into halves, run the set on each half (with IDX - n_elts/2 for the high half) and concatenate the halves back into TARGET. */
18868 3 : vhi = gen_reg_rtx (half_mode);
18869 3 : vlo = gen_reg_rtx (half_mode);
18870 3 : idx_hi = gen_reg_rtx (GET_MODE (idx));
18871 3 : emit_insn (extract_hi (vhi, target));
18872 3 : emit_insn (extract_lo (vlo, target));
18873 3 : vec[0] = idx_hi;
18874 3 : vec[1] = idx;
18875 3 : vec[2] = GEN_INT (n_elts/2);
18876 3 : ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
18877 3 : ix86_expand_vector_set_var (vhi, val, idx_hi);
18878 3 : ix86_expand_vector_set_var (vlo, val, idx);
18879 3 : emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
18880 3 : return;
18881 : }
18882 : /* For floating-point element modes, do the index comparison in the integer vector mode with the same element count and element size. */
18883 504 : if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
18884 : {
18885 42 : switch (mode)
18886 : {
18887 : case E_V2DFmode:
18888 : cmp_mode = V2DImode;
18889 : break;
18890 6 : case E_V4DFmode:
18891 6 : cmp_mode = V4DImode;
18892 6 : break;
18893 4 : case E_V8DFmode:
18894 4 : cmp_mode = V8DImode;
18895 4 : break;
18896 2 : case E_V2SFmode:
18897 2 : cmp_mode = V2SImode;
18898 2 : break;
18899 6 : case E_V4SFmode:
18900 6 : cmp_mode = V4SImode;
18901 6 : break;
18902 6 : case E_V8SFmode:
18903 6 : cmp_mode = V8SImode;
18904 6 : break;
18905 5 : case E_V16SFmode:
18906 5 : cmp_mode = V16SImode;
18907 5 : break;
18908 1 : case E_V2HFmode:
18909 1 : case E_V2BFmode:
18910 1 : cmp_mode = V2HImode;
18911 1 : break;
18912 1 : case E_V4HFmode:
18913 1 : case E_V4BFmode:
18914 1 : cmp_mode = V4HImode;
18915 1 : break;
18916 : case E_V8HFmode:
18917 2 : cmp_mode = V8HImode;
18918 : break;
18919 : case E_V16HFmode:
18920 2 : cmp_mode = V16HImode;
18921 : break;
18922 : case E_V32HFmode:
18923 1 : cmp_mode = V32HImode;
18924 : break;
18925 : case E_V8BFmode:
18926 2 : cmp_mode = V8HImode;
18927 : break;
18928 : case E_V16BFmode:
18929 2 : cmp_mode = V16HImode;
18930 : break;
18931 : case E_V32BFmode:
18932 1 : cmp_mode = V32HImode;
18933 : break;
18934 0 : default:
18935 0 : gcc_unreachable ();
18936 : }
18937 : }
18938 : /* Build the constant vector {0, 1, ..., n_elts-1} in CMP_MODE, then broadcast VAL (in MODE) and the converted IDX (in CMP_MODE). */
18939 1604 : for (int i = 0; i != n_elts; i++)
18940 1478 : vec[i] = GEN_INT (i);
18941 126 : constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
18942 126 : valv = gen_reg_rtx (mode);
18943 126 : idxv = gen_reg_rtx (cmp_mode);
18944 252 : idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
18945 :
18946 126 : ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
18947 : mode, valv, val);
18948 126 : gcc_assert (ok);
18949 126 : ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
18950 : cmp_mode, idxv, idx_tmp);
18951 126 : gcc_assert (ok);
18952 126 : vec[0] = target; /* Operands for ix86_expand_int_vcond: destination TARGET, VALV and TARGET as the two data inputs, then the EQ comparison of IDXV with CONSTV and its two operands. */
18953 126 : vec[1] = valv;
18954 126 : vec[2] = target;
18955 126 : vec[3] = gen_rtx_EQ (mode, idxv, constv);
18956 126 : vec[4] = idxv;
18957 126 : vec[5] = constv;
18958 126 : ok = ix86_expand_int_vcond (vec);
18959 126 : gcc_assert (ok);
18960 : }
18961 :
18962 : void
18963 8340 : ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
18964 : {
18965 8340 : machine_mode mode = GET_MODE (target);
18966 8340 : machine_mode inner_mode = GET_MODE_INNER (mode);
18967 8340 : machine_mode half_mode;
18968 8340 : bool use_vec_merge = false;
18969 8340 : bool blendm_const = false;
18970 8340 : rtx tmp;
18971 8340 : static rtx (*gen_extract[8][2]) (rtx, rtx)
18972 : = {
18973 : { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
18974 : { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
18975 : { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
18976 : { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
18977 : { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
18978 : { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
18979 : { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
18980 : { gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
18981 : };
18982 8340 : static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
18983 : = {
18984 : { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
18985 : { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
18986 : { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
18987 : { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
18988 : { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
18989 : { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
18990 : { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
18991 : { gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
18992 : };
18993 8340 : int i, j, n;
18994 8340 : machine_mode mmode = VOIDmode;
18995 8340 : rtx (*gen_blendm) (rtx, rtx, rtx, rtx);
18996 :
18997 8340 : switch (mode)
18998 : {
18999 188 : case E_V2SImode:
19000 188 : use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
19001 : if (use_vec_merge)
19002 : break;
19003 : /* FALLTHRU */
19004 :
19005 167 : case E_V2SFmode:
19006 167 : if (mmx_ok)
19007 : {
19008 165 : tmp = gen_reg_rtx (GET_MODE_INNER (mode));
19009 165 : ix86_expand_vector_extract (true, tmp, target, 1 - elt);
19010 165 : if (elt == 0)
19011 0 : tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
19012 : else
19013 165 : tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
19014 165 : emit_insn (gen_rtx_SET (target, tmp));
19015 165 : return;
19016 : }
19017 : break;
19018 :
19019 220 : case E_V2DImode:
19020 220 : use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
19021 74 : if (use_vec_merge)
19022 : break;
19023 :
19024 74 : tmp = gen_reg_rtx (GET_MODE_INNER (mode));
19025 74 : ix86_expand_vector_extract (false, tmp, target, 1 - elt);
19026 74 : if (elt == 0)
19027 49 : tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
19028 : else
19029 25 : tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
19030 74 : emit_insn (gen_rtx_SET (target, tmp));
19031 74 : return;
19032 :
19033 130 : case E_V2DFmode:
19034 : /* NB: For ELT == 0, use standard scalar operation patterns which
19035 : preserve the rest of the vector for combiner:
19036 :
19037 : (vec_merge:V2DF
19038 : (vec_duplicate:V2DF (reg:DF))
19039 : (reg:V2DF)
19040 : (const_int 1))
19041 : */
19042 130 : if (elt == 0)
19043 68 : goto do_vec_merge;
19044 :
19045 62 : {
19046 62 : rtx op0, op1;
19047 :
19048 : /* For the two element vectors, we implement a VEC_CONCAT with
19049 : the extraction of the other element. */
19050 :
19051 62 : tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
19052 62 : tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);
19053 :
19054 62 : if (elt == 0)
19055 : op0 = val, op1 = tmp;
19056 : else
19057 62 : op0 = tmp, op1 = val;
19058 :
19059 62 : tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
19060 62 : emit_insn (gen_rtx_SET (target, tmp));
19061 : }
19062 62 : return;
19063 :
19064 574 : case E_V4SFmode:
19065 574 : use_vec_merge = TARGET_SSE4_1;
19066 574 : if (use_vec_merge)
19067 : break;
19068 :
19069 62 : switch (elt)
19070 : {
19071 : case 0:
19072 : use_vec_merge = true;
19073 : break;
19074 :
19075 1 : case 1:
19076 : /* tmp = target = A B C D */
19077 1 : tmp = copy_to_reg (target);
19078 : /* target = A A B B */
19079 1 : emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
19080 : /* target = X A B B */
19081 1 : ix86_expand_vector_set (false, target, val, 0);
19082 : /* target = A X C D */
19083 1 : emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
19084 : const1_rtx, const0_rtx,
19085 : GEN_INT (2+4), GEN_INT (3+4)));
19086 1 : return;
19087 :
19088 0 : case 2:
19089 : /* tmp = target = A B C D */
19090 0 : tmp = copy_to_reg (target);
19091 : /* tmp = X B C D */
19092 0 : ix86_expand_vector_set (false, tmp, val, 0);
19093 : /* target = A B X D */
19094 0 : emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
19095 : const0_rtx, const1_rtx,
19096 : GEN_INT (0+4), GEN_INT (3+4)));
19097 0 : return;
19098 :
19099 4 : case 3:
19100 : /* tmp = target = A B C D */
19101 4 : tmp = copy_to_reg (target);
19102 : /* tmp = X B C D */
19103 4 : ix86_expand_vector_set (false, tmp, val, 0);
19104 : /* target = A B C X */
19105 4 : emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
19106 : const0_rtx, const1_rtx,
19107 : GEN_INT (2+4), GEN_INT (0+4)));
19108 4 : return;
19109 :
19110 0 : default:
19111 0 : gcc_unreachable ();
19112 : }
19113 : break;
19114 :
19115 437 : case E_V4SImode:
19116 437 : use_vec_merge = TARGET_SSE4_1;
19117 437 : if (use_vec_merge)
19118 : break;
19119 :
19120 : /* Element 0 handled by vec_merge below. */
19121 277 : if (elt == 0)
19122 : {
19123 : use_vec_merge = true;
19124 : break;
19125 : }
19126 :
19127 87 : if (TARGET_SSE2)
19128 : {
19129 : /* With SSE2, use integer shuffles to swap element 0 and ELT,
19130 : store into element 0, then shuffle them back. */
19131 :
19132 87 : rtx order[4];
19133 :
19134 87 : order[0] = GEN_INT (elt);
19135 87 : order[1] = const1_rtx;
19136 87 : order[2] = const2_rtx;
19137 87 : order[3] = GEN_INT (3);
19138 87 : order[elt] = const0_rtx;
19139 :
19140 87 : emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
19141 : order[1], order[2], order[3]));
19142 :
19143 87 : ix86_expand_vector_set (false, target, val, 0);
19144 :
19145 87 : emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
19146 : order[1], order[2], order[3]));
19147 : }
19148 : else
19149 : {
19150 : /* For SSE1, we have to reuse the V4SF code. */
19151 0 : rtx t = gen_reg_rtx (V4SFmode);
19152 0 : emit_move_insn (t, gen_lowpart (V4SFmode, target));
19153 0 : ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
19154 0 : emit_move_insn (target, gen_lowpart (mode, t));
19155 : }
19156 : return;
19157 :
19158 3534 : case E_V8HImode:
19159 3534 : case E_V8HFmode:
19160 3534 : case E_V8BFmode:
19161 3534 : case E_V2HImode:
19162 3534 : case E_V2HFmode:
19163 3534 : case E_V2BFmode:
19164 3534 : use_vec_merge = TARGET_SSE2;
19165 3534 : break;
19166 50 : case E_V4HImode:
19167 50 : case E_V4HFmode:
19168 50 : case E_V4BFmode:
19169 50 : use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
19170 : break;
19171 :
19172 3067 : case E_V16QImode:
19173 3067 : case E_V4QImode:
19174 3067 : use_vec_merge = TARGET_SSE4_1;
19175 3067 : break;
19176 :
19177 1 : case E_V8QImode:
19178 1 : use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
19179 : break;
19180 :
19181 3 : case E_V32QImode:
19182 3 : half_mode = V16QImode;
19183 3 : j = 0;
19184 3 : n = 16;
19185 3 : goto half;
19186 :
19187 17 : case E_V16HFmode:
19188 17 : case E_V16BFmode:
19189 : /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw. */
19190 17 : if (TARGET_AVX2 && elt != 0)
19191 : {
19192 12 : mmode = SImode;
19193 12 : gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
19194 : : gen_avx2_pblendbf_1);
19195 : blendm_const = true;
19196 : break;
19197 : }
19198 : else
19199 : {
19200 5 : half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
19201 3 : j = ((mode == E_V16HFmode) ? 6 : 7);
19202 5 : n = 8;
19203 5 : goto half;
19204 : }
19205 :
19206 5 : case E_V16HImode:
19207 5 : half_mode = V8HImode;
19208 5 : j = 1;
19209 5 : n = 8;
19210 5 : goto half;
19211 :
19212 15 : case E_V8SImode:
19213 15 : half_mode = V4SImode;
19214 15 : j = 2;
19215 15 : n = 4;
19216 15 : goto half;
19217 :
19218 15 : case E_V4DImode:
19219 15 : half_mode = V2DImode;
19220 15 : j = 3;
19221 15 : n = 2;
19222 15 : goto half;
19223 :
19224 4 : case E_V8SFmode:
19225 4 : half_mode = V4SFmode;
19226 4 : j = 4;
19227 4 : n = 4;
19228 4 : goto half;
19229 :
19230 6 : case E_V4DFmode:
19231 6 : half_mode = V2DFmode;
19232 6 : j = 5;
19233 6 : n = 2;
19234 6 : goto half;
19235 :
19236 53 : half:
19237 : /* Compute offset. */
19238 53 : i = elt / n;
19239 53 : elt %= n;
19240 :
19241 53 : gcc_assert (i <= 1);
19242 :
19243 : /* Extract the half. */
19244 53 : tmp = gen_reg_rtx (half_mode);
19245 53 : emit_insn (gen_extract[j][i] (tmp, target));
19246 :
19247 : /* Put val in tmp at elt. */
19248 53 : ix86_expand_vector_set (false, tmp, val, elt);
19249 :
19250 : /* Put it back. */
19251 53 : emit_insn (gen_insert[j][i] (target, target, tmp));
19252 53 : return;
19253 :
19254 8 : case E_V8DFmode:
19255 8 : if (TARGET_AVX512F)
19256 : {
19257 : mmode = QImode;
19258 : gen_blendm = gen_avx512f_blendmv8df;
19259 : }
19260 : break;
19261 :
19262 6 : case E_V8DImode:
19263 6 : if (TARGET_AVX512F)
19264 : {
19265 : mmode = QImode;
19266 : gen_blendm = gen_avx512f_blendmv8di;
19267 : }
19268 : break;
19269 :
19270 0 : case E_V16SFmode:
19271 0 : if (TARGET_AVX512F)
19272 : {
19273 : mmode = HImode;
19274 : gen_blendm = gen_avx512f_blendmv16sf;
19275 : }
19276 : break;
19277 :
19278 0 : case E_V16SImode:
19279 0 : if (TARGET_AVX512F)
19280 : {
19281 : mmode = HImode;
19282 : gen_blendm = gen_avx512f_blendmv16si;
19283 : }
19284 : break;
19285 :
19286 12 : case E_V32HFmode:
19287 12 : if (TARGET_AVX512BW)
19288 : {
19289 : mmode = SImode;
19290 : gen_blendm = gen_avx512bw_blendmv32hf;
19291 : }
19292 : break;
19293 12 : case E_V32BFmode:
19294 12 : if (TARGET_AVX512BW)
19295 : {
19296 : mmode = SImode;
19297 : gen_blendm = gen_avx512bw_blendmv32bf;
19298 : }
19299 : break;
19300 11 : case E_V32HImode:
19301 11 : if (TARGET_AVX512BW)
19302 : {
19303 : mmode = SImode;
19304 : gen_blendm = gen_avx512bw_blendmv32hi;
19305 : }
19306 7 : else if (TARGET_AVX512F)
19307 : {
19308 7 : half_mode = E_V8HImode;
19309 7 : n = 8;
19310 7 : goto quarter;
19311 : }
19312 : break;
19313 :
19314 12 : case E_V64QImode:
19315 12 : if (TARGET_AVX512BW)
19316 : {
19317 : mmode = DImode;
19318 : gen_blendm = gen_avx512bw_blendmv64qi;
19319 : }
19320 6 : else if (TARGET_AVX512F)
19321 : {
19322 6 : half_mode = E_V16QImode;
19323 6 : n = 16;
19324 6 : goto quarter;
19325 : }
19326 : break;
19327 :
19328 13 : quarter:
19329 : /* Compute offset. */
19330 13 : i = elt / n;
19331 13 : elt %= n;
19332 :
19333 13 : gcc_assert (i <= 3);
19334 :
19335 13 : {
19336 : /* Extract the quarter. */
19337 13 : tmp = gen_reg_rtx (V4SImode);
19338 13 : rtx tmp2 = gen_lowpart (V16SImode, target);
19339 13 : rtx mask = gen_reg_rtx (QImode);
19340 :
19341 13 : emit_move_insn (mask, constm1_rtx);
19342 13 : emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
19343 : tmp, mask));
19344 :
19345 13 : tmp2 = gen_reg_rtx (half_mode);
19346 13 : emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
19347 13 : tmp = tmp2;
19348 :
19349 : /* Put val in tmp at elt. */
19350 13 : ix86_expand_vector_set (false, tmp, val, elt);
19351 :
19352 : /* Put it back. */
19353 13 : tmp2 = gen_reg_rtx (V16SImode);
19354 13 : rtx tmp3 = gen_lowpart (V16SImode, target);
19355 13 : mask = gen_reg_rtx (HImode);
19356 13 : emit_move_insn (mask, constm1_rtx);
19357 13 : tmp = gen_lowpart (V4SImode, tmp);
19358 13 : emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
19359 : tmp3, mask));
19360 13 : emit_move_insn (target, gen_lowpart (mode, tmp2));
19361 : }
19362 13 : return;
19363 :
19364 : default:
19365 : break;
19366 : }
19367 :
19368 6601 : if (mmode != VOIDmode)
19369 : {
19370 54 : tmp = gen_reg_rtx (mode);
19371 54 : emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
19372 54 : rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
19373 : /* The avx512*_blendm<mode> expanders have different operand order
19374 : from VEC_MERGE. In VEC_MERGE, the first input operand is used for
19375 : elements where the mask is set and second input operand otherwise,
19376 : in {sse,avx}*_*blend* the first input operand is used for elements
19377 : where the mask is clear and second input operand otherwise. */
19378 54 : if (!blendm_const)
19379 42 : merge_mask = force_reg (mmode, merge_mask);
19380 54 : emit_insn (gen_blendm (target, target, tmp, merge_mask));
19381 : }
19382 7759 : else if (use_vec_merge)
19383 : {
19384 7747 : do_vec_merge:
19385 7815 : if (!nonimmediate_operand (val, inner_mode))
19386 1 : val = force_reg (inner_mode, val);
19387 7815 : tmp = gen_rtx_VEC_DUPLICATE (mode, val);
19388 7815 : tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
19389 : GEN_INT (HOST_WIDE_INT_1U << elt));
19390 7815 : emit_insn (gen_rtx_SET (target, tmp));
19391 : }
19392 : else
19393 : {
19394 24 : rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
19395 :
19396 12 : emit_move_insn (mem, target);
19397 :
19398 24 : tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
19399 12 : emit_move_insn (tmp, val);
19400 :
19401 12 : emit_move_insn (target, mem);
19402 : }
19403 : }
19404 : /* Extract element ELT of vector VEC into TARGET, choosing a strategy by VEC's mode and the enabled ISA: a VEC_SELECT, a recursive extract from one half of a wide vector, or a store to a stack temporary followed by a scalar load. MMX_OK is consulted only for the V2SI/V2SF and V4HI/V4HF/V4BF cases. */
19405 : void
19406 108433 : ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
19407 : {
19408 108433 : machine_mode mode = GET_MODE (vec);
19409 108433 : machine_mode inner_mode = GET_MODE_INNER (mode);
19410 108433 : bool use_vec_extr = false;
19411 108433 : rtx tmp;
19412 : /* Per-mode dispatch: each case either sets use_vec_extr, recurses on a narrower vector and returns, or breaks with use_vec_extr false so the stack fallback at the end is used. */
19413 108433 : switch (mode)
19414 : {
19415 8478 : case E_V2SImode:
19416 8478 : use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
19417 : if (use_vec_extr)
19418 : break;
19419 : /* FALLTHRU */
19420 :
19421 9356 : case E_V2SFmode:
19422 9356 : if (!mmx_ok)
19423 : break;
19424 : /* FALLTHRU */
19425 :
19426 : case E_V2DFmode:
19427 : case E_V2DImode:
19428 : case E_V2TImode:
19429 : case E_V4TImode:
19430 : use_vec_extr = true;
19431 : break;
19432 :
19433 7904 : case E_V4SFmode:
19434 7904 : use_vec_extr = TARGET_SSE4_1;
19435 7904 : if (use_vec_extr)
19436 : break;
19437 : /* Without SSE4.1: build a shuffled copy TMP of VEC (VEC itself for elt 0) and extract element 0 of it instead; elt is reset to 0 below. */
19438 4035 : switch (elt)
19439 : {
19440 : case 0:
19441 : tmp = vec;
19442 : break;
19443 :
19444 1675 : case 1:
19445 1675 : case 3:
19446 1675 : tmp = gen_reg_rtx (mode);
19447 1675 : emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
19448 : GEN_INT (elt), GEN_INT (elt),
19449 1675 : GEN_INT (elt+4), GEN_INT (elt+4)));
19450 1675 : break;
19451 :
19452 931 : case 2:
19453 931 : tmp = gen_reg_rtx (mode);
19454 931 : emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
19455 931 : break;
19456 :
19457 0 : default:
19458 0 : gcc_unreachable ();
19459 : }
19460 : vec = tmp;
19461 : use_vec_extr = true;
19462 : elt = 0;
19463 : break;
19464 :
19465 23564 : case E_V4SImode:
19466 23564 : use_vec_extr = TARGET_SSE4_1;
19467 23564 : if (use_vec_extr)
19468 : break;
19469 : /* Without SSE4.1 but with SSE2: same approach as the V4SF case, using integer shuffles. */
19470 17883 : if (TARGET_SSE2)
19471 : {
19472 17879 : switch (elt)
19473 : {
19474 : case 0:
19475 : tmp = vec;
19476 : break;
19477 :
19478 5848 : case 1:
19479 5848 : case 3:
19480 5848 : tmp = gen_reg_rtx (mode);
19481 5848 : emit_insn (gen_sse2_pshufd_1 (tmp, vec,
19482 : GEN_INT (elt), GEN_INT (elt),
19483 : GEN_INT (elt), GEN_INT (elt)));
19484 5848 : break;
19485 :
19486 2906 : case 2:
19487 2906 : tmp = gen_reg_rtx (mode);
19488 2906 : emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
19489 2906 : break;
19490 :
19491 0 : default:
19492 0 : gcc_unreachable ();
19493 : }
19494 : vec = tmp;
19495 : use_vec_extr = true;
19496 : elt = 0;
19497 : }
19498 : else
19499 : {
19500 : /* For SSE1, we have to reuse the V4SF code. */
19501 4 : ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
19502 4 : gen_lowpart (V4SFmode, vec), elt);
19503 4 : return;
19504 : }
19505 : break;
19506 :
19507 6488 : case E_V8HImode:
19508 6488 : case E_V8HFmode:
19509 6488 : case E_V8BFmode:
19510 6488 : case E_V2HImode:
19511 6488 : case E_V2HFmode:
19512 6488 : case E_V2BFmode:
19513 6488 : use_vec_extr = TARGET_SSE2;
19514 6488 : break;
19515 858 : case E_V4HImode:
19516 858 : case E_V4HFmode:
19517 858 : case E_V4BFmode:
19518 858 : use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
19519 : break;
19520 :
19521 7809 : case E_V16QImode:
19522 7809 : use_vec_extr = TARGET_SSE4_1;
19523 7809 : if (!use_vec_extr
19524 6223 : && TARGET_SSE2
19525 6223 : && elt == 0
19526 11683 : && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
19527 : {
19528 3873 : tmp = gen_reg_rtx (SImode);
19529 3873 : ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
19530 : 0);
19531 3873 : emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
19532 3873 : return;
19533 : }
19534 : break;
19535 78 : case E_V4QImode:
19536 78 : use_vec_extr = TARGET_SSE4_1;
19537 78 : break;
19538 : /* Wide modes from here on: with the required target flag, extract the low or high half that contains ELT into TMP and recurse with ELT masked down to an index within that half. */
19539 663 : case E_V8SFmode:
19540 663 : if (TARGET_AVX)
19541 : {
19542 663 : tmp = gen_reg_rtx (V4SFmode);
19543 663 : if (elt < 4)
19544 326 : emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
19545 : else
19546 337 : emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
19547 663 : ix86_expand_vector_extract (false, target, tmp, elt & 3);
19548 663 : return;
19549 : }
19550 : break;
19551 :
19552 578 : case E_V4DFmode:
19553 578 : if (TARGET_AVX)
19554 : {
19555 578 : tmp = gen_reg_rtx (V2DFmode);
19556 578 : if (elt < 2)
19557 303 : emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
19558 : else
19559 275 : emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
19560 578 : ix86_expand_vector_extract (false, target, tmp, elt & 1);
19561 578 : return;
19562 : }
19563 : break;
19564 :
19565 253 : case E_V32QImode:
19566 253 : if (TARGET_AVX)
19567 : {
19568 253 : tmp = gen_reg_rtx (V16QImode);
19569 253 : if (elt < 16)
19570 130 : emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
19571 : else
19572 123 : emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
19573 253 : ix86_expand_vector_extract (false, target, tmp, elt & 15);
19574 253 : return;
19575 : }
19576 : break;
19577 :
19578 616 : case E_V16HImode:
19579 616 : if (TARGET_AVX)
19580 : {
19581 616 : tmp = gen_reg_rtx (V8HImode);
19582 616 : if (elt < 8)
19583 304 : emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
19584 : else
19585 312 : emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
19586 616 : ix86_expand_vector_extract (false, target, tmp, elt & 7);
19587 616 : return;
19588 : }
19589 : break;
19590 :
19591 993 : case E_V8SImode:
19592 993 : if (TARGET_AVX)
19593 : {
19594 993 : tmp = gen_reg_rtx (V4SImode);
19595 993 : if (elt < 4)
19596 479 : emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
19597 : else
19598 514 : emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
19599 993 : ix86_expand_vector_extract (false, target, tmp, elt & 3);
19600 993 : return;
19601 : }
19602 : break;
19603 :
19604 1518 : case E_V4DImode:
19605 1518 : if (TARGET_AVX)
19606 : {
19607 1518 : tmp = gen_reg_rtx (V2DImode);
19608 1518 : if (elt < 2)
19609 813 : emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
19610 : else
19611 705 : emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
19612 1518 : ix86_expand_vector_extract (false, target, tmp, elt & 1);
19613 1518 : return;
19614 : }
19615 : break;
19616 : /* V32HI and V64QI (and V32HF/V32BF further down) recurse only under TARGET_AVX512BW; the V16SF, V8DF, V16SI and V8DI cases recurse without any target check in this function. */
19617 8 : case E_V32HImode:
19618 8 : if (TARGET_AVX512BW)
19619 : {
19620 8 : tmp = gen_reg_rtx (V16HImode);
19621 8 : if (elt < 16)
19622 3 : emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
19623 : else
19624 5 : emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
19625 8 : ix86_expand_vector_extract (false, target, tmp, elt & 15);
19626 8 : return;
19627 : }
19628 : break;
19629 :
19630 10 : case E_V64QImode:
19631 10 : if (TARGET_AVX512BW)
19632 : {
19633 10 : tmp = gen_reg_rtx (V32QImode);
19634 10 : if (elt < 32)
19635 5 : emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
19636 : else
19637 5 : emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
19638 10 : ix86_expand_vector_extract (false, target, tmp, elt & 31);
19639 10 : return;
19640 : }
19641 : break;
19642 :
19643 311 : case E_V16SFmode:
19644 311 : tmp = gen_reg_rtx (V8SFmode);
19645 311 : if (elt < 8)
19646 157 : emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
19647 : else
19648 154 : emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
19649 311 : ix86_expand_vector_extract (false, target, tmp, elt & 7);
19650 311 : return;
19651 :
19652 296 : case E_V8DFmode:
19653 296 : tmp = gen_reg_rtx (V4DFmode);
19654 296 : if (elt < 4)
19655 160 : emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
19656 : else
19657 136 : emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
19658 296 : ix86_expand_vector_extract (false, target, tmp, elt & 3);
19659 296 : return;
19660 :
19661 252 : case E_V16SImode:
19662 252 : tmp = gen_reg_rtx (V8SImode);
19663 252 : if (elt < 8)
19664 133 : emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
19665 : else
19666 119 : emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
19667 252 : ix86_expand_vector_extract (false, target, tmp, elt & 7);
19668 252 : return;
19669 :
19670 706 : case E_V8DImode:
19671 706 : tmp = gen_reg_rtx (V4DImode);
19672 706 : if (elt < 4)
19673 403 : emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
19674 : else
19675 303 : emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
19676 706 : ix86_expand_vector_extract (false, target, tmp, elt & 3);
19677 706 : return;
19678 :
19679 45 : case E_V32HFmode:
19680 45 : case E_V32BFmode:
19681 45 : if (TARGET_AVX512BW)
19682 : {
19683 45 : tmp = (mode == E_V32HFmode
19684 45 : ? gen_reg_rtx (V16HFmode)
19685 7 : : gen_reg_rtx (V16BFmode));
19686 45 : if (elt < 16)
19687 31 : emit_insn (gen_vec_extract_lo (mode, tmp, vec));
19688 : else
19689 14 : emit_insn (gen_vec_extract_hi (mode, tmp, vec));
19690 45 : ix86_expand_vector_extract (false, target, tmp, elt & 15);
19691 45 : return;
19692 : }
19693 : break;
19694 :
19695 474 : case E_V16HFmode:
19696 474 : case E_V16BFmode:
19697 474 : if (TARGET_AVX)
19698 : {
19699 474 : tmp = (mode == E_V16HFmode
19700 474 : ? gen_reg_rtx (V8HFmode)
19701 339 : : gen_reg_rtx (V8BFmode));
19702 474 : if (elt < 8)
19703 249 : emit_insn (gen_vec_extract_lo (mode, tmp, vec));
19704 : else
19705 225 : emit_insn (gen_vec_extract_hi (mode, tmp, vec));
19706 474 : ix86_expand_vector_extract (false, target, tmp, elt & 7);
19707 474 : return;
19708 : }
19709 : break;
19710 :
19711 630 : case E_V8QImode:
19712 630 : use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
19713 : /* ??? Could extract the appropriate HImode element and shift. */
19714 : break;
19715 :
19716 : default:
19717 : break;
19718 : }
19719 : /* Emit the extraction itself: a VEC_SELECT of element ELT when use_vec_extr is set, otherwise a round trip through a stack temporary. */
19720 26397 : if (use_vec_extr)
19721 : {
19722 90289 : tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
19723 90289 : tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);
19724 :
19725 : /* Let the rtl optimizers know about the zero extension performed. */
19726 90289 : if (inner_mode == QImode || inner_mode == HImode)
19727 : {
19728 8484 : rtx reg = gen_reg_rtx (SImode);
19729 8484 : tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
19730 8484 : emit_move_insn (reg, tmp);
19731 8484 : tmp = gen_lowpart (inner_mode, reg);
19732 8484 : SUBREG_PROMOTED_VAR_P (tmp) = 1;
19733 8484 : SUBREG_PROMOTED_SET (tmp, 1);
19734 : }
19735 :
19736 90289 : emit_move_insn (target, tmp);
19737 : }
19738 : else
19739 : {
19740 15088 : rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
19741 : /* Store the whole vector to the temporary, then load the element at byte offset ELT * size of the element mode. */
19742 7544 : emit_move_insn (mem, vec);
19743 :
19744 15088 : tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
19745 7544 : emit_move_insn (target, tmp);
19746 : }
19747 : }
19748 :
19749 : /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
19750 : to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
19751 : The upper bits of DEST are undefined, though they shouldn't cause
19752 : exceptions (some bits from src or all zeros are ok). I is a width in bits. The insn is emitted here; when it is built on a separate scratch register, that register's lowpart is then moved into DEST. */
19753 :
19754 : static void
19755 41449 : emit_reduc_half (rtx dest, rtx src, int i)
19756 : {
19757 41449 : rtx tem, d = dest;
19758 41449 : switch (GET_MODE (src))
19759 : {
19760 5970 : case E_V4SFmode:
19761 5970 : if (i == 128)
19762 2985 : tem = gen_sse_movhlps (dest, src, src);
19763 : else
19764 2985 : tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
19765 : GEN_INT (1 + 4), GEN_INT (1 + 4));
19766 : break;
19767 3333 : case E_V2DFmode:
19768 3333 : tem = gen_vec_interleave_highv2df (dest, src, src);
19769 3333 : break;
19770 76 : case E_V4QImode:
19771 76 : d = gen_reg_rtx (V1SImode);
19772 76 : tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
19773 76 : GEN_INT (i / 2));
19774 76 : break;
19775 600 : case E_V8QImode:
19776 600 : case E_V4HImode:
19777 600 : d = gen_reg_rtx (V1DImode);
19778 600 : tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
19779 600 : GEN_INT (i / 2));
19780 600 : break;
19781 31470 : case E_V16QImode:
19782 31470 : case E_V8HImode:
19783 31470 : case E_V8HFmode:
19784 31470 : case E_V4SImode:
19785 31470 : case E_V2DImode:
19786 31470 : if (TARGET_SSE_REDUCTION_PREFER_PSHUF)
19787 : {
19788 15 : if (i == 128)
19789 : {
19790 9 : d = gen_reg_rtx (V4SImode);
19791 18 : tem = gen_sse2_pshufd_1 (
19792 9 : d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
19793 : GEN_INT (2), GEN_INT (3), GEN_INT (2), GEN_INT (3));
19794 9 : break;
19795 : }
19796 6 : else if (i == 64)
19797 : {
19798 5 : d = gen_reg_rtx (V4SImode);
19799 10 : tem = gen_sse2_pshufd_1 (
19800 5 : d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
19801 : GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
19802 5 : break;
19803 : }
19804 1 : else if (i == 32)
19805 : {
19806 1 : d = gen_reg_rtx (V8HImode);
19807 2 : tem = gen_sse2_pshuflw_1 (
19808 1 : d, force_reg (V8HImode, gen_lowpart (V8HImode, src)),
19809 : GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
19810 1 : break;
19811 : }
19812 : }
19813 31455 : d = gen_reg_rtx (V1TImode);
19814 31455 : tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
19815 31455 : GEN_INT (i / 2));
19816 31455 : break;
19817 0 : case E_V8SFmode:
19818 0 : if (i == 256)
19819 0 : tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
19820 : else
19821 0 : tem = gen_avx_shufps256 (dest, src, src,
19822 : GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
19823 : break;
19824 0 : case E_V4DFmode:
19825 0 : if (i == 256)
19826 0 : tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
19827 : else
19828 0 : tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
19829 : break;
19830 0 : case E_V32QImode:
19831 0 : case E_V16HImode:
19832 0 : case E_V16HFmode:
19833 0 : case E_V8SImode:
19834 0 : case E_V4DImode:
19835 0 : if (i == 256)
19836 : {
19837 0 : if (GET_MODE (dest) != V4DImode)
19838 0 : d = gen_reg_rtx (V4DImode);
19839 0 : tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
19840 0 : gen_lowpart (V4DImode, src),
19841 : const1_rtx);
19842 : }
19843 : else
19844 : {
19845 0 : d = gen_reg_rtx (V2TImode);
19846 0 : tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
19847 0 : GEN_INT (i / 2));
19848 : }
19849 : break;
19850 0 : case E_V64QImode:
19851 0 : case E_V32HImode:
19852 0 : case E_V32HFmode:
19853 0 : if (i < 64)
19854 : {
19855 0 : d = gen_reg_rtx (V4TImode);
19856 0 : tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
19857 0 : GEN_INT (i / 2));
19858 0 : break;
19859 : }
19860 : /* FALLTHRU when i >= 64: use the V16SImode shuffles shared with the modes below. */
19861 0 : case E_V16SImode:
19862 0 : case E_V16SFmode:
19863 0 : case E_V8DImode:
19864 0 : case E_V8DFmode:
19865 0 : if (i > 128)
19866 0 : tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
19867 0 : gen_lowpart (V16SImode, src),
19868 0 : gen_lowpart (V16SImode, src),
19869 : GEN_INT (0x4 + (i == 512 ? 4 : 0)),
19870 : GEN_INT (0x5 + (i == 512 ? 4 : 0)),
19871 : GEN_INT (0x6 + (i == 512 ? 4 : 0)),
19872 : GEN_INT (0x7 + (i == 512 ? 4 : 0)),
19873 : GEN_INT (0xC), GEN_INT (0xD),
19874 : GEN_INT (0xE), GEN_INT (0xF),
19875 : GEN_INT (0x10), GEN_INT (0x11),
19876 : GEN_INT (0x12), GEN_INT (0x13),
19877 : GEN_INT (0x14), GEN_INT (0x15),
19878 : GEN_INT (0x16), GEN_INT (0x17));
19879 : else
19880 0 : tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
19881 0 : gen_lowpart (V16SImode, src),
19882 : GEN_INT (i == 128 ? 0x2 : 0x1),
19883 : GEN_INT (0x3),
19884 : GEN_INT (0x3),
19885 : GEN_INT (0x3),
19886 : GEN_INT (i == 128 ? 0x6 : 0x5),
19887 : GEN_INT (0x7),
19888 : GEN_INT (0x7),
19889 : GEN_INT (0x7),
19890 : GEN_INT (i == 128 ? 0xA : 0x9),
19891 : GEN_INT (0xB),
19892 : GEN_INT (0xB),
19893 : GEN_INT (0xB),
19894 : GEN_INT (i == 128 ? 0xE : 0xD),
19895 : GEN_INT (0xF),
19896 : GEN_INT (0xF),
19897 : GEN_INT (0xF));
19898 : break;
19899 0 : default:
19900 0 : gcc_unreachable ();
19901 : }
19902 41449 : emit_insn (tem); /* Emit the single insn built by the switch above. */
19903 41449 : if (d != dest) /* A separate scratch register D was used: copy it into DEST, viewed in DEST's mode. */
19904 32146 : emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
19905 41449 : }
19906 :
19907 : /* Expand a vector reduction. FN is the binary pattern to reduce;
19908 : DEST is the destination; IN is the input vector. Each step calls emit_reduc_half and then FN on the result and the current vector, halving the width considered; the last step writes into DEST. */
19909 :
19910 : void
19911 20546 : ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
19912 : {
19913 20546 : rtx half, dst, vec = in;
19914 20546 : machine_mode mode = GET_MODE (in);
19915 20546 : int i;
19916 :
19917 : /* SSE4 has a special instruction for V8HImode UMIN reduction. */
19918 20546 : if (TARGET_SSE4_1
19919 9878 : && mode == V8HImode
19920 780 : && fn == gen_uminv8hi3)
19921 : {
19922 4 : emit_insn (gen_sse4_1_phminposuw (dest, in));
19923 4 : return;
19924 : }
19925 : /* Generic path: I starts at the full bit size of MODE and is halved until it reaches the bit size of one element. Intermediate results go to fresh registers; only the iteration with I equal to twice the element size targets DEST. */
19926 41084 : for (i = GET_MODE_BITSIZE (mode);
19927 123982 : i > GET_MODE_UNIT_BITSIZE (mode);
19928 41449 : i >>= 1)
19929 : {
19930 41449 : half = gen_reg_rtx (mode);
19931 41449 : emit_reduc_half (half, vec, i);
19932 82898 : if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
19933 : dst = dest;
19934 : else
19935 20907 : dst = gen_reg_rtx (mode);
19936 41449 : emit_insn (fn (dst, half, vec));
19937 41449 : vec = dst;
19938 : }
19939 : }
19940 :
19941 : /* Output code to perform a conditional jump to LABEL, if C2 flag in
19942 : FP status register is set. */
19943 :
19944 : void
19945 284 : ix86_emit_fp_unordered_jump (rtx label)
19946 : {
19947 284 : rtx reg = gen_reg_rtx (HImode);
19948 284 : rtx_insn *insn;
19949 284 : rtx temp;
19950 : /* Emit the fnstsw pattern with the fresh HImode register REG as its operand. */
19951 284 : emit_insn (gen_x86_fnstsw_1 (reg));
19952 : /* Two ways to build the branch condition: the sahf pattern followed by an UNORDERED test of the flags register in CCmode, or a bit test of REG against 0x04 followed by an NE test in CCNOmode. */
19953 284 : if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
19954 : {
19955 37 : emit_insn (gen_x86_sahf_1 (reg));
19956 :
19957 37 : temp = gen_rtx_REG (CCmode, FLAGS_REG);
19958 37 : temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
19959 : }
19960 : else
19961 : {
19962 247 : emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
19963 :
19964 247 : temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
19965 247 : temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
19966 : }
19967 : /* Emit the conditional jump to LABEL, pass a 10% figure to predict_jump, and record LABEL as the insn's jump label. */
19968 284 : temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
19969 : gen_rtx_LABEL_REF (VOIDmode, label),
19970 : pc_rtx);
19971 284 : insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
19972 284 : predict_jump (REG_BR_PROB_BASE * 10 / 100);
19973 284 : JUMP_LABEL (insn) = label;
19974 284 : }
19975 :
19976 : /* Output code to perform a sinh XFmode calculation. OP0 receives the result, OP1 is the input. */
19977 :
19978 : void
19979 2 : ix86_emit_i387_sinh (rtx op0, rtx op1)
19980 : {
19981 2 : rtx e1 = gen_reg_rtx (XFmode);
19982 2 : rtx e2 = gen_reg_rtx (XFmode);
19983 2 : rtx scratch = gen_reg_rtx (HImode);
19984 2 : rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
19985 2 : rtx half = const_double_from_real_value (dconsthalf, XFmode);
19986 2 : rtx cst1, tmp;
19987 2 : rtx_code_label *jump_label = gen_label_rtx ();
19988 2 : rtx_insn *insn;
19989 : /* With t = expm1 (|op1|), t / (t + 1.0) + t equals exp (|op1|) - exp (-|op1|), so half of it is sinh (|op1|); the sign of op1 is applied before the final scaling. */
19990 : /* scratch = fxam (op1) */
19991 2 : emit_insn (gen_fxamxf2_i387 (scratch, op1));
19992 :
19993 : /* e1 = expm1 (|op1|) */
19994 2 : emit_insn (gen_absxf2 (e2, op1));
19995 2 : emit_insn (gen_expm1xf2 (e1, e2));
19996 :
19997 : /* e2 = e1 / (e1 + 1.0) + e1 */
19998 2 : cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
19999 2 : emit_insn (gen_addxf3 (e2, e1, cst1));
20000 2 : emit_insn (gen_divxf3 (e2, e1, e2));
20001 2 : emit_insn (gen_addxf3 (e2, e2, e1));
20002 :
20003 : /* flags = signbit (op1) */
20004 2 : emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
20005 :
20006 : /* if (flags) then e2 = -e2; implemented as a branch over the negation when the test result is zero. */
20007 2 : tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
20008 : gen_rtx_EQ (VOIDmode, flags, const0_rtx),
20009 : gen_rtx_LABEL_REF (VOIDmode, jump_label),
20010 : pc_rtx);
20011 2 : insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
20012 2 : predict_jump (REG_BR_PROB_BASE * 50 / 100);
20013 2 : JUMP_LABEL (insn) = jump_label;
20014 : /* Reached only when the branch above is not taken, i.e. the tested bit was set. */
20015 2 : emit_insn (gen_negxf2 (e2, e2));
20016 :
20017 2 : emit_label (jump_label);
20018 2 : LABEL_NUSES (jump_label) = 1;
20019 :
20020 : /* op0 = 0.5 * e2 */
20021 2 : half = force_reg (XFmode, half);
20022 2 : emit_insn (gen_mulxf3 (op0, e2, half));
20023 2 : }
20024 :
20025 : /* Output code to perform a cosh XFmode calculation. OP0 receives the result, OP1 is the input. */
20026 :
20027 : void
20028 3 : ix86_emit_i387_cosh (rtx op0, rtx op1)
20029 : {
20030 3 : rtx e1 = gen_reg_rtx (XFmode);
20031 3 : rtx e2 = gen_reg_rtx (XFmode);
20032 3 : rtx half = const_double_from_real_value (dconsthalf, XFmode);
20033 3 : rtx cst1;
20034 : /* Computes 0.5 * (e + 1.0 / e) with e = exp (op1), i.e. (exp (op1) + exp (-op1)) / 2. */
20035 : /* e1 = exp (op1) */
20036 3 : emit_insn (gen_expxf2 (e1, op1));
20037 :
20038 : /* e2 = e1 + 1.0 / e1 */
20039 3 : cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
20040 3 : emit_insn (gen_divxf3 (e2, cst1, e1));
20041 3 : emit_insn (gen_addxf3 (e2, e1, e2));
20042 :
20043 : /* op0 = 0.5 * e2 */
20044 3 : half = force_reg (XFmode, half);
20045 3 : emit_insn (gen_mulxf3 (op0, e2, half));
20046 3 : }
20047 :
20048 : /* Output code to perform a tanh XFmode calculation. OP0 receives the result, OP1 is the input. */
20049 :
20050 : void
20051 1 : ix86_emit_i387_tanh (rtx op0, rtx op1)
20052 : {
20053 1 : rtx e1 = gen_reg_rtx (XFmode);
20054 1 : rtx e2 = gen_reg_rtx (XFmode);
20055 1 : rtx scratch = gen_reg_rtx (HImode);
20056 1 : rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
20057 1 : rtx cst2, tmp;
20058 1 : rtx_code_label *jump_label = gen_label_rtx ();
20059 1 : rtx_insn *insn;
20060 : /* With t = expm1 (-|2 * op1|), t / (t + 2.0) equals -tanh (|op1|); it is negated below unless the sign bit test is nonzero. */
20061 : /* scratch = fxam (op1) */
20062 1 : emit_insn (gen_fxamxf2_i387 (scratch, op1));
20063 :
20064 : /* e1 = expm1 (-|2 * op1|) */
20065 1 : emit_insn (gen_addxf3 (e2, op1, op1));
20066 1 : emit_insn (gen_absxf2 (e2, e2));
20067 1 : emit_insn (gen_negxf2 (e2, e2));
20068 1 : emit_insn (gen_expm1xf2 (e1, e2));
20069 :
20070 : /* e2 = e1 / (e1 + 2.0) */
20071 1 : cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
20072 1 : emit_insn (gen_addxf3 (e2, e1, cst2));
20073 1 : emit_insn (gen_divxf3 (e2, e1, e2));
20074 :
20075 : /* flags = signbit (op1) */
20076 1 : emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
20077 :
20078 : /* if (!flags) then e2 = -e2; implemented as a branch over the negation when the test result is nonzero. */
20079 1 : tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
20080 : gen_rtx_NE (VOIDmode, flags, const0_rtx),
20081 : gen_rtx_LABEL_REF (VOIDmode, jump_label),
20082 : pc_rtx);
20083 1 : insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
20084 1 : predict_jump (REG_BR_PROB_BASE * 50 / 100);
20085 1 : JUMP_LABEL (insn) = jump_label;
20086 : /* Reached only when the branch above is not taken, i.e. the tested bit was clear. */
20087 1 : emit_insn (gen_negxf2 (e2, e2));
20088 :
20089 1 : emit_label (jump_label);
20090 1 : LABEL_NUSES (jump_label) = 1;
20091 :
20092 1 : emit_move_insn (op0, e2);
20093 1 : }
20094 :
20095 : /* Output code to perform an asinh XFmode calculation. OP0 receives the result, OP1 is the input. */
20096 :
20097 : void
20098 0 : ix86_emit_i387_asinh (rtx op0, rtx op1)
20099 : {
20100 0 : rtx e1 = gen_reg_rtx (XFmode);
20101 0 : rtx e2 = gen_reg_rtx (XFmode);
20102 0 : rtx scratch = gen_reg_rtx (HImode);
20103 0 : rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
20104 0 : rtx cst1, tmp;
20105 0 : rtx_code_label *jump_label = gen_label_rtx ();
20106 0 : rtx_insn *insn;
20107 : /* Computes log1p (|x| + x^2 / (sqrt (x^2 + 1.0) + 1.0)) for x = op1; the quotient equals sqrt (x^2 + 1.0) - 1.0, so this is log (|x| + sqrt (x^2 + 1.0)) = asinh (|x|). The sign of op1 is applied at the end. */
20108 : /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
20109 0 : emit_insn (gen_mulxf3 (e1, op1, op1));
20110 0 : cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
20111 0 : emit_insn (gen_addxf3 (e2, e1, cst1));
20112 0 : emit_insn (gen_sqrtxf2 (e2, e2));
20113 0 : emit_insn (gen_addxf3 (e2, e2, cst1));
20114 :
20115 : /* e1 = e1 / e2 */
20116 0 : emit_insn (gen_divxf3 (e1, e1, e2));
20117 :
20118 : /* scratch = fxam (op1) */
20119 0 : emit_insn (gen_fxamxf2_i387 (scratch, op1));
20120 :
20121 : /* e1 = e1 + |op1| */
20122 0 : emit_insn (gen_absxf2 (e2, op1));
20123 0 : emit_insn (gen_addxf3 (e1, e1, e2));
20124 :
20125 : /* e2 = log1p (e1) */
20126 0 : ix86_emit_i387_log1p (e2, e1);
20127 :
20128 : /* flags = signbit (op1) */
20129 0 : emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
20130 :
20131 : /* if (flags) then e2 = -e2; implemented as a branch over the negation when the test result is zero. */
20132 0 : tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
20133 : gen_rtx_EQ (VOIDmode, flags, const0_rtx),
20134 : gen_rtx_LABEL_REF (VOIDmode, jump_label),
20135 : pc_rtx);
20136 0 : insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
20137 0 : predict_jump (REG_BR_PROB_BASE * 50 / 100);
20138 0 : JUMP_LABEL (insn) = jump_label;
20139 : /* Reached only when the branch above is not taken, i.e. the tested bit was set. */
20140 0 : emit_insn (gen_negxf2 (e2, e2));
20141 :
20142 0 : emit_label (jump_label);
20143 0 : LABEL_NUSES (jump_label) = 1;
20144 :
20145 0 : emit_move_insn (op0, e2);
20146 0 : }
20147 :
20148 : /* Output code to perform an acosh XFmode calculation. OP0 receives the result, OP1 is the input. */
20149 :
20150 : void
20151 0 : ix86_emit_i387_acosh (rtx op0, rtx op1)
20152 : {
20153 0 : rtx e1 = gen_reg_rtx (XFmode);
20154 0 : rtx e2 = gen_reg_rtx (XFmode);
20155 0 : rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
20156 : /* Computes log (op1 + sqrt (op1 - 1.0) * sqrt (op1 + 1.0)); no sign handling is emitted in this function. */
20157 : /* e2 = sqrt (op1 + 1.0) */
20158 0 : emit_insn (gen_addxf3 (e2, op1, cst1));
20159 0 : emit_insn (gen_sqrtxf2 (e2, e2));
20160 :
20161 : /* e1 = sqrt (op1 - 1.0) */
20162 0 : emit_insn (gen_subxf3 (e1, op1, cst1));
20163 0 : emit_insn (gen_sqrtxf2 (e1, e1));
20164 :
20165 : /* e1 = e1 * e2 */
20166 0 : emit_insn (gen_mulxf3 (e1, e1, e2));
20167 :
20168 : /* e1 = e1 + op1 */
20169 0 : emit_insn (gen_addxf3 (e1, e1, op1));
20170 :
20171 : /* op0 = log (e1) */
20172 0 : emit_insn (gen_logxf2 (op0, e1));
20173 0 : }
20174 :
20175 : /* Output code to perform an atanh XFmode calculation. OP0 receives the result, OP1 is the input. */
20176 :
20177 : void
20178 4 : ix86_emit_i387_atanh (rtx op0, rtx op1)
20179 : {
20180 4 : rtx e1 = gen_reg_rtx (XFmode);
20181 4 : rtx e2 = gen_reg_rtx (XFmode);
20182 4 : rtx scratch = gen_reg_rtx (HImode);
20183 4 : rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
20184 4 : rtx half = const_double_from_real_value (dconsthalf, XFmode);
20185 4 : rtx cst1, tmp;
20186 4 : rtx_code_label *jump_label = gen_label_rtx ();
20187 4 : rtx_insn *insn;
20188 : /* With e1 = -2 * |op1| / (|op1| + 1.0), log1p (e1) equals log ((1 - |op1|) / (1 + |op1|)) = -2 * atanh (|op1|); the sign is fixed up below and the result is halved. */
20189 : /* scratch = fxam (op1) */
20190 4 : emit_insn (gen_fxamxf2_i387 (scratch, op1));
20191 :
20192 : /* e2 = |op1| */
20193 4 : emit_insn (gen_absxf2 (e2, op1));
20194 :
20195 : /* e1 = -(e2 + e2) / (e2 + 1.0) */
20196 4 : cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
20197 4 : emit_insn (gen_addxf3 (e1, e2, cst1));
20198 4 : emit_insn (gen_addxf3 (e2, e2, e2));
20199 4 : emit_insn (gen_negxf2 (e2, e2));
20200 4 : emit_insn (gen_divxf3 (e1, e2, e1));
20201 :
20202 : /* e2 = log1p (e1) */
20203 4 : ix86_emit_i387_log1p (e2, e1);
20204 :
20205 : /* flags = signbit (op1) */
20206 4 : emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
20207 :
20208 : /* if (!flags) then e2 = -e2; implemented as a branch over the negation when the test result is nonzero. */
20209 4 : tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
20210 : gen_rtx_NE (VOIDmode, flags, const0_rtx),
20211 : gen_rtx_LABEL_REF (VOIDmode, jump_label),
20212 : pc_rtx);
20213 4 : insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
20214 4 : predict_jump (REG_BR_PROB_BASE * 50 / 100);
20215 4 : JUMP_LABEL (insn) = jump_label;
20216 : /* Reached only when the branch above is not taken, i.e. the tested bit was clear. */
20217 4 : emit_insn (gen_negxf2 (e2, e2));
20218 :
20219 4 : emit_label (jump_label);
20220 4 : LABEL_NUSES (jump_label) = 1;
20221 :
20222 : /* op0 = 0.5 * e2 */
20223 4 : half = force_reg (XFmode, half);
20224 4 : emit_insn (gen_mulxf3 (op0, e2, half));
20225 4 : }
20226 :
20227 : /* Output code to perform a log1p XFmode calculation. OP0 receives the result, OP1 is the input. */
20228 :
20229 : void
20230 5 : ix86_emit_i387_log1p (rtx op0, rtx op1)
20231 : {
20232 5 : rtx_code_label *label1 = gen_label_rtx ();
20233 5 : rtx_code_label *label2 = gen_label_rtx ();
20234 : /* Two code paths are emitted: the fyl2xp1 pattern on op1 directly when |op1| is below the threshold CST, otherwise 1.0 + op1 is formed explicitly and the fyl2x pattern is used. Both write RES, which is copied to OP0 at the end. */
20235 5 : rtx tmp = gen_reg_rtx (XFmode);
20236 5 : rtx res = gen_reg_rtx (XFmode);
20237 5 : rtx cst, cstln2, cst1;
20238 5 : rtx_insn *insn;
20239 :
20240 : /* The emit_jump call emits pending stack adjust, make sure it is emitted
20241 : before the conditional jump, otherwise the stack adjustment will be
20242 : only conditional. */
20243 5 : do_pending_stack_adjust ();
20244 : /* The threshold is numerically 1 - sqrt (2) / 2. NOTE(review): presumably the documented operand range limit of the fyl2xp1 instruction -- confirm against the Intel SDM. */
20245 5 : cst = const_double_from_real_value
20246 5 : (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
20247 5 : cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */
20248 :
20249 5 : emit_insn (gen_absxf2 (tmp, op1));
20250 : /* if (|op1| >= cst) goto label1; a 10% figure is passed to predict_jump for this branch. */
20251 5 : cst = force_reg (XFmode, cst);
20252 5 : ix86_expand_branch (GE, tmp, cst, label1);
20253 5 : predict_jump (REG_BR_PROB_BASE * 10 / 100);
20254 5 : insn = get_last_insn ();
20255 5 : JUMP_LABEL (insn) = label1;
20256 : /* Fall-through path (|op1| below the threshold): res = fyl2xp1 pattern on (op1, cstln2), then jump past the other path. */
20257 5 : emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
20258 5 : emit_jump (label2);
20259 :
20260 5 : emit_label (label1);
20261 5 : LABEL_NUSES (label1) = 1;
20262 : /* label1 path: tmp = op1 + 1.0; res = fyl2x pattern on (tmp, cstln2). */
20263 5 : cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
20264 5 : emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
20265 5 : emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));
20266 :
20267 5 : emit_label (label2);
20268 5 : LABEL_NUSES (label2) = 1;
20269 :
20270 5 : emit_move_insn (op0, res);
20271 5 : }
20272 :
20273 : /* Emit code for round calculation. */
20274 : void
20275 60 : ix86_emit_i387_round (rtx op0, rtx op1)
20276 : {
20277 60 : machine_mode inmode = GET_MODE (op1);
20278 60 : machine_mode outmode = GET_MODE (op0);
20279 60 : rtx e1 = gen_reg_rtx (XFmode);
20280 60 : rtx e2 = gen_reg_rtx (XFmode);
20281 60 : rtx scratch = gen_reg_rtx (HImode);
20282 60 : rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
20283 60 : rtx half = const_double_from_real_value (dconsthalf, XFmode);
20284 60 : rtx res = gen_reg_rtx (outmode);
20285 60 : rtx_code_label *jump_label = gen_label_rtx ();
20286 60 : rtx (*floor_insn) (rtx, rtx);
20287 60 : rtx (*neg_insn) (rtx, rtx);
20288 60 : rtx_insn *insn;
20289 60 : rtx tmp;
20290 :
20291 60 : switch (inmode)
20292 : {
20293 29 : case E_SFmode:
20294 29 : case E_DFmode:
20295 29 : tmp = gen_reg_rtx (XFmode);
20296 :
20297 29 : emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
20298 29 : op1 = tmp;
20299 29 : break;
20300 : case E_XFmode:
20301 : break;
20302 0 : default:
20303 0 : gcc_unreachable ();
20304 : }
20305 :
20306 60 : switch (outmode)
20307 : {
20308 : case E_SFmode:
20309 : floor_insn = gen_frndintxf2_floor;
20310 : neg_insn = gen_negsf2;
20311 : break;
20312 6 : case E_DFmode:
20313 6 : floor_insn = gen_frndintxf2_floor;
20314 6 : neg_insn = gen_negdf2;
20315 6 : break;
20316 10 : case E_XFmode:
20317 10 : floor_insn = gen_frndintxf2_floor;
20318 10 : neg_insn = gen_negxf2;
20319 10 : break;
20320 0 : case E_HImode:
20321 0 : floor_insn = gen_lfloorxfhi2;
20322 0 : neg_insn = gen_neghi2;
20323 0 : break;
20324 6 : case E_SImode:
20325 6 : floor_insn = gen_lfloorxfsi2;
20326 6 : neg_insn = gen_negsi2;
20327 6 : break;
20328 36 : case E_DImode:
20329 36 : floor_insn = gen_lfloorxfdi2;
20330 36 : neg_insn = gen_negdi2;
20331 36 : break;
20332 0 : default:
20333 0 : gcc_unreachable ();
20334 : }
20335 :
20336 : /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */
20337 :
20338 : /* scratch = fxam(op1) */
20339 60 : emit_insn (gen_fxamxf2_i387 (scratch, op1));
20340 :
20341 : /* e1 = fabs(op1) */
20342 60 : emit_insn (gen_absxf2 (e1, op1));
20343 :
20344 : /* e2 = e1 + 0.5 */
20345 60 : half = force_reg (XFmode, half);
20346 60 : emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));
20347 :
20348 : /* res = floor(e2) */
20349 60 : switch (outmode)
20350 : {
20351 8 : case E_SFmode:
20352 8 : case E_DFmode:
20353 8 : {
20354 8 : tmp = gen_reg_rtx (XFmode);
20355 :
20356 8 : emit_insn (floor_insn (tmp, e2));
20357 8 : emit_insn (gen_rtx_SET (res,
20358 : gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
20359 : UNSPEC_TRUNC_NOOP)));
20360 : }
20361 8 : break;
20362 52 : default:
20363 52 : emit_insn (floor_insn (res, e2));
20364 : }
20365 :
20366 : /* flags = signbit(a) */
20367 60 : emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));
20368 :
20369 : /* if (flags) then res = -res */
20370 60 : tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
20371 : gen_rtx_EQ (VOIDmode, flags, const0_rtx),
20372 : gen_rtx_LABEL_REF (VOIDmode, jump_label),
20373 : pc_rtx);
20374 60 : insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
20375 60 : predict_jump (REG_BR_PROB_BASE * 50 / 100);
20376 60 : JUMP_LABEL (insn) = jump_label;
20377 :
20378 60 : emit_insn (neg_insn (res, res));
20379 :
20380 60 : emit_label (jump_label);
20381 60 : LABEL_NUSES (jump_label) = 1;
20382 :
20383 60 : emit_move_insn (op0, res);
20384 60 : }
20385 :
20386 : /* Output code to perform a Newton-Rhapson approximation of a single precision
20387 : floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm]. */
20388 :
20389 : void
20390 56 : ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
20391 : {
20392 56 : rtx x0, x1, e0, e1;
20393 :
20394 56 : x0 = gen_reg_rtx (mode);
20395 56 : e0 = gen_reg_rtx (mode);
20396 56 : e1 = gen_reg_rtx (mode);
20397 56 : x1 = gen_reg_rtx (mode);
20398 :
20399 56 : b = force_reg (mode, b);
20400 :
20401 : /* x0 = rcp(b) estimate */
20402 56 : if (mode == V16SFmode || mode == V8DFmode)
20403 : {
20404 0 : emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
20405 : UNSPEC_RCP14)));
20406 : }
20407 : else
20408 56 : emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
20409 : UNSPEC_RCP)));
20410 :
20411 56 : unsigned vector_size = GET_MODE_SIZE (mode);
20412 :
20413 : /* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a
20414 : N-R step with 2 fma implementation. */
20415 56 : if (TARGET_FMA
20416 55 : || (TARGET_AVX512F && vector_size == 64)
20417 55 : || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
20418 : {
20419 : /* e0 = x0 * a */
20420 1 : emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
20421 : /* e1 = e0 * b - a */
20422 1 : emit_insn (gen_rtx_SET (e1, gen_rtx_FMA (mode, e0, b,
20423 : gen_rtx_NEG (mode, a))));
20424 : /* res = - e1 * x0 + e0 */
20425 1 : emit_insn (gen_rtx_SET (res, gen_rtx_FMA (mode,
20426 : gen_rtx_NEG (mode, e1),
20427 : x0, e0)));
20428 : }
20429 : else
20430 : /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
20431 : {
20432 : /* e0 = x0 * b */
20433 55 : emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));
20434 :
20435 : /* e1 = x0 + x0 */
20436 55 : emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));
20437 :
20438 : /* e0 = x0 * e0 */
20439 55 : emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));
20440 :
20441 : /* x1 = e1 - e0 */
20442 55 : emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));
20443 :
20444 : /* res = a * x1 */
20445 55 : emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
20446 : }
20447 56 : }
20448 :
20449 : /* Output code to perform a Newton-Rhapson approximation of a
20450 : single precision floating point [reciprocal] square root. */
20451 :
20452 : void
20453 85 : ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
20454 : {
20455 85 : rtx x0, e0, e1, e2, e3, mthree, mhalf;
20456 85 : REAL_VALUE_TYPE r;
20457 85 : int unspec;
20458 :
20459 85 : x0 = gen_reg_rtx (mode);
20460 85 : e0 = gen_reg_rtx (mode);
20461 85 : e1 = gen_reg_rtx (mode);
20462 85 : e2 = gen_reg_rtx (mode);
20463 85 : e3 = gen_reg_rtx (mode);
20464 :
20465 85 : real_from_integer (&r, VOIDmode, -3, SIGNED);
20466 85 : mthree = const_double_from_real_value (r, SFmode);
20467 :
20468 85 : real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
20469 85 : mhalf = const_double_from_real_value (r, SFmode);
20470 85 : unspec = UNSPEC_RSQRT;
20471 :
20472 85 : if (VECTOR_MODE_P (mode))
20473 : {
20474 66 : mthree = ix86_build_const_vector (mode, true, mthree);
20475 66 : mhalf = ix86_build_const_vector (mode, true, mhalf);
20476 : /* There is no 512-bit rsqrt. There is however rsqrt14. */
20477 132 : if (GET_MODE_SIZE (mode) == 64)
20478 0 : unspec = UNSPEC_RSQRT14;
20479 : }
20480 :
20481 : /* sqrt(a) = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
20482 : rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */
20483 :
20484 85 : a = force_reg (mode, a);
20485 :
20486 : /* x0 = rsqrt(a) estimate */
20487 85 : emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
20488 : unspec)));
20489 :
20490 : /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0). */
20491 85 : if (!recip)
20492 : {
20493 57 : rtx zero = force_reg (mode, CONST0_RTX(mode));
20494 57 : rtx mask;
20495 :
20496 : /* Handle masked compare. */
20497 110 : if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
20498 : {
20499 0 : mask = gen_reg_rtx (HImode);
20500 : /* Imm value 0x4 corresponds to not-equal comparison. */
20501 0 : emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
20502 0 : emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
20503 : }
20504 : else
20505 : {
20506 57 : mask = gen_reg_rtx (mode);
20507 57 : emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
20508 57 : emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
20509 : }
20510 : }
20511 :
20512 85 : mthree = force_reg (mode, mthree);
20513 :
20514 : /* e0 = x0 * a */
20515 85 : emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
20516 :
20517 85 : unsigned vector_size = GET_MODE_SIZE (mode);
20518 85 : if (TARGET_FMA
20519 77 : || (TARGET_AVX512F && vector_size == 64)
20520 77 : || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
20521 16 : emit_insn (gen_rtx_SET (e2,
20522 : gen_rtx_FMA (mode, e0, x0, mthree)));
20523 : else
20524 : {
20525 : /* e1 = e0 * x0 */
20526 69 : emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));
20527 :
20528 : /* e2 = e1 - 3. */
20529 69 : emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
20530 : }
20531 :
20532 85 : mhalf = force_reg (mode, mhalf);
20533 85 : if (recip)
20534 : /* e3 = -.5 * x0 */
20535 28 : emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
20536 : else
20537 : /* e3 = -.5 * e0 */
20538 57 : emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
20539 : /* ret = e2 * e3 */
20540 85 : emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
20541 85 : }
20542 :
20543 : /* Expand fabs (OP0) and return a new rtx that holds the result. The
20544 : mask for masking out the sign-bit is stored in *SMASK, if that is
20545 : non-null. */
20546 :
20547 : static rtx
20548 1048 : ix86_expand_sse_fabs (rtx op0, rtx *smask)
20549 : {
20550 1048 : machine_mode vmode, mode = GET_MODE (op0);
20551 1048 : rtx xa, mask;
20552 :
20553 1048 : xa = gen_reg_rtx (mode);
20554 1048 : if (mode == SFmode)
20555 : vmode = V4SFmode;
20556 466 : else if (mode == DFmode)
20557 : vmode = V2DFmode;
20558 : else
20559 0 : vmode = mode;
20560 1048 : mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
20561 1048 : if (!VECTOR_MODE_P (mode))
20562 : {
20563 : /* We need to generate a scalar mode mask in this case. */
20564 1048 : rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
20565 1048 : tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
20566 1048 : mask = gen_reg_rtx (mode);
20567 1048 : emit_insn (gen_rtx_SET (mask, tmp));
20568 : }
20569 1048 : emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
20570 :
20571 1048 : if (smask)
20572 995 : *smask = mask;
20573 :
20574 1048 : return xa;
20575 : }
20576 :
20577 : /* Expands a comparison of OP0 with OP1 using comparison code CODE,
20578 : swapping the operands if SWAP_OPERANDS is true. The expanded
20579 : code is a forward jump to a newly created label in case the
20580 : comparison is true. The generated label rtx is returned. */
20581 : static rtx_code_label *
20582 1063 : ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
20583 : bool swap_operands)
20584 : {
20585 1063 : bool unordered_compare = ix86_unordered_fp_compare (code);
20586 1063 : rtx_code_label *label;
20587 1063 : rtx tmp, reg;
20588 :
20589 1063 : if (swap_operands)
20590 34 : std::swap (op0, op1);
20591 :
20592 1063 : label = gen_label_rtx ();
20593 1063 : tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
20594 1063 : if (unordered_compare)
20595 907 : tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
20596 1063 : reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
20597 1063 : emit_insn (gen_rtx_SET (reg, tmp));
20598 1063 : tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
20599 1063 : tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
20600 : gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
20601 1063 : tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
20602 1063 : JUMP_LABEL (tmp) = label;
20603 :
20604 1063 : return label;
20605 : }
20606 :
20607 : /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
20608 : using comparison code CODE. Operands are swapped for the comparison if
20609 : SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
20610 : static rtx
20611 539 : ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
20612 : bool swap_operands)
20613 : {
20614 539 : rtx (*insn)(rtx, rtx, rtx, rtx);
20615 539 : machine_mode mode = GET_MODE (op0);
20616 539 : rtx mask = gen_reg_rtx (mode);
20617 :
20618 539 : if (swap_operands)
20619 362 : std::swap (op0, op1);
20620 :
20621 539 : insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
20622 :
20623 539 : emit_insn (insn (mask, op0, op1,
20624 : gen_rtx_fmt_ee (code, mode, op0, op1)));
20625 539 : return mask;
20626 : }
20627 :
20628 : /* Expand copysign from SIGN to the positive value ABS_VALUE
20629 : storing in RESULT. If MASK is non-null, it shall be a mask to mask out
20630 : the sign-bit. */
20631 :
20632 : static void
20633 1015 : ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
20634 : {
20635 1015 : machine_mode mode = GET_MODE (sign);
20636 1015 : rtx sgn = gen_reg_rtx (mode);
20637 1015 : if (mask == NULL_RTX)
20638 : {
20639 28 : machine_mode vmode;
20640 :
20641 28 : if (mode == SFmode)
20642 : vmode = V4SFmode;
20643 : else if (mode == DFmode)
20644 : vmode = V2DFmode;
20645 : else if (mode == HFmode)
20646 : vmode = V8HFmode;
20647 : else
20648 28 : vmode = mode;
20649 :
20650 28 : mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
20651 28 : if (!VECTOR_MODE_P (mode))
20652 : {
20653 : /* We need to generate a scalar mode mask in this case. */
20654 28 : rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
20655 28 : tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
20656 28 : mask = gen_reg_rtx (mode);
20657 28 : emit_insn (gen_rtx_SET (mask, tmp));
20658 : }
20659 : }
20660 : else
20661 987 : mask = gen_rtx_NOT (mode, mask);
20662 1015 : emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
20663 1015 : emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
20664 1015 : }
20665 :
20666 : /* Expand SSE sequence for computing lround from OP1 storing
20667 : into OP0. */
20668 :
20669 : void
20670 28 : ix86_expand_lround (rtx op0, rtx op1)
20671 : {
20672 : /* C code for the stuff we're doing below:
20673 : tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
20674 : return (long)tmp;
20675 : */
20676 28 : machine_mode mode = GET_MODE (op1);
20677 28 : const struct real_format *fmt;
20678 28 : REAL_VALUE_TYPE pred_half, half_minus_pred_half;
20679 28 : rtx adj;
20680 :
20681 : /* load nextafter (0.5, 0.0) */
20682 28 : fmt = REAL_MODE_FORMAT (mode);
20683 28 : real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
20684 28 : real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
20685 :
20686 : /* adj = copysign (0.5, op1) */
20687 28 : adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
20688 28 : ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);
20689 :
20690 : /* adj = op1 + adj */
20691 28 : adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);
20692 :
20693 : /* op0 = (imode)adj */
20694 28 : expand_fix (op0, adj, 0);
20695 28 : }
20696 :
20697 : /* Expand SSE2 sequence for computing lround from OPERAND1 storing
20698 : into OPERAND0. */
20699 :
20700 : void
20701 68 : ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
20702 : {
20703 : /* C code for the stuff we're doing below (for do_floor):
20704 : xi = (long)op1;
20705 : xi -= (double)xi > op1 ? 1 : 0;
20706 : return xi;
20707 : */
20708 68 : machine_mode fmode = GET_MODE (op1);
20709 68 : machine_mode imode = GET_MODE (op0);
20710 68 : rtx ireg, freg, tmp;
20711 68 : rtx_code_label *label;
20712 :
20713 : /* reg = (long)op1 */
20714 68 : ireg = gen_reg_rtx (imode);
20715 68 : expand_fix (ireg, op1, 0);
20716 :
20717 : /* freg = (double)reg */
20718 68 : freg = gen_reg_rtx (fmode);
20719 68 : expand_float (freg, ireg, 0);
20720 :
20721 : /* ireg = (freg > op1) ? ireg - 1 : ireg */
20722 136 : label = ix86_expand_sse_compare_and_jump (UNLE,
20723 68 : freg, op1, !do_floor);
20724 102 : tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
20725 : ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
20726 68 : emit_move_insn (ireg, tmp);
20727 :
20728 68 : emit_label (label);
20729 68 : LABEL_NUSES (label) = 1;
20730 :
20731 68 : emit_move_insn (op0, ireg);
20732 68 : }
20733 :
20734 : /* Generate and return a rtx of mode MODE for 2**n where n is the number
20735 : of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
20736 :
20737 : static rtx
20738 995 : ix86_gen_TWO52 (machine_mode mode)
20739 : {
20740 995 : const struct real_format *fmt;
20741 995 : REAL_VALUE_TYPE TWO52r;
20742 995 : rtx TWO52;
20743 :
20744 995 : fmt = REAL_MODE_FORMAT (mode);
20745 995 : real_2expN (&TWO52r, fmt->p - 1, mode);
20746 995 : TWO52 = const_double_from_real_value (TWO52r, mode);
20747 995 : TWO52 = force_reg (mode, TWO52);
20748 :
20749 995 : return TWO52;
20750 : }
20751 :
20752 : /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
20753 :
20754 : void
20755 121 : ix86_expand_rint (rtx operand0, rtx operand1)
20756 : {
20757 : /* C code for the stuff we're doing below:
20758 : xa = fabs (operand1);
20759 : if (!isless (xa, 2**52))
20760 : return operand1;
20761 : two52 = 2**52;
20762 : if (flag_rounding_math)
20763 : {
20764 : two52 = copysign (two52, operand1);
20765 : xa = operand1;
20766 : }
20767 : xa = xa + two52 - two52;
20768 : return copysign (xa, operand1);
20769 : */
20770 121 : machine_mode mode = GET_MODE (operand0);
20771 121 : rtx res, xa, TWO52, mask;
20772 121 : rtx_code_label *label;
20773 :
20774 121 : TWO52 = ix86_gen_TWO52 (mode);
20775 :
20776 : /* Temporary for holding the result, initialized to the input
20777 : operand to ease control flow. */
20778 121 : res = copy_to_reg (operand1);
20779 :
20780 : /* xa = abs (operand1) */
20781 121 : xa = ix86_expand_sse_fabs (res, &mask);
20782 :
20783 : /* if (!isless (xa, TWO52)) goto label; */
20784 121 : label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
20785 :
20786 121 : if (flag_rounding_math)
20787 : {
20788 53 : ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
20789 53 : xa = res;
20790 : }
20791 :
20792 121 : xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
20793 121 : xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
20794 :
20795 : /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
20796 121 : if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
20797 53 : xa = ix86_expand_sse_fabs (xa, NULL);
20798 :
20799 121 : ix86_sse_copysign_to_positive (res, xa, res, mask);
20800 :
20801 121 : emit_label (label);
20802 121 : LABEL_NUSES (label) = 1;
20803 :
20804 121 : emit_move_insn (operand0, res);
20805 121 : }
20806 :
20807 : /* Expand SSE2 sequence for computing floor or ceil
20808 : from OPERAND1 storing into OPERAND0. */
20809 : void
20810 539 : ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
20811 : {
20812 : /* C code for the stuff we expand below.
20813 : double xa = fabs (x), x2;
20814 : if (!isless (xa, TWO52))
20815 : return x;
20816 : x2 = (double)(long)x;
20817 :
20818 : Compensate. Floor:
20819 : if (x2 > x)
20820 : x2 -= 1;
20821 : Compensate. Ceil:
20822 : if (x2 < x)
20823 : x2 += 1;
20824 :
20825 : if (HONOR_SIGNED_ZEROS (mode))
20826 : return copysign (x2, x);
20827 : return x2;
20828 : */
20829 539 : machine_mode mode = GET_MODE (operand0);
20830 539 : rtx xa, xi, TWO52, tmp, one, res, mask;
20831 539 : rtx_code_label *label;
20832 :
20833 539 : TWO52 = ix86_gen_TWO52 (mode);
20834 :
20835 : /* Temporary for holding the result, initialized to the input
20836 : operand to ease control flow. */
20837 539 : res = copy_to_reg (operand1);
20838 :
20839 : /* xa = abs (operand1) */
20840 539 : xa = ix86_expand_sse_fabs (res, &mask);
20841 :
20842 : /* if (!isless (xa, TWO52)) goto label; */
20843 539 : label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
20844 :
20845 : /* xa = (double)(long)x */
20846 539 : xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
20847 539 : expand_fix (xi, res, 0);
20848 539 : expand_float (xa, xi, 0);
20849 :
20850 : /* generate 1.0 */
20851 539 : one = force_reg (mode, const_double_from_real_value (dconst1, mode));
20852 :
20853 : /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
20854 539 : tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
20855 539 : emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
20856 901 : tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
20857 : xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
20858 539 : if (HONOR_SIGNED_ZEROS (mode))
20859 : {
20860 : /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
20861 492 : if (do_floor && flag_rounding_math)
20862 0 : tmp = ix86_expand_sse_fabs (tmp, NULL);
20863 :
20864 492 : ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
20865 : }
20866 539 : emit_move_insn (res, tmp);
20867 :
20868 539 : emit_label (label);
20869 539 : LABEL_NUSES (label) = 1;
20870 :
20871 539 : emit_move_insn (operand0, res);
20872 539 : }
20873 :
20874 : /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
20875 : into OPERAND0 without relying on DImode truncation via cvttsd2siq
20876 : that is only available on 64bit targets. */
20877 : void
20878 0 : ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
20879 : {
20880 : /* C code for the stuff we expand below.
20881 : double xa = fabs (x), x2;
20882 : if (!isless (xa, TWO52))
20883 : return x;
20884 : xa = xa + TWO52 - TWO52;
20885 : x2 = copysign (xa, x);
20886 :
20887 : Compensate. Floor:
20888 : if (x2 > x)
20889 : x2 -= 1;
20890 : Compensate. Ceil:
20891 : if (x2 < x)
20892 : x2 += 1;
20893 :
20894 : if (HONOR_SIGNED_ZEROS (mode))
20895 : x2 = copysign (x2, x);
20896 : return x2;
20897 : */
20898 0 : machine_mode mode = GET_MODE (operand0);
20899 0 : rtx xa, TWO52, tmp, one, res, mask;
20900 0 : rtx_code_label *label;
20901 :
20902 0 : TWO52 = ix86_gen_TWO52 (mode);
20903 :
20904 : /* Temporary for holding the result, initialized to the input
20905 : operand to ease control flow. */
20906 0 : res = copy_to_reg (operand1);
20907 :
20908 : /* xa = abs (operand1) */
20909 0 : xa = ix86_expand_sse_fabs (res, &mask);
20910 :
20911 : /* if (!isless (xa, TWO52)) goto label; */
20912 0 : label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
20913 :
20914 : /* xa = xa + TWO52 - TWO52; */
20915 0 : xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
20916 0 : xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);
20917 :
20918 : /* xa = copysign (xa, operand1) */
20919 0 : ix86_sse_copysign_to_positive (xa, xa, res, mask);
20920 :
20921 : /* generate 1.0 */
20922 0 : one = force_reg (mode, const_double_from_real_value (dconst1, mode));
20923 :
20924 : /* Compensate: xa = xa - (xa > operand1 ? 1 : 0) */
20925 0 : tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
20926 0 : emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
20927 0 : tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
20928 : xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
20929 0 : if (HONOR_SIGNED_ZEROS (mode))
20930 : {
20931 : /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
20932 0 : if (do_floor && flag_rounding_math)
20933 0 : tmp = ix86_expand_sse_fabs (tmp, NULL);
20934 :
20935 0 : ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
20936 : }
20937 0 : emit_move_insn (res, tmp);
20938 :
20939 0 : emit_label (label);
20940 0 : LABEL_NUSES (label) = 1;
20941 :
20942 0 : emit_move_insn (operand0, res);
20943 0 : }
20944 :
20945 : /* Expand SSE sequence for computing trunc
20946 : from OPERAND1 storing into OPERAND0. */
20947 : void
20948 321 : ix86_expand_trunc (rtx operand0, rtx operand1)
20949 : {
20950 : /* C code for SSE variant we expand below.
20951 : double xa = fabs (x), x2;
20952 : if (!isless (xa, TWO52))
20953 : return x;
20954 : x2 = (double)(long)x;
20955 : if (HONOR_SIGNED_ZEROS (mode))
20956 : return copysign (x2, x);
20957 : return x2;
20958 : */
20959 321 : machine_mode mode = GET_MODE (operand0);
20960 321 : rtx xa, xi, TWO52, res, mask;
20961 321 : rtx_code_label *label;
20962 :
20963 321 : TWO52 = ix86_gen_TWO52 (mode);
20964 :
20965 : /* Temporary for holding the result, initialized to the input
20966 : operand to ease control flow. */
20967 321 : res = copy_to_reg (operand1);
20968 :
20969 : /* xa = abs (operand1) */
20970 321 : xa = ix86_expand_sse_fabs (res, &mask);
20971 :
20972 : /* if (!isless (xa, TWO52)) goto label; */
20973 321 : label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
20974 :
20975 : /* xa = (double)(long)x */
20976 321 : xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
20977 321 : expand_fix (xi, res, 0);
20978 321 : expand_float (xa, xi, 0);
20979 :
20980 321 : if (HONOR_SIGNED_ZEROS (mode))
20981 307 : ix86_sse_copysign_to_positive (xa, xa, res, mask);
20982 :
20983 321 : emit_move_insn (res, xa);
20984 :
20985 321 : emit_label (label);
20986 321 : LABEL_NUSES (label) = 1;
20987 :
20988 321 : emit_move_insn (operand0, res);
20989 321 : }
20990 :
20991 : /* Expand SSE sequence for computing trunc from OPERAND1 storing
20992 : into OPERAND0 without relying on DImode truncation via cvttsd2siq
20993 : that is only available on 64bit targets. */
20994 : void
20995 0 : ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
20996 : {
20997 0 : machine_mode mode = GET_MODE (operand0);
20998 0 : rtx xa, xa2, TWO52, tmp, one, res, mask;
20999 0 : rtx_code_label *label;
21000 :
21001 : /* C code for SSE variant we expand below.
21002 : double xa = fabs (x), x2;
21003 : if (!isless (xa, TWO52))
21004 : return x;
21005 : xa2 = xa + TWO52 - TWO52;
21006 : Compensate:
21007 : if (xa2 > xa)
21008 : xa2 -= 1.0;
21009 : x2 = copysign (xa2, x);
21010 : return x2;
21011 : */
21012 :
21013 0 : TWO52 = ix86_gen_TWO52 (mode);
21014 :
21015 : /* Temporary for holding the result, initialized to the input
21016 : operand to ease control flow. */
21017 0 : res =copy_to_reg (operand1);
21018 :
21019 : /* xa = abs (operand1) */
21020 0 : xa = ix86_expand_sse_fabs (res, &mask);
21021 :
21022 : /* if (!isless (xa, TWO52)) goto label; */
21023 0 : label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21024 :
21025 : /* xa2 = xa + TWO52 - TWO52; */
21026 0 : xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
21027 0 : xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
21028 :
21029 : /* generate 1.0 */
21030 0 : one = force_reg (mode, const_double_from_real_value (dconst1, mode));
21031 :
21032 : /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
21033 0 : tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
21034 0 : emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
21035 0 : tmp = expand_simple_binop (mode, MINUS,
21036 : xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
21037 : /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
21038 0 : if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
21039 0 : tmp = ix86_expand_sse_fabs (tmp, NULL);
21040 :
21041 : /* res = copysign (xa2, operand1) */
21042 0 : ix86_sse_copysign_to_positive (res, tmp, res, mask);
21043 :
21044 0 : emit_label (label);
21045 0 : LABEL_NUSES (label) = 1;
21046 :
21047 0 : emit_move_insn (operand0, res);
21048 0 : }
21049 :
21050 : /* Expand SSE sequence for computing round
21051 : from OPERAND1 storing into OPERAND0. */
21052 : void
21053 14 : ix86_expand_round (rtx operand0, rtx operand1)
21054 : {
21055 : /* C code for the stuff we're doing below:
21056 : double xa = fabs (x);
21057 : if (!isless (xa, TWO52))
21058 : return x;
21059 : xa = (double)(long)(xa + nextafter (0.5, 0.0));
21060 : return copysign (xa, x);
21061 : */
21062 14 : machine_mode mode = GET_MODE (operand0);
21063 14 : rtx res, TWO52, xa, xi, half, mask;
21064 14 : rtx_code_label *label;
21065 14 : const struct real_format *fmt;
21066 14 : REAL_VALUE_TYPE pred_half, half_minus_pred_half;
21067 :
21068 : /* Temporary for holding the result, initialized to the input
21069 : operand to ease control flow. */
21070 14 : res = copy_to_reg (operand1);
21071 :
21072 14 : TWO52 = ix86_gen_TWO52 (mode);
21073 14 : xa = ix86_expand_sse_fabs (res, &mask);
21074 14 : label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21075 :
21076 : /* load nextafter (0.5, 0.0) */
21077 14 : fmt = REAL_MODE_FORMAT (mode);
21078 14 : real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
21079 14 : real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
21080 :
21081 : /* xa = xa + 0.5 */
21082 14 : half = force_reg (mode, const_double_from_real_value (pred_half, mode));
21083 14 : xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);
21084 :
21085 : /* xa = (double)(int64_t)xa */
21086 14 : xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
21087 14 : expand_fix (xi, xa, 0);
21088 14 : expand_float (xa, xi, 0);
21089 :
21090 : /* res = copysign (xa, operand1) */
21091 14 : ix86_sse_copysign_to_positive (res, xa, res, mask);
21092 :
21093 14 : emit_label (label);
21094 14 : LABEL_NUSES (label) = 1;
21095 :
21096 14 : emit_move_insn (operand0, res);
21097 14 : }
21098 :
21099 : /* Expand SSE sequence for computing round from OPERAND1 storing
21100 : into OPERAND0 without relying on DImode truncation via cvttsd2siq
21101 : that is only available on 64bit targets. */
21102 : void
21103 0 : ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
21104 : {
21105 : /* C code for the stuff we expand below.
21106 : double xa = fabs (x), xa2, x2;
21107 : if (!isless (xa, TWO52))
21108 : return x;
21109 : Using the absolute value and copying back sign makes
21110 : -0.0 -> -0.0 correct.
21111 : xa2 = xa + TWO52 - TWO52;
21112 : Compensate.
21113 : dxa = xa2 - xa;
21114 : if (dxa <= -0.5)
21115 : xa2 += 1;
21116 : else if (dxa > 0.5)
21117 : xa2 -= 1;
21118 : x2 = copysign (xa2, x);
21119 : return x2;
21120 : */
21121 0 : machine_mode mode = GET_MODE (operand0);
21122 0 : rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
21123 0 : rtx_code_label *label;
21124 :
21125 0 : TWO52 = ix86_gen_TWO52 (mode);
21126 :
21127 : /* Temporary for holding the result, initialized to the input
21128 : operand to ease control flow. */
21129 0 : res = copy_to_reg (operand1);
21130 :
21131 : /* xa = abs (operand1) */
21132 0 : xa = ix86_expand_sse_fabs (res, &mask);
21133 :
21134 : /* if (!isless (xa, TWO52)) goto label; */
21135 0 : label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
21136 :
21137 : /* xa2 = xa + TWO52 - TWO52; */
21138 0 : xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
21139 0 : xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
21140 :
21141 : /* dxa = xa2 - xa; */
21142 0 : dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);
21143 :
21144 : /* generate 0.5, 1.0 and -0.5 */
21145 0 : half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
21146 0 : one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
21147 0 : mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
21148 : 0, OPTAB_DIRECT);
21149 :
21150 : /* Compensate. */
21151 : /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
21152 0 : tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
21153 0 : emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
21154 0 : xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
21155 : /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
21156 0 : tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
21157 0 : emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
21158 0 : xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
21159 :
21160 : /* res = copysign (xa2, operand1) */
21161 0 : ix86_sse_copysign_to_positive (res, xa2, res, mask);
21162 :
21163 0 : emit_label (label);
21164 0 : LABEL_NUSES (label) = 1;
21165 :
21166 0 : emit_move_insn (operand0, res);
21167 0 : }
21168 :
21169 : /* Expand SSE sequence for computing round
21170 : from OP1 storing into OP0 using sse4 round insn. */
21171 : void
21172 9 : ix86_expand_round_sse4 (rtx op0, rtx op1)
21173 : {
21174 9 : machine_mode mode = GET_MODE (op0);
21175 9 : rtx e1, e2, res, half;
21176 9 : const struct real_format *fmt;
21177 9 : REAL_VALUE_TYPE pred_half, half_minus_pred_half;
21178 9 : rtx (*gen_copysign) (rtx, rtx, rtx);
21179 9 : rtx (*gen_round) (rtx, rtx, rtx);
21180 :
21181 9 : switch (mode)
21182 : {
21183 : case E_HFmode:
21184 : gen_copysign = gen_copysignhf3;
21185 : gen_round = gen_sse4_1_roundhf2;
21186 : break;
21187 4 : case E_SFmode:
21188 4 : gen_copysign = gen_copysignsf3;
21189 4 : gen_round = gen_sse4_1_roundsf2;
21190 4 : break;
21191 4 : case E_DFmode:
21192 4 : gen_copysign = gen_copysigndf3;
21193 4 : gen_round = gen_sse4_1_rounddf2;
21194 4 : break;
21195 0 : default:
21196 0 : gcc_unreachable ();
21197 : }
21198 :
21199 : /* round (a) = trunc (a + copysign (0.5, a)) */
21200 :
21201 : /* load nextafter (0.5, 0.0) */
21202 9 : fmt = REAL_MODE_FORMAT (mode);
21203 9 : real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
21204 9 : real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
21205 9 : half = const_double_from_real_value (pred_half, mode);
21206 :
21207 : /* e1 = copysign (0.5, op1) */
21208 9 : e1 = gen_reg_rtx (mode);
21209 9 : emit_insn (gen_copysign (e1, half, op1));
21210 :
21211 : /* e2 = op1 + e1 */
21212 9 : e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);
21213 :
21214 : /* res = trunc (e2) */
21215 9 : res = gen_reg_rtx (mode);
21216 9 : emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));
21217 :
21218 9 : emit_move_insn (op0, res);
21219 9 : }
21220 :
/* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
   insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
   insn every time.  It is created on first use by init_vselect_insn;
   the expanders patch its operands in place, try to recognize it, and
   then put the placeholder operands back.  */

static GTY(()) rtx_insn *vselect_insn;
21226 :
21227 : /* Initialize vselect_insn. */
21228 :
21229 : static void
21230 7588 : init_vselect_insn (void)
21231 : {
21232 7588 : unsigned i;
21233 7588 : rtx x;
21234 :
21235 7588 : x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
21236 493220 : for (i = 0; i < MAX_VECT_LEN; ++i)
21237 485632 : XVECEXP (x, 0, i) = const0_rtx;
21238 7588 : x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
21239 : const0_rtx), x);
21240 7588 : x = gen_rtx_SET (const0_rtx, x);
21241 7588 : start_sequence ();
21242 7588 : vselect_insn = emit_insn (x);
21243 7588 : end_sequence ();
21244 7588 : }
21245 :
21246 : /* Construct (set target (vec_select op0 (parallel perm))) and
21247 : return true if that's a valid instruction in the active ISA. */
21248 :
21249 : static bool
21250 536360 : expand_vselect (rtx target, rtx op0, const unsigned char *perm,
21251 : unsigned nelt, bool testing_p)
21252 : {
21253 536360 : unsigned int i;
21254 536360 : rtx x, save_vconcat;
21255 536360 : int icode;
21256 :
21257 536360 : if (vselect_insn == NULL_RTX)
21258 1678 : init_vselect_insn ();
21259 :
21260 536360 : x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
21261 536360 : PUT_NUM_ELEM (XVEC (x, 0), nelt);
21262 4118296 : for (i = 0; i < nelt; ++i)
21263 3581936 : XVECEXP (x, 0, i) = GEN_INT (perm[i]);
21264 536360 : save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
21265 536360 : XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
21266 536360 : PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
21267 536360 : SET_DEST (PATTERN (vselect_insn)) = target;
21268 536360 : icode = recog_memoized (vselect_insn);
21269 :
21270 536360 : if (icode >= 0 && !testing_p)
21271 72071 : emit_insn (copy_rtx (PATTERN (vselect_insn)));
21272 :
21273 536360 : SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
21274 536360 : XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
21275 536360 : INSN_CODE (vselect_insn) = -1;
21276 :
21277 536360 : return icode >= 0;
21278 : }
21279 :
21280 : /* Similar, but generate a vec_concat from op0 and op1 as well. */
21281 :
21282 : static bool
21283 471081 : expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
21284 : const unsigned char *perm, unsigned nelt,
21285 : bool testing_p)
21286 : {
21287 471081 : machine_mode v2mode;
21288 471081 : rtx x;
21289 471081 : bool ok;
21290 :
21291 471081 : if (vselect_insn == NULL_RTX)
21292 5910 : init_vselect_insn ();
21293 :
21294 471081 : if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
21295 : return false;
21296 471081 : x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
21297 471081 : PUT_MODE (x, v2mode);
21298 471081 : XEXP (x, 0) = op0;
21299 471081 : XEXP (x, 1) = op1;
21300 471081 : ok = expand_vselect (target, x, perm, nelt, testing_p);
21301 471081 : XEXP (x, 0) = const0_rtx;
21302 471081 : XEXP (x, 1) = const0_rtx;
21303 471081 : return ok;
21304 : }
21305 :
21306 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
21307 : using movss or movsd. */
21308 : static bool
21309 340095 : expand_vec_perm_movs (struct expand_vec_perm_d *d)
21310 : {
21311 340095 : machine_mode vmode = d->vmode;
21312 340095 : unsigned i, nelt = d->nelt;
21313 340095 : rtx x;
21314 :
21315 340095 : if (d->one_operand_p)
21316 : return false;
21317 :
21318 314388 : if (!(TARGET_SSE && (vmode == V4SFmode || vmode == V4SImode))
21319 165506 : && !(TARGET_MMX_WITH_SSE && (vmode == V2SFmode || vmode == V2SImode))
21320 84815 : && !(TARGET_SSE2 && (vmode == V2DFmode || vmode == V2DImode)))
21321 : return false;
21322 :
21323 : /* Only the first element is changed. */
21324 238677 : if (d->perm[0] != nelt && d->perm[0] != 0)
21325 : return false;
21326 203853 : for (i = 1; i < nelt; ++i)
21327 147808 : if (d->perm[i] != i + nelt - d->perm[0])
21328 : return false;
21329 :
21330 56045 : if (d->testing_p)
21331 : return true;
21332 :
21333 6592 : if (d->perm[0] == nelt)
21334 0 : x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
21335 : else
21336 6592 : x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
21337 :
21338 6592 : emit_insn (gen_rtx_SET (d->target, x));
21339 :
21340 6592 : return true;
21341 : }
21342 :
21343 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
21344 : using insertps. */
21345 : static bool
21346 284050 : expand_vec_perm_insertps (struct expand_vec_perm_d *d)
21347 : {
21348 284050 : machine_mode vmode = d->vmode;
21349 284050 : unsigned i, cnt_s, nelt = d->nelt;
21350 284050 : int cnt_d = -1;
21351 284050 : rtx src, dst;
21352 :
21353 284050 : if (d->one_operand_p)
21354 : return false;
21355 :
21356 258343 : if (!(TARGET_SSE4_1
21357 37743 : && (vmode == V4SFmode || vmode == V4SImode
21358 27498 : || (TARGET_MMX_WITH_SSE
21359 21234 : && (vmode == V2SFmode || vmode == V2SImode)))))
21360 : return false;
21361 :
21362 55746 : for (i = 0; i < nelt; ++i)
21363 : {
21364 52491 : if (d->perm[i] == i)
21365 10620 : continue;
21366 41871 : if (cnt_d != -1)
21367 : {
21368 : cnt_d = -1;
21369 : break;
21370 : }
21371 22563 : cnt_d = i;
21372 : }
21373 :
21374 22563 : if (cnt_d == -1)
21375 : {
21376 43673 : for (i = 0; i < nelt; ++i)
21377 : {
21378 40860 : if (d->perm[i] == i + nelt)
21379 5057 : continue;
21380 35803 : if (cnt_d != -1)
21381 : return false;
21382 19308 : cnt_d = i;
21383 : }
21384 :
21385 2813 : if (cnt_d == -1)
21386 : return false;
21387 : }
21388 :
21389 6068 : if (d->testing_p)
21390 : return true;
21391 :
21392 524 : gcc_assert (cnt_d != -1);
21393 :
21394 524 : cnt_s = d->perm[cnt_d];
21395 524 : if (cnt_s < nelt)
21396 : {
21397 221 : src = d->op0;
21398 221 : dst = d->op1;
21399 : }
21400 : else
21401 : {
21402 303 : cnt_s -= nelt;
21403 303 : src = d->op1;
21404 303 : dst = d->op0;
21405 : }
21406 524 : gcc_assert (cnt_s < nelt);
21407 :
21408 524 : rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
21409 524 : GEN_INT (cnt_s << 6 | cnt_d << 4));
21410 524 : emit_insn (x);
21411 :
21412 524 : return true;
21413 : }
21414 :
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.

   Return false for one-operand permutations, for mode/ISA combinations
   not accepted below, and for permutations in which some element does
   not stay in its own position.  Otherwise return true; unless
   D->testing_p is set, the blend is also emitted, either as a VEC_MERGE
   with an immediate or mask-register operand, or as a pblendvb with a
   constant vector selector.  */

static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
  machine_mode mmode, vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  unsigned HOST_WIDE_INT mask;
  rtx target, op0, op1, maskop, x;
  rtx rperm[32], vperm;

  if (d->one_operand_p)
    return false;
  /* Accept only the vector size / ISA combinations listed here; each
     accepted case falls through to the checks below.  */
  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
      && (TARGET_AVX512BW
          || GET_MODE_UNIT_SIZE (vmode) >= 4))
    ;
  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
           && (GET_MODE_SIZE (vmode) == 16
               || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
               || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* This is a blend, not a permute.  Elements must stay in their
     respective lanes.  */
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (!(e == i || e == i + nelt))
        return false;
    }

  if (d->testing_p)
    return true;

  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
     decision should be extracted elsewhere, so that we only try that
     sequence once all budget==3 options have been tried.  */
  target = d->target;
  op0 = d->op0;
  op1 = d->op1;
  mask = 0;

  /* Compute MASK, one bit per element of the mode the blend is finally
     done in; a set bit means the element is taken from op1.  Cases that
     blend in a different mode than D->vmode update VMODE and jump to
     do_subreg; cases that need a vector selector jump to use_pblendvb
     or finish_pblendvb, which emit the insn and return directly.  */
  switch (vmode)
    {
    case E_V8DFmode:
    case E_V16SFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2SFmode:
    case E_V2HImode:
    case E_V4HImode:
    case E_V8HImode:
    case E_V8SImode:
    case E_V32HImode:
    case E_V64QImode:
    case E_V16SImode:
    case E_V8DImode:
      /* One mask bit per element, blended in the original mode.  */
      for (i = 0; i < nelt; ++i)
        mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
      break;

    case E_V2DImode:
      /* Each DImode element becomes four mask bits of a V8HImode blend.  */
      for (i = 0; i < 2; ++i)
        mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
      vmode = V8HImode;
      goto do_subreg;

    case E_V2SImode:
      /* Each SImode element becomes two mask bits of a V4HImode blend.  */
      for (i = 0; i < 2; ++i)
        mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
      vmode = V4HImode;
      goto do_subreg;

    case E_V4SImode:
      if (TARGET_AVX2)
        {
          /* Use vpblendd instead of vpblendw.  */
          for (i = 0; i < nelt; ++i)
            mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
          break;
        }
      else
        {
          /* Each SImode element becomes two mask bits of a V8HImode
             blend.  */
          for (i = 0; i < 4; ++i)
            mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
          vmode = V8HImode;
          goto do_subreg;
        }

    case E_V16QImode:
      /* See if bytes move in pairs so we can use pblendw with
         an immediate argument, rather than pblendvb with a vector
         argument.  */
      for (i = 0; i < 16; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          {
          /* Entered here, and by goto from the V8QImode, V4QImode and
             V32QImode cases: build a per-element selector that is 0 for
             elements taken from op0 and -1 for elements taken from
             op1.  */
          use_pblendvb:
            for (i = 0; i < nelt; ++i)
              rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);

          /* Also entered by goto from the V16HImode case, which has
             already filled RPERM and switched VMODE, NELT, TARGET, OP0
             and OP1 to V32QImode.  */
          finish_pblendvb:
            vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
            vperm = force_reg (vmode, vperm);

            /* Pick the pblendvb pattern by vector size in bytes.  */
            if (GET_MODE_SIZE (vmode) == 4)
              emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
            else if (GET_MODE_SIZE (vmode) == 8)
              emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
            else if (GET_MODE_SIZE (vmode) == 16)
              emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
            else
              emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
            /* If the blend was done in a temporary of another mode,
               copy the result back in the original mode.  */
            if (target != d->target)
              emit_move_insn (d->target, gen_lowpart (d->vmode, target));
            return true;
          }

      /* Bytes move in pairs: one mask bit per byte pair, blended as
         V8HImode.  */
      for (i = 0; i < 8; ++i)
        mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8HImode;
      /* FALLTHRU */

    /* Common tail for cases that blend in a mode other than D->vmode:
       use a fresh pseudo of the new VMODE as the target and view both
       operands in that mode.  */
    do_subreg:
      target = gen_reg_rtx (vmode);
      op0 = gen_lowpart (vmode, op0);
      op1 = gen_lowpart (vmode, op1);
      break;

    case E_V8QImode:
      /* If bytes do not move in pairs, a vector selector is needed.  */
      for (i = 0; i < 8; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          goto use_pblendvb;

      for (i = 0; i < 4; ++i)
        mask |= (d->perm[i * 2] >= 8) << i;
      vmode = V4HImode;
      goto do_subreg;

    case E_V4QImode:
      /* If bytes do not move in pairs, a vector selector is needed.  */
      for (i = 0; i < 4; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          goto use_pblendvb;

      for (i = 0; i < 2; ++i)
        mask |= (d->perm[i * 2] >= 4) << i;
      vmode = V2HImode;
      goto do_subreg;

    case E_V32QImode:
      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
      for (i = 0; i < 32; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          goto use_pblendvb;
      /* See if bytes move in quadruplets.  If yes, vpblendd
         with immediate can be used.  */
      for (i = 0; i < 32; i += 4)
        if (d->perm[i] + 2 != d->perm[i + 2])
          break;
      if (i < 32)
        {
          /* See if bytes move the same in both lanes.  If yes,
             vpblendw with immediate can be used.  */
          for (i = 0; i < 16; i += 2)
            if (d->perm[i] + 16 != d->perm[i + 16])
              goto use_pblendvb;

          /* Use vpblendw.  */
          for (i = 0; i < 16; ++i)
            mask |= (d->perm[i * 2] >= 32) << i;
          vmode = V16HImode;
          goto do_subreg;
        }

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
        mask |= (d->perm[i * 4] >= 32) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V16HImode:
      /* See if words move in pairs.  If yes, vpblendd can be used.  */
      for (i = 0; i < 16; i += 2)
        if (d->perm[i] + 1 != d->perm[i + 1])
          break;
      if (i < 16)
        {
          /* See if words move the same in both lanes.  If not,
             vpblendvb must be used.  */
          for (i = 0; i < 8; i++)
            if (d->perm[i] + 8 != d->perm[i + 8])
              {
                /* Use vpblendvb.  Each word selects two bytes of the
                   V32QImode selector.  */
                for (i = 0; i < 32; ++i)
                  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);

                vmode = V32QImode;
                nelt = 32;
                target = gen_reg_rtx (vmode);
                op0 = gen_lowpart (vmode, op0);
                op1 = gen_lowpart (vmode, op1);
                goto finish_pblendvb;
              }

          /* Use vpblendw.  */
          for (i = 0; i < 16; ++i)
            mask |= (d->perm[i] >= 16) << i;
          break;
        }

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
        mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V4DImode:
      /* Use vpblendd.  */
      for (i = 0; i < 4; ++i)
        mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8SImode;
      goto do_subreg;

    default:
      gcc_unreachable ();
    }

  /* For the 64-byte modes the mask is materialized in a register of
     integer mode MMODE, with one bit per element; all other modes use
     an immediate (MMODE == VOIDmode).  */
  switch (vmode)
    {
    case E_V8DFmode:
    case E_V8DImode:
      mmode = QImode;
      break;
    case E_V16SFmode:
    case E_V16SImode:
      mmode = HImode;
      break;
    case E_V32HImode:
      mmode = SImode;
      break;
    case E_V64QImode:
      mmode = DImode;
      break;
    default:
      mmode = VOIDmode;
    }

  /* Canonicalize vec_merge.  */
  if (swap_commutative_operands_p (op1, op0)
      /* Two operands have same precedence, then
         first bit of mask select first operand.  */
      || (!swap_commutative_operands_p (op0, op1)
          && !(mask & 1)))
    {
      /* Swapping the operands requires inverting the mask, restricted
         to the N_ELTS bits that are meaningful for VMODE.  The
         full-width case is handled separately so the shift count stays
         below the width of HOST_WIDE_INT.  */
      unsigned n_elts = GET_MODE_NUNITS (vmode);
      std::swap (op0, op1);
      unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
      if (n_elts == HOST_BITS_PER_WIDE_INT)
        mask_all = -1;
      else
        mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
      mask = ~mask & mask_all;
    }

  if (mmode != VOIDmode)
    maskop = force_reg (mmode, gen_int_mode (mask, mmode));
  else
    maskop = GEN_INT (mask);

  /* This matches five different patterns with the different modes.  */
  x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
  x = gen_rtx_SET (target, x);
  emit_insn (x);
  /* If the blend was done in a temporary of another mode, copy the
     result back in the original mode.  */
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
21702 :
21703 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
21704 : in terms of the variable form of vpermilps.
21705 :
21706 : Note that we will have already failed the immediate input vpermilps,
21707 : which requires that the high and low part shuffle be identical; the
21708 : variable form doesn't require that. */
21709 :
21710 : static bool
21711 135971 : expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
21712 : {
21713 135971 : rtx rperm[8], vperm;
21714 135971 : unsigned i;
21715 :
21716 135971 : if (!TARGET_AVX || !d->one_operand_p
21717 11182 : || (d->vmode != V8SImode && d->vmode != V8SFmode))
21718 : return false;
21719 :
21720 : /* We can only permute within the 128-bit lane. */
21721 16201 : for (i = 0; i < 8; ++i)
21722 : {
21723 15551 : unsigned e = d->perm[i];
21724 15551 : if (i < 4 ? e >= 4 : e < 4)
21725 : return false;
21726 : }
21727 :
21728 650 : if (d->testing_p)
21729 : return true;
21730 :
21731 657 : for (i = 0; i < 8; ++i)
21732 : {
21733 584 : unsigned e = d->perm[i];
21734 :
21735 : /* Within each 128-bit lane, the elements of op0 are numbered
21736 : from 0 and the elements of op1 are numbered from 4. */
21737 584 : if (e >= 8 + 4)
21738 0 : e -= 8;
21739 584 : else if (e >= 4)
21740 292 : e -= 4;
21741 :
21742 584 : rperm[i] = GEN_INT (e);
21743 : }
21744 :
21745 73 : vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
21746 73 : vperm = force_reg (V8SImode, vperm);
21747 73 : rtx target = d->target;
21748 73 : rtx op0 = d->op0;
21749 73 : if (d->vmode == V8SImode)
21750 : {
21751 21 : target = lowpart_subreg (V8SFmode, target, V8SImode);
21752 21 : op0 = lowpart_subreg (V8SFmode, op0, V8SImode);
21753 : }
21754 :
21755 73 : emit_insn (gen_avx_vpermilvarv8sf3 (target, op0, vperm));
21756 :
21757 73 : return true;
21758 : }
21759 :
21760 : /* For V*[QHS]Imode permutations, check if the same permutation
21761 : can't be performed in a 2x, 4x or 8x wider inner mode. */
21762 :
21763 : static bool
21764 159340 : canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
21765 : struct expand_vec_perm_d *nd)
21766 : {
21767 159340 : int i;
21768 159340 : machine_mode mode = VOIDmode;
21769 :
21770 159340 : switch (d->vmode)
21771 : {
21772 : case E_V8QImode: mode = V4HImode; break;
21773 29404 : case E_V16QImode: mode = V8HImode; break;
21774 715 : case E_V32QImode: mode = V16HImode; break;
21775 275 : case E_V64QImode: mode = V32HImode; break;
21776 11971 : case E_V4HImode: mode = V2SImode; break;
21777 20468 : case E_V8HImode: mode = V4SImode; break;
21778 1006 : case E_V16HImode: mode = V8SImode; break;
21779 397 : case E_V32HImode: mode = V16SImode; break;
21780 40471 : case E_V4SImode: mode = V2DImode; break;
21781 1485 : case E_V8SImode: mode = V4DImode; break;
21782 65 : case E_V16SImode: mode = V8DImode; break;
21783 : default: return false;
21784 : }
21785 200982 : for (i = 0; i < d->nelt; i += 2)
21786 186644 : if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
21787 : return false;
21788 14338 : nd->vmode = mode;
21789 14338 : nd->nelt = d->nelt / 2;
21790 93646 : for (i = 0; i < nd->nelt; i++)
21791 79308 : nd->perm[i] = d->perm[2 * i] / 2;
21792 28676 : if (GET_MODE_INNER (mode) != DImode)
21793 12606 : canonicalize_vector_int_perm (nd, nd);
21794 14338 : if (nd != d)
21795 : {
21796 9095 : nd->one_operand_p = d->one_operand_p;
21797 9095 : nd->testing_p = d->testing_p;
21798 9095 : if (d->op0 == d->op1)
21799 3031 : nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
21800 : else
21801 : {
21802 6064 : nd->op0 = gen_lowpart (nd->vmode, d->op0);
21803 6064 : nd->op1 = gen_lowpart (nd->vmode, d->op1);
21804 : }
21805 9095 : if (d->testing_p)
21806 5832 : nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
21807 : else
21808 3263 : nd->target = gen_reg_rtx (nd->vmode);
21809 : }
21810 : return true;
21811 : }
21812 :
21813 : /* Return true if permutation D can be performed as VMODE permutation
21814 : instead. */
21815 :
21816 : static bool
21817 5994 : valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
21818 : {
21819 5994 : unsigned int i, j, chunk;
21820 :
21821 5994 : if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
21822 5994 : || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
21823 14742 : || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
21824 : return false;
21825 :
21826 8748 : if (GET_MODE_NUNITS (vmode) >= d->nelt)
21827 : return true;
21828 :
21829 4086 : chunk = d->nelt / GET_MODE_NUNITS (vmode);
21830 5328 : for (i = 0; i < d->nelt; i += chunk)
21831 5081 : if (d->perm[i] & (chunk - 1))
21832 : return false;
21833 : else
21834 7759 : for (j = 1; j < chunk; ++j)
21835 6517 : if (d->perm[i] + j != d->perm[i + j])
21836 : return false;
21837 :
21838 : return true;
21839 : }
21840 :
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.

   The first part selects, from the vector size, the operand count and
   the enabled ISA, the mode VMODE in which the permutation will be
   performed, returning false when no suitable instruction exists.
   Two special cases (vperm2i128 and vpermq) are emitted directly from
   there.  The second part builds the constant selector vector and
   emits the variable permutation.  Nothing is emitted when
   D->testing_p is set.  */

static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
  unsigned i, nelt, eltsz, mask;
  unsigned char perm[64];
  machine_mode vmode;
  struct expand_vec_perm_d nd;
  rtx rperm[64], vperm, target, op0, op1;

  nelt = d->nelt;

  /* Two-operand permutations: 4, 8 and 16 byte vectors require XOP
     (the pperm patterns used at the end); 32 byte vectors are only
     handled here via vperm2i128.  */
  if (!d->one_operand_p)
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
        if (!TARGET_XOP)
          return false;
        vmode = V4QImode;
        break;

      case 8:
        if (!TARGET_XOP)
          return false;
        vmode = V8QImode;
        break;

      case 16:
        if (!TARGET_XOP)
          return false;
        vmode = V16QImode;
        break;

      case 32:
        if (!TARGET_AVX2)
          return false;

        if (valid_perm_using_mode_p (V2TImode, d))
          {
            if (d->testing_p)
              return true;

            /* Use vperm2i128 insn.  The pattern uses
               V4DImode instead of V2TImode.  */
            target = d->target;
            if (d->vmode != V4DImode)
              target = gen_reg_rtx (V4DImode);
            op0 = gen_lowpart (V4DImode, d->op0);
            op1 = gen_lowpart (V4DImode, d->op1);
            /* The low nibble of the immediate selects the 128-bit source
               lane for the low half of the result, the high nibble the
               one for the high half.  */
            rperm[0]
              = GEN_INT ((d->perm[0] / (nelt / 2))
                         | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
            emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
            if (target != d->target)
              emit_move_insn (d->target, gen_lowpart (d->vmode, target));
            return true;
          }
        /* FALLTHRU */

      default:
        return false;
      }
  /* One-operand permutations.  */
  else
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
        if (!TARGET_SSSE3)
          return false;
        vmode = V4QImode;
        break;

      case 8:
        if (!TARGET_SSSE3)
          return false;
        vmode = V8QImode;
        break;

      case 16:
        if (!TARGET_SSSE3)
          return false;
        vmode = V16QImode;
        break;

      case 32:
        if (!TARGET_AVX2)
          return false;

        /* V4DImode should be already handled through
           expand_vselect by vpermq instruction.  */
        gcc_assert (d->vmode != V4DImode);

        vmode = V32QImode;
        if (d->vmode == V8SImode
            || d->vmode == V16HImode
            || d->vmode == V32QImode)
          {
            /* First see if vpermq can be used for
               V8SImode/V16HImode/V32QImode.  */
            if (valid_perm_using_mode_p (V4DImode, d))
              {
                /* Reduce the permutation to four 64-bit element
                   indices.  */
                for (i = 0; i < 4; i++)
                  perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
                if (d->testing_p)
                  return true;
                target = gen_reg_rtx (V4DImode);
                if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
                                    perm, 4, false))
                  {
                    emit_move_insn (d->target,
                                    gen_lowpart (d->vmode, target));
                    return true;
                  }
                return false;
              }

            /* Next see if vpermd can be used.  */
            if (valid_perm_using_mode_p (V8SImode, d))
              vmode = V8SImode;
          }
        /* Or if vpermps can be used.  */
        else if (d->vmode == V8SFmode)
          vmode = V8SImode;

        if (vmode == V32QImode)
          {
            /* vpshufb only works intra lanes, it is not
               possible to shuffle bytes in between the lanes.  */
            for (i = 0; i < nelt; ++i)
              if ((d->perm[i] ^ i) & (nelt / 2))
                return false;
          }
        break;

      case 64:
        if (!TARGET_AVX512BW)
          return false;

        /* If vpermq didn't work, vpshufb won't work either.  */
        if (d->vmode == V8DFmode || d->vmode == V8DImode)
          return false;

        vmode = V64QImode;
        if (d->vmode == V16SImode
            || d->vmode == V32HImode
            || d->vmode == V64QImode)
          {
            /* First see if vpermq can be used for
               V16SImode/V32HImode/V64QImode.  */
            if (valid_perm_using_mode_p (V8DImode, d))
              {
                /* Reduce the permutation to eight 64-bit element
                   indices.  */
                for (i = 0; i < 8; i++)
                  perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
                if (d->testing_p)
                  return true;
                target = gen_reg_rtx (V8DImode);
                if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
                                    perm, 8, false))
                  {
                    emit_move_insn (d->target,
                                    gen_lowpart (d->vmode, target));
                    return true;
                  }
                return false;
              }

            /* Next see if vpermd can be used.  */
            if (valid_perm_using_mode_p (V16SImode, d))
              vmode = V16SImode;
          }
        /* Or if vpermps can be used.  */
        else if (d->vmode == V16SFmode)
          vmode = V16SImode;

        if (vmode == V64QImode)
          {
            /* vpshufb only works intra lanes, it is not
               possible to shuffle bytes in between the lanes.  */
            for (i = 0; i < nelt; ++i)
              if ((d->perm[i] ^ i) & (3 * nelt / 4))
                return false;
          }
        break;

      default:
        return false;
      }

  if (d->testing_p)
    return true;

  /* Try to avoid variable permutation instruction.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Build the selector elements in RPERM.  For the vpermd forms there is
     one index per 32-bit element; otherwise there is one byte index per
     byte of the vector.  */
  if (vmode == V8SImode)
    for (i = 0; i < 8; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
  else if (vmode == V16SImode)
    for (i = 0; i < 16; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
  else
    {
      eltsz = GET_MODE_UNIT_SIZE (d->vmode);
      /* MASK limits each element index to the range the instruction can
         address: both operands for the two-operand form, a quarter or
         half of the vector for V64QImode or V32QImode, and the whole
         vector otherwise.  */
      if (!d->one_operand_p)
        mask = 2 * nelt - 1;
      else if (vmode == V64QImode)
        mask = nelt / 4 - 1;
      else if (vmode == V32QImode)
        mask = nelt / 2 - 1;
      else
        mask = nelt - 1;

      /* Expand each element index into ELTSZ consecutive byte indices.  */
      for (i = 0; i < nelt; ++i)
        {
          unsigned j, e = d->perm[i] & mask;
          for (j = 0; j < eltsz; ++j)
            rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
        }
    }

  machine_mode vpmode = vmode;

  /* From here on NELT is the size of VMODE in bytes, not the element
     count of D.  */
  nelt = GET_MODE_SIZE (vmode);

  /* Emulate narrow modes with V16QI instructions.  */
  if (nelt < 16)
    {
      rtx m128 = GEN_INT (-128);

      /* Remap elements from the second operand, as we have to
         account for inactive top elements from the first operand.  */
      if (!d->one_operand_p)
        {
          for (i = 0; i < nelt; ++i)
            {
              unsigned ival = UINTVAL (rperm[i]);
              if (ival >= nelt)
                rperm[i] = GEN_INT (ival + 16 - nelt);
            }
        }

      /* Fill inactive elements in the top positions with zeros.
         NOTE(review): presumably the -128 selector byte (bit 7 set) is
         what makes the instruction write zero -- confirm against the
         ISA reference.  */
      for (i = nelt; i < 16; ++i)
        rperm[i] = m128;

      vpmode = V16QImode;
    }

  vperm = gen_rtx_CONST_VECTOR (vpmode,
                                gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
  vperm = force_reg (vpmode, vperm);

  /* Permute into a temporary when the permutation mode differs from
     the mode of D.  */
  if (vmode == d->vmode)
    target = d->target;
  else
    target = gen_reg_rtx (vmode);

  op0 = gen_lowpart (vmode, d->op0);

  if (d->one_operand_p)
    {
      rtx (*gen) (rtx, rtx, rtx);

      /* NOTE(review): VMODE is never assigned V8SFmode or V16SFmode in
         this function (V8SFmode/V16SFmode inputs are mapped to
         V8SImode/V16SImode above), so those two branches look
         unreachable -- confirm.  */
      if (vmode == V4QImode)
        gen = gen_mmx_pshufbv4qi3;
      else if (vmode == V8QImode)
        gen = gen_mmx_pshufbv8qi3;
      else if (vmode == V16QImode)
        gen = gen_ssse3_pshufbv16qi3;
      else if (vmode == V32QImode)
        gen = gen_avx2_pshufbv32qi3;
      else if (vmode == V64QImode)
        gen = gen_avx512bw_pshufbv64qi3;
      else if (vmode == V8SFmode)
        gen = gen_avx2_permvarv8sf;
      else if (vmode == V8SImode)
        gen = gen_avx2_permvarv8si;
      else if (vmode == V16SFmode)
        gen = gen_avx512f_permvarv16sf;
      else if (vmode == V16SImode)
        gen = gen_avx512f_permvarv16si;
      else
        gcc_unreachable ();

      emit_insn (gen (target, op0, vperm));
    }
  else
    {
      rtx (*gen) (rtx, rtx, rtx, rtx);

      op1 = gen_lowpart (vmode, d->op1);

      /* Only the XOP sizes selected in the two-operand switch above can
         reach this point.  */
      if (vmode == V4QImode)
        gen = gen_mmx_ppermv32;
      else if (vmode == V8QImode)
        gen = gen_mmx_ppermv64;
      else if (vmode == V16QImode)
        gen = gen_xop_pperm;
      else
        gcc_unreachable ();

      emit_insn (gen (target, op0, op1, vperm));
    }

  /* Copy the result back in the original mode if a temporary was
     used.  */
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
22155 :
22156 : /* Try to expand one-operand permutation with constant mask. */
22157 :
22158 : static bool
22159 123341 : ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
22160 : {
22161 123341 : machine_mode mode = GET_MODE (d->op0);
22162 123341 : machine_mode maskmode = mode;
22163 246682 : unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
22164 123341 : rtx (*gen) (rtx, rtx, rtx) = NULL;
22165 123341 : rtx target, op0, mask;
22166 123341 : rtx vec[64];
22167 :
22168 123341 : if (!rtx_equal_p (d->op0, d->op1))
22169 : return false;
22170 :
22171 17510 : if (!TARGET_AVX512F)
22172 : return false;
22173 :
22174 : /* Accept VNxHImode and VNxQImode now. */
22175 719 : if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
22176 : return false;
22177 :
22178 : /* vpermw. */
22179 453 : if (!TARGET_AVX512BW && inner_size == 2)
22180 : return false;
22181 :
22182 : /* vpermb. */
22183 319 : if (!TARGET_AVX512VBMI && inner_size == 1)
22184 : return false;
22185 :
22186 200 : switch (mode)
22187 : {
22188 : case E_V16SImode:
22189 : gen = gen_avx512f_permvarv16si;
22190 : break;
22191 4 : case E_V16SFmode:
22192 4 : gen = gen_avx512f_permvarv16sf;
22193 4 : maskmode = V16SImode;
22194 4 : break;
22195 1 : case E_V8DImode:
22196 1 : gen = gen_avx512f_permvarv8di;
22197 1 : break;
22198 30 : case E_V8DFmode:
22199 30 : gen = gen_avx512f_permvarv8df;
22200 30 : maskmode = V8DImode;
22201 30 : break;
22202 106 : case E_V32HImode:
22203 106 : gen = gen_avx512bw_permvarv32hi;
22204 106 : break;
22205 14 : case E_V16HImode:
22206 14 : gen = gen_avx512vl_permvarv16hi;
22207 14 : break;
22208 6 : case E_V8HImode:
22209 6 : gen = gen_avx512vl_permvarv8hi;
22210 6 : break;
22211 4 : case E_V64QImode:
22212 4 : gen = gen_avx512bw_permvarv64qi;
22213 4 : break;
22214 2 : case E_V32QImode:
22215 2 : gen = gen_avx512vl_permvarv32qi;
22216 2 : break;
22217 0 : case E_V16QImode:
22218 0 : gen = gen_avx512vl_permvarv16qi;
22219 0 : break;
22220 :
22221 : default:
22222 : return false;
22223 : }
22224 :
22225 199 : if (d->testing_p)
22226 : return true;
22227 :
22228 190 : target = d->target;
22229 190 : op0 = d->op0;
22230 4854 : for (int i = 0; i < d->nelt; ++i)
22231 4664 : vec[i] = GEN_INT (d->perm[i]);
22232 190 : mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
22233 190 : emit_insn (gen (target, op0, force_reg (maskmode, mask)));
22234 190 : return true;
22235 : }
22236 :
   /* Forward declaration: expand_vec_perm_1 below calls this function, whose
      definition appears later in the file.  */
22237 : static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
22238 :
22239 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
22240 : in a single instruction.

   The strategies below are tried strictly in the order written and the
   first one that accepts D wins.  Returns true if some strategy accepted
   D and false if none did.  The moves and insns emitted directly by this
   function are all guarded by !D->testing_p; the helpers receive either
   D itself or the testing_p flag.  ND is a scratch descriptor: its perm
   array is rewritten for several attempts and the whole struct is filled
   in by canonicalize_vector_int_perm at the end.  */
22241 :
22242 : static bool
22243 378583 : expand_vec_perm_1 (struct expand_vec_perm_d *d)
22244 : {
22245 378583 : unsigned i, nelt = d->nelt;
22246 378583 : struct expand_vec_perm_d nd;
22247 :
22248 : /* Check plain VEC_SELECT first, because AVX has instructions that could
22249 : match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
22250 : input where SEL+CONCAT may not. */
22251 378583 : if (d->one_operand_p)
22252 : {
   /* MASK reduces every selector to an index into a single vector.  The
      bit-mask form presumably relies on nelt being a power of two --
      TODO confirm.  */
22253 : int mask = nelt - 1;
22254 : bool identity_perm = true;
22255 : bool broadcast_perm = true;
22256 :
   /* Build the masked selector in nd.perm and classify it: identity when
      every entry equals its own position, broadcast when every entry
      is 0.  */
22257 505443 : for (i = 0; i < nelt; i++)
22258 : {
22259 443464 : nd.perm[i] = d->perm[i] & mask;
22260 443464 : if (nd.perm[i] != i)
22261 340007 : identity_perm = false;
22262 443464 : if (nd.perm[i])
22263 365782 : broadcast_perm = false;
22264 : }
22265 :
22266 61979 : if (identity_perm)
22267 : {
   /* The identity permutation is a plain copy of op0.  */
22268 11 : if (!d->testing_p)
22269 5 : emit_move_insn (d->target, d->op0);
22270 11 : return true;
22271 : }
22272 61968 : else if (broadcast_perm && TARGET_AVX2)
22273 : {
22274 : /* Use vpbroadcast{b,w,d}. */
   /* GEN stays NULL for the 512-bit modes when the TARGET_AVX512* test
      guarding them fails, and for every mode not listed; control then
      falls through to the expand_vselect attempt below.  */
22275 397 : rtx (*gen) (rtx, rtx) = NULL;
22276 397 : switch (d->vmode)
22277 : {
22278 1 : case E_V64QImode:
22279 1 : if (TARGET_AVX512BW)
22280 : gen = gen_avx512bw_vec_dupv64qi_1;
22281 : break;
22282 4 : case E_V32QImode:
22283 4 : gen = gen_avx2_pbroadcastv32qi_1;
22284 4 : break;
22285 1 : case E_V32HImode:
22286 1 : if (TARGET_AVX512BW)
22287 : gen = gen_avx512bw_vec_dupv32hi_1;
22288 : break;
22289 4 : case E_V16HImode:
22290 4 : gen = gen_avx2_pbroadcastv16hi_1;
22291 4 : break;
22292 1 : case E_V16SImode:
22293 1 : if (TARGET_AVX512F)
22294 : gen = gen_avx512f_vec_dupv16si_1;
22295 : break;
22296 4 : case E_V8SImode:
22297 4 : gen = gen_avx2_pbroadcastv8si_1;
22298 4 : break;
22299 4 : case E_V16QImode:
22300 4 : gen = gen_avx2_pbroadcastv16qi;
22301 4 : break;
22302 5 : case E_V8HImode:
22303 5 : gen = gen_avx2_pbroadcastv8hi;
22304 5 : break;
22305 0 : case E_V16SFmode:
22306 0 : if (TARGET_AVX512F)
22307 : gen = gen_avx512f_vec_dupv16sf_1;
22308 : break;
22309 : case E_V8SFmode:
22310 : gen = gen_avx2_vec_dupv8sf_1;
22311 : break;
22312 0 : case E_V8DFmode:
22313 0 : if (TARGET_AVX512F)
22314 : gen = gen_avx512f_vec_dupv8df_1;
22315 : break;
22316 0 : case E_V8DImode:
22317 0 : if (TARGET_AVX512F)
22318 : gen = gen_avx512f_vec_dupv8di_1;
22319 : break;
22320 : /* For other modes prefer other shuffles this function creates. */
22321 : default: break;
22322 : }
22323 21 : if (gen != NULL)
22324 : {
22325 24 : if (!d->testing_p)
22326 24 : emit_insn (gen (d->target, d->op0));
22327 24 : return true;
22328 : }
22329 : }
22330 :
   /* Plain one-input select using the masked selector built above.  */
22331 61944 : if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
22332 : return true;
22333 :
22334 : /* There are plenty of patterns in sse.md that are written for
22335 : SEL+CONCAT and are not replicated for a single op. Perhaps
22336 : that should be changed, to avoid the nastiness here. */
22337 :
22338 : /* Recognize interleave style patterns, which means incrementing
22339 : every other permutation operand. */
   /* Even positions keep the masked index and odd positions get nelt
      added, i.e. they refer to the second copy of op0 in the concat.
      The loop touches entries i and i + 1, so it assumes nelt is even --
      NOTE(review): confirm.  */
22340 199619 : for (i = 0; i < nelt; i += 2)
22341 : {
22342 163300 : nd.perm[i] = d->perm[i] & mask;
22343 163300 : nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
22344 : }
22345 36319 : if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
22346 36319 : d->testing_p))
22347 : return true;
22348 :
22349 : /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
22350 31365 : if (nelt >= 4)
22351 : {
   /* Within every group of four, the first two entries keep the masked
      index and the last two get nelt added.  */
22352 107157 : for (i = 0; i < nelt; i += 4)
22353 : {
22354 75792 : nd.perm[i + 0] = d->perm[i + 0] & mask;
22355 75792 : nd.perm[i + 1] = d->perm[i + 1] & mask;
22356 75792 : nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
22357 75792 : nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
22358 : }
22359 :
22360 31365 : if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
22361 31365 : d->testing_p))
22362 : return true;
22363 : }
22364 : }
22365 :
   /* Everything from here on is attempted whether or not D is a
      one-operand permutation, except where a test says otherwise.  */
22366 : /* Try the SSE4.1 blend variable merge instructions. */
22367 342311 : if (expand_vec_perm_blend (d))
22368 : return true;
22369 :
22370 : /* Try movss/movsd instructions. */
22371 340095 : if (expand_vec_perm_movs (d))
22372 : return true;
22373 :
22374 : /* Try the SSE4.1 insertps instruction. */
22375 284050 : if (expand_vec_perm_insertps (d))
22376 : return true;
22377 :
22378 : /* Try the fully general two operand permute. */
22379 277982 : if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
22380 277982 : d->testing_p))
22381 : return true;
22382 :
22383 : /* Recognize interleave style patterns with reversed operands. */
22384 136099 : if (!d->one_operand_p)
22385 : {
   /* Retarget every selector at the other input (subtract nelt from
      entries >= nelt, add nelt to the rest) and pass op1/op0 swapped, so
      the same elements are selected from the reversed concatenation.  */
22386 884730 : for (i = 0; i < nelt; ++i)
22387 : {
22388 774234 : unsigned e = d->perm[i];
22389 774234 : if (e >= nelt)
22390 380066 : e -= nelt;
22391 : else
22392 394168 : e += nelt;
22393 774234 : nd.perm[i] = e;
22394 : }
22395 :
22396 110496 : if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
22397 110496 : d->testing_p))
22398 : return true;
22399 : }
22400 :
22401 : /* Try one of the AVX vpermil variable permutations. */
22402 135971 : if (expand_vec_perm_vpermil (d))
22403 : return true;
22404 :
22405 : /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
22406 : vpshufb, vpermd, vpermps or vpermq variable permutation. */
22407 135321 : if (expand_vec_perm_pshufb (d))
22408 : return true;
22409 :
22410 : /* Try the AVX2 vpalignr instruction. */
   /* The second argument (single_insn_only_p) is true: succeed only if
      the alignment instruction alone performs the permutation.  */
22411 123465 : if (expand_vec_perm_palignr (d, true))
22412 : return true;
22413 :
22414 : /* Try the AVX512F vperm{w,b,s,d} instructions */
22415 123341 : if (ix86_expand_vec_one_operand_perm_avx512 (d))
22416 : return true;
22417 :
22418 : /* Try the AVX512F vpermt2/vpermi2 instructions. */
22419 123142 : if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
22420 : return true;
22421 :
22422 : /* See if we can get the same permutation in different vector integer
22423 : mode. */
   /* canonicalize_vector_int_perm fills in ND; on success this function
      recurses on ND and, unless testing, copies ND's target back into
      D->target reinterpreted in D->vmode.  */
22424 122241 : if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
22425 : {
22426 6654 : if (!d->testing_p)
22427 1207 : emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
22428 6654 : return true;
22429 : }
22430 : return false;
22431 : }
22432 :
22433 : /* Canonicalize vec_perm index to make the first index
22434 : always comes from the first vector. */
22435 : static void
22436 8129 : ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
22437 : {
22438 8129 : unsigned nelt = d->nelt;
22439 8129 : if (d->perm[0] < nelt)
22440 : return;
22441 :
22442 5 : for (unsigned i = 0; i != nelt; i++)
22443 4 : d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
22444 :
22445 1 : std::swap (d->op0, d->op1);
22446 1 : return;
22447 : }
22448 :
22449 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
22450 : in terms of a pair of shufps+ shufps/pshufd instructions.

   Only two-operand V4SImode/V4SFmode permutations are handled; anything
   else returns false.  For an accepted D the function returns true, and
   when D->testing_p is set it does so before emitting anything.

   Note that the non-testing path modifies D: ix86_vec_perm_index_canon
   may rewrite D->perm and exchange D->op0/D->op1, and the "3 + 1" case
   below may exchange D->op0/D->op1 and overwrite D->op1.  */
22451 : static bool
22452 84421 : expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
22453 : {
22454 84421 : unsigned char perm1[4];
22455 84421 : machine_mode vmode = d->vmode;
22456 84421 : bool ok;
22457 84421 : unsigned i, j, k, count = 0;
22458 :
22459 84421 : if (d->one_operand_p
22460 79111 : || (vmode != V4SImode && vmode != V4SFmode))
22461 : return false;
22462 :
22463 34754 : if (d->testing_p)
22464 : return true;
22465 :
   /* After canonicalization D->perm[0] selects from op0.  COUNT is the
      number of selectors that refer to op1 (values 4..7).  */
22466 8129 : ix86_vec_perm_index_canon (d);
22467 48774 : for (i = 0; i < 4; ++i)
22468 50873 : count += d->perm[i] > 3 ? 1 : 0;
22469 :
   /* COUNT must be 1, 2 or 3; 0 and 4 have no bits in common with 3.  */
22470 8129 : gcc_assert (count & 3);
22471 :
22472 8129 : rtx tmp = gen_reg_rtx (vmode);
22473 : /* 2 from op0 and 2 from op1. */
22474 8129 : if (count == 2)
22475 : {
   /* PERM1 gathers the two op0 selectors into slots 0-1 and the two op1
      selectors into slots 2-3.  PERM2 records, for each original
      position, the slot of TMP that now holds its element.  */
22476 : unsigned char perm2[4];
22477 18070 : for (i = 0, j = 0, k = 2; i < 4; ++i)
22478 14456 : if (d->perm[i] & 4)
22479 : {
22480 7228 : perm1[k++] = d->perm[i];
22481 7228 : perm2[i] = k - 1;
22482 : }
22483 : else
22484 : {
22485 7228 : perm1[j++] = d->perm[i];
22486 7228 : perm2[i] = j - 1;
22487 : }
22488 :
22489 : /* shufps. */
22490 7228 : ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
22491 3614 : perm1, d->nelt, false);
22492 3614 : gcc_assert (ok);
22493 3614 : if (vmode == V4SImode && TARGET_SSE2)
22494 : /* pshufd. */
22495 2058 : ok = expand_vselect (d->target, tmp,
22496 2058 : perm2, d->nelt, false);
22497 : else
22498 : {
22499 : /* shufps. */
   /* Both inputs of this shufps are TMP, so the last two selectors are
      redirected to the second copy by adding 4.  */
22500 1556 : perm2[2] += 4;
22501 1556 : perm2[3] += 4;
22502 1556 : ok = expand_vselect_vconcat (d->target, tmp, tmp,
22503 1556 : perm2, d->nelt, false);
22504 : }
22505 3614 : gcc_assert (ok);
22506 : }
22507 : /* 3 from one op and 1 from another. */
22508 : else
22509 : {
   /* 8 is an out-of-range placeholder; both indices are assigned below
      before they are used as array indices.  */
22510 22575 : unsigned pair_idx = 8, lone_idx = 8, shift;
22511 :
22512 : /* Find the lone index. */
22513 22575 : for (i = 0; i < 4; ++i)
22514 18060 : if ((d->perm[i] > 3 && count == 1)
22515 14753 : || (d->perm[i] < 4 && count == 3))
22516 18060 : lone_idx = i;
22517 :
22518 : /* When lone_idx is not 0, it must come from the second op (count == 1). */
22519 5723 : gcc_assert (count == (lone_idx ? 1 : 3));
22520 :
22521 : /* Find the pair index that sits in the same half as the lone index. */
   /* SHIFT is 0 when the lone index is in positions 0-1 and 2 when it is
      in positions 2-3; PAIR_IDX is the other position of that half.  */
22522 4515 : shift = lone_idx & 2;
22523 4515 : pair_idx = 1 - lone_idx + 2 * shift;
22524 :
22525 : /* First permute lone index and pair index into the same vector as
22526 : [ lone, lone, pair, pair ]. */
22527 9030 : perm1[1] = perm1[0]
22528 4515 : = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
22529 9030 : perm1[3] = perm1[2]
22530 4515 : = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
22531 :
22532 : /* Always put the vector that contains the lone index first. */
22533 4515 : if (count == 1)
22534 3307 : std::swap (d->op0, d->op1);
22535 :
22536 : /* shufps. */
22537 9030 : ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
22538 4515 : perm1, d->nelt, false);
22539 4515 : gcc_assert (ok);
22540 :
22541 : /* Refine lone and pair index to original order. */
   /* In TMP the lone element sits in slots 0-1 and the pair element in
      slots 2-3.  The values written here are lone_idx * 2 and
      pair_idx * 2.  */
22542 4515 : perm1[shift] = lone_idx << 1;
22543 4515 : perm1[shift + 1] = pair_idx << 1;
22544 :
22545 : /* Select the remaining 2 elements in another vector. */
22546 13545 : for (i = 2 - shift; i < 4 - shift; ++i)
22547 9030 : perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
22548 :
22549 : /* Adjust to original selector. */
   /* Exchanges the operands of the final shufps when the lone index is
      in positions 2-3.  This overwrites D->op1.  */
22550 4515 : if (lone_idx > 1)
22551 2233 : std::swap (tmp, d->op1);
22552 :
22553 : /* shufps. */
22554 9030 : ok = expand_vselect_vconcat (d->target, tmp, d->op1,
22555 4515 : perm1, d->nelt, false);
22556 :
22557 4515 : gcc_assert (ok);
22558 : }
22559 :
22560 : return true;
22561 : }
22562 :
22563 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
22564 : in terms of a pair of pshuflw + pshufhw instructions. */
22565 :
22566 : static bool
22567 101362 : expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
22568 : {
22569 101362 : unsigned char perm2[MAX_VECT_LEN];
22570 101362 : unsigned i;
22571 101362 : bool ok;
22572 :
22573 101362 : if (d->vmode != V8HImode || !d->one_operand_p)
22574 : return false;
22575 :
22576 : /* The two permutations only operate in 64-bit lanes. */
22577 12859 : for (i = 0; i < 4; ++i)
22578 10382 : if (d->perm[i] >= 4)
22579 : return false;
22580 12329 : for (i = 4; i < 8; ++i)
22581 9866 : if (d->perm[i] < 4)
22582 : return false;
22583 :
22584 2463 : if (d->testing_p)
22585 : return true;
22586 :
22587 : /* Emit the pshuflw. */
22588 134 : memcpy (perm2, d->perm, 4);
22589 670 : for (i = 4; i < 8; ++i)
22590 536 : perm2[i] = i;
22591 134 : ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
22592 134 : gcc_assert (ok);
22593 :
22594 : /* Emit the pshufhw. */
22595 134 : memcpy (perm2 + 4, d->perm + 4, 4);
22596 670 : for (i = 0; i < 4; ++i)
22597 536 : perm2[i] = i;
22598 134 : ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
22599 134 : gcc_assert (ok);
22600 :
22601 : return true;
22602 : }
22603 :
22604 : /* Try to permute 2 64-bit vectors by punpckldq + 128-bit vector shuffle. */
22605 : static bool
22606 49667 : expand_vec_perm_punpckldq_pshuf (struct expand_vec_perm_d *d)
22607 : {
22608 49667 : if (GET_MODE_BITSIZE (d->vmode) != 64
22609 15989 : || !TARGET_MMX_WITH_SSE
22610 65656 : || d->one_operand_p)
22611 : return false;
22612 :
22613 14579 : machine_mode widen_vmode;
22614 14579 : switch (d->vmode)
22615 : {
22616 : /* pshufd. */
22617 : case E_V2SImode:
22618 : widen_vmode = V4SImode;
22619 : break;
22620 :
22621 : /* pshufd. */
22622 1122 : case E_V2SFmode:
22623 1122 : widen_vmode = V4SFmode;
22624 1122 : break;
22625 :
22626 4979 : case E_V4HImode:
22627 4979 : widen_vmode = V8HImode;
22628 : /* pshufb. */
22629 4979 : if (!TARGET_SSSE3)
22630 : return false;
22631 : break;
22632 :
22633 5987 : case E_V8QImode:
22634 : /* pshufb. */
22635 5987 : widen_vmode = V16QImode;
22636 5987 : if (!TARGET_SSSE3)
22637 : return false;
22638 : break;
22639 :
22640 : default:
22641 : return false;
22642 : }
22643 :
22644 5682 : if (d->testing_p)
22645 : return true;
22646 :
22647 370 : struct expand_vec_perm_d dperm;
22648 370 : dperm.target = gen_reg_rtx (widen_vmode);
22649 370 : rtx op0 = gen_reg_rtx (widen_vmode);
22650 370 : emit_move_insn (op0, gen_rtx_VEC_CONCAT (widen_vmode, d->op0, d->op1));
22651 370 : dperm.op0 = op0;
22652 370 : dperm.op1 = op0;
22653 370 : dperm.vmode = widen_vmode;
22654 370 : unsigned nelt = GET_MODE_NUNITS (widen_vmode);
22655 370 : dperm.nelt = nelt;
22656 370 : dperm.one_operand_p = true;
22657 370 : dperm.testing_p = false;
22658 :
22659 1996 : for (unsigned i = 0; i != nelt / 2; i++)
22660 : {
22661 1626 : dperm.perm[i] = d->perm[i];
22662 1626 : dperm.perm[i + nelt / 2] = d->perm[i];
22663 : }
22664 :
22665 370 : gcc_assert (expand_vec_perm_1 (&dperm));
22666 370 : emit_move_insn (d->target, lowpart_subreg (d->vmode,
22667 : dperm.target,
22668 : dperm.vmode));
22669 370 : return true;
22670 : }
22671 :
22672 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
22673 : the permutation using the SSSE3 palignr instruction. This succeeds
22674 : when all of the elements in PERM fit within one vector and we merely
22675 : need to shift them down so that a single vector permutation has a
22676 : chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
22677 : the vpalignr instruction itself can perform the requested permutation.

   D itself is not modified; all rewriting happens in the local copy
   DCOPY.  Returns true if the permutation was (or, when D->testing_p is
   set, can be) handled this way and false otherwise.  For 32-byte modes
   the follow-up single-operand permutation may fail, in which case false
   is returned even though the alignment insn has already been emitted
   (see the final gcc_assert).  */
22678 :
22679 : static bool
22680 222364 : expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
22681 : {
22682 222364 : unsigned i, nelt = d->nelt;
22683 222364 : unsigned min, max, minswap, maxswap;
22684 222364 : bool in_order, ok, swap = false;
22685 222364 : rtx shift, target;
22686 222364 : struct expand_vec_perm_d dcopy;
22687 :
22688 : /* Even with AVX, palignr only operates on 128-bit vectors,
22689 : in AVX2 palignr operates on both 128-bit lanes. */
22690 117576 : if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
22691 264786 : && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
22692 : return false;
22693 :
   /* MIN/MAX track the smallest and largest selector with the operands in
      their given order; MINSWAP/MAXSWAP track the same for the selectors
      as they would be with op0 and op1 exchanged.  */
22694 34464 : min = 2 * nelt;
22695 34464 : max = 0;
22696 34464 : minswap = 2 * nelt;
22697 34464 : maxswap = 0;
22698 239124 : for (i = 0; i < nelt; ++i)
22699 : {
22700 204660 : unsigned e = d->perm[i];
22701 204660 : unsigned eswap = d->perm[i] ^ nelt;
22702 409320 : if (GET_MODE_SIZE (d->vmode) == 32)
22703 : {
   /* For 32-byte vectors fold the selector into a per-lane form: keep
      the position within the 128-bit lane (low bits) and move the
      operand bit (value nelt) down to value nelt / 2; the bit that
      identifies the lane is dropped.  */
22704 70000 : e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
22705 70000 : eswap = e ^ (nelt / 2);
22706 : }
22707 204660 : if (e < min)
22708 : min = e;
22709 204660 : if (e > max)
22710 : max = e;
22711 204660 : if (eswap < minswap)
22712 : minswap = eswap;
22713 204660 : if (eswap > maxswap)
22714 : maxswap = eswap;
22715 : }
   /* The given operand order is unusable when no shift is needed
      (min == 0) or when the selectors span a full vector (a full lane for
      32-byte modes) or more.  In that case try the exchanged order, which
      is only possible for a two-operand permutation.  */
22716 34464 : if (min == 0
22717 50381 : || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
22718 : {
22719 31264 : if (d->one_operand_p
22720 30995 : || minswap == 0
22721 66970 : || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
22722 17853 : ? nelt / 2 : nelt))
22723 : return false;
22724 : swap = true;
22725 : min = minswap;
22726 6420 : max = maxswap;
22727 : }
22728 :
22729 : /* Given that we have SSSE3, we know we'll be able to implement the
22730 : single operand permutation after the palignr with pshufb for
22731 : 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
22732 : first. */
22733 6474 : if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
22734 : return true;
22735 :
22736 6420 : dcopy = *d;
22737 6420 : if (swap)
22738 : {
   /* Exchange the inputs and retarget every selector at the other one.  */
22739 3220 : dcopy.op0 = d->op1;
22740 3220 : dcopy.op1 = d->op0;
22741 16172 : for (i = 0; i < nelt; ++i)
22742 12952 : dcopy.perm[i] ^= nelt;
22743 : }
22744 :
   /* Rewrite the selectors relative to the shifted vector.  IN_ORDER stays
      true only if the result is the identity, i.e. the shift alone
      produces the requested permutation.  */
22745 : in_order = true;
22746 32668 : for (i = 0; i < nelt; ++i)
22747 : {
22748 26248 : unsigned e = dcopy.perm[i];
22749 26248 : if (GET_MODE_SIZE (d->vmode) == 32
22750 1152 : && e >= nelt
22751 26510 : && (e & (nelt / 2 - 1)) < min)
22752 262 : e = e - min - (nelt / 2);
22753 : else
22754 25986 : e = e - min;
22755 26248 : if (e != i)
22756 19394 : in_order = false;
22757 26248 : dcopy.perm[i] = e;
22758 : }
22759 6420 : dcopy.one_operand_p = true;
22760 :
22761 6420 : if (single_insn_only_p && !in_order)
22762 : return false;
22763 :
22764 : /* For AVX2, test whether we can permute the result in one instruction. */
22765 3271 : if (d->testing_p)
22766 : {
22767 54 : if (in_order)
22768 : return true;
22769 0 : dcopy.op1 = dcopy.op0;
22770 0 : return expand_vec_perm_1 (&dcopy);
22771 : }
22772 :
   /* The shift count passed to the pattern is in bits: MIN elements times
      the element width.  Note that the patterns take op1 before op0.  */
22773 6434 : shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
22774 6434 : if (GET_MODE_SIZE (d->vmode) == 16)
22775 : {
22776 3145 : target = gen_reg_rtx (V1TImode);
22777 3145 : emit_insn (gen_ssse3_palignrv1ti (target,
22778 3145 : gen_lowpart (V1TImode, dcopy.op1),
22779 3145 : gen_lowpart (V1TImode, dcopy.op0),
22780 : shift));
22781 : }
22782 : else
22783 : {
22784 72 : target = gen_reg_rtx (V2TImode);
22785 72 : emit_insn (gen_avx2_palignrv2ti (target,
22786 72 : gen_lowpart (V2TImode, dcopy.op1),
22787 72 : gen_lowpart (V2TImode, dcopy.op0),
22788 : shift));
22789 : }
22790 :
   /* The shifted vector becomes the single input of the remaining
      permutation.  */
22791 3217 : dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
22792 :
22793 : /* Test for the degenerate case where the alignment by itself
22794 : produces the desired permutation. */
22795 3217 : if (in_order)
22796 : {
22797 70 : emit_move_insn (d->target, dcopy.op0);
22798 70 : return true;
22799 : }
22800 :
   /* Failure of the follow-up permutation is tolerated only for 32-byte
      modes; for 16-byte modes it must succeed.  */
22801 3147 : ok = expand_vec_perm_1 (&dcopy);
22802 3159 : gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
22803 :
22804 : return ok;
22805 : }
22806 :
22807 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
22808 : the permutation using the SSE4_1 pblendv instruction. Potentially
22809 : reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
22810 :
22811 : static bool
22812 89327 : expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
22813 : {
22814 89327 : unsigned i, which, nelt = d->nelt;
22815 89327 : struct expand_vec_perm_d dcopy, dcopy1;
22816 89327 : machine_mode vmode = d->vmode;
22817 89327 : bool ok;
22818 :
22819 : /* Use the same checks as in expand_vec_perm_blend. */
22820 89327 : if (d->one_operand_p)
22821 : return false;
22822 87867 : if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
22823 : ;
22824 82269 : else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
22825 : ;
22826 78994 : else if (TARGET_SSE4_1
22827 89582 : && (GET_MODE_SIZE (vmode) == 16
22828 9228 : || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
22829 2733 : || GET_MODE_SIZE (vmode) == 4))
22830 : ;
22831 : else
22832 : return false;
22833 :
22834 : /* Figure out where permutation elements stay not in their
22835 : respective lanes. */
22836 108231 : for (i = 0, which = 0; i < nelt; ++i)
22837 : {
22838 92964 : unsigned e = d->perm[i];
22839 92964 : if (e != i)
22840 127952 : which |= (e < nelt ? 1 : 2);
22841 : }
22842 : /* We can pblend the part where elements stay not in their
22843 : respective lanes only when these elements are all in one
22844 : half of a permutation.
22845 : {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
22846 : lanes, but both 8 and 9 >= 8
22847 : {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
22848 : respective lanes and 8 >= 8, but 2 not. */
22849 15267 : if (which != 1 && which != 2)
22850 : return false;
22851 3175 : if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
22852 : return true;
22853 :
22854 : /* First we apply one operand permutation to the part where
22855 : elements stay not in their respective lanes. */
22856 1958 : dcopy = *d;
22857 1958 : if (which == 2)
22858 1958 : dcopy.op0 = dcopy.op1 = d->op1;
22859 : else
22860 0 : dcopy.op0 = dcopy.op1 = d->op0;
22861 1958 : if (!d->testing_p)
22862 741 : dcopy.target = gen_reg_rtx (vmode);
22863 1958 : dcopy.one_operand_p = true;
22864 :
22865 15762 : for (i = 0; i < nelt; ++i)
22866 13804 : dcopy.perm[i] = d->perm[i] & (nelt - 1);
22867 :
22868 1958 : ok = expand_vec_perm_1 (&dcopy);
22869 3916 : if (GET_MODE_SIZE (vmode) != 16 && !ok)
22870 : return false;
22871 : else
22872 1663 : gcc_assert (ok);
22873 1663 : if (d->testing_p)
22874 : return true;
22875 :
22876 : /* Next we put permuted elements into their positions. */
22877 679 : dcopy1 = *d;
22878 679 : if (which == 2)
22879 679 : dcopy1.op1 = dcopy.target;
22880 : else
22881 0 : dcopy1.op0 = dcopy.target;
22882 :
22883 5751 : for (i = 0; i < nelt; ++i)
22884 5072 : dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
22885 :
22886 679 : ok = expand_vec_perm_blend (&dcopy1);
22887 679 : gcc_assert (ok);
22888 :
22889 : return true;
22890 : }
22891 :
   /* Forward declaration: expand_vec_perm_interleave2 below calls this
      function for 32-byte modes.  */
22892 : static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
22893 :
22894 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
22895 : a two vector permutation into a single vector permutation by using
22896 : an interleave operation to merge the vectors.

   The expansion consists of two shuffles: DREMAP merges the needed
   elements of both inputs into one temporary vector, and DFINAL is a
   one-operand permutation of that temporary producing D->target.
   Returns true if both shuffles can be done (each by expand_vec_perm_1)
   and false otherwise.  When D->testing_p is set nothing is emitted by
   this function itself: the DFINAL attempt runs inside
   start_sequence/end_sequence and the captured sequence is emitted only
   on the non-testing path.  D itself is not modified.  */
22897 :
22898 : static bool
22899 95764 : expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
22900 : {
22901 95764 : struct expand_vec_perm_d dremap, dfinal;
22902 95764 : unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
22903 95764 : unsigned HOST_WIDE_INT contents;
22904 95764 : unsigned char remap[2 * MAX_VECT_LEN];
22905 95764 : rtx_insn *seq;
22906 95764 : bool ok, same_halves = false;
22907 :
   /* 4-, 8- and 16-byte modes are handled only for two-operand
      permutations; 32-byte modes need TARGET_AVX; every other size is
      rejected.  */
22908 95764 : if (GET_MODE_SIZE (d->vmode) == 4
22909 171582 : || GET_MODE_SIZE (d->vmode) == 8
22910 231626 : || GET_MODE_SIZE (d->vmode) == 16)
22911 : {
22912 89367 : if (d->one_operand_p)
22913 : return false;
22914 : }
22915 12794 : else if (GET_MODE_SIZE (d->vmode) == 32)
22916 : {
22917 6072 : if (!TARGET_AVX)
22918 : return false;
22919 : /* For 32-byte modes allow even d->one_operand_p.
22920 : The lack of cross-lane shuffling in some instructions
22921 : might prevent a single insn shuffle. */
22922 6072 : dfinal = *d;
22923 6072 : dfinal.testing_p = true;
22924 : /* If expand_vec_perm_interleave3 can expand this into
22925 : a 3 insn sequence, give up and let it be expanded as
22926 : 3 insn sequence. While that is one insn longer,
22927 : it doesn't need a memory operand and in the common
22928 : case that both interleave low and high permutations
22929 : with the same operands are adjacent needs 4 insns
22930 : for both after CSE. */
22931 6072 : if (expand_vec_perm_interleave3 (&dfinal))
22932 : return false;
22933 : }
22934 : else
22935 : return false;
22936 :
22937 : /* Examine from whence the elements come. */
   /* CONTENTS is a bit set: bit E is set when some selector in D->perm
      equals E.  */
22938 89991 : contents = 0;
22939 675225 : for (i = 0; i < nelt; ++i)
22940 585234 : contents |= HOST_WIDE_INT_1U << d->perm[i];
22941 :
   /* REMAP[e] will hold the position that input element E occupies in the
      result of the first shuffle.  0xff marks elements that the first
      shuffle does not provide.  */
22942 89991 : memset (remap, 0xff, sizeof (remap));
22943 89991 : dremap = *d;
22944 :
22945 89991 : if (GET_MODE_SIZE (d->vmode) == 4
22946 171689 : || GET_MODE_SIZE (d->vmode) == 8)
22947 : {
22948 24739 : unsigned HOST_WIDE_INT h1, h2, h3, h4;
22949 :
22950 : /* Split the two input vectors into 4 halves. */
   /* H1/H2 cover the low/high half of the first input and H3/H4 the
      low/high half of the second input, as masks over CONTENTS.  */
22951 24739 : h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
22952 24739 : h2 = h1 << nelt2;
22953 24739 : h3 = h2 << nelt2;
22954 24739 : h4 = h3 << nelt2;
22955 :
22956 : /* If the elements from the low halves use interleave low,
22957 : and similarly for interleave high. */
22958 24739 : if ((contents & (h1 | h3)) == contents)
22959 : {
22960 : /* punpckl* */
22961 3348 : for (i = 0; i < nelt2; ++i)
22962 : {
22963 2360 : remap[i] = i * 2;
22964 2360 : remap[i + nelt] = i * 2 + 1;
22965 2360 : dremap.perm[i * 2] = i;
22966 2360 : dremap.perm[i * 2 + 1] = i + nelt;
22967 : }
22968 : }
22969 23751 : else if ((contents & (h2 | h4)) == contents)
22970 : {
22971 : /* punpckh* */
22972 2877 : for (i = 0; i < nelt2; ++i)
22973 : {
22974 2028 : remap[i + nelt2] = i * 2;
22975 2028 : remap[i + nelt + nelt2] = i * 2 + 1;
22976 2028 : dremap.perm[i * 2] = i + nelt2;
22977 2028 : dremap.perm[i * 2 + 1] = i + nelt + nelt2;
22978 : }
22979 : }
22980 : else
22981 : return false;
22982 : }
22983 130504 : else if (GET_MODE_SIZE (d->vmode) == 16)
22984 : {
22985 59398 : unsigned HOST_WIDE_INT h1, h2, h3, h4;
22986 :
22987 : /* Split the two input vectors into 4 halves. */
22988 59398 : h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
22989 59398 : h2 = h1 << nelt2;
22990 59398 : h3 = h2 << nelt2;
22991 59398 : h4 = h3 << nelt2;
22992 :
22993 : /* If the elements from the low halves use interleave low, and similarly
22994 : for interleave high. If the elements are from mis-matched halves, we
22995 : can use shufps for V4SF/V4SI or do a DImode shuffle. */
22996 59398 : if ((contents & (h1 | h3)) == contents)
22997 : {
22998 : /* punpckl* */
22999 5779 : for (i = 0; i < nelt2; ++i)
23000 : {
23001 4286 : remap[i] = i * 2;
23002 4286 : remap[i + nelt] = i * 2 + 1;
23003 4286 : dremap.perm[i * 2] = i;
23004 4286 : dremap.perm[i * 2 + 1] = i + nelt;
23005 : }
   /* Without SSE2 a V4SImode interleave is performed in V4SFmode.  */
23006 1493 : if (!TARGET_SSE2 && d->vmode == V4SImode)
23007 0 : dremap.vmode = V4SFmode;
23008 : }
23009 57905 : else if ((contents & (h2 | h4)) == contents)
23010 : {
23011 : /* punpckh* */
23012 4986 : for (i = 0; i < nelt2; ++i)
23013 : {
23014 3666 : remap[i + nelt2] = i * 2;
23015 3666 : remap[i + nelt + nelt2] = i * 2 + 1;
23016 3666 : dremap.perm[i * 2] = i + nelt2;
23017 3666 : dremap.perm[i * 2 + 1] = i + nelt + nelt2;
23018 : }
23019 1320 : if (!TARGET_SSE2 && d->vmode == V4SImode)
23020 0 : dremap.vmode = V4SFmode;
23021 : }
23022 56585 : else if ((contents & (h1 | h4)) == contents)
23023 : {
23024 : /* shufps */
   /* Low half of the first input followed by high half of the second.  */
23025 2537 : for (i = 0; i < nelt2; ++i)
23026 : {
23027 1828 : remap[i] = i;
23028 1828 : remap[i + nelt + nelt2] = i + nelt2;
23029 1828 : dremap.perm[i] = i;
23030 1828 : dremap.perm[i + nelt2] = i + nelt + nelt2;
23031 : }
23032 709 : if (nelt != 4)
23033 : {
23034 : /* shufpd */
   /* For element counts other than 4 the same half selection is expressed
      as a two-element V2DImode permutation.  */
23035 69 : dremap.vmode = V2DImode;
23036 69 : dremap.nelt = 2;
23037 69 : dremap.perm[0] = 0;
23038 69 : dremap.perm[1] = 3;
23039 : }
23040 : }
23041 55876 : else if ((contents & (h2 | h3)) == contents)
23042 : {
23043 : /* shufps */
   /* High half of the first input followed by low half of the second.  */
23044 3483 : for (i = 0; i < nelt2; ++i)
23045 : {
23046 2458 : remap[i + nelt2] = i;
23047 2458 : remap[i + nelt] = i + nelt2;
23048 2458 : dremap.perm[i] = i + nelt2;
23049 2458 : dremap.perm[i + nelt2] = i + nelt;
23050 : }
23051 1025 : if (nelt != 4)
23052 : {
23053 : /* shufpd */
23054 76 : dremap.vmode = V2DImode;
23055 76 : dremap.nelt = 2;
23056 76 : dremap.perm[0] = 1;
23057 76 : dremap.perm[1] = 2;
23058 : }
23059 : }
23060 : else
23061 : return false;
23062 : }
23063 : else
23064 : {
   /* 32-byte modes.  NZCNT counts the entries stored in NONZERO_HALVES,
      which lists the halves (0 and 1 belong to the first input, 2 and 3
      to the second) that contain at least one selected element.  */
23065 5854 : unsigned int nelt4 = nelt / 4, nzcnt = 0;
23066 5854 : unsigned HOST_WIDE_INT q[8];
23067 5854 : unsigned int nonzero_halves[4];
23068 :
23069 : /* Split the two input vectors into 8 quarters. */
23070 5854 : q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
23071 46832 : for (i = 1; i < 8; ++i)
23072 40978 : q[i] = q[0] << (nelt4 * i);
23073 29270 : for (i = 0; i < 4; ++i)
23074 23416 : if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
23075 : {
23076 21169 : nonzero_halves[nzcnt] = i;
23077 21169 : ++nzcnt;
23078 : }
23079 :
23080 5854 : if (nzcnt == 1)
23081 : {
   /* Everything comes from one half: use it for both halves of the
      remapped vector.  */
23082 215 : gcc_assert (d->one_operand_p);
23083 215 : nonzero_halves[1] = nonzero_halves[0];
23084 215 : same_halves = true;
23085 : }
23086 5639 : else if (d->one_operand_p)
23087 : {
23088 23 : gcc_assert (nonzero_halves[0] == 0);
23089 23 : gcc_assert (nonzero_halves[1] == 1);
23090 : }
23091 :
23092 5854 : if (nzcnt <= 2)
23093 : {
23094 490 : if (d->perm[0] / nelt2 == nonzero_halves[1])
23095 : {
23096 : /* Attempt to increase the likelihood that dfinal
23097 : shuffle will be intra-lane. */
23098 223 : std::swap (nonzero_halves[0], nonzero_halves[1]);
23099 : }
23100 :
23101 : /* vperm2f128 or vperm2i128. */
   /* Half nonzero_halves[0] becomes the low half and half
      nonzero_halves[1] the high half of the remapped vector.  When both
      entries are equal, the second REMAP store overwrites the first.  */
23102 3256 : for (i = 0; i < nelt2; ++i)
23103 : {
23104 2766 : remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
23105 2766 : remap[i + nonzero_halves[0] * nelt2] = i;
23106 2766 : dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
23107 2766 : dremap.perm[i] = i + nonzero_halves[0] * nelt2;
23108 : }
23109 :
   /* For every other mode the half selection is redone as an
      eight-element V8SImode permutation.  */
23110 490 : if (d->vmode != V8SFmode
23111 : && d->vmode != V4DFmode
23112 : && d->vmode != V8SImode)
23113 : {
23114 132 : dremap.vmode = V8SImode;
23115 132 : dremap.nelt = 8;
23116 660 : for (i = 0; i < 4; ++i)
23117 : {
23118 528 : dremap.perm[i] = i + nonzero_halves[0] * 4;
23119 528 : dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
23120 : }
23121 : }
23122 : }
23123 5364 : else if (d->one_operand_p)
23124 4947 : return false;
23125 5364 : else if (TARGET_AVX2
23126 2125 : && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
23127 : {
23128 : /* vpunpckl* */
   /* All selected elements lie in the even-numbered quarters.  */
23129 443 : for (i = 0; i < nelt4; ++i)
23130 : {
23131 223 : remap[i] = i * 2;
23132 223 : remap[i + nelt] = i * 2 + 1;
23133 223 : remap[i + nelt2] = i * 2 + nelt2;
23134 223 : remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
23135 223 : dremap.perm[i * 2] = i;
23136 223 : dremap.perm[i * 2 + 1] = i + nelt;
23137 223 : dremap.perm[i * 2 + nelt2] = i + nelt2;
23138 223 : dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
23139 : }
23140 : }
23141 5144 : else if (TARGET_AVX2
23142 1905 : && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
23143 : {
23144 : /* vpunpckh* */
   /* All selected elements lie in the odd-numbered quarters.  */
23145 397 : for (i = 0; i < nelt4; ++i)
23146 : {
23147 200 : remap[i + nelt4] = i * 2;
23148 200 : remap[i + nelt + nelt4] = i * 2 + 1;
23149 200 : remap[i + nelt2 + nelt4] = i * 2 + nelt2;
23150 200 : remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
23151 200 : dremap.perm[i * 2] = i + nelt4;
23152 200 : dremap.perm[i * 2 + 1] = i + nelt + nelt4;
23153 200 : dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
23154 200 : dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
23155 : }
23156 : }
23157 : else
23158 : return false;
23159 : }
23160 :
23161 : /* Use the remapping array set up above to move the elements from their
23162 : swizzled locations into their final destinations. */
23163 7291 : dfinal = *d;
23164 47767 : for (i = 0; i < nelt; ++i)
23165 : {
   /* Every selected element must have been placed by the first shuffle;
      an unset REMAP entry (0xff) would trip the assertion.  */
23166 40476 : unsigned e = remap[d->perm[i]];
23167 40476 : gcc_assert (e < nelt);
23168 : /* If same_halves is true, both halves of the remapped vector are the
23169 : same. Avoid cross-lane accesses if possible. */
23170 40476 : if (same_halves && i >= nelt2)
23171 : {
23172 792 : gcc_assert (e < nelt2);
23173 792 : dfinal.perm[i] = e + nelt2;
23174 : }
23175 : else
23176 39684 : dfinal.perm[i] = e;
23177 : }
   /* The result of the first shuffle is the only input of the second one.
      When testing, no register is created and DFINAL keeps the op0 it
      copied from D.  */
23178 7291 : if (!d->testing_p)
23179 : {
23180 2703 : dremap.target = gen_reg_rtx (dremap.vmode);
23181 2703 : dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
23182 : }
23183 7291 : dfinal.op1 = dfinal.op0;
23184 7291 : dfinal.one_operand_p = true;
23185 :
23186 : /* Test if the final remap can be done with a single insn. For V4SFmode or
23187 : V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
   /* The insns are collected into SEQ and not emitted yet, so that they
      can be dropped on failure and otherwise placed after the first
      shuffle.  */
23188 7291 : start_sequence ();
23189 7291 : ok = expand_vec_perm_1 (&dfinal);
23190 7291 : seq = end_sequence ();
23191 :
23192 7291 : if (!ok)
23193 : return false;
23194 :
23195 6269 : if (d->testing_p)
23196 : return true;
23197 :
   /* If the first shuffle runs in a different mode than D, view its
      inputs in that mode.  */
23198 2664 : if (dremap.vmode != dfinal.vmode)
23199 : {
23200 55 : dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
23201 55 : dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
23202 : }
23203 :
23204 2664 : ok = expand_vec_perm_1 (&dremap);
23205 2664 : gcc_assert (ok);
23206 :
   /* Now emit the previously captured second shuffle.  */
23207 2664 : emit_insn (seq);
23208 2664 : return true;
23209 : }
23210 :
23211 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
23212 : a single vector cross-lane permutation into vpermq followed
23213 : by any of the single insn permutations. */
23214 :
23215 : static bool
23216 89395 : expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
23217 : {
23218 89395 : struct expand_vec_perm_d dremap, dfinal;
23219 89395 : unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
23220 89395 : unsigned contents[2];
23221 89395 : bool ok;
23222 :
23223 89395 : if (!(TARGET_AVX2
23224 4080 : && (d->vmode == V32QImode || d->vmode == V16HImode)
23225 256 : && d->one_operand_p))
23226 : return false;
23227 :
23228 7 : contents[0] = 0;
23229 7 : contents[1] = 0;
23230 103 : for (i = 0; i < nelt2; ++i)
23231 : {
23232 96 : contents[0] |= 1u << (d->perm[i] / nelt4);
23233 96 : contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
23234 : }
23235 :
23236 7 : for (i = 0; i < 2; ++i)
23237 : {
23238 : unsigned int cnt = 0;
23239 21 : for (j = 0; j < 4; ++j)
23240 21 : if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
23241 : return false;
23242 : }
23243 :
23244 0 : if (d->testing_p)
23245 : return true;
23246 :
23247 0 : dremap = *d;
23248 0 : dremap.vmode = V4DImode;
23249 0 : dremap.nelt = 4;
23250 0 : dremap.target = gen_reg_rtx (V4DImode);
23251 0 : dremap.op0 = gen_lowpart (V4DImode, d->op0);
23252 0 : dremap.op1 = dremap.op0;
23253 0 : dremap.one_operand_p = true;
23254 0 : for (i = 0; i < 2; ++i)
23255 : {
23256 : unsigned int cnt = 0;
23257 0 : for (j = 0; j < 4; ++j)
23258 0 : if ((contents[i] & (1u << j)) != 0)
23259 0 : dremap.perm[2 * i + cnt++] = j;
23260 0 : for (; cnt < 2; ++cnt)
23261 0 : dremap.perm[2 * i + cnt] = 0;
23262 : }
23263 :
23264 0 : dfinal = *d;
23265 0 : dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
23266 0 : dfinal.op1 = dfinal.op0;
23267 0 : dfinal.one_operand_p = true;
23268 0 : for (i = 0, j = 0; i < nelt; ++i)
23269 : {
23270 0 : if (i == nelt2)
23271 0 : j = 2;
23272 0 : dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
23273 0 : if ((d->perm[i] / nelt4) == dremap.perm[j])
23274 : ;
23275 0 : else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
23276 0 : dfinal.perm[i] |= nelt4;
23277 : else
23278 0 : gcc_unreachable ();
23279 : }
23280 :
23281 0 : ok = expand_vec_perm_1 (&dremap);
23282 0 : gcc_assert (ok);
23283 :
23284 0 : ok = expand_vec_perm_1 (&dfinal);
23285 0 : gcc_assert (ok);
23286 :
23287 : return true;
23288 : }
23289 :
/* Forward declaration: canonicalize_perm is called by the expanders
   that follow, ahead of its definition.  */
static bool canonicalize_perm (struct expand_vec_perm_d *d);
23291 :
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to expand
   a vector permutation using two instructions, vperm2f128 resp.
   vperm2i128 followed by any single in-lane permutation.

   The function enumerates candidate lane selections PERM (0..31) for
   the first instruction, derives the second permutation each candidate
   would require, and accepts the first candidate for which
   expand_vec_perm_1 can handle that second permutation.  Returns true
   if a sequence was found (and emitted unless d->testing_p), false
   otherwise.  */

static bool
expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
  bool ok;

  /* Only 32-byte vectors; integer modes additionally need AVX2.  */
  if (!TARGET_AVX
      || GET_MODE_SIZE (d->vmode) != 32
      || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
    return false;

  /* dsecond is first used in testing mode only, to probe whether the
     second shuffle is expandable without emitting anything.  */
  dsecond = *d;
  dsecond.one_operand_p = false;
  dsecond.testing_p = true;

  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
     immediate.  For perm < 16 the second permutation uses
     d->op0 as first operand, for perm >= 16 it uses d->op1
     as first operand.  The second operand is the result of
     vperm2[fi]128.  */
  for (perm = 0; perm < 32; perm++)
    {
      /* Ignore permutations which do not move anything cross-lane.  */
      if (perm < 16)
        {
          /* The second shuffle for e.g. V4DFmode has
             0123 and ABCD operands.
             Ignore AB23, as 23 is already in the second lane
             of the first operand.  */
          if ((perm & 0xc) == (1 << 2)) continue;
          /* And 01CD, as 01 is in the first lane of the first
             operand.  */
          if ((perm & 3) == 0) continue;
          /* And 4567, as then the vperm2[fi]128 doesn't change
             anything on the original 4567 second operand.  */
          if ((perm & 0xf) == ((3 << 2) | 2)) continue;
        }
      else
        {
          /* The second shuffle for e.g. V4DFmode has
             4567 and ABCD operands.
             Ignore AB67, as 67 is already in the second lane
             of the first operand.  */
          if ((perm & 0xc) == (3 << 2)) continue;
          /* And 45CD, as 45 is in the first lane of the first
             operand.  */
          if ((perm & 3) == 2) continue;
          /* And 0123, as then the vperm2[fi]128 doesn't change
             anything on the original 0123 first operand.  */
          if ((perm & 0xf) == (1 << 2)) continue;
        }

      /* Build the second permutation for this candidate.  J is the
         index (0..3) of the half-vector, counted across both original
         operands, that element I comes from.  */
      for (i = 0; i < nelt; i++)
        {
          j = d->perm[i] / nelt2;
          /* The source half is the one this candidate selects for the
             half of the result containing I: read it from the second
             operand (indices >= nelt), at the same half as I.  */
          if (j == ((perm >> (2 * (i >= nelt2))) & 3))
            dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
          /* The source half already sits in the same half of the
             first operand of the second shuffle (op0 for perm < 16,
             op1 otherwise): read it from there.  */
          else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
            dsecond.perm[i] = d->perm[i] & (nelt - 1);
          /* Otherwise this candidate cannot supply element I.  */
          else
            break;
        }

      /* Only probe the second shuffle if every element was placed;
         the probe's insns are discarded.  */
      if (i == nelt)
        {
          start_sequence ();
          ok = expand_vec_perm_1 (&dsecond);
          end_sequence ();
        }
      else
        ok = false;

      if (ok)
        {
          if (d->testing_p)
            return true;

          /* Found a usable second shuffle.  dfirst will be
             vperm2f128 on d->op0 and d->op1.  */
          dsecond.testing_p = false;
          dfirst = *d;
          dfirst.target = gen_reg_rtx (d->vmode);
          for (i = 0; i < nelt; i++)
            dfirst.perm[i] = (i & (nelt2 - 1))
                             + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;

          canonicalize_perm (&dfirst);
          ok = expand_vec_perm_1 (&dfirst);
          gcc_assert (ok);

          /* And dsecond is some single insn shuffle, taking
             d->op0 and result of vperm2f128 (if perm < 16) or
             d->op1 and result of vperm2f128 (otherwise).  */
          if (perm >= 16)
            dsecond.op0 = dsecond.op1;
          dsecond.op1 = dfirst.target;

          ok = expand_vec_perm_1 (&dsecond);
          gcc_assert (ok);

          return true;
        }

      /* For one operand, the only useful vperm2f128 permutation is 0x01
         aka lanes swap.  */
      if (d->one_operand_p)
        return false;
    }

  return false;
}
23408 :
23409 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
23410 : a two vector permutation using 2 intra-lane interleave insns
23411 : and cross-lane shuffle for 32-byte vectors. */
23412 :
23413 : static bool
23414 33409 : expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
23415 : {
23416 33409 : unsigned i, nelt;
23417 33409 : rtx (*gen) (rtx, rtx, rtx);
23418 :
23419 33409 : if (d->one_operand_p)
23420 : return false;
23421 31267 : if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
23422 : ;
23423 24701 : else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
23424 : ;
23425 : else
23426 : return false;
23427 :
23428 8198 : nelt = d->nelt;
23429 8198 : if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
23430 : return false;
23431 8555 : for (i = 0; i < nelt; i += 2)
23432 8199 : if (d->perm[i] != d->perm[0] + i / 2
23433 7326 : || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
23434 : return false;
23435 :
23436 356 : if (d->testing_p)
23437 : return true;
23438 :
23439 56 : switch (d->vmode)
23440 : {
23441 32 : case E_V32QImode:
23442 32 : if (d->perm[0])
23443 : gen = gen_vec_interleave_highv32qi;
23444 : else
23445 16 : gen = gen_vec_interleave_lowv32qi;
23446 : break;
23447 18 : case E_V16HImode:
23448 18 : if (d->perm[0])
23449 : gen = gen_vec_interleave_highv16hi;
23450 : else
23451 9 : gen = gen_vec_interleave_lowv16hi;
23452 : break;
23453 0 : case E_V8SImode:
23454 0 : if (d->perm[0])
23455 : gen = gen_vec_interleave_highv8si;
23456 : else
23457 0 : gen = gen_vec_interleave_lowv8si;
23458 : break;
23459 4 : case E_V4DImode:
23460 4 : if (d->perm[0])
23461 : gen = gen_vec_interleave_highv4di;
23462 : else
23463 2 : gen = gen_vec_interleave_lowv4di;
23464 : break;
23465 2 : case E_V8SFmode:
23466 2 : if (d->perm[0])
23467 : gen = gen_vec_interleave_highv8sf;
23468 : else
23469 1 : gen = gen_vec_interleave_lowv8sf;
23470 : break;
23471 0 : case E_V4DFmode:
23472 0 : if (d->perm[0])
23473 : gen = gen_vec_interleave_highv4df;
23474 : else
23475 0 : gen = gen_vec_interleave_lowv4df;
23476 : break;
23477 0 : default:
23478 0 : gcc_unreachable ();
23479 : }
23480 :
23481 56 : emit_insn (gen (d->target, d->op0, d->op1));
23482 56 : return true;
23483 : }
23484 :
23485 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
23486 : a single vector permutation using a single intra-lane vector
23487 : permutation, vperm2f128 swapping the lanes and vblend* insn blending
23488 : the non-swapped and swapped vectors together. */
23489 :
23490 : static bool
23491 27199 : expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
23492 : {
23493 27199 : struct expand_vec_perm_d dfirst, dsecond;
23494 27199 : unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
23495 27199 : rtx_insn *seq;
23496 27199 : bool ok;
23497 27199 : rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
23498 :
23499 27199 : if (!TARGET_AVX
23500 2933 : || TARGET_AVX2
23501 1814 : || (d->vmode != V8SFmode && d->vmode != V4DFmode)
23502 1630 : || !d->one_operand_p)
23503 : return false;
23504 :
23505 0 : dfirst = *d;
23506 0 : for (i = 0; i < nelt; i++)
23507 0 : dfirst.perm[i] = 0xff;
23508 0 : for (i = 0, msk = 0; i < nelt; i++)
23509 : {
23510 0 : j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
23511 0 : if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
23512 : return false;
23513 0 : dfirst.perm[j] = d->perm[i];
23514 0 : if (j != i)
23515 0 : msk |= (1 << i);
23516 : }
23517 0 : for (i = 0; i < nelt; i++)
23518 0 : if (dfirst.perm[i] == 0xff)
23519 0 : dfirst.perm[i] = i;
23520 :
23521 0 : if (!d->testing_p)
23522 0 : dfirst.target = gen_reg_rtx (dfirst.vmode);
23523 :
23524 0 : start_sequence ();
23525 0 : ok = expand_vec_perm_1 (&dfirst);
23526 0 : seq = end_sequence ();
23527 :
23528 0 : if (!ok)
23529 : return false;
23530 :
23531 0 : if (d->testing_p)
23532 : return true;
23533 :
23534 0 : emit_insn (seq);
23535 :
23536 0 : dsecond = *d;
23537 0 : dsecond.op0 = dfirst.target;
23538 0 : dsecond.op1 = dfirst.target;
23539 0 : dsecond.one_operand_p = true;
23540 0 : dsecond.target = gen_reg_rtx (dsecond.vmode);
23541 0 : for (i = 0; i < nelt; i++)
23542 0 : dsecond.perm[i] = i ^ nelt2;
23543 :
23544 0 : ok = expand_vec_perm_1 (&dsecond);
23545 0 : gcc_assert (ok);
23546 :
23547 0 : blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
23548 0 : emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
23549 0 : return true;
23550 : }
23551 :
23552 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
23553 : a two vector permutation using two single vector permutations and
23554 : {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
23555 : of dfirst or dsecond is identity permutation. */
23556 :
23557 : static bool
23558 114863 : expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
23559 : {
23560 114863 : unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
23561 114863 : struct expand_vec_perm_d dfirst, dsecond, dfinal;
23562 114863 : bool ident1 = true, ident2 = true;
23563 :
23564 114863 : if (d->one_operand_p)
23565 : return false;
23566 :
23567 208572 : if (GET_MODE_SIZE (d->vmode) == 16)
23568 : {
23569 62536 : if (!TARGET_SSE)
23570 : return false;
23571 62536 : if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
23572 : return false;
23573 : }
23574 83500 : else if (GET_MODE_SIZE (d->vmode) == 32)
23575 : {
23576 7220 : if (!TARGET_AVX)
23577 : return false;
23578 7220 : if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
23579 : return false;
23580 : lane = nelt2;
23581 : }
23582 : else
23583 : return false;
23584 :
23585 231246 : for (i = 1; i < nelt; i++)
23586 198111 : if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
23587 : return false;
23588 :
23589 33135 : dfirst = *d;
23590 33135 : dsecond = *d;
23591 33135 : dfinal = *d;
23592 33135 : dfirst.op1 = dfirst.op0;
23593 33135 : dfirst.one_operand_p = true;
23594 33135 : dsecond.op0 = dsecond.op1;
23595 33135 : dsecond.one_operand_p = true;
23596 :
23597 218699 : for (i = 0; i < nelt; i++)
23598 185564 : if (d->perm[i] >= nelt)
23599 : {
23600 92782 : dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
23601 92782 : if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
23602 84306 : ident2 = false;
23603 92782 : dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
23604 92782 : = d->perm[i] - nelt;
23605 : }
23606 : else
23607 : {
23608 92782 : dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
23609 92782 : if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
23610 75948 : ident1 = false;
23611 92782 : dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
23612 : }
23613 :
23614 33135 : if (two_insn && !ident1 && !ident2)
23615 : return false;
23616 :
23617 3957 : if (!d->testing_p)
23618 : {
23619 214 : if (!ident1)
23620 144 : dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
23621 214 : if (!ident2)
23622 148 : dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
23623 214 : if (d->perm[0] >= nelt)
23624 0 : std::swap (dfinal.op0, dfinal.op1);
23625 : }
23626 :
23627 3957 : bool ok;
23628 3957 : rtx_insn *seq1 = NULL, *seq2 = NULL;
23629 :
23630 3957 : if (!ident1)
23631 : {
23632 2645 : start_sequence ();
23633 2645 : ok = expand_vec_perm_1 (&dfirst);
23634 2645 : seq1 = end_sequence ();
23635 :
23636 2645 : if (!ok)
23637 : return false;
23638 : }
23639 :
23640 2168 : if (!ident2)
23641 : {
23642 2074 : start_sequence ();
23643 2074 : ok = expand_vec_perm_1 (&dsecond);
23644 2074 : seq2 = end_sequence ();
23645 :
23646 2074 : if (!ok)
23647 : return false;
23648 : }
23649 :
23650 602 : if (d->testing_p)
23651 : return true;
23652 :
23653 680 : for (i = 0; i < nelt; i++)
23654 : {
23655 544 : dfinal.perm[i] = i / 2;
23656 544 : if (i >= lane)
23657 4 : dfinal.perm[i] += lane / 2;
23658 544 : if ((i & 1) != 0)
23659 272 : dfinal.perm[i] += nelt;
23660 : }
23661 136 : emit_insn (seq1);
23662 136 : emit_insn (seq2);
23663 136 : ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
23664 : dfinal.perm, dfinal.nelt, false);
23665 136 : gcc_assert (ok);
23666 : return true;
23667 : }
23668 :
23669 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
23670 : the permutation using two single vector permutations and the SSE4_1 pblendv
23671 : instruction. If two_insn, succeed only if one of dfirst or dsecond is
23672 : identity permutation. */
23673 :
23674 : static bool
23675 114261 : expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
23676 : {
23677 114261 : unsigned i, nelt = d->nelt;
23678 114261 : struct expand_vec_perm_d dfirst, dsecond, dfinal;
23679 114261 : machine_mode vmode = d->vmode;
23680 114261 : bool ident1 = true, ident2 = true;
23681 :
23682 : /* Use the same checks as in expand_vec_perm_blend. */
23683 114261 : if (d->one_operand_p)
23684 : return false;
23685 107653 : if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
23686 : ;
23687 101705 : else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
23688 : ;
23689 97080 : else if (TARGET_SSE4_1
23690 106873 : && (GET_MODE_SIZE (vmode) == 16
23691 8878 : || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
23692 2629 : || GET_MODE_SIZE (vmode) == 4))
23693 : ;
23694 : else
23695 : return false;
23696 :
23697 15709 : dfirst = *d;
23698 15709 : dsecond = *d;
23699 15709 : dfinal = *d;
23700 15709 : dfirst.op1 = dfirst.op0;
23701 15709 : dfirst.one_operand_p = true;
23702 15709 : dsecond.op0 = dsecond.op1;
23703 15709 : dsecond.one_operand_p = true;
23704 :
23705 116717 : for (i = 0; i < nelt; ++i)
23706 101008 : if (d->perm[i] >= nelt)
23707 : {
23708 51027 : dfirst.perm[i] = 0xff;
23709 51027 : dsecond.perm[i] = d->perm[i] - nelt;
23710 51027 : if (d->perm[i] != i + nelt)
23711 101008 : ident2 = false;
23712 : }
23713 : else
23714 : {
23715 49981 : dsecond.perm[i] = 0xff;
23716 49981 : dfirst.perm[i] = d->perm[i];
23717 49981 : if (d->perm[i] != i)
23718 101008 : ident1 = false;
23719 : }
23720 :
23721 15709 : if (two_insn && !ident1 && !ident2)
23722 : return false;
23723 :
23724 : /* For now. Ideally treat 0xff as a wildcard. */
23725 44289 : for (i = 0; i < nelt; ++i)
23726 38888 : if (dfirst.perm[i] == 0xff)
23727 : {
23728 20736 : if (GET_MODE_SIZE (vmode) == 32
23729 20736 : && dfirst.perm[i ^ (nelt / 2)] != 0xff)
23730 11732 : dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
23731 : else
23732 9004 : dfirst.perm[i] = i;
23733 : }
23734 : else
23735 : {
23736 18152 : if (GET_MODE_SIZE (vmode) == 32
23737 18152 : && dsecond.perm[i ^ (nelt / 2)] != 0xff)
23738 9964 : dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
23739 : else
23740 8188 : dsecond.perm[i] = i;
23741 : }
23742 :
23743 5401 : if (!d->testing_p)
23744 : {
23745 2169 : if (!ident1)
23746 2045 : dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
23747 2169 : if (!ident2)
23748 855 : dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
23749 : }
23750 :
23751 5401 : bool ok;
23752 5401 : rtx_insn *seq1 = NULL, *seq2 = NULL;
23753 :
23754 5401 : if (!ident1)
23755 : {
23756 4812 : start_sequence ();
23757 4812 : ok = expand_vec_perm_1 (&dfirst);
23758 4812 : seq1 = end_sequence ();
23759 :
23760 4812 : if (!ok)
23761 : return false;
23762 : }
23763 :
23764 4014 : if (!ident2)
23765 : {
23766 1133 : start_sequence ();
23767 1133 : ok = expand_vec_perm_1 (&dsecond);
23768 1133 : seq2 = end_sequence ();
23769 :
23770 1133 : if (!ok)
23771 : return false;
23772 : }
23773 :
23774 3425 : if (d->testing_p)
23775 : return true;
23776 :
23777 14047 : for (i = 0; i < nelt; ++i)
23778 12220 : dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
23779 :
23780 1827 : emit_insn (seq1);
23781 1827 : emit_insn (seq2);
23782 1827 : ok = expand_vec_perm_blend (&dfinal);
23783 1827 : gcc_assert (ok);
23784 : return true;
23785 : }
23786 :
23787 : /* A subroutine of ix86_expand_vec_perm_const_1.
23788 : Implement a permutation with psrlw, psllw and por.
23789 : It handles case:
23790 : __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
23791 : __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
23792 :
23793 : static bool
23794 26415 : expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
23795 : {
23796 26415 : unsigned i;
23797 26415 : rtx (*gen_shr) (rtx, rtx, rtx);
23798 26415 : rtx (*gen_shl) (rtx, rtx, rtx);
23799 26415 : rtx (*gen_or) (rtx, rtx, rtx);
23800 26415 : machine_mode mode = VOIDmode;
23801 :
23802 26415 : if (!TARGET_SSE2 || !d->one_operand_p)
23803 : return false;
23804 :
23805 5267 : switch (d->vmode)
23806 : {
23807 1410 : case E_V8QImode:
23808 1410 : if (!TARGET_MMX_WITH_SSE)
23809 : return false;
23810 : mode = V4HImode;
23811 : gen_shr = gen_lshrv4hi3;
23812 : gen_shl = gen_ashlv4hi3;
23813 : gen_or = gen_iorv4hi3;
23814 : break;
23815 : case E_V16QImode:
23816 : mode = V8HImode;
23817 : gen_shr = gen_lshrv8hi3;
23818 : gen_shl = gen_ashlv8hi3;
23819 : gen_or = gen_iorv8hi3;
23820 : break;
23821 : default: return false;
23822 : }
23823 :
23824 3141 : if (!rtx_equal_p (d->op0, d->op1))
23825 : return false;
23826 :
23827 12181 : for (i = 0; i < d->nelt; i += 2)
23828 10743 : if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
23829 : return false;
23830 :
23831 1438 : if (d->testing_p)
23832 : return true;
23833 :
23834 26 : rtx tmp1 = gen_reg_rtx (mode);
23835 26 : rtx tmp2 = gen_reg_rtx (mode);
23836 26 : rtx op0 = force_reg (d->vmode, d->op0);
23837 :
23838 26 : emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
23839 26 : emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
23840 26 : emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
23841 26 : emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
23842 26 : emit_insn (gen_or (tmp1, tmp1, tmp2));
23843 26 : emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
23844 :
23845 26 : return true;
23846 : }
23847 :
23848 : /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
23849 : permutation using two vperm2f128, followed by a vshufpd insn blending
23850 : the two vectors together. */
23851 :
23852 : static bool
23853 30181 : expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
23854 : {
23855 30181 : struct expand_vec_perm_d dfirst, dsecond, dthird;
23856 30181 : bool ok;
23857 :
23858 30181 : if (!TARGET_AVX || (d->vmode != V4DFmode))
23859 : return false;
23860 :
23861 1213 : if (d->testing_p)
23862 : return true;
23863 :
23864 190 : dfirst = *d;
23865 190 : dsecond = *d;
23866 190 : dthird = *d;
23867 :
23868 190 : dfirst.perm[0] = (d->perm[0] & ~1);
23869 190 : dfirst.perm[1] = (d->perm[0] & ~1) + 1;
23870 190 : dfirst.perm[2] = (d->perm[2] & ~1);
23871 190 : dfirst.perm[3] = (d->perm[2] & ~1) + 1;
23872 190 : dsecond.perm[0] = (d->perm[1] & ~1);
23873 190 : dsecond.perm[1] = (d->perm[1] & ~1) + 1;
23874 190 : dsecond.perm[2] = (d->perm[3] & ~1);
23875 190 : dsecond.perm[3] = (d->perm[3] & ~1) + 1;
23876 190 : dthird.perm[0] = (d->perm[0] % 2);
23877 190 : dthird.perm[1] = (d->perm[1] % 2) + 4;
23878 190 : dthird.perm[2] = (d->perm[2] % 2) + 2;
23879 190 : dthird.perm[3] = (d->perm[3] % 2) + 6;
23880 :
23881 190 : dfirst.target = gen_reg_rtx (dfirst.vmode);
23882 190 : dsecond.target = gen_reg_rtx (dsecond.vmode);
23883 190 : dthird.op0 = dfirst.target;
23884 190 : dthird.op1 = dsecond.target;
23885 190 : dthird.one_operand_p = false;
23886 :
23887 190 : canonicalize_perm (&dfirst);
23888 190 : canonicalize_perm (&dsecond);
23889 :
23890 190 : ok = expand_vec_perm_1 (&dfirst)
23891 190 : && expand_vec_perm_1 (&dsecond)
23892 380 : && expand_vec_perm_1 (&dthird);
23893 :
23894 0 : gcc_assert (ok);
23895 :
23896 : return true;
23897 : }
23898 :
/* Forward declaration: ix86_expand_vec_perm_const_1 is called
   recursively by expand_vec_perm2_vperm2f128_vblend below.  */
static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
23900 :
23901 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
23902 : a two vector permutation using two intra-lane vector
23903 : permutations, vperm2f128 swapping the lanes and vblend* insn blending
23904 : the non-swapped and swapped vectors together. */
23905 :
23906 : static bool
23907 15639 : expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
23908 : {
23909 15639 : struct expand_vec_perm_d dfirst, dsecond, dthird;
23910 15639 : unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
23911 15639 : rtx_insn *seq1, *seq2;
23912 15639 : bool ok;
23913 15639 : rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
23914 :
23915 15639 : if (!TARGET_AVX
23916 794 : || TARGET_AVX2
23917 530 : || (d->vmode != V8SFmode && d->vmode != V4DFmode)
23918 403 : || d->one_operand_p)
23919 : return false;
23920 :
23921 403 : dfirst = *d;
23922 403 : dsecond = *d;
23923 3627 : for (i = 0; i < nelt; i++)
23924 : {
23925 3224 : dfirst.perm[i] = 0xff;
23926 3224 : dsecond.perm[i] = 0xff;
23927 : }
23928 3627 : for (i = 0, msk = 0; i < nelt; i++)
23929 : {
23930 3224 : j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
23931 3224 : if (j == i)
23932 : {
23933 2498 : dfirst.perm[j] = d->perm[i];
23934 4322 : which1 |= (d->perm[i] < nelt ? 1 : 2);
23935 : }
23936 : else
23937 : {
23938 726 : dsecond.perm[j] = d->perm[i];
23939 726 : which2 |= (d->perm[i] < nelt ? 1 : 2);
23940 726 : msk |= (1U << i);
23941 : }
23942 : }
23943 403 : if (msk == 0 || msk == (1U << nelt) - 1)
23944 : return false;
23945 :
23946 403 : if (!d->testing_p)
23947 : {
23948 40 : dfirst.target = gen_reg_rtx (dfirst.vmode);
23949 40 : dsecond.target = gen_reg_rtx (dsecond.vmode);
23950 : }
23951 :
23952 3627 : for (i = 0; i < nelt; i++)
23953 : {
23954 3224 : if (dfirst.perm[i] == 0xff)
23955 726 : dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
23956 3224 : if (dsecond.perm[i] == 0xff)
23957 2498 : dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
23958 : }
23959 403 : canonicalize_perm (&dfirst);
23960 403 : start_sequence ();
23961 403 : ok = ix86_expand_vec_perm_const_1 (&dfirst);
23962 403 : seq1 = end_sequence ();
23963 :
23964 403 : if (!ok)
23965 : return false;
23966 :
23967 403 : canonicalize_perm (&dsecond);
23968 403 : start_sequence ();
23969 403 : ok = ix86_expand_vec_perm_const_1 (&dsecond);
23970 403 : seq2 = end_sequence ();
23971 :
23972 403 : if (!ok)
23973 : return false;
23974 :
23975 403 : if (d->testing_p)
23976 : return true;
23977 :
23978 40 : emit_insn (seq1);
23979 40 : emit_insn (seq2);
23980 :
23981 40 : dthird = *d;
23982 40 : dthird.op0 = dsecond.target;
23983 40 : dthird.op1 = dsecond.target;
23984 40 : dthird.one_operand_p = true;
23985 40 : dthird.target = gen_reg_rtx (dthird.vmode);
23986 360 : for (i = 0; i < nelt; i++)
23987 320 : dthird.perm[i] = i ^ nelt2;
23988 :
23989 40 : ok = expand_vec_perm_1 (&dthird);
23990 40 : gcc_assert (ok);
23991 :
23992 40 : blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
23993 40 : emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
23994 40 : return true;
23995 : }
23996 :
23997 : /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
23998 : permutation with two pshufb insns and an ior. We should have already
23999 : failed all two instruction sequences. */
24000 :
24001 : static bool
24002 28989 : expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
24003 : {
24004 28989 : rtx rperm[2][16], vperm, l, h, op, m128;
24005 28989 : unsigned int i, nelt, eltsz;
24006 28989 : machine_mode mode;
24007 28989 : rtx (*gen) (rtx, rtx, rtx);
24008 :
24009 33469 : if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
24010 8870 : && GET_MODE_SIZE (d->vmode) != 8
24011 8830 : && GET_MODE_SIZE (d->vmode) != 4))
24012 : return false;
24013 1409 : gcc_assert (!d->one_operand_p);
24014 :
24015 1409 : if (d->testing_p)
24016 : return true;
24017 :
24018 202 : switch (GET_MODE_SIZE (d->vmode))
24019 : {
24020 : case 4:
24021 : mode = V4QImode;
24022 : gen = gen_mmx_pshufbv4qi3;
24023 : break;
24024 20 : case 8:
24025 20 : mode = V8QImode;
24026 20 : gen = gen_mmx_pshufbv8qi3;
24027 20 : break;
24028 45 : case 16:
24029 45 : mode = V16QImode;
24030 45 : gen = gen_ssse3_pshufbv16qi3;
24031 45 : break;
24032 0 : default:
24033 0 : gcc_unreachable ();
24034 : }
24035 :
24036 101 : nelt = d->nelt;
24037 101 : eltsz = GET_MODE_UNIT_SIZE (d->vmode);
24038 :
24039 : /* Generate two permutation masks. If the required element is within
24040 : the given vector it is shuffled into the proper lane. If the required
24041 : element is in the other vector, force a zero into the lane by setting
24042 : bit 7 in the permutation mask. */
24043 101 : m128 = GEN_INT (-128);
24044 1029 : for (i = 0; i < nelt; ++i)
24045 : {
24046 928 : unsigned j, k, e = d->perm[i];
24047 928 : unsigned which = (e >= nelt);
24048 928 : if (e >= nelt)
24049 480 : e -= nelt;
24050 :
24051 1952 : for (j = 0; j < eltsz; ++j)
24052 : {
24053 1024 : rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
24054 1024 : rperm[1-which][i*eltsz + j] = m128;
24055 : }
24056 :
24057 9024 : for (k = i*eltsz + j; k < 16; ++k)
24058 8096 : rperm[0][k] = rperm[1][k] = m128;
24059 : }
24060 :
24061 101 : vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
24062 101 : vperm = force_reg (V16QImode, vperm);
24063 :
24064 101 : l = gen_reg_rtx (mode);
24065 101 : op = gen_lowpart (mode, d->op0);
24066 101 : emit_insn (gen (l, op, vperm));
24067 :
24068 101 : vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
24069 101 : vperm = force_reg (V16QImode, vperm);
24070 :
24071 101 : h = gen_reg_rtx (mode);
24072 101 : op = gen_lowpart (mode, d->op1);
24073 101 : emit_insn (gen (h, op, vperm));
24074 :
24075 101 : op = d->target;
24076 101 : if (d->vmode != mode)
24077 22 : op = gen_reg_rtx (mode);
24078 101 : ix86_emit_vec_binop (IOR, mode, op, l, h);
24079 101 : if (op != d->target)
24080 22 : emit_move_insn (d->target, gen_lowpart (d->vmode, op));
24081 :
24082 : return true;
24083 : }
24084 :
/* Implement arbitrary permutation of one V32QImode or V16HImode operand
   with two vpshufb insns, vpermq and vpor.  We should have already failed
   all two or three instruction sequences.

   Requires AVX2 and a one-operand permutation; returns false otherwise.
   In testing mode any such permutation is reported as supported.  */

static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      /* E is the element's index within its lane.  WHICH is nonzero
         (the byte size of a lane) iff source and destination are in
         different lanes.  */
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
        {
          /* rperm[0] is the same-lane mask, rperm[1] the cross-lane
             mask; cross-lane selectors are stored at the byte position
             in the other lane.  The complementary mask gets the
             "force zero" selector at the matching position.  */
          rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
          rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
        }
    }

  /* H: elements that have to cross lanes, still in their source lane.  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-bit lanes of h into hp.  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
                                  const1_rtx));

  /* L: elements that stay within their lane.  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  /* Combine both parts; go through a V32QImode temporary when the
     target has a different mode.  */
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
24155 :
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V32QImode or V16HImode operands
   with two vpshufb insns, vpor and vpermq.  We should have already
   failed all two or three instruction sequences.  */
24160 :
24161 : static bool
24162 23888 : expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
24163 : {
24164 23888 : rtx rperm[2][32], vperm, l, h, ior, op, m128;
24165 23888 : unsigned int i, nelt, eltsz;
24166 :
24167 23888 : if (!TARGET_AVX2
24168 367 : || d->one_operand_p
24169 202 : || (d->vmode != V32QImode && d->vmode != V16HImode))
24170 : return false;
24171 :
24172 112 : for (i = 0; i < d->nelt; ++i)
24173 112 : if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
24174 : return false;
24175 :
24176 0 : if (d->testing_p)
24177 : return true;
24178 :
24179 0 : nelt = d->nelt;
24180 0 : eltsz = GET_MODE_UNIT_SIZE (d->vmode);
24181 :
24182 : /* Generate two permutation masks. In the first permutation mask
24183 : the first quarter will contain indexes for the first half
24184 : of the op0, the second quarter will contain bit 7 set, third quarter
24185 : will contain indexes for the second half of the op0 and the
24186 : last quarter bit 7 set. In the second permutation mask
24187 : the first quarter will contain bit 7 set, the second quarter
24188 : indexes for the first half of the op1, the third quarter bit 7 set
24189 : and last quarter indexes for the second half of the op1.
24190 : I.e. the first mask e.g. for V32QImode extract even will be:
24191 : 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
24192 : (all values masked with 0xf except for -128) and second mask
24193 : for extract even will be
24194 : -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
24195 0 : m128 = GEN_INT (-128);
24196 0 : for (i = 0; i < nelt; ++i)
24197 : {
24198 0 : unsigned j, e = d->perm[i] & (nelt / 2 - 1);
24199 0 : unsigned which = d->perm[i] >= nelt;
24200 0 : unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
24201 :
24202 0 : for (j = 0; j < eltsz; ++j)
24203 : {
24204 0 : rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
24205 0 : rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
24206 : }
24207 : }
24208 :
24209 0 : vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
24210 0 : vperm = force_reg (V32QImode, vperm);
24211 :
24212 0 : l = gen_reg_rtx (V32QImode);
24213 0 : op = gen_lowpart (V32QImode, d->op0);
24214 0 : emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
24215 :
24216 0 : vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
24217 0 : vperm = force_reg (V32QImode, vperm);
24218 :
24219 0 : h = gen_reg_rtx (V32QImode);
24220 0 : op = gen_lowpart (V32QImode, d->op1);
24221 0 : emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
24222 :
24223 0 : ior = gen_reg_rtx (V32QImode);
24224 0 : emit_insn (gen_iorv32qi3 (ior, l, h));
24225 :
24226 : /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
24227 0 : op = gen_reg_rtx (V4DImode);
24228 0 : ior = gen_lowpart (V4DImode, ior);
24229 0 : emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
24230 : const1_rtx, GEN_INT (3)));
24231 0 : emit_move_insn (d->target, gen_lowpart (d->vmode, op));
24232 :
24233 0 : return true;
24234 : }
24235 :
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement a
   permutation (which is a blend) with and, andnot and or when pshufb is
   not available.

   It handles case:
   __builtin_shufflevector (v1, v2, 0, 9, 2, 11, 4, 13, 6, 15);
   __builtin_shufflevector (v1, v2, 8, 1, 2, 11, 4, 13, 6, 15);

   An element[i] must be chosen between op0[i] and op1[i] to satisfy the
   requirement.
 */
24246 :
24247 : static bool
24248 24977 : expand_vec_perm_pand_pandn_por (struct expand_vec_perm_d *d)
24249 : {
24250 24977 : rtx rperm[16], vperm;
24251 24977 : unsigned int i, nelt = d->nelt;
24252 :
24253 24977 : if (!TARGET_SSE2
24254 24977 : || d->one_operand_p
24255 21148 : || (d->vmode != V16QImode && d->vmode != V8HImode))
24256 : return false;
24257 :
24258 7743 : if (d->perm[0] != 0)
24259 : return false;
24260 :
24261 : /* The dest[i] must select an element between op0[i] and op1[i]. */
24262 16310 : for (i = 1; i < nelt; i++)
24263 15240 : if ((d->perm[i] % nelt) != i)
24264 : return false;
24265 :
24266 1070 : if (d->testing_p)
24267 : return true;
24268 :
24269 : /* Generates a blend mask for the operators AND and ANDNOT. */
24270 121 : machine_mode inner_mode = GET_MODE_INNER (d->vmode);
24271 1337 : for (i = 0; i < nelt; i++)
24272 1790 : rperm[i] = (d->perm[i] < nelt) ? CONSTM1_RTX (inner_mode)
24273 574 : : CONST0_RTX (inner_mode);
24274 :
24275 121 : vperm = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (nelt, rperm));
24276 121 : vperm = force_reg (d->vmode, vperm);
24277 :
24278 121 : ix86_expand_sse_movcc (d->target, vperm, d->op0, d->op1);
24279 :
24280 121 : return true;
24281 : }
24282 :
24283 : /* Implement permutation with pslldq + psrldq + por when pshufb is not
24284 : available. */
24285 : static bool
24286 43553 : expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
24287 : {
24288 43553 : unsigned i, nelt = d->nelt;
24289 43553 : unsigned start1, end1 = -1;
24290 43553 : machine_mode vmode = d->vmode, imode;
24291 43553 : int start2 = -1;
24292 43553 : bool clear_op0, clear_op1;
24293 43553 : unsigned inner_size;
24294 43553 : rtx op0, op1, dop1;
24295 43553 : rtx (*gen_vec_shr) (rtx, rtx, rtx);
24296 43553 : rtx (*gen_vec_shl) (rtx, rtx, rtx);
24297 :
24298 : /* pshufd can be used for V4SI/V2DI under TARGET_SSE2. */
24299 43553 : if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
24300 : return false;
24301 :
24302 13800 : start1 = d->perm[0];
24303 38746 : for (i = 1; i < nelt; i++)
24304 : {
24305 37934 : if (d->perm[i] != d->perm[i-1] + 1
24306 11392 : || d->perm[i] == nelt)
24307 : {
24308 26788 : if (start2 == -1)
24309 : {
24310 13800 : start2 = d->perm[i];
24311 13800 : end1 = d->perm[i-1];
24312 : }
24313 : else
24314 : return false;
24315 : }
24316 : }
24317 :
24318 812 : clear_op0 = end1 != nelt - 1;
24319 812 : clear_op1 = start2 % nelt != 0;
24320 : /* pandn/pand is needed to clear upper/lower bits of op0/op1. */
24321 812 : if (!pandn && (clear_op0 || clear_op1))
24322 : return false;
24323 :
24324 523 : if (d->testing_p)
24325 : return true;
24326 :
24327 65 : gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
24328 24 : gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
24329 65 : imode = GET_MODE_INNER (vmode);
24330 65 : inner_size = GET_MODE_BITSIZE (imode);
24331 65 : op0 = gen_reg_rtx (vmode);
24332 65 : op1 = gen_reg_rtx (vmode);
24333 :
24334 65 : if (start1)
24335 61 : emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
24336 : else
24337 4 : emit_move_insn (op0, d->op0);
24338 :
24339 65 : dop1 = d->op1;
24340 65 : if (d->one_operand_p)
24341 44 : dop1 = d->op0;
24342 :
24343 65 : int shl_offset = end1 - start1 + 1 - start2 % nelt;
24344 65 : if (shl_offset)
24345 45 : emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
24346 : else
24347 20 : emit_move_insn (op1, dop1);
24348 :
24349 : /* Clear lower/upper bits for op0/op1. */
24350 65 : if (clear_op0 || clear_op1)
24351 : {
24352 : rtx vec[16];
24353 : rtx const_vec;
24354 : rtx clear;
24355 664 : for (i = 0; i != nelt; i++)
24356 : {
24357 616 : if (i < (end1 - start1 + 1))
24358 251 : vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
24359 : else
24360 365 : vec[i] = CONST0_RTX (imode);
24361 : }
24362 48 : const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
24363 48 : const_vec = validize_mem (force_const_mem (vmode, const_vec));
24364 48 : clear = force_reg (vmode, const_vec);
24365 :
24366 48 : if (clear_op0)
24367 40 : emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
24368 48 : if (clear_op1)
24369 36 : emit_move_insn (op1, gen_rtx_AND (vmode,
24370 : gen_rtx_NOT (vmode, clear),
24371 : op1));
24372 : }
24373 :
24374 65 : emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
24375 65 : return true;
24376 : }
24377 :
/* A subroutine of expand_vec_perm_even_odd_1.  Implement extract-even
   and extract-odd permutations of two V4HI, V8QI, V8HI, V16QI, V16HI or
   V32QI operands with two "and" and "pack" or two "shift" and "pack"
   insns.  We should have already failed all two instruction sequences.  */
24382 :
24383 : static bool
24384 46304 : expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
24385 : {
24386 46304 : rtx op, dop0, dop1, t;
24387 46304 : unsigned i, odd, c, s, nelt = d->nelt;
24388 46304 : int pblendw_i = 0;
24389 46304 : bool end_perm = false;
24390 46304 : machine_mode half_mode;
24391 46304 : rtx (*gen_and) (rtx, rtx, rtx);
24392 46304 : rtx (*gen_pack) (rtx, rtx, rtx);
24393 46304 : rtx (*gen_shift) (rtx, rtx, rtx);
24394 :
24395 46304 : if (d->one_operand_p)
24396 : return false;
24397 :
24398 40994 : switch (d->vmode)
24399 : {
24400 4471 : case E_V4HImode:
24401 : /* Required for "pack". */
24402 4471 : if (!TARGET_SSE4_1)
24403 : return false;
24404 : c = 0xffff;
24405 : s = 16;
24406 : half_mode = V2SImode;
24407 : gen_and = gen_andv2si3;
24408 : gen_pack = gen_mmx_packusdw;
24409 : gen_shift = gen_lshrv2si3;
24410 : pblendw_i = 0x5;
24411 : break;
24412 5931 : case E_V8HImode:
24413 : /* Required for "pack". */
24414 5931 : if (!TARGET_SSE4_1)
24415 : return false;
24416 : c = 0xffff;
24417 : s = 16;
24418 : half_mode = V4SImode;
24419 : gen_and = gen_andv4si3;
24420 : gen_pack = gen_sse4_1_packusdw;
24421 : gen_shift = gen_lshrv4si3;
24422 : pblendw_i = 0x55;
24423 : break;
24424 : case E_V8QImode:
24425 : /* No check as all instructions are SSE2. */
24426 : c = 0xff;
24427 : s = 8;
24428 : half_mode = V4HImode;
24429 : gen_and = gen_andv4hi3;
24430 : gen_pack = gen_mmx_packuswb;
24431 : gen_shift = gen_lshrv4hi3;
24432 : break;
24433 14217 : case E_V16QImode:
24434 : /* No check as all instructions are SSE2. */
24435 14217 : c = 0xff;
24436 14217 : s = 8;
24437 14217 : half_mode = V8HImode;
24438 14217 : gen_and = gen_andv8hi3;
24439 14217 : gen_pack = gen_sse2_packuswb;
24440 14217 : gen_shift = gen_lshrv8hi3;
24441 14217 : break;
24442 440 : case E_V16HImode:
24443 440 : if (!TARGET_AVX2)
24444 : return false;
24445 : c = 0xffff;
24446 : s = 16;
24447 : half_mode = V8SImode;
24448 : gen_and = gen_andv8si3;
24449 : gen_pack = gen_avx2_packusdw;
24450 : gen_shift = gen_lshrv8si3;
24451 : pblendw_i = 0x5555;
24452 : end_perm = true;
24453 : break;
24454 276 : case E_V32QImode:
24455 276 : if (!TARGET_AVX2)
24456 : return false;
24457 : c = 0xff;
24458 : s = 8;
24459 : half_mode = V16HImode;
24460 : gen_and = gen_andv16hi3;
24461 : gen_pack = gen_avx2_packuswb;
24462 : gen_shift = gen_lshrv16hi3;
24463 : end_perm = true;
24464 : break;
24465 : default:
24466 : /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
24467 : are more profitable than general shuffles. */
24468 : return false;
24469 : }
24470 :
24471 : /* Check that permutation is even or odd. */
24472 20066 : odd = d->perm[0];
24473 20066 : if (odd > 1)
24474 : return false;
24475 :
24476 229473 : for (i = 1; i < nelt; ++i)
24477 213410 : if (d->perm[i] != 2 * i + odd)
24478 : return false;
24479 :
24480 16063 : if (d->testing_p)
24481 : return true;
24482 :
24483 5511 : dop0 = gen_reg_rtx (half_mode);
24484 5511 : dop1 = gen_reg_rtx (half_mode);
24485 5511 : if (odd == 0)
24486 : {
24487 : /* Use pblendw since const_vector 0 should be cheaper than
24488 : const_vector 0xffff. */
24489 4789 : if (d->vmode == V4HImode
24490 : || d->vmode == E_V8HImode
24491 : || d->vmode == E_V16HImode)
24492 : {
24493 872 : rtx dop0_t = gen_reg_rtx (d->vmode);
24494 872 : rtx dop1_t = gen_reg_rtx (d->vmode);
24495 872 : t = gen_reg_rtx (d->vmode);
24496 872 : emit_move_insn (t, CONST0_RTX (d->vmode));
24497 :
24498 872 : emit_move_insn (dop0_t, gen_rtx_VEC_MERGE (d->vmode, d->op0, t,
24499 : GEN_INT (pblendw_i)));
24500 872 : emit_move_insn (dop1_t, gen_rtx_VEC_MERGE (d->vmode, d->op1, t,
24501 : GEN_INT (pblendw_i)));
24502 :
24503 872 : emit_move_insn (dop0, gen_lowpart (half_mode, dop0_t));
24504 872 : emit_move_insn (dop1, gen_lowpart (half_mode, dop1_t));
24505 872 : }
24506 : else
24507 : {
24508 3917 : t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
24509 3917 : t = force_reg (half_mode, t);
24510 3917 : emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
24511 3917 : emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
24512 : }
24513 : }
24514 : else
24515 : {
24516 1444 : emit_insn (gen_shift (dop0,
24517 722 : gen_lowpart (half_mode, d->op0),
24518 : GEN_INT (s)));
24519 1444 : emit_insn (gen_shift (dop1,
24520 722 : gen_lowpart (half_mode, d->op1),
24521 : GEN_INT (s)));
24522 : }
24523 : /* In AVX2 for 256 bit case we need to permute pack result. */
24524 5511 : if (TARGET_AVX2 && end_perm)
24525 : {
24526 419 : op = gen_reg_rtx (d->vmode);
24527 419 : t = gen_reg_rtx (V4DImode);
24528 419 : emit_insn (gen_pack (op, dop0, dop1));
24529 838 : emit_insn (gen_avx2_permv4di_1 (t,
24530 419 : gen_lowpart (V4DImode, op),
24531 : const0_rtx,
24532 : const2_rtx,
24533 : const1_rtx,
24534 : GEN_INT (3)));
24535 419 : emit_move_insn (d->target, gen_lowpart (d->vmode, t));
24536 : }
24537 : else
24538 5092 : emit_insn (gen_pack (d->target, dop0, dop1));
24539 :
24540 : return true;
24541 : }
24542 :
24543 : /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
24544 : and extract-odd permutations of two V64QI operands
24545 : with two "shifts", two "truncs" and one "concat" insns for "odd"
24546 : and two "truncs" and one concat insn for "even."
24547 : Have already failed all two instruction sequences. */
24548 :
24549 : static bool
24550 23932 : expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
24551 : {
24552 23932 : rtx t1, t2, t3, t4;
24553 23932 : unsigned i, odd, nelt = d->nelt;
24554 :
24555 23932 : if (!TARGET_AVX512BW
24556 74 : || d->one_operand_p
24557 38 : || d->vmode != V64QImode)
24558 : return false;
24559 :
24560 : /* Check that permutation is even or odd. */
24561 38 : odd = d->perm[0];
24562 38 : if (odd > 1)
24563 : return false;
24564 :
24565 1662 : for (i = 1; i < nelt; ++i)
24566 1637 : if (d->perm[i] != 2 * i + odd)
24567 : return false;
24568 :
24569 25 : if (d->testing_p)
24570 : return true;
24571 :
24572 :
24573 25 : if (odd)
24574 : {
24575 5 : t1 = gen_reg_rtx (V32HImode);
24576 5 : t2 = gen_reg_rtx (V32HImode);
24577 10 : emit_insn (gen_lshrv32hi3 (t1,
24578 5 : gen_lowpart (V32HImode, d->op0),
24579 : GEN_INT (8)));
24580 10 : emit_insn (gen_lshrv32hi3 (t2,
24581 5 : gen_lowpart (V32HImode, d->op1),
24582 : GEN_INT (8)));
24583 : }
24584 : else
24585 : {
24586 20 : t1 = gen_lowpart (V32HImode, d->op0);
24587 20 : t2 = gen_lowpart (V32HImode, d->op1);
24588 : }
24589 :
24590 25 : t3 = gen_reg_rtx (V32QImode);
24591 25 : t4 = gen_reg_rtx (V32QImode);
24592 25 : emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
24593 25 : emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
24594 25 : emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));
24595 :
24596 25 : return true;
24597 : }
24598 :
24599 : /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
24600 : and extract-odd permutations. */
24601 :
static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  /* Emit (or, when d->testing_p is set, merely check feasibility of) an
     extract-even (ODD == 0) or extract-odd (ODD != 0) permutation of
     d->op0 and d->op1 into d->target, dispatching on d->vmode.  The
     selector itself is not re-validated here.  Returns true on success;
     in the switch below "break" leads to the final "return true".  */
  rtx t1, t2, t3, t4, t5;

  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
	break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
	/* shufps immediate: 0x88 picks elements { 0 2 } of each source
	   lane, 0xdd picks elements { 1 3 }.  */
	int mask = odd ? 0xdd : 0x88;

	if (d->testing_p)
	  break;
	t1 = gen_reg_rtx (V8SFmode);
	t2 = gen_reg_rtx (V8SFmode);
	t3 = gen_reg_rtx (V8SFmode);

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
				      GEN_INT (mask)));

	/* Shuffle the lanes around to produce:
	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
					    GEN_INT (0x3)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

	/* Shuffle the lanes around to produce:
	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
					    GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V2SFmode:
      gcc_assert (TARGET_MMX_WITH_SSE);
      /* We have no suitable instructions.  */
      /* NOTE(review): when not testing, control falls through to the
	 final "return true" without any insn having been emitted --
	 confirm that callers never get here with !d->testing_p.  */
      if (d->testing_p)
	return false;
      break;

    case E_V4QImode:
      /* Prefer the two-pshufb sequence when pshufb is available and not
	 flagged as slow; otherwise fall back to interleaves.  */
      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave. */
	  t1 = gen_reg_rtx (V4QImode);
	  emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V4HImode:
      /* Preference order: pack-based sequence with SSE4.1, then two
	 pshufb, then plain interleaves.  */
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave. */
	  t1 = gen_reg_rtx (V4HImode);
	  emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V8HImode:
      /* Same preference order as for V4HImode.  */
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave. */
	  t1 = gen_reg_rtx (V8HImode);
	  t2 = gen_reg_rtx (V8HImode);
	  /* Three rounds of interleaving; d->target is reused as a
	     scratch register between the rounds.  */
	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
	  if (odd)
	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
	  else
	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
	  emit_insn (t3);
	}
      break;

    case E_V8QImode:
    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
	{
	  /* Without AVX2 redo the permutation in V4DFmode on a copy of
	     the descriptor and move the result back into the V4DImode
	     target.  */
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V4DFmode;
	  /* When only testing, a raw register just past the virtual
	     registers stands in for the target instead of a freshly
	     allocated pseudo.  */
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V4DFmode);
	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V4DImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an vpunpck[lh]qdq will produce the result required.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
	{
	  /* Without AVX2 redo the permutation in V8SFmode on a copy of
	     the descriptor, exactly as done for V4DImode above.  */
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V8SFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V8SFmode);
	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V8SImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.
	 The immediate encodes the selector { 0, 2, 1, 3 }, two bits per
	 element.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now an vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
					   gen_lowpart (V4DImode, t2));
      else
	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
					  gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}
24859 :
24860 : /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
24861 : extract-even and extract-odd permutations. */
24862 :
24863 : static bool
24864 23839 : expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
24865 : {
24866 23839 : unsigned i, odd, nelt = d->nelt;
24867 :
24868 23839 : odd = d->perm[0];
24869 23839 : if (odd != 0 && odd != 1)
24870 : return false;
24871 :
24872 65500 : for (i = 1; i < nelt; ++i)
24873 57550 : if (d->perm[i] != 2 * i + odd)
24874 : return false;
24875 :
24876 7950 : if (d->vmode == E_V32HImode
24877 12 : && d->testing_p
24878 12 : && !TARGET_AVX512BW)
24879 : return false;
24880 :
24881 7938 : return expand_vec_perm_even_odd_1 (d, odd);
24882 : }
24883 :
24884 : /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
24885 : permutations. We assume that expand_vec_perm_1 has already failed. */
24886 :
static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  /* Broadcast element d->perm[0] of d->op0 into every element of
     d->target, dispatching on d->vmode.  Returns true on success (or,
     with d->testing_p set, if the broadcast can be done) and false for
     modes that are left to other expanders.

     ELT is the index of the element to broadcast; NELT2 is half the
     current element count and is used to decide whether ELT lives in
     the low or the high half of the vector.  */
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  rtx (*gen) (rtx, rtx, rtx);
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
	 use the vbroadcast instruction.  They expand to two insns
	 if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2SFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
    case E_V4HImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V4QImode:
      /* This can be implemented via interleave and pshuflw.  */
      if (d->testing_p)
	return true;

      /* Interleave the half that contains ELT with itself and rebase
	 ELT to that half.  */
      if (elt >= nelt2)
	{
	  gen = gen_mmx_punpckhbw_low;
	  elt -= nelt2;
	}
      else
	gen = gen_mmx_punpcklbw_low;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen (dest, op0, op0));
      /* Continue in the vector mode with elements twice as wide.  */
      vmode = get_mode_wider_vector (vmode);
      op0 = gen_lowpart (vmode, dest);

      /* Select wide element ELT for both result positions.  */
      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8QImode:
      /* This can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V2SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      /* Each iteration interleaves the half that contains ELT with
	 itself, then reinterprets the result in the next wider element
	 mode, halving NELT2 accordingly.  */
      do
	{
	  if (elt >= nelt2)
	    {
	      gen = vmode == V8QImode ? gen_mmx_punpckhbw
				      : gen_mmx_punpckhwd;
	      elt -= nelt2;
	    }
	  else
	    gen = vmode == V8QImode ? gen_mmx_punpcklbw
				    : gen_mmx_punpcklwd;
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V2SImode);

      /* Select V2SImode element ELT for both result positions.  */
      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V4SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      /* Same widening loop as for V8QImode, using the 128-bit
	 interleave patterns.  */
      do
	{
	  if (elt >= nelt2)
	    {
	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
				       : gen_vec_interleave_highv8hi;
	      elt -= nelt2;
	    }
	  else
	    gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
				     : gen_vec_interleave_lowv8hi;
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V4SImode);

      /* Select V4SImode element ELT for all four result positions.  */
      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HFmode:
    case E_V8BFmode:
      /* This can be implemented via interleave and pshufd.  */
      if (d->testing_p)
	return true;

      /* A single interleave step suffices here; the generators take
	 the vector mode as an explicit first argument.  */
      rtx (*gen_interleave) (machine_mode, rtx, rtx, rtx);
      if (elt >= nelt2)
	{
	  gen_interleave = gen_vec_interleave_high;
	  elt -= nelt2;
	}
      else
	gen_interleave = gen_vec_interleave_low;
      nelt2 /= 2;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen_interleave (vmode, dest, op0, op0));

      /* Finish with a V4SImode element select.  */
      vmode = V4SImode;
      op0 = gen_lowpart (vmode, dest);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
	 vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    case E_V64QImode:
      /* With AVX512BW only a broadcast of a nonzero element index is
	 expected to reach this point; it is not handled here.  */
      gcc_assert (!TARGET_AVX512BW || d->perm[0]);
      return false;

    case E_V32HImode:
      /* Not expected to be reached at all when AVX512BW is enabled.  */
      gcc_assert (!TARGET_AVX512BW);
      return false;

    default:
      gcc_unreachable ();
    }
}
25061 :
25062 : /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
25063 : broadcast permutations.
   Returns false, before calling any expander, when D is not a
   one-operand permutation or when its selector entries are not all
   equal. Otherwise returns the result of
   expand_vec_perm_broadcast_1 (D). */
25064 :
25065 : static bool
25066 89495 : expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
25067 : {
25068 89495 : unsigned i, elt, nelt = d->nelt;
25069 :
25070 89495 : if (!d->one_operand_p)
25071 : return false;
25072 :
        /* A broadcast replicates one source element, so every selector
           entry must match the first one. */
25073 5414 : elt = d->perm[0];
25074 8285 : for (i = 1; i < nelt; ++i)
25075 8177 : if (d->perm[i] != elt)
25076 : return false;
25077 :
25078 108 : return expand_vec_perm_broadcast_1 (d);
25079 : }
25080 :
25081 : /* Implement arbitrary permutations of two V64QImode operands
25082 : with 2 vperm[it]2w, 2 vpshufb and one vpor instruction.
   Returns false unless TARGET_AVX512BW is set and D->vmode is
   V64QImode. When D->testing_p is set, returns true without
   emitting any insn. */
25083 : static bool
25084 23888 : expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
25085 : {
25086 23888 : if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
25087 : return false;
25088 :
25089 49 : if (d->testing_p)
25090 : return true;
25091 :
25092 49 : struct expand_vec_perm_d ds[2];
25093 49 : rtx rperm[128], vperm, target0, target1;
25094 49 : unsigned int i, nelt;
25095 49 : machine_mode vmode;
25096 :
25097 49 : nelt = d->nelt;
25098 49 : vmode = V64QImode;
25099 :
        /* Both helper permutations start as copies of D, retyped to
           V32HImode (32 word elements) with a fresh V32HImode target;
           the operands are reinterpreted through gen_lowpart. */
25100 147 : for (i = 0; i < 2; i++)
25101 : {
25102 98 : ds[i] = *d;
25103 98 : ds[i].vmode = V32HImode;
25104 98 : ds[i].nelt = 32;
25105 98 : ds[i].target = gen_reg_rtx (V32HImode);
25106 98 : ds[i].op0 = gen_lowpart (V32HImode, d->op0);
25107 98 : ds[i].op1 = gen_lowpart (V32HImode, d->op1);
25108 : }
25109 :
25110 : /* Prepare permutations such that the first one takes care of
25111 : putting the even bytes into the right positions or one higher
25112 : positions (ds[0]) and the second one takes care of
25113 : putting the odd bytes into the right positions or one below
25114 : (ds[1]). */
25115 :
        /* rperm[0..63] is the byte mask applied to ds[0]'s result and
           rperm[64..127] the one applied to ds[1]'s result. For result
           byte I, the word permutation selected by I's parity fetches
           source word d->perm[I] / 2 into word position I / 2. The
           matching mask entry then picks the low or high byte of that
           word: (I & 14) is the word's byte offset reduced to 0..15 and
           (d->perm[I] & 1) the byte within the word. The reduction to
           0..15 presumably reflects vpshufb indexing within each
           16-byte lane -- confirm against the insn description. The
           other mask gets -1 at this position; -1 has bit 7 set, which
           makes vpshufb produce a zero byte there, so the two shuffled
           vectors can simply be ORed together. */
25116 3185 : for (i = 0; i < nelt; i++)
25117 : {
25118 3136 : ds[i & 1].perm[i / 2] = d->perm[i] / 2;
25119 3136 : if (i & 1)
25120 : {
25121 1568 : rperm[i] = constm1_rtx;
25122 1568 : rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
25123 : }
25124 : else
25125 : {
25126 1568 : rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
25127 1568 : rperm[i + 64] = constm1_rtx;
25128 : }
25129 : }
25130 :
        /* Expand the two word permutations; each is asserted to succeed
           as a single-insn expansion. Their targets are then viewed as
           V64QImode for the byte shuffles. */
25131 49 : bool ok = expand_vec_perm_1 (&ds[0]);
25132 49 : gcc_assert (ok);
25133 49 : ds[0].target = gen_lowpart (V64QImode, ds[0].target);
25134 :
25135 49 : ok = expand_vec_perm_1 (&ds[1]);
25136 49 : gcc_assert (ok);
25137 49 : ds[1].target = gen_lowpart (V64QImode, ds[1].target);
25138 :
        /* Byte-shuffle the even-byte candidate with the first mask... */
25139 49 : vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
25140 49 : vperm = force_reg (vmode, vperm);
25141 49 : target0 = gen_reg_rtx (V64QImode);
25142 49 : emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));
25143 :
        /* ... and the odd-byte candidate with the second mask. */
25144 49 : vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
25145 49 : vperm = force_reg (vmode, vperm);
25146 49 : target1 = gen_reg_rtx (V64QImode);
25147 49 : emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));
25148 :
        /* Merge the two partial results into the final target. */
25149 49 : emit_insn (gen_iorv64qi3 (d->target, target0, target1));
25150 49 : return true;
25151 : }
25152 :
25153 : /* Implement arbitrary permutation of two V32QImode or V16HImode operands
25154 : with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
25155 : all the shorter instruction sequences.
   Returns false unless TARGET_AVX2 is set, D uses two operands and
   D->vmode is V32QImode or V16HImode. When D->testing_p is set,
   returns true without emitting any insn. */
25156 :
25157 : static bool
25158 15693 : expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
25159 : {
25160 15693 : rtx rperm[4][32], vperm, l[2], h[2], op, m128;
25161 15693 : unsigned int i, nelt, eltsz;
25162 15693 : bool used[4];
25163 :
25164 15693 : if (!TARGET_AVX2
25165 318 : || d->one_operand_p
25166 189 : || (d->vmode != V32QImode && d->vmode != V16HImode))
25167 : return false;
25168 :
25169 54 : if (d->testing_p)
25170 : return true;
25171 :
25172 54 : nelt = d->nelt;
25173 54 : eltsz = GET_MODE_UNIT_SIZE (d->vmode);
25174 :
25175 : /* Generate 4 permutation masks. If the required element is within
25176 : the same lane, it is shuffled in. If the required element from the
25177 : other lane, force a zero by setting bit 7 in the permutation mask.
25178 : In the other mask the mask has non-negative elements if element
25179 : is requested from the other lane, but also moved to the other lane,
25180 : so that the result of vpshufb can have the two V2TImode halves
25181 : swapped. */
        /* Mask numbering, as set up by WHICH below: 0 = op0 same-lane,
           1 = op0 cross-lane, 2 = op1 same-lane, 3 = op1 cross-lane.
           Every entry starts as -128, i.e. "produce zero". */
25182 54 : m128 = GEN_INT (-128);
25183 1836 : for (i = 0; i < 32; ++i)
25184 : {
25185 1728 : rperm[0][i] = m128;
25186 1728 : rperm[1][i] = m128;
25187 1728 : rperm[2][i] = m128;
25188 1728 : rperm[3][i] = m128;
25189 : }
25190 54 : used[0] = false;
25191 54 : used[1] = false;
25192 54 : used[2] = false;
25193 54 : used[3] = false;
25194 1590 : for (i = 0; i < nelt; ++i)
25195 : {
        /* E is the source element's index within its half of the
           vector. XLANE is nonzero when the source element and result
           position I lie in different halves; it is scaled by ELTSZ so
           that it is a byte offset, and XORing it into a byte position
           moves that position to the other half. WHICH picks the mask:
           bit 1 is set when the element comes from the second operand
           (d->perm[i] & nelt), bit 0 when it crosses halves. */
25196 1536 : unsigned j, e = d->perm[i] & (nelt / 2 - 1);
25197 1536 : unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
25198 2074 : unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);
25199 :
        /* Write one mask byte per byte of the element. Cross-lane
           entries are stored at the mirrored position in the other
           half, so the later half swap brings them to position I. */
25200 3264 : for (j = 0; j < eltsz; ++j)
25201 1728 : rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
25202 1536 : used[which] = true;
25203 : }
25204 :
        /* Cross-lane shuffles: h[0] from op0 with mask 1, h[1] from op1
           with mask 3. A mask that was never written is skipped and its
           h[] slot set to NULL_RTX. */
25205 162 : for (i = 0; i < 2; ++i)
25206 : {
25207 108 : if (!used[2 * i + 1])
25208 : {
25209 22 : h[i] = NULL_RTX;
25210 22 : continue;
25211 : }
25212 86 : vperm = gen_rtx_CONST_VECTOR (V32QImode,
25213 86 : gen_rtvec_v (32, rperm[2 * i + 1]));
25214 86 : vperm = force_reg (V32QImode, vperm);
25215 86 : h[i] = gen_reg_rtx (V32QImode);
25216 86 : op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
25217 86 : emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
25218 : }
25219 :
25220 : /* Swap the 128-bit lanes of h[X]. */
        /* Viewed as V4DImode, selecting elements 2, 3, 0, 1 exchanges
           the two halves. */
25221 162 : for (i = 0; i < 2; ++i)
25222 : {
25223 108 : if (h[i] == NULL_RTX)
25224 22 : continue;
25225 86 : op = gen_reg_rtx (V4DImode);
25226 86 : emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
25227 : const2_rtx, GEN_INT (3), const0_rtx,
25228 : const1_rtx));
25229 86 : h[i] = gen_lowpart (V32QImode, op);
25230 : }
25231 :
        /* Same-lane shuffles: l[0] from op0 with mask 0, l[1] from op1
           with mask 2, again skipping masks that were never written. */
25232 162 : for (i = 0; i < 2; ++i)
25233 : {
25234 108 : if (!used[2 * i])
25235 : {
25236 0 : l[i] = NULL_RTX;
25237 0 : continue;
25238 : }
25239 108 : vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
25240 108 : vperm = force_reg (V32QImode, vperm);
25241 108 : l[i] = gen_reg_rtx (V32QImode);
25242 108 : op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
25243 108 : emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
25244 : }
25245 :
        /* Per operand, merge the same-lane and cross-lane parts into
           l[i]; when only one part exists it is used directly. */
25246 162 : for (i = 0; i < 2; ++i)
25247 : {
25248 108 : if (h[i] && l[i])
25249 : {
25250 86 : op = gen_reg_rtx (V32QImode);
25251 86 : emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
25252 86 : l[i] = op;
25253 : }
25254 22 : else if (h[i])
25255 0 : l[i] = h[i];
25256 : }
25257 :
        /* Both operands must have contributed something. OR the two
           per-operand results; for V16HImode the OR is done in a
           V32QImode temporary and moved to the target via a lowpart. */
25258 54 : gcc_assert (l[0] && l[1]);
25259 54 : op = d->target;
25260 54 : if (d->vmode != V32QImode)
25261 12 : op = gen_reg_rtx (V32QImode);
25262 54 : emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
25263 54 : if (op != d->target)
25264 12 : emit_move_insn (d->target, gen_lowpart (d->vmode, op));
25265 : return true;
25266 : }
25267 :
25268 : /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
25269 : taken care of, perform the expansion in D and return true on success.
   The strategies below are tried strictly in the order written and
   the first one that succeeds ends the search. The section comments
   group them by the number of instructions in the sequence. Returns
   false when no strategy applies. */
25270 :
25271 : static bool
25272 332160 : ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
25273 : {
25274 : /* Try a single instruction expansion. */
25275 332160 : if (expand_vec_perm_1 (d))
25276 : return true;
25277 :
25278 : /* Try sequences of two instructions. */
25279 :
25280 101362 : if (expand_vec_perm_pshuflw_pshufhw (d))
25281 : return true;
25282 :
25283 98899 : if (expand_vec_perm_palignr (d, false))
25284 : return true;
25285 :
25286 95764 : if (expand_vec_perm_interleave2 (d))
25287 : return true;
25288 :
25289 89495 : if (expand_vec_perm_broadcast (d))
25290 : return true;
25291 :
25292 89395 : if (expand_vec_perm_vpermq_perm_1 (d))
25293 : return true;
25294 :
25295 89395 : if (expand_vec_perm_vperm2f128 (d))
25296 : return true;
25297 :
25298 89327 : if (expand_vec_perm_pblendv (d))
25299 : return true;
25300 :
25301 87664 : if (expand_vec_perm_2perm_interleave (d, true))
25302 : return true;
25303 :
25304 87302 : if (expand_vec_perm_2perm_pblendv (d, true))
25305 : return true;
25306 :
25307 84421 : if (expand_vec_perm_shufps_shufps (d))
25308 : return true;
25309 :
25310 49667 : if (expand_vec_perm_punpckldq_pshuf (d))
25311 : return true;
25312 :
25313 : /* Try sequences of three instructions. */
25314 :
25315 43985 : if (expand_vec_perm_even_odd_pack (d))
25316 : return true;
25317 :
25318 30181 : if (expand_vec_perm_2vperm2f128_vshuf (d))
25319 : return true;
25320 :
25321 28968 : if (expand_vec_perm_pshufb2 (d))
25322 : return true;
25323 :
25324 27580 : if (expand_vec_perm_pslldq_psrldq_por (d, false))
25325 : return true;
25326 :
25327 27337 : if (expand_vec_perm_interleave3 (d))
25328 : return true;
25329 :
25330 27199 : if (expand_vec_perm_vperm2f128_vblend (d))
25331 : return true;
25332 :
        /* These two helpers were already tried above with TRUE as the
           second argument; here they are retried with FALSE. */
25333 27199 : if (expand_vec_perm_2perm_interleave (d, false))
25334 : return true;
25335 :
25336 26959 : if (expand_vec_perm_2perm_pblendv (d, false))
25337 : return true;
25338 :
25339 26415 : if (expand_vec_perm_psrlw_psllw_por (d))
25340 : return true;
25341 :
25342 24977 : if (expand_vec_perm_pand_pandn_por (d))
25343 : return true;
25344 :
25345 : /* Try sequences of four instructions. */
25346 :
25347 23907 : if (expand_vec_perm_even_odd_trunc (d))
25348 : return true;
25349 23895 : if (expand_vec_perm_vpshufb2_vpermq (d))
25350 : return true;
25351 :
25352 23888 : if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
25353 : return true;
25354 :
25355 23888 : if (expand_vec_perm_vpermt2_vpshub2 (d))
25356 : return true;
25357 :
25358 : /* ??? Look for narrow permutations whose element orderings would
25359 : allow the promotion to a wider mode. */
25360 :
25361 : /* ??? Look for sequences of interleave or a wider permute that place
25362 : the data into the correct lanes for a half-vector shuffle like
25363 : pshuf[lh]w or vpermilps. */
25364 :
25365 : /* ??? Look for sequences of interleave that produce the desired results.
25366 : The combinatorics of punpck[lh] get pretty ugly... */
25367 :
25368 23839 : if (expand_vec_perm_even_odd (d))
25369 : return true;
25370 :
25371 : /* Generate four or five instructions. */
25372 15973 : if (expand_vec_perm_pslldq_psrldq_por (d, true))
25373 : return true;
25374 :
25375 : /* Even longer sequences. */
25376 15693 : if (expand_vec_perm_vpshufb4_vpermq2 (d))
25377 : return true;
25378 :
25379 : /* See if we can get the same permutation in different vector integer
25380 : mode. */
        /* On success ND holds the re-expressed permutation with its own
           target; outside of testing mode, that target is copied back to
           D's target through a lowpart in D's mode. */
25381 15639 : struct expand_vec_perm_d nd;
25382 15639 : if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
25383 : {
25384 0 : if (!d->testing_p)
25385 0 : emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
25386 0 : return true;
25387 : }
25388 :
25389 : /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
25390 15639 : if (expand_vec_perm2_vperm2f128_vblend (d))
25391 : return true;
25392 :
25393 : return false;
25394 : }
25395 :
25396 : /* If a permutation only uses one operand, make it clear. Returns true
25397 : if the permutation references both operands.
   On return D->one_operand_p is set, and for a one-operand permutation
   D->op0 and D->op1 are the same rtx and every selector entry is
   below D->nelt. The return value reflects the selector as passed
   in: it is still true when both operands were referenced but turned
   out to be identical and were folded into one. */
25398 :
25399 : static bool
25400 74080 : canonicalize_perm (struct expand_vec_perm_d *d)
25401 : {
25402 74080 : int i, which, nelt = d->nelt;
25403 :
        /* WHICH accumulates bit 0 for any index into the first operand
           (below NELT) and bit 1 for any index into the second. */
25404 445068 : for (i = which = 0; i < nelt; ++i)
25405 504329 : which |= (d->perm[i] < nelt ? 1 : 2);
25406 :
25407 74080 : d->one_operand_p = true;
25408 74080 : switch (which)
25409 : {
25410 0 : default:
25411 0 : gcc_unreachable();
25412 :
25413 55069 : case 3:
25414 55069 : if (!rtx_equal_p (d->op0, d->op1))
25415 : {
25416 55018 : d->one_operand_p = false;
25417 55018 : break;
25418 : }
25419 : /* The elements of PERM do not suggest that only the first operand
25420 : is used, but both operands are identical. Allow easier matching
25421 : of the permutation by folding the permutation into the single
25422 : input vector. */
25423 : /* FALLTHRU */
25424 :
25425 : case 2:
        /* Reduce every index into the range of the first operand (the
           mask assumes NELT is a power of two) and make the second
           operand the only input. */
25426 2913 : for (i = 0; i < nelt; ++i)
25427 2576 : d->perm[i] &= nelt - 1;
25428 337 : d->op0 = d->op1;
25429 337 : break;
25430 :
25431 18725 : case 1:
        /* Only the first operand is referenced; mirror it into op1. */
25432 18725 : d->op1 = d->op0;
25433 18725 : break;
25434 : }
25435 :
25436 74080 : return (which == 3);
25437 : }
25438 :
25439 : /* Implement TARGET_VECTORIZE_VEC_PERM_CONST.
   Expand, or merely test for, the constant permutation SEL of OP0
   and OP1 in mode VMODE. Returns false when VMODE differs from
   OP_MODE. A null TARGET selects testing mode (D.testing_p): the
   function then only reports whether the permutation can be
   expanded. Otherwise the expansion is emitted and true is returned
   on success. */
25440 :
25441 : bool
25442 863027 : ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
25443 : rtx target, rtx op0, rtx op1,
25444 : const vec_perm_indices &sel)
25445 : {
25446 863027 : if (vmode != op_mode)
25447 : return false;
25448 :
        /* PERM keeps an unmodified copy of the selector so it can be
           restored for the retry at the end of the function. */
25449 861204 : struct expand_vec_perm_d d;
25450 861204 : unsigned char perm[MAX_VECT_LEN];
25451 861204 : unsigned int i, nelt, which;
25452 861204 : bool two_args;
25453 :
25454 : /* For HF and BF mode vector, convert it to HI using subreg. */
25455 2582714 : if (GET_MODE_INNER (vmode) == HFmode || GET_MODE_INNER (vmode) == BFmode)
25456 : {
25457 924 : machine_mode orig_mode = vmode;
25458 1848 : vmode = mode_for_vector (HImode,
25459 924 : GET_MODE_NUNITS (vmode)).require ();
25460 924 : if (target)
25461 437 : target = lowpart_subreg (vmode, target, orig_mode);
25462 924 : if (op0)
25463 437 : op0 = lowpart_subreg (vmode, op0, orig_mode);
25464 924 : if (op1)
25465 437 : op1 = lowpart_subreg (vmode, op1, orig_mode);
25466 : }
25467 :
25468 861204 : d.target = target;
25469 861204 : d.op0 = op0;
25470 861204 : d.op1 = op1;
25471 :
25472 861204 : d.vmode = vmode;
25473 861204 : gcc_assert (VECTOR_MODE_P (d.vmode));
25474 861204 : d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
25475 861204 : d.testing_p = !target;
25476 :
25477 861204 : gcc_assert (sel.length () == nelt);
25478 861204 : gcc_checking_assert (sizeof (d.perm) == sizeof (perm));
25479 :
25480 : /* Given sufficient ISA support we can just return true here
25481 : for selected vector modes. */
        /* Each case first rejects the mode when the minimum ISA is
           missing. In testing mode some cases then answer true
           immediately. Modes not listed are rejected. */
25482 861204 : switch (d.vmode)
25483 : {
25484 1733 : case E_V16SFmode:
25485 1733 : case E_V16SImode:
25486 1733 : case E_V8DImode:
25487 1733 : case E_V8DFmode:
25488 1733 : if (!TARGET_AVX512F)
25489 : return false;
25490 : /* All implementable with a single vperm[it]2 insn. */
25491 1733 : if (d.testing_p)
25492 : return true;
25493 : break;
25494 323 : case E_V32HImode:
25495 323 : if (!TARGET_AVX512F)
25496 : return false;
25497 323 : if (d.testing_p && TARGET_AVX512BW)
25498 : /* All implementable with a single vperm[it]2 insn. */
25499 : return true;
25500 : break;
25501 747 : case E_V64QImode:
25502 747 : if (!TARGET_AVX512F)
25503 : return false;
25504 747 : if (d.testing_p && TARGET_AVX512BW)
25505 : /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
25506 : return true;
25507 : break;
25508 11325 : case E_V8SImode:
25509 11325 : case E_V8SFmode:
25510 11325 : case E_V4DFmode:
25511 11325 : case E_V4DImode:
25512 11325 : if (!TARGET_AVX)
25513 : return false;
25514 11325 : if (d.testing_p && TARGET_AVX512VL)
25515 : /* All implementable with a single vperm[it]2 insn. */
25516 : return true;
25517 : break;
25518 614 : case E_V16HImode:
25519 614 : if (!TARGET_SSE2)
25520 : return false;
25521 614 : if (d.testing_p && TARGET_AVX2)
25522 : /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
25523 : return true;
25524 : break;
25525 693 : case E_V32QImode:
25526 693 : if (!TARGET_SSE2)
25527 : return false;
25528 693 : if (d.testing_p && TARGET_AVX2)
25529 : /* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
25530 : return true;
25531 : break;
25532 38046 : case E_V8HImode:
25533 38046 : case E_V16QImode:
25534 38046 : if (!TARGET_SSE2)
25535 : return false;
25536 : /* Fall through. */
25537 235810 : case E_V4SImode:
25538 235810 : case E_V4SFmode:
25539 235810 : if (!TARGET_SSE)
25540 : return false;
25541 : /* All implementable with a single vpperm insn. */
25542 235810 : if (d.testing_p && TARGET_XOP)
25543 : return true;
25544 : /* All implementable with 2 pshufb + 1 ior. */
25545 235704 : if (d.testing_p && TARGET_SSSE3)
25546 : return true;
25547 : break;
25548 177050 : case E_V2SFmode:
25549 177050 : case E_V2SImode:
25550 177050 : case E_V4HImode:
25551 177050 : case E_V8QImode:
25552 177050 : if (!TARGET_MMX_WITH_SSE)
25553 : return false;
25554 : break;
25555 27268 : case E_V2HImode:
25556 27268 : if (!TARGET_SSE2)
25557 : return false;
25558 : /* All implementable with *punpckwd. */
25559 27268 : if (d.testing_p)
25560 : return true;
25561 : break;
25562 11848 : case E_V4QImode:
25563 11848 : if (!TARGET_SSE2)
25564 : return false;
25565 : break;
25566 391897 : case E_V2DImode:
25567 391897 : case E_V2DFmode:
25568 391897 : if (!TARGET_SSE)
25569 : return false;
25570 : /* All implementable with shufpd or unpck[lh]pd. */
25571 391897 : if (d.testing_p)
25572 : return true;
25573 : break;
25574 : default:
25575 : return false;
25576 : }
25577 :
        /* Copy the selector into both D.perm and the PERM backup, and
           record in WHICH whether the first operand (bit 0) and/or the
           second operand (bit 1) is referenced. */
25578 2339760 : for (i = which = 0; i < nelt; ++i)
25579 : {
25580 1897200 : unsigned char e = sel[i];
25581 1897200 : gcc_assert (e < 2 * nelt);
25582 1897200 : d.perm[i] = e;
25583 1897200 : perm[i] = e;
25584 2569615 : which |= (e < nelt ? 1 : 2);
25585 : }
25586 :
25587 442560 : if (d.testing_p)
25588 : {
25589 : /* For all elements from second vector, fold the elements to first. */
25590 369712 : if (which == 2)
25591 1375 : for (i = 0; i < nelt; ++i)
25592 1260 : d.perm[i] -= nelt;
25593 :
25594 : /* Check whether the mask can be applied to the vector type. */
25595 369712 : d.one_operand_p = (which != 3);
25596 :
25597 : /* Implementable with shufps, pshufd or pshuflw. */
25598 369712 : if (d.one_operand_p
25599 : && (d.vmode == V4SFmode || d.vmode == V2SFmode
25600 : || d.vmode == V4SImode || d.vmode == V2SImode
25601 : || d.vmode == V4HImode || d.vmode == V2HImode))
25602 : return true;
25603 :
25604 : /* Otherwise we have to go through the motions and see if we can
25605 : figure out how to generate the requested permutation. */
        /* Stand-in registers numbered just past LAST_VIRTUAL_REGISTER
           serve as target and operands. The trial expansion runs
           between start_sequence and end_sequence, presumably so that
           nothing reaches the real insn stream -- confirm against the
           sequence API. */
25606 255453 : d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
25607 255453 : d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
25608 255453 : if (!d.one_operand_p)
25609 241428 : d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);
25610 :
25611 255453 : start_sequence ();
25612 255453 : bool ret = ix86_expand_vec_perm_const_1 (&d);
25613 255453 : end_sequence ();
25614 :
25615 255453 : return ret;
25616 : }
25617 :
25618 72848 : two_args = canonicalize_perm (&d);
25619 :
25620 : /* If one of the operands is a zero vector, try to match pmovzx. */
25621 72848 : if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
25622 : {
        /* DZERO is a working copy arranged so that the zero vector is
           its second operand. When the zero is op0, the operands are
           swapped and every index has the NELT bit flipped to follow
           the swap. The non-zero operand is forced into a register in
           both D and DZERO. */
25623 583 : struct expand_vec_perm_d dzero = d;
25624 583 : if (d.op0 == CONST0_RTX (vmode))
25625 : {
25626 387 : d.op1 = dzero.op1 = force_reg (vmode, d.op1);
25627 387 : std::swap (dzero.op0, dzero.op1);
25628 7527 : for (i = 0; i < nelt; ++i)
25629 7140 : dzero.perm[i] ^= nelt;
25630 : }
25631 : else
25632 196 : d.op0 = dzero.op0 = force_reg (vmode, d.op0);
25633 :
25634 583 : if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
25635 583 : dzero.perm, nelt, dzero.testing_p))
25636 122 : return true;
25637 : }
25638 :
25639 : /* Force operands into registers. */
        /* When both operands are the same rtx, op1 reuses the register
           created for op0 so that they stay identical. */
25640 72726 : rtx nop0 = force_reg (vmode, d.op0);
25641 72726 : if (d.op0 == d.op1)
25642 18661 : d.op1 = nop0;
25643 72726 : d.op0 = nop0;
25644 72726 : d.op1 = force_reg (vmode, d.op1);
25645 :
25646 72726 : if (ix86_expand_vec_perm_const_1 (&d))
25647 : return true;
25648 :
25649 : /* If the selector says both arguments are needed, but the operands are the
25650 : same, the above tried to expand with one_operand_p and flattened selector.
25651 : If that didn't work, retry without one_operand_p; we succeeded with that
25652 : during testing. */
25653 22 : if (two_args && d.one_operand_p)
25654 : {
25655 22 : d.one_operand_p = false;
25656 22 : memcpy (d.perm, perm, sizeof (perm));
25657 22 : return ix86_expand_vec_perm_const_1 (&d);
25658 : }
25659 :
25660 : return false;
25661 : }
25662 :
/* Expand into TARG the two-operand permutation of OP0 and OP1 whose
   selector is I * 2 + ODD for each result element I. With ODD == 0
   this picks the even-numbered elements of the two inputs taken
   together, with ODD == 1 the odd-numbered ones. ODD is presumably
   always 0 or 1 -- confirm against callers. The vector mode is taken
   from TARG. */
25663 : void
25664 8148 : ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
25665 : {
25666 8148 : struct expand_vec_perm_d d;
25667 8148 : unsigned i, nelt;
25668 :
25669 8148 : d.target = targ;
25670 8148 : d.op0 = op0;
25671 8148 : d.op1 = op1;
25672 8148 : d.vmode = GET_MODE (targ);
25673 8148 : d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
25674 8148 : d.one_operand_p = false;
25675 8148 : d.testing_p = false;
25676 :
25677 77136 : for (i = 0; i < nelt; ++i)
25678 68988 : d.perm[i] = i * 2 + odd;
25679 :
25680 : /* We'll either be able to implement the permutation directly... */
25681 8148 : if (expand_vec_perm_1 (&d))
25682 3193 : return;
25683 :
25684 : /* ... or we use the special-case patterns. */
        /* The result of the fallback is not checked here. */
25685 4955 : expand_vec_perm_even_odd_1 (&d, odd);
25686 : }
25687 :
/* Expand into TARG the interleave of one half of OP0 with the same
   half of OP1: result elements 2*I and 2*I + 1 are element I + BASE
   of OP0 and of OP1 respectively, where BASE is NELT / 2 when HIGH_P
   is set and 0 otherwise. The vector mode is taken from TARG.
   Expansion failure trips a gcc_assert. */
25688 : static void
25689 922 : ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
25690 : {
25691 922 : struct expand_vec_perm_d d;
25692 922 : unsigned i, nelt, base;
25693 922 : bool ok;
25694 :
25695 922 : d.target = targ;
25696 922 : d.op0 = op0;
25697 922 : d.op1 = op1;
25698 922 : d.vmode = GET_MODE (targ);
25699 922 : d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
25700 922 : d.one_operand_p = false;
25701 922 : d.testing_p = false;
25702 :
        /* Indices of NELT and above address OP1, hence the + nelt for
           the odd result positions. */
25703 922 : base = high_p ? nelt / 2 : 0;
25704 3642 : for (i = 0; i < nelt / 2; ++i)
25705 : {
25706 2720 : d.perm[i * 2] = i + base;
25707 2720 : d.perm[i * 2 + 1] = i + base + nelt;
25708 : }
25709 :
25710 : /* Note that for AVX this isn't one instruction. */
25711 922 : ok = ix86_expand_vec_perm_const_1 (&d);
25712 922 : gcc_assert (ok);
25713 922 : }
25714 :
25715 : /* Expand a vector operation shift by constant for a V*QImode in terms of the
25716 : same operation on V*HImode. Return true if success.
   CODE must be ASHIFT, ASHIFTRT or LSHIFTRT (asserted). DEST and OP1
   are in the mode of DEST, which must be V16QImode, V32QImode or
   V64QImode. OP2 is the shift amount. Returns false, emitting
   nothing, when OP2 is not a CONST_INT or is greater than 7.
   NOTE(review): negative amounts are not rejected; presumably
   callers never pass them -- confirm. */
25717 : static bool
25718 386 : ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
25719 : rtx dest, rtx op1, rtx op2)
25720 : {
25721 386 : machine_mode qimode, himode;
25722 386 : HOST_WIDE_INT and_constant, xor_constant;
25723 386 : HOST_WIDE_INT shift_amount;
25724 386 : rtx vec_const_and, vec_const_xor;
25725 386 : rtx tmp, op1_subreg;
25726 386 : rtx (*gen_shift) (rtx, rtx, rtx);
25727 386 : rtx (*gen_and) (rtx, rtx, rtx);
25728 386 : rtx (*gen_xor) (rtx, rtx, rtx);
25729 386 : rtx (*gen_sub) (rtx, rtx, rtx);
25730 :
25731 : /* Only optimize shift by constant. */
25732 386 : if (!CONST_INT_P (op2))
25733 : return false;
25734 :
25735 386 : qimode = GET_MODE (dest);
25736 386 : shift_amount = INTVAL (op2);
25737 : /* Give up (return false) when the shift amount is 8 or more. */
25738 386 : if (shift_amount > 7)
25739 : return false;
25740 :
25741 386 : gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);
25742 :
25743 :
        /* An arithmetic right shift by 7 leaves only copies of the sign
           bit, so it is emitted as the comparison 0 > OP1 for the 16-
           and 32-byte modes, and as a byte-to-mask / mask-to-byte
           round trip for V64QImode. */
25744 386 : if (shift_amount == 7
25745 386 : && code == ASHIFTRT)
25746 : {
25747 40 : if (qimode == V16QImode
25748 8 : || qimode == V32QImode)
25749 : {
25750 39 : rtx zero = gen_reg_rtx (qimode);
25751 39 : emit_move_insn (zero, CONST0_RTX (qimode));
25752 39 : emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
25753 39 : }
25754 : else
25755 : {
25756 1 : gcc_assert (qimode == V64QImode);
25757 1 : rtx kmask = gen_reg_rtx (DImode);
25758 1 : emit_insn (gen_avx512bw_cvtb2maskv64qi (kmask, op1));
25759 1 : emit_insn (gen_avx512bw_cvtmask2bv64qi (dest, kmask));
25760 : }
25761 40 : return true;
25762 : }
25763 :
25764 : /* Record sign bit. */
        /* This is the bit position the original bit 7 of each byte
           occupies after a right shift by SHIFT_AMOUNT. It is only used
           for ASHIFTRT. */
25765 346 : xor_constant = 1 << (8 - shift_amount - 1);
25766 :
25767 : /* Zero upper/lower bits shift from left/right element. */
        /* The shift is done on 16-bit words, so bits cross from one
           byte into its neighbour. For ASHIFT the mask clears the low
           SHIFT_AMOUNT bits of every byte, otherwise the high
           SHIFT_AMOUNT bits. */
25768 346 : and_constant
25769 346 : = (code == ASHIFT ? 256 - (1 << shift_amount)
25770 317 : : (1 << (8 - shift_amount)) - 1);
25771 :
        /* Pick the word mode and the insn generators that match the
           byte mode. */
25772 346 : switch (qimode)
25773 : {
25774 333 : case V16QImode:
25775 333 : himode = V8HImode;
25776 281 : gen_shift =
25777 : ((code == ASHIFT)
25778 333 : ? gen_ashlv8hi3
25779 313 : : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
25780 : gen_and = gen_andv16qi3;
25781 : gen_xor = gen_xorv16qi3;
25782 : gen_sub = gen_subv16qi3;
25783 : break;
25784 6 : case V32QImode:
25785 6 : himode = V16HImode;
25786 1 : gen_shift =
25787 : ((code == ASHIFT)
25788 6 : ? gen_ashlv16hi3
25789 2 : : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
25790 : gen_and = gen_andv32qi3;
25791 : gen_xor = gen_xorv32qi3;
25792 : gen_sub = gen_subv32qi3;
25793 : break;
25794 7 : case V64QImode:
25795 7 : himode = V32HImode;
25796 1 : gen_shift =
25797 : ((code == ASHIFT)
25798 7 : ? gen_ashlv32hi3
25799 2 : : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
25800 : gen_and = gen_andv64qi3;
25801 : gen_xor = gen_xorv64qi3;
25802 : gen_sub = gen_subv64qi3;
25803 : break;
25804 0 : default:
25805 0 : gcc_unreachable ();
25806 : }
25807 :
25808 346 : tmp = gen_reg_rtx (himode);
25809 346 : vec_const_and = gen_reg_rtx (qimode);
25810 346 : op1_subreg = lowpart_subreg (himode, op1, qimode);
25811 :
25812 : /* For ASHIFT and LSHIFTRT, perform operation like
25813 : vpsllw/vpsrlw $shift_amount, %op1, %dest.
25814 : vpand %vec_const_and, %dest. */
        /* The shift-and-mask step below is emitted for ASHIFTRT as well
           (with the arithmetic word shift); ASHIFTRT then gets the
           extra fix-up further down. */
25815 346 : emit_insn (gen_shift (tmp, op1_subreg, op2));
25816 346 : emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
25817 346 : emit_move_insn (vec_const_and,
25818 : ix86_build_const_vector (qimode, true,
25819 346 : gen_int_mode (and_constant, QImode)));
25820 346 : emit_insn (gen_and (dest, dest, vec_const_and));
25821 :
25822 : /* For ASHIFTRT, perform extra operation like
25823 : vpxor %vec_const_xor, %dest, %dest
25824 : vpsubb %vec_const_xor, %dest, %dest */
        /* After masking, each byte holds the shifted value with zeros
           above the moved sign bit. (x ^ m) - m with m = XOR_CONSTANT
           sign-extends from that bit within the byte. */
25825 346 : if (code == ASHIFTRT)
25826 : {
25827 34 : vec_const_xor = gen_reg_rtx (qimode);
25828 34 : emit_move_insn (vec_const_xor,
25829 : ix86_build_const_vector (qimode, true,
25830 34 : gen_int_mode (xor_constant, QImode)));
25831 34 : emit_insn (gen_xor (dest, dest, vec_const_xor));
25832 34 : emit_insn (gen_sub (dest, dest, vec_const_xor));
25833 : }
25834 : return true;
25835 : }
25836 :
/* Expand operation CODE (MULT, ASHIFT, ASHIFTRT or LSHIFTRT; anything
   else is gcc_unreachable) on V4QImode or V8QImode operands OP1 and
   OP2 into DEST. The operands are viewed as V16QImode and widened to
   V8HImode; the operation is performed on words and the low byte of
   each word is moved back into DEST. OP2 may be a vector or, for
   shifts, a non-vector shift count; MULT requires a vector OP2
   (asserted). */
25837 : void
25838 1440 : ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
25839 : {
25840 1440 : machine_mode qimode = GET_MODE (dest);
25841 1440 : rtx qop1, qop2, hop1, hop2, qdest, hdest;
25842 1440 : bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
        /* Only an arithmetic right shift needs the inputs sign-extended
           when they are widened. */
25843 1440 : bool uns_p = code != ASHIFTRT;
25844 :
25845 1440 : switch (qimode)
25846 : {
25847 1440 : case E_V4QImode:
25848 1440 : case E_V8QImode:
25849 1440 : break;
25850 0 : default:
25851 0 : gcc_unreachable ();
25852 : }
25853 :
        /* View the partial vectors as full V16QImode registers. */
25854 1440 : qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);
25855 :
25856 1440 : if (op2vec)
25857 1310 : qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
25858 : else
25859 : qop2 = op2;
25860 :
25861 1440 : qdest = gen_reg_rtx (V16QImode);
25862 :
        /* Constant shifts without SSE4.1: use the shift-and-mask
           expansion on the V16QImode view and copy the low part to
           DEST. */
25863 1440 : if (CONST_INT_P (op2)
25864 118 : && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
25865 : /* With AVX512 it's cheaper to do vpmovsxbw/op/vpmovwb.
25866 : Even with SSE4.1 the alternative is better. */
25867 118 : && !TARGET_SSE4_1
25868 1494 : && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
25869 : {
25870 54 : emit_move_insn (dest, gen_lowpart (qimode, qdest));
25871 54 : return;
25872 : }
25873 :
        /* An arithmetic right shift by 7 is the comparison 0 > OP1,
           done directly in the partial mode. */
25874 1386 : if (CONST_INT_P (op2)
25875 64 : && code == ASHIFTRT
25876 14 : && INTVAL (op2) == 7)
25877 : {
25878 4 : rtx zero = gen_reg_rtx (qimode);
25879 4 : emit_move_insn (zero, CONST0_RTX (qimode));
25880 4 : emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
25881 4 : return;
25882 : }
25883 :
25884 1382 : switch (code)
25885 : {
25886 1297 : case MULT:
25887 1297 : gcc_assert (op2vec);
25888 1297 : if (!TARGET_SSE4_1)
25889 : {
25890 : /* Unpack data such that we've got a source byte in each low byte
25891 : of each word. We don't care what goes into the high byte of
25892 : each word. Rather than trying to get zero in there, most
25893 : convenient is to let it be a copy of the low byte. */
25894 244 : hop1 = copy_to_reg (qop1);
25895 244 : hop2 = copy_to_reg (qop2);
25896 244 : emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
25897 244 : emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
25898 244 : break;
25899 : }
25900 : /* FALLTHRU */
25901 1138 : case ASHIFT:
25902 1138 : case ASHIFTRT:
25903 1138 : case LSHIFTRT:
        /* Widen the low bytes to words, zero- or sign-extending
           according to UNS_P. A non-vector shift count is used as is. */
25904 1138 : hop1 = gen_reg_rtx (V8HImode);
25905 1138 : ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
25906 : /* mult/vashr/vlshr/vashl */
25907 1138 : if (op2vec)
25908 : {
25909 1066 : hop2 = gen_reg_rtx (V8HImode);
25910 1066 : ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
25911 : }
25912 : else
25913 : hop2 = qop2;
25914 :
25915 : break;
25916 0 : default:
25917 0 : gcc_unreachable ();
25918 : }
25919 :
        /* Shifts by a vector count are emitted as an explicit SET of
           the binary rtx; everything else goes through
           expand_simple_binop. */
25920 1382 : if (code != MULT && op2vec)
25921 : {
25922 : /* Expand vashr/vlshr/vashl. */
25923 13 : hdest = gen_reg_rtx (V8HImode);
25924 13 : emit_insn (gen_rtx_SET (hdest,
25925 : simplify_gen_binary (code, V8HImode,
25926 : hop1, hop2)));
25927 : }
25928 : else
25929 : /* Expand mult/ashr/lshr/ashl. */
25930 1369 : hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
25931 : NULL_RTX, 1, OPTAB_DIRECT);
25932 :
        /* Narrow the word result back to bytes. With AVX512BW and
           AVX512VL a truncation insn writes a V8QImode register, which
           is DEST itself when DEST is V8QImode. */
25933 1382 : if (TARGET_AVX512BW && TARGET_AVX512VL)
25934 : {
25935 57 : if (qimode == V8QImode)
25936 : qdest = dest;
25937 : else
25938 10 : qdest = gen_reg_rtx (V8QImode);
25939 :
25940 57 : emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
25941 : }
25942 : else
25943 : {
25944 1325 : struct expand_vec_perm_d d;
25945 1325 : rtx qres = gen_lowpart (V16QImode, hdest);
25946 1325 : bool ok;
25947 1325 : int i;
25948 :
25949 : /* Merge the data back into the right place. */
        /* Both operands are QRES and the selector takes every even
           byte, i.e. the low byte of each word; indices of 16 and above
           address the second operand, which is the same register. */
25950 1325 : d.target = qdest;
25951 1325 : d.op0 = d.op1 = qres;
25952 1325 : d.vmode = V16QImode;
25953 1325 : d.nelt = 16;
25954 1325 : d.one_operand_p = TARGET_SSSE3;
25955 1325 : d.testing_p = false;
25956 :
25957 22525 : for (i = 0; i < d.nelt; ++i)
25958 21200 : d.perm[i] = i * 2;
25959 :
25960 1325 : ok = ix86_expand_vec_perm_const_1 (&d);
25961 1325 : gcc_assert (ok);
25962 : }
25963 :
        /* Copy the low part of the temporary to DEST unless the result
           was produced in DEST directly. */
25964 1382 : if (qdest != dest)
25965 1335 : emit_move_insn (dest, gen_lowpart (qimode, qdest));
25966 : }
25967 :
25968 : /* Emit instruction in 2x wider mode. For example, optimize
25969 : vector MUL generation like
25970 :
25971 : vpmovzxbw ymm2, xmm0
25972 : vpmovzxbw ymm3, xmm1
25973 : vpmullw ymm4, ymm2, ymm3
25974 : vpmovwb xmm0, ymm4
25975 :
25976 : it would take fewer instructions than ix86_expand_vecop_qihi.
25977 : Return true if success.
   CODE is applied to OP1 and OP2 and the result is written to DEST,
   whose mode selects the expansion. Returns false, emitting nothing,
   when the target checks at the top of the function fail; past them
   only V16QImode and V32QImode are handled. */
25978 :
25979 : static bool
25980 1155 : ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
25981 : {
25982 1155 : machine_mode himode, qimode = GET_MODE (dest);
25983 1155 : machine_mode wqimode;
25984 1155 : rtx qop1, qop2, hop1, hop2, hdest;
25985 1155 : rtx (*gen_truncate)(rtx, rtx) = NULL;
25986 1155 : bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
        /* Only an arithmetic right shift needs sign-extending
           widening. */
25987 1155 : bool uns_p = code != ASHIFTRT;
25988 :
25989 : /* Without VPMOVWB (provided by AVX512BW ISA), the expansion uses the
25990 : generic permutation to merge the data back into the right place. This
25991 : permutation results in VPERMQ, which is slow, so better fall back to
25992 : ix86_expand_vecop_qihi. */
25993 1155 : if (!TARGET_AVX512BW
25994 327 : || (qimode == V16QImode && !TARGET_AVX512VL)
25995 : /* There are no V64HImode instructions. */
25996 327 : || qimode == V64QImode)
25997 : return false;
25998 :
25999 : /* Do not generate ymm/zmm instructions when
26000 : target prefers 128/256 bit vector width. */
26001 317 : if ((qimode == V16QImode && TARGET_PREFER_AVX128)
26002 317 : || (qimode == V32QImode && TARGET_PREFER_AVX256))
26003 : return false;
26004 :
        /* Pick the word mode with the same element count and the
           matching truncation generator. */
26005 312 : switch (qimode)
26006 : {
26007 : case E_V16QImode:
26008 : himode = V16HImode;
26009 : gen_truncate = gen_truncv16hiv16qi2;
26010 : break;
26011 57 : case E_V32QImode:
26012 57 : himode = V32HImode;
26013 57 : gen_truncate = gen_truncv32hiv32qi2;
26014 57 : break;
26015 0 : default:
26016 0 : gcc_unreachable ();
26017 : }
26018 :
        /* View the byte operands in the twice-as-wide byte mode so that
           they can be fed to the unpack helper. A non-vector OP2 is
           used unchanged. */
26019 312 : wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
26020 312 : qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);
26021 :
26022 312 : if (op2vec)
26023 312 : qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
26024 : else
26025 : qop2 = op2;
26026 :
        /* Widen the low bytes to words, extending per UNS_P. */
26027 312 : hop1 = gen_reg_rtx (himode);
26028 312 : ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
26029 :
26030 312 : if (op2vec)
26031 : {
26032 312 : hop2 = gen_reg_rtx (himode);
26033 312 : ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
26034 : }
26035 : else
26036 : hop2 = qop2;
26037 :
        /* Shifts by a vector count are emitted as an explicit SET of
           the binary rtx; everything else goes through
           expand_simple_binop. */
26038 312 : if (code != MULT && op2vec)
26039 : {
26040 : /* Expand vashr/vlshr/vashl. */
26041 14 : hdest = gen_reg_rtx (himode);
26042 14 : emit_insn (gen_rtx_SET (hdest,
26043 : simplify_gen_binary (code, himode,
26044 : hop1, hop2)));
26045 : }
26046 : else
26047 : /* Expand mult/ashr/lshr/ashl. */
26048 298 : hdest = expand_simple_binop (himode, code, hop1, hop2,
26049 : NULL_RTX, 1, OPTAB_DIRECT);
26050 :
        /* Truncate each word back to a byte directly into DEST. */
26051 312 : emit_insn (gen_truncate (dest, hdest));
26052 312 : return true;
26053 : }
26054 :
26055 : /* Expand a vector operation CODE for a V*QImode in terms of the
26056 : same operation on V*HImode. */
26057 :
26058 : void
26059 1487 : ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
26060 : {
26061 1487 : machine_mode qimode = GET_MODE (dest);
26062 1487 : machine_mode himode;
26063 1487 : rtx (*gen_il) (rtx, rtx, rtx);
26064 1487 : rtx (*gen_ih) (rtx, rtx, rtx);
26065 1487 : rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
26066 1487 : bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
26067 1487 : struct expand_vec_perm_d d;
26068 1487 : bool full_interleave = true;
26069 1487 : bool uns_p = code != ASHIFTRT;
26070 1487 : bool ok;
26071 1487 : int i;
26072 :
26073 1487 : if (CONST_INT_P (op2)
26074 332 : && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
26075 1819 : && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
26076 644 : return;
26077 :
26078 1155 : if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
26079 : return;
26080 :
26081 843 : switch (qimode)
26082 : {
26083 : case E_V16QImode:
26084 : himode = V8HImode;
26085 : break;
26086 44 : case E_V32QImode:
26087 44 : himode = V16HImode;
26088 44 : break;
26089 10 : case E_V64QImode:
26090 10 : himode = V32HImode;
26091 10 : break;
26092 0 : default:
26093 0 : gcc_unreachable ();
26094 : }
26095 :
26096 843 : switch (code)
26097 : {
26098 816 : case MULT:
26099 816 : gcc_assert (op2vec);
26100 : /* Unpack data such that we've got a source byte in each low byte of
26101 : each word. We don't care what goes into the high byte of each word.
26102 : Rather than trying to get zero in there, most convenient is to let
26103 : it be a copy of the low byte. */
26104 816 : switch (qimode)
26105 : {
26106 : case E_V16QImode:
26107 : gen_il = gen_vec_interleave_lowv16qi;
26108 : gen_ih = gen_vec_interleave_highv16qi;
26109 : break;
26110 44 : case E_V32QImode:
26111 44 : gen_il = gen_avx2_interleave_lowv32qi;
26112 44 : gen_ih = gen_avx2_interleave_highv32qi;
26113 44 : full_interleave = false;
26114 44 : break;
26115 8 : case E_V64QImode:
26116 8 : gen_il = gen_avx512bw_interleave_lowv64qi;
26117 8 : gen_ih = gen_avx512bw_interleave_highv64qi;
26118 8 : full_interleave = false;
26119 8 : break;
26120 0 : default:
26121 0 : gcc_unreachable ();
26122 : }
26123 :
26124 816 : op2_l = gen_reg_rtx (qimode);
26125 816 : op2_h = gen_reg_rtx (qimode);
26126 816 : emit_insn (gen_il (op2_l, op2, op2));
26127 816 : emit_insn (gen_ih (op2_h, op2, op2));
26128 :
26129 816 : op1_l = gen_reg_rtx (qimode);
26130 816 : op1_h = gen_reg_rtx (qimode);
26131 816 : emit_insn (gen_il (op1_l, op1, op1));
26132 816 : emit_insn (gen_ih (op1_h, op1, op1));
26133 816 : break;
26134 :
26135 27 : case ASHIFT:
26136 27 : case ASHIFTRT:
26137 27 : case LSHIFTRT:
26138 27 : op1_l = gen_reg_rtx (himode);
26139 27 : op1_h = gen_reg_rtx (himode);
26140 27 : ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
26141 27 : ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
26142 : /* vashr/vlshr/vashl */
26143 27 : if (op2vec)
26144 : {
26145 2 : rtx tmp = force_reg (qimode, op2);
26146 2 : op2_l = gen_reg_rtx (himode);
26147 2 : op2_h = gen_reg_rtx (himode);
26148 2 : ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
26149 2 : ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
26150 : }
26151 : else
26152 : op2_l = op2_h = op2;
26153 :
26154 : break;
26155 0 : default:
26156 0 : gcc_unreachable ();
26157 : }
26158 :
26159 843 : if (code != MULT && op2vec)
26160 : {
26161 : /* Expand vashr/vlshr/vashl. */
26162 2 : res_l = gen_reg_rtx (himode);
26163 2 : res_h = gen_reg_rtx (himode);
26164 2 : emit_insn (gen_rtx_SET (res_l,
26165 : simplify_gen_binary (code, himode,
26166 : op1_l, op2_l)));
26167 2 : emit_insn (gen_rtx_SET (res_h,
26168 : simplify_gen_binary (code, himode,
26169 : op1_h, op2_h)));
26170 : }
26171 : else
26172 : {
26173 : /* Expand mult/ashr/lshr/ashl. */
26174 841 : res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
26175 : 1, OPTAB_DIRECT);
26176 841 : res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
26177 : 1, OPTAB_DIRECT);
26178 : }
26179 :
26180 843 : gcc_assert (res_l && res_h);
26181 :
26182 : /* Merge the data back into the right place. */
26183 843 : d.target = dest;
26184 843 : d.op0 = gen_lowpart (qimode, res_l);
26185 843 : d.op1 = gen_lowpart (qimode, res_h);
26186 843 : d.vmode = qimode;
26187 843 : d.nelt = GET_MODE_NUNITS (qimode);
26188 843 : d.one_operand_p = false;
26189 843 : d.testing_p = false;
26190 :
26191 843 : if (full_interleave)
26192 : {
26193 : /* We used the full interleave, the desired
26194 : results are in the even elements. */
26195 13543 : for (i = 0; i < d.nelt; ++i)
26196 12752 : d.perm[i] = i * 2;
26197 : }
26198 : else
26199 : {
26200 : /* For AVX, the interleave used above was not cross-lane. So the
26201 : extraction is evens but with the second and third quarter swapped.
26202 : Happily, that is even one insn shorter than even extraction.
26203 : For AVX512BW we have 4 lanes. We extract evens from within a lane,
26204 : always first from the first and then from the second source operand,
26205 : the index bits above the low 4 bits remains the same.
26206 : Thus, for d.nelt == 32 we want permutation
26207 : 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
26208 : and for d.nelt == 64 we want permutation
26209 : 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
26210 : 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126. */
26211 1972 : for (i = 0; i < d.nelt; ++i)
26212 2880 : d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
26213 : }
26214 :
26215 843 : ok = ix86_expand_vec_perm_const_1 (&d);
26216 843 : gcc_assert (ok);
26217 : }
26218 :
26219 : /* Helper function of ix86_expand_mul_widen_evenodd. Return true
26220 : if op is CONST_VECTOR with all odd elements equal to their
26221 : preceding element. */
26222 :
26223 : static bool
26224 8744 : const_vector_equal_evenodd_p (rtx op)
26225 : {
26226 8744 : machine_mode mode = GET_MODE (op);
26227 8744 : int i, nunits = GET_MODE_NUNITS (mode);
26228 8744 : if (!CONST_VECTOR_P (op)
26229 8744 : || nunits != CONST_VECTOR_NUNITS (op))
26230 : return false;
26231 3560 : for (i = 0; i < nunits; i += 2)
26232 2869 : if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
26233 : return false;
26234 : return true;
26235 : }
26236 :
26237 : void
26238 8856 : ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
26239 : bool uns_p, bool odd_p)
26240 : {
26241 8856 : machine_mode mode = GET_MODE (op1);
26242 8856 : machine_mode wmode = GET_MODE (dest);
26243 8856 : rtx x;
26244 8856 : rtx orig_op1 = op1, orig_op2 = op2;
26245 :
26246 8856 : if (!nonimmediate_operand (op1, mode))
26247 0 : op1 = force_reg (mode, op1);
26248 8856 : if (!nonimmediate_operand (op2, mode))
26249 3316 : op2 = force_reg (mode, op2);
26250 :
26251 : /* We only play even/odd games with vectors of SImode. */
26252 8856 : gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);
26253 :
26254 : /* If we're looking for the odd results, shift those members down to
26255 : the even slots. For some cpus this is faster than a PSHUFD. */
26256 8856 : if (odd_p)
26257 : {
26258 : /* For XOP use vpmacsdqh, but only for smult, as it is only
26259 : signed. */
26260 4390 : if (TARGET_XOP && mode == V4SImode && !uns_p)
26261 : {
26262 18 : x = force_reg (wmode, CONST0_RTX (wmode));
26263 18 : emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
26264 18 : return;
26265 : }
26266 :
26267 8744 : x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
26268 4372 : if (!const_vector_equal_evenodd_p (orig_op1))
26269 4372 : op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
26270 : x, NULL, 1, OPTAB_DIRECT);
26271 4372 : if (!const_vector_equal_evenodd_p (orig_op2))
26272 3681 : op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
26273 : x, NULL, 1, OPTAB_DIRECT);
26274 4372 : op1 = gen_lowpart (mode, op1);
26275 4372 : op2 = gen_lowpart (mode, op2);
26276 : }
26277 :
26278 8838 : if (mode == V16SImode)
26279 : {
26280 6 : if (uns_p)
26281 0 : x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
26282 : else
26283 6 : x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
26284 : }
26285 8832 : else if (mode == V8SImode)
26286 : {
26287 139 : if (uns_p)
26288 59 : x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
26289 : else
26290 80 : x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
26291 : }
26292 8693 : else if (uns_p)
26293 7638 : x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
26294 1055 : else if (TARGET_SSE4_1)
26295 369 : x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
26296 : else
26297 : {
26298 686 : rtx s1, s2, t0, t1, t2;
26299 :
26300 : /* The easiest way to implement this without PMULDQ is to go through
26301 : the motions as if we are performing a full 64-bit multiply. With
26302 : the exception that we need to do less shuffling of the elements. */
26303 :
26304 : /* Compute the sign-extension, aka highparts, of the two operands. */
26305 686 : s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
26306 : op1, pc_rtx, pc_rtx);
26307 686 : s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
26308 : op2, pc_rtx, pc_rtx);
26309 :
26310 : /* Multiply LO(A) * HI(B), and vice-versa. */
26311 686 : t1 = gen_reg_rtx (wmode);
26312 686 : t2 = gen_reg_rtx (wmode);
26313 686 : emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
26314 686 : emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));
26315 :
26316 : /* Multiply LO(A) * LO(B). */
26317 686 : t0 = gen_reg_rtx (wmode);
26318 686 : emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));
26319 :
26320 : /* Combine and shift the highparts into place. */
26321 686 : t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
26322 686 : t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
26323 : 1, OPTAB_DIRECT);
26324 :
26325 : /* Combine high and low parts. */
26326 686 : force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
26327 686 : return;
26328 : }
26329 8152 : emit_insn (x);
26330 : }
26331 :
26332 : void
26333 975 : ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
26334 : bool uns_p, bool high_p)
26335 : {
26336 975 : machine_mode wmode = GET_MODE (dest);
26337 975 : machine_mode mode = GET_MODE (op1);
26338 975 : rtx t1, t2, t3, t4, mask;
26339 :
26340 975 : switch (mode)
26341 : {
26342 297 : case E_V4SImode:
26343 297 : t1 = gen_reg_rtx (mode);
26344 297 : t2 = gen_reg_rtx (mode);
26345 297 : if (TARGET_XOP && !uns_p)
26346 : {
26347 : /* With XOP, we have pmacsdqh, aka mul_widen_odd. In this case,
26348 : shuffle the elements once so that all elements are in the right
26349 : place for immediate use: { A C B D }. */
26350 33 : emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
26351 : const1_rtx, GEN_INT (3)));
26352 33 : emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
26353 : const1_rtx, GEN_INT (3)));
26354 : }
26355 : else
26356 : {
26357 : /* Put the elements into place for the multiply. */
26358 264 : ix86_expand_vec_interleave (t1, op1, op1, high_p);
26359 264 : ix86_expand_vec_interleave (t2, op2, op2, high_p);
26360 264 : high_p = false;
26361 : }
26362 297 : ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
26363 297 : break;
26364 :
26365 70 : case E_V8SImode:
26366 : /* Shuffle the elements between the lanes. After this we
26367 : have { A B E F | C D G H } for each operand. */
26368 70 : t1 = gen_reg_rtx (V4DImode);
26369 70 : t2 = gen_reg_rtx (V4DImode);
26370 70 : emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
26371 : const0_rtx, const2_rtx,
26372 : const1_rtx, GEN_INT (3)));
26373 70 : emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
26374 : const0_rtx, const2_rtx,
26375 : const1_rtx, GEN_INT (3)));
26376 :
26377 : /* Shuffle the elements within the lanes. After this we
26378 : have { A A B B | C C D D } or { E E F F | G G H H }. */
26379 70 : t3 = gen_reg_rtx (V8SImode);
26380 70 : t4 = gen_reg_rtx (V8SImode);
26381 105 : mask = GEN_INT (high_p
26382 : ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
26383 : : 0 + (0 << 2) + (1 << 4) + (1 << 6));
26384 70 : emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
26385 70 : emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));
26386 :
26387 70 : ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
26388 70 : break;
26389 :
26390 394 : case E_V8HImode:
26391 394 : case E_V16HImode:
26392 394 : t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
26393 : uns_p, OPTAB_DIRECT);
26394 626 : t2 = expand_binop (mode,
26395 : uns_p ? umul_highpart_optab : smul_highpart_optab,
26396 : op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
26397 394 : gcc_assert (t1 && t2);
26398 :
26399 394 : t3 = gen_reg_rtx (mode);
26400 394 : ix86_expand_vec_interleave (t3, t1, t2, high_p);
26401 394 : emit_move_insn (dest, gen_lowpart (wmode, t3));
26402 394 : break;
26403 :
26404 214 : case E_V16QImode:
26405 214 : case E_V32QImode:
26406 214 : case E_V32HImode:
26407 214 : case E_V16SImode:
26408 214 : case E_V64QImode:
26409 214 : t1 = gen_reg_rtx (wmode);
26410 214 : t2 = gen_reg_rtx (wmode);
26411 214 : ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
26412 214 : ix86_expand_sse_unpack (t2, op2, uns_p, high_p);
26413 :
26414 214 : emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
26415 214 : break;
26416 :
26417 0 : default:
26418 0 : gcc_unreachable ();
26419 : }
26420 975 : }
26421 :
26422 : void
26423 3651 : ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
26424 : {
26425 3651 : rtx res_1, res_2, res_3, res_4;
26426 :
26427 3651 : res_1 = gen_reg_rtx (V4SImode);
26428 3651 : res_2 = gen_reg_rtx (V4SImode);
26429 3651 : res_3 = gen_reg_rtx (V2DImode);
26430 3651 : res_4 = gen_reg_rtx (V2DImode);
26431 3651 : ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
26432 3651 : ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);
26433 :
26434 : /* Move the results in element 2 down to element 1; we don't care
26435 : what goes in elements 2 and 3. Then we can merge the parts
26436 : back together with an interleave.
26437 :
26438 : Note that two other sequences were tried:
26439 : (1) Use interleaves at the start instead of psrldq, which allows
26440 : us to use a single shufps to merge things back at the end.
26441 : (2) Use shufps here to combine the two vectors, then pshufd to
26442 : put the elements in the correct order.
26443 : In both cases the cost of the reformatting stall was too high
26444 : and the overall sequence slower. */
26445 :
26446 3651 : emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
26447 : const0_rtx, const2_rtx,
26448 : const0_rtx, const0_rtx));
26449 3651 : emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
26450 : const0_rtx, const2_rtx,
26451 : const0_rtx, const0_rtx));
26452 3651 : res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));
26453 :
26454 3651 : set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
26455 3651 : }
26456 :
26457 : void
26458 527 : ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
26459 : {
26460 527 : machine_mode mode = GET_MODE (op0);
26461 527 : rtx t1, t2, t3, t4, t5, t6;
26462 :
26463 527 : if (TARGET_AVX512DQ && mode == V8DImode)
26464 32 : emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
26465 495 : else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
26466 32 : emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
26467 463 : else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
26468 36 : emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
26469 427 : else if (TARGET_XOP && mode == V2DImode)
26470 : {
26471 : /* op1: A,B,C,D, op2: E,F,G,H */
26472 2 : op1 = gen_lowpart (V4SImode, op1);
26473 2 : op2 = gen_lowpart (V4SImode, op2);
26474 :
26475 2 : t1 = gen_reg_rtx (V4SImode);
26476 2 : t2 = gen_reg_rtx (V4SImode);
26477 2 : t3 = gen_reg_rtx (V2DImode);
26478 2 : t4 = gen_reg_rtx (V2DImode);
26479 :
26480 : /* t1: B,A,D,C */
26481 2 : emit_insn (gen_sse2_pshufd_1 (t1, op1,
26482 : GEN_INT (1),
26483 : GEN_INT (0),
26484 : GEN_INT (3),
26485 : GEN_INT (2)));
26486 :
26487 : /* t2: (B*E),(A*F),(D*G),(C*H) */
26488 2 : emit_insn (gen_mulv4si3 (t2, t1, op2));
26489 :
26490 : /* t3: (B*E)+(A*F), (D*G)+(C*H) */
26491 2 : emit_insn (gen_xop_phadddq (t3, t2));
26492 :
26493 : /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
26494 2 : emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));
26495 :
26496 : /* Multiply lower parts and add all */
26497 2 : t5 = gen_reg_rtx (V2DImode);
26498 2 : emit_insn (gen_vec_widen_umult_even_v4si (t5,
26499 2 : gen_lowpart (V4SImode, op1),
26500 2 : gen_lowpart (V4SImode, op2)));
26501 2 : force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
26502 : }
26503 : else
26504 : {
26505 425 : machine_mode nmode;
26506 425 : rtx (*umul) (rtx, rtx, rtx);
26507 :
26508 425 : if (mode == V2DImode)
26509 : {
26510 : umul = gen_vec_widen_umult_even_v4si;
26511 : nmode = V4SImode;
26512 : }
26513 295 : else if (mode == V4DImode)
26514 : {
26515 : umul = gen_vec_widen_umult_even_v8si;
26516 : nmode = V8SImode;
26517 : }
26518 116 : else if (mode == V8DImode)
26519 : {
26520 : umul = gen_vec_widen_umult_even_v16si;
26521 : nmode = V16SImode;
26522 : }
26523 : else
26524 0 : gcc_unreachable ();
26525 :
26526 :
26527 : /* Multiply low parts. */
26528 425 : t1 = gen_reg_rtx (mode);
26529 425 : emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));
26530 :
26531 : /* Shift input vectors right 32 bits so we can multiply high parts. */
26532 425 : t6 = GEN_INT (32);
26533 425 : t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
26534 425 : t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);
26535 :
26536 : /* Multiply high parts by low parts. */
26537 425 : t4 = gen_reg_rtx (mode);
26538 425 : t5 = gen_reg_rtx (mode);
26539 425 : emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
26540 425 : emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));
26541 :
26542 : /* Combine and shift the highparts back. */
26543 425 : t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
26544 425 : t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);
26545 :
26546 : /* Combine high and low parts. */
26547 425 : force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
26548 : }
26549 :
26550 527 : set_unique_reg_note (get_last_insn (), REG_EQUAL,
26551 : gen_rtx_MULT (mode, op1, op2));
26552 527 : }
26553 :
26554 : /* Return true if control transfer instruction INSN
26555 : should be encoded with notrack prefix. */
26556 :
26557 : bool
26558 14849053 : ix86_notrack_prefixed_insn_p (rtx_insn *insn)
26559 : {
26560 14849053 : if (!insn || !((flag_cf_protection & CF_BRANCH)))
26561 : return false;
26562 :
26563 3918377 : if (CALL_P (insn))
26564 : {
26565 1387738 : rtx call = get_call_rtx_from (insn);
26566 1387738 : gcc_assert (call != NULL_RTX);
26567 1387738 : rtx addr = XEXP (call, 0);
26568 :
26569 : /* Do not emit 'notrack' if it's not an indirect call. */
26570 1387738 : if (MEM_P (addr)
26571 1387738 : && SYMBOL_REF_P (XEXP (addr, 0)))
26572 : return false;
26573 : else
26574 64646 : return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
26575 : }
26576 :
26577 2530639 : if (JUMP_P (insn) && !flag_cet_switch)
26578 : {
26579 2517084 : rtx target = JUMP_LABEL (insn);
26580 2517084 : if (target == NULL_RTX || ANY_RETURN_P (target))
26581 : return false;
26582 :
26583 : /* Check the jump is a switch table. */
26584 2517046 : rtx_insn *label = as_a<rtx_insn *> (target);
26585 2517046 : rtx_insn *table = next_insn (label);
26586 2517046 : if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
26587 : return false;
26588 : else
26589 : return true;
26590 : }
26591 : return false;
26592 : }
26593 :
26594 : /* Calculate integer abs() using only SSE2 instructions. */
26595 :
26596 : void
26597 552 : ix86_expand_sse2_abs (rtx target, rtx input)
26598 : {
26599 552 : machine_mode mode = GET_MODE (target);
26600 552 : rtx tmp0, tmp1, x;
26601 :
26602 552 : switch (mode)
26603 : {
26604 33 : case E_V2DImode:
26605 33 : case E_V4DImode:
26606 : /* For 64-bit signed integer X, with SSE4.2 use
26607 : pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
26608 : Otherwise handle it similarly to V4SImode, except use 64 as W instead of
26609 : 32 and use logical instead of arithmetic right shift (which is
26610 : unimplemented) and subtract. */
26611 33 : if (TARGET_SSE4_2)
26612 : {
26613 9 : tmp0 = gen_reg_rtx (mode);
26614 9 : tmp1 = gen_reg_rtx (mode);
26615 9 : emit_move_insn (tmp1, CONST0_RTX (mode));
26616 9 : if (mode == E_V2DImode)
26617 6 : emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
26618 : else
26619 3 : emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
26620 : }
26621 : else
26622 : {
26623 48 : tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
26624 24 : GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
26625 : - 1), NULL, 0, OPTAB_DIRECT);
26626 24 : tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
26627 : }
26628 :
26629 33 : tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
26630 : NULL, 0, OPTAB_DIRECT);
26631 33 : x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
26632 : target, 0, OPTAB_DIRECT);
26633 33 : break;
26634 :
26635 61 : case E_V4SImode:
26636 : /* For 32-bit signed integer X, the best way to calculate the absolute
26637 : value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)). */
26638 61 : tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
26639 61 : GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
26640 : NULL, 0, OPTAB_DIRECT);
26641 61 : tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
26642 : NULL, 0, OPTAB_DIRECT);
26643 61 : x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
26644 : target, 0, OPTAB_DIRECT);
26645 61 : break;
26646 :
26647 91 : case E_V8HImode:
26648 : /* For 16-bit signed integer X, the best way to calculate the absolute
26649 : value of X is max (X, -X), as SSE2 provides the PMAXSW insn. */
26650 91 : tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
26651 :
26652 91 : x = expand_simple_binop (mode, SMAX, tmp0, input,
26653 : target, 0, OPTAB_DIRECT);
26654 91 : break;
26655 :
26656 367 : case E_V16QImode:
26657 : /* For 8-bit signed integer X, the best way to calculate the absolute
26658 : value of X is min ((unsigned char) X, (unsigned char) (-X)),
26659 : as SSE2 provides the PMINUB insn. */
26660 367 : tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);
26661 :
26662 367 : x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
26663 : target, 0, OPTAB_DIRECT);
26664 367 : break;
26665 :
26666 0 : default:
26667 0 : gcc_unreachable ();
26668 : }
26669 :
26670 552 : if (x != target)
26671 0 : emit_move_insn (target, x);
26672 552 : }
26673 :
26674 : /* Expand an extract from a vector register through pextr insn.
26675 : Return true if successful. */
26676 :
26677 : bool
26678 101780 : ix86_expand_pextr (rtx *operands)
26679 : {
26680 101780 : rtx dst = operands[0];
26681 101780 : rtx src = operands[1];
26682 :
26683 101780 : unsigned int size = INTVAL (operands[2]);
26684 101780 : unsigned int pos = INTVAL (operands[3]);
26685 :
26686 101780 : if (SUBREG_P (dst))
26687 : {
26688 : /* Reject non-lowpart subregs. */
26689 58612 : if (SUBREG_BYTE (dst) > 0)
26690 : return false;
26691 58483 : dst = SUBREG_REG (dst);
26692 : }
26693 :
26694 101651 : if (SUBREG_P (src))
26695 : {
26696 33907 : pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
26697 33907 : src = SUBREG_REG (src);
26698 : }
26699 :
26700 101651 : switch (GET_MODE (src))
26701 : {
26702 0 : case E_V16QImode:
26703 0 : case E_V8HImode:
26704 0 : case E_V4SImode:
26705 0 : case E_V2DImode:
26706 0 : case E_V1TImode:
26707 0 : {
26708 0 : machine_mode srcmode, dstmode;
26709 0 : rtx d, pat;
26710 :
26711 0 : if (!int_mode_for_size (size, 0).exists (&dstmode))
26712 0 : return false;
26713 :
26714 0 : switch (dstmode)
26715 : {
26716 0 : case E_QImode:
26717 0 : if (!TARGET_SSE4_1)
26718 : return false;
26719 : srcmode = V16QImode;
26720 : break;
26721 :
26722 0 : case E_HImode:
26723 0 : if (!TARGET_SSE2)
26724 : return false;
26725 : srcmode = V8HImode;
26726 : break;
26727 :
26728 0 : case E_SImode:
26729 0 : if (!TARGET_SSE4_1)
26730 : return false;
26731 : srcmode = V4SImode;
26732 : break;
26733 :
26734 0 : case E_DImode:
26735 0 : gcc_assert (TARGET_64BIT);
26736 0 : if (!TARGET_SSE4_1)
26737 : return false;
26738 : srcmode = V2DImode;
26739 : break;
26740 :
26741 : default:
26742 : return false;
26743 : }
26744 :
26745 : /* Reject extractions from misaligned positions. */
26746 0 : if (pos & (size-1))
26747 : return false;
26748 :
26749 0 : if (GET_MODE (dst) == dstmode)
26750 : d = dst;
26751 : else
26752 0 : d = gen_reg_rtx (dstmode);
26753 :
26754 : /* Construct insn pattern. */
26755 0 : pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
26756 0 : pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);
26757 :
26758 : /* Let the rtl optimizers know about the zero extension performed. */
26759 0 : if (dstmode == QImode || dstmode == HImode)
26760 : {
26761 0 : pat = gen_rtx_ZERO_EXTEND (SImode, pat);
26762 0 : d = gen_lowpart (SImode, d);
26763 : }
26764 :
26765 0 : emit_insn (gen_rtx_SET (d, pat));
26766 :
26767 0 : if (d != dst)
26768 0 : emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
26769 : return true;
26770 : }
26771 :
26772 : default:
26773 : return false;
26774 : }
26775 : }
26776 :
26777 : /* Expand an insert into a vector register through pinsr insn.
26778 : Return true if successful. */
26779 :
26780 : bool
26781 109797 : ix86_expand_pinsr (rtx *operands)
26782 : {
26783 109797 : rtx dst = operands[0];
26784 109797 : rtx src = operands[3];
26785 :
26786 109797 : unsigned int size = INTVAL (operands[1]);
26787 109797 : unsigned int pos = INTVAL (operands[2]);
26788 :
26789 109797 : if (SUBREG_P (dst))
26790 : {
26791 61699 : pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
26792 61699 : dst = SUBREG_REG (dst);
26793 : }
26794 :
26795 109797 : switch (GET_MODE (dst))
26796 : {
26797 20 : case E_V16QImode:
26798 20 : case E_V8HImode:
26799 20 : case E_V4SImode:
26800 20 : case E_V2DImode:
26801 20 : case E_V1TImode:
26802 20 : {
26803 20 : machine_mode srcmode, dstmode;
26804 20 : rtx (*pinsr)(rtx, rtx, rtx, rtx);
26805 20 : rtx d;
26806 :
26807 20 : if (!int_mode_for_size (size, 0).exists (&srcmode))
26808 0 : return false;
26809 :
26810 20 : switch (srcmode)
26811 : {
26812 1 : case E_QImode:
26813 1 : if (!TARGET_SSE4_1)
26814 : return false;
26815 : dstmode = V16QImode;
26816 : pinsr = gen_sse4_1_pinsrb;
26817 : break;
26818 :
26819 5 : case E_HImode:
26820 5 : if (!TARGET_SSE2)
26821 : return false;
26822 : dstmode = V8HImode;
26823 : pinsr = gen_sse2_pinsrw;
26824 : break;
26825 :
26826 14 : case E_SImode:
26827 14 : if (!TARGET_SSE4_1)
26828 : return false;
26829 : dstmode = V4SImode;
26830 : pinsr = gen_sse4_1_pinsrd;
26831 : break;
26832 :
26833 0 : case E_DImode:
26834 0 : gcc_assert (TARGET_64BIT);
26835 0 : if (!TARGET_SSE4_1)
26836 : return false;
26837 : dstmode = V2DImode;
26838 : pinsr = gen_sse4_1_pinsrq;
26839 : break;
26840 :
26841 : default:
26842 : return false;
26843 : }
26844 :
26845 : /* Reject insertions to misaligned positions. */
26846 7 : if (pos & (size-1))
26847 : return false;
26848 :
26849 7 : if (SUBREG_P (src))
26850 : {
26851 7 : unsigned int srcpos = SUBREG_BYTE (src);
26852 :
26853 7 : if (srcpos > 0)
26854 : {
26855 0 : rtx extr_ops[4];
26856 :
26857 0 : extr_ops[0] = gen_reg_rtx (srcmode);
26858 0 : extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
26859 0 : extr_ops[2] = GEN_INT (size);
26860 0 : extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);
26861 :
26862 0 : if (!ix86_expand_pextr (extr_ops))
26863 0 : return false;
26864 :
26865 0 : src = extr_ops[0];
26866 : }
26867 : else
26868 7 : src = gen_lowpart (srcmode, SUBREG_REG (src));
26869 : }
26870 :
26871 7 : if (GET_MODE (dst) == dstmode)
26872 : d = dst;
26873 : else
26874 7 : d = gen_reg_rtx (dstmode);
26875 :
26876 7 : emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
26877 7 : gen_lowpart (srcmode, src),
26878 7 : GEN_INT (1 << (pos / size))));
26879 7 : if (d != dst)
26880 7 : emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
26881 : return true;
26882 : }
26883 :
26884 : default:
26885 : return false;
26886 : }
26887 : }
26888 :
26889 : /* All CPUs prefer to avoid cross-lane operations so perform reductions
26890 : upper against lower halves up to SSE reg size. */
26891 :
26892 : machine_mode
26893 1992 : ix86_split_reduction (machine_mode mode)
26894 : {
26895 : /* Reduce lowpart against highpart until we reach SSE reg width to
26896 : avoid cross-lane operations. */
26897 1992 : switch (mode)
26898 : {
26899 : case E_V8DImode:
26900 : case E_V4DImode:
26901 : return V2DImode;
26902 9 : case E_V16SImode:
26903 9 : case E_V8SImode:
26904 9 : return V4SImode;
26905 8 : case E_V32HImode:
26906 8 : case E_V16HImode:
26907 8 : return V8HImode;
26908 4 : case E_V64QImode:
26909 4 : case E_V32QImode:
26910 4 : return V16QImode;
26911 5 : case E_V16SFmode:
26912 5 : case E_V8SFmode:
26913 5 : return V4SFmode;
26914 16 : case E_V8DFmode:
26915 16 : case E_V4DFmode:
26916 16 : return V2DFmode;
26917 1945 : default:
26918 1945 : return mode;
26919 : }
26920 : }
26921 :
26922 : /* Generate call to __divmoddi4. */
26923 :
26924 : void
26925 896 : ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
26926 : rtx op0, rtx op1,
26927 : rtx *quot_p, rtx *rem_p)
26928 : {
26929 1792 : rtx rem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
26930 :
26931 896 : rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
26932 : mode, op0, mode, op1, mode,
26933 896 : XEXP (rem, 0), Pmode);
26934 896 : *quot_p = quot;
26935 896 : *rem_p = rem;
26936 896 : }
26937 :
26938 : void
26939 64 : ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
26940 : enum rtx_code code, bool after,
26941 : bool doubleword)
26942 : {
26943 64 : rtx old_reg, new_reg, old_mem, success;
26944 64 : machine_mode mode = GET_MODE (target);
26945 64 : rtx_code_label *loop_label = NULL;
26946 :
26947 64 : old_reg = gen_reg_rtx (mode);
26948 64 : new_reg = old_reg;
26949 64 : old_mem = copy_to_reg (mem);
26950 64 : loop_label = gen_label_rtx ();
26951 64 : emit_label (loop_label);
26952 64 : emit_move_insn (old_reg, old_mem);
26953 :
26954 : /* return value for atomic_fetch_op. */
26955 64 : if (!after)
26956 32 : emit_move_insn (target, old_reg);
26957 :
26958 64 : if (code == NOT)
26959 : {
26960 16 : new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
26961 : true, OPTAB_LIB_WIDEN);
26962 16 : new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
26963 : }
26964 : else
26965 48 : new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
26966 : true, OPTAB_LIB_WIDEN);
26967 :
26968 : /* return value for atomic_op_fetch. */
26969 64 : if (after)
26970 32 : emit_move_insn (target, new_reg);
26971 :
26972 64 : success = NULL_RTX;
26973 :
26974 64 : ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
26975 : gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
26976 : SImode),
26977 : doubleword, loop_label);
26978 64 : }
26979 :
26980 : /* Relax cmpxchg instruction, param loop_label indicates whether
26981 : the instruction should be relaxed with a pause loop. If not,
26982 : it will be relaxed to an atomic load + compare, and skip
26983 : cmpxchg instruction if mem != exp_input. */
26984 :
26985 : void
26986 72 : ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
26987 : rtx mem, rtx exp_input, rtx new_input,
26988 : rtx mem_model, bool doubleword,
26989 : rtx_code_label *loop_label)
26990 : {
26991 72 : rtx_code_label *cmp_label = NULL;
26992 72 : rtx_code_label *done_label = NULL;
26993 72 : rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
26994 72 : rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
26995 72 : rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
26996 72 : machine_mode mode = GET_MODE (target_val), hmode = mode;
26997 :
26998 72 : if (*ptarget_bool == NULL)
26999 64 : target_bool = gen_reg_rtx (QImode);
27000 : else
27001 : target_bool = *ptarget_bool;
27002 :
27003 72 : cmp_label = gen_label_rtx ();
27004 72 : done_label = gen_label_rtx ();
27005 :
27006 72 : new_mem = gen_reg_rtx (mode);
27007 : /* Load memory first. */
27008 72 : expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
27009 :
27010 72 : switch (mode)
27011 : {
27012 : case E_TImode:
27013 : gendw = gen_atomic_compare_and_swapti_doubleword;
27014 : hmode = DImode;
27015 : break;
27016 18 : case E_DImode:
27017 18 : if (doubleword)
27018 : {
27019 : gendw = gen_atomic_compare_and_swapdi_doubleword;
27020 : hmode = SImode;
27021 : }
27022 : else
27023 : gen = gen_atomic_compare_and_swapdi_1;
27024 : break;
27025 18 : case E_SImode:
27026 18 : gen = gen_atomic_compare_and_swapsi_1;
27027 18 : break;
27028 18 : case E_HImode:
27029 18 : gen = gen_atomic_compare_and_swaphi_1;
27030 18 : break;
27031 18 : case E_QImode:
27032 18 : gen = gen_atomic_compare_and_swapqi_1;
27033 18 : break;
27034 0 : default:
27035 0 : gcc_unreachable ();
27036 : }
27037 :
27038 : /* Compare mem value with expected value. */
27039 54 : if (doubleword)
27040 : {
27041 0 : rtx low_new_mem = gen_lowpart (hmode, new_mem);
27042 0 : rtx low_exp_input = gen_lowpart (hmode, exp_input);
27043 0 : rtx high_new_mem = gen_highpart (hmode, new_mem);
27044 0 : rtx high_exp_input = gen_highpart (hmode, exp_input);
27045 0 : emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
27046 : hmode, 1, cmp_label,
27047 : profile_probability::guessed_never ());
27048 0 : emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
27049 : hmode, 1, cmp_label,
27050 : profile_probability::guessed_never ());
27051 : }
27052 : else
27053 72 : emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
27054 72 : GET_MODE (exp_input), 1, cmp_label,
27055 : profile_probability::guessed_never ());
27056 :
27057 : /* Directly emits cmpxchg here. */
27058 72 : if (doubleword)
27059 0 : emit_insn (gendw (target_val, mem, exp_input,
27060 0 : gen_lowpart (hmode, new_input),
27061 : gen_highpart (hmode, new_input),
27062 : mem_model));
27063 : else
27064 72 : emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
27065 :
27066 72 : if (!loop_label)
27067 : {
27068 8 : emit_jump_insn (gen_jump (done_label));
27069 8 : emit_barrier ();
27070 8 : emit_label (cmp_label);
27071 8 : emit_move_insn (target_val, new_mem);
27072 8 : emit_label (done_label);
27073 8 : ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
27074 : const0_rtx);
27075 : }
27076 : else
27077 : {
27078 64 : ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
27079 : const0_rtx);
27080 64 : emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
27081 64 : GET_MODE (target_bool), 1, loop_label,
27082 : profile_probability::guessed_never ());
27083 64 : emit_jump_insn (gen_jump (done_label));
27084 64 : emit_barrier ();
27085 :
27086 : /* If mem is not expected, pause and loop back. */
27087 64 : emit_label (cmp_label);
27088 64 : emit_move_insn (target_val, new_mem);
27089 64 : emit_insn (gen_pause ());
27090 64 : emit_jump_insn (gen_jump (loop_label));
27091 64 : emit_barrier ();
27092 64 : emit_label (done_label);
27093 : }
27094 :
27095 72 : *ptarget_bool = target_bool;
27096 72 : }
27097 :
27098 : /* Convert a BFmode VAL to SFmode without signaling sNaNs.
27099 : This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16. */
27100 :
27101 : rtx
27102 2832 : ix86_expand_fast_convert_bf_to_sf (rtx val)
27103 : {
27104 2832 : rtx op = gen_lowpart (HImode, val), ret;
27105 2832 : if (CONST_INT_P (op))
27106 : {
27107 514 : ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
27108 : val, BFmode);
27109 514 : if (ret)
27110 : return ret;
27111 : /* FLOAT_EXTEND simplification will fail if VAL is a sNaN. */
27112 1 : ret = gen_reg_rtx (SImode);
27113 1 : emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
27114 1 : emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
27115 1 : return gen_lowpart (SFmode, ret);
27116 : }
27117 :
27118 2318 : ret = gen_reg_rtx (SFmode);
27119 2318 : emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
27120 2318 : return ret;
27121 : }
27122 :
27123 : rtx
27124 65576 : ix86_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
27125 : rtx_code code, tree treeop0, tree treeop1)
27126 : {
27127 65576 : if (!TARGET_APX_CCMP)
27128 : return NULL_RTX;
27129 :
27130 65576 : rtx op0, op1, res;
27131 65576 : machine_mode op_mode;
27132 :
27133 65576 : start_sequence ();
27134 65576 : expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27135 :
27136 65576 : op_mode = GET_MODE (op0);
27137 65576 : if (op_mode == VOIDmode)
27138 0 : op_mode = GET_MODE (op1);
27139 :
27140 : /* We only supports following scalar comparisons that use just 1
27141 : instruction: DI/SI/QI/HI/DF/SF/HF.
27142 : Unordered/Ordered compare cannot be corretly indentified by
27143 : ccmp so they are not supported. */
27144 98348 : if (!(op_mode == DImode || op_mode == SImode || op_mode == HImode
27145 65576 : || op_mode == QImode || op_mode == DFmode || op_mode == SFmode
27146 32772 : || op_mode == HFmode)
27147 32806 : || code == ORDERED
27148 32806 : || code == UNORDERED)
27149 : {
27150 32770 : end_sequence ();
27151 32770 : return NULL_RTX;
27152 : }
27153 :
27154 : /* Canonicalize the operands according to mode. */
27155 32806 : if (SCALAR_INT_MODE_P (op_mode))
27156 : {
27157 32799 : if (!nonimmediate_operand (op0, op_mode))
27158 0 : op0 = force_reg (op_mode, op0);
27159 32799 : if (!x86_64_general_operand (op1, op_mode))
27160 0 : op1 = force_reg (op_mode, op1);
27161 : }
27162 : else
27163 : {
27164 : /* op0/op1 can be canonicallized from expand_fp_compare, so
27165 : just adjust the code to make it generate supported fp
27166 : condition. */
27167 7 : if (ix86_fp_compare_code_to_integer (code) == UNKNOWN)
27168 : {
27169 : /* First try to split condition if we don't need to honor
27170 : NaNs, as the ORDERED/UNORDERED check always fall
27171 : through. */
27172 6 : if (!HONOR_NANS (op_mode))
27173 : {
27174 6 : rtx_code first_code;
27175 6 : split_comparison (code, op_mode, &first_code, &code);
27176 : }
27177 : /* Otherwise try to swap the operand order and check if
27178 : the comparison is supported. */
27179 : else
27180 : {
27181 0 : code = swap_condition (code);
27182 0 : std::swap (op0, op1);
27183 : }
27184 :
27185 6 : if (ix86_fp_compare_code_to_integer (code) == UNKNOWN)
27186 : {
27187 0 : end_sequence ();
27188 0 : return NULL_RTX;
27189 : }
27190 : }
27191 : }
27192 :
27193 32806 : *prep_seq = end_sequence ();
27194 :
27195 32806 : start_sequence ();
27196 :
27197 32806 : res = ix86_expand_compare (code, op0, op1);
27198 :
27199 32806 : if (!res)
27200 : {
27201 : end_sequence ();
27202 : return NULL_RTX;
27203 : }
27204 32806 : *gen_seq = end_sequence ();
27205 :
27206 32806 : return res;
27207 : }
27208 :
27209 : rtx
27210 32809 : ix86_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
27211 : rtx_code cmp_code, tree treeop0, tree treeop1,
27212 : rtx_code bit_code)
27213 : {
27214 32809 : if (!TARGET_APX_CCMP)
27215 : return NULL_RTX;
27216 :
27217 32809 : rtx op0, op1, target;
27218 32809 : machine_mode op_mode, cmp_mode, cc_mode = CCmode;
27219 32809 : int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
27220 32809 : insn_code icode;
27221 32809 : rtx_code prev_code;
27222 32809 : struct expand_operand ops[5];
27223 32809 : int dfv;
27224 :
27225 : /* Exit early for non integer modes to avoid O(n^2) part of expand_operands. */
27226 32809 : cmp_mode = op_mode = TYPE_MODE (TREE_TYPE (treeop0));
27227 :
27228 32809 : if (!(op_mode == DImode || op_mode == SImode || op_mode == HImode
27229 : || op_mode == QImode))
27230 : return NULL_RTX;
27231 :
27232 32 : push_to_sequence (*prep_seq);
27233 32 : expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27234 :
27235 32 : icode = code_for_ccmp (op_mode);
27236 :
27237 32 : op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
27238 32 : op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
27239 32 : if (!op0 || !op1)
27240 : {
27241 0 : end_sequence ();
27242 0 : return NULL_RTX;
27243 : }
27244 :
27245 32 : *prep_seq = end_sequence ();
27246 :
27247 32 : target = gen_rtx_REG (cc_mode, FLAGS_REG);
27248 32 : dfv = ix86_get_flags_cc ((rtx_code) cmp_code);
27249 :
27250 32 : prev_code = GET_CODE (prev);
27251 : /* Fixup FP compare code here. */
27252 32 : if (GET_MODE (XEXP (prev, 0)) == CCFPmode)
27253 7 : prev_code = ix86_fp_compare_code_to_integer (prev_code);
27254 :
27255 32 : if (bit_code != AND)
27256 17 : prev_code = reverse_condition (prev_code);
27257 : else
27258 15 : dfv = (int)(dfv ^ 1);
27259 :
27260 32 : prev = gen_rtx_fmt_ee (prev_code, VOIDmode, XEXP (prev, 0),
27261 : const0_rtx);
27262 :
27263 32 : create_fixed_operand (&ops[0], target);
27264 32 : create_fixed_operand (&ops[1], prev);
27265 32 : create_fixed_operand (&ops[2], op0);
27266 32 : create_fixed_operand (&ops[3], op1);
27267 32 : create_fixed_operand (&ops[4], GEN_INT (dfv));
27268 :
27269 32 : push_to_sequence (*gen_seq);
27270 32 : if (!maybe_expand_insn (icode, 5, ops))
27271 : {
27272 0 : end_sequence ();
27273 0 : return NULL_RTX;
27274 : }
27275 :
27276 32 : *gen_seq = end_sequence ();
27277 :
27278 32 : return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
27279 : }
27280 :
27281 : /* Attempt to convert a CONST_VECTOR into a bcst_mem_operand.
27282 : Returns NULL_RTX if X is cannot be expressed as a suitable
27283 : VEC_DUPLICATE in mode MODE. */
27284 :
27285 : static rtx
27286 48 : ix86_gen_bcst_mem (machine_mode mode, rtx x)
27287 : {
27288 48 : if (!TARGET_AVX512F
27289 48 : || !CONST_VECTOR_P (x)
27290 64 : || (!TARGET_AVX512VL && GET_MODE_SIZE (mode) != 64)
27291 147 : || !VALID_BCST_MODE_P (GET_MODE_INNER (mode))
27292 : /* Disallow HFmode broadcast. */
27293 126 : || GET_MODE_SIZE (GET_MODE_INNER (mode)) < 4)
27294 : return NULL_RTX;
27295 :
27296 21 : rtx cst = CONST_VECTOR_ELT (x, 0);
27297 21 : if (!CONST_SCALAR_INT_P (cst)
27298 15 : && !CONST_DOUBLE_P (cst)
27299 0 : && !CONST_FIXED_P (cst))
27300 : return NULL_RTX;
27301 :
27302 21 : int n_elts = GET_MODE_NUNITS (mode);
27303 42 : if (CONST_VECTOR_NUNITS (x) != n_elts)
27304 : return NULL_RTX;
27305 :
27306 150 : for (int i = 1; i < n_elts; i++)
27307 129 : if (!rtx_equal_p (cst, CONST_VECTOR_ELT (x, i)))
27308 : return NULL_RTX;
27309 :
27310 42 : rtx mem = force_const_mem (GET_MODE_INNER (mode), cst);
27311 21 : return gen_rtx_VEC_DUPLICATE (mode, validize_mem (mem));
27312 : }
27313 :
27314 : /* Determine the ternlog immediate index that implements 3-operand
27315 : ternary logic expression OP. This uses and modifies the 3 element
27316 : array ARGS to record and check the leaves, either 3 REGs, or 2 REGs
27317 : and MEM. Returns an index between 0 and 255 for a valid ternlog,
27318 : or -1 if the expression isn't suitable. */
27319 :
27320 : int
27321 7280332 : ix86_ternlog_idx (rtx op, rtx *args)
27322 : {
27323 7280332 : int idx0, idx1;
27324 :
27325 7280332 : if (!op)
27326 : return -1;
27327 :
27328 7280332 : switch (GET_CODE (op))
27329 : {
27330 751232 : case SUBREG:
27331 751232 : if (!register_operand (op, GET_MODE (op)))
27332 : return -1;
27333 : /* FALLTHRU */
27334 :
27335 3565035 : case REG:
27336 3565035 : if (!args[0])
27337 : {
27338 1848599 : args[0] = op;
27339 1848599 : return 0xf0;
27340 : }
27341 1716436 : if (rtx_equal_p (op, args[0]))
27342 : return 0xf0;
27343 1689839 : if (!args[1])
27344 : {
27345 1425231 : args[1] = op;
27346 1425231 : return 0xcc;
27347 : }
27348 264608 : if (rtx_equal_p (op, args[1]))
27349 : return 0xcc;
27350 248036 : if (!args[2])
27351 : {
27352 225700 : args[2] = op;
27353 225700 : return 0xaa;
27354 : }
27355 22336 : if (rtx_equal_p (op, args[2]))
27356 : return 0xaa;
27357 : return -1;
27358 :
27359 17708 : case VEC_DUPLICATE:
27360 17708 : if (!bcst_mem_operand (op, GET_MODE (op)))
27361 : return -1;
27362 302 : goto do_mem_operand;
27363 :
27364 365347 : case MEM:
27365 365347 : if (!memory_operand (op, GET_MODE (op)))
27366 : return -1;
27367 365182 : if (MEM_P (op)
27368 365182 : && MEM_VOLATILE_P (op)
27369 365276 : && !volatile_ok)
27370 : return -1;
27371 : /* FALLTHRU */
27372 :
27373 473669 : case CONST_VECTOR:
27374 473669 : do_mem_operand:
27375 473669 : if (!args[2])
27376 : {
27377 426415 : args[2] = op;
27378 426415 : return 0xaa;
27379 : }
27380 : /* Maximum of one volatile memory reference per expression. */
27381 47254 : if (side_effects_p (op))
27382 : return -1;
27383 47254 : if (rtx_equal_p (op, args[2]))
27384 : return 0xaa;
27385 : /* Check if CONST_VECTOR is the ones-complement of args[2]. */
27386 47203 : if (CONST_VECTOR_P (op)
27387 3446 : && CONST_VECTOR_P (args[2])
27388 47448 : && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
27389 245 : op, GET_MODE (op)),
27390 : args[2]))
27391 : return 0x55;
27392 47016 : if (!args[0])
27393 : {
27394 45218 : args[0] = op;
27395 45218 : return 0xf0;
27396 : }
27397 1798 : if (rtx_equal_p (op, args[0]))
27398 : return 0xf0;
27399 : /* Check if CONST_VECTOR is the ones-complement of args[0]. */
27400 1798 : if (CONST_VECTOR_P (op)
27401 101 : && CONST_VECTOR_P (args[0])
27402 1840 : && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
27403 42 : op, GET_MODE (op)),
27404 : args[0]))
27405 : return 0x0f;
27406 1756 : if (!args[1])
27407 : {
27408 1744 : args[1] = op;
27409 1744 : return 0xcc;
27410 : }
27411 12 : if (rtx_equal_p (op, args[1]))
27412 : return 0xcc;
27413 : /* Check if CONST_VECTOR is the ones-complement of args[1]. */
27414 12 : if (CONST_VECTOR_P (op)
27415 0 : && CONST_VECTOR_P (args[1])
27416 12 : && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
27417 0 : op, GET_MODE (op)),
27418 : args[1]))
27419 : return 0x33;
27420 : return -1;
27421 :
27422 185167 : case NOT:
27423 185167 : idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
27424 185167 : return (idx0 >= 0) ? idx0 ^ 0xff : -1;
27425 :
27426 1301572 : case AND:
27427 1301572 : idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
27428 1301572 : if (idx0 < 0)
27429 : return -1;
27430 1071839 : idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
27431 1071839 : return (idx1 >= 0) ? idx0 & idx1 : -1;
27432 :
27433 953110 : case IOR:
27434 953110 : idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
27435 953110 : if (idx0 < 0)
27436 : return -1;
27437 708705 : idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
27438 708705 : return (idx1 >= 0) ? idx0 | idx1 : -1;
27439 :
27440 402407 : case XOR:
27441 402407 : idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
27442 402407 : if (idx0 < 0)
27443 : return -1;
27444 383203 : if (vector_all_ones_operand (XEXP (op, 1), GET_MODE (op)))
27445 6671 : return idx0 ^ 0xff;
27446 376532 : idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
27447 376532 : return (idx1 >= 0) ? idx0 ^ idx1 : -1;
27448 :
27449 7198 : case UNSPEC:
27450 7198 : if (XINT (op, 1) != UNSPEC_VTERNLOG
27451 0 : || XVECLEN (op, 0) != 4
27452 0 : || !CONST_INT_P (XVECEXP (op, 0, 3)))
27453 : return -1;
27454 :
27455 : /* TODO: Handle permuted operands. */
27456 0 : if (ix86_ternlog_idx (XVECEXP (op, 0, 0), args) != 0xf0
27457 0 : || ix86_ternlog_idx (XVECEXP (op, 0, 1), args) != 0xcc
27458 0 : || ix86_ternlog_idx (XVECEXP (op, 0, 2), args) != 0xaa)
27459 0 : return -1;
27460 0 : return INTVAL (XVECEXP (op, 0, 3));
27461 :
27462 : default:
27463 : return -1;
27464 : }
27465 : }
27466 :
27467 : /* Return TRUE if OP (in mode MODE) is the leaf of a ternary logic
27468 : expression, such as a register or a memory reference. */
27469 :
27470 : bool
27471 3377536 : ix86_ternlog_leaf_p (rtx op, machine_mode mode)
27472 : {
27473 : /* We can't use memory_operand here, as it may return a different
27474 : value before and after reload (for volatile MEMs) which creates
27475 : problems splitting instructions. */
27476 3377536 : return register_operand (op, mode)
27477 735344 : || MEM_P (op)
27478 384059 : || CONST_VECTOR_P (op)
27479 3659351 : || bcst_mem_operand (op, mode);
27480 : }
27481 :
27482 : /* Test whether OP is a 3-operand ternary logic expression suitable
27483 : for use in a ternlog instruction. */
27484 :
27485 : bool
27486 2245014 : ix86_ternlog_operand_p (rtx op)
27487 : {
27488 2245014 : rtx op0, op1;
27489 2245014 : rtx args[3];
27490 :
27491 2245014 : args[0] = NULL_RTX;
27492 2245014 : args[1] = NULL_RTX;
27493 2245014 : args[2] = NULL_RTX;
27494 2245014 : int idx = ix86_ternlog_idx (op, args);
27495 2245014 : if (idx < 0)
27496 : return false;
27497 :
27498 : /* Don't match simple (binary or unary) expressions. */
27499 1824755 : machine_mode mode = GET_MODE (op);
27500 1824755 : switch (GET_CODE (op))
27501 : {
27502 843125 : case AND:
27503 843125 : op0 = XEXP (op, 0);
27504 843125 : op1 = XEXP (op, 1);
27505 :
27506 : /* Prefer pand. */
27507 843125 : if (ix86_ternlog_leaf_p (op0, mode)
27508 843125 : && ix86_ternlog_leaf_p (op1, mode))
27509 : return false;
27510 : /* Prefer pandn. */
27511 109040 : if (GET_CODE (op0) == NOT
27512 77461 : && register_operand (XEXP (op0, 0), mode)
27513 182908 : && ix86_ternlog_leaf_p (op1, mode))
27514 : return false;
27515 : break;
27516 :
27517 622274 : case IOR:
27518 : /* Prefer por. */
27519 622274 : if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
27520 622274 : && ix86_ternlog_leaf_p (XEXP (op, 1), mode))
27521 : return false;
27522 : break;
27523 :
27524 326490 : case XOR:
27525 326490 : op1 = XEXP (op, 1);
27526 : /* Prefer pxor, or one_cmpl<vmode>2. */
27527 326490 : if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
27528 326490 : && ix86_ternlog_leaf_p (XEXP (op, 1), mode))
27529 : return false;
27530 : break;
27531 :
27532 : default:
27533 : break;
27534 : }
27535 : return true;
27536 : }
27537 :
27538 : /* Helper function for ix86_expand_ternlog. */
27539 : static rtx
27540 0 : ix86_expand_ternlog_binop (enum rtx_code code, machine_mode mode,
27541 : rtx op0, rtx op1, rtx target)
27542 : {
27543 0 : if (GET_MODE (op0) != mode)
27544 0 : op0 = gen_lowpart (mode, op0);
27545 0 : if (GET_MODE (op1) != mode)
27546 0 : op1 = gen_lowpart (mode, op1);
27547 :
27548 0 : if (CONST_VECTOR_P (op0))
27549 0 : op0 = validize_mem (force_const_mem (mode, op0));
27550 0 : if (CONST_VECTOR_P (op1))
27551 0 : op1 = validize_mem (force_const_mem (mode, op1));
27552 :
27553 0 : if (!register_operand (op0, mode))
27554 : {
27555 0 : if (!register_operand (op1, mode))
27556 : {
27557 : /* We can't use force_reg (op0, mode). */
27558 0 : rtx reg = gen_reg_rtx (mode);
27559 0 : emit_move_insn (reg, op0);
27560 0 : op0 = reg;
27561 : }
27562 : else
27563 : std::swap (op0, op1);
27564 : }
27565 0 : rtx ops[3] = { target, op0, op1 };
27566 0 : ix86_expand_vector_logical_operator (code, mode, ops);
27567 0 : return target;
27568 : }
27569 :
27570 :
27571 : /* Helper function for ix86_expand_ternlog. */
27572 : static rtx
27573 0 : ix86_expand_ternlog_andnot (machine_mode mode, rtx op0, rtx op1, rtx target)
27574 : {
27575 0 : if (GET_MODE (op0) != mode)
27576 0 : op0 = gen_lowpart (mode, op0);
27577 0 : op0 = gen_rtx_NOT (mode, op0);
27578 0 : if (GET_MODE (op1) != mode)
27579 0 : op1 = gen_lowpart (mode, op1);
27580 0 : if (CONST_VECTOR_P (op1))
27581 0 : op1 = validize_mem (force_const_mem (mode, op1));
27582 0 : emit_move_insn (target, gen_rtx_AND (mode, op0, op1));
27583 0 : return target;
27584 : }
27585 :
27586 : /* Expand a 3-operand ternary logic expression. Return TARGET. */
27587 : rtx
27588 2420 : ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx,
27589 : rtx target)
27590 : {
27591 2420 : rtx tmp0, tmp1, tmp2;
27592 :
27593 2420 : if (!target)
27594 3 : target = gen_reg_rtx (mode);
27595 :
27596 : /* Canonicalize ternlog index for degenerate (duplicated) operands. */
27597 2420 : if (rtx_equal_p (op0, op1) && rtx_equal_p (op0, op2))
27598 0 : switch (idx & 0x81)
27599 : {
27600 : case 0x00:
27601 : idx = 0x00;
27602 : break;
27603 : case 0x01:
27604 : idx = 0x0f;
27605 : break;
27606 : case 0x80:
27607 : idx = 0xf0;
27608 : break;
27609 : case 0x81:
27610 : idx = 0xff;
27611 : break;
27612 : }
27613 :
27614 2420 : switch (idx & 0xff)
27615 : {
27616 0 : case 0x00:
27617 0 : if ((!op0 || !side_effects_p (op0))
27618 0 : && (!op1 || !side_effects_p (op1))
27619 0 : && (!op2 || !side_effects_p (op2)))
27620 : {
27621 0 : emit_move_insn (target, CONST0_RTX (mode));
27622 0 : return target;
27623 : }
27624 : break;
27625 :
27626 0 : case 0x0a: /* ~a&c */
27627 0 : if ((!op1 || !side_effects_p (op1))
27628 0 : && op0 && register_operand (op0, mode)
27629 0 : && op2 && ix86_ternlog_leaf_p (op2, mode))
27630 0 : return ix86_expand_ternlog_andnot (mode, op0, op2, target);
27631 : break;
27632 :
27633 0 : case 0x0c: /* ~a&b */
27634 0 : if ((!op2 || !side_effects_p (op2))
27635 0 : && op0 && register_operand (op0, mode)
27636 0 : && op1 && ix86_ternlog_leaf_p (op1, mode))
27637 0 : return ix86_expand_ternlog_andnot (mode, op0, op1, target);
27638 : break;
27639 :
27640 78 : case 0x0f: /* ~a */
27641 0 : if ((!op1 || !side_effects_p (op1))
27642 78 : && (!op2 || !side_effects_p (op2))
27643 156 : && op0)
27644 : {
27645 78 : emit_move_insn (target, gen_rtx_XOR (mode, op0, CONSTM1_RTX (mode)));
27646 78 : return target;
27647 : }
27648 : break;
27649 :
27650 0 : case 0x22: /* ~b&c */
27651 0 : if ((!op0 || !side_effects_p (op0))
27652 0 : && op1 && register_operand (op1, mode)
27653 0 : && op2 && ix86_ternlog_leaf_p (op2, mode))
27654 0 : return ix86_expand_ternlog_andnot (mode, op1, op2, target);
27655 : break;
27656 :
27657 0 : case 0x30: /* ~b&a */
27658 0 : if ((!op2 || !side_effects_p (op2))
27659 0 : && op0 && ix86_ternlog_leaf_p (op0, mode)
27660 0 : && op1 && register_operand (op1, mode))
27661 0 : return ix86_expand_ternlog_andnot (mode, op1, op0, target);
27662 : break;
27663 :
27664 0 : case 0x33: /* ~b */
27665 0 : if ((!op0 || !side_effects_p (op0))
27666 0 : && (!op2 || !side_effects_p (op2))
27667 0 : && op1)
27668 : {
27669 0 : emit_move_insn (target, gen_rtx_XOR (mode, op1, CONSTM1_RTX (mode)));
27670 0 : return target;
27671 : }
27672 : break;
27673 :
27674 0 : case 0x3c: /* a^b */
27675 0 : if (op0 && ix86_ternlog_leaf_p (op0, mode)
27676 0 : && op1 && ix86_ternlog_leaf_p (op1, mode)
27677 0 : && (!op2 || !side_effects_p (op2)))
27678 0 : return ix86_expand_ternlog_binop (XOR, mode, op0, op1, target);
27679 : break;
27680 :
27681 0 : case 0x44: /* ~c&b */
27682 0 : if ((!op0 || !side_effects_p (op0))
27683 0 : && op1 && ix86_ternlog_leaf_p (op1, mode)
27684 0 : && op2 && register_operand (op2, mode))
27685 0 : return ix86_expand_ternlog_andnot (mode, op2, op1, target);
27686 : break;
27687 :
27688 2 : case 0x50: /* ~c&a */
27689 0 : if ((!op1 || !side_effects_p (op1))
27690 2 : && op0 && ix86_ternlog_leaf_p (op0, mode)
27691 4 : && op2 && register_operand (op2, mode))
27692 0 : return ix86_expand_ternlog_andnot (mode, op2, op0, target);
27693 : break;
27694 :
27695 4 : case 0x55: /* ~c */
27696 1 : if ((!op0 || !side_effects_p (op0))
27697 4 : && (!op1 || !side_effects_p (op1))
27698 8 : && op2)
27699 : {
27700 4 : emit_move_insn (target, gen_rtx_XOR (mode, op2, CONSTM1_RTX (mode)));
27701 4 : return target;
27702 : }
27703 : break;
27704 :
27705 0 : case 0x5a: /* a^c */
27706 0 : if (op0 && ix86_ternlog_leaf_p (op0, mode)
27707 0 : && op2 && ix86_ternlog_leaf_p (op2, mode)
27708 0 : && (!op1 || !side_effects_p (op1)))
27709 0 : return ix86_expand_ternlog_binop (XOR, mode, op0, op2, target);
27710 : break;
27711 :
27712 0 : case 0x66: /* b^c */
27713 0 : if ((!op0 || !side_effects_p (op0))
27714 0 : && op1 && ix86_ternlog_leaf_p (op1, mode)
27715 0 : && op2 && ix86_ternlog_leaf_p (op2, mode))
27716 0 : return ix86_expand_ternlog_binop (XOR, mode, op1, op2, target);
27717 : break;
27718 :
27719 0 : case 0x88: /* b&c */
27720 0 : if ((!op0 || !side_effects_p (op0))
27721 0 : && op1 && ix86_ternlog_leaf_p (op1, mode)
27722 0 : && op2 && ix86_ternlog_leaf_p (op2, mode))
27723 0 : return ix86_expand_ternlog_binop (AND, mode, op1, op2, target);
27724 : break;
27725 :
27726 0 : case 0xa0: /* a&c */
27727 0 : if ((!op1 || !side_effects_p (op1))
27728 0 : && op0 && ix86_ternlog_leaf_p (op0, mode)
27729 0 : && op2 && ix86_ternlog_leaf_p (op2, mode))
27730 0 : return ix86_expand_ternlog_binop (AND, mode, op0, op2, target);
27731 : break;
27732 :
27733 0 : case 0xaa: /* c */
27734 0 : if ((!op0 || !side_effects_p (op0))
27735 0 : && (!op1 || !side_effects_p (op1))
27736 0 : && op2)
27737 : {
27738 0 : if (GET_MODE (op2) != mode)
27739 0 : op2 = gen_lowpart (mode, op2);
27740 0 : emit_move_insn (target, op2);
27741 0 : return target;
27742 : }
27743 : break;
27744 :
27745 0 : case 0xc0: /* a&b */
27746 0 : if (op0 && ix86_ternlog_leaf_p (op0, mode)
27747 0 : && op1 && ix86_ternlog_leaf_p (op1, mode)
27748 0 : && (!op2 || !side_effects_p (op2)))
27749 0 : return ix86_expand_ternlog_binop (AND, mode, op0, op1, target);
27750 : break;
27751 :
27752 0 : case 0xcc: /* b */
27753 0 : if ((!op0 || !side_effects_p (op0))
27754 0 : && op1
27755 0 : && (!op2 || !side_effects_p (op2)))
27756 : {
27757 0 : if (GET_MODE (op1) != mode)
27758 0 : op1 = gen_lowpart (mode, op1);
27759 0 : emit_move_insn (target, op1);
27760 0 : return target;
27761 : }
27762 : break;
27763 :
27764 0 : case 0xee: /* b|c */
27765 0 : if ((!op0 || !side_effects_p (op0))
27766 0 : && op1 && ix86_ternlog_leaf_p (op1, mode)
27767 0 : && op2 && ix86_ternlog_leaf_p (op2, mode))
27768 0 : return ix86_expand_ternlog_binop (IOR, mode, op1, op2, target);
27769 : break;
27770 :
27771 6 : case 0xf0: /* a */
27772 6 : if (op0
27773 6 : && (!op1 || !side_effects_p (op1))
27774 12 : && (!op2 || !side_effects_p (op2)))
27775 : {
27776 6 : if (GET_MODE (op0) != mode)
27777 0 : op0 = gen_lowpart (mode, op0);
27778 6 : emit_move_insn (target, op0);
27779 6 : return target;
27780 : }
27781 : break;
27782 :
27783 0 : case 0xfa: /* a|c */
27784 0 : if (op0 && ix86_ternlog_leaf_p (op0, mode)
27785 0 : && op2 && ix86_ternlog_leaf_p (op2, mode)
27786 0 : && (!op1 || !side_effects_p (op1)))
27787 0 : return ix86_expand_ternlog_binop (IOR, mode, op0, op2, target);
27788 : break;
27789 :
27790 0 : case 0xfc: /* a|b */
27791 0 : if (op0 && ix86_ternlog_leaf_p (op0, mode)
27792 0 : && op1 && ix86_ternlog_leaf_p (op1, mode)
27793 0 : && (!op2 || !side_effects_p (op2)))
27794 0 : return ix86_expand_ternlog_binop (IOR, mode, op0, op1, target);
27795 : break;
27796 :
27797 0 : case 0xff:
27798 0 : if ((!op0 || !side_effects_p (op0))
27799 0 : && (!op1 || !side_effects_p (op1))
27800 0 : && (!op2 || !side_effects_p (op2)))
27801 : {
27802 0 : emit_move_insn (target, CONSTM1_RTX (mode));
27803 0 : return target;
27804 : }
27805 : break;
27806 : }
27807 :
27808 2332 : if (!register_operand (op0, mode))
27809 : {
27810 : /* We can't use force_reg (mode, op0). */
27811 12 : tmp0 = gen_reg_rtx (GET_MODE (op0));
27812 12 : emit_move_insn (tmp0,op0);
27813 : }
27814 : else
27815 : tmp0 = op0;
27816 2332 : if (GET_MODE (tmp0) != mode)
27817 0 : tmp0 = gen_lowpart (mode, tmp0);
27818 :
27819 2332 : if (!op1 || rtx_equal_p (op0, op1))
27820 6 : tmp1 = copy_rtx (tmp0);
27821 2326 : else if (!register_operand (op1, mode))
27822 : {
27823 : /* We can't use force_reg (mode, op1). */
27824 28 : tmp1 = gen_reg_rtx (GET_MODE (op1));
27825 28 : emit_move_insn (tmp1, op1);
27826 : }
27827 : else
27828 : tmp1 = op1;
27829 2332 : if (GET_MODE (tmp1) != mode)
27830 0 : tmp1 = gen_lowpart (mode, tmp1);
27831 :
27832 2332 : if (!op2 || rtx_equal_p (op0, op2))
27833 75 : tmp2 = copy_rtx (tmp0);
27834 2257 : else if (rtx_equal_p (op1, op2))
27835 0 : tmp2 = copy_rtx (tmp1);
27836 2257 : else if (CONST_VECTOR_P (op2))
27837 : {
27838 43 : if (GET_MODE (op2) != mode)
27839 0 : op2 = gen_lowpart (mode, op2);
27840 43 : tmp2 = ix86_gen_bcst_mem (mode, op2);
27841 43 : if (!tmp2)
27842 : {
27843 25 : machine_mode bcst32_mode = mode;
27844 25 : machine_mode bcst64_mode = mode;
27845 25 : switch (mode)
27846 : {
27847 1 : case V1TImode:
27848 1 : case V4SImode:
27849 1 : case V4SFmode:
27850 1 : case V8HImode:
27851 1 : case V16QImode:
27852 1 : bcst32_mode = V4SImode;
27853 1 : bcst64_mode = V2DImode;
27854 1 : break;
27855 :
27856 0 : case V2TImode:
27857 0 : case V8SImode:
27858 0 : case V8SFmode:
27859 0 : case V16HImode:
27860 0 : case V32QImode:
27861 0 : bcst32_mode = V8SImode;
27862 0 : bcst64_mode = V4DImode;
27863 0 : break;
27864 :
27865 3 : case V4TImode:
27866 3 : case V16SImode:
27867 3 : case V16SFmode:
27868 3 : case V32HImode:
27869 3 : case V64QImode:
27870 3 : bcst32_mode = V16SImode;
27871 3 : bcst64_mode = V8DImode;
27872 3 : break;
27873 :
27874 : default:
27875 : break;
27876 : }
27877 :
27878 25 : if (bcst32_mode != mode)
27879 : {
27880 4 : tmp2 = gen_lowpart (bcst32_mode, op2);
27881 4 : if (ix86_gen_bcst_mem (bcst32_mode, tmp2))
27882 : {
27883 3 : tmp2 = ix86_expand_ternlog (bcst32_mode,
27884 3 : gen_lowpart (bcst32_mode, tmp0),
27885 3 : gen_lowpart (bcst32_mode, tmp1),
27886 : tmp2, idx, NULL_RTX);
27887 3 : emit_move_insn (target, gen_lowpart (mode, tmp2));
27888 3 : return target;
27889 : }
27890 : }
27891 :
27892 22 : if (bcst64_mode != mode)
27893 : {
27894 1 : tmp2 = gen_lowpart (bcst64_mode, op2);
27895 1 : if (ix86_gen_bcst_mem (bcst64_mode, tmp2))
27896 : {
27897 0 : tmp2 = ix86_expand_ternlog (bcst64_mode,
27898 0 : gen_lowpart (bcst64_mode, tmp0),
27899 0 : gen_lowpart (bcst64_mode, tmp1),
27900 : tmp2, idx, NULL_RTX);
27901 0 : emit_move_insn (target, gen_lowpart (mode, tmp2));
27902 0 : return target;
27903 : }
27904 : }
27905 :
27906 22 : tmp2 = force_const_mem (mode, op2);
27907 22 : rtx bcast = ix86_broadcast_from_constant (mode, tmp2);
27908 22 : tmp2 = validize_mem (tmp2);
27909 22 : if (bcast)
27910 : {
27911 12 : rtx reg2 = gen_reg_rtx (mode);
27912 12 : bool ok = ix86_expand_vector_init_duplicate (false, mode,
27913 : reg2, bcast);
27914 12 : if (ok)
27915 2329 : tmp2 = reg2;
27916 : }
27917 : }
27918 : }
27919 : else
27920 : tmp2 = op2;
27921 2329 : if (GET_MODE (tmp2) != mode)
27922 0 : tmp2 = gen_lowpart (mode, tmp2);
27923 : /* Some memory_operands are not vector_memory_operands. */
27924 2329 : if (!bcst_vector_operand (tmp2, mode))
27925 0 : tmp2 = force_reg (mode, tmp2);
27926 :
27927 2329 : rtvec vec = gen_rtvec (4, tmp0, tmp1, tmp2, GEN_INT (idx));
27928 2329 : emit_move_insn (target, gen_rtx_UNSPEC (mode, vec, UNSPEC_VTERNLOG));
27929 2329 : return target;
27930 : }
27931 :
/* GF2P8AFFINEQB bit matrices used to implement byte-wise shifts and
   rotates.  Every table is indexed by the bit count; slot 0 is only a
   placeholder, because ix86_vgf2p8affine_shift_matrix asserts that the
   count is nonzero before indexing.  */

/* Matrices for a left shift of each byte by 1..7 bits.  */
static const uint64_t matrix_ashift[8] =
{
  0x0000000000000000ULL,  /* count 0: unused */
  0x0001020408102040ULL,  /* << 1 */
  0x0000010204081020ULL,  /* << 2 */
  0x0000000102040810ULL,  /* << 3 */
  0x0000000001020408ULL,  /* << 4 */
  0x0000000000010204ULL,  /* << 5 */
  0x0000000000000102ULL,  /* << 6 */
  0x0000000000000001ULL   /* << 7 */
};
27945 :
/* GF2P8AFFINEQB matrices for a logical right shift of each byte by
   1..7 bits, indexed by the shift count.  Slot 0 is a placeholder.  */
static const uint64_t matrix_lshiftrt[8] =
{
  0x0000000000000000ULL,  /* count 0: unused */
  0x0204081020408000ULL,  /* >> 1 (logical) */
  0x0408102040800000ULL,  /* >> 2 (logical) */
  0x0810204080000000ULL,  /* >> 3 (logical) */
  0x1020408000000000ULL,  /* >> 4 (logical) */
  0x2040800000000000ULL,  /* >> 5 (logical) */
  0x4080000000000000ULL,  /* >> 6 (logical) */
  0x8000000000000000ULL   /* >> 7 (logical) */
};
27957 :
/* GF2P8AFFINEQB matrices for an arithmetic right shift of each byte by
   1..7 bits, indexed by the shift count.  They differ from the logical
   variants only in the low-order bytes, which are 0x80 instead of 0.
   Slot 0 is a placeholder.  */
static const uint64_t matrix_ashiftrt[8] =
{
  0x0000000000000000ULL,  /* count 0: unused */
  0x0204081020408080ULL,  /* >> 1 (arithmetic) */
  0x0408102040808080ULL,  /* >> 2 (arithmetic) */
  0x0810204080808080ULL,  /* >> 3 (arithmetic) */
  0x1020408080808080ULL,  /* >> 4 (arithmetic) */
  0x2040808080808080ULL,  /* >> 5 (arithmetic) */
  0x4080808080808080ULL,  /* >> 6 (arithmetic) */
  0x8080808080808080ULL   /* >> 7 (arithmetic) */
};
27969 :
/* GF2P8AFFINEQB matrices for rotating each byte left by 1..7 bits,
   indexed by the rotate count.  Slot 0 is a placeholder.  */
static const uint64_t matrix_rotate[8] =
{
  0x0000000000000000ULL,  /* count 0: unused */
  0x8001020408102040ULL,  /* rol8 by 1 */
  0x4080010204081020ULL,  /* rol8 by 2 */
  0x2040800102040810ULL,  /* rol8 by 3 */
  0x1020408001020408ULL,  /* rol8 by 4 */
  0x0810204080010204ULL,  /* rol8 by 5 */
  0x0408102040800102ULL,  /* rol8 by 6 */
  0x0204081020408001ULL   /* rol8 by 7 */
};
27981 :
/* GF2P8AFFINEQB matrices for rotating each byte right by 1..7 bits,
   indexed by the rotate count.  Entry N holds the same value as the
   left-rotate matrix for 8 - N.  Slot 0 is a placeholder.  */
static const uint64_t matrix_rotatert[8] =
{
  0x0000000000000000ULL,  /* count 0: unused */
  0x0204081020408001ULL,  /* ror8 by 1 */
  0x0408102040800102ULL,  /* ror8 by 2 */
  0x0810204080010204ULL,  /* ror8 by 3 */
  0x1020408001020408ULL,  /* ror8 by 4 */
  0x2040800102040810ULL,  /* ror8 by 5 */
  0x4080010204081020ULL,  /* ror8 by 6 */
  0x8001020408102040ULL   /* ror8 by 7 */
};
27993 :
27994 : /* Return rtx to load a 64bit GF2P8AFFINE GP(2) matrix implementing a shift
27995 : for CODE and shift count COUNT into register with vector of size of SRC. */
27996 :
27997 : rtx
27998 202 : ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code)
27999 : {
28000 202 : machine_mode mode = GET_MODE (src);
28001 202 : const uint64_t *matrix;
28002 202 : unsigned shift = INTVAL (count) & 7;
28003 202 : gcc_assert (shift > 0 && shift < 8);
28004 :
28005 202 : switch (code)
28006 : {
28007 : case ASHIFT:
28008 : matrix = matrix_ashift;
28009 : break;
28010 27 : case ASHIFTRT:
28011 27 : matrix = matrix_ashiftrt;
28012 27 : break;
28013 30 : case LSHIFTRT:
28014 30 : matrix = matrix_lshiftrt;
28015 30 : break;
28016 34 : case ROTATE:
28017 34 : matrix = matrix_rotate;
28018 34 : break;
28019 35 : case ROTATERT:
28020 35 : matrix = matrix_rotatert;
28021 35 : break;
28022 0 : default:
28023 0 : gcc_unreachable ();
28024 : }
28025 :
28026 202 : int nelts = GET_MODE_NUNITS (mode);
28027 202 : rtvec vec = rtvec_alloc (nelts);
28028 202 : uint64_t ma = matrix[shift];
28029 6922 : for (int i = 0; i < nelts; i++)
28030 6720 : RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode);
28031 :
28032 202 : return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec));
28033 : }
28034 :
28035 : /* Trunc a vector to a narrow vector, like v4di -> v4si. */
28036 :
28037 : void
28038 63 : ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input, machine_mode cvt_mode)
28039 : {
28040 63 : machine_mode out_mode = GET_MODE (output);
28041 63 : machine_mode in_mode = GET_MODE (input);
28042 63 : int len = GET_MODE_SIZE (in_mode);
28043 252 : gcc_assert (len == GET_MODE_SIZE (cvt_mode)
28044 : && GET_MODE_INNER (out_mode) == GET_MODE_INNER (cvt_mode)
28045 : && (REG_P (input) || SUBREG_P (input)));
28046 63 : scalar_mode inner_out_mode = GET_MODE_INNER (out_mode);
28047 126 : int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
28048 63 : int out_innersize = GET_MODE_SIZE (inner_out_mode);
28049 :
28050 63 : struct expand_vec_perm_d d;
28051 63 : d.target = gen_reg_rtx (cvt_mode);
28052 63 : d.op0 = lowpart_subreg (cvt_mode, force_reg(in_mode, input), in_mode);
28053 63 : d.op1 = d.op0;
28054 63 : d.vmode = cvt_mode;
28055 63 : d.nelt = GET_MODE_NUNITS (cvt_mode);
28056 63 : d.testing_p = false;
28057 63 : d.one_operand_p = true;
28058 :
28059 : /* Init perm. Put the needed bits of input in order and
28060 : fill the rest of bits by default. */
28061 687 : for (int i = 0; i < d.nelt; ++i)
28062 : {
28063 624 : d.perm[i] = i;
28064 1248 : if (i < GET_MODE_NUNITS (out_mode))
28065 246 : d.perm[i] = i * (in_innersize / out_innersize);
28066 : }
28067 :
28068 63 : bool ok = ix86_expand_vec_perm_const_1(&d);
28069 63 : gcc_assert (ok);
28070 63 : emit_move_insn (output, gen_lowpart (out_mode, d.target));
28071 63 : }
28072 :
28073 : /* Implement truncv8sfv8bf2 with vector permutation. */
28074 : void
28075 8 : ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx src)
28076 : {
28077 8 : machine_mode vperm_mode, src_mode = GET_MODE (src);
28078 8 : switch (src_mode)
28079 : {
28080 : case V16SFmode:
28081 : vperm_mode = V32BFmode;
28082 : break;
28083 2 : case V8SFmode:
28084 2 : vperm_mode = V16BFmode;
28085 2 : break;
28086 4 : case V4SFmode:
28087 4 : vperm_mode = V8BFmode;
28088 4 : break;
28089 0 : default:
28090 0 : gcc_unreachable ();
28091 : }
28092 :
28093 8 : int nelt = GET_MODE_NUNITS (vperm_mode);
28094 8 : vec_perm_builder sel (nelt, nelt, 1);
28095 8 : sel.quick_grow (nelt);
28096 136 : for (int i = 0; i != nelt; i++)
28097 128 : sel[i] = (2 * i + 1) % nelt;
28098 16 : vec_perm_indices indices (sel, 1, nelt);
28099 :
28100 8 : rtx target = gen_reg_rtx (vperm_mode);
28101 8 : rtx op0 = lowpart_subreg (vperm_mode,
28102 : force_reg (src_mode, src),
28103 : src_mode);
28104 8 : bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
28105 : target, op0, op0, indices);
28106 8 : gcc_assert (ok);
28107 8 : emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
28108 8 : }
28109 :
28110 : /* Implement extendv8bf2v8sf2 with vector permutation. */
28111 : void
28112 8 : ix86_expand_vector_bf2sf_with_vec_perm (rtx dest, rtx src)
28113 : {
28114 8 : machine_mode vperm_mode, src_mode = GET_MODE (src);
28115 8 : switch (src_mode)
28116 : {
28117 : case V16BFmode:
28118 : vperm_mode = V32BFmode;
28119 : break;
28120 2 : case V8BFmode:
28121 2 : vperm_mode = V16BFmode;
28122 2 : break;
28123 4 : case V4BFmode:
28124 4 : vperm_mode = V8BFmode;
28125 4 : break;
28126 0 : default:
28127 0 : gcc_unreachable ();
28128 : }
28129 :
28130 8 : int nelt = GET_MODE_NUNITS (vperm_mode);
28131 8 : vec_perm_builder sel (nelt, nelt, 1);
28132 8 : sel.quick_grow (nelt);
28133 136 : for (int i = 0, k = 0, j = nelt; i != nelt; i++)
28134 128 : sel[i] = i & 1 ? j++ : k++;
28135 :
28136 16 : vec_perm_indices indices (sel, 2, nelt);
28137 :
28138 8 : rtx target = gen_reg_rtx (vperm_mode);
28139 8 : rtx op1 = lowpart_subreg (vperm_mode,
28140 : force_reg (src_mode, src),
28141 : src_mode);
28142 8 : rtx op0 = CONST0_RTX (vperm_mode);
28143 8 : bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
28144 : target, op0, op1, indices);
28145 8 : gcc_assert (ok);
28146 8 : emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
28147 8 : }
28148 :
28149 : /* Implement bitreverse<mode>2 using gf2p8affineqb. */
28150 :
28151 : void
28152 5 : ix86_expand_gfni_bitreverse (rtx dest, rtx src)
28153 : {
28154 5 : machine_mode mode = GET_MODE (dest);
28155 5 : rtx temp;
28156 10 : if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
28157 : {
28158 1 : rtx temp1 = gen_reg_rtx (mode == TImode ? V2DImode : V4SImode);
28159 1 : rtx temp2 = gen_reg_rtx (mode == TImode ? V2DImode : V4SImode);
28160 1 : if (mode == TImode)
28161 : {
28162 1 : temp = lowpart_subreg (DImode, src, TImode);
28163 1 : emit_insn (gen_rtx_SET (temp1, gen_rtx_VEC_CONCAT (V2DImode, temp,
28164 : const0_rtx)));
28165 1 : temp = gen_highpart (DImode, src);
28166 1 : emit_insn (gen_rtx_SET (temp2, gen_rtx_VEC_CONCAT (V2DImode, temp,
28167 : const0_rtx)));
28168 : }
28169 : else
28170 : {
28171 0 : temp = lowpart_subreg (SImode, src, DImode);
28172 0 : emit_insn (gen_vec_setv4si_0 (temp1, CONST0_RTX (V4SImode), temp));
28173 0 : temp = gen_highpart (SImode, src);
28174 0 : emit_insn (gen_vec_setv4si_0 (temp2, CONST0_RTX (V4SImode), temp));
28175 0 : temp1 = lowpart_subreg (V2DImode, temp1, V4SImode);
28176 0 : temp2 = lowpart_subreg (V2DImode, temp2, V4SImode);
28177 : }
28178 1 : temp = gen_reg_rtx (V2DImode);
28179 1 : emit_insn (gen_vec_interleave_lowv2di (temp, temp1, temp2));
28180 : }
28181 4 : else if (mode != DImode)
28182 : {
28183 3 : if (mode != SImode)
28184 : {
28185 2 : src = force_reg (mode, src);
28186 2 : src = lowpart_subreg (SImode, src, mode);
28187 : }
28188 3 : temp = gen_reg_rtx (V4SImode);
28189 3 : emit_insn (gen_vec_setv4si_0 (temp, CONST0_RTX (V4SImode), src));
28190 : }
28191 : else
28192 : {
28193 1 : temp = gen_reg_rtx (V2DImode);
28194 1 : emit_insn (gen_rtx_SET (temp, gen_rtx_VEC_CONCAT (V2DImode, src,
28195 : const0_rtx)));
28196 : }
28197 5 : src = temp;
28198 5 : temp = gen_reg_rtx (V16QImode);
28199 5 : rtx src2 = gen_rtx_CONST_VECTOR (V16QImode,
28200 : gen_rtvec (16, GEN_INT (1), GEN_INT (2),
28201 : GEN_INT (4), GEN_INT (8),
28202 : GEN_INT (16), GEN_INT (32),
28203 : GEN_INT (64), GEN_INT (-128),
28204 : GEN_INT (1), GEN_INT (2),
28205 : GEN_INT (4), GEN_INT (8),
28206 : GEN_INT (16), GEN_INT (32),
28207 : GEN_INT (64), GEN_INT (-128)));
28208 5 : src2 = validize_mem (force_const_mem (V16QImode, src2));
28209 5 : src = lowpart_subreg (V16QImode, src, GET_MODE (src));
28210 5 : emit_insn (gen_vgf2p8affineqb_v16qi (temp, src, src2, const0_rtx));
28211 5 : if (mode == QImode)
28212 : {
28213 1 : rtx temp1 = gen_reg_rtx (SImode);
28214 1 : rtx temp2 = lowpart_subreg (V4SImode, temp, V16QImode);
28215 1 : rtx temp3 = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
28216 1 : emit_insn (gen_rtx_SET (temp1,
28217 : gen_rtx_VEC_SELECT (SImode, temp2, temp3)));
28218 1 : emit_move_insn (dest, lowpart_subreg (QImode, temp1, SImode));
28219 1 : return;
28220 : }
28221 11 : rtx target = gen_reg_rtx ((GET_MODE_SIZE (mode) < 4 || !TARGET_64BIT)
28222 3 : ? SImode : mode == TImode ? DImode : mode);
28223 4 : emit_move_insn (target, lowpart_subreg (GET_MODE (target), temp, V16QImode));
28224 8 : if (GET_MODE_SIZE (mode) > UNITS_PER_WORD)
28225 : {
28226 1 : rtx temp1 = gen_reg_rtx (GET_MODE (target));
28227 1 : if (mode == TImode || TARGET_SSE4_1)
28228 : {
28229 1 : rtx temp2 = lowpart_subreg (mode == TImode ? V2DImode : V4SImode,
28230 : temp, V16QImode);
28231 1 : rtx temp3 = gen_rtx_PARALLEL (VOIDmode,
28232 : gen_rtvec (1, GEN_INT (mode == TImode
28233 : ? 1 : 2)));
28234 1 : emit_insn (gen_rtx_SET (temp1,
28235 : gen_rtx_VEC_SELECT (GET_MODE (target), temp2,
28236 : temp3)));
28237 1 : }
28238 : else
28239 : {
28240 0 : rtx temp2 = gen_reg_rtx (V4SImode);
28241 0 : rtx temp3 = lowpart_subreg (V4SImode, temp, V16QImode);
28242 0 : emit_insn (gen_sse2_pshufd (temp2, temp3, GEN_INT (0xaa)));
28243 0 : emit_move_insn (temp1, lowpart_subreg (GET_MODE (target), temp2,
28244 : V4SImode));
28245 : }
28246 1 : rtx temp4 = gen_reg_rtx (GET_MODE (target));
28247 1 : rtx temp5 = gen_reg_rtx (GET_MODE (target));
28248 0 : rtx (*gen_bswap) (rtx, rtx)
28249 1 : = mode == TImode ? gen_bswapdi2 : gen_bswapsi2;
28250 1 : emit_insn (gen_bswap (temp4, target));
28251 1 : emit_insn (gen_bswap (temp5, temp1));
28252 1 : temp4 = gen_rtx_ZERO_EXTEND (mode, temp4);
28253 1 : temp5 = gen_rtx_ZERO_EXTEND (mode, temp5);
28254 1 : rtx shift = GEN_INT (GET_MODE_PRECISION (GET_MODE (target)));
28255 1 : temp4 = gen_rtx_ASHIFT (mode, temp4, shift);
28256 1 : emit_insn (gen_rtx_SET (dest, gen_rtx_IOR (mode, temp4, temp5)));
28257 1 : return;
28258 : }
28259 3 : if (mode == HImode)
28260 1 : target = lowpart_subreg (mode, target, SImode);
28261 3 : if (mode == SImode)
28262 1 : emit_insn (gen_bswapsi2 (dest, target));
28263 : else
28264 2 : emit_insn (gen_rtx_SET (dest, gen_rtx_BSWAP (mode, target)));
28265 : }
28266 :
28267 : #include "gt-i386-expand.h"
|