Line data Source code
1 : /* Copyright (C) 1988-2026 Free Software Foundation, Inc.
2 :
3 : This file is part of GCC.
4 :
5 : GCC is free software; you can redistribute it and/or modify
6 : it under the terms of the GNU General Public License as published by
7 : the Free Software Foundation; either version 3, or (at your option)
8 : any later version.
9 :
10 : GCC is distributed in the hope that it will be useful,
11 : but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : GNU General Public License for more details.
14 :
15 : You should have received a copy of the GNU General Public License
16 : along with GCC; see the file COPYING3. If not see
17 : <http://www.gnu.org/licenses/>. */
18 :
19 : #define IN_TARGET_CODE 1
20 :
21 : #include "config.h"
22 : #include "system.h"
23 : #include "coretypes.h"
24 : #include "backend.h"
25 : #include "rtl.h"
26 : #include "tree.h"
27 : #include "memmodel.h"
28 : #include "gimple.h"
29 : #include "cfghooks.h"
30 : #include "cfgloop.h"
31 : #include "df.h"
32 : #include "tm_p.h"
33 : #include "stringpool.h"
34 : #include "expmed.h"
35 : #include "optabs.h"
36 : #include "regs.h"
37 : #include "emit-rtl.h"
38 : #include "recog.h"
39 : #include "cgraph.h"
40 : #include "diagnostic.h"
41 : #include "cfgbuild.h"
42 : #include "alias.h"
43 : #include "fold-const.h"
44 : #include "attribs.h"
45 : #include "calls.h"
46 : #include "stor-layout.h"
47 : #include "varasm.h"
48 : #include "output.h"
49 : #include "insn-attr.h"
50 : #include "flags.h"
51 : #include "except.h"
52 : #include "explow.h"
53 : #include "expr.h"
54 : #include "cfgrtl.h"
55 : #include "common/common-target.h"
56 : #include "langhooks.h"
57 : #include "reload.h"
58 : #include "gimplify.h"
59 : #include "dwarf2.h"
60 : #include "tm-constrs.h"
61 : #include "cselib.h"
62 : #include "sched-int.h"
63 : #include "opts.h"
64 : #include "tree-pass.h"
65 : #include "context.h"
66 : #include "pass_manager.h"
67 : #include "target-globals.h"
68 : #include "gimple-iterator.h"
69 : #include "shrink-wrap.h"
70 : #include "builtins.h"
71 : #include "rtl-iter.h"
72 : #include "tree-iterator.h"
73 : #include "dbgcnt.h"
74 : #include "case-cfn-macros.h"
75 : #include "dojump.h"
76 : #include "fold-const-call.h"
77 : #include "tree-vrp.h"
78 : #include "tree-ssanames.h"
79 : #include "selftest.h"
80 : #include "selftest-rtl.h"
81 : #include "print-rtl.h"
82 : #include "intl.h"
83 : #include "ifcvt.h"
84 : #include "symbol-summary.h"
85 : #include "sreal.h"
86 : #include "ipa-cp.h"
87 : #include "ipa-prop.h"
88 : #include "ipa-fnsummary.h"
89 : #include "wide-int-bitmask.h"
90 : #include "tree-vector-builder.h"
91 : #include "debug.h"
92 : #include "dwarf2out.h"
93 : #include "i386-options.h"
94 : #include "i386-builtins.h"
95 : #include "i386-expand.h"
96 : #include "asan.h"
97 :
98 : /* Split one or more double-mode RTL references into pairs of half-mode
99 : references. The RTL can be REG, offsettable MEM, integer constant, or
100 : CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
101 : split and "num" is its length. lo_half and hi_half are output arrays
102 : that parallel "operands". */
103 :
void
split_double_mode (machine_mode mode, rtx operands[],
		   int num, rtx lo_half[], rtx hi_half[])
{
  machine_mode half_mode;
  unsigned int byte;
  /* Cache of the most recently split MEM operand so that repeated
     references to the same memory reuse the same half-mode MEMs.  */
  rtx mem_op = NULL_RTX;
  int mem_num = 0;

  switch (mode)
    {
    case E_TImode:
      half_mode = DImode;
      break;
    case E_DImode:
      half_mode = SImode;
      break;
    case E_P2HImode:
      half_mode = HImode;
      break;
    case E_P2QImode:
      half_mode = QImode;
      break;
    default:
      gcc_unreachable ();
    }

  /* Byte offset of the high half within the double-mode value.  */
  byte = GET_MODE_SIZE (half_mode);

  /* Walk the operand array backwards; each entry is split independently.  */
  while (num--)
    {
      rtx op = operands[num];

      /* simplify_subreg refuses to split volatile memory addresses,
	 but we still have to handle it.  */
      if (MEM_P (op))
	{
	  if (mem_op && rtx_equal_p (op, mem_op))
	    {
	      /* Same MEM as a previously split operand: reuse the
		 halves already built for it.  */
	      lo_half[num] = lo_half[mem_num];
	      hi_half[num] = hi_half[mem_num];
	    }
	  else
	    {
	      mem_op = op;
	      mem_num = num;
	      lo_half[num] = adjust_address (op, half_mode, 0);
	      hi_half[num] = adjust_address (op, half_mode, byte);
	    }
	}
      else
	{
	  /* Constants carry VOIDmode, so split them as MODE values.  */
	  lo_half[num] = simplify_gen_subreg (half_mode, op,
					      GET_MODE (op) == VOIDmode
					      ? mode : GET_MODE (op), 0);

	  rtx tmp = simplify_gen_subreg (half_mode, op,
					 GET_MODE (op) == VOIDmode
					 ? mode : GET_MODE (op), byte);
	  /* simplify_gen_subreg will return NULL RTX for the
	     high half of the paradoxical subreg.  */
	  hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
	}
    }
}
169 :
170 : /* Emit the double word assignment DST = { LO, HI }. */
171 :
void
split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
{
  rtx dlo, dhi;
  /* Count of moves skipped because source already equals destination;
     if both halves are skipped, emit a NOTE below so the expansion
     still produces at least one insn.  */
  int deleted_move_count = 0;
  split_double_mode (mode, &dst, 1, &dlo, &dhi);
  /* Constraints ensure that if both lo and hi are MEMs, then
     dst has early-clobber and thus addresses of MEMs don't use
     dlo/dhi registers.  Otherwise if at least one of lo and hi are MEMs,
     dlo/dhi are registers.  */
  if (MEM_P (lo)
      && rtx_equal_p (dlo, hi)
      && reg_overlap_mentioned_p (dhi, lo))
    {
      /* If dlo is same as hi and lo's address uses dhi register,
	 code below would first emit_move_insn (dhi, hi)
	 and then emit_move_insn (dlo, lo).  But the former
	 would invalidate lo's address.  Load into dhi first,
	 then swap.  */
      emit_move_insn (dhi, lo);
      lo = dhi;
    }
  else if (MEM_P (hi)
	   && !MEM_P (lo)
	   && !rtx_equal_p (dlo, lo)
	   && reg_overlap_mentioned_p (dlo, hi))
    {
      /* In this case, code below would first emit_move_insn (dlo, lo)
	 and then emit_move_insn (dhi, hi).  But the former would
	 invalidate hi's address.  */
      if (rtx_equal_p (dhi, lo))
	{
	  /* We can't load into dhi first, so load into dlo
	     first and we'll swap.  */
	  emit_move_insn (dlo, hi);
	  hi = dlo;
	}
      else
	{
	  /* Load into dhi first.  */
	  emit_move_insn (dhi, hi);
	  hi = dhi;
	}
    }
  if (!rtx_equal_p (dlo, hi))
    {
      /* dlo does not feed the hi source, so set the low half first.  */
      if (!rtx_equal_p (dlo, lo))
	emit_move_insn (dlo, lo);
      else
	deleted_move_count++;
      if (!rtx_equal_p (dhi, hi))
	emit_move_insn (dhi, hi);
      else
	deleted_move_count++;
    }
  else if (!rtx_equal_p (lo, dhi))
    {
      /* dlo == hi: set the high half first so hi isn't clobbered
	 before it is consumed.  */
      if (!rtx_equal_p (dhi, hi))
	emit_move_insn (dhi, hi);
      else
	deleted_move_count++;
      if (!rtx_equal_p (dlo, lo))
	emit_move_insn (dlo, lo);
      else
	deleted_move_count++;
    }
  /* Here dlo == hi and dhi == lo: the halves are crossed, so swap.  */
  else if (mode == TImode)
    emit_insn (gen_swapdi (dlo, dhi));
  else
    emit_insn (gen_swapsi (dlo, dhi));

  if (deleted_move_count == 2)
    emit_note (NOTE_INSN_DELETED);
}
246 :
247 :
248 : /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
249 : for the target. */
250 :
251 : void
252 112807 : ix86_expand_clear (rtx dest)
253 : {
254 112807 : rtx tmp;
255 :
256 : /* We play register width games, which are only valid after reload. */
257 112807 : gcc_assert (reload_completed);
258 :
259 : /* Avoid HImode and its attendant prefix byte. */
260 225614 : if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
261 990 : dest = gen_rtx_REG (SImode, REGNO (dest));
262 112807 : tmp = gen_rtx_SET (dest, const0_rtx);
263 :
264 112807 : if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
265 : {
266 112807 : rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
267 112807 : tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
268 : }
269 :
270 112807 : emit_insn (tmp);
271 112807 : }
272 :
273 : /* Return true if V can be broadcasted from an integer of WIDTH bits
274 : which is returned in VAL_BROADCAST. Otherwise, return false. */
275 :
276 : static bool
277 4851 : ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
278 : HOST_WIDE_INT &val_broadcast)
279 : {
280 4851 : wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
281 4851 : val_broadcast = wi::extract_uhwi (val, 0, width);
282 6543 : for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
283 : {
284 5089 : HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
285 5089 : if (val_broadcast != each)
286 : return false;
287 : }
288 1454 : val_broadcast = sext_hwi (val_broadcast, width);
289 1454 : return true;
290 4851 : }
291 :
292 : /* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */
293 :
rtx
ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
{
  /* Don't use integer vector broadcast if we can't move from GPR to SSE
     register directly.  */
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    return nullptr;

  unsigned int msize = GET_MODE_SIZE (mode);

  /* Only optimized for vpbroadcast[bwsd]/vbroadcastss with xmm/ymm/zmm.  */
  if (msize != 16 && msize != 32 && msize != 64)
    return nullptr;

  /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
     broadcast only if vector broadcast is available, OP really is a
     CONST_WIDE_INT, and its bit width exactly matches MODE.  */
  if (!TARGET_AVX
      || !CONST_WIDE_INT_P (op)
      || standard_sse_constant_p (op, mode)
      || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
	  != GET_MODE_BITSIZE (mode)))
    return nullptr;

  HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
  HOST_WIDE_INT val_broadcast;
  scalar_int_mode broadcast_mode;
  /* Try element widths from narrowest to widest, gated on the insn
     actually available for that width and vector size.
     vpbroadcastb zmm requires TARGET_AVX512BW.  */
  if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
      && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
			 val_broadcast))
    broadcast_mode = QImode;
  else if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
	   && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
			      val_broadcast))
    broadcast_mode = HImode;
  /* vbroadcasts[sd] only support memory operand w/o AVX2.
     When msize == 16, pshufs is used for vec_duplicate.
     When msize == 64, vpbroadcastd is used, and TARGET_AVX512F is
     required.  */
  else if ((msize != 32 || TARGET_AVX2)
	   && ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
			      val_broadcast))
    broadcast_mode = SImode;
  else if (TARGET_64BIT && (msize != 32 || TARGET_AVX2)
	   && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
			      val_broadcast))
    broadcast_mode = DImode;
  else
    return nullptr;

  /* Check if OP can be broadcasted from VAL, i.e. all HOST_WIDE_INT
     elements of OP equal the first one.  */
  for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
    if (val != CONST_WIDE_INT_ELT (op, i))
      return nullptr;

  unsigned int nunits = (GET_MODE_SIZE (mode)
			 / GET_MODE_SIZE (broadcast_mode));
  machine_mode vector_mode;
  if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
    gcc_unreachable ();
  rtx target = gen_reg_rtx (vector_mode);
  bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
					       target,
					       GEN_INT (val_broadcast));
  if (!ok)
    return nullptr;
  /* Reinterpret the broadcast vector in the originally requested MODE.  */
  target = lowpart_subreg (mode, target, vector_mode);
  return target;
}
362 :
/* Expand a scalar move of MODE from OPERANDS[1] to OPERANDS[0],
   handling TLS/PIC symbol references, GOT loads and other x86
   special cases, then emit the final SET.  */
void
ix86_expand_move (machine_mode mode, rtx operands[])
{
  rtx op0, op1;
  rtx tmp, addend = NULL_RTX;
  enum tls_model model;

  op0 = operands[0];
  op1 = operands[1];

  /* Avoid complex sets of likely spilled hard registers before reload.
     Recurse to move through a fresh pseudo instead.  */
  if (!ix86_hardreg_mov_ok (op0, op1))
    {
      tmp = gen_reg_rtx (mode);
      operands[0] = tmp;
      ix86_expand_move (mode, operands);
      operands[0] = op0;
      operands[1] = tmp;
      op1 = tmp;
    }

  switch (GET_CODE (op1))
    {
    case CONST:
      /* A (const (plus (symbol_ref ...) (const_int ...))) is handled
	 like the symbol with ADDEND applied afterwards.  */
      tmp = XEXP (op1, 0);

      if (GET_CODE (tmp) != PLUS
	  || !SYMBOL_REF_P (XEXP (tmp, 0)))
	break;

      op1 = XEXP (tmp, 0);
      addend = XEXP (tmp, 1);
      /* FALLTHRU */

    case SYMBOL_REF:
      model = SYMBOL_REF_TLS_MODEL (op1);

      if (model)
	op1 = legitimize_tls_address (op1, model, true);
      else if (ix86_force_load_from_GOT_p (op1))
	{
	  /* Load the external function address via GOT slot to avoid PLT.  */
	  op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
				(TARGET_64BIT
				 ? UNSPEC_GOTPCREL
				 : UNSPEC_GOT));
	  op1 = gen_rtx_CONST (Pmode, op1);
	  op1 = gen_const_mem (Pmode, op1);
	  set_mem_alias_set (op1, GOT_ALIAS_SET);
	}
      else
	{
#if TARGET_PECOFF
	  tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);

	  if (tmp)
	    {
	      op1 = tmp;
	      if (!addend)
		break;
	    }
	  else
#endif
	    {
	      /* Plain symbol, no special handling needed.  */
	      op1 = operands[1];
	      break;
	    }
	}

      if (addend)
	{
	  /* Re-apply the addend stripped off in the CONST case.  */
	  op1 = force_operand (op1, NULL_RTX);
	  op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
				     op0, 1, OPTAB_DIRECT);
	}
      else
	op1 = force_operand (op1, op0);

      if (op1 == op0)
	return;

      op1 = convert_to_mode (mode, op1, 1);

    default:
      break;

    case SUBREG:
      /* Transform TImode paradoxical SUBREG into zero_extendditi2.  */
      if (TARGET_64BIT
	  && mode == TImode
	  && SUBREG_P (op1)
	  && GET_MODE (SUBREG_REG (op1)) == DImode
	  && SUBREG_BYTE (op1) == 0)
	op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
      /* As not all values in XFmode are representable in real_value,
	 we might be called with unfoldable SUBREGs of constants.  */
      if (mode == XFmode
	  && CONSTANT_P (SUBREG_REG (op1))
	  && can_create_pseudo_p ())
	{
	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
	  if (r)
	    r = validize_mem (r);
	  else
	    r = force_reg (imode, SUBREG_REG (op1));
	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
	}
      break;
    }

  if ((flag_pic || MACHOPIC_INDIRECT)
      && symbolic_operand (op1, mode))
    {
#if TARGET_MACHO
      if (TARGET_MACHO && !TARGET_64BIT)
	{
	  /* dynamic-no-pic */
	  if (MACHOPIC_INDIRECT)
	    {
	      tmp = (op0 && REG_P (op0) && mode == Pmode)
		? op0 : gen_reg_rtx (Pmode);
	      op1 = machopic_indirect_data_reference (op1, tmp);
	      if (MACHOPIC_PURE)
		op1 = machopic_legitimize_pic_address (op1, mode,
						       tmp == op1 ? 0 : tmp);
	    }
	  if (op0 != op1 && !MEM_P (op0))
	    {
	      rtx insn = gen_rtx_SET (op0, op1);
	      emit_insn (insn);
	      return;
	    }
	}
#endif

      if (MEM_P (op0))
	op1 = force_reg (mode, op1);
      else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
	{
	  /* After reload no pseudo is available; legitimize into OP0.  */
	  rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
	  op1 = legitimize_pic_address (op1, reg);
	  if (op0 == op1)
	    return;
	  op1 = convert_to_mode (mode, op1, 1);
	}
    }
  else
    {
      /* Avoid mem-to-mem moves except for pushes, which allow a MEM
	 source.  */
      if (MEM_P (op0)
	  && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
	      || !push_operand (op0, mode))
	  && MEM_P (op1))
	op1 = force_reg (mode, op1);

      if (push_operand (op0, mode)
	  && ! general_no_elim_operand (op1, mode))
	op1 = copy_to_mode_reg (mode, op1);

      /* Force large constants in 64bit compilation into register
	 to get them CSEed.  */
      if (can_create_pseudo_p ()
	  && (mode == DImode) && TARGET_64BIT
	  && immediate_operand (op1, mode)
	  && !x86_64_zext_immediate_operand (op1, VOIDmode)
	  && !register_operand (op0, mode)
	  && optimize)
	op1 = copy_to_mode_reg (mode, op1);

      if (can_create_pseudo_p ())
	{
	  if (CONST_DOUBLE_P (op1))
	    {
	      /* If we are loading a floating point constant to a
		 register, force the value to memory now, since we'll
		 get better code out the back end.  */

	      op1 = validize_mem (force_const_mem (mode, op1));
	      if (!register_operand (op0, mode))
		{
		  tmp = gen_reg_rtx (mode);
		  emit_insn (gen_rtx_SET (tmp, op1));
		  emit_move_insn (op0, tmp);
		  return;
		}
	    }
	}
    }

  /* Special case inserting 64-bit values into a TImode register.  */
  if (TARGET_64BIT
      /* Disable for -O0 (see PR110587) unless naked (PR110533).  */
      && (optimize || ix86_function_naked (current_function_decl))
      && (mode == DImode || mode == DFmode)
      && SUBREG_P (op0)
      && GET_MODE (SUBREG_REG (op0)) == TImode
      && REG_P (SUBREG_REG (op0))
      && REG_P (op1))
    {
      /* Use *insvti_lowpart_1 to set lowpart.  */
      if (SUBREG_BYTE (op0) == 0)
	{
	  wide_int mask = wi::mask (64, true, 128);
	  tmp = immed_wide_int_const (mask, TImode);
	  op0 = SUBREG_REG (op0);
	  tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
	  if (mode == DFmode)
	    op1 = gen_lowpart (DImode, op1);
	  op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
	  op1 = gen_rtx_IOR (TImode, tmp, op1);
	}
      /* Use *insvti_highpart_1 to set highpart.  */
      else if (SUBREG_BYTE (op0) == 8)
	{
	  wide_int mask = wi::mask (64, false, 128);
	  tmp = immed_wide_int_const (mask, TImode);
	  op0 = SUBREG_REG (op0);
	  tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
	  if (mode == DFmode)
	    op1 = gen_lowpart (DImode, op1);
	  op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
	  op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
	  op1 = gen_rtx_IOR (TImode, tmp, op1);
	}
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
591 :
592 : /* OP is a memref of CONST_VECTOR, return scalar constant mem
593 : if CONST_VECTOR is a vec_duplicate, else return NULL. */
594 : rtx
595 2462446 : ix86_broadcast_from_constant (machine_mode mode, rtx op)
596 : {
597 2462446 : int nunits = GET_MODE_NUNITS (mode);
598 2462446 : if (nunits < 2)
599 : return nullptr;
600 :
601 : /* Don't use integer vector broadcast if we can't move from GPR to SSE
602 : register directly. */
603 2333928 : if (!TARGET_INTER_UNIT_MOVES_TO_VEC
604 8172 : && INTEGRAL_MODE_P (mode))
605 : return nullptr;
606 :
607 : /* Convert CONST_VECTOR to a non-standard SSE constant integer
608 : broadcast only if vector broadcast is available. */
609 2328366 : if (standard_sse_constant_p (op, mode))
610 : return nullptr;
611 :
612 4656726 : if (GET_MODE_INNER (mode) == TImode)
613 : return nullptr;
614 :
615 2328253 : rtx constant = get_pool_constant (XEXP (op, 0));
616 2328253 : if (!CONST_VECTOR_P (constant))
617 : return nullptr;
618 :
619 : /* There could be some rtx like
620 : (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
621 : but with "*.LC1" refer to V2DI constant vector. */
622 2328253 : if (GET_MODE (constant) != mode)
623 : {
624 659 : constant = simplify_subreg (mode, constant, GET_MODE (constant),
625 : 0);
626 659 : if (constant == nullptr || !CONST_VECTOR_P (constant))
627 : return nullptr;
628 : }
629 :
630 2328253 : rtx first = XVECEXP (constant, 0, 0);
631 :
632 7692460 : for (int i = 1; i < nunits; ++i)
633 : {
634 7075952 : rtx tmp = XVECEXP (constant, 0, i);
635 : /* Vector duplicate value. */
636 7075952 : if (!rtx_equal_p (tmp, first))
637 : return nullptr;
638 : }
639 :
640 : return first;
641 : }
642 :
/* Expand a vector-mode move of MODE from OPERANDS[1] to OPERANDS[0],
   handling constant pool forcing, broadcast shortcuts and misaligned
   SSE accesses, then emit the final SET.  */
void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
  rtx op0 = operands[0], op1 = operands[1];
  /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
     psABI since the biggest alignment is 4 byte for IA MCU psABI.  */
  unsigned int align = (TARGET_IAMCU
			? GET_MODE_BITSIZE (mode)
			: GET_MODE_ALIGNMENT (mode));

  if (push_operand (op0, VOIDmode))
    op0 = emit_move_resolve_push (mode, op0);

  /* Force constants other than zero into memory.  We do not know how
     the instructions used to build constants modify the upper 64 bits
     of the register, once we have that information we may be able
     to handle some of them more efficiently.  */
  if (can_create_pseudo_p ()
      && (CONSTANT_P (op1)
	  || (SUBREG_P (op1)
	      && CONSTANT_P (SUBREG_REG (op1))))
      && ((register_operand (op0, mode)
	   && !standard_sse_constant_p (op1, mode))
	  /* ix86_expand_vector_move_misalign() does not like constants.  */
	  || (SSE_REG_MODE_P (mode)
	      && MEM_P (op0)
	      && MEM_ALIGN (op0) < align)))
    {
      if (SUBREG_P (op1))
	{
	  /* Force the inner constant to memory (or a register if the
	     pool fails) and rebuild the subreg over the result.  */
	  machine_mode imode = GET_MODE (SUBREG_REG (op1));
	  rtx r = force_const_mem (imode, SUBREG_REG (op1));
	  if (r)
	    r = validize_mem (r);
	  else
	    r = force_reg (imode, SUBREG_REG (op1));
	  op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
	}
      else
	{
	  /* Prefer a register broadcast over a constant pool load
	     when possible.  */
	  machine_mode mode = GET_MODE (op0);
	  rtx tmp = ix86_convert_const_wide_int_to_broadcast
	    (mode, op1);
	  if (tmp == nullptr)
	    op1 = validize_mem (force_const_mem (mode, op1));
	  else
	    op1 = tmp;
	}
    }

  /* Loads from the constant pool: try to turn a vec_duplicate pool
     entry into a broadcast of its scalar element.  */
  if (can_create_pseudo_p ()
      && GET_MODE_SIZE (mode) >= 16
      && VECTOR_MODE_P (mode)
      && (MEM_P (op1)
	  && SYMBOL_REF_P (XEXP (op1, 0))
	  && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
    {
      rtx first = ix86_broadcast_from_constant (mode, op1);
      if (first != nullptr)
	{
	  /* Broadcast to XMM/YMM/ZMM register from an integer
	     constant or scalar mem.  */
	  rtx tmp = gen_reg_rtx (mode);
	  if (FLOAT_MODE_P (mode))
	    first = force_const_mem (GET_MODE_INNER (mode), first);
	  bool ok = ix86_expand_vector_init_duplicate (false, mode,
						       tmp, first);
	  if (!ok && !TARGET_64BIT && GET_MODE_INNER (mode) == DImode)
	    {
	      /* Retry from memory: 32-bit targets cannot broadcast a
		 DImode element from a GPR.  */
	      first = force_const_mem (GET_MODE_INNER (mode), first);
	      ok = ix86_expand_vector_init_duplicate (false, mode,
						      tmp, first);
	    }
	  if (ok)
	    {
	      emit_move_insn (op0, tmp);
	      return;
	    }
	}
    }

  /* We need to check memory alignment for SSE mode since attribute
     can make operands unaligned.  */
  if (can_create_pseudo_p ()
      && SSE_REG_MODE_P (mode)
      && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
	  || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
    {
      rtx tmp[2];

      /* ix86_expand_vector_move_misalign() does not like both
	 arguments in memory.  */
      if (!register_operand (op0, mode)
	  && !register_operand (op1, mode))
	{
	  rtx scratch = gen_reg_rtx (mode);
	  emit_move_insn (scratch, op1);
	  op1 = scratch;
	}

      tmp[0] = op0; tmp[1] = op1;
      ix86_expand_vector_move_misalign (mode, tmp);
      return;
    }

  /* Special case TImode to 128-bit vector conversions via V2DI.  */
  if (VECTOR_MODE_P (mode)
      && GET_MODE_SIZE (mode) == 16
      && SUBREG_P (op1)
      && GET_MODE (SUBREG_REG (op1)) == TImode
      && TARGET_64BIT && TARGET_SSE
      && ix86_pre_reload_split ())
    {
      /* Build the vector from the two DImode halves of the TImode
	 value instead of going through memory.  */
      rtx tmp = gen_reg_rtx (V2DImode);
      rtx lo = gen_reg_rtx (DImode);
      rtx hi = gen_reg_rtx (DImode);
      emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
      emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
      emit_insn (gen_vec_concatv2di (tmp, lo, hi));
      emit_move_insn (op0, gen_lowpart (mode, tmp));
      return;
    }

  /* If operand0 is a hard register, make operand1 a pseudo.  */
  if (can_create_pseudo_p ()
      && !ix86_hardreg_mov_ok (op0, op1))
    {
      rtx tmp = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  /* Make operand1 a register if it isn't already.  */
  if (can_create_pseudo_p ()
      && !register_operand (op0, mode)
      && !register_operand (op1, mode))
    {
      rtx tmp = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp, op1);
      emit_move_insn (op0, tmp);
      return;
    }

  emit_insn (gen_rtx_SET (op0, op1));
}
790 : /* Split 32-byte AVX unaligned load and store if needed. */
791 :
static void
ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
{
  rtx m;
  /* Generator for the vextractf128 pattern of the chosen mode.  */
  rtx (*extract) (rtx, rtx, rtx);
  machine_mode mode;

  /* If the tuning does not ask for split unaligned loads/stores,
     emit a single unaligned 256-bit move.  */
  if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
      || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  rtx orig_op0 = NULL_RTX;
  mode = GET_MODE (op0);
  switch (GET_MODE_CLASS (mode))
    {
    case MODE_VECTOR_INT:
    case MODE_INT:
      /* Canonicalize all integer modes to V32QImode; a register
	 destination is replaced by a fresh V32QImode pseudo and
	 copied back at the end (orig_op0).  */
      if (mode != V32QImode)
	{
	  if (!MEM_P (op0))
	    {
	      orig_op0 = op0;
	      op0 = gen_reg_rtx (V32QImode);
	    }
	  else
	    op0 = gen_lowpart (V32QImode, op0);
	  op1 = gen_lowpart (V32QImode, op1);
	  mode = V32QImode;
	}
      break;
    case MODE_VECTOR_FLOAT:
      break;
    default:
      gcc_unreachable ();
    }

  /* Pick the extract pattern and the 128-bit half mode.  */
  switch (mode)
    {
    default:
      gcc_unreachable ();
    case E_V32QImode:
      extract = gen_avx_vextractf128v32qi;
      mode = V16QImode;
      break;
    case E_V16BFmode:
      extract = gen_avx_vextractf128v16bf;
      mode = V8BFmode;
      break;
    case E_V16HFmode:
      extract = gen_avx_vextractf128v16hf;
      mode = V8HFmode;
      break;
    case E_V8SFmode:
      extract = gen_avx_vextractf128v8sf;
      mode = V4SFmode;
      break;
    case E_V4DFmode:
      extract = gen_avx_vextractf128v4df;
      mode = V2DFmode;
      break;
    }

  if (MEM_P (op1))
    {
      /* Split load: two 128-bit loads combined with vec_concat.  */
      rtx r = gen_reg_rtx (mode);
      m = adjust_address (op1, mode, 0);
      emit_move_insn (r, m);
      m = adjust_address (op1, mode, 16);
      r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
      emit_move_insn (op0, r);
    }
  else if (MEM_P (op0))
    {
      /* Split store: extract the low then the high 128-bit half.  */
      m = adjust_address (op0, mode, 0);
      emit_insn (extract (m, op1, const0_rtx));
      m = adjust_address (op0, mode, 16);
      emit_insn (extract (m, copy_rtx (op1), const1_rtx));
    }
  else
    gcc_unreachable ();

  /* Copy the V32QImode scratch back to the original destination.  */
  if (orig_op0)
    emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
}
880 : /* Implement the movmisalign patterns for SSE. Non-SSE modes go
881 : straight to ix86_expand_vector_move. */
882 : /* Code generation for scalar reg-reg moves of single and double precision data:
883 : if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
884 : movaps reg, reg
885 : else
886 : movss reg, reg
887 : if (x86_sse_partial_reg_dependency == true)
888 : movapd reg, reg
889 : else
890 : movsd reg, reg
891 :
892 : Code generation for scalar loads of double precision data:
893 : if (x86_sse_split_regs == true)
894 : movlpd mem, reg (gas syntax)
895 : else
896 : movsd mem, reg
897 :
898 : Code generation for unaligned packed loads of single precision data
899 : (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
900 : if (x86_sse_unaligned_move_optimal)
901 : movups mem, reg
902 :
903 : if (x86_sse_partial_reg_dependency == true)
904 : {
905 : xorps reg, reg
906 : movlps mem, reg
907 : movhps mem+8, reg
908 : }
909 : else
910 : {
911 : movlps mem, reg
912 : movhps mem+8, reg
913 : }
914 :
915 : Code generation for unaligned packed loads of double precision data
916 : (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
917 : if (x86_sse_unaligned_move_optimal)
918 : movupd mem, reg
919 :
920 : if (x86_sse_split_regs == true)
921 : {
922 : movlpd mem, reg
923 : movhpd mem+8, reg
924 : }
925 : else
926 : {
927 : movsd mem, reg
928 : movhpd mem+8, reg
929 : }
930 : */
931 :
void
ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
{
  rtx op0, op1, m;

  op0 = operands[0];
  op1 = operands[1];

  /* Use unaligned load/store for AVX512 or when optimizing for size.  */
  if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_AVX)
    {
      /* 256-bit moves may need to be split per tuning.  */
      if (GET_MODE_SIZE (mode) == 32)
	ix86_avx256_split_vector_move_misalign (op0, op1);
      else
	/* Always use 128-bit mov<mode>_internal pattern for AVX.  */
	emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
      || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  /* ??? If we have typed data, then it would appear that using
     movdqu is the only way to get unaligned data loaded with
     integer type.  */
  if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      emit_insn (gen_rtx_SET (op0, op1));
      return;
    }

  if (MEM_P (op1))
    {
      /* Unaligned load: build the register from two half-width
	 loads (movlpd/movhpd or movlps/movhps).  */
      if (TARGET_SSE2 && mode == V2DFmode)
	{
	  rtx zero;

	  /* When SSE registers are split into halves, we can avoid
	     writing to the top half twice.  */
	  if (TARGET_SSE_SPLIT_REGS)
	    {
	      emit_clobber (op0);
	      zero = op0;
	    }
	  else
	    {
	      /* ??? Not sure about the best option for the Intel chips.
		 The following would seem to satisfy; the register is
		 entirely cleared, breaking the dependency chain.  We
		 then store to the upper half, with a dependency depth
		 of one.  A rumor has it that Intel recommends two movsd
		 followed by an unpacklpd, but this is unconfirmed.  And
		 given that the dependency depth of the unpacklpd would
		 still be one, I'm not sure why this would be better.  */
	      zero = CONST0_RTX (V2DFmode);
	    }

	  m = adjust_address (op1, DFmode, 0);
	  emit_insn (gen_sse2_loadlpd (op0, zero, m));
	  m = adjust_address (op1, DFmode, 8);
	  emit_insn (gen_sse2_loadhpd (op0, op0, m));
	}
      else
	{
	  rtx t;

	  /* Non-V4SF modes are loaded through a V4SFmode scratch.  */
	  if (mode != V4SFmode)
	    t = gen_reg_rtx (V4SFmode);
	  else
	    t = op0;

	  if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
	    emit_move_insn (t, CONST0_RTX (V4SFmode));
	  else
	    emit_clobber (t);

	  m = adjust_address (op1, V2SFmode, 0);
	  emit_insn (gen_sse_loadlps (t, t, m));
	  m = adjust_address (op1, V2SFmode, 8);
	  emit_insn (gen_sse_loadhps (t, t, m));
	  if (mode != V4SFmode)
	    emit_move_insn (op0, gen_lowpart (mode, t));
	}
    }
  else if (MEM_P (op0))
    {
      /* Unaligned store: two half-width stores.  */
      if (TARGET_SSE2 && mode == V2DFmode)
	{
	  m = adjust_address (op0, DFmode, 0);
	  emit_insn (gen_sse2_storelpd (m, op1));
	  m = adjust_address (op0, DFmode, 8);
	  emit_insn (gen_sse2_storehpd (m, op1));
	}
      else
	{
	  if (mode != V4SFmode)
	    op1 = gen_lowpart (V4SFmode, op1);

	  m = adjust_address (op0, V2SFmode, 0);
	  emit_insn (gen_sse_storelps (m, op1));
	  m = adjust_address (op0, V2SFmode, 8);
	  emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
	}
    }
  else
    gcc_unreachable ();
}
1050 : /* Move bits 64:95 to bits 32:63. */
1051 :
1052 : void
1053 868 : ix86_move_vector_high_sse_to_mmx (rtx op)
1054 : {
1055 868 : rtx mask = gen_rtx_PARALLEL (VOIDmode,
1056 : gen_rtvec (4, GEN_INT (0), GEN_INT (2),
1057 : GEN_INT (0), GEN_INT (0)));
1058 868 : rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
1059 868 : op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1060 868 : rtx insn = gen_rtx_SET (dest, op);
1061 868 : emit_insn (insn);
1062 868 : }
1063 :
1064 : /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
1065 :
void
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
{
  /* OPERANDS[0] is the narrow destination, OPERANDS[1]/OPERANDS[2] the
     two wide sources.  CODE is US_TRUNCATE for unsigned saturation;
     any other code is emitted as a generic truncation pair below.  */
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  rtx src;

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding (16-byte) SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
					    nunits).require ();
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
						 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
					    nunits).require ();

  /* Generate SSE pack with signed/unsigned saturation.  View all three
     operands through their 128-bit SSE modes.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  /* paskusdw/packuswb does unsigned saturation of a signed source
     which is different from generic us_truncate RTX.  */
  if (code == US_TRUNCATE)
    src = gen_rtx_UNSPEC (sse_dmode,
			  gen_rtvec (2, op1, op2),
			  UNSPEC_US_TRUNCATE);
  else
    {
      op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
      op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
      src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
    }

  emit_move_insn (dest, src);

  /* The valid 32-bit halves of the packed result sit in bits 0:31 and
     64:95 of the SSE register; compact them into the low 64 bits.  */
  ix86_move_vector_high_sse_to_mmx (op0);
}
1113 :
1114 : /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. This is also used
1115 : for a full unpack of OPERANDS[1] and OPERANDS[2] into a wider
1116 : OPERANDS[0]. */
1117 :
void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op1);
  rtx mask;
  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

  /* Pick the 128-bit SSE mode matching the MMX source mode, and build
     the interleave selector that pairs element i of op1 with element i
     of op2 (the classic punpckl pattern).  */
  switch (mode)
    {
    case E_V8QImode:
    case E_V4QImode:
    case E_V2QImode:
      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (16,
					  GEN_INT (0), GEN_INT (16),
					  GEN_INT (1), GEN_INT (17),
					  GEN_INT (2), GEN_INT (18),
					  GEN_INT (3), GEN_INT (19),
					  GEN_INT (4), GEN_INT (20),
					  GEN_INT (5), GEN_INT (21),
					  GEN_INT (6), GEN_INT (22),
					  GEN_INT (7), GEN_INT (23)));
      break;

    case E_V4HImode:
    case E_V2HImode:
      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (8,
					  GEN_INT (0), GEN_INT (8),
					  GEN_INT (1), GEN_INT (9),
					  GEN_INT (2), GEN_INT (10),
					  GEN_INT (3), GEN_INT (11)));
      break;

    case E_V2SImode:
      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4,
					  GEN_INT (0), GEN_INT (4),
					  GEN_INT (1), GEN_INT (5)));
      break;

    case E_V2SFmode:
      sse_mode = V4SFmode;
      double_sse_mode = V8SFmode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4,
					  GEN_INT (0), GEN_INT (4),
					  GEN_INT (1), GEN_INT (5)));
      break;

    default:
      gcc_unreachable ();
    }

  /* Generate SSE punpcklXX: select the interleaved low elements out of
     the concatenation of both sources.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);
  emit_insn (insn);

  /* Move high bits to low bits.  For the "high" variants the wanted
     result is in the upper part of the interleaved vector; shuffle it
     down into the low part consumed by the MMX-mode subreg.  */
  if (high_p)
    {
      if (sse_mode == V4SFmode)
	{
	  mask = gen_rtx_PARALLEL (VOIDmode,
				   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
					      GEN_INT (4), GEN_INT (5)));
	  op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
	  op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
	}
      else
	{
	  int sz = GET_MODE_SIZE (mode);

	  /* SZ is the size of the original MMX-side vector; it decides
	     whether the interesting data is 32 or 64 bits up.  */
	  if (sz == 4)
	    mask = gen_rtx_PARALLEL (VOIDmode,
				     gen_rtvec (4, GEN_INT (1), GEN_INT (0),
						GEN_INT (0), GEN_INT (1)));
	  else if (sz == 8)
	    mask = gen_rtx_PARALLEL (VOIDmode,
				     gen_rtvec (4, GEN_INT (2), GEN_INT (3),
						GEN_INT (0), GEN_INT (1)));
	  else
	    gcc_unreachable ();

	  dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
	  op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
	}

      insn = gen_rtx_SET (dest, op1);
      emit_insn (insn);
    }
}
1226 :
1227 : /* Helper function of ix86_fixup_binary_operands to canonicalize
1228 : operand order. Returns true if the operands should be swapped. */
1229 :
1230 : static bool
1231 173561513 : ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1232 : rtx operands[])
1233 : {
1234 173561513 : rtx dst = operands[0];
1235 173561513 : rtx src1 = operands[1];
1236 173561513 : rtx src2 = operands[2];
1237 :
1238 : /* If the operation is not commutative, we can't do anything. */
1239 173561513 : if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1240 26289958 : && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1241 : return false;
1242 :
1243 : /* Highest priority is that src1 should match dst. */
1244 147282784 : if (rtx_equal_p (dst, src1))
1245 : return false;
1246 107068271 : if (rtx_equal_p (dst, src2))
1247 : return true;
1248 :
1249 : /* Next highest priority is that immediate constants come second. */
1250 106983972 : if (immediate_operand (src2, mode))
1251 : return false;
1252 25794075 : if (immediate_operand (src1, mode))
1253 : return true;
1254 :
1255 : /* Lowest priority is that memory references should come second. */
1256 25794075 : if (MEM_P (src2))
1257 : return false;
1258 24369593 : if (MEM_P (src1))
1259 : return true;
1260 :
1261 : return false;
1262 : }
1263 :
1264 : /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
1265 : destination to use for the operation. If different from the true
1266 : destination in operands[0], a copy operation will be required except
1267 : under TARGET_APX_NDD. */
1268 :
rtx
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
			    rtx operands[], bool use_ndd)
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    {
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      std::swap (src1, src2);
    }

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    {
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
	{
	  src2 = force_reg (mode, src2);
	  src1 = src2;
	}
      else if (rtx_equal_p (dst, src1))
	/* Keep the dst-matching memory operand in place; load the
	   other one.  */
	src2 = force_reg (mode, src2);
      else
	src1 = force_reg (mode, src1);
    }

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  The caller must copy the result
     back (this is the scratch returned below).  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory (NDD forms lift this
     restriction, hence the !use_ndd guard).  */
  if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine: load the memory source of an integer
     PLUS into a register so the addition can later participate in
     address arithmetic.  */
  if (code == PLUS
      && GET_MODE_CLASS (mode) == MODE_INT
      && MEM_P (src2))
    src2 = force_reg (mode, src2);

  /* Write back the (possibly swapped / reloaded) sources; the caller
     reads them from OPERANDS again.  */
  operands[1] = src1;
  operands[2] = src2;
  return dst;
}
1324 :
1325 : /* Similarly, but assume that the destination has already been
1326 : set up properly. */
1327 :
1328 : void
1329 290591 : ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1330 : machine_mode mode, rtx operands[],
1331 : bool use_ndd)
1332 : {
1333 290591 : rtx dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
1334 290591 : gcc_assert (dst == operands[0]);
1335 290591 : }
1336 :
1337 : /* Attempt to expand a binary operator. Make the expansion closer to the
1338 : actual machine, then just general_operand, which will allow 3 separate
1339 : memory references (one output, two input) in a single insn. */
1340 :
1341 : void
1342 13193570 : ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
1343 : rtx operands[], bool use_ndd)
1344 : {
1345 13193570 : rtx src1, src2, dst, op, clob;
1346 :
1347 13193570 : dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
1348 13193570 : src1 = operands[1];
1349 13193570 : src2 = operands[2];
1350 :
1351 : /* Emit the instruction. */
1352 :
1353 13193570 : op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
1354 :
1355 13193570 : if (reload_completed
1356 80854 : && code == PLUS
1357 904 : && !rtx_equal_p (dst, src1)
1358 13193570 : && !use_ndd)
1359 : {
1360 : /* This is going to be an LEA; avoid splitting it later. */
1361 0 : emit_insn (op);
1362 : }
1363 : else
1364 : {
1365 13193570 : clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1366 13193570 : emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1367 : }
1368 :
1369 : /* Fix up the destination if needed. */
1370 13193570 : if (dst != operands[0])
1371 482816 : emit_move_insn (operands[0], dst);
1372 13193570 : }
1373 :
1374 : /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1375 : the given OPERANDS. */
1376 :
void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
				     rtx operands[])
{
  /* OP1 is whichever source is a SUBREG (candidate for the float-side
     rewrite below); OP2 is the other source.  */
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
    /* Optimize (__m128i) d | (__m128i) e and similar code
       when d and e are float vectors into float vector logical
       insn.  In C/C++ without using intrinsics there is no other way
       to express vector logical operation on float vectors than
       to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || CONST_VECTOR_P (op2))
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (CONST_VECTOR_P (op2)
	  || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
	      && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      switch (GET_MODE (SUBREG_REG (op1)))
	{
	case E_V4SFmode:
	case E_V8SFmode:
	case E_V16SFmode:
	case E_V2DFmode:
	case E_V4DFmode:
	case E_V8DFmode:
	  /* Perform the operation in the float vector mode and view
	     the result back in MODE.  */
	  dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
	  if (CONST_VECTOR_P (op2))
	    {
	      op2 = gen_lowpart (GET_MODE (dst), op2);
	      op2 = force_reg (GET_MODE (dst), op2);
	    }
	  else
	    {
	      /* Both sources are SUBREGs of float vectors; restore the
		 original operand order (OP1/OP2 may have been swapped
		 above) and strip the subregs.  */
	      op1 = operands[1];
	      op2 = SUBREG_REG (operands[2]);
	      if (!vector_operand (op2, GET_MODE (dst)))
		op2 = force_reg (GET_MODE (dst), op2);
	    }
	  op1 = SUBREG_REG (op1);
	  if (!vector_operand (op1, GET_MODE (dst)))
	    op1 = force_reg (GET_MODE (dst), op1);
	  emit_insn (gen_rtx_SET (dst,
				  gen_rtx_fmt_ee (code, GET_MODE (dst),
						  op1, op2)));
	  emit_move_insn (operands[0], gen_lowpart (mode, dst));
	  return;
	default:
	  break;
	}
    }
  /* Generic path: legitimize both sources and emit the logical
     operation directly in MODE.  */
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_fmt_ee (code, mode, operands[1],
					  operands[2])));
}
1451 :
1452 : /* Return TRUE or FALSE depending on whether the binary operator meets the
1453 : appropriate constraints. */
1454 :
1455 : bool
1456 161062737 : ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
1457 : rtx operands[3], bool use_ndd)
1458 : {
1459 161062737 : rtx dst = operands[0];
1460 161062737 : rtx src1 = operands[1];
1461 161062737 : rtx src2 = operands[2];
1462 :
1463 : /* Both source operands cannot be in memory. */
1464 153716375 : if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
1465 161063122 : && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
1466 985514 : return false;
1467 :
1468 : /* Canonicalize operand order for commutative operators. */
1469 160077223 : if (ix86_swap_binary_operands_p (code, mode, operands))
1470 533055 : std::swap (src1, src2);
1471 :
1472 : /* If the destination is memory, we must have a matching source operand. */
1473 160077223 : if (MEM_P (dst) && !rtx_equal_p (dst, src1))
1474 : return false;
1475 :
1476 : /* Source 1 cannot be a constant. */
1477 155052503 : if (CONSTANT_P (src1))
1478 : return false;
1479 :
1480 : /* Source 1 cannot be a non-matching memory. */
1481 155049454 : if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
1482 : /* Support "andhi/andsi/anddi" as a zero-extending move. */
1483 4421760 : return (code == AND
1484 512177 : && (mode == HImode
1485 512177 : || mode == SImode
1486 308983 : || (TARGET_64BIT && mode == DImode))
1487 4720228 : && satisfies_constraint_L (src2));
1488 :
1489 : return true;
1490 : }
1491 :
1492 : /* Attempt to expand a unary operator. Make the expansion closer to the
1493 : actual machine, then just general_operand, which will allow 2 separate
1494 : memory references (one output, one input) in a single insn. */
1495 :
1496 : void
1497 118904 : ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1498 : rtx operands[], bool use_ndd)
1499 : {
1500 118904 : bool matching_memory = false;
1501 118904 : rtx src, dst, op, clob;
1502 :
1503 118904 : dst = operands[0];
1504 118904 : src = operands[1];
1505 :
1506 : /* If the destination is memory, and we do not have matching source
1507 : operands, do things in registers. */
1508 118904 : if (MEM_P (dst))
1509 : {
1510 3225 : if (rtx_equal_p (dst, src))
1511 : matching_memory = true;
1512 : else
1513 2910 : dst = gen_reg_rtx (mode);
1514 : }
1515 :
1516 : /* When source operand is memory, destination must match. */
1517 118904 : if (!use_ndd && MEM_P (src) && !matching_memory)
1518 4684 : src = force_reg (mode, src);
1519 :
1520 : /* Emit the instruction. */
1521 :
1522 118904 : op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1523 :
1524 118904 : if (code == NOT)
1525 68249 : emit_insn (op);
1526 : else
1527 : {
1528 50655 : clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1529 50655 : emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1530 : }
1531 :
1532 : /* Fix up the destination if needed. */
1533 118904 : if (dst != operands[0])
1534 2910 : emit_move_insn (operands[0], dst);
1535 118904 : }
1536 :
1537 : /* Return TRUE or FALSE depending on whether the unary operator meets the
1538 : appropriate constraints. */
1539 :
1540 : bool
1541 1723352 : ix86_unary_operator_ok (enum rtx_code,
1542 : machine_mode,
1543 : rtx operands[2],
1544 : bool use_ndd)
1545 : {
1546 : /* If one of operands is memory, source and destination must match. */
1547 1723352 : if ((MEM_P (operands[0])
1548 1680131 : || (!use_ndd && MEM_P (operands[1])))
1549 1752365 : && ! rtx_equal_p (operands[0], operands[1]))
1550 : return false;
1551 : return true;
1552 : }
1553 :
1554 : /* Predict just emitted jump instruction to be taken with probability PROB. */
1555 :
1556 : static void
1557 70669 : predict_jump (int prob)
1558 : {
1559 70669 : rtx_insn *insn = get_last_insn ();
1560 70669 : gcc_assert (JUMP_P (insn));
1561 70669 : add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1562 70669 : }
1563 :
1564 : /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1565 : divisor are within the range [0-255]. */
1566 :
void
ix86_split_idivmod (machine_mode mode, rtx operands[],
		    bool unsigned_p)
{
  /* OPERANDS[0] receives the quotient, OPERANDS[1] the remainder;
     OPERANDS[2] is the dividend and OPERANDS[3] the divisor.  */
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);

  /* Pick the full-width divmod generator; the _zext variants handle
     results that are zero-extended to DImode.  */
  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
	{
	  if (GET_MODE (operands[1]) == SImode)
	    gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
	  else
	    gen_divmod4_1
	      = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
	}
      else
	gen_divmod4_1
	  = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
      break;

    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      break;

    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divimod if dividend and divisor are within
     the range [0-255]: OR them together and test against ~0xff, so
     the branch is taken iff no bit >= 8 is set in either value.  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
				 scratch, 1, OPTAB_DIRECT);
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divimod.  */
  emit_insn (gen_divmod4_1 (operands[0], operands[1],
			    operands[2], operands[3]));

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  /* Build DIV/MOD rtxes only for the REG_EQUAL notes attached below,
     so later passes know what the extracted values compute.  */
  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }
  if (mode == SImode)
    {
      if (GET_MODE (operands[0]) != SImode)
	div = gen_rtx_ZERO_EXTEND (DImode, div);
      if (GET_MODE (operands[1]) != SImode)
	mod = gen_rtx_ZERO_EXTEND (DImode, mod);
    }

  /* Extract remainder from AH (bits 8..15 of the HImode result).  */
  scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
			       GEN_INT (8), GEN_INT (8));
  insn = emit_move_insn (operands[1], tmp1);
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
		    (operands[0], tmp1,
		     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}
1675 :
1676 : /* Emit x86 binary operand CODE in mode MODE, where the first operand
1677 : matches destination. RTX includes clobber of FLAGS_REG. */
1678 :
1679 : void
1680 7832 : ix86_emit_binop (enum rtx_code code, machine_mode mode,
1681 : rtx dst, rtx src)
1682 : {
1683 7832 : rtx op, clob;
1684 :
1685 7832 : op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1686 7832 : clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1687 :
1688 7832 : emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1689 7832 : }
1690 :
1691 : /* Return true if regno1 def is nearest to the insn. */
1692 :
static bool
find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
{
  /* Walk backwards from INSN within its basic block, returning true
     if a definition of REGNO1 is seen before one of REGNO2.  Note the
     scan begins at INSN itself.  */
  rtx_insn *prev = insn;
  rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));

  if (insn == start)
    return false;
  while (prev && prev != start)
    {
      /* Skip notes, debug insns and other non-instructions.  */
      if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
	{
	  prev = PREV_INSN (prev);
	  continue;
	}
      if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
	return true;
      else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
	return false;
      prev = PREV_INSN (prev);
    }

  /* None of the regs is defined in the bb.  */
  return false;
}
1718 :
1719 : /* INSN_UID of the last insn emitted by zero store peephole2s. */
1720 : int ix86_last_zero_store_uid;
1721 :
1722 : /* Split lea instructions into a sequence of instructions
1723 : which are executed on ALU to avoid AGU stalls.
1724 : It is assumed that it is allowed to clobber flags register
1725 : at lea position. */
1726 :
void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
  /* REGNO0 is the destination; REGNO1/REGNO2 are the base and index
     registers of the decomposed address (INVALID_REGNUM if absent).  */
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;
  rtx target, tmp;
  int ok, adds;

  ok = ix86_decompose_address (operands[1], &parts);
  gcc_assert (ok);

  target = gen_lowpart (mode, operands[0]);

  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

  if (parts.base)
    {
      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
    }

  if (parts.index)
    {
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    }

  if (parts.disp)
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
    {
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
	{
	  /* If we have a case r1 = r1 + C * r2 then we
	     should use multiplication which is very
	     expensive.  Assume cost model is wrong if we
	     have such case here.  */
	  gcc_assert (regno2 != regno0);

	  /* Replace the scaled index by SCALE repeated additions.  */
	  for (adds = parts.scale; adds > 0; adds--)
	    ix86_emit_binop (PLUS, mode, target, parts.index);
	}
      else
	{
	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));

	  /* Use shift for scaling, but emit it as MULT instead
	     to avoid it being immediately peephole2 optimized back
	     into lea.  */
	  ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));

	  if (parts.base)
	    ix86_emit_binop (PLUS, mode, target, parts.base);

	  if (parts.disp && parts.disp != const0_rtx)
	    ix86_emit_binop (PLUS, mode, target, parts.disp);
	}
    }
  else if (!parts.base && !parts.index)
    {
      /* Pure displacement: a plain move suffices.  */
      gcc_assert(parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));
    }
  else
    {
      /* Unscaled base and/or index.  Move one component into the
	 destination (unless it is already there) and add the rest.  */
      if (!parts.base)
	{
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));
	}
      else if (!parts.index)
	{
	  if (regno0 != regno1)
	    emit_insn (gen_rtx_SET (target, parts.base));
	}
      else
	{
	  /* Both base and index present; TMP is whichever one still
	     needs to be added after the destination is seeded.  */
	  if (regno0 == regno1)
	    tmp = parts.index;
	  else if (regno0 == regno2)
	    tmp = parts.base;
	  else
	    {
	      rtx tmp1;

	      /* Find better operand for SET instruction, depending
		 on which definition is farther from the insn.  */
	      if (find_nearest_reg_def (insn, regno1, regno2))
		tmp = parts.index, tmp1 = parts.base;
	      else
		tmp = parts.base, tmp1 = parts.index;

	      emit_insn (gen_rtx_SET (target, tmp));

	      if (parts.disp && parts.disp != const0_rtx)
		ix86_emit_binop (PLUS, mode, target, parts.disp);

	      ix86_emit_binop (PLUS, mode, target, tmp1);
	      return;
	    }

	  ix86_emit_binop (PLUS, mode, target, tmp);
	}

      if (parts.disp && parts.disp != const0_rtx)
	ix86_emit_binop (PLUS, mode, target, parts.disp);
    }
}
1841 :
1842 : /* Post-reload splitter for converting an SF or DFmode value in an
1843 : SSE register into an unsigned SImode. */
1844 :
void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  /* Strategy: values >= 2**31 cannot be converted by the signed
     cvtt* instructions directly.  Conditionally subtract 2**31 from
     the input, convert, then XOR the sign bit back in where the
     subtraction happened.  OPERANDS[1] holds the comparison mask,
     OPERANDS[2] the conditional bias, OPERANDS[4] the 2**31 constant.  */
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
	emit_insn (gen_sse_movss_v4sf (value, value, input));
      else
	emit_insn (gen_sse2_movsd_v2df (value, value, input));
    }

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  /* large = (2**31 <= value) ? all-ones : 0 -- the "needs bias" mask.  */
  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  /* zero_or_two31 = 2**31 where the mask is set, else 0.  */
  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  /* Bias the value into signed range before converting.  */
  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  /* Turn the mask into 0x80000000 per lane (shift the all-ones mask
     left by 31).  */
  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  /* Re-apply the bias by setting the sign bit of the biased lanes.  */
  emit_insn (gen_xorv4si3 (value, value, large));
}
1901 :
1902 : /* Convert an unsigned DImode value into a DFmode, using only SSE.
1903 : Expects the 64-bit DImode to be supplied in a pair of integral
1904 : registers. Requires SSE2; will use SSE3 if available. For x86_32,
1905 : -mfpmath=sse, !optimize_size only. */
1906 :
void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;
  rtx x;

  /* Get the 64-bit integer into the low half of an XMM register,
     choosing the cheapest transfer the target supports.  */
  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
    {
      /* The clobber breaks the false dependency on the old value of
	 the half-written register.  */
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    }
  else
    {
      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    }

  x = gen_rtx_CONST_VECTOR (V4SImode,
			    gen_rtvec (4, GEN_INT (0x43300000UL),
				       GEN_INT (0x45300000UL),
				       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
  if (TARGET_SSE3)
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
  else
    {
      /* Without SSE3 haddpd, add the high element to the low one by
	 hand via an interleave and a vector add.  */
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    }

  /* The scalar result is element 0 of the V2DF register.  */
  ix86_expand_vector_extract (false, target, fp_xmm, 0);
}
1969 :
1970 : /* Not used, but eases macroization of patterns. */
void
ix86_expand_convert_uns_sixf_sse (rtx, rtx)
{
  /* This stub exists only so the expander name can be macroized
     alongside the other uns_*_sse converters; it must never be
     called at expand time.  */
  gcc_unreachable ();
}
1976 :
1977 : static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1978 :
1979 : /* Convert an unsigned SImode value into a DFmode. Only currently used
1980 : for SSE, but applicable anywhere. */
1981 :
void
ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO31r;
  rtx x, fp;

  /* x = input - 2**31, i.e. reinterpret INPUT as a signed value
     biased by -2**31.  */
  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
			   NULL, 1, OPTAB_DIRECT);

  /* Signed SImode -> DFmode conversion is directly available.  */
  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  /* Undo the bias by adding 2**31 back in DFmode.  */
  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
    x = ix86_expand_sse_fabs (x, NULL);

  if (x != target)
    emit_move_insn (target, x);
}
2006 :
2007 : /* Convert a signed DImode value into a DFmode. Only used for SSE in
2008 : 32-bit mode; otherwise we have a direct convert instruction. */
2009 :
void
ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO32r;
  rtx fp_lo, fp_hi, x;

  fp_lo = gen_reg_rtx (DFmode);
  fp_hi = gen_reg_rtx (DFmode);

  /* fp_hi = (double) (signed) hi (input).  */
  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));

  /* fp_hi *= 2**32, scaling the high word into place.  */
  real_ldexp (&TWO32r, &dconst1, 32);
  x = const_double_from_real_value (TWO32r, DFmode);
  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);

  /* fp_lo = (double) (unsigned) lo (input), via the SImode helper.  */
  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));

  /* target = fp_hi + fp_lo.  */
  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
			   0, OPTAB_DIRECT);
  if (x != target)
    emit_move_insn (target, x);
}
2032 :
2033 : /* Convert an unsigned SImode value into a SFmode, using only SSE.
2034 : For x86_32, -mfpmath=sse, !optimize_size only. */
void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  /* Split INPUT into its 16-bit halves; each half fits in SFmode's
     mantissa, so both conversions below are exact.  */
  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
				NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
				NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
  if (TARGET_FMA)
    {
      /* target = fp_hi * 2**16 + fp_lo as a fused multiply-add.  */
      x = validize_mem (force_const_mem (SFmode, x));
      fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
      emit_move_insn (target, fp_hi);
    }
  else
    {
      /* target = fp_hi * 2**16 + fp_lo with separate mul and add.  */
      fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
				   0, OPTAB_DIRECT);
      fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
				   0, OPTAB_DIRECT);
      if (!rtx_equal_p (target, fp_hi))
	emit_move_insn (target, fp_hi);
    }
}
2067 :
2068 : /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
2069 : a vector of unsigned ints VAL to vector of floats TARGET. */
2070 :
void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
  rtx tmp[8];
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  /* Pick the signed int->float converter matching the vector width.  */
  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
  else
    cvt = gen_floatv8siv8sf2;
  /* Split each element into 16-bit halves (lo = val & 0xffff,
     hi = val >> 16); both halves convert to SFmode exactly.  */
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
				OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
				NULL_RTX, 1, OPTAB_DIRECT);
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  /* tmp[5] = broadcast of 2**16, the weight of the high half.  */
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
  if (TARGET_FMA)
    {
      /* target = hi * 2**16 + lo, fused.  */
      tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
      emit_move_insn (target, tmp[6]);
    }
  else
    {
      /* target = hi * 2**16 + lo, separate mul and add.  */
      tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
				    NULL_RTX, 1, OPTAB_DIRECT);
      tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
				    target, 1, OPTAB_DIRECT);
      if (tmp[7] != target)
	emit_move_insn (target, tmp[7]);
    }
}
2112 :
2113 : /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
2114 : pattern can be used on it instead of fixuns_trunc*.
2115 : This is done by doing just signed conversion if < 0x1p31, and otherwise by
2116 : subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
2117 :
rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
  REAL_VALUE_TYPE TWO31r;
  rtx two31r, tmp[4];
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);
  int i;

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  /* two31r = broadcast of 0x1.0p31 in the element mode.  */
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);
  /* Pick the mask-compare expander for this vector mode.  */
  switch (mode)
    {
    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();
    }
  /* tmp[0] = all-ones mask per element where 0x1.0p31 <= val.  */
  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  /* tmp[1] = 0x1.0p31 in the elements that need adjusting, else 0.  */
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
				0, OPTAB_DIRECT);
  /* *XORP = 0x80000000 per adjusted element, produced either by
     shifting the mask left by 31 or by masking a constant vector
     (when no full-width AVX2 shift is available for V8SImode).  */
  if (intmode == V4SImode || TARGET_AVX2)
    *xorp = expand_simple_binop (intmode, ASHIFT,
				 gen_lowpart (intmode, tmp[0]),
				 GEN_INT (31), NULL_RTX, 0,
				 OPTAB_DIRECT);
  else
    {
      rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
				   gen_lowpart (intmode, tmp[0]),
				   two31, NULL_RTX, 0,
				   OPTAB_DIRECT);
    }
  /* Return val with 0x1.0p31 subtracted from the large elements.  */
  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
			      0, OPTAB_DIRECT);
}
2164 :
2165 : /* Generate code for floating point ABS or NEG. */
2166 :
void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
				rtx operands[])
{
  rtx set, dst, src;
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;
  rtvec par;

  /* Decide whether the operation goes through SSE masking and which
     vector mode carries the sign-bit mask.  */
  switch (mode)
    {
    case E_HFmode:
      use_sse = true;
      vmode = V8HFmode;
      break;
    case E_BFmode:
      use_sse = true;
      vmode = V8BFmode;
      break;
    case E_SFmode:
      use_sse = TARGET_SSE_MATH && TARGET_SSE;
      vmode = V4SFmode;
      break;
    case E_DFmode:
      use_sse = TARGET_SSE_MATH && TARGET_SSE2;
      vmode = V2DFmode;
      break;
    default:
      /* Vector modes and TFmode always use SSE; XFmode etc. do not.  */
      use_sse = vector_mode || mode == TFmode;
      break;
    }

  dst = operands[0];
  src = operands[1];

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

  if (use_sse)
    {
      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
	 Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode || mode == TFmode)
	par = gen_rtvec (2, set, use);
      else
	{
	  /* Scalar modes may be split to integer ops later, so also
	     clobber the flags.  */
	  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
	  par = gen_rtvec (3, set, use, clob);
	}
    }
  else
    {
      rtx clob;

      /* Changing of sign for FP values is doable using integer unit too.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);
    }

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
2233 :
2234 : /* Deconstruct a floating point ABS or NEG operation
2235 : with integer registers into integer operations. */
2236 :
void
ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
			       rtx operands[])
{
  enum rtx_code absneg_op;
  rtx dst, set;

  /* This splitter works in place: dst and src must match.  */
  gcc_assert (operands_match_p (operands[0], operands[1]));

  switch (mode)
    {
    case E_SFmode:
      /* Operate on the value viewed as a 32-bit integer:
	 ABS = AND ~sign-bit, NEG = XOR sign-bit.  */
      dst = gen_lowpart (SImode, operands[0]);

      if (code == ABS)
	{
	  set = gen_int_mode (0x7fffffff, SImode);
	  absneg_op = AND;
	}
      else
	{
	  set = gen_int_mode (0x80000000, SImode);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    case E_DFmode:
      if (TARGET_64BIT)
	{
	  /* Address bit 63 directly via a ZERO_EXTRACT:
	     ABS clears it, NEG inverts it.  */
	  dst = gen_lowpart (DImode, operands[0]);
	  dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));

	  if (code == ABS)
	    set = const0_rtx;
	  else
	    set = gen_rtx_NOT (DImode, dst);
	}
      else
	{
	  /* On 32-bit targets only the high SImode word carries
	     the sign bit.  */
	  dst = gen_highpart (SImode, operands[0]);

	  if (code == ABS)
	    {
	      set = gen_int_mode (0x7fffffff, SImode);
	      absneg_op = AND;
	    }
	  else
	    {
	      set = gen_int_mode (0x80000000, SImode);
	      absneg_op = XOR;
	    }
	  set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
	}
      break;

    case E_XFmode:
      /* The XFmode sign/exponent word lives in the second (64-bit) or
	 third (32-bit) SImode subword of the register pair.  */
      dst = gen_rtx_REG (SImode,
			 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
      if (code == ABS)
	{
	  set = GEN_INT (0x7fff);
	  absneg_op = AND;
	}
      else
	{
	  set = GEN_INT (0x8000);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    default:
      gcc_unreachable ();
    }

  set = gen_rtx_SET (dst, set);

  /* The integer AND/XOR/NOT forms clobber the flags.  */
  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
  rtvec par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
2320 :
2321 : /* Expand a copysign operation. Special case operand 0 being a constant. */
2322 :
void
ix86_expand_copysign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, op2, op3;

  mode = GET_MODE (operands[0]);

  /* Pick the vector mode whose sign-bit mask we operate in.  */
  switch (mode)
    {
    case E_HFmode:
      vmode = V8HFmode;
      break;
    case E_BFmode:
      vmode = V8BFmode;
      break;
    case E_SFmode:
      vmode = V4SFmode;
      break;
    case E_DFmode:
      vmode = V2DFmode;
      break;
    case E_TFmode:
      vmode = mode;
      break;
    default:
      gcc_unreachable();
    }

  /* copysign (a, a) == a.  */
  if (rtx_equal_p (operands[1], operands[2]))
    {
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* Work on a vector-mode view of the destination; fall back to a
     fresh register plus a final move if no subreg is possible.  */
  dest = operands[0];
  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);
  else
    dest = NULL_RTX;
  op1 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
  mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);

  if (CONST_DOUBLE_P (operands[2]))
    {
      if (real_isneg (CONST_DOUBLE_REAL_VALUE (operands[2])))
	/* Simplify b = copysign (a, negative) to b = mask | a.  */
	op1 = gen_rtx_IOR (vmode, mask, op1);
      else
	{
	  /* Simplify b = copysign (a, positive) to b = invert_mask & a.  */
	  rtx invert_mask
	    = ix86_build_signbit_mask (vmode,
				       TARGET_AVX512F && mode != HFmode,
				       true);
	  op1 = gen_rtx_AND (vmode, invert_mask, op1);
	}
      emit_move_insn (vdest, op1);
      if (dest)
	emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
      return;
    }
  else
    op0 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);

  /* General case: result = (a & ~signmask) | (b & signmask).  */
  op2 = gen_reg_rtx (vmode);
  op3 = gen_reg_rtx (vmode);
  rtx invert_mask;
  /* NB: Generate vmovdqa, vpandn, vpand, vpor for AVX and generate pand,
     pand, por for SSE.  */
  if (TARGET_AVX)
    invert_mask = gen_rtx_NOT (vmode, mask);
  else
    invert_mask = ix86_build_signbit_mask (vmode,
					   TARGET_AVX512F && mode != HFmode,
					   true);
  emit_move_insn (op2, gen_rtx_AND (vmode, invert_mask, op1));
  emit_move_insn (op3, gen_rtx_AND (vmode, mask, op0));
  emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
  if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
}
2406 :
2407 : /* Expand an xorsign operation. */
2408 :
void
ix86_expand_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, x, temp;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  /* Pick the vector mode whose sign-bit mask we operate in.  */
  switch (mode)
    {
    case E_HFmode:
      vmode = V8HFmode;
      break;
    case E_BFmode:
      vmode = V8BFmode;
      break;
    case E_SFmode:
      vmode = V4SFmode;
      break;
    case E_DFmode:
      vmode = V2DFmode;
      break;
    default:
      gcc_unreachable ();
      break;
    }

  /* temp = sign bit of op1; result = op0 ^ temp.  */
  temp = gen_reg_rtx (vmode);
  mask = ix86_build_signbit_mask (vmode, 0, 0);

  op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
  x = gen_rtx_AND (vmode, op1, mask);
  emit_insn (gen_rtx_SET (temp, x));

  op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
  x = gen_rtx_XOR (vmode, temp, op0);

  /* Work on a vector-mode view of the destination; fall back to a
     fresh register plus a final move if no subreg is possible.  */
  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);
  else
    dest = NULL_RTX;
  emit_insn (gen_rtx_SET (vdest, x));

  if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
}
2460 :
2461 : static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2462 :
void
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
{
  machine_mode mode = GET_MODE (op0);
  rtx tmp;

  /* Handle special case - vector comparsion with boolean result, transform
     it using ptest instruction or vpcmpeq + kortest.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
      || (mode == TImode && !TARGET_64BIT)
      || mode == OImode
      || GET_MODE_SIZE (mode) == 64)
    {
      unsigned msize = GET_MODE_SIZE (mode);
      machine_mode p_mode
	= msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);

      /* Only whole-vector equality/inequality reaches here.  */
      gcc_assert (code == EQ || code == NE);

      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
      if (msize == 64)
	{
	  if (mode != V16SImode)
	    {
	      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
	      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
	    }

	  tmp = gen_reg_rtx (HImode);
	  emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
	  emit_insn (gen_kortesthi_ccc (tmp, tmp));
	}
      /* Using ptest for 128/256-bit vectors.  */
      else
	{
	  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
	    {
	      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
	      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
	      mode = p_mode;
	    }

	  /* Generate XOR since we can't check that one operand is zero
	     vector.  */
	  tmp = gen_reg_rtx (mode);
	  rtx ops[3] = { tmp, op0, op1 };
	  ix86_expand_vector_logical_operator (XOR, mode, ops);
	  tmp = gen_lowpart (p_mode, tmp);
	  emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
				  gen_rtx_UNSPEC (CCZmode,
						  gen_rtvec (2, tmp, tmp),
						  UNSPEC_PTEST)));
	}
      /* Branch on the flag produced above.  */
      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    }

  switch (mode)
    {
    case E_HFmode:
    case E_SFmode:
    case E_DFmode:
    case E_XFmode:
    case E_QImode:
    case E_HImode:
    case E_SImode:
      /* Single compare + conditional jump; also the common tail for
	 the goto-simple cases below.  */
      simple:
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;

    case E_BFmode:
      /* BFmode compares only reach here under AVX10.2 with
	 non-trapping math.  */
      gcc_assert (TARGET_AVX10_2 && !flag_trapping_math);
      goto simple;

    case E_DImode:
      if (TARGET_64BIT)
	goto simple;
      /* FALLTHRU */
    case E_TImode:
      /* DI and TI mode equality/inequality comparisons may be performed
	 on SSE registers.  Avoid splitting them, except when optimizing
	 for size.  */
      if ((code == EQ || code == NE)
	  && !optimize_insn_for_size_p ())
	goto simple;

      /* Expand DImode branch into multiple compare+branch.  */
      {
	rtx lo[2], hi[2];
	rtx_code_label *label2;
	enum rtx_code code1, code2, code3;
	machine_mode submode;

	/* Canonicalize: constants go to op1.  */
	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
	  {
	    std::swap (op0, op1);
	    code = swap_condition (code);
	  }

	split_double_mode (mode, &op0, 1, lo+0, hi+0);
	split_double_mode (mode, &op1, 1, lo+1, hi+1);

	submode = mode == DImode ? SImode : DImode;

	/* If we are doing less-than or greater-or-equal-than,
	   op1 is a constant and the low word is zero, then we can just
	   examine the high word.  Similarly for low word -1 and
	   less-or-equal-than or greater-than.  */

	if (CONST_INT_P (hi[1]))
	  switch (code)
	    {
	    case LT: case LTU: case GE: case GEU:
	      if (lo[1] == const0_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    case LE: case LEU: case GT: case GTU:
	      if (lo[1] == constm1_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    default:
	      break;
	    }

	/* Emulate comparisons that do not depend on Zero flag with
	   double-word subtraction.  Note that only Overflow, Sign
	   and Carry flags are valid, so swap arguments and condition
	   of comparisons that would otherwise test Zero flag.  */

	switch (code)
	  {
	  case LE: case LEU: case GT: case GTU:
	    std::swap (lo[0], lo[1]);
	    std::swap (hi[0], hi[1]);
	    code = swap_condition (code);
	    /* FALLTHRU */

	  case LT: case LTU: case GE: case GEU:
	    {
	      /* cmp lo; sbb hi — the flags after the borrow decide
		 the branch.  */
	      bool uns = (code == LTU || code == GEU);
	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

	      if (!nonimmediate_operand (lo[0], submode))
		lo[0] = force_reg (submode, lo[0]);
	      if (!x86_64_general_operand (lo[1], submode))
		lo[1] = force_reg (submode, lo[1]);

	      if (!register_operand (hi[0], submode))
		hi[0] = force_reg (submode, hi[0]);
	      if ((uns && !nonimmediate_operand (hi[1], submode))
		  || (!uns && !x86_64_general_operand (hi[1], submode)))
		hi[1] = force_reg (submode, hi[1]);

	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

	      tmp = gen_rtx_SCRATCH (submode);
	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
	      ix86_expand_branch (code, tmp, const0_rtx, label);
	      return;
	    }

	  default:
	    break;
	  }

	/* Otherwise, we need two or three jumps.  */

	label2 = gen_label_rtx ();

	code1 = code;
	code2 = swap_condition (code);
	code3 = unsigned_condition (code);

	switch (code)
	  {
	  case LT: case GT: case LTU: case GTU:
	    break;

	  case LE:   code1 = LT;  code2 = GT;  break;
	  case GE:   code1 = GT;  code2 = LT;  break;
	  case LEU:  code1 = LTU; code2 = GTU; break;
	  case GEU:  code1 = GTU; code2 = LTU; break;

	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
	  case NE:   code2 = UNKNOWN; break;

	  default:
	    gcc_unreachable ();
	  }

	/*
	 * a < b =>
	 *    if (hi(a) < hi(b)) goto true;
	 *    if (hi(a) > hi(b)) goto false;
	 *    if (lo(a) < lo(b)) goto true;
	 *  false:
	 */

	if (code1 != UNKNOWN)
	  ix86_expand_branch (code1, hi[0], hi[1], label);
	if (code2 != UNKNOWN)
	  ix86_expand_branch (code2, hi[0], hi[1], label2);

	ix86_expand_branch (code3, lo[0], lo[1], label);

	if (code2 != UNKNOWN)
	  emit_label (label2);
	return;
      }

    default:
      /* Anything else must already be a flags-register compare.  */
      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
      goto simple;
    }
}
2697 :
2698 : /* Figure out whether to use unordered fp comparisons. */
2699 :
2700 : static bool
2701 1147075 : ix86_unordered_fp_compare (enum rtx_code code)
2702 : {
2703 1147075 : if (!TARGET_IEEE_FP)
2704 : return false;
2705 :
2706 1142693 : switch (code)
2707 : {
2708 : case LT:
2709 : case LE:
2710 : case GT:
2711 : case GE:
2712 : case LTGT:
2713 : return false;
2714 :
2715 : case EQ:
2716 : case NE:
2717 :
2718 : case UNORDERED:
2719 : case ORDERED:
2720 : case UNLT:
2721 : case UNLE:
2722 : case UNGT:
2723 : case UNGE:
2724 : case UNEQ:
2725 : return true;
2726 :
2727 0 : default:
2728 0 : gcc_unreachable ();
2729 : }
2730 : }
2731 :
2732 : /* Return a comparison we can do and that it is equivalent to
2733 : swap_condition (code) apart possibly from orderedness.
2734 : But, never change orderedness if TARGET_IEEE_FP, returning
2735 : UNKNOWN in that case if necessary. */
2736 :
2737 : static enum rtx_code
2738 37367 : ix86_fp_swap_condition (enum rtx_code code)
2739 : {
2740 37367 : switch (code)
2741 : {
2742 1847 : case GT: /* GTU - CF=0 & ZF=0 */
2743 1847 : return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2744 533 : case GE: /* GEU - CF=0 */
2745 533 : return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2746 446 : case UNLT: /* LTU - CF=1 */
2747 446 : return TARGET_IEEE_FP ? UNKNOWN : GT;
2748 6315 : case UNLE: /* LEU - CF=1 | ZF=1 */
2749 6315 : return TARGET_IEEE_FP ? UNKNOWN : GE;
2750 28226 : default:
2751 28226 : return swap_condition (code);
2752 : }
2753 : }
2754 :
2755 : /* Return cost of comparison CODE using the best strategy for performance.
2756 : All following functions do use number of instructions as a cost metrics.
2757 : In future this should be tweaked to compute bytes for optimize_size and
2758 : take into account performance of various instructions on various CPUs. */
2759 :
2760 : static int
2761 1145940 : ix86_fp_comparison_cost (enum rtx_code code)
2762 : {
2763 1145940 : int arith_cost;
2764 :
2765 : /* The cost of code using bit-twiddling on %ah. */
2766 1145940 : switch (code)
2767 : {
2768 : case UNLE:
2769 : case UNLT:
2770 : case LTGT:
2771 : case GT:
2772 : case GE:
2773 : case UNORDERED:
2774 : case ORDERED:
2775 : case UNEQ:
2776 : arith_cost = 4;
2777 : break;
2778 84249 : case LT:
2779 84249 : case NE:
2780 84249 : case EQ:
2781 84249 : case UNGE:
2782 84249 : arith_cost = TARGET_IEEE_FP ? 5 : 4;
2783 : break;
2784 24913 : case LE:
2785 24913 : case UNGT:
2786 1062502 : arith_cost = TARGET_IEEE_FP ? 6 : 4;
2787 : break;
2788 0 : default:
2789 0 : gcc_unreachable ();
2790 : }
2791 :
2792 1145940 : switch (ix86_fp_comparison_strategy (code))
2793 : {
2794 1145940 : case IX86_FPCMP_COMI:
2795 1145940 : return arith_cost > 4 ? 3 : 2;
2796 0 : case IX86_FPCMP_SAHF:
2797 0 : return arith_cost > 4 ? 4 : 3;
2798 : default:
2799 : return arith_cost;
2800 : }
2801 : }
2802 :
2803 : /* Swap, force into registers, or otherwise massage the two operands
2804 : to a fp comparison. The operands are updated in place; the new
2805 : comparison code is returned. */
2806 :
static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (op_mode);

  /* Without native (non-trapping) BFmode compares, widen both
     operands to SFmode by shifting the bf16 bits into the top half
     of a 32-bit word, then redo the preparation in SFmode.  */
  if (op_mode == BFmode && (!TARGET_AVX10_2 || flag_trapping_math))
    {
      rtx op = gen_lowpart (HImode, op0);
      if (CONST_INT_P (op))
	op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
					     op0, BFmode);
      else
	{
	  rtx t1 = gen_reg_rtx (SImode);
	  emit_insn (gen_zero_extendhisi2 (t1, op));
	  emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
	  op = gen_lowpart (SFmode, t1);
	}
      *pop0 = op;
      op = gen_lowpart (HImode, op1);
      if (CONST_INT_P (op))
	op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
					     op1, BFmode);
      else
	{
	  rtx t1 = gen_reg_rtx (SImode);
	  emit_insn (gen_zero_extendhisi2 (t1, op));
	  emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
	  op = gen_lowpart (SFmode, t1);
	}
      *pop1 = op;
      return ix86_prepare_fp_compare_args (code, pop0, pop1);
    }

  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
     floating point.  */

  if (!is_sse
      && (unordered_compare
	  || (op_mode == XFmode
	      && ! (standard_80387_constant_p (op0) == 1
		    || standard_80387_constant_p (op1) == 1)
	      && GET_CODE (op1) != FLOAT)
	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    {
      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
	 things around if they appear profitable, otherwise force op0
	 into a register.  */

      if (standard_80387_constant_p (op0) == 0
	  || (MEM_P (op0)
	      && ! (standard_80387_constant_p (op1) == 0
		    || MEM_P (op1))))
	{
	  enum rtx_code new_code = ix86_fp_swap_condition (code);
	  /* UNKNOWN means swapping would break IEEE orderedness.  */
	  if (new_code != UNKNOWN)
	    {
	      std::swap (op0, op1);
	      code = new_code;
	    }
	}

      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
	{
	  int tmp = standard_80387_constant_p (op1);
	  if (tmp == 0)
	    /* Not loadable with fld1/fldz etc.: spill to memory.  */
	    op1 = validize_mem (force_const_mem (op_mode, op1));
	  else if (tmp == 1)
	    {
	      if (TARGET_CMOVE)
		op1 = force_reg (op_mode, op1);
	    }
	  else
	    op1 = force_reg (op_mode, op1);
	}
    }

  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);
    }

  *pop0 = op0;
  *pop1 = op1;
  return code;
}
2913 :
/* Generate insn patterns to do a floating point compare of OPERANDS.
   Emits the flag-setting comparison insn(s) and returns the condition
   rtx (CODE applied to the flags register vs. const0_rtx) that the
   flags user (bcc, scc, or cmov) should test.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  /* True for quiet (non-trapping) comparisons such as UNLT; those get
     wrapped in UNSPEC_NOTRAP so later passes keep them distinct from
     the signaling variants.  */
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;
  rtx tmp, scratch;

  /* Legitimize the operands (registers/memory/constants as the insns
     require), possibly swapping them; returns the adjusted code.  */
  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable. */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      /* Rebuild the bare COMPARE: this path applies its own wrappers
	 below, conditional on the operand mode.  */
      tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
      /* We only have vcomisbf16, No vcomubf16 nor vcomxbf16 */
      if (GET_MODE (op0) != E_BFmode)
	{
	  /* Prefer the AVX10.2 optimized comx form for EQ/NE.  */
	  if (TARGET_AVX10_2 && (code == EQ || code == NE))
	    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_OPTCOMX);
	  if (unordered_compare)
	    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
	}
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      /* fnstsw into a scratch HImode register, then sahf to transfer
	 the x87 condition bits into the CPU flags.  */
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;

    case IX86_FPCMP_ARITH:
      /* Neither fcomi nor sahf is available: fetch the FP status word
	 with fnstsw and test/compare the condition-code bits in AH
	 directly (masks 0x01/0x04/0x40/0x45 below select combinations
	 of the C0/C2/C3 bits — see the x87 status word layout).  */
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes. */

      switch (code)
	{
	case GT:
	case UNGT:
	  if (code == GT || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = EQ;
	    }
	  else
	    {
	      /* IEEE GT with NaN handling: mask, decrement, and compare
		 so that the unordered case does not satisfy the test.  */
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
	      cmp_mode = CCmode;
	      code = GEU;
	    }
	  break;
	case LT:
	case UNLT:
	  if (code == LT && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case GE:
	case UNGE:
	  if (code == GE || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case LE:
	case UNLE:
	  if (code == LE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = LTU;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = NE;
	    }
	  break;
	case EQ:
	case UNEQ:
	  if (code == EQ && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = NE;
	    }
	  break;
	case NE:
	case LTGT:
	  if (code == NE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
					     GEN_INT (0x40)));
	      code = NE;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = EQ;
	    }
	  break;

	case UNORDERED:
	  /* Unordered iff the 0x04 (C2, per the comment above) bit is set.  */
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = NE;
	  break;
	case ORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = EQ;
	  break;

	default:
	  gcc_unreachable ();
	}
      break;

    default:
      gcc_unreachable();
    }

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction. */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}
3083 :
3084 : /* Generate insn patterns to do an integer compare of OPERANDS. */
3085 :
3086 : static rtx
3087 6961396 : ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
3088 : {
3089 6961396 : machine_mode cmpmode;
3090 6961396 : rtx tmp, flags;
3091 :
3092 : /* Swap operands to emit carry flag comparison. */
3093 6961396 : if ((code == GTU || code == LEU)
3094 6961396 : && nonimmediate_operand (op1, VOIDmode))
3095 : {
3096 144542 : std::swap (op0, op1);
3097 144542 : code = swap_condition (code);
3098 : }
3099 :
3100 6961396 : cmpmode = SELECT_CC_MODE (code, op0, op1);
3101 6961396 : flags = gen_rtx_REG (cmpmode, FLAGS_REG);
3102 :
3103 : /* Attempt to use PTEST, if available, when testing vector modes for
3104 : equality/inequality against zero. */
3105 6961396 : if (op1 == const0_rtx
3106 2909622 : && SUBREG_P (op0)
3107 22748 : && cmpmode == CCZmode
3108 10296 : && SUBREG_BYTE (op0) == 0
3109 8635 : && REG_P (SUBREG_REG (op0))
3110 8635 : && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
3111 7 : && TARGET_SSE4_1
3112 1 : && GET_MODE (op0) == TImode
3113 6961398 : && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
3114 : {
3115 1 : tmp = SUBREG_REG (op0);
3116 1 : if (GET_MODE (tmp) == V8HFmode || GET_MODE (tmp) == V8BFmode)
3117 1 : tmp = gen_lowpart (V8HImode, tmp);
3118 1 : tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
3119 : }
3120 : else
3121 6961395 : tmp = gen_rtx_COMPARE (cmpmode, op0, op1);
3122 :
3123 : /* This is very simple, but making the interface the same as in the
3124 : FP case makes the rest of the code easier. */
3125 6961396 : emit_insn (gen_rtx_SET (flags, tmp));
3126 :
3127 : /* Return the test that should be put into the flags user, i.e.
3128 : the bcc, scc, or cmov instruction. */
3129 6961396 : return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
3130 : }
3131 :
3132 : static rtx
3133 7662638 : ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
3134 : {
3135 7662638 : rtx ret;
3136 :
3137 7662638 : if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
3138 130358 : ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
3139 :
3140 7532280 : else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
3141 : {
3142 570884 : gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
3143 570884 : ret = ix86_expand_fp_compare (code, op0, op1);
3144 : }
3145 : else
3146 6961396 : ret = ix86_expand_int_compare (code, op0, op1);
3147 :
3148 7662638 : return ret;
3149 : }
3150 :
3151 : void
3152 585615 : ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
3153 : {
3154 585615 : rtx ret;
3155 :
3156 585615 : gcc_assert (GET_MODE (dest) == QImode);
3157 :
3158 585615 : ret = ix86_expand_compare (code, op0, op1);
3159 585615 : PUT_MODE (ret, QImode);
3160 585615 : emit_insn (gen_rtx_SET (dest, ret));
3161 585615 : }
3162 :
/* Expand floating point op0 <=> op1, i.e.
   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : -128.
   OP2 selects the result encoding: const0_rtx requests the branchy
   -1/0/1/-128 form above; otherwise OP2 is the value stored for the
   unordered case and the result is computed branchlessly as
   (op0 > op1) - (op0 < op1).  */

void
ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
{
  /* The ARITH strategy cannot be used here; callers guarantee it.  */
  gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
  /* Pre-cleared SImode register for the setcc_si_slp trick below;
     cleared before the comparison so the clearing xor cannot clobber
     the flags the setcc consumes.  */
  rtx zero = NULL_RTX;
  if (op2 != const0_rtx
      && (TARGET_IEEE_FP || TARGET_ZERO_EXTEND_WITH_AND)
      && GET_MODE (dest) == SImode)
    zero = force_reg (SImode, const0_rtx);
  /* Emit the flag-setting compare once; all subsequent tests read the
     same FLAGS_REG contents.  */
  rtx gt = ix86_expand_fp_compare (GT, op0, op1);
  rtx l0 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
  rtx l1 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
  rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
  rtx lend = gen_label_rtx ();
  rtx tmp;
  rtx_insn *jmp;
  if (l2)
    {
      /* IEEE mode: branch out first on the unordered (NaN) case.  */
      rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
			       gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
				  gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability:: very_unlikely ());
    }
  if (op2 == const0_rtx)
    {
      /* Branchy form: jump to l0 for equal, l1 for greater, fall
	 through for less; each arm stores its constant result.  */
      rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
			       gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
				  gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability::unlikely ());
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
				  gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability::even ());
      emit_move_insn (dest, constm1_rtx);
      emit_jump (lend);
      emit_label (l0);
      emit_move_insn (dest, const0_rtx);
      emit_jump (lend);
      emit_label (l1);
      emit_move_insn (dest, const1_rtx);
    }
  else
    {
      /* Branchless form: materialize (op0 > op1) and (op0 < op1) as
	 0/1 values and subtract.  */
      rtx lt_tmp = NULL_RTX;
      if (GET_MODE (dest) != SImode || !TARGET_ZERO_EXTEND_WITH_AND)
	{
	  lt_tmp = gen_reg_rtx (QImode);
	  ix86_expand_setcc (lt_tmp, UNLT, gen_rtx_REG (CCFPmode, FLAGS_REG),
			     const0_rtx);
	  if (GET_MODE (dest) != QImode)
	    {
	      tmp = gen_reg_rtx (GET_MODE (dest));
	      emit_insn (gen_rtx_SET (tmp,
				      gen_rtx_ZERO_EXTEND (GET_MODE (dest),
							   lt_tmp)));
	      lt_tmp = tmp;
	    }
	}
      rtx gt_tmp;
      if (zero)
	{
	  /* If TARGET_IEEE_FP and dest has SImode, emit SImode clear
	     before the floating point comparison and use setcc_si_slp
	     pattern to hide it from the combiner, so that it doesn't
	     undo it.  Similarly for TARGET_ZERO_EXTEND_WITH_AND, where
	     the ZERO_EXTEND normally emitted would need to be AND
	     with flags clobber.  */
	  tmp = ix86_expand_compare (GT, XEXP (gt, 0), const0_rtx);
	  PUT_MODE (tmp, QImode);
	  emit_insn (gen_setcc_si_slp (zero, tmp, zero));
	  gt_tmp = zero;
	}
      else
	{
	  gt_tmp = gen_reg_rtx (QImode);
	  ix86_expand_setcc (gt_tmp, GT, XEXP (gt, 0), const0_rtx);
	  if (GET_MODE (dest) != QImode)
	    {
	      tmp = gen_reg_rtx (GET_MODE (dest));
	      emit_insn (gen_rtx_SET (tmp,
				      gen_rtx_ZERO_EXTEND (GET_MODE (dest),
							   gt_tmp)));
	      gt_tmp = tmp;
	    }
	}
      if (lt_tmp)
	{
	  /* dest = gt_tmp - lt_tmp, i.e. 1, 0 or -1.  */
	  tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp,
				     dest, 0, OPTAB_DIRECT);
	  if (!rtx_equal_p (tmp, dest))
	    emit_move_insn (dest, tmp);
	}
      else
	{
	  /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't
	     do ZERO_EXTEND without clobbering flags.  */
	  tmp = ix86_expand_compare (UNLT, XEXP (gt, 0), const0_rtx);
	  PUT_MODE (tmp, SImode);
	  emit_insn (gen_subsi3_carry (dest, gt_tmp,
				       force_reg (GET_MODE (dest), const0_rtx),
				       XEXP (gt, 0), tmp));
	}
    }
  emit_jump (lend);
  if (l2)
    {
      /* Unordered result: -128 in the branchy encoding, OP2 otherwise.  */
      emit_label (l2);
      emit_move_insn (dest, op2 == const0_rtx ? GEN_INT (-128) : op2);
    }
  emit_label (lend);
}
3281 :
/* Expand integral op0 <=> op1, i.e.
   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : 1.
   OP2 is a nonzero CONST_INT flag: the value 1 selects the unsigned
   comparison forms (LTU/GTU with CCmode), any other value selects the
   signed forms (LT/GT with CCGCmode).  The result is computed
   branchlessly as (op0 > op1) - (op0 < op1).  */

void
ix86_expand_int_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
{
  gcc_assert (INTVAL (op2));
  /* Pre-cleared SImode registers for the setcc_si_slp trick below,
     cleared before the compare so the xor cannot clobber the flags.  */
  rtx zero1 = NULL_RTX, zero2 = NULL_RTX;
  if (TARGET_ZERO_EXTEND_WITH_AND && GET_MODE (dest) == SImode)
    {
      zero1 = force_reg (SImode, const0_rtx);
      if (INTVAL (op2) != 1)
	zero2 = force_reg (SImode, const0_rtx);
    }

  /* Not using ix86_expand_int_compare here, so that it doesn't swap
     operands nor optimize CC mode - we need a mode usable for both
     LT and GT resp. LTU and GTU comparisons with the same unswapped
     operands.  */
  rtx flags = gen_rtx_REG (INTVAL (op2) != 1 ? CCGCmode : CCmode, FLAGS_REG);
  rtx tmp = gen_rtx_COMPARE (GET_MODE (flags), op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));
  /* Materialize (op0 < op1) as a 0/1 value in the destination mode.  */
  rtx lt_tmp = NULL_RTX;
  if (zero2)
    {
      /* For TARGET_ZERO_EXTEND_WITH_AND, emit setcc_si_slp to avoid
	 ZERO_EXTEND.  */
      tmp = ix86_expand_compare (LT, flags, const0_rtx);
      PUT_MODE (tmp, QImode);
      emit_insn (gen_setcc_si_slp (zero2, tmp, zero2));
      lt_tmp = zero2;
    }
  else if (!zero1)
    {
      lt_tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (lt_tmp, INTVAL (op2) != 1 ? LT : LTU, flags,
			 const0_rtx);
      if (GET_MODE (dest) != QImode)
	{
	  tmp = gen_reg_rtx (GET_MODE (dest));
	  emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
							    lt_tmp)));
	  lt_tmp = tmp;
	}
    }
  /* Materialize (op0 > op1) as a 0/1 value in the destination mode.  */
  rtx gt_tmp;
  if (zero1)
    {
      /* For TARGET_ZERO_EXTEND_WITH_AND, emit setcc_si_slp to avoid
	 ZERO_EXTEND.  */
      tmp = ix86_expand_compare (INTVAL (op2) != 1 ? GT : GTU, flags,
				 const0_rtx);
      PUT_MODE (tmp, QImode);
      emit_insn (gen_setcc_si_slp (zero1, tmp, zero1));
      gt_tmp = zero1;
    }
  else
    {
      gt_tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (gt_tmp, INTVAL (op2) != 1 ? GT : GTU, flags,
			 const0_rtx);
      if (GET_MODE (dest) != QImode)
	{
	  tmp = gen_reg_rtx (GET_MODE (dest));
	  emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
							    gt_tmp)));
	  gt_tmp = tmp;
	}
    }
  if (lt_tmp)
    {
      /* dest = gt_tmp - lt_tmp, i.e. 1, 0 or -1.  */
      tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp, dest,
				 0, OPTAB_DIRECT);
      if (!rtx_equal_p (tmp, dest))
	emit_move_insn (dest, tmp);
    }
  else
    {
      /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't
	 do ZERO_EXTEND without clobbering flags.  */
      tmp = ix86_expand_compare (LTU, flags, const0_rtx);
      PUT_MODE (tmp, SImode);
      emit_insn (gen_subsi3_carry (dest, gt_tmp,
				   force_reg (GET_MODE (dest), const0_rtx),
				   flags, tmp));
    }
}
3369 :
3370 : /* Expand comparison setting or clearing carry flag. Return true when
3371 : successful and set pop for the operation. */
3372 : static bool
3373 29317 : ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
3374 : {
3375 58634 : machine_mode mode
3376 29317 : = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
3377 :
3378 : /* Do not handle double-mode compares that go through special path. */
3379 31703 : if (mode == (TARGET_64BIT ? TImode : DImode))
3380 : return false;
3381 :
3382 29307 : if (SCALAR_FLOAT_MODE_P (mode))
3383 : {
3384 1844 : rtx compare_op;
3385 1844 : rtx_insn *compare_seq;
3386 :
3387 1844 : gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
3388 :
3389 : /* Shortcut: following common codes never translate
3390 : into carry flag compares. */
3391 1844 : if (code == EQ || code == NE || code == UNEQ || code == LTGT
3392 : || code == ORDERED || code == UNORDERED)
3393 : return false;
3394 :
3395 : /* These comparisons require zero flag; swap operands so they won't. */
3396 : if ((code == GT || code == UNLE || code == LE || code == UNGT)
3397 1779 : && !TARGET_IEEE_FP)
3398 : {
3399 2 : std::swap (op0, op1);
3400 2 : code = swap_condition (code);
3401 : }
3402 :
3403 : /* Try to expand the comparison and verify that we end up with
3404 : carry flag based comparison. This fails to be true only when
3405 : we decide to expand comparison using arithmetic that is not
3406 : too common scenario. */
3407 1842 : start_sequence ();
3408 1842 : compare_op = ix86_expand_fp_compare (code, op0, op1);
3409 1842 : compare_seq = end_sequence ();
3410 :
3411 1842 : if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
3412 1842 : code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
3413 : else
3414 0 : code = GET_CODE (compare_op);
3415 :
3416 1842 : if (code != LTU && code != GEU)
3417 : return false;
3418 :
3419 63 : emit_insn (compare_seq);
3420 63 : *pop = compare_op;
3421 63 : return true;
3422 : }
3423 :
3424 27463 : if (!INTEGRAL_MODE_P (mode))
3425 : return false;
3426 :
3427 27331 : switch (code)
3428 : {
3429 : case LTU:
3430 : case GEU:
3431 : break;
3432 :
3433 : /* Convert a==0 into (unsigned)a<1. */
3434 23807 : case EQ:
3435 23807 : case NE:
3436 23807 : if (op1 != const0_rtx)
3437 : return false;
3438 10038 : op1 = const1_rtx;
3439 10038 : code = (code == EQ ? LTU : GEU);
3440 : break;
3441 :
3442 : /* Convert a>b into b<a or a>=b-1. */
3443 698 : case GTU:
3444 698 : case LEU:
3445 698 : if (CONST_INT_P (op1))
3446 : {
3447 656 : op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
3448 : /* Bail out on overflow. We still can swap operands but that
3449 : would force loading of the constant into register. */
3450 656 : if (op1 == const0_rtx
3451 656 : || !x86_64_immediate_operand (op1, GET_MODE (op1)))
3452 0 : return false;
3453 656 : code = (code == GTU ? GEU : LTU);
3454 : }
3455 : else
3456 : {
3457 42 : std::swap (op0, op1);
3458 42 : code = (code == GTU ? LTU : GEU);
3459 : }
3460 : break;
3461 :
3462 : /* Convert a>=0 into (unsigned)a<0x80000000. */
3463 1300 : case LT:
3464 1300 : case GE:
3465 1300 : if (mode == DImode || op1 != const0_rtx)
3466 : return false;
3467 204 : op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3468 102 : code = (code == LT ? GEU : LTU);
3469 : break;
3470 833 : case LE:
3471 833 : case GT:
3472 833 : if (mode == DImode || op1 != constm1_rtx)
3473 : return false;
3474 0 : op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3475 0 : code = (code == LE ? GEU : LTU);
3476 : break;
3477 :
3478 : default:
3479 : return false;
3480 : }
3481 : /* Swapping operands may cause constant to appear as first operand. */
3482 11531 : if (!nonimmediate_operand (op0, VOIDmode))
3483 : {
3484 0 : if (!can_create_pseudo_p ())
3485 : return false;
3486 0 : op0 = force_reg (mode, op0);
3487 : }
3488 11531 : *pop = ix86_expand_compare (code, op0, op1);
3489 11531 : gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
3490 : return true;
3491 : }
3492 :
3493 : /* Expand conditional increment or decrement using adb/sbb instructions.
3494 : The default case using setcc followed by the conditional move can be
3495 : done by generic code. */
3496 : bool
3497 6780 : ix86_expand_int_addcc (rtx operands[])
3498 : {
3499 6780 : enum rtx_code code = GET_CODE (operands[1]);
3500 6780 : rtx flags;
3501 6780 : rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
3502 6780 : rtx compare_op;
3503 6780 : rtx val = const0_rtx;
3504 6780 : bool fpcmp = false;
3505 6780 : machine_mode mode;
3506 6780 : rtx op0 = XEXP (operands[1], 0);
3507 6780 : rtx op1 = XEXP (operands[1], 1);
3508 :
3509 6780 : if (operands[3] != const1_rtx
3510 2814 : && operands[3] != constm1_rtx)
3511 : return false;
3512 4689 : if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3513 : return false;
3514 1279 : code = GET_CODE (compare_op);
3515 :
3516 1279 : flags = XEXP (compare_op, 0);
3517 :
3518 1279 : if (GET_MODE (flags) == CCFPmode)
3519 : {
3520 4 : fpcmp = true;
3521 4 : code = ix86_fp_compare_code_to_integer (code);
3522 : }
3523 :
3524 1279 : if (code != LTU)
3525 : {
3526 735 : val = constm1_rtx;
3527 735 : if (fpcmp)
3528 4 : PUT_CODE (compare_op,
3529 : reverse_condition_maybe_unordered
3530 : (GET_CODE (compare_op)));
3531 : else
3532 731 : PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3533 : }
3534 :
3535 1279 : mode = GET_MODE (operands[0]);
3536 :
3537 : /* Construct either adc or sbb insn. */
3538 1279 : if ((code == LTU) == (operands[3] == constm1_rtx))
3539 : insn = gen_sub3_carry;
3540 : else
3541 526 : insn = gen_add3_carry;
3542 :
3543 1279 : emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
3544 :
3545 1279 : return true;
3546 : }
3547 :
3548 : bool
3549 428902 : ix86_expand_int_movcc (rtx operands[])
3550 : {
3551 428902 : enum rtx_code code = GET_CODE (operands[1]), compare_code;
3552 428902 : rtx_insn *compare_seq;
3553 428902 : rtx compare_op;
3554 428902 : machine_mode mode = GET_MODE (operands[0]);
3555 428902 : bool sign_bit_compare_p = false;
3556 428902 : bool negate_cc_compare_p = false;
3557 428902 : rtx op0 = XEXP (operands[1], 0);
3558 428902 : rtx op1 = XEXP (operands[1], 1);
3559 428902 : rtx op2 = operands[2];
3560 428902 : rtx op3 = operands[3];
3561 :
3562 428902 : if (GET_MODE (op0) == TImode
3563 413508 : || (GET_MODE (op0) == DImode
3564 99795 : && !TARGET_64BIT))
3565 : return false;
3566 :
3567 412412 : if (GET_MODE (op0) == BFmode
3568 412412 : && !ix86_fp_comparison_operator (operands[1], VOIDmode))
3569 : return false;
3570 :
3571 412412 : start_sequence ();
3572 412412 : compare_op = ix86_expand_compare (code, op0, op1);
3573 412412 : compare_seq = end_sequence ();
3574 :
3575 412412 : compare_code = GET_CODE (compare_op);
3576 :
3577 412412 : if ((op1 == const0_rtx && (code == GE || code == LT))
3578 370441 : || (op1 == constm1_rtx && (code == GT || code == LE)))
3579 : sign_bit_compare_p = true;
3580 :
3581 : /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3582 : but if op1 is a constant, the latter form allows more optimizations,
3583 : either through the last 2 ops being constant handling, or the one
3584 : constant and one variable cases. On the other side, for cmov the
3585 : former might be better as we don't need to load the constant into
3586 : another register. */
3587 370441 : if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3588 : op2 = op1;
3589 : /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3590 411886 : else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3591 : op3 = op1;
3592 :
3593 : /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3594 : HImode insns, we'd be swallowed in word prefix ops. */
3595 :
3596 4854 : if ((mode != HImode || TARGET_FAST_PREFIX)
3597 442061 : && (mode != (TARGET_64BIT ? TImode : DImode))
3598 412412 : && CONST_INT_P (op2)
3599 444798 : && CONST_INT_P (op3))
3600 : {
3601 25425 : rtx out = operands[0];
3602 25425 : HOST_WIDE_INT ct = INTVAL (op2);
3603 25425 : HOST_WIDE_INT cf = INTVAL (op3);
3604 25425 : HOST_WIDE_INT diff;
3605 :
3606 25425 : if ((mode == SImode
3607 11902 : || (TARGET_64BIT && mode == DImode))
3608 18233 : && (GET_MODE (op0) == SImode
3609 14275 : || (TARGET_64BIT && GET_MODE (op0) == DImode)))
3610 : {
3611 : /* Special case x != 0 ? -1 : y. */
3612 13102 : if (code == NE && op1 == const0_rtx && ct == -1)
3613 : {
3614 : negate_cc_compare_p = true;
3615 : std::swap (ct, cf);
3616 : code = EQ;
3617 : }
3618 13003 : else if (code == EQ && op1 == const0_rtx && cf == -1)
3619 25425 : negate_cc_compare_p = true;
3620 : }
3621 :
3622 25425 : diff = (unsigned HOST_WIDE_INT) ct - cf;
3623 : /* Make sure we can represent the difference between the two values. */
3624 25425 : if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
3625 428902 : return false;
3626 :
3627 : /* Sign bit compares are better done using shifts than we do by using
3628 : sbb. */
3629 25277 : if (sign_bit_compare_p
3630 25277 : || negate_cc_compare_p
3631 25277 : || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3632 : {
3633 : /* Detect overlap between destination and compare sources. */
3634 10964 : rtx tmp = out;
3635 :
3636 10964 : if (negate_cc_compare_p)
3637 : {
3638 280 : if (GET_MODE (op0) == DImode)
3639 106 : emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
3640 : else
3641 174 : emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
3642 174 : gen_lowpart (SImode, op0)));
3643 :
3644 280 : tmp = gen_reg_rtx (mode);
3645 280 : if (mode == DImode)
3646 123 : emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
3647 : else
3648 157 : emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
3649 : tmp)));
3650 : }
3651 10684 : else if (!sign_bit_compare_p)
3652 : {
3653 10315 : rtx flags;
3654 10315 : bool fpcmp = false;
3655 :
3656 10315 : compare_code = GET_CODE (compare_op);
3657 :
3658 10315 : flags = XEXP (compare_op, 0);
3659 :
3660 10315 : if (GET_MODE (flags) == CCFPmode)
3661 : {
3662 59 : fpcmp = true;
3663 59 : compare_code
3664 59 : = ix86_fp_compare_code_to_integer (compare_code);
3665 : }
3666 :
3667 : /* To simplify rest of code, restrict to the GEU case. */
3668 10315 : if (compare_code == LTU)
3669 : {
3670 5963 : std::swap (ct, cf);
3671 5963 : compare_code = reverse_condition (compare_code);
3672 5963 : code = reverse_condition (code);
3673 : }
3674 : else
3675 : {
3676 4352 : if (fpcmp)
3677 59 : PUT_CODE (compare_op,
3678 : reverse_condition_maybe_unordered
3679 : (GET_CODE (compare_op)));
3680 : else
3681 4293 : PUT_CODE (compare_op,
3682 : reverse_condition (GET_CODE (compare_op)));
3683 : }
3684 :
3685 10315 : diff = (unsigned HOST_WIDE_INT) ct - cf;
3686 : /* Make sure we can represent the difference
3687 : between the two values. */
3688 10315 : if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
3689 : return false;
3690 :
3691 10314 : if (reg_overlap_mentioned_p (out, compare_op))
3692 0 : tmp = gen_reg_rtx (mode);
3693 :
3694 10314 : if (mode == DImode)
3695 2036 : emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3696 : else
3697 8278 : emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3698 : flags, compare_op));
3699 : }
3700 : else
3701 : {
3702 369 : if (code == GT || code == GE)
3703 153 : code = reverse_condition (code);
3704 : else
3705 : {
3706 216 : std::swap (ct, cf);
3707 :
3708 216 : diff = (unsigned HOST_WIDE_INT) ct - cf;
3709 : /* Make sure we can represent the difference
3710 : between the two values. */
3711 216 : if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
3712 : return false;
3713 : }
3714 364 : tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3715 : }
3716 :
3717 10958 : if (diff == 1)
3718 : {
3719 : /*
3720 : * cmpl op0,op1
3721 : * sbbl dest,dest
3722 : * [addl dest, ct]
3723 : *
3724 : * Size 5 - 8.
3725 : */
3726 1006 : if (ct)
3727 833 : tmp = expand_simple_binop (mode, PLUS,
3728 : tmp, GEN_INT (ct),
3729 : copy_rtx (tmp), 1, OPTAB_DIRECT);
3730 : }
3731 9952 : else if (cf == -1)
3732 : {
3733 : /*
3734 : * cmpl op0,op1
3735 : * sbbl dest,dest
3736 : * orl $ct, dest
3737 : *
3738 : * Size 8.
3739 : */
3740 599 : tmp = expand_simple_binop (mode, IOR,
3741 : tmp, GEN_INT (ct),
3742 : copy_rtx (tmp), 1, OPTAB_DIRECT);
3743 : }
3744 9353 : else if (diff == -1 && ct)
3745 : {
3746 : /*
3747 : * cmpl op0,op1
3748 : * sbbl dest,dest
3749 : * notl dest
3750 : * [addl dest, cf]
3751 : *
3752 : * Size 8 - 11.
3753 : */
3754 599 : tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3755 599 : if (cf)
3756 581 : tmp = expand_simple_binop (mode, PLUS,
3757 : copy_rtx (tmp), GEN_INT (cf),
3758 : copy_rtx (tmp), 1, OPTAB_DIRECT);
3759 : }
3760 : else
3761 : {
3762 : /*
3763 : * cmpl op0,op1
3764 : * sbbl dest,dest
3765 : * [notl dest]
3766 : * andl cf - ct, dest
3767 : * [addl dest, ct]
3768 : *
3769 : * Size 8 - 11.
3770 : */
3771 :
3772 8754 : if (cf == 0)
3773 : {
3774 903 : cf = ct;
3775 903 : ct = 0;
3776 903 : tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3777 : }
3778 :
3779 8754 : HOST_WIDE_INT ival = (unsigned HOST_WIDE_INT) cf - ct;
3780 : /* Make sure we can represent the difference
3781 : between the two values. */
3782 8754 : if ((ival > 0) != ((ct < 0) != (cf < 0) ? ct < 0 : ct < cf))
3783 16644 : return false;
3784 :
3785 8754 : tmp = expand_simple_binop (mode, AND,
3786 : copy_rtx (tmp),
3787 8754 : gen_int_mode (ival, mode),
3788 : copy_rtx (tmp), 1, OPTAB_DIRECT);
3789 8754 : if (ct)
3790 7057 : tmp = expand_simple_binop (mode, PLUS,
3791 : copy_rtx (tmp), GEN_INT (ct),
3792 : copy_rtx (tmp), 1, OPTAB_DIRECT);
3793 : }
3794 :
3795 10958 : if (!rtx_equal_p (tmp, out))
3796 474 : emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3797 :
3798 10958 : return true;
3799 : }
3800 :
3801 14313 : if (diff < 0)
3802 : {
3803 4766 : machine_mode cmp_mode = GET_MODE (op0);
3804 4766 : enum rtx_code new_code;
3805 :
3806 4766 : if (SCALAR_FLOAT_MODE_P (cmp_mode))
3807 : {
3808 54 : gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3809 :
3810 : /* We may be reversing a non-trapping
3811 : comparison to a trapping comparison. */
3812 104 : if (HONOR_NANS (cmp_mode) && flag_trapping_math
3813 41 : && code != EQ && code != NE
3814 95 : && code != ORDERED && code != UNORDERED)
3815 : new_code = UNKNOWN;
3816 : else
3817 13 : new_code = reverse_condition_maybe_unordered (code);
3818 : }
3819 : else
3820 4712 : new_code = ix86_reverse_condition (code, cmp_mode);
3821 4725 : if (new_code != UNKNOWN)
3822 : {
3823 4725 : std::swap (ct, cf);
3824 :
3825 4725 : diff = (unsigned HOST_WIDE_INT) ct - cf;
3826 : /* Make sure we can represent the difference
3827 : between the two values. */
3828 4725 : if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
3829 : return false;
3830 :
3831 : code = new_code;
3832 : }
3833 : }
3834 :
3835 14313 : compare_code = UNKNOWN;
3836 14313 : if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3837 12546 : && CONST_INT_P (op1))
3838 : {
3839 6662 : if (op1 == const0_rtx
3840 214 : && (code == LT || code == GE))
3841 : compare_code = code;
3842 6662 : else if (op1 == constm1_rtx)
3843 : {
3844 289 : if (code == LE)
3845 : compare_code = LT;
3846 289 : else if (code == GT)
3847 : compare_code = GE;
3848 : }
3849 : }
3850 :
3851 : /* Optimize dest = (op0 < 0) ? -1 : cf. */
3852 : if (compare_code != UNKNOWN
3853 0 : && GET_MODE (op0) == GET_MODE (out)
3854 0 : && (cf == -1 || ct == -1))
3855 : {
3856 : /* If lea code below could be used, only optimize
3857 : if it results in a 2 insn sequence. */
3858 :
3859 0 : if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3860 0 : || diff == 3 || diff == 5 || diff == 9)
3861 0 : || (compare_code == LT && ct == -1)
3862 0 : || (compare_code == GE && cf == -1))
3863 : {
3864 : /*
3865 : * notl op1 (if necessary)
3866 : * sarl $31, op1
3867 : * orl cf, op1
3868 : */
3869 0 : if (ct != -1)
3870 : {
3871 0 : cf = ct;
3872 0 : ct = -1;
3873 0 : code = reverse_condition (code);
3874 : }
3875 :
3876 0 : out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3877 :
3878 0 : out = expand_simple_binop (mode, IOR,
3879 : out, GEN_INT (cf),
3880 : out, 1, OPTAB_DIRECT);
3881 0 : if (out != operands[0])
3882 0 : emit_move_insn (operands[0], out);
3883 :
3884 0 : return true;
3885 : }
3886 : }
3887 :
3888 :
3889 21037 : if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3890 6724 : || diff == 3 || diff == 5 || diff == 9)
3891 7932 : && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3892 22245 : && (mode != DImode
3893 1930 : || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3894 : {
3895 : /*
3896 : * xorl dest,dest
3897 : * cmpl op1,op2
3898 : * setcc dest
3899 : * lea cf(dest*(ct-cf)),dest
3900 : *
3901 : * Size 14.
3902 : *
3903 : * This also catches the degenerate setcc-only case.
3904 : */
3905 :
3906 7932 : rtx tmp;
3907 7932 : int nops;
3908 :
3909 7932 : out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3910 :
3911 7932 : nops = 0;
3912 : /* On x86_64 the lea instruction operates on Pmode, so we need
3913 : to get arithmetics done in proper mode to match. */
3914 7932 : if (diff == 1)
3915 6712 : tmp = copy_rtx (out);
3916 : else
3917 : {
3918 1220 : rtx out1;
3919 1220 : out1 = copy_rtx (out);
3920 1220 : tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3921 1220 : nops++;
3922 1220 : if (diff & 1)
3923 : {
3924 254 : tmp = gen_rtx_PLUS (mode, tmp, out1);
3925 254 : nops++;
3926 : }
3927 : }
3928 7932 : if (cf != 0)
3929 : {
3930 6968 : tmp = plus_constant (mode, tmp, cf);
3931 6968 : nops++;
3932 : }
3933 7932 : if (!rtx_equal_p (tmp, out))
3934 : {
3935 7208 : if (nops == 1)
3936 6086 : out = force_operand (tmp, copy_rtx (out));
3937 : else
3938 1122 : emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3939 : }
3940 7932 : if (!rtx_equal_p (out, operands[0]))
3941 888 : emit_move_insn (operands[0], copy_rtx (out));
3942 :
3943 7932 : return true;
3944 : }
3945 :
3946 : /*
3947 : * General case: Jumpful:
3948 : * xorl dest,dest cmpl op1, op2
3949 : * cmpl op1, op2 movl ct, dest
3950 : * setcc dest jcc 1f
3951 : * decl dest movl cf, dest
3952 : * andl (cf-ct),dest 1:
3953 : * addl ct,dest
3954 : *
3955 : * Size 20. Size 14.
3956 : *
3957 : * This is reasonably steep, but branch mispredict costs are
3958 : * high on modern cpus, so consider failing only if optimizing
3959 : * for space.
3960 : */
3961 :
3962 6381 : if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3963 6381 : && BRANCH_COST (optimize_insn_for_speed_p (),
3964 : false) >= 2)
3965 : {
3966 0 : if (cf == 0)
3967 : {
3968 0 : machine_mode cmp_mode = GET_MODE (op0);
3969 0 : enum rtx_code new_code;
3970 :
3971 0 : if (SCALAR_FLOAT_MODE_P (cmp_mode))
3972 : {
3973 0 : gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3974 :
3975 : /* We may be reversing a non-trapping
3976 : comparison to a trapping comparison. */
3977 0 : if (HONOR_NANS (cmp_mode) && flag_trapping_math
3978 0 : && code != EQ && code != NE
3979 0 : && code != ORDERED && code != UNORDERED)
3980 : new_code = UNKNOWN;
3981 : else
3982 0 : new_code = reverse_condition_maybe_unordered (code);
3983 :
3984 : }
3985 : else
3986 : {
3987 0 : new_code = ix86_reverse_condition (code, cmp_mode);
3988 0 : if (compare_code != UNKNOWN && new_code != UNKNOWN)
3989 0 : compare_code = reverse_condition (compare_code);
3990 : }
3991 :
3992 0 : if (new_code != UNKNOWN)
3993 : {
3994 0 : cf = ct;
3995 0 : ct = 0;
3996 0 : code = new_code;
3997 : }
3998 : }
3999 :
4000 0 : if (compare_code != UNKNOWN)
4001 : {
4002 : /* notl op1 (if needed)
4003 : sarl $31, op1
4004 : andl (cf-ct), op1
4005 : addl ct, op1
4006 :
4007 : For x < 0 (resp. x <= -1) there will be no notl,
4008 : so if possible swap the constants to get rid of the
4009 : complement.
4010 : True/false will be -1/0 while code below (store flag
4011 : followed by decrement) is 0/-1, so the constants need
4012 : to be exchanged once more. */
4013 :
4014 0 : if (compare_code == GE || !cf)
4015 : {
4016 0 : code = reverse_condition (code);
4017 0 : compare_code = LT;
4018 : }
4019 : else
4020 : std::swap (ct, cf);
4021 :
4022 0 : out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
4023 : }
4024 : else
4025 : {
4026 0 : out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
4027 :
4028 0 : out = expand_simple_binop (mode, PLUS, copy_rtx (out),
4029 : constm1_rtx,
4030 : copy_rtx (out), 1, OPTAB_DIRECT);
4031 : }
4032 :
4033 0 : HOST_WIDE_INT ival = (unsigned HOST_WIDE_INT) cf - ct;
4034 : /* Make sure we can represent the difference
4035 : between the two values. */
4036 0 : if ((ival > 0) != ((ct < 0) != (cf < 0) ? ct < 0 : ct < cf))
4037 : return false;
4038 :
4039 0 : out = expand_simple_binop (mode, AND, copy_rtx (out),
4040 0 : gen_int_mode (ival, mode),
4041 : copy_rtx (out), 1, OPTAB_DIRECT);
4042 0 : if (ct)
4043 0 : out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
4044 : copy_rtx (out), 1, OPTAB_DIRECT);
4045 0 : if (!rtx_equal_p (out, operands[0]))
4046 0 : emit_move_insn (operands[0], copy_rtx (out));
4047 :
4048 0 : return true;
4049 : }
4050 : }
4051 :
4052 393368 : if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
4053 : {
4054 : /* Try a few things more with specific constants and a variable. */
4055 :
4056 0 : optab op;
4057 0 : rtx var, orig_out, out, tmp;
4058 :
4059 0 : if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
4060 : return false;
4061 :
4062 0 : operands[2] = op2;
4063 0 : operands[3] = op3;
4064 :
4065 : /* If one of the two operands is an interesting constant, load a
4066 : constant with the above and mask it in with a logical operation. */
4067 :
4068 0 : if (CONST_INT_P (operands[2]))
4069 : {
4070 0 : var = operands[3];
4071 0 : if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
4072 0 : operands[3] = constm1_rtx, op = and_optab;
4073 0 : else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
4074 0 : operands[3] = const0_rtx, op = ior_optab;
4075 : else
4076 : return false;
4077 : }
4078 0 : else if (CONST_INT_P (operands[3]))
4079 : {
4080 0 : var = operands[2];
4081 0 : if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
4082 : {
4083 : /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
4084 : "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
4085 0 : if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
4086 0 : operands[1] = simplify_gen_relational (LT, VOIDmode,
4087 0 : GET_MODE (op0),
4088 : op0, const0_rtx);
4089 :
4090 0 : operands[2] = constm1_rtx;
4091 0 : op = and_optab;
4092 : }
4093 0 : else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
4094 0 : operands[2] = const0_rtx, op = ior_optab;
4095 : else
4096 : return false;
4097 : }
4098 : else
4099 : return false;
4100 :
4101 0 : orig_out = operands[0];
4102 0 : tmp = gen_reg_rtx (mode);
4103 0 : operands[0] = tmp;
4104 :
4105 : /* Recurse to get the constant loaded. */
4106 0 : if (!ix86_expand_int_movcc (operands))
4107 : return false;
4108 :
4109 : /* Mask in the interesting variable. */
4110 0 : out = expand_binop (mode, op, var, tmp, orig_out, 0,
4111 : OPTAB_WIDEN);
4112 0 : if (!rtx_equal_p (out, orig_out))
4113 0 : emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
4114 :
4115 0 : return true;
4116 : }
4117 :
4118 : /*
4119 : * For comparison with above,
4120 : *
4121 : * movl cf,dest
4122 : * movl ct,tmp
4123 : * cmpl op1,op2
4124 : * cmovcc tmp,dest
4125 : *
4126 : * Size 15.
4127 : */
4128 :
4129 393368 : if (! nonimmediate_operand (operands[2], mode))
4130 22461 : operands[2] = force_reg (mode, operands[2]);
4131 393368 : if (! nonimmediate_operand (operands[3], mode))
4132 172914 : operands[3] = force_reg (mode, operands[3]);
4133 :
4134 393368 : if (! register_operand (operands[2], VOIDmode)
4135 393368 : && (mode == QImode
4136 1093 : || ! register_operand (operands[3], VOIDmode)))
4137 1564 : operands[2] = force_reg (mode, operands[2]);
4138 :
4139 393368 : if (mode == QImode
4140 393368 : && ! register_operand (operands[3], VOIDmode))
4141 592 : operands[3] = force_reg (mode, operands[3]);
4142 :
4143 393368 : emit_insn (compare_seq);
4144 393368 : emit_insn (gen_rtx_SET (operands[0],
4145 : gen_rtx_IF_THEN_ELSE (mode,
4146 : compare_op, operands[2],
4147 : operands[3])));
4148 393368 : return true;
4149 : }
4150 :
4151 : /* Detect conditional moves that exactly match min/max operational
4152 : semantics. Note that this is IEEE safe, as long as we don't
4153 : interchange the operands.
4154 :
4155 : Returns FALSE if this conditional move doesn't match a MIN/MAX,
4156 : and TRUE if the operation is successful and instructions are emitted. */
4157 :
static bool
ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
			   rtx cmp_op1, rtx if_true, rtx if_false)
{
  machine_mode mode = GET_MODE (dest);
  bool is_min;
  rtx tmp;

  /* Canonicalize the comparison towards LT: only LT itself, LE when
     NaNs need not be honored, and UNGE can be rewritten that way.
     Any other code is not a min/max pattern handled here.  */
  if (code == LT)
    ;
  else if (code == LE && !HONOR_NANS (mode))
    {
      /* We can swap LE to GE and then invert to LT.  */
      std::swap (cmp_op0, cmp_op1);
      std::swap (if_true, if_false);
    }
  else if (code == UNGE)
    std::swap (if_true, if_false);
  else
    return false;

  /* After canonicalization the select must read either
     "x < y ? x : y" (min) or "x < y ? y : x" (max).  */
  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
    is_min = true;
  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
    is_min = false;
  else
    return false;

  if (immediate_operand (if_false, mode))
    if_false = force_reg (mode, if_false);
  if (immediate_operand (if_true, mode))
    if_true = force_reg (mode, if_true);

  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
     but MODE may be a vector mode and thus not appropriate.  */
  if (!flag_finite_math_only || flag_signed_zeros)
    {
      /* IEEE semantics depend on operand order; wrap the operands in an
	 UNSPEC so the optimizers cannot commute them.  */
      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
      rtvec v;

      if_true = force_reg (mode, if_true);
      v = gen_rtvec (2, if_true, if_false);
      tmp = gen_rtx_UNSPEC (mode, v, u);
    }
  else
    {
      /* Fast-math: a plain SMIN/SMAX is sufficient.  */
      code = is_min ? SMIN : SMAX;
      if (MEM_P (if_true) && MEM_P (if_false))
	if_true = force_reg (mode, if_true);
      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
    }

  emit_insn (gen_rtx_SET (dest, tmp));
  return true;
}
4213 :
4214 : /* Return true if MODE is valid for vector compare to mask register,
   Same result for conditional vector move with mask register. */
4216 : static bool
4217 14990 : ix86_valid_mask_cmp_mode (machine_mode mode)
4218 : {
4219 : /* XOP has its own vector conditional movement. */
4220 14990 : if (TARGET_XOP && !TARGET_AVX512F)
4221 : return false;
4222 :
4223 : /* HFmode only supports vcmpsh whose dest is mask register. */
4224 14984 : if (TARGET_AVX512FP16 && mode == HFmode)
4225 : return true;
4226 :
4227 : /* AVX512F is needed for mask operation. */
4228 14892 : if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
4229 : return false;
4230 :
4231 : /* AVX512BW is needed for vector QI/HImode,
4232 : AVX512VL is needed for 128/256-bit vector. */
4233 182 : machine_mode inner_mode = GET_MODE_INNER (mode);
4234 182 : int vector_size = GET_MODE_SIZE (mode);
4235 182 : if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
4236 : return false;
4237 :
4238 162 : return vector_size == 64 || TARGET_AVX512VL;
4239 : }
4240 :
4241 : /* Return true if integer mask comparison should be used. */
4242 : static bool
4243 52748 : ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
4244 : rtx op_true, rtx op_false)
4245 : {
4246 52748 : int vector_size = GET_MODE_SIZE (mode);
4247 :
4248 52748 : if (cmp_mode == HFmode)
4249 : return true;
4250 52656 : else if (vector_size < 16)
4251 : return false;
4252 46349 : else if (vector_size == 64)
4253 : return true;
4254 92582 : else if (GET_MODE_INNER (cmp_mode) == HFmode)
4255 : return true;
4256 92582 : else if (GET_MODE_INNER (cmp_mode) == BFmode)
4257 : return true;
4258 :
4259 : /* When op_true is NULL, op_false must be NULL, or vice versa. */
4260 46291 : gcc_assert (!op_true == !op_false);
4261 :
4262 : /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
4263 : vector dest is required. */
4264 46291 : if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
4265 : return false;
4266 :
4267 : /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
4268 48 : if (op_false == CONST0_RTX (mode)
4269 48 : || op_true == CONST0_RTX (mode)
4270 48 : || (INTEGRAL_MODE_P (mode)
4271 40 : && (op_true == CONSTM1_RTX (mode)
4272 40 : || op_false == CONSTM1_RTX (mode))))
4273 0 : return false;
4274 :
4275 : return true;
4276 : }
4277 :
4278 : /* Expand an SSE comparison. Return the register with the result. */
4279 :
static rtx
ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
		     rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In general case result of comparison can differ from operands' type.  */
  machine_mode cmp_mode;

  /* In AVX512F the result of comparison is an integer mask.  */
  bool maskcmp = false;
  rtx x;

  if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
    {
      /* Pick an integer mode with one bit per comparison lane,
	 QImode at minimum.  */
      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
      maskcmp = true;
      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
    }
  else
    cmp_mode = cmp_ops_mode;

  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);

  /* Vector comparisons accept a vector operand, scalar comparisons a
     nonimmediate one; force to a register otherwise.  */
  bool (*op1_predicate)(rtx, machine_mode)
    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;

  if (!op1_predicate (cmp_op1, cmp_ops_mode))
    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);

  /* Use a fresh pseudo when DEST cannot safely receive the comparison
     result directly: wrong mode for a mask result, or DEST overlaps
     one of the select operands that is still needed afterwards.  */
  if (optimize
      || (maskcmp && cmp_mode != mode)
      || (op_true && reg_overlap_mentioned_p (dest, op_true))
      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);

  if (maskcmp)
    {
      bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
      gcc_assert (ok);
      return dest;
    }

  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);

  if (cmp_mode != mode)
    {
      /* The comparison result mode differs from DEST's mode;
	 materialize it and convert.  */
      x = force_reg (cmp_ops_mode, x);
      convert_move (dest, x, false);
    }
  else
    emit_insn (gen_rtx_SET (dest, x));

  return dest;
}
4336 :
4337 : /* Emit x86 binary operand CODE in mode MODE for SSE vector
4338 : instructions that can be performed using GP registers. */
4339 :
4340 : static void
4341 7217 : ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
4342 : rtx dst, rtx src1, rtx src2)
4343 : {
4344 7217 : rtx tmp;
4345 :
4346 7217 : tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
4347 :
4348 7217 : if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
4349 7217 : && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4350 : {
4351 94 : rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
4352 94 : tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
4353 : }
4354 :
4355 7217 : emit_insn (tmp);
4356 7217 : }
4357 :
4358 : /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
4359 : operations. This is used for both scalar and vector conditional moves. */
4360 :
void
ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);
  rtx x;

  /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506.  */
  if (rtx_equal_p (op_true, op_false))
    {
      emit_move_insn (dest, op_true);
      return;
    }

  /* If we have an integer mask and FP value then we need
     to cast mask to FP mode.  */
  if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
    {
      cmp = force_reg (cmpmode, cmp);
      cmp = gen_rtx_SUBREG (mode, cmp, 0);
    }

  /* In AVX512F the result of comparison is an integer mask.  */
  if (mode != cmpmode
      && GET_MODE_CLASS (cmpmode) == MODE_INT)
    {
      gcc_assert (ix86_valid_mask_cmp_mode (mode));
      /* Using scalar/vector move with mask register.  */
      cmp = force_reg (cmpmode, cmp);
      /* Optimize for mask zero.  */
      op_true = (op_true != CONST0_RTX (mode)
		 ? force_reg (mode, op_true) : op_true);
      op_false = (op_false != CONST0_RTX (mode)
		  ? force_reg (mode, op_false) : op_false);
      if (op_true == CONST0_RTX (mode))
	{
	  /* Invert the mask so the zero ends up in the false arm.  */
	  if (cmpmode == E_DImode && !TARGET_64BIT)
	    {
	      /* No 64-bit GP NOT on 32-bit targets; use KNOTQ.  */
	      x = gen_reg_rtx (cmpmode);
	      emit_insn (gen_knotdi (x, cmp));
	    }
	  else
	    x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
	  cmp = x;
	  /* Reverse op_true op_false.  */
	  std::swap (op_true, op_false);
	}

      if (mode == HFmode)
	emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
      else
	emit_insn (gen_rtx_SET (dest,
				gen_rtx_VEC_MERGE (mode,
						   op_true, op_false, cmp)));
      return;
    }

  /* From here on CMP is a vector of all-ones/all-zeros lanes; try the
     cheap logical special cases before a full blend.  */
  if (vector_all_ones_operand (op_true, mode)
      && op_false == CONST0_RTX (mode))
    {
      /* dest = cmp ? -1 : 0 is the mask itself.  */
      emit_move_insn (dest, cmp);
      return;
    }
  else if (op_false == CONST0_RTX (mode))
    {
      /* dest = cmp ? t : 0  ==>  dest = cmp & t.  */
      x = expand_simple_binop (mode, AND, cmp, op_true,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }
  else if (op_true == CONST0_RTX (mode))
    {
      /* dest = cmp ? 0 : f  ==>  dest = ~cmp & f.  */
      op_false = force_reg (mode, op_false);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, dest, x, op_false);
      return;
    }
  else if (vector_all_ones_operand (op_true, mode))
    {
      /* dest = cmp ? -1 : f  ==>  dest = cmp | f.  */
      x = expand_simple_binop (mode, IOR, cmp, op_false,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }

  if (TARGET_XOP)
    {
      /* XOP's VPCMOV expresses the general select directly.  */
      op_true = force_reg (mode, op_true);

      if (GET_MODE_SIZE (mode) < 16
	  || !nonimmediate_operand (op_false, mode))
	op_false = force_reg (mode, op_false);

      emit_insn (gen_rtx_SET (dest,
			      gen_rtx_IF_THEN_ELSE (mode, cmp,
						    op_true, op_false)));
      return;
    }

  /* General case: pick a blend insn for MODE if one exists.  BLEND_MODE
     may differ from MODE when the blend operates bytewise.  */
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
  machine_mode blend_mode = mode;

  if (GET_MODE_SIZE (mode) < 16
      || !vector_operand (op_true, mode))
    op_true = force_reg (mode, op_true);

  op_false = force_reg (mode, op_false);

  switch (mode)
    {
    case E_V2SFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_mmx_blendvps;
      break;
    case E_V4SFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_sse4_1_blendvps;
      break;
    case E_V2DFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_sse4_1_blendvpd;
      break;
    case E_SFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_sse4_1_blendvss;
      break;
    case E_DFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_sse4_1_blendvsd;
      break;
    case E_V8QImode:
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
    case E_V2SImode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v8qi;
	  blend_mode = V8QImode;
	}
      break;
    case E_V4QImode:
    case E_V2HImode:
    case E_V2HFmode:
    case E_V2BFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v4qi;
	  blend_mode = V4QImode;
	}
      break;
    case E_V2QImode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_mmx_pblendvb_v2qi;
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	{
	  gen = gen_sse4_1_pblendvb;
	  blend_mode = V16QImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
	gen = gen_avx_blendvps256;
      break;
    case E_V4DFmode:
      if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
	gen = gen_avx_blendvpd256;
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V16BFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (TARGET_AVX2 && TARGET_SSE_MOVCC_USE_BLENDV)
	{
	  gen = gen_avx2_pblendvb;
	  blend_mode = V32QImode;
	}
      break;

    /* 512-bit modes always use a mask blend, unconditionally.  */
    case E_V64QImode:
      gen = gen_avx512bw_blendmv64qi;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_blendmv32hi;
      break;
    case E_V32HFmode:
      gen = gen_avx512bw_blendmv32hf;
      break;
    case E_V32BFmode:
      gen = gen_avx512bw_blendmv32bf;
      break;
    case E_V16SImode:
      gen = gen_avx512f_blendmv16si;
      break;
    case E_V8DImode:
      gen = gen_avx512f_blendmv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_blendmv8df;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_blendmv16sf;
      break;

    default:
      break;
    }

  if (gen != NULL)
    {
      /* Lowpart the operands when the blend works in a different mode.  */
      if (blend_mode == mode)
	x = dest;
      else
	{
	  x = gen_reg_rtx (blend_mode);
	  op_false = gen_lowpart (blend_mode, op_false);
	  op_true = gen_lowpart (blend_mode, op_true);
	  cmp = gen_lowpart (blend_mode, cmp);
	}

      emit_insn (gen (x, op_false, op_true, cmp));

      if (x != dest)
	emit_move_insn (dest, gen_lowpart (mode, x));
    }
  else
    {
      /* No blend available: dest = (t & cmp) | (f & ~cmp).  */
      rtx t2, t3;

      t2 = expand_simple_binop (mode, AND, op_true, cmp,
				NULL, 1, OPTAB_DIRECT);

      t3 = gen_reg_rtx (mode);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, t3, x, op_false);

      x = expand_simple_binop (mode, IOR, t3, t2,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
    }
}
4615 :
4616 : /* Swap, force into registers, or otherwise massage the two operands
4617 : to an sse comparison with a mask result. Thus we differ a bit from
4618 : ix86_prepare_fp_compare_args which expects to produce a flags result.
4619 :
4620 : The DEST operand exists to help determine whether to commute commutative
4621 : operators. The POP0/POP1 operands are updated in place. The new
4622 : comparison code is returned, or UNKNOWN if not implementable. */
4623 :
static enum rtx_code
ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
				  rtx *pop0, rtx *pop1)
{
  switch (code)
    {
    case LTGT:
    case UNEQ:
      /* AVX supports all the needed comparisons.  */
      if (TARGET_AVX)
	break;
      /* We have no LTGT as an operator.  We could implement it with
	 NE & ORDERED, but this requires an extra temporary.  It's
	 not clear that it's worth it.  */
      return UNKNOWN;

    case LT:
    case LE:
    case UNGT:
    case UNGE:
      /* These are supported directly.  */
      break;

    case EQ:
    case NE:
    case UNORDERED:
    case ORDERED:
      /* AVX has 3 operand comparisons, no need to swap anything.  */
      if (TARGET_AVX)
	break;
      /* For commutative operators, try to canonicalize the destination
	 operand to be first in the comparison - this helps reload to
	 avoid extra moves.  */
      if (!dest || !rtx_equal_p (dest, *pop1))
	break;
      /* Otherwise (DEST equals the second operand) fall through and
	 swap, which is harmless for these commutative codes.  */
      /* FALLTHRU */

    case GE:
    case GT:
    case UNLE:
    case UNLT:
      /* These are not supported directly before AVX, and furthermore
	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE.  Swap the
	 comparison operands to transform into something that is
	 supported.  */
      std::swap (*pop0, *pop1);
      code = swap_condition (code);
      break;

    default:
      gcc_unreachable ();
    }

  return code;
}
4679 :
4680 : /* Expand a floating-point conditional move. Return true if successful. */
4681 :
bool
ix86_expand_fp_movcc (rtx operands[])
{
  machine_mode mode = GET_MODE (operands[0]);
  enum rtx_code code = GET_CODE (operands[1]);
  rtx tmp, compare_op;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  /* BFmode comparisons are only handled for the FP comparison codes
     accepted by ix86_fp_comparison_operator; punt otherwise.  */
  if (GET_MODE (op0) == BFmode
      && !ix86_fp_comparison_operator (operands[1], VOIDmode))
    return false;

  if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
    {
      machine_mode cmode;

      /* Since we've no cmove for sse registers, don't force bad register
	 allocation just to gain access to it.  Deny movcc when the
	 comparison mode doesn't match the move mode.  */
      cmode = GET_MODE (op0);
      if (cmode == VOIDmode)
	cmode = GET_MODE (op1);
      if (cmode != mode)
	return false;

      code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
      if (code == UNKNOWN)
	return false;

      /* Prefer a native min/max when the select matches that pattern.  */
      if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
				     operands[2], operands[3]))
	return true;

      /* Otherwise compare into a mask and blend.  */
      tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
				 operands[2], operands[3]);
      ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
      return true;
    }

  /* No fcmov path for TImode, nor DImode without 64-bit support.  */
  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  /* The floating point conditional move instructions don't directly
     support conditions resulting from a signed integer comparison.  */

  compare_op = ix86_expand_compare (code, op0, op1);
  if (!fcmov_comparison_operator (compare_op, VOIDmode))
    {
      /* Materialize the condition with setcc and test the result
	 against zero, which fcmov can handle.  */
      tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (tmp, code, op0, op1);

      compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
    }

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
						operands[2], operands[3])));

  return true;
}
4747 :
4748 : /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4749 :
4750 : static int
4751 4854 : ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4752 : {
4753 4854 : switch (code)
4754 : {
4755 : case EQ:
4756 : return 0;
4757 377 : case LT:
4758 377 : case LTU:
4759 377 : return 1;
4760 212 : case LE:
4761 212 : case LEU:
4762 212 : return 2;
4763 3051 : case NE:
4764 3051 : return 4;
4765 307 : case GE:
4766 307 : case GEU:
4767 307 : return 5;
4768 498 : case GT:
4769 498 : case GTU:
4770 498 : return 6;
4771 0 : default:
4772 0 : gcc_unreachable ();
4773 : }
4774 : }
4775 :
4776 : /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
4777 :
4778 : static int
4779 1781 : ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4780 : {
4781 1781 : switch (code)
4782 : {
4783 : case EQ:
4784 : return 0x00;
4785 354 : case NE:
4786 354 : return 0x04;
4787 514 : case GT:
4788 514 : return 0x0e;
4789 88 : case LE:
4790 88 : return 0x02;
4791 53 : case GE:
4792 53 : return 0x0d;
4793 620 : case LT:
4794 620 : return 0x01;
4795 2 : case UNLE:
4796 2 : return 0x0a;
4797 2 : case UNLT:
4798 2 : return 0x09;
4799 11 : case UNGE:
4800 11 : return 0x05;
4801 44 : case UNGT:
4802 44 : return 0x06;
4803 2 : case UNEQ:
4804 2 : return 0x18;
4805 0 : case LTGT:
4806 0 : return 0x0c;
4807 2 : case ORDERED:
4808 2 : return 0x07;
4809 2 : case UNORDERED:
4810 2 : return 0x03;
4811 0 : default:
4812 0 : gcc_unreachable ();
4813 : }
4814 : }
4815 :
4816 : /* Return immediate value to be used in UNSPEC_PCMP
4817 : for comparison CODE in MODE. */
4818 :
4819 : static int
4820 6635 : ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4821 : {
4822 6635 : if (FLOAT_MODE_P (mode))
4823 1781 : return ix86_fp_cmp_code_to_pcmp_immediate (code);
4824 4854 : return ix86_int_cmp_code_to_pcmp_immediate (code);
4825 : }
4826 :
4827 : /* Expand AVX-512 vector comparison. */
4828 :
4829 : bool
4830 6635 : ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
4831 : {
4832 6635 : machine_mode mask_mode = GET_MODE (dest);
4833 6635 : machine_mode cmp_mode = GET_MODE (cmp_op0);
4834 6635 : rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4835 6635 : int unspec_code;
4836 6635 : rtx unspec;
4837 :
4838 6635 : switch (code)
4839 : {
4840 : case LEU:
4841 : case GTU:
4842 : case GEU:
4843 : case LTU:
4844 : unspec_code = UNSPEC_UNSIGNED_PCMP;
4845 : break;
4846 :
4847 6221 : default:
4848 6221 : unspec_code = UNSPEC_PCMP;
4849 : }
4850 :
4851 6635 : unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4852 : unspec_code);
4853 6635 : emit_insn (gen_rtx_SET (dest, unspec));
4854 :
4855 6635 : return true;
4856 : }
4857 :
4858 : /* Expand fp vector comparison. */
4859 :
bool
ix86_expand_fp_vec_cmp (rtx operands[])
{
  enum rtx_code code = GET_CODE (operands[1]);
  rtx cmp;

  /* Canonicalize the comparison; UNKNOWN means the code needs to be
     synthesized from two simpler comparisons (pre-AVX LTGT/UNEQ).  */
  code = ix86_prepare_sse_fp_compare_args (operands[0], code,
					   &operands[2], &operands[3]);
  if (code == UNKNOWN)
    {
      rtx temp;
      switch (GET_CODE (operands[1]))
	{
	case LTGT:
	  /* LTGT == ORDERED & NE.  */
	  temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
				     operands[3], NULL, NULL);
	  code = AND;
	  break;
	case UNEQ:
	  /* UNEQ == UNORDERED | EQ.  */
	  temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
				      operands[3], NULL, NULL);
	  cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
				     operands[3], NULL, NULL);
	  code = IOR;
	  break;
	default:
	  gcc_unreachable ();
	}
      /* Combine the two partial masks.  */
      cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
				 OPTAB_DIRECT);
    }
  else
    cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
			       NULL, NULL);

  if (operands[0] != cmp)
    emit_move_insn (operands[0], cmp);

  return true;
}
4902 :
/* Helper for ix86_expand_int_vec_cmp and ix86_expand_int_vcond.  Emit an
   integer vector comparison of COP0 and COP1 using condition CODE, with
   the result in the mode of DEST.  OP_TRUE/OP_FALSE, when non-NULL, are
   the values the caller will select between (they influence whether an
   AVX-512 mask compare is used and how the min/max trick applies).
   On return *NEGATE tells the caller the emitted comparison computes the
   inverse of CODE and must be negated.  Returns NULL when the comparison
   cannot be emitted (V2DImode without SSE4.1/4.2).  */

static rtx
ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
			 rtx op_true, rtx op_false, bool *negate)
{
  machine_mode data_mode = GET_MODE (dest);
  machine_mode mode = GET_MODE (cop0);
  rtx x;

  *negate = false;

  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
  if (TARGET_XOP
      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
      && GET_MODE_SIZE (mode) <= 16)
    ;
  /* AVX512F supports all of the comparisons
     on all 128/256/512-bit vector int types.  */
  else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
    ;
  else
    {
      /* Canonicalize the comparison to EQ, GT, GTU.  */
      switch (code)
	{
	case EQ:
	case GT:
	case GTU:
	  break;

	case LE:
	case LEU:
	  /* x <= cst can be handled as x < cst + 1 unless there is
	     wrap around in cst + 1.  */
	  if (CONST_VECTOR_P (cop1)
	      && GET_MODE_INNER (mode) != TImode)
	    {
	      unsigned int n_elts = GET_MODE_NUNITS (mode), i;
	      machine_mode eltmode = GET_MODE_INNER (mode);
	      /* First verify no element would wrap when incremented.  */
	      for (i = 0; i < n_elts; ++i)
		{
		  rtx elt = CONST_VECTOR_ELT (cop1, i);
		  if (!CONST_INT_P (elt))
		    break;
		  if (code == LE)
		    {
		      /* For LE punt if some element is signed maximum.  */
		      if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
			  == (GET_MODE_MASK (eltmode) >> 1))
			break;
		    }
		  /* For LEU punt if some element is unsigned maximum.  */
		  else if (elt == constm1_rtx)
		    break;
		}
	      if (i == n_elts)
		{
		  /* Build cst + 1 and swap operands, turning
		     x <= cst into cst + 1 > x.  */
		  rtvec v = rtvec_alloc (n_elts);
		  for (i = 0; i < n_elts; ++i)
		    RTVEC_ELT (v, i)
		      = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
				      eltmode);
		  cop1 = gen_rtx_CONST_VECTOR (mode, v);
		  std::swap (cop0, cop1);
		  code = code == LE ? GT : GTU;
		  break;
		}
	    }
	  /* FALLTHRU */
	case NE:
	  /* Otherwise emit the reverse comparison and let the caller
	     negate the result.  */
	  code = reverse_condition (code);
	  *negate = true;
	  break;

	case GE:
	case GEU:
	  /* x >= cst can be handled as x > cst - 1 unless there is
	     wrap around in cst - 1.  */
	  if (CONST_VECTOR_P (cop1)
	      && GET_MODE_INNER (mode) != TImode)
	    {
	      unsigned int n_elts = GET_MODE_NUNITS (mode), i;
	      machine_mode eltmode = GET_MODE_INNER (mode);
	      /* First verify no element would wrap when decremented.  */
	      for (i = 0; i < n_elts; ++i)
		{
		  rtx elt = CONST_VECTOR_ELT (cop1, i);
		  if (!CONST_INT_P (elt))
		    break;
		  if (code == GE)
		    {
		      /* For GE punt if some element is signed minimum.  */
		      if (INTVAL (elt) < 0
			  && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
			      == 0))
			break;
		    }
		  /* For GEU punt if some element is zero.  */
		  else if (elt == const0_rtx)
		    break;
		}
	      if (i == n_elts)
		{
		  /* Build cst - 1, turning x >= cst into x > cst - 1.  */
		  rtvec v = rtvec_alloc (n_elts);
		  for (i = 0; i < n_elts; ++i)
		    RTVEC_ELT (v, i)
		      = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
				      eltmode);
		  cop1 = gen_rtx_CONST_VECTOR (mode, v);
		  code = code == GE ? GT : GTU;
		  break;
		}
	    }
	  /* Otherwise reverse to LT/LTU (negating the result) and fall
	     through to have LT/LTU swapped into GT/GTU.  */
	  code = reverse_condition (code);
	  *negate = true;
	  /* FALLTHRU */

	case LT:
	case LTU:
	  std::swap (cop0, cop1);
	  code = swap_condition (code);
	  break;

	default:
	  gcc_unreachable ();
	}

      /* Only SSE4.1/SSE4.2 supports V2DImode.  */
      if (mode == V2DImode)
	{
	  switch (code)
	    {
	    case EQ:
	      /* SSE4.1 supports EQ.  */
	      if (!TARGET_SSE4_1)
		return NULL;
	      break;

	    case GT:
	    case GTU:
	      /* SSE4.2 supports GT/GTU.  */
	      if (!TARGET_SSE4_2)
		return NULL;
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}

      /* Load a constant operand into a register.  */
      if (CONST_VECTOR_P (cop0))
	cop0 = force_reg (mode, cop0);
      else if (CONST_VECTOR_P (cop1))
	cop1 = force_reg (mode, cop1);

      rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
      rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
      if (*negate)
	std::swap (optrue, opfalse);

      /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
	 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
	 min (x, y) == x).  While we add one instruction (the minimum),
	 we remove the need for two instructions in the negation, as the
	 result is done this way.
	 When using masks, do it for SI/DImode element types, as it is shorter
	 than the two subtractions.  */
      if ((code != EQ
	   && GET_MODE_SIZE (mode) != 64
	   && vector_all_ones_operand (opfalse, data_mode)
	   && optrue == CONST0_RTX (data_mode))
	  || (code == GTU
	      && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
	      /* Don't do it if not using integer masks and we'd end up with
		 the right values in the registers though.  */
	      && (GET_MODE_SIZE (mode) == 64
		  || !vector_all_ones_operand (optrue, data_mode)
		  || opfalse != CONST0_RTX (data_mode))))
	{
	  /* Pick the umin/smin generator available for MODE, if any.  */
	  rtx (*gen) (rtx, rtx, rtx) = NULL;

	  switch (mode)
	    {
	    case E_V16SImode:
	      gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
	      break;
	    case E_V8DImode:
	      gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
	      cop0 = force_reg (mode, cop0);
	      cop1 = force_reg (mode, cop1);
	      break;
	    case E_V32QImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
	      break;
	    case E_V16HImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
	      break;
	    case E_V8SImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
	      break;
	    case E_V4DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    case E_V16QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv16qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv16qi3;
	      break;
	    case E_V8QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv8qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv8qi3;
	      break;
	    case E_V4QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv4qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv4qi3;
	      break;
	    case E_V2QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv2qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv2qi3;
	      break;
	    case E_V8HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv8hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv8hi3;
	      break;
	    case E_V4HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv4hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv4hi3;
	      break;
	    case E_V2HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv2hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv2hi3;
	      break;
	    case E_V4SImode:
	      if (TARGET_SSE4_1)
		gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
	      break;
	    case E_V2SImode:
	      if (TARGET_SSE4_1)
		gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
	      break;
	    case E_V2DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    default:
	      break;
	    }

	  if (gen)
	    {
	      rtx tem = gen_reg_rtx (mode);
	      if (!vector_operand (cop0, mode))
		cop0 = force_reg (mode, cop0);
	      if (!vector_operand (cop1, mode))
		cop1 = force_reg (mode, cop1);
	      /* tem = min (cop0, cop1); continue as tem == cop1, with the
		 sense of the negation flipped.  */
	      *negate = !*negate;
	      emit_insn (gen (tem, cop0, cop1));
	      cop1 = tem;
	      code = EQ;
	    }
	}

      /* Unsigned parallel compare is not supported by the hardware.
	 Play some tricks to turn this into a signed comparison
	 against 0.  */
      if (code == GTU)
	{
	  cop0 = force_reg (mode, cop0);

	  switch (mode)
	    {
	    case E_V16SImode:
	    case E_V8DImode:
	    case E_V8SImode:
	    case E_V4DImode:
	    case E_V4SImode:
	    case E_V2SImode:
	    case E_V2DImode:
	      {
		rtx t1, t2, mask;

		/* Subtract (-(INT MAX) - 1) from both operands to make
		   them signed.  */
		mask = ix86_build_signbit_mask (mode, true, false);
		t1 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t1, cop0, mask));

		t2 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t2, cop1, mask));

		cop0 = t1;
		cop1 = t2;
		code = GT;
	      }
	      break;

	    case E_V64QImode:
	    case E_V32HImode:
	    case E_V32QImode:
	    case E_V16HImode:
	    case E_V16QImode:
	    case E_V8QImode:
	    case E_V4QImode:
	    case E_V2QImode:
	    case E_V8HImode:
	    case E_V4HImode:
	    case E_V2HImode:
	      /* Perform a parallel unsigned saturating subtraction.
		 cop0 > cop1 iff cop0 - cop1 (saturating) is nonzero, so
		 emit EQ against zero and flip the negation sense.  */
	      x = gen_reg_rtx (mode);
	      emit_insn (gen_rtx_SET
			 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
	      cop0 = x;
	      cop1 = CONST0_RTX (mode);
	      code = EQ;
	      *negate = !*negate;
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}
    }

  if (*negate)
    std::swap (op_true, op_false);

  if (CONST_VECTOR_P (cop1))
    cop1 = force_reg (mode, cop1);

  /* Allow the comparison to be done in one mode, but the movcc to
     happen in another mode.  */
  if (data_mode == mode)
    x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
  else
    {
      gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
			       op_true, op_false);
      if (GET_MODE (x) == mode)
	x = gen_lowpart (data_mode, x);
    }

  return x;
}
5270 :
5271 : /* Expand integer vector comparison. */
5272 :
5273 : bool
5274 10358 : ix86_expand_int_vec_cmp (rtx operands[])
5275 : {
5276 10358 : rtx_code code = GET_CODE (operands[1]);
5277 10358 : bool negate = false;
5278 10358 : rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
5279 : operands[3], NULL, NULL, &negate);
5280 :
5281 10358 : if (!cmp)
5282 : return false;
5283 :
5284 10358 : if (negate)
5285 : {
5286 3695 : if (TARGET_AVX512F && GET_MODE_SIZE (GET_MODE (cmp)) >= 16)
5287 106 : cmp = gen_rtx_XOR (GET_MODE (cmp), cmp, CONSTM1_RTX (GET_MODE (cmp)));
5288 : else
5289 : {
5290 6896 : cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
5291 3448 : CONST0_RTX (GET_MODE (cmp)),
5292 : NULL, NULL, &negate);
5293 3448 : gcc_assert (!negate);
5294 : }
5295 : }
5296 :
5297 10358 : if (operands[0] != cmp)
5298 10064 : emit_move_insn (operands[0], cmp);
5299 :
5300 : return true;
5301 : }
5302 :
5303 : /* Expand a floating-point vector conditional move; a vcond operation
5304 : rather than a movcc operation. */
5305 :
5306 : bool
5307 0 : ix86_expand_fp_vcond (rtx operands[])
5308 : {
5309 0 : enum rtx_code code = GET_CODE (operands[3]);
5310 0 : rtx cmp;
5311 :
5312 0 : code = ix86_prepare_sse_fp_compare_args (operands[0], code,
5313 : &operands[4], &operands[5]);
5314 0 : if (code == UNKNOWN)
5315 : {
5316 0 : rtx temp;
5317 0 : switch (GET_CODE (operands[3]))
5318 : {
5319 0 : case LTGT:
5320 0 : temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
5321 : operands[5], operands[0], operands[0]);
5322 0 : cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
5323 : operands[5], operands[1], operands[2]);
5324 0 : code = AND;
5325 0 : break;
5326 0 : case UNEQ:
5327 0 : temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
5328 : operands[5], operands[0], operands[0]);
5329 0 : cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
5330 : operands[5], operands[1], operands[2]);
5331 0 : code = IOR;
5332 0 : break;
5333 0 : default:
5334 0 : gcc_unreachable ();
5335 : }
5336 0 : cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
5337 : OPTAB_DIRECT);
5338 0 : ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
5339 0 : return true;
5340 : }
5341 :
5342 0 : if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
5343 : operands[5], operands[1], operands[2]))
5344 : return true;
5345 :
5346 0 : cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
5347 : operands[1], operands[2]);
5348 0 : ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
5349 0 : return true;
5350 : }
5351 :
/* Expand a signed/unsigned integral vector conditional move.
   operands[3] is the comparison, operands[4]/operands[5] its arguments,
   operands[1]/operands[2] the values selected for true/false.
   Returns false if the operation cannot be expanded.  */

bool
ix86_expand_int_vcond (rtx operands[])
{
  machine_mode data_mode = GET_MODE (operands[0]);
  machine_mode mode = GET_MODE (operands[4]);
  enum rtx_code code = GET_CODE (operands[3]);
  bool negate = false;
  rtx x, cop0, cop1;

  cop0 = operands[4];
  cop1 = operands[5];

  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
     and x < 0 ? 1 : 0 into (unsigned) x >> 31.
     (For GE the arms are swapped, hence the 1 + (code == LT)
     operand indexing below: the arm that must be zero is operands[2]
     for LT and operands[1] for GE.)  */
  if ((code == LT || code == GE)
      && data_mode == mode
      && cop1 == CONST0_RTX (mode)
      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
      && GET_MODE_UNIT_SIZE (data_mode) > 1
      && GET_MODE_UNIT_SIZE (data_mode) <= 8
      && (GET_MODE_SIZE (data_mode) == 16
	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
    {
      /* NEGOP is the arm selected when x < 0; the other arm is zero.  */
      rtx negop = operands[2 - (code == LT)];
      int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
      if (negop == CONST1_RTX (data_mode))
	{
	  /* x < 0 ? 1 : 0 — a logical shift moves the sign bit to bit 0.  */
	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 1, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
      else if (GET_MODE_INNER (data_mode) != DImode
	       && vector_all_ones_operand (negop, data_mode))
	{
	  /* x < 0 ? -1 : 0 — an arithmetic shift replicates the sign bit
	     across the element.  */
	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 0, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
    }

  /* Force operands into forms the expanders accept.  */
  if (!nonimmediate_operand (cop1, mode))
    cop1 = force_reg (mode, cop1);
  if (!general_operand (operands[1], data_mode))
    operands[1] = force_reg (data_mode, operands[1]);
  if (!general_operand (operands[2], data_mode))
    operands[2] = force_reg (data_mode, operands[2]);

  x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
			       operands[1], operands[2], &negate);

  if (!x)
    return false;

  /* If the comparison was emitted negated, selecting the swapped arms
     restores the requested semantics.  */
  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
			 operands[2-negate]);
  return true;
}
5415 :
/* Try to expand a vector permutation using an AVX-512 vpermt2var
   instruction.  Called from both the constant expander (arguments in D,
   TARGET/MASK/OP0/OP1 unused) and the non-constant expander (arguments in
   TARGET/MASK/OP0/OP1, D is NULL).  Returns false when no vpermt2var
   pattern is available for the operand mode; returns true without
   emitting anything when D requests a dry run (d->testing_p).  */

static bool
ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
			      struct expand_vec_perm_d *d)
{
  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  machine_mode mode = GET_MODE (d ? d->op0 : op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

  /* Pick the vpermt2var generator for MODE, gated on the ISA extension
     that provides it.  For float modes the mask uses the same-sized
     integer vector mode.  */
  switch (mode)
    {
    case E_V16QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv16qi3;
      break;
    case E_V32QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv32qi3;
      break;
    case E_V64QImode:
      if (TARGET_AVX512VBMI)
	gen = gen_avx512bw_vpermt2varv64qi3;
      break;
    case E_V8HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv8hi3;
      break;
    case E_V16HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv16hi3;
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
	gen = gen_avx512bw_vpermt2varv32hi3;
      break;
    case E_V4SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4si3;
      break;
    case E_V8SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv8si3;
      break;
    case E_V16SImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv16si3;
      break;
    case E_V4SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4sf3;
	  maskmode = V4SImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv8sf3;
	  maskmode = V8SImode;
	}
      break;
    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv16sf3;
	  maskmode = V16SImode;
	}
      break;
    case E_V2DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv2di3;
      break;
    case E_V4DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4di3;
      break;
    case E_V8DImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv8di3;
      break;
    case E_V2DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv2df3;
	  maskmode = V2DImode;
	}
      break;
    case E_V4DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4df3;
	  maskmode = V4DImode;
	}
      break;
    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv8df3;
	  maskmode = V8DImode;
	}
      break;
    default:
      break;
    }

  if (gen == NULL)
    return false;

  if (d && d->testing_p)
    return true;

  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  if (d)
    {
      /* Materialize the constant permutation D->perm as a const vector
	 mask in MASKMODE.  */
      rtx vec[64];
      target = d->target;
      op0 = d->op0;
      op1 = d->op1;
      for (int i = 0; i < d->nelt; ++i)
	vec[i] = GEN_INT (d->perm[i]);
      mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
    }

  emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
  return true;
}
5544 :
5545 : /* Expand a variable vector permutation. */
5546 :
5547 : void
5548 10 : ix86_expand_vec_perm (rtx operands[])
5549 : {
5550 10 : rtx target = operands[0];
5551 10 : rtx op0 = operands[1];
5552 10 : rtx op1 = operands[2];
5553 10 : rtx mask = operands[3];
5554 10 : rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
5555 10 : machine_mode mode = GET_MODE (op0);
5556 10 : machine_mode maskmode = GET_MODE (mask);
5557 10 : int w, e, i;
5558 10 : bool one_operand_shuffle = rtx_equal_p (op0, op1);
5559 :
5560 : /* Number of elements in the vector. */
5561 10 : w = GET_MODE_NUNITS (mode);
5562 10 : e = GET_MODE_UNIT_SIZE (mode);
5563 10 : gcc_assert (w <= 64);
5564 :
5565 : /* For HF mode vector, convert it to HI using subreg. */
5566 20 : if (GET_MODE_INNER (mode) == HFmode)
5567 : {
5568 6 : machine_mode orig_mode = mode;
5569 6 : mode = mode_for_vector (HImode, w).require ();
5570 6 : target = lowpart_subreg (mode, target, orig_mode);
5571 6 : op0 = lowpart_subreg (mode, op0, orig_mode);
5572 6 : op1 = lowpart_subreg (mode, op1, orig_mode);
5573 : }
5574 :
5575 10 : if (TARGET_AVX512F && one_operand_shuffle)
5576 : {
5577 5 : rtx (*gen) (rtx, rtx, rtx) = NULL;
5578 5 : switch (mode)
5579 : {
5580 : case E_V16SImode:
5581 : gen =gen_avx512f_permvarv16si;
5582 : break;
5583 0 : case E_V16SFmode:
5584 0 : gen = gen_avx512f_permvarv16sf;
5585 0 : break;
5586 0 : case E_V8DImode:
5587 0 : gen = gen_avx512f_permvarv8di;
5588 0 : break;
5589 0 : case E_V8DFmode:
5590 0 : gen = gen_avx512f_permvarv8df;
5591 0 : break;
5592 : default:
5593 : break;
5594 : }
5595 0 : if (gen != NULL)
5596 : {
5597 0 : emit_insn (gen (target, op0, mask));
5598 8 : return;
5599 : }
5600 : }
5601 :
5602 10 : if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
5603 : return;
5604 :
5605 2 : if (TARGET_AVX2)
5606 : {
5607 1 : if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
5608 : {
5609 : /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5610 : an constant shuffle operand. With a tiny bit of effort we can
5611 : use VPERMD instead. A re-interpretation stall for V4DFmode is
5612 : unfortunate but there's no avoiding it.
5613 : Similarly for V16HImode we don't have instructions for variable
5614 : shuffling, while for V32QImode we can use after preparing suitable
5615 : masks vpshufb; vpshufb; vpermq; vpor. */
5616 :
5617 : if (mode == V16HImode)
5618 : {
5619 : maskmode = mode = V32QImode;
5620 : w = 32;
5621 : e = 1;
5622 : }
5623 : else
5624 : {
5625 : maskmode = mode = V8SImode;
5626 : w = 8;
5627 : e = 4;
5628 : }
5629 0 : t1 = gen_reg_rtx (maskmode);
5630 :
5631 : /* Replicate the low bits of the V4DImode mask into V8SImode:
5632 : mask = { A B C D }
5633 : t1 = { A A B B C C D D }. */
5634 0 : for (i = 0; i < w / 2; ++i)
5635 0 : vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
5636 0 : vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5637 0 : vt = force_reg (maskmode, vt);
5638 0 : mask = gen_lowpart (maskmode, mask);
5639 0 : if (maskmode == V8SImode)
5640 0 : emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
5641 : else
5642 0 : emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
5643 :
5644 : /* Multiply the shuffle indicies by two. */
5645 0 : t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
5646 : OPTAB_DIRECT);
5647 :
5648 : /* Add one to the odd shuffle indicies:
5649 : t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
5650 0 : for (i = 0; i < w / 2; ++i)
5651 : {
5652 0 : vec[i * 2] = const0_rtx;
5653 0 : vec[i * 2 + 1] = const1_rtx;
5654 : }
5655 0 : vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5656 0 : vt = validize_mem (force_const_mem (maskmode, vt));
5657 0 : t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
5658 : OPTAB_DIRECT);
5659 :
5660 : /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5661 0 : operands[3] = mask = t1;
5662 0 : target = gen_reg_rtx (mode);
5663 0 : op0 = gen_lowpart (mode, op0);
5664 0 : op1 = gen_lowpart (mode, op1);
5665 : }
5666 :
5667 1 : switch (mode)
5668 : {
5669 1 : case E_V8SImode:
5670 : /* The VPERMD and VPERMPS instructions already properly ignore
5671 : the high bits of the shuffle elements. No need for us to
5672 : perform an AND ourselves. */
5673 1 : if (one_operand_shuffle)
5674 : {
5675 0 : emit_insn (gen_avx2_permvarv8si (target, op0, mask));
5676 0 : if (target != operands[0])
5677 0 : emit_move_insn (operands[0],
5678 0 : gen_lowpart (GET_MODE (operands[0]), target));
5679 : }
5680 : else
5681 : {
5682 1 : t1 = gen_reg_rtx (V8SImode);
5683 1 : t2 = gen_reg_rtx (V8SImode);
5684 1 : emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
5685 1 : emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
5686 1 : goto merge_two;
5687 : }
5688 0 : return;
5689 :
5690 0 : case E_V8SFmode:
5691 0 : mask = gen_lowpart (V8SImode, mask);
5692 0 : if (one_operand_shuffle)
5693 0 : emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
5694 : else
5695 : {
5696 0 : t1 = gen_reg_rtx (V8SFmode);
5697 0 : t2 = gen_reg_rtx (V8SFmode);
5698 0 : emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
5699 0 : emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
5700 0 : goto merge_two;
5701 : }
5702 0 : return;
5703 :
5704 0 : case E_V4SImode:
5705 : /* By combining the two 128-bit input vectors into one 256-bit
5706 : input vector, we can use VPERMD and VPERMPS for the full
5707 : two-operand shuffle. */
5708 0 : t1 = gen_reg_rtx (V8SImode);
5709 0 : t2 = gen_reg_rtx (V8SImode);
5710 0 : emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
5711 0 : emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5712 0 : emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
5713 0 : emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
5714 0 : return;
5715 :
5716 0 : case E_V4SFmode:
5717 0 : t1 = gen_reg_rtx (V8SFmode);
5718 0 : t2 = gen_reg_rtx (V8SImode);
5719 0 : mask = gen_lowpart (V4SImode, mask);
5720 0 : emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5721 0 : emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5722 0 : emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5723 0 : emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5724 0 : return;
5725 :
5726 0 : case E_V32QImode:
5727 0 : t1 = gen_reg_rtx (V32QImode);
5728 0 : t2 = gen_reg_rtx (V32QImode);
5729 0 : t3 = gen_reg_rtx (V32QImode);
5730 0 : vt2 = GEN_INT (-128);
5731 0 : vt = gen_const_vec_duplicate (V32QImode, vt2);
5732 0 : vt = force_reg (V32QImode, vt);
5733 0 : for (i = 0; i < 32; i++)
5734 0 : vec[i] = i < 16 ? vt2 : const0_rtx;
5735 0 : vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5736 0 : vt2 = force_reg (V32QImode, vt2);
5737 : /* From mask create two adjusted masks, which contain the same
5738 : bits as mask in the low 7 bits of each vector element.
5739 : The first mask will have the most significant bit clear
5740 : if it requests element from the same 128-bit lane
5741 : and MSB set if it requests element from the other 128-bit lane.
5742 : The second mask will have the opposite values of the MSB,
5743 : and additionally will have its 128-bit lanes swapped.
5744 : E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5745 : t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5746 : t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5747 : stands for other 12 bytes. */
5748 : /* The bit whether element is from the same lane or the other
5749 : lane is bit 4, so shift it up by 3 to the MSB position. */
5750 0 : t5 = gen_reg_rtx (V4DImode);
5751 0 : emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5752 : GEN_INT (3)));
5753 : /* Clear MSB bits from the mask just in case it had them set. */
5754 0 : emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5755 : /* After this t1 will have MSB set for elements from other lane. */
5756 0 : emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5757 : /* Clear bits other than MSB. */
5758 0 : emit_insn (gen_andv32qi3 (t1, t1, vt));
5759 : /* Or in the lower bits from mask into t3. */
5760 0 : emit_insn (gen_iorv32qi3 (t3, t1, t2));
5761 : /* And invert MSB bits in t1, so MSB is set for elements from the same
5762 : lane. */
5763 0 : emit_insn (gen_xorv32qi3 (t1, t1, vt));
5764 : /* Swap 128-bit lanes in t3. */
5765 0 : t6 = gen_reg_rtx (V4DImode);
5766 0 : emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5767 : const2_rtx, GEN_INT (3),
5768 : const0_rtx, const1_rtx));
5769 : /* And or in the lower bits from mask into t1. */
5770 0 : emit_insn (gen_iorv32qi3 (t1, t1, t2));
5771 0 : if (one_operand_shuffle)
5772 : {
5773 : /* Each of these shuffles will put 0s in places where
5774 : element from the other 128-bit lane is needed, otherwise
5775 : will shuffle in the requested value. */
5776 0 : emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5777 0 : gen_lowpart (V32QImode, t6)));
5778 0 : emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5779 : /* For t3 the 128-bit lanes are swapped again. */
5780 0 : t7 = gen_reg_rtx (V4DImode);
5781 0 : emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5782 : const2_rtx, GEN_INT (3),
5783 : const0_rtx, const1_rtx));
5784 : /* And oring both together leads to the result. */
5785 0 : emit_insn (gen_iorv32qi3 (target, t1,
5786 0 : gen_lowpart (V32QImode, t7)));
5787 0 : if (target != operands[0])
5788 0 : emit_move_insn (operands[0],
5789 0 : gen_lowpart (GET_MODE (operands[0]), target));
5790 0 : return;
5791 : }
5792 :
5793 0 : t4 = gen_reg_rtx (V32QImode);
5794 : /* Similarly to the above one_operand_shuffle code,
5795 : just for repeated twice for each operand. merge_two:
5796 : code will merge the two results together. */
5797 0 : emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5798 0 : gen_lowpart (V32QImode, t6)));
5799 0 : emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5800 0 : gen_lowpart (V32QImode, t6)));
5801 0 : emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5802 0 : emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5803 0 : t7 = gen_reg_rtx (V4DImode);
5804 0 : emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5805 : const2_rtx, GEN_INT (3),
5806 : const0_rtx, const1_rtx));
5807 0 : t8 = gen_reg_rtx (V4DImode);
5808 0 : emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5809 : const2_rtx, GEN_INT (3),
5810 : const0_rtx, const1_rtx));
5811 0 : emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5812 0 : emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5813 0 : t1 = t4;
5814 0 : t2 = t3;
5815 0 : goto merge_two;
5816 :
5817 0 : default:
5818 0 : gcc_assert (GET_MODE_SIZE (mode) <= 16);
5819 : break;
5820 : }
5821 : }
5822 :
5823 1 : if (TARGET_XOP)
5824 : {
5825 : /* The XOP VPPERM insn supports three inputs. By ignoring the
5826 : one_operand_shuffle special case, we avoid creating another
5827 : set of constant vectors in memory. */
5828 0 : one_operand_shuffle = false;
5829 :
5830 : /* mask = mask & {2*w-1, ...} */
5831 0 : vt = GEN_INT (2*w - 1);
5832 : }
5833 : else
5834 : {
5835 : /* mask = mask & {w-1, ...} */
5836 1 : vt = GEN_INT (w - 1);
5837 : }
5838 :
5839 1 : vt = gen_const_vec_duplicate (maskmode, vt);
5840 1 : mask = expand_simple_binop (maskmode, AND, mask, vt,
5841 : NULL_RTX, 0, OPTAB_DIRECT);
5842 :
5843 : /* For non-QImode operations, convert the word permutation control
5844 : into a byte permutation control. */
5845 1 : if (mode != V16QImode)
5846 : {
5847 1 : mask = expand_simple_binop (maskmode, ASHIFT, mask,
5848 2 : GEN_INT (exact_log2 (e)),
5849 : NULL_RTX, 0, OPTAB_DIRECT);
5850 :
5851 : /* Convert mask to vector of chars. */
5852 1 : mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5853 :
5854 : /* Replicate each of the input bytes into byte positions:
5855 : (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5856 : (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5857 : (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5858 18 : for (i = 0; i < 16; ++i)
5859 16 : vec[i] = GEN_INT (i/e * e);
5860 1 : vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5861 1 : vt = validize_mem (force_const_mem (V16QImode, vt));
5862 1 : if (TARGET_XOP)
5863 0 : emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5864 : else
5865 1 : emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5866 :
5867 : /* Convert it into the byte positions by doing
5868 : mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5869 17 : for (i = 0; i < 16; ++i)
5870 16 : vec[i] = GEN_INT (i % e);
5871 1 : vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5872 1 : vt = validize_mem (force_const_mem (V16QImode, vt));
5873 1 : emit_insn (gen_addv16qi3 (mask, mask, vt));
5874 : }
5875 :
5876 : /* The actual shuffle operations all operate on V16QImode. */
5877 1 : op0 = gen_lowpart (V16QImode, op0);
5878 1 : op1 = gen_lowpart (V16QImode, op1);
5879 :
5880 1 : if (TARGET_XOP)
5881 : {
5882 0 : if (GET_MODE (target) != V16QImode)
5883 0 : target = gen_reg_rtx (V16QImode);
5884 0 : emit_insn (gen_xop_pperm (target, op0, op1, mask));
5885 0 : if (target != operands[0])
5886 0 : emit_move_insn (operands[0],
5887 0 : gen_lowpart (GET_MODE (operands[0]), target));
5888 : }
5889 1 : else if (one_operand_shuffle)
5890 : {
5891 1 : if (GET_MODE (target) != V16QImode)
5892 1 : target = gen_reg_rtx (V16QImode);
5893 1 : emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5894 1 : if (target != operands[0])
5895 1 : emit_move_insn (operands[0],
5896 1 : gen_lowpart (GET_MODE (operands[0]), target));
5897 : }
5898 : else
5899 : {
5900 0 : rtx xops[6];
5901 0 : bool ok;
5902 :
5903 : /* Shuffle the two input vectors independently. */
5904 0 : t1 = gen_reg_rtx (V16QImode);
5905 0 : t2 = gen_reg_rtx (V16QImode);
5906 0 : emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5907 0 : emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5908 :
5909 1 : merge_two:
5910 : /* Then merge them together. The key is whether any given control
5911 : element contained a bit set that indicates the second word. */
5912 1 : mask = operands[3];
5913 1 : vt = GEN_INT (w);
5914 1 : if (maskmode == V2DImode && !TARGET_SSE4_1)
5915 : {
5916 : /* Without SSE4.1, we don't have V2DImode EQ. Perform one
5917 : more shuffle to convert the V2DI input mask into a V4SI
5918 : input mask. At which point the masking that expand_int_vcond
5919 : will work as desired. */
5920 0 : rtx t3 = gen_reg_rtx (V4SImode);
5921 0 : emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5922 : const0_rtx, const0_rtx,
5923 : const2_rtx, const2_rtx));
5924 0 : mask = t3;
5925 0 : maskmode = V4SImode;
5926 0 : e = w = 4;
5927 : }
5928 :
5929 1 : vt = gen_const_vec_duplicate (maskmode, vt);
5930 1 : vt = force_reg (maskmode, vt);
5931 1 : mask = expand_simple_binop (maskmode, AND, mask, vt,
5932 : NULL_RTX, 0, OPTAB_DIRECT);
5933 :
5934 1 : if (GET_MODE (target) != mode)
5935 0 : target = gen_reg_rtx (mode);
5936 1 : xops[0] = target;
5937 1 : xops[1] = gen_lowpart (mode, t2);
5938 1 : xops[2] = gen_lowpart (mode, t1);
5939 1 : xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5940 1 : xops[4] = mask;
5941 1 : xops[5] = vt;
5942 1 : ok = ix86_expand_int_vcond (xops);
5943 1 : gcc_assert (ok);
5944 1 : if (target != operands[0])
5945 0 : emit_move_insn (operands[0],
5946 0 : gen_lowpart (GET_MODE (operands[0]), target));
5947 : }
5948 : }
5949 :
5950 : /* Extend SRC into next wider integer vector type. UNSIGNED_P is
5951 : true if we should do zero extension, else sign extension. */
5952 :
5953 : void
5954 343 : ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
5955 : {
5956 343 : machine_mode imode = GET_MODE (src);
5957 343 : rtx ops[3];
5958 :
5959 343 : switch (imode)
5960 : {
5961 343 : case E_V8QImode:
5962 343 : case E_V4QImode:
5963 343 : case E_V2QImode:
5964 343 : case E_V4HImode:
5965 343 : case E_V2HImode:
5966 343 : case E_V2SImode:
5967 343 : break;
5968 0 : default:
5969 0 : gcc_unreachable ();
5970 : }
5971 :
5972 343 : ops[0] = dest;
5973 :
5974 343 : ops[1] = force_reg (imode, src);
5975 :
5976 343 : if (unsigned_p)
5977 97 : ops[2] = force_reg (imode, CONST0_RTX (imode));
5978 : else
5979 246 : ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5980 : ops[1], pc_rtx, pc_rtx);
5981 :
5982 343 : ix86_split_mmx_punpck (ops, false);
5983 343 : }
5984 :
/* Unpack SRC into the next wider integer vector type.  UNSIGNED_P is
   true if we should do zero extension, else sign extension.  HIGH_P is
   true if we want the N/2 high elements, else the low elements.

   With SSE4.1 (and wider AVX2/AVX-512 variants) this uses the native
   sign/zero vector-extension patterns, after first moving the requested
   half of SRC to the bottom of the register (by subvector extraction
   for 32/64-byte modes, or by a logical right shift for narrower ones).
   Without SSE4.1, it falls back to interleaving SRC with either a zero
   vector or its sign mask.  */

void
ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
{
  machine_mode imode = GET_MODE (src);
  rtx tmp;

  if (TARGET_SSE4_1)
    {
      rtx (*unpack)(rtx, rtx);
      rtx (*extract)(rtx, rtx) = NULL;
      machine_mode halfmode = BLKmode;

      /* Select the extension pattern for this mode, and for 256/512-bit
	 modes also the hi/lo subvector extraction and its result mode.  */
      switch (imode)
	{
	case E_V64QImode:
	  if (unsigned_p)
	    unpack = gen_avx512bw_zero_extendv32qiv32hi2;
	  else
	    unpack = gen_avx512bw_sign_extendv32qiv32hi2;
	  halfmode = V32QImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
	  break;
	case E_V32QImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv16qiv16hi2;
	  else
	    unpack = gen_avx2_sign_extendv16qiv16hi2;
	  halfmode = V16QImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
	  break;
	case E_V32HImode:
	  if (unsigned_p)
	    unpack = gen_avx512f_zero_extendv16hiv16si2;
	  else
	    unpack = gen_avx512f_sign_extendv16hiv16si2;
	  halfmode = V16HImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
	  break;
	case E_V16HImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv8hiv8si2;
	  else
	    unpack = gen_avx2_sign_extendv8hiv8si2;
	  halfmode = V8HImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
	  break;
	case E_V16SImode:
	  if (unsigned_p)
	    unpack = gen_avx512f_zero_extendv8siv8di2;
	  else
	    unpack = gen_avx512f_sign_extendv8siv8di2;
	  halfmode = V8SImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
	  break;
	case E_V8SImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv4siv4di2;
	  else
	    unpack = gen_avx2_sign_extendv4siv4di2;
	  halfmode = V4SImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
	  break;
	case E_V16QImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
	  else
	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
	  break;
	case E_V8HImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
	  else
	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
	  break;
	case E_V4SImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv2siv2di2;
	  else
	    unpack = gen_sse4_1_sign_extendv2siv2di2;
	  break;
	case E_V8QImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv4qiv4hi2;
	  else
	    unpack = gen_sse4_1_sign_extendv4qiv4hi2;
	  break;
	case E_V4HImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv2hiv2si2;
	  else
	    unpack = gen_sse4_1_sign_extendv2hiv2si2;
	  break;
	case E_V4QImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv2qiv2hi2;
	  else
	    unpack = gen_sse4_1_sign_extendv2qiv2hi2;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (GET_MODE_SIZE (imode) >= 32)
	{
	  /* 256/512-bit source: extract the requested hi/lo half as a
	     vector of half the size, then extend that.  */
	  tmp = gen_reg_rtx (halfmode);
	  emit_insn (extract (tmp, src));
	}
      else if (high_p)
	{
	  /* 128-bit or narrower source: the extension patterns consume
	     the low half, so shift the high half down first.  */
	  switch (GET_MODE_SIZE (imode))
	    {
	    case 16:
	      /* Shift higher 8 bytes to lower 8 bytes.  */
	      tmp = gen_reg_rtx (V1TImode);
	      emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
					     GEN_INT (64)));
	      break;
	    case 8:
	      /* Shift higher 4 bytes to lower 4 bytes.  */
	      tmp = gen_reg_rtx (V1DImode);
	      emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
					    GEN_INT (32)));
	      break;
	    case 4:
	      /* Shift higher 2 bytes to lower 2 bytes.  */
	      tmp = gen_reg_rtx (V1SImode);
	      emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
					    GEN_INT (16)));
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  tmp = gen_lowpart (imode, tmp);
	}
      else
	tmp = src;

      emit_insn (unpack (dest, tmp));
    }
  else
    {
      /* Pre-SSE4.1 fallback: pick the interleave pattern for the
	 requested half.  */
      rtx (*unpack)(rtx, rtx, rtx);

      switch (imode)
	{
	case E_V16QImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv16qi;
	  else
	    unpack = gen_vec_interleave_lowv16qi;
	  break;
	case E_V8HImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv8hi;
	  else
	    unpack = gen_vec_interleave_lowv8hi;
	  break;
	case E_V4SImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv4si;
	  else
	    unpack = gen_vec_interleave_lowv4si;
	  break;
	case E_V8QImode:
	  if (high_p)
	    unpack = gen_mmx_punpckhbw;
	  else
	    unpack = gen_mmx_punpcklbw;
	  break;
	case E_V4HImode:
	  if (high_p)
	    unpack = gen_mmx_punpckhwd;
	  else
	    unpack = gen_mmx_punpcklwd;
	  break;
	case E_V4QImode:
	  if (high_p)
	    unpack = gen_mmx_punpckhbw_low;
	  else
	    unpack = gen_mmx_punpcklbw_low;
	  break;
	default:
	  gcc_unreachable ();
	}

      /* The second interleave operand supplies the upper half of each
	 extended element: zero for zero extension, otherwise a mask of
	 the sign bits computed by comparing against zero.  */
      if (unsigned_p)
	tmp = force_reg (imode, CONST0_RTX (imode));
      else
	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
				   src, pc_rtx, pc_rtx);

      rtx tmp2 = gen_reg_rtx (imode);
      emit_insn (unpack (tmp2, src, tmp));
      emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
    }
}
6192 :
6193 : /* Return true if mem is pool constant which contains a const_vector
6194 : perm index, assign the index to PERM. */
6195 : bool
6196 35 : ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
6197 : {
6198 35 : machine_mode mode = GET_MODE (mem);
6199 35 : int nelt = GET_MODE_NUNITS (mode);
6200 :
6201 35 : if (!INTEGRAL_MODE_P (mode))
6202 : return false;
6203 :
6204 : /* Needs to be constant pool. */
6205 35 : if (!(MEM_P (mem))
6206 35 : || !SYMBOL_REF_P (XEXP (mem, 0))
6207 70 : || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
6208 : return false;
6209 :
6210 35 : rtx constant = get_pool_constant (XEXP (mem, 0));
6211 :
6212 35 : if (!CONST_VECTOR_P (constant))
6213 : return false;
6214 :
6215 : /* There could be some rtx like
6216 : (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
6217 : but with "*.LC1" refer to V2DI constant vector. */
6218 35 : if (GET_MODE (constant) != mode)
6219 : {
6220 0 : constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
6221 :
6222 0 : if (constant == nullptr || !CONST_VECTOR_P (constant))
6223 : return false;
6224 : }
6225 :
6226 771 : for (int i = 0; i != nelt; i++)
6227 736 : perm[i] = UINTVAL (XVECEXP (constant, 0, i));
6228 :
6229 : return true;
6230 : }
6231 :
/* Split operands 0 and 1 into half-mode parts.  Similar to split_double_mode,
   but works for floating pointer parameters and nonoffsetable memories.
   For pushes, it returns just stack offsets; the values will be saved
   in the right order.  Maximally three parts are generated.

   OPERAND is the value to split, PARTS receives the pieces (room for 4
   entries is assumed), and MODE is the mode of the full value.  The
   return value is the number of parts (2 to 4).  */

static int
ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
{
  int size;

  /* Part count: 32-bit targets split into SImode words (XFmode takes 3);
     64-bit targets split into DImode words, rounding the 12-byte XFmode
     up to 2 parts.  */
  if (!TARGET_64BIT)
    size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
  else
    size = (GET_MODE_SIZE (mode) + 4) / 8;

  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
  gcc_assert (size >= 2 && size <= 4);

  /* Optimize constant pool reference to immediates.  This is used by fp
     moves, that force all constants to memory to allow combining.  */
  if (MEM_P (operand) && MEM_READONLY_P (operand))
    operand = avoid_constant_pool_reference (operand);

  if (MEM_P (operand) && !offsettable_memref_p (operand))
    {
      /* The only non-offsetable memories we handle are pushes.  */
      int ok = push_operand (operand, VOIDmode);

      gcc_assert (ok);

      /* A push auto-decrements, so every part uses the same word-mode
	 push destination.  */
      operand = copy_rtx (operand);
      PUT_MODE (operand, word_mode);
      parts[0] = parts[1] = parts[2] = parts[3] = operand;
      return size;
    }

  if (CONST_VECTOR_P (operand))
    {
      scalar_int_mode imode = int_mode_for_mode (mode).require ();
      /* Caution: if we looked through a constant pool memory above,
	 the operand may actually have a different mode now.  That's
	 ok, since we want to pun this all the way back to an integer.  */
      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
      gcc_assert (operand != NULL);
      mode = imode;
    }

  if (!TARGET_64BIT)
    {
      if (mode == DImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      else
	{
	  int i;

	  if (REG_P (operand))
	    {
	      /* Hard registers only: take consecutive SImode regs.  */
	      gcc_assert (reload_completed);
	      for (i = 0; i < size; i++)
		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      operand = adjust_address (operand, SImode, 0);
	      parts[0] = operand;
	      for (i = 1; i < size; i++)
		parts[i] = adjust_address (operand, SImode, 4 * i);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      /* Decompose the FP constant into 32-bit immediates.  */
	      const REAL_VALUE_TYPE *r;
	      long l[4];

	      r = CONST_DOUBLE_REAL_VALUE (operand);
	      switch (mode)
		{
		case E_TFmode:
		  real_to_target (l, r, mode);
		  parts[3] = gen_int_mode (l[3], SImode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_XFmode:
		  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
		     long double may not be 80-bit.  */
		  real_to_target (l, r, mode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_DFmode:
		  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
		  break;
		default:
		  gcc_unreachable ();
		}
	      parts[1] = gen_int_mode (l[1], SImode);
	      parts[0] = gen_int_mode (l[0], SImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }
  else
    {
      if (mode == TImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      if (mode == XFmode || mode == TFmode)
	{
	  /* XFmode's upper part is only 32 bits wide; TFmode's is 64.  */
	  machine_mode upper_mode = mode==XFmode ? SImode : DImode;
	  if (REG_P (operand))
	    {
	      gcc_assert (reload_completed);
	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      operand = adjust_address (operand, DImode, 0);
	      parts[0] = operand;
	      parts[1] = adjust_address (operand, upper_mode, 8);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      long l[4];

	      real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);

	      /* real_to_target puts 32-bit pieces in each long.  */
	      parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
				       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
					  << 32), DImode);

	      if (upper_mode == SImode)
		parts[1] = gen_int_mode (l[2], SImode);
	      else
		parts[1]
		  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
				  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
				     << 32), DImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }

  return size;
}
6377 :
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   operands[0] is the destination and operands[1] the source.  The split
   parts are placed into operands[2 .. 2+nparts-1] (destination pieces)
   and operands[6 .. 6+nparts-1] (source pieces), ordered so that no
   part of the source is clobbered before it has been copied, and the
   part-by-part moves are emitted here.  */

void
ix86_split_long_move (rtx operands[])
{
  rtx part[2][4];
  int nparts, i, j;
  int push = 0;
  int collisions = 0;
  machine_mode mode = GET_MODE (operands[0]);
  bool collisionparts[4];

  /* The DFmode expanders may ask us to move double.
     For 64bit target this is single move.  By hiding the fact
     here we simplify i386.md splitters.  */
  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
    {
      /* Optimize constant pool reference to immediates.  This is used by
	 fp moves, that force all constants to memory to allow combining.  */

      if (MEM_P (operands[1])
	  && SYMBOL_REF_P (XEXP (operands[1], 0))
	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
	operands[1] = get_pool_constant (XEXP (operands[1], 0));
      if (push_operand (operands[0], VOIDmode))
	{
	  operands[0] = copy_rtx (operands[0]);
	  PUT_MODE (operands[0], word_mode);
	}
      else
	operands[0] = gen_lowpart (DImode, operands[0]);
      operands[1] = gen_lowpart (DImode, operands[1]);
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* The only non-offsettable memory we handle is push.  */
  if (push_operand (operands[0], VOIDmode))
    push = 1;
  else
    gcc_assert (!MEM_P (operands[0])
		|| offsettable_memref_p (operands[0]));

  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));

  /* When emitting push, take care for source operands on the stack.  */
  if (push && MEM_P (operands[1])
      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
    {
      rtx src_base = XEXP (part[1][nparts - 1], 0);

      /* Compensate for the stack decrement by 4.  */
      if (!TARGET_64BIT && nparts == 3
	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
	src_base = plus_constant (Pmode, src_base, 4);

      /* src_base refers to the stack pointer and is
	 automatically decreased by emitted push.  */
      for (i = 0; i < nparts; i++)
	part[1][i] = change_address (part[1][i],
				     GET_MODE (part[1][i]), src_base);
    }

  /* We need to do copy in the right order in case an address register
     of the source overlaps the destination.  */
  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
    {
      rtx tmp;

      /* Record which destination parts are used in the source address.  */
      for (i = 0; i < nparts; i++)
	{
	  collisionparts[i]
	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
	  if (collisionparts[i])
	    collisions++;
	}

      /* Collision in the middle part can be handled by reordering.  */
      if (collisions == 1 && nparts == 3 && collisionparts [1])
	{
	  std::swap (part[0][1], part[0][2]);
	  std::swap (part[1][1], part[1][2]);
	}
      else if (collisions == 1
	       && nparts == 4
	       && (collisionparts [1] || collisionparts [2]))
	{
	  if (collisionparts [1])
	    {
	      std::swap (part[0][1], part[0][2]);
	      std::swap (part[1][1], part[1][2]);
	    }
	  else
	    {
	      std::swap (part[0][2], part[0][3]);
	      std::swap (part[1][2], part[1][3]);
	    }
	}

      /* If there are more collisions, we can't handle it by reordering.
	 Do an lea to the last part and use only one colliding move.  */
      else if (collisions > 1)
	{
	  rtx base, addr;

	  collisions = 1;

	  base = part[0][nparts - 1];

	  /* Handle the case when the last part isn't valid for lea.
	     Happens in 64-bit mode storing the 12-byte XFmode.  */
	  if (GET_MODE (base) != Pmode)
	    base = gen_rtx_REG (Pmode, REGNO (base));

	  addr = XEXP (part[1][0], 0);
	  if (TARGET_TLS_DIRECT_SEG_REFS)
	    {
	      struct ix86_address parts;
	      int ok = ix86_decompose_address (addr, &parts);
	      gcc_assert (ok);
	      /* It is not valid to use %gs: or %fs: in lea.  */
	      gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
	    }
	  /* Materialize the source address in the last destination part,
	     then rewrite every source part to be based off it.  */
	  emit_insn (gen_rtx_SET (base, addr));
	  part[1][0] = replace_equiv_address (part[1][0], base);
	  for (i = 1; i < nparts; i++)
	    {
	      tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
	      part[1][i] = replace_equiv_address (part[1][i], tmp);
	    }
	}
    }

  if (push)
    {
      if (!TARGET_64BIT)
	{
	  /* Push the high parts first; the word-mode pushes of parts 1
	     and 0 below handle the rest.  */
	  if (nparts == 3)
	    {
	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
		emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
	      emit_move_insn (part[0][2], part[1][2]);
	    }
	  else if (nparts == 4)
	    {
	      emit_move_insn (part[0][3], part[1][3]);
	      emit_move_insn (part[0][2], part[1][2]);
	    }
	}
      else
	{
	  /* In 64bit mode we don't have 32bit push available.  In case this is
	     register, it is OK - we will just use larger counterpart.  We also
	     retype memory - these comes from attempt to avoid REX prefix on
	     moving of second half of TFmode value.  */
	  if (GET_MODE (part[1][1]) == SImode)
	    {
	      switch (GET_CODE (part[1][1]))
		{
		case MEM:
		  part[1][1] = adjust_address (part[1][1], DImode, 0);
		  break;

		case REG:
		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
		  break;

		default:
		  gcc_unreachable ();
		}

	      if (GET_MODE (part[1][0]) == SImode)
		part[1][0] = part[1][1];
	    }
	}
      emit_move_insn (part[0][1], part[1][1]);
      emit_move_insn (part[0][0], part[1][0]);
      return;
    }

  /* Choose correct order to not overwrite the source before it is copied.
     If the first destination part is written by a later source part (or
     is used in the source address), copy the parts in reverse order.  */
  if ((REG_P (part[0][0])
       && REG_P (part[1][1])
       && (REGNO (part[0][0]) == REGNO (part[1][1])
	   || (nparts == 3
	       && REGNO (part[0][0]) == REGNO (part[1][2]))
	   || (nparts == 4
	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
      || (collisions > 0
	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
    {
      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
	{
	  operands[2 + i] = part[0][j];
	  operands[6 + i] = part[1][j];
	}
    }
  else
    {
      for (i = 0; i < nparts; i++)
	{
	  operands[2 + i] = part[0][i];
	  operands[6 + i] = part[1][i];
	}
    }

  /* Attempt to locally unCSE nonzero constants: if an earlier part
     loaded a constant into a register, reuse that register for any
     later part that moves the same constant.  */
  for (j = 0; j < nparts - 1; j++)
    if (CONST_INT_P (operands[6 + j])
	&& operands[6 + j] != const0_rtx
	&& REG_P (operands[2 + j]))
      for (i = j; i < nparts - 1; i++)
	if (CONST_INT_P (operands[7 + i])
	    && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
	  operands[7 + i] = operands[2 + j];

  for (i = 0; i < nparts; i++)
    emit_move_insn (operands[2 + i], operands[6 + i]);

  return;
}
6603 :
6604 : /* Helper function of ix86_split_ashl used to generate an SImode/DImode
6605 : left shift by a constant, either using a single shift or
6606 : a sequence of add instructions. */
6607 :
6608 : static void
6609 4343 : ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
6610 : {
6611 4343 : if (count == 1
6612 4343 : || (count * ix86_cost->add <= ix86_cost->shift_const
6613 0 : && !optimize_insn_for_size_p ()))
6614 : {
6615 16 : while (count-- > 0)
6616 8 : emit_insn (gen_add2_insn (operand, operand));
6617 : }
6618 : else
6619 : {
6620 4335 : rtx (*insn)(rtx, rtx, rtx);
6621 :
6622 4335 : insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6623 4335 : emit_insn (insn (operand, operand, GEN_INT (count)));
6624 : }
6625 4343 : }
6626 :
/* Split a doubleword left shift: operands[0] = operands[1] << operands[2]
   in doubleword MODE (DImode split into SImode halves, TImode into DImode
   halves).  SCRATCH, if non-NULL, is a spare register used with cmove for
   the variable-count adjustment.  operands[2] may be a constant or a
   variable count.  */

void
ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashl3)(rtx, rtx, rtx);
  rtx (*gen_shld)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;
  machine_mode half_mode;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      /* Constant shift count, reduced modulo the doubleword width.  */
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
	{
	  /* Shifting by at least a half: low word becomes zero and the
	     old low word, further shifted, becomes the high word.  */
	  emit_move_insn (high[0], low[1]);
	  ix86_expand_clear (low[0]);

	  if (count > half_width)
	    ix86_expand_ashl_const (high[0], count - half_width, mode);
	}
      else if (count == 1)
	{
	  /* Shift by one: add-with-carry propagates the top bit of the
	     low word into the high word.  */
	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);
	  rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
	  rtx x4 = gen_rtx_LTU (mode, x3, const0_rtx);
	  half_mode = mode == DImode ? SImode : DImode;
	  emit_insn (gen_add3_cc_overflow_1 (half_mode, low[0],
					     low[0], low[0]));
	  emit_insn (gen_add3_carry (half_mode, high[0], high[0], high[0],
				     x3, x4));
	}
      else
	{
	  /* General constant count below half_width: shld funnels bits
	     from the low word into the high word, then shift the low
	     word itself.  */
	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
	  ix86_expand_ashl_const (low[0], count, mode);
	}
      return;
    }

  /* Variable shift count from here on.  */
  split_double_mode (mode, operands, 1, low, high);
  half_mode = mode == DImode ? SImode : DImode;

  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;

  if (operands[1] == const1_rtx)
    {
      /* Assuming we've chosen a QImode capable registers, then 1 << N
	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
	{
	  /* Test the half_width bit of the count and use setcc to put
	     a 1 in whichever word the final bit will land in.  */
	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);

	  ix86_expand_clear (low[0]);
	  ix86_expand_clear (high[0]);
	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));

	  d = gen_lowpart (QImode, low[0]);
	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
	  emit_insn (gen_rtx_SET (d, s));

	  d = gen_lowpart (QImode, high[0]);
	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
	  s = gen_rtx_NE (QImode, flags, const0_rtx);
	  emit_insn (gen_rtx_SET (d, s));
	}

      /* Otherwise, we can get the same results by manually performing
	 a bit extract operation on bit 5/6, and then performing the two
	 shifts.  The two methods of getting 0/1 into low/high are exactly
	 the same size.  Avoiding the shift in the bit extract case helps
	 pentium4 a bit; no one else seems to care much either way.  */
      else
	{
	  rtx (*gen_lshr3)(rtx, rtx, rtx);
	  rtx (*gen_and3)(rtx, rtx, rtx);
	  rtx (*gen_xor3)(rtx, rtx, rtx);
	  HOST_WIDE_INT bits;
	  rtx x;

	  if (mode == DImode)
	    {
	      gen_lshr3 = gen_lshrsi3;
	      gen_and3 = gen_andsi3;
	      gen_xor3 = gen_xorsi3;
	      bits = 5;
	    }
	  else
	    {
	      gen_lshr3 = gen_lshrdi3;
	      gen_and3 = gen_anddi3;
	      gen_xor3 = gen_xordi3;
	      bits = 6;
	    }

	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
	  else
	    x = gen_lowpart (half_mode, operands[2]);
	  emit_insn (gen_rtx_SET (high[0], x));

	  /* high = (count >> bits) & 1; low = high ^ 1.  */
	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
	  emit_move_insn (low[0], high[0]);
	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
	}

      /* Both words are then shifted by the count (only the lower bits of
	 the count matter for each word-mode shift).  */
      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
      return;
    }

  if (operands[1] == constm1_rtx)
    {
      /* For -1 << N, we can avoid the shld instruction, because we
	 know that we're shifting 0...31/63 ones into a -1.  */
      emit_move_insn (low[0], constm1_rtx);
      if (optimize_insn_for_size_p ())
	emit_move_insn (high[0], low[0]);
      else
	emit_move_insn (high[0], constm1_rtx);
    }
  else
    {
      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      emit_insn (gen_shld (high[0], low[0], operands[2]));
    }

  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));

  /* Fix up the result when the count is >= half_width: move low into
     high and clear low, via cmove when available, else a branch.  */
  if (TARGET_CMOVE && scratch)
    {
      ix86_expand_clear (scratch);
      emit_insn (gen_x86_shift_adj_1
		 (half_mode, high[0], low[0], operands[2], scratch));
    }
  else
    emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
}
6781 :
/* Split a double-word arithmetic right shift into operations on the
   two half-word registers: operands[0] = operands[1] >> operands[2]
   in double-word mode MODE (DImode split into SImode halves;
   otherwise — presumably TImode on 64-bit — split into DImode halves).
   SCRATCH, if non-NULL and cmov is available, is a spare half-word
   register used to handle variable shift counts >= half the width
   without a branch.  */
void
ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
{
  /* Half-word arithmetic shift generator for MODE's halves.  */
  rtx (*gen_ashr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      /* The hardware only looks at the low log2(bitsize) bits of the
	 count, so reduce it modulo the double-word bitsize.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count == GET_MODE_BITSIZE (mode) - 1)
	{
	  /* Shift by bitsize-1: both halves become the sign mask of
	     the original high half.  */
	  emit_move_insn (high[0], high[1]);
	  emit_insn (gen_ashr3 (high[0], high[0],
				GEN_INT (half_width - 1)));
	  emit_move_insn (low[0], high[0]);

	}
      else if (count >= half_width)
	{
	  /* Count >= half width: low result is the high input shifted
	     by (count - half_width); high result is the sign mask.  */
	  emit_move_insn (low[0], high[1]);
	  emit_move_insn (high[0], low[0]);
	  emit_insn (gen_ashr3 (high[0], high[0],
				GEN_INT (half_width - 1)));

	  if (count > half_width)
	    emit_insn (gen_ashr3 (low[0], low[0],
				  GEN_INT (count - half_width)));
	}
      else if (count == 1
	       && (TARGET_USE_RCR || optimize_size > 1))
	{
	  /* Shift by 1 via sar + rcr (rotate through carry): smaller
	     than shrd on targets that prefer rcr or at -Oz.  */
	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);
	  if (mode == DImode)
	    {
	      /* DImode double-word => SImode half operations.  */
	      emit_insn (gen_ashrsi3_carry (high[0], high[0]));
	      emit_insn (gen_rcrsi2 (low[0], low[0]));
	    }
	  else
	    {
	      emit_insn (gen_ashrdi3_carry (high[0], high[0]));
	      emit_insn (gen_rcrdi2 (low[0], low[0]));
	    }
	}
      else
	{
	  /* General constant count < half width: shrd moves bits from
	     the high half into the low half, then sar the high half.  */
	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
	}
    }
  else
    {
      /* Variable count: emit shrd/sar for the count mod half_width,
	 then fix up the halves when the count's half_width bit is set
	 (hardware shrd/sar only see the low bits of the count).  */
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
	{
	  /* Branchless fixup: scratch holds the sign mask that the
	     high half must become when count >= half_width.  */
	  emit_move_insn (scratch, high[0]);
	  emit_insn (gen_ashr3 (scratch, scratch,
				GEN_INT (half_width - 1)));
	  emit_insn (gen_x86_shift_adj_1
		     (half_mode, low[0], high[0], operands[2], scratch));
	}
      else
	/* No cmov/scratch: branching fixup variant.  */
	emit_insn (gen_x86_shift_adj_3
		   (half_mode, low[0], high[0], operands[2]));
    }
}
6872 :
/* Split a double-word logical right shift into operations on the two
   half-word registers: operands[0] = operands[1] >>u operands[2] in
   double-word mode MODE (DImode split into SImode halves; otherwise
   — presumably TImode on 64-bit — DImode halves).  SCRATCH, if
   non-NULL and cmov is available, enables a branchless fixup for
   variable counts.  Mirrors ix86_split_ashr but zero- rather than
   sign-fills the vacated high half.  */
void
ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
{
  /* Half-word logical shift generator for MODE's halves.  */
  rtx (*gen_lshr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      /* Reduce the count modulo the double-word bitsize.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
	{
	  /* Count >= half width: low result is the high input shifted
	     by the excess; high result is zero.  */
	  emit_move_insn (low[0], high[1]);
	  ix86_expand_clear (high[0]);

	  if (count > half_width)
	    emit_insn (gen_lshr3 (low[0], low[0],
				  GEN_INT (count - half_width)));
	}
      else if (count == 1
	       && (TARGET_USE_RCR || optimize_size > 1))
	{
	  /* Shift by 1 via shr + rcr (rotate through carry): smaller
	     than shrd on targets that prefer rcr or at -Oz.  */
	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);
	  if (mode == DImode)
	    {
	      /* DImode double-word => SImode half operations.  */
	      emit_insn (gen_lshrsi3_carry (high[0], high[0]));
	      emit_insn (gen_rcrsi2 (low[0], low[0]));
	    }
	  else
	    {
	      emit_insn (gen_lshrdi3_carry (high[0], high[0]));
	      emit_insn (gen_rcrdi2 (low[0], low[0]));
	    }
	}
      else
	{
	  /* General constant count < half width: shrd feeds high bits
	     into the low half, then shr the high half.  */
	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
	}
    }
  else
    {
      /* Variable count: shrd/shr handle count mod half_width; fix up
	 afterwards for counts with the half_width bit set.  */
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
	{
	  /* Branchless fixup: for a logical shift the high half must
	     become zero when count >= half_width, hence the cleared
	     scratch.  */
	  ix86_expand_clear (scratch);
	  emit_insn (gen_x86_shift_adj_1
		     (half_mode, low[0], high[0], operands[2], scratch));
	}
      else
	/* No cmov/scratch: branching fixup variant.  */
	emit_insn (gen_x86_shift_adj_2
		   (half_mode, low[0], high[0], operands[2]));
    }
}
6951 :
/* Helper function to split TImode ashl under NDD (APX new data
   destination forms): operands[0] = operands[1] << operands[2].
   NDD insns take a separate destination, so no initial copy of
   operands[1] into operands[0] is needed.  SCRATCH, if non-NULL and
   cmov is available, enables a branchless fixup for variable
   counts.  */
void
ix86_split_ashl_ndd (rtx *operands, rtx scratch)
{
  gcc_assert (TARGET_APX_NDD);
  int half_width = GET_MODE_BITSIZE (TImode) >> 1;

  rtx low[2], high[2];
  int count;

  split_double_mode (TImode, operands, 2, low, high);
  if (CONST_INT_P (operands[2]))
    {
      /* Reduce the count modulo 128.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);

      if (count >= half_width)
	{
	  /* Count >= 64: the high result is the low input shifted by
	     the excess; the low result is zero.  */
	  count = count - half_width;
	  if (count == 0)
	    {
	      if (!rtx_equal_p (high[0], low[1]))
		emit_move_insn (high[0], low[1]);
	    }
	  else if (count == 1)
	    /* Shift by 1 as an add — shorter encoding than shl.  */
	    emit_insn (gen_adddi3 (high[0], low[1], low[1]));
	  else
	    emit_insn (gen_ashldi3 (high[0], low[1], GEN_INT (count)));

	  ix86_expand_clear (low[0]);
	}
      else if (count == 1)
	{
	  /* Shift by 1 as add + add-with-carry, propagating the bit
	     that crosses the 64-bit boundary through CF.  */
	  rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
	  rtx x4 = gen_rtx_LTU (TImode, x3, const0_rtx);
	  emit_insn (gen_add3_cc_overflow_1 (DImode, low[0],
					     low[1], low[1]));
	  emit_insn (gen_add3_carry (DImode, high[0], high[1], high[1],
				     x3, x4));
	}
      else
	{
	  /* General constant count < 64: shld feeds the low half's top
	     bits into the high half, then shl the low half.  */
	  emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
					  GEN_INT (count)));
	  emit_insn (gen_ashldi3 (low[0], low[1], GEN_INT (count)));
	}
    }
  else
    {
      /* Variable count: emit shld/shl for count mod 64, then fix up
	 the halves when bit 6 of the count is set.  */
      emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
				      operands[2]));
      emit_insn (gen_ashldi3 (low[0], low[1], operands[2]));
      if (TARGET_CMOVE && scratch)
	{
	  /* Branchless fixup: low half becomes zero when count >= 64.  */
	  ix86_expand_clear (scratch);
	  emit_insn (gen_x86_shift_adj_1
		     (DImode, high[0], low[0], operands[2], scratch));
	}
      else
	emit_insn (gen_x86_shift_adj_2 (DImode, high[0], low[0], operands[2]));
    }
}
7013 :
/* Helper function to split TImode l/ashr under NDD (APX new data
   destination forms).  CODE is ASHIFTRT or LSHIFTRT; operands[0] =
   operands[1] >> operands[2].  NDD insns take a separate destination,
   so no initial copy into operands[0] is needed.  SCRATCH, if
   non-NULL and cmov is available, enables a branchless fixup for
   variable counts.  */
void
ix86_split_rshift_ndd (enum rtx_code code, rtx *operands, rtx scratch)
{
  gcc_assert (TARGET_APX_NDD);
  int half_width = GET_MODE_BITSIZE (TImode) >> 1;
  bool ashr_p = code == ASHIFTRT;
  /* DImode half-shift generator matching CODE.  */
  rtx (*gen_shr)(rtx, rtx, rtx) = ashr_p ? gen_ashrdi3
				  : gen_lshrdi3;

  rtx low[2], high[2];
  int count;

  split_double_mode (TImode, operands, 2, low, high);
  if (CONST_INT_P (operands[2]))
    {
      /* Reduce the count modulo 128.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);

      if (ashr_p && (count == GET_MODE_BITSIZE (TImode) - 1))
	{
	  /* Arithmetic shift by 127: both halves become the sign
	     mask of the high input.  */
	  emit_insn (gen_shr (high[0], high[1],
			      GEN_INT (half_width - 1)));
	  emit_move_insn (low[0], high[0]);
	}
      else if (count >= half_width)
	{
	  /* Count >= 64: high result is the sign mask (ashr) or zero
	     (lshr); low result is the high input shifted by the
	     excess.  */
	  if (ashr_p)
	    emit_insn (gen_shr (high[0], high[1],
				GEN_INT (half_width - 1)));
	  else
	    ix86_expand_clear (high[0]);

	  if (count > half_width)
	    emit_insn (gen_shr (low[0], high[1],
				GEN_INT (count - half_width)));
	  else
	    emit_move_insn (low[0], high[1]);
	}
      else
	{
	  /* General constant count < 64: shrd feeds high bits into
	     the low half, then shift the high half.  */
	  emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
					  GEN_INT (count)));
	  emit_insn (gen_shr (high[0], high[1], GEN_INT (count)));
	}
    }
  else
    {
      /* Variable count: emit shrd/shift for count mod 64, then fix up
	 when bit 6 of the count is set.  */
      emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
				      operands[2]));
      emit_insn (gen_shr (high[0], high[1], operands[2]));

      if (TARGET_CMOVE && scratch)
	{
	  /* Branchless fixup value: sign mask for ashr, zero for
	     lshr.  */
	  if (ashr_p)
	    {
	      emit_move_insn (scratch, high[0]);
	      emit_insn (gen_shr (scratch, scratch,
				  GEN_INT (half_width - 1)));
	    }
	  else
	    ix86_expand_clear (scratch);

	  emit_insn (gen_x86_shift_adj_1
		     (DImode, low[0], high[0], operands[2], scratch));
	}
      else if (ashr_p)
	emit_insn (gen_x86_shift_adj_3
		   (DImode, low[0], high[0], operands[2]));
      else
	emit_insn (gen_x86_shift_adj_2
		   (DImode, low[0], high[0], operands[2]));
    }
}
7087 :
7088 : /* Expand move of V1TI mode register X to a new TI mode register. */
7089 : static rtx
7090 17 : ix86_expand_v1ti_to_ti (rtx x)
7091 : {
7092 17 : rtx result = gen_reg_rtx (TImode);
7093 17 : if (TARGET_SSE2)
7094 : {
7095 17 : rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
7096 17 : rtx lo = gen_lowpart (DImode, result);
7097 17 : emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
7098 17 : rtx hi = gen_highpart (DImode, result);
7099 17 : emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
7100 : }
7101 : else
7102 0 : emit_move_insn (result, gen_lowpart (TImode, x));
7103 17 : return result;
7104 : }
7105 :
7106 : /* Expand move of TI mode register X to a new V1TI mode register. */
7107 : static rtx
7108 17 : ix86_expand_ti_to_v1ti (rtx x)
7109 : {
7110 17 : if (TARGET_SSE2)
7111 : {
7112 17 : rtx lo = gen_lowpart (DImode, x);
7113 17 : rtx hi = gen_highpart (DImode, x);
7114 17 : rtx tmp = gen_reg_rtx (V2DImode);
7115 17 : emit_insn (gen_vec_concatv2di (tmp, lo, hi));
7116 17 : return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
7117 : }
7118 :
7119 0 : return force_reg (V1TImode, gen_lowpart (V1TImode, x));
7120 : }
7121 :
/* Expand V1TI mode shift (of rtx_code CODE, ASHIFT or LSHIFTRT) by
   constant: operands[0] = operands[1] shifted by operands[2].  A
   non-constant count falls back to a TImode shift via GPRs; constant
   counts are expanded with SSE2 whole-register and per-lane V2DI
   shifts.  */
void
ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  if (!CONST_INT_P (operands[2]))
    {
      /* Variable count: move to TImode, shift in GPRs, move back.  */
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      rtx (*shift) (rtx, rtx, rtx)
	= (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
      emit_insn (shift (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  /* Reduce the count modulo 128.  */
  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  if ((bits & 7) == 0)
    {
      /* Whole-byte counts map directly to pslldq/psrldq.  */
      rtx tmp = gen_reg_rtx (V1TImode);
      if (code == ASHIFT)
	emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
      else
	emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
      emit_move_insn (operands[0], tmp);
      return;
    }

  /* General case: combine a byte-wise 64-bit whole-register shift
     with per-lane V2DI shifts to move bits across the 64-bit lane
     boundary.  */
  rtx tmp1 = gen_reg_rtx (V1TImode);
  if (code == ASHIFT)
    emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
  else
    emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));

  /* tmp2 is operands[1] shifted by 64, in V2DImode. */
  rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));

  /* tmp3 will be the V2DImode result. */
  rtx tmp3 = gen_reg_rtx (V2DImode);

  if (bits > 64)
    {
      /* Only the lane-crossed copy contributes; shift it by the
	 remaining count.  */
      if (code == ASHIFT)
	emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
      else
	emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
    }
  else
    {
      /* tmp4 is operands[1], in V2DImode. */
      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));

      /* In-lane part of the shift.  */
      rtx tmp5 = gen_reg_rtx (V2DImode);
      if (code == ASHIFT)
	emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
      else
	emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      /* Bits carried across the lane boundary, shifted the opposite
	 way in the 64-bit-moved copy.  */
      rtx tmp6 = gen_reg_rtx (V2DImode);
      if (code == ASHIFT)
	emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
      else
	emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));

      emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
    }

  /* Convert the result back to V1TImode and store in operands[0]. */
  rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
  emit_move_insn (operands[0], tmp7);
}
7202 :
/* Expand V1TI mode rotate (of rtx_code CODE, ROTATE or ROTATERT) by
   constant: operands[0] = operands[1] rotated by operands[2].  A
   non-constant count falls back to a TImode rotate via GPRs; constant
   counts are expanded with pshufd lane permutations plus per-lane
   V4SI shifts.  */
void
ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  if (!CONST_INT_P (operands[2]))
    {
      /* Variable count: move to TImode, rotate in GPRs, move back.  */
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      rtx (*rotate) (rtx, rtx, rtx)
	= (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
      emit_insn (rotate (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  /* Reduce the count modulo 128.  */
  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  /* Canonicalize to a left rotate.  */
  if (code == ROTATERT)
    bits = 128 - bits;

  if ((bits & 31) == 0)
    {
      /* Multiple-of-32 rotates are a single pshufd dword
	 permutation (0x93 = rotate left one dword, 0x4e = two,
	 0x39 = three).  */
      rtx tmp2 = gen_reg_rtx (V4SImode);
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      if (bits == 32)
	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
      else if (bits == 64)
	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
      else
	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
      return;
    }

  if ((bits & 7) == 0)
    {
      /* Multiple-of-8 rotates: OR together opposing whole-register
	 byte shifts (pslldq | psrldq).  */
      rtx tmp1 = gen_reg_rtx (V1TImode);
      rtx tmp2 = gen_reg_rtx (V1TImode);
      rtx tmp3 = gen_reg_rtx (V1TImode);

      emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
      emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
      emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
      emit_move_insn (operands[0], tmp3);
      return;
    }

  /* General case: pre-permute dwords so that, after per-lane 32-bit
     shifts, each result dword receives its bits from the correct
     source dwords; then OR the two shifted permutations.  */
  rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));

  rtx lobits;
  rtx hibits;

  /* Select the dword permutations according to which 32-bit "slot"
     the rotate count falls into.  */
  switch (bits >> 5)
    {
    case 0:
      lobits = op1_v4si;
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
      break;

    case 1:
      lobits = gen_reg_rtx (V4SImode);
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
      break;

    case 2:
      lobits = gen_reg_rtx (V4SImode);
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
      break;

    default:
      lobits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
      hibits = op1_v4si;
      break;
    }

  rtx tmp1 = gen_reg_rtx (V4SImode);
  rtx tmp2 = gen_reg_rtx (V4SImode);
  rtx tmp3 = gen_reg_rtx (V4SImode);

  /* Combine the low and high contributions with opposing sub-dword
     shifts of the residual count.  */
  emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
  emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
  emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));

  emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
}
7303 :
/* Expand V1TI mode ashiftrt by constant:
   operands[0] = operands[1] >> operands[2] (arithmetic).  SSE has no
   128-bit arithmetic shift, so this dispatches on the shift count to
   the cheapest available sequence, tried roughly in increasing
   instruction count (the branch comments count the vector operations
   emitted).  A non-constant count falls back to a TImode shift via
   GPRs.  */
void
ix86_expand_v1ti_ashiftrt (rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  if (!CONST_INT_P (operands[2]))
    {
      /* Variable count: move to TImode, shift in GPRs, move back.  */
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  /* Reduce the count modulo 128.  */
  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  if (bits == 127)
    {
      /* Two operations.  Result is the sign bit broadcast to all 128
	 bits: splat the top dword (pshufd 0xff), then psrad 31.  */
      rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
      return;
    }

  if (bits == 64)
    {
      /* Three operations.  Low result = old high half; high result =
	 sign mask, interleaved via punpckhqdq.  */
      rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
      return;
    }

  if (bits == 96)
    {
      /* Three operations.  psrad 31 for the sign mask, interleave
	 high qwords, then pshufd to place the surviving dword.  */
      rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));

      rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));

      rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
      rtx tmp7 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
      return;
    }

  if (bits >= 111)
    {
      /* Three operations.  Counts this large leave only (shifted)
	 sign-extension in the top dword; psrad by the residual count,
	 then broadcast it with pshufhw + pshufd.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));

      rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
      rtx tmp4 = gen_reg_rtx (V8HImode);
      emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));

      rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
      rtx tmp6 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
      return;
    }

  if (TARGET_AVX2 || TARGET_SSE4_1)
    {
      /* Three operations.  With a blend insn available: psrad for
	 the sign-filled top dword, psrldq for the logical part, then
	 blend the top dword over the logical result.  */
      if (bits == 32)
	{
	  rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
	  rtx tmp2 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));

	  rtx tmp3 = gen_reg_rtx (V1TImode);
	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));

	  if (TARGET_AVX2)
	    {
	      rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
	      rtx tmp5 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
					       GEN_INT (7)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
	    }
	  else
	    {
	      /* SSE4.1: pblendw with a word mask covering the low
		 three dwords.  */
	      rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
	      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
	      rtx tmp6 = gen_reg_rtx (V8HImode);
	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
					     GEN_INT (0x3f)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
	    }
	  return;
	}

      /* Three operations.  Same blend trick for whole-byte counts
	 below 32: psrad provides correct bits within each dword and
	 psrldq provides the byte shift; blend keeps psrad's top
	 dword.  */
      if (bits == 8 || bits == 16 || bits == 24)
	{
	  rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
	  rtx tmp2 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));

	  rtx tmp3 = gen_reg_rtx (V1TImode);
	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));

	  if (TARGET_AVX2)
	    {
	      rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
	      rtx tmp5 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
					       GEN_INT (7)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
	    }
	  else
	    {
	      rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
	      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
	      rtx tmp6 = gen_reg_rtx (V8HImode);
	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
					     GEN_INT (0x3f)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
	    }
	  return;
	}
    }

  if (bits > 96)
    {
      /* Four operations.  Shifted dword and sign mask via two psrad,
	 combined with punpckhqdq and a final pshufd placement.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));

      rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
      rtx tmp8 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
      return;
    }

  if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
    {
      /* Four operations.  Build the broadcast sign mask, do the
	 logical byte shift, then pblendw the sign words over the
	 vacated top words.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));

      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
      rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
      rtx tmp7 = gen_reg_rtx (V8HImode);
      emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
				     GEN_INT (bits == 48 ? 0x1f : 0x07)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
      return;
    }

  if ((bits & 7) == 0)
    {
      /* Five operations.  Whole-byte counts without a blend insn:
	 logical byte shift right, sign mask byte-shifted left into
	 the vacated bytes, then OR.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));

      rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
      rtx tmp6 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));

      rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
      rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
      rtx tmp9 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
      return;
    }

  if (TARGET_AVX2 && bits < 32)
    {
      /* Six operations.  Logical 128-bit shift built from per-lane
	 qword shifts (psrlq | psllq of the 64-bit-moved copy), then
	 vpblendd keeps psrad's sign-correct top dword.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));

      rtx tmp3 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp7 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));

      rtx tmp8 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));

      rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
      rtx tmp10 = gen_reg_rtx (V4SImode);
      emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
      return;
    }

  if (TARGET_SSE4_1 && bits < 15)
    {
      /* Six operations.  Same structure as the AVX2 variant above but
	 blending with pblendw, which limits the count to < 15 so the
	 top word of psrad's result is still pure sign bits.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));

      rtx tmp3 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp7 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));

      rtx tmp8 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));

      rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
      rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
      rtx tmp11 = gen_reg_rtx (V8HImode);
      emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
      return;
    }

  if (bits == 1)
    {
      /* Eight operations.  Shift-by-1 on plain SSE2: build the
	 logical shift from qword shifts, then reconstruct the sign
	 bit (psrlq 63 isolates it; pshufd 0xbf places it; psllq 31
	 positions it at bit 127) and OR it in.  */
      rtx tmp1 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));

      rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp3 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));

      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));

      rtx tmp7 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));

      rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
      rtx tmp9 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));

      rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
      rtx tmp11 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));

      rtx tmp12 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
      return;
    }

  if (bits > 64)
    {
      /* Eight operations.  Generic SSE2 sequence for counts > 64:
	 logical shift of the high qword plus the broadcast sign mask
	 shifted into place, all combined with ORs.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));

      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));

      rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
      rtx tmp8 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));

      rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp10 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));

      rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
      rtx tmp12 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));

      rtx tmp13 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
    }
  else
    {
      /* Nine operations.  Fully generic SSE2 fallback for counts
	 < 64: a 128-bit logical shift assembled from qword shifts,
	 plus the sign mask shifted into the vacated top bits.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));

      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));

      rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
      rtx tmp8 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));

      rtx tmp9 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));

      rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
      rtx tmp11 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));

      rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
      rtx tmp13 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));

      rtx tmp14 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
    }
}
7705 :
/* Expand V2DI mode ashiftrt.  */
void
ix86_expand_v2di_ashiftrt (rtx operands[])
{
  /* Shift by zero is just a copy.  */
  if (operands[2] == const0_rtx)
    {
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* Shifting right by 63 (or more) leaves each element equal to its
     sign mask: 0 or all-ones.  With SSE4.2 that is a single
     pcmpgtq against zero (0 > x is true exactly for negative x).  */
  if (TARGET_SSE4_2
      && CONST_INT_P (operands[2])
      && UINTVAL (operands[2]) >= 63
      && !optimize_insn_for_size_p ())
    {
      rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
      emit_insn (gen_sse4_2_gtv2di3 (operands[0], zero, operands[1]));
      return;
    }

  /* Constant shift counts (when XOP's variable arithmetic shift does
     not apply, or the count saturates anyway): synthesize the V2DI
     arithmetic shift from V4SI shifts plus one shuffle.  SEL selects
     V4SI lanes from ARG0/ARG1 (lanes 4-7 address ARG1).  */
  if (CONST_INT_P (operands[2])
      && (!TARGET_XOP || UINTVAL (operands[2]) >= 63))
    {
      vec_perm_builder sel (4, 4, 1);
      sel.quick_grow (4);
      rtx arg0, arg1;
      rtx op1 = lowpart_subreg (V4SImode,
				force_reg (V2DImode, operands[1]),
				V2DImode);
      rtx target = gen_reg_rtx (V4SImode);
      if (UINTVAL (operands[2]) >= 63)
	{
	  /* Count >= 63: broadcast the sign word of each half.  */
	  arg0 = arg1 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_ashrv4si3 (arg0, op1, GEN_INT (31)));
	  sel[0] = 1;
	  sel[1] = 1;
	  sel[2] = 3;
	  sel[3] = 3;
	}
      else if (INTVAL (operands[2]) > 32)
	{
	  /* 32 < count < 63: low word comes from the high word shifted
	     by count-32, high word is pure sign extension.  */
	  arg0 = gen_reg_rtx (V4SImode);
	  arg1 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
	  emit_insn (gen_ashrv4si3 (arg0, op1,
				    GEN_INT (INTVAL (operands[2]) - 32)));
	  sel[0] = 1;
	  sel[1] = 5;
	  sel[2] = 3;
	  sel[3] = 7;
	}
      else if (INTVAL (operands[2]) == 32)
	{
	  /* Count == 32: low word is the unshifted high word, high word
	     is the sign extension.  */
	  arg0 = op1;
	  arg1 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
	  sel[0] = 1;
	  sel[1] = 5;
	  sel[2] = 3;
	  sel[3] = 7;
	}
      else
	{
	  /* Count < 32: combine a V2DI logical shift (correct low word
	     and low half of the high word) with a V4SI arithmetic shift
	     (correct, sign-filled high word).  */
	  arg0 = gen_reg_rtx (V2DImode);
	  arg1 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_lshrv2di3 (arg0, operands[1], operands[2]));
	  emit_insn (gen_ashrv4si3 (arg1, op1, operands[2]));
	  arg0 = lowpart_subreg (V4SImode, arg0, V2DImode);
	  sel[0] = 0;
	  sel[1] = 5;
	  sel[2] = 2;
	  sel[3] = 7;
	}
      vec_perm_indices indices (sel, arg0 != arg1 ? 2 : 1, 4);
      rtx op0 = operands[0];
      bool ok = targetm.vectorize.vec_perm_const (V4SImode, V4SImode,
						  target, arg0, arg1,
						  indices);
      gcc_assert (ok);
      emit_move_insn (op0, lowpart_subreg (V2DImode, target, V4SImode));
      return;
    }
  /* Variable count without XOP: compute
     (x >> count) | (sign_mask << (64 - count)).  */
  if (!TARGET_XOP)
    {
      rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
      rtx zero_or_all_ones;
      if (TARGET_SSE4_2)
	{
	  /* pcmpgtq gives the per-element sign mask directly.  */
	  zero_or_all_ones = gen_reg_rtx (V2DImode);
	  emit_insn (gen_sse4_2_gtv2di3 (zero_or_all_ones, zero,
					 operands[1]));
	}
      else
	{
	  /* Without SSE4.2: arithmetic-shift the words by 31 and
	     duplicate the sign words (lanes 1,1,3,3) via pshufd.  */
	  rtx temp = gen_reg_rtx (V4SImode);
	  emit_insn (gen_ashrv4si3 (temp,
				    lowpart_subreg (V4SImode,
						    force_reg (V2DImode,
							       operands[1]),
						    V2DImode),
				    GEN_INT (31)));
	  zero_or_all_ones = gen_reg_rtx (V4SImode);
	  emit_insn (gen_sse2_pshufd_1 (zero_or_all_ones, temp,
					const1_rtx, const1_rtx,
					GEN_INT (3), GEN_INT (3)));
	  zero_or_all_ones = lowpart_subreg (V2DImode, zero_or_all_ones,
					     V4SImode);
	}
      rtx lshr_res = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (lshr_res, operands[1], operands[2]));
      rtx ashl_res = gen_reg_rtx (V2DImode);
      rtx amount;
      /* AMOUNT = 64 - count, computed in DImode (64-bit) or built in a
	 vector element (32-bit, where DImode subtraction isn't direct).  */
      if (TARGET_64BIT)
	{
	  amount = gen_reg_rtx (DImode);
	  emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)),
				 operands[2]));
	}
      else
	{
	  rtx temp = gen_reg_rtx (SImode);
	  emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)),
				 lowpart_subreg (SImode, operands[2],
						 DImode)));
	  amount = gen_reg_rtx (V4SImode);
	  emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode),
					temp));
	}
      amount = lowpart_subreg (DImode, amount, GET_MODE (amount));
      emit_insn (gen_ashlv2di3 (ashl_res, zero_or_all_ones, amount));
      emit_insn (gen_iorv2di3 (operands[0], lshr_res, ashl_res));
      return;
    }

  /* XOP path: vpshaq shifts left for positive counts and right for
     negative ones, so broadcast -count into a vector register.  */
  rtx reg = gen_reg_rtx (V2DImode);
  rtx par;
  bool negate = false;
  int i;

  if (CONST_INT_P (operands[2]))
    operands[2] = GEN_INT (-INTVAL (operands[2]));
  else
    negate = true;

  par = gen_rtx_PARALLEL (V2DImode, rtvec_alloc (2));
  for (i = 0; i < 2; i++)
    XVECEXP (par, 0, i) = operands[2];

  emit_insn (gen_vec_initv2didi (reg, par));

  /* Non-constant counts are negated after the broadcast.  */
  if (negate)
    emit_insn (gen_negv2di2 (reg, reg));

  emit_insn (gen_xop_shav2di3 (operands[0], operands[1], reg));
}
7861 :
7862 : /* Replace all occurrences of REG FROM with REG TO in X, including
7863 : occurrences with different modes. */
7864 :
7865 : rtx
7866 38540 : ix86_replace_reg_with_reg (rtx x, rtx from, rtx to)
7867 : {
7868 38540 : gcc_checking_assert (REG_P (from)
7869 : && REG_P (to)
7870 : && GET_MODE (from) == GET_MODE (to));
7871 38540 : if (!reg_overlap_mentioned_p (from, x))
7872 : return x;
7873 94 : rtx ret = copy_rtx (x);
7874 94 : subrtx_ptr_iterator::array_type array;
7875 458 : FOR_EACH_SUBRTX_PTR (iter, array, &ret, NONCONST)
7876 : {
7877 364 : rtx *loc = *iter;
7878 364 : x = *loc;
7879 364 : if (REG_P (x) && REGNO (x) == REGNO (from))
7880 : {
7881 94 : if (x == from)
7882 94 : *loc = to;
7883 : else
7884 : {
7885 0 : gcc_checking_assert (REG_NREGS (x) == 1);
7886 0 : *loc = gen_rtx_REG (GET_MODE (x), REGNO (to));
7887 : }
7888 : }
7889 : }
7890 94 : return ret;
7891 94 : }
7892 :
7893 : /* Return mode for the memcpy/memset loop counter. Prefer SImode over
7894 : DImode for constant loop counts. */
7895 :
7896 : static machine_mode
7897 34736 : counter_mode (rtx count_exp)
7898 : {
7899 8116 : if (GET_MODE (count_exp) != VOIDmode)
7900 27910 : return GET_MODE (count_exp);
7901 6826 : if (!CONST_INT_P (count_exp))
7902 0 : return Pmode;
7903 : if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
7904 : return DImode;
7905 : return SImode;
7906 : }
7907 :
/* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR
   to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT
   specified in bytes.  When ISSETMEM is TRUE, output the equivalent loop to set
   memory by VALUE (supposed to be in MODE).

   The size is rounded down to whole number of chunk size moved at once.
   SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info.  */


static void
expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr, rtx value,
			       rtx count, machine_mode mode, int unroll,
			       int expected_size, bool issetmem)
{
  rtx_code_label *out_label = nullptr;
  rtx_code_label *top_label = nullptr;
  rtx iter, tmp;
  machine_mode iter_mode = counter_mode (count);
  /* Number of bytes processed per loop iteration.  */
  int piece_size_n = GET_MODE_SIZE (mode) * unroll;
  rtx piece_size = GEN_INT (piece_size_n);
  rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
  rtx size;
  int i;
  int loop_count;

  /* When COUNT is a known constant, compute the exact trip count so a
     single-iteration loop can be emitted straight-line.  */
  if (expected_size != -1 && CONST_INT_P (count))
    loop_count = INTVAL (count) / GET_MODE_SIZE (mode) / unroll;
  else
    loop_count = -1;

  /* Don't generate the loop if the loop count is 1.  */
  if (loop_count != 1)
    {
      top_label = gen_label_rtx ();
      out_label = gen_label_rtx ();
    }
  iter = gen_reg_rtx (iter_mode);

  /* SIZE = COUNT rounded down to a whole number of chunks.  */
  size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
			      NULL, 1, OPTAB_DIRECT);
  /* Those two should combine.  */
  if (piece_size == const1_rtx)
    {
      /* Byte-sized chunks: jump straight past the loop when SIZE is 0.  */
      emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
			       true, out_label);
      predict_jump (REG_BR_PROB_BASE * 10 / 100);
    }
  emit_move_insn (iter, const0_rtx);

  if (loop_count != 1)
    emit_label (top_label);

  /* TMP = current byte offset, widened for use in addresses.  */
  tmp = convert_modes (Pmode, iter_mode, iter, true);

  /* This assert could be relaxed - in this case we'll need to compute
     smallest power of two, containing in PIECE_SIZE_N and pass it to
     offset_address.  */
  gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
  destmem = offset_address (destmem, tmp, piece_size_n);
  destmem = adjust_address (destmem, mode, 0);

  if (!issetmem)
    {
      srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
      srcmem = adjust_address (srcmem, mode, 0);

      /* When unrolling for chips that reorder memory reads and writes,
	 we can save registers by using single temporary.
	 Also using 4 temporaries is overkill in 32bit mode.  */
      if (!TARGET_64BIT && 0)
	{
	  for (i = 0; i < unroll; i++)
	    {
	      if (i)
		{
		  destmem = adjust_address (copy_rtx (destmem), mode,
					    GET_MODE_SIZE (mode));
		  srcmem = adjust_address (copy_rtx (srcmem), mode,
					   GET_MODE_SIZE (mode));
		}
	      emit_move_insn (destmem, srcmem);
	    }
	}
      else
	{
	  /* Load every chunk into its own temporary first, then store
	     them all, so loads and stores can schedule independently.  */
	  rtx tmpreg[4];
	  gcc_assert (unroll <= 4);
	  for (i = 0; i < unroll; i++)
	    {
	      tmpreg[i] = gen_reg_rtx (mode);
	      if (i)
		srcmem = adjust_address (copy_rtx (srcmem), mode,
					 GET_MODE_SIZE (mode));
	      emit_move_insn (tmpreg[i], srcmem);
	    }
	  for (i = 0; i < unroll; i++)
	    {
	      if (i)
		destmem = adjust_address (copy_rtx (destmem), mode,
					  GET_MODE_SIZE (mode));
	      emit_move_insn (destmem, tmpreg[i]);
	    }
	}
    }
  else
    /* Memset: store VALUE into each chunk of the unrolled body.  */
    for (i = 0; i < unroll; i++)
      {
	if (i)
	  destmem = adjust_address (copy_rtx (destmem), mode,
				    GET_MODE_SIZE (mode));
	emit_move_insn (destmem, value);
      }

  /* ITER += PIECE_SIZE.  */
  tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
			     true, OPTAB_LIB_WIDEN);
  if (tmp != iter)
    emit_move_insn (iter, tmp);

  if (loop_count != 1)
    {
      /* Loop back while ITER < SIZE, attaching a branch probability
	 derived from EXPECTED_SIZE when one was supplied.  */
      emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
			       true, top_label);
      if (expected_size != -1)
	{
	  expected_size /= GET_MODE_SIZE (mode) * unroll;
	  if (expected_size == 0)
	    predict_jump (0);
	  else if (expected_size > REG_BR_PROB_BASE)
	    predict_jump (REG_BR_PROB_BASE - 1);
	  else
	    predict_jump (REG_BR_PROB_BASE
			  - (REG_BR_PROB_BASE + expected_size / 2)
			  / expected_size);
	}
      else
	predict_jump (REG_BR_PROB_BASE * 80 / 100);
    }
  /* Advance DESTPTR (and SRCPTR for copies) past the processed bytes.  */
  iter = ix86_zero_extend_to_Pmode (iter);
  tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
			     true, OPTAB_LIB_WIDEN);
  if (tmp != destptr)
    emit_move_insn (destptr, tmp);
  if (!issetmem)
    {
      tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
				 true, OPTAB_LIB_WIDEN);
      if (tmp != srcptr)
	emit_move_insn (srcptr, tmp);
    }
  if (loop_count != 1)
    emit_label (out_label);
}
8061 :
8062 : /* Divide COUNTREG by SCALE. */
8063 : static rtx
8064 16573 : scale_counter (rtx countreg, int scale)
8065 : {
8066 16573 : rtx sc;
8067 :
8068 16573 : if (scale == 1)
8069 : return countreg;
8070 10607 : if (CONST_INT_P (countreg))
8071 10591 : return GEN_INT (INTVAL (countreg) / scale);
8072 16 : gcc_assert (REG_P (countreg));
8073 :
8074 48 : sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
8075 32 : GEN_INT (exact_log2 (scale)),
8076 : NULL, 1, OPTAB_DIRECT);
8077 16 : return sc;
8078 : }
8079 :
/* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
   When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
   When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
   For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
   ORIG_VALUE is the original value passed to memset to fill the memory with.
   Other arguments have same meaning as for previous function.  */

static void
expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
			      rtx destptr, rtx srcptr, rtx value, rtx orig_value,
			      rtx count,
			      machine_mode mode, bool issetmem)
{
  rtx destexp;
  rtx srcexp;
  rtx countreg;
  HOST_WIDE_INT rounded_count;

  /* If possible, it is shorter to use rep movs.
     TODO: Maybe it is better to move this logic to decide_alg.  */
  if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
      && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
      && (!issetmem || orig_value == const0_rtx))
    mode = SImode;

  if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
    destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);

  /* Iteration count in units of MODE, zero-extended to Pmode.  */
  countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
						       GET_MODE_SIZE (mode)));
  /* DESTEXP is the destination pointer's final value:
     DESTPTR + COUNTREG * GET_MODE_SIZE (MODE).  */
  if (mode != QImode)
    {
      destexp = gen_rtx_ASHIFT (Pmode, countreg,
				GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
      destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
    }
  else
    destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
  /* Record the known store size for alias analysis, or drop a stale
     size when the count isn't constant.  */
  if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
    {
      rounded_count
	= ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
      destmem = shallow_copy_rtx (destmem);
      set_mem_size (destmem, rounded_count);
    }
  else if (MEM_SIZE_KNOWN_P (destmem))
    clear_mem_size (destmem);

  if (issetmem)
    {
      /* rep stos needs the fill value in a register of MODE.  */
      value = force_reg (mode, gen_lowpart (mode, value));
      emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
    }
  else
    {
      if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
	srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
      /* SRCEXP mirrors DESTEXP for the source pointer.  */
      if (mode != QImode)
	{
	  srcexp = gen_rtx_ASHIFT (Pmode, countreg,
				   GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
	  srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
	}
      else
	srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
      if (CONST_INT_P (count))
	{
	  rounded_count
	    = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
	  srcmem = shallow_copy_rtx (srcmem);
	  set_mem_size (srcmem, rounded_count);
	}
      else
	{
	  if (MEM_SIZE_KNOWN_P (srcmem))
	    clear_mem_size (srcmem);
	}
      emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
			      destexp, srcexp));
    }
}
8161 :
/* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
   DESTMEM.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.  */
static rtx
emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem, src = *srcmem, tempreg;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  piece_size = 1 << floor_log2 (size_to_move);
  while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
	 || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
    {
      gcc_assert (piece_size > 1);
      piece_size >>= 1;
    }

  /* Find the corresponding vector mode with the same size as MOVE_MODE.
     MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.).  */
  if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
    {
      int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
      if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
	  || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
	{
	  /* No usable vector mode: fall back to word-sized moves.  */
	  move_mode = word_mode;
	  piece_size = GET_MODE_SIZE (move_mode);
	  code = optab_handler (mov_optab, move_mode);
	}
    }
  gcc_assert (code != CODE_FOR_nothing);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
  src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* We move from memory to memory, so we'll need to do it via
	 a temporary register.  */
      tempreg = gen_reg_rtx (move_mode);
      emit_insn (GEN_FCN (code) (tempreg, src));
      emit_insn (GEN_FCN (code) (dst, tempreg));

      /* Bump both pointers past the piece just copied.  */
      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));
      emit_move_insn (srcptr,
		      plus_constant (Pmode, copy_rtx (srcptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
      src = adjust_automodify_address_nv (src, move_mode, srcptr,
					  piece_size);
    }

  /* Update DST and SRC rtx.  */
  *srcmem = src;
  return dst;
}
8230 :
8231 : /* Helper function for the string operations below. Dest VARIABLE whether
8232 : it is aligned to VALUE bytes. If true, jump to the label. */
8233 :
8234 : static rtx_code_label *
8235 39025 : ix86_expand_aligntest (rtx variable, int value, bool epilogue)
8236 : {
8237 39025 : rtx_code_label *label = gen_label_rtx ();
8238 39025 : rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
8239 39025 : if (GET_MODE (variable) == DImode)
8240 897 : emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
8241 : else
8242 38128 : emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
8243 39025 : emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
8244 : 1, label);
8245 39025 : if (epilogue)
8246 3 : predict_jump (REG_BR_PROB_BASE * 50 / 100);
8247 : else
8248 39022 : predict_jump (REG_BR_PROB_BASE * 90 / 100);
8249 39025 : return label;
8250 : }
8251 :
8252 :
/* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST.  */

static void
expand_cpymem_epilogue (rtx destmem, rtx srcmem,
			rtx destptr, rtx srcptr, rtx count, int max_size)
{
  rtx src, dest;
  /* Constant COUNT: the residual byte count is known exactly, so emit
     a straight-line move-by-pieces sequence.  */
  if (CONST_INT_P (count))
    {
      unsigned HOST_WIDE_INT countval = UINTVAL (count);
      unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
      unsigned int destalign = MEM_ALIGN (destmem);
      cfun->machine->by_pieces_in_use = true;
      move_by_pieces (destmem, srcmem, epilogue_size, destalign,
		      RETURN_BEGIN);
      cfun->machine->by_pieces_in_use = false;
      return;
    }
  /* Large variable residual: mask COUNT down and use a byte loop.  */
  if (max_size > 8)
    {
      count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
				   count, 1, OPTAB_DIRECT);
      expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
				     count, QImode, 1, 4, false);
      return;
    }

  /* When there are stringops, we can cheaply increase dest and src pointers.
     Otherwise we save code size by maintaining offset (zero is readily
     available from preceding rep operation) and using x86 addressing modes.
   */
  if (TARGET_SINGLE_STRINGOP)
    {
      /* Binary-decompose the residual: one conditional 4-, 2- and
	 1-byte movs (which also advance both pointers).  */
      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  src = change_address (srcmem, HImode, srcptr);
	  dest = change_address (destmem, HImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  src = change_address (srcmem, QImode, srcptr);
	  dest = change_address (destmem, QImode, destptr);
	  emit_insn (gen_strmov (destptr, dest, srcptr, src));
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
  else
    {
      /* No cheap string moves: keep the pointers fixed and track an
	 explicit OFFSET folded into the addressing modes instead.  */
      rtx offset = force_reg (Pmode, const0_rtx);
      rtx tmp;

      if (max_size > 4)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
	  src = change_address (srcmem, SImode, srcptr);
	  dest = change_address (destmem, SImode, destptr);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 2)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, HImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, HImode, tmp);
	  emit_move_insn (dest, src);
	  tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
				     true, OPTAB_LIB_WIDEN);
	  if (tmp != offset)
	    emit_move_insn (offset, tmp);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
      if (max_size > 1)
	{
	  rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
	  tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
	  src = change_address (srcmem, QImode, tmp);
	  tmp = gen_rtx_PLUS (Pmode, destptr, offset);
	  dest = change_address (destmem, QImode, tmp);
	  emit_move_insn (dest, src);
	  emit_label (label);
	  LABEL_NUSES (label) = 1;
	}
    }
}
8360 :
/* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
   with value PROMOTED_VAL.
   SRC is passed by pointer to be updated on return.
   Return value is updated DST.  */
static rtx
emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
	     HOST_WIDE_INT size_to_move)
{
  rtx dst = destmem;
  enum insn_code code;
  machine_mode move_mode;
  int piece_size, i;

  /* Find the widest mode in which we could perform moves.
     Start with the biggest power of 2 less than SIZE_TO_MOVE and half
     it until move of such size is supported.  */
  move_mode = GET_MODE (promoted_val);
  if (move_mode == VOIDmode)
    move_mode = QImode;
  if (size_to_move < GET_MODE_SIZE (move_mode))
    {
      /* The promoted value is wider than the region: narrow it to an
	 integer mode of exactly SIZE_TO_MOVE bytes.  */
      unsigned int move_bits = size_to_move * BITS_PER_UNIT;
      move_mode = int_mode_for_size (move_bits, 0).require ();
      promoted_val = gen_lowpart (move_mode, promoted_val);
    }
  piece_size = GET_MODE_SIZE (move_mode);
  code = optab_handler (mov_optab, move_mode);
  gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);

  dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);

  /* Emit moves.  We'll need SIZE_TO_MOVE/PIECE_SIZES moves.  */
  gcc_assert (size_to_move % piece_size == 0);

  for (i = 0; i < size_to_move; i += piece_size)
    {
      /* Word-sized or smaller pieces can use the strset pattern,
	 which advances DESTPTR itself.  */
      if (piece_size <= GET_MODE_SIZE (word_mode))
	{
	  emit_insn (gen_strset (destptr, dst, promoted_val));
	  dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					      piece_size);
	  continue;
	}

      /* Wider (vector) pieces: plain store, then bump DESTPTR.  */
      emit_insn (GEN_FCN (code) (dst, promoted_val));

      emit_move_insn (destptr,
		      plus_constant (Pmode, copy_rtx (destptr), piece_size));

      dst = adjust_automodify_address_nv (dst, move_mode, destptr,
					  piece_size);
    }

  /* Update DST rtx.  */
  return dst;
}
8417 : /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
8418 : static void
8419 311 : expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
8420 : rtx count, int max_size)
8421 : {
8422 622 : count = expand_simple_binop (counter_mode (count), AND, count,
8423 311 : GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
8424 311 : expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
8425 311 : gen_lowpart (QImode, value), count, QImode,
8426 : 1, max_size / 2, true);
8427 311 : }
8428 :
/* Callback routine for store_by_pieces.  Return the RTL of a register
   containing GET_MODE_SIZE (MODE) bytes in the RTL register op_p which
   is an integer or a word vector register.  If PREV_P isn't nullptr,
   it has the RTL info from the previous iteration.  */

static rtx
setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
			 fixed_size_mode mode)
{
  rtx target;
  by_pieces_prev *prev = (by_pieces_prev *) prev_p;
  /* Reuse the value produced for the previous piece when possible.  */
  if (prev)
    {
      rtx prev_op = prev->data;
      if (prev_op)
	{
	  machine_mode prev_mode = GET_MODE (prev_op);
	  /* Exact mode match: hand back the previous register as-is.  */
	  if (prev_mode == mode)
	    return prev_op;
	  /* Vector-to-vector with the same element mode: just re-view
	     the previous register through a subreg.  */
	  if (VECTOR_MODE_P (prev_mode)
	      && VECTOR_MODE_P (mode)
	      && GET_MODE_INNER (prev_mode) == GET_MODE_INNER (mode))
	    {
	      target = gen_rtx_SUBREG (mode, prev_op, 0);
	      return target;
	    }
	}
    }

  rtx op = (rtx) op_p;
  machine_mode op_mode = GET_MODE (op);

  /* A vector piece is requested; only QImode element vectors occur.  */
  if (VECTOR_MODE_P (mode))
    {
      gcc_assert (GET_MODE_INNER (mode) == QImode);

      unsigned int op_size = GET_MODE_SIZE (op_mode);
      unsigned int size = GET_MODE_SIZE (mode);
      unsigned int nunits;
      machine_mode vec_mode;
      if (op_size < size)
	{
	  /* If OP size is smaller than MODE size, duplicate it.  */
	  nunits = size / GET_MODE_SIZE (QImode);
	  vec_mode = mode_for_vector (QImode, nunits).require ();
	  nunits = size / op_size;
	  gcc_assert (SCALAR_INT_MODE_P (op_mode));
	  machine_mode dup_mode
	    = mode_for_vector (as_a <scalar_mode> (op_mode),
			       nunits).require ();
	  target = gen_reg_rtx (vec_mode);
	  op = gen_vec_duplicate (dup_mode, op);
	  rtx dup_op = gen_reg_rtx (dup_mode);
	  emit_move_insn (dup_op, op);
	  op = gen_rtx_SUBREG (vec_mode, dup_op, 0);
	  emit_move_insn (target, op);
	  return target;
	}
      /* OP is at least as wide as MODE: copy it into a QImode vector
	 register of OP's size, then narrow through a subreg if needed.  */
      nunits = op_size / GET_MODE_SIZE (QImode);
      vec_mode = mode_for_vector (QImode, nunits).require ();
      target = gen_reg_rtx (vec_mode);
      op = gen_rtx_SUBREG (vec_mode, op, 0);
      emit_move_insn (target, op);
      if (op_size == size)
	return target;

      rtx tmp = gen_reg_rtx (mode);
      target = gen_rtx_SUBREG (mode, target, 0);
      emit_move_insn (tmp, target);
      return tmp;
    }

  /* An integer piece is requested but OP is a word vector: extract
     the low word first.  */
  if (VECTOR_MODE_P (op_mode))
    {
      gcc_assert (GET_MODE_INNER (op_mode) == word_mode);
      target = gen_reg_rtx (word_mode);
      op = gen_rtx_SUBREG (word_mode, op, 0);
      emit_move_insn (target, op);
    }
  else
    target = op;

  if (mode == GET_MODE (target))
    return target;

  /* Narrow to the requested integer mode via a subreg copy.  */
  rtx tmp = gen_reg_rtx (mode);
  target = gen_rtx_SUBREG (mode, target, 0);
  emit_move_insn (tmp, target);
  return tmp;
}
8519 :
/* Output code to set at most count & (max_size - 1) bytes starting by DEST.  */
static void
expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
			rtx count, int max_size)
{
  rtx dest;

  /* Constant COUNT: residual size is known, emit straight-line
     store-by-pieces (preferring the vector-promoted value if any).  */
  if (CONST_INT_P (count))
    {
      unsigned HOST_WIDE_INT countval = UINTVAL (count);
      unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
      unsigned int destalign = MEM_ALIGN (destmem);
      cfun->machine->by_pieces_in_use = true;
      store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val,
		       vec_value ? vec_value : value, destalign, true,
		       RETURN_BEGIN);
      cfun->machine->by_pieces_in_use = false;
      return;
    }
  /* Large variable residual: fall back to a byte loop.  */
  if (max_size > 32)
    {
      expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
      return;
    }
  /* Otherwise binary-decompose the residual: one conditional store per
     power-of-two size, each guarded by an alignment test on COUNT.  */
  if (max_size > 16)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 8)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
      if (TARGET_64BIT)
	{
	  dest = change_address (destmem, DImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      else
	{
	  dest = change_address (destmem, SImode, destptr);
	  emit_insn (gen_strset (destptr, dest, value));
	  dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
	  emit_insn (gen_strset (destptr, dest, value));
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 4)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
      dest = change_address (destmem, SImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 2)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
      dest = change_address (destmem, HImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
  if (max_size > 1)
    {
      rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
      dest = change_address (destmem, QImode, destptr);
      emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }
}
8611 :
8612 : /* Decrease COUNTREG in place by VALUE, by emitting a single add of -VALUE. */
8613 : static void
8614 19 : ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
8615 : {
8616 19 : emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
8617 19 : }
8618 :
8619 : /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
8620 : DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
8621 : Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
8622 : ignored. Each emitted chunk is guarded by a runtime alignment test of
8623 : DESTPTR, and COUNT is decreased by each chunk's size. Return value is updated DESTMEM. */
8624 :
8625 : static rtx
8626 7 : expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
8627 : rtx destptr, rtx srcptr, rtx value,
8628 : rtx vec_value, rtx count, int align,
8629 : int desired_alignment, bool issetmem)
8630 : {
8631 7 : int i;
8632 35 : for (i = 1; i < desired_alignment; i <<= 1)
8633 : {
8634 28 : if (align <= i)
8635 : {
8636 19 : rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
8637 19 : if (issetmem)
8638 : {
8639 12 : if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
8640 2 : destmem = emit_memset (destmem, destptr, vec_value, i);
8641 : else
8642 4 : destmem = emit_memset (destmem, destptr, value, i);
8643 : }
8644 : else
8645 13 : destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
8646 19 : ix86_adjust_counter (count, i);
8647 19 : emit_label (label);
8648 19 : LABEL_NUSES (label) = 1;
8649 19 : set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
8650 : }
8651 : }
8652 7 : return destmem;
8653 : }
8654 :
8655 : /* Test if COUNT&SIZE is nonzero and if so, expand cpymem
8656 : or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
8657 : and jump to DONE_LABEL. */
8658 : static void
8659 31198 : expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
8660 : rtx destptr, rtx srcptr,
8661 : rtx value, rtx vec_value,
8662 : rtx count, int size,
8663 : rtx done_label, bool issetmem)
8664 : {
8665 31198 : rtx_code_label *label = ix86_expand_aligntest (count, size, false);
8666 31198 : machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
8667 31198 : rtx modesize;
8668 31198 : rtx scalar_value = value;
8669 31198 : int n;
8670 :
8671 : /* If we do not have vector value to copy, we must reduce size. */
8672 31198 : if (issetmem)
8673 : {
8674 3592 : if (!vec_value)
8675 : {
8676 7 : if (GET_MODE (value) == VOIDmode && size > 8)
8677 0 : mode = Pmode;
8678 21 : else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
8679 1 : mode = GET_MODE (value);
8680 : }
8681 : else
8682 3585 : mode = GET_MODE (vec_value), value = vec_value;
8683 : }
8684 : else
8685 : {
8686 : /* Choose appropriate vector mode. */
8687 27606 : if (size >= 32)
8688 6900 : switch (MOVE_MAX)
8689 : {
8690 0 : case 64:
8691 0 : if (size >= 64)
8692 : {
8693 : mode = V64QImode;
8694 : break;
8695 : }
8696 : /* FALLTHRU */
8697 0 : case 32:
8698 0 : mode = V32QImode;
8699 0 : break;
8700 : case 16:
8701 : mode = V16QImode;
8702 : break;
8703 : case 8:
8704 : mode = DImode;
8705 : break;
8706 0 : default:
8707 0 : gcc_unreachable ();
8708 : }
8709 20706 : else if (size >= 16)
8710 6900 : mode = TARGET_SSE ? V16QImode : DImode;
8711 27606 : srcmem = change_address (srcmem, mode, srcptr);
8712 : }
8713 34783 : if (issetmem && vec_value && GET_MODE_SIZE (mode) > size)
8714 : {
8715 : /* For memset with vector and the size is smaller than the vector
8716 : size, first try the narrower vector, otherwise, use the
8717 : original value. */
8718 1797 : machine_mode inner_mode = GET_MODE_INNER (mode);
8719 1797 : unsigned int nunits = size / GET_MODE_SIZE (inner_mode);
8720 1797 : if (nunits > 1)
8721 : {
8722 316 : mode = mode_for_vector (GET_MODE_INNER (mode),
8723 316 : nunits).require ();
8724 158 : value = gen_rtx_SUBREG (mode, value, 0);
8725 : }
8726 : else
8727 : {
8728 1639 : scalar_int_mode smode
8729 1639 : = smallest_int_mode_for_size (size * BITS_PER_UNIT).require ();
8730 4917 : gcc_assert (GET_MODE_SIZE (GET_MODE (scalar_value))
8731 : >= GET_MODE_SIZE (smode));
8732 1639 : mode = smode;
8733 1639 : if (GET_MODE (scalar_value) == mode)
8734 : value = scalar_value;
8735 : else
8736 745 : value = gen_rtx_SUBREG (mode, scalar_value, 0);
8737 : }
8738 : }
8739 31198 : destmem = change_address (destmem, mode, destptr);
8740 62396 : modesize = GEN_INT (GET_MODE_SIZE (mode));
8741 62396 : gcc_assert (GET_MODE_SIZE (mode) <= size);
8742 140382 : for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
8743 : {
8744 38993 : if (issetmem)
8745 4487 : emit_move_insn (destmem, gen_lowpart (mode, value));
8746 : else
8747 : {
8748 34506 : emit_move_insn (destmem, srcmem);
8749 69012 : srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8750 : }
8751 77986 : destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8752 : }
8753 :
8754 31198 : destmem = offset_address (destmem, count, 1);
8755 62396 : destmem = offset_address (destmem, GEN_INT (-2 * size),
8756 31198 : GET_MODE_SIZE (mode));
8757 31198 : if (!issetmem)
8758 : {
8759 27606 : srcmem = offset_address (srcmem, count, 1);
8760 55212 : srcmem = offset_address (srcmem, GEN_INT (-2 * size),
8761 27606 : GET_MODE_SIZE (mode));
8762 : }
8763 140382 : for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
8764 : {
8765 38993 : if (issetmem)
8766 4487 : emit_move_insn (destmem, gen_lowpart (mode, value));
8767 : else
8768 : {
8769 34506 : emit_move_insn (destmem, srcmem);
8770 69012 : srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8771 : }
8772 77986 : destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8773 : }
8774 31198 : emit_jump_insn (gen_jump (done_label));
8775 31198 : emit_barrier ();
8776 :
8777 31198 : emit_label (label);
8778 31198 : LABEL_NUSES (label) = 1;
8779 31198 : }
8780 :
8781 : /* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
8782 : and get ready for the main memcpy loop by copying initial DESIRED_ALIGN-ALIGN
8783 : bytes and last SIZE bytes adjusting DESTPTR/SRCPTR/COUNT in a way we can
8784 : proceed with a loop copying SIZE bytes at once. Do moves in MODE.
8785 : DONE_LABEL is a label after the whole copying sequence. The label is created
8786 : on demand if *DONE_LABEL is NULL.
8787 : MIN_SIZE is minimal size of block copied. This value gets adjusted for new
8788 : bounds after the initial copies.
8789 :
8790 : DESTMEM/SRCMEM are memory expressions pointing to the copied block,
8791 : DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicates whether
8792 : we will dispatch to a library call for large blocks.
8793 :
8794 : In pseudocode we do:
8795 :
8796 : if (COUNT < SIZE)
8797 : {
8798 : Assume that SIZE is 4. Bigger sizes are handled analogously
8799 : if (COUNT & 4)
8800 : {
8801 : copy 4 bytes from SRCPTR to DESTPTR
8802 : copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
8803 : goto done_label
8804 : }
8805 : if (!COUNT)
8806 : goto done_label;
8807 : copy 1 byte from SRCPTR to DESTPTR
8808 : if (COUNT & 2)
8809 : {
8810 : copy 2 bytes from SRCPTR to DESTPTR
8811 : copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
8812 : }
8813 : }
8814 : else
8815 : {
8816 : copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
8817 : copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
8818 :
8819 : OLD_DESTPTR = DESTPTR;
8820 : Align DESTPTR up to DESIRED_ALIGN
8821 : SRCPTR += DESTPTR - OLD_DESTPTR
8822 : COUNT -= DESTPTR - OLD_DESTPTR
8823 : if (DYNAMIC_CHECK)
8824 : Round COUNT down to multiple of SIZE
8825 : << optional caller supplied zero size guard is here >>
8826 : << optional caller supplied dynamic check is here >>
8827 : << caller supplied main copy loop is here >>
8828 : }
8829 : done_label:
8830 : */
8831 : static void
8832 11142 : expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
8833 : rtx *destptr, rtx *srcptr,
8834 : machine_mode mode,
8835 : rtx value, rtx vec_value,
8836 : rtx *count,
8837 : rtx_code_label **done_label,
8838 : int size,
8839 : int desired_align,
8840 : int align,
8841 : unsigned HOST_WIDE_INT *min_size,
8842 : bool dynamic_check,
8843 : bool issetmem)
8844 : {
8845 11142 : rtx_code_label *loop_label = NULL, *label;
8846 11142 : int n;
8847 11142 : rtx modesize;
8848 11142 : int prolog_size = 0;
8849 11142 : rtx mode_value;
8850 :
8851 : /* Choose the proper value to copy. */
8852 11142 : if (issetmem && VECTOR_MODE_P (mode))
8853 : mode_value = vec_value;
8854 : else
8855 11142 : mode_value = value;
8856 22284 : gcc_assert (GET_MODE_SIZE (mode) <= size);
8857 :
8858 : /* See if block is big or small, handle small blocks. */
8859 11142 : if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
8860 : {
8861 7805 : int size2 = size;
8862 7805 : loop_label = gen_label_rtx ();
8863 :
8864 7805 : if (!*done_label)
8865 7805 : *done_label = gen_label_rtx ();
8866 :
8867 7805 : emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
8868 : 1, loop_label);
8869 7805 : size2 >>= 1;
8870 :
8871 : /* Handle sizes > 3. */
8872 39003 : for (;size2 > 2; size2 >>= 1)
8873 31198 : expand_small_cpymem_or_setmem (destmem, srcmem,
8874 : *destptr, *srcptr,
8875 : value, vec_value,
8876 : *count,
8877 : size2, *done_label, issetmem);
8878 : /* Nothing to copy? Jump to DONE_LABEL if so */
8879 7805 : emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
8880 : 1, *done_label);
8881 :
8882 : /* Do a byte copy. */
8883 7805 : destmem = change_address (destmem, QImode, *destptr);
8884 7805 : if (issetmem)
8885 899 : emit_move_insn (destmem, gen_lowpart (QImode, value));
8886 : else
8887 : {
8888 6906 : srcmem = change_address (srcmem, QImode, *srcptr);
8889 6906 : emit_move_insn (destmem, srcmem);
8890 : }
8891 :
8892 : /* Handle sizes 2 and 3. */
8893 7805 : label = ix86_expand_aligntest (*count, 2, false);
8894 7805 : destmem = change_address (destmem, HImode, *destptr);
8895 7805 : destmem = offset_address (destmem, *count, 1);
8896 7805 : destmem = offset_address (destmem, GEN_INT (-2), 2);
8897 7805 : if (issetmem)
8898 899 : emit_move_insn (destmem, gen_lowpart (HImode, value));
8899 : else
8900 : {
8901 6906 : srcmem = change_address (srcmem, HImode, *srcptr);
8902 6906 : srcmem = offset_address (srcmem, *count, 1);
8903 6906 : srcmem = offset_address (srcmem, GEN_INT (-2), 2);
8904 6906 : emit_move_insn (destmem, srcmem);
8905 : }
8906 :
8907 7805 : emit_label (label);
8908 7805 : LABEL_NUSES (label) = 1;
8909 7805 : emit_jump_insn (gen_jump (*done_label));
8910 7805 : emit_barrier ();
8911 : }
8912 : else
8913 3337 : gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
8914 : || UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);
8915 :
8916 : /* Start the main copy/set sequence for COUNT >= SIZE. */
8917 7805 : if (loop_label)
8918 : {
8919 7805 : emit_label (loop_label);
8920 7805 : LABEL_NUSES (loop_label) = 1;
8921 : }
8922 :
8923 : /* Copy first desired_align bytes. */
8924 11142 : if (!issetmem)
8925 8549 : srcmem = change_address (srcmem, mode, *srcptr);
8926 11142 : destmem = change_address (destmem, mode, *destptr);
8927 11142 : modesize = GEN_INT (GET_MODE_SIZE (mode));
8928 22305 : for (n = 0; prolog_size < desired_align - align; n++)
8929 : {
8930 21 : if (issetmem)
8931 3 : emit_move_insn (destmem, mode_value);
8932 : else
8933 : {
8934 18 : emit_move_insn (destmem, srcmem);
8935 36 : srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
8936 : }
8937 42 : destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
8938 42 : prolog_size += GET_MODE_SIZE (mode);
8939 : }
8940 :
8941 :
8942 : /* Copy last SIZE bytes. */
8943 11142 : destmem = offset_address (destmem, *count, 1);
8944 11142 : destmem = offset_address (destmem,
8945 11142 : GEN_INT (-size - prolog_size),
8946 : 1);
8947 11142 : if (issetmem)
8948 2593 : emit_move_insn (destmem, mode_value);
8949 : else
8950 : {
8951 8549 : srcmem = offset_address (srcmem, *count, 1);
8952 8549 : srcmem = offset_address (srcmem,
8953 : GEN_INT (-size - prolog_size),
8954 : 1);
8955 8549 : emit_move_insn (destmem, srcmem);
8956 : }
8957 87504 : for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
8958 : {
8959 32610 : destmem = offset_address (destmem, modesize, 1);
8960 32610 : if (issetmem)
8961 7587 : emit_move_insn (destmem, mode_value);
8962 : else
8963 : {
8964 25023 : srcmem = offset_address (srcmem, modesize, 1);
8965 25023 : emit_move_insn (destmem, srcmem);
8966 : }
8967 : }
8968 :
8969 : /* Align destination. */
8970 11142 : if (desired_align > 1 && desired_align > align)
8971 : {
8972 21 : rtx saveddest = *destptr;
8973 :
8974 21 : gcc_assert (desired_align <= size);
8975 : /* Align destptr up, place it to new register. */
8976 21 : *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
8977 : GEN_INT (prolog_size),
8978 : NULL_RTX, 1, OPTAB_DIRECT);
8979 21 : if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
8980 21 : REG_POINTER (*destptr) = 1;
8981 21 : *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
8982 21 : GEN_INT (-desired_align),
8983 : *destptr, 1, OPTAB_DIRECT);
8984 : /* See how many bytes we skipped. */
8985 21 : saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
8986 : *destptr,
8987 : NULL_RTX, 1, OPTAB_DIRECT);
8988 : /* Adjust srcptr and count. */
8989 21 : if (!issetmem)
8990 18 : *srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
8991 : saveddest, *srcptr, 1, OPTAB_DIRECT);
8992 21 : *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
8993 : saveddest, *count, 1, OPTAB_DIRECT);
8994 : /* We copied at most size + prolog_size. */
8995 21 : if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
8996 14 : *min_size
8997 14 : = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
8998 : else
8999 7 : *min_size = 0;
9000 :
9001 : /* Our loops always round down the block size, but for dispatch to
9002 : library we need precise value. */
9003 21 : if (dynamic_check)
9004 21 : *count = expand_simple_binop (GET_MODE (*count), AND, *count,
9005 : GEN_INT (-size), *count, 1, OPTAB_DIRECT);
9006 : }
9007 : else
9008 : {
9009 11121 : gcc_assert (prolog_size == 0);
9010 : /* Decrease count, so we won't end up copying last word twice. */
9011 11121 : if (!CONST_INT_P (*count))
9012 7805 : *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
9013 : constm1_rtx, *count, 1, OPTAB_DIRECT);
9014 : else
9015 3316 : *count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
9016 : (unsigned HOST_WIDE_INT)size));
9017 11121 : if (*min_size)
9018 9936 : *min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
9019 : }
9020 11142 : }
9021 :
9022 :
9023 : /* This function is like expand_set_or_cpymem_prologue above, except here we
9024 : know how many bytes need to be copied. That allows us to update alignment not
9025 : only of DST, which is returned, but also of SRC, which is passed as a pointer
9026 : for that reason. ALIGN_BYTES is the number of bytes to process here. */
9027 : static rtx
9028 0 : expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
9029 : rtx srcreg, rtx value, rtx vec_value,
9030 : int desired_align, int align_bytes,
9031 : bool issetmem)
9032 : {
9033 0 : rtx src = NULL;
9034 0 : rtx orig_dst = dst;
9035 0 : rtx orig_src = NULL;
9036 0 : int piece_size = 1;
9037 0 : int copied_bytes = 0;
9038 :
9039 0 : if (!issetmem)
9040 : {
9041 0 : gcc_assert (srcp != NULL);
9042 0 : src = *srcp;
9043 0 : orig_src = src;
9044 : }
9045 :
9046 0 : for (piece_size = 1;
9047 0 : piece_size <= desired_align && copied_bytes < align_bytes;
9048 0 : piece_size <<= 1)
9049 : {
9050 0 : if (align_bytes & piece_size)
9051 : {
9052 0 : if (issetmem)
9053 : {
9054 0 : if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
9055 0 : dst = emit_memset (dst, destreg, vec_value, piece_size);
9056 : else
9057 0 : dst = emit_memset (dst, destreg, value, piece_size);
9058 : }
9059 : else
9060 0 : dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
9061 0 : copied_bytes += piece_size;
9062 : }
9063 : }
9064 0 : if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
9065 0 : set_mem_align (dst, desired_align * BITS_PER_UNIT);
9066 0 : if (MEM_SIZE_KNOWN_P (orig_dst))
9067 0 : set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);
9068 :
9069 0 : if (!issetmem)
9070 : {
9071 0 : int src_align_bytes = get_mem_align_offset (src, desired_align
9072 : * BITS_PER_UNIT);
9073 0 : if (src_align_bytes >= 0)
9074 0 : src_align_bytes = desired_align - src_align_bytes;
9075 0 : if (src_align_bytes >= 0)
9076 : {
9077 : unsigned int src_align;
9078 0 : for (src_align = desired_align; src_align >= 2; src_align >>= 1)
9079 : {
9080 0 : if ((src_align_bytes & (src_align - 1))
9081 0 : == (align_bytes & (src_align - 1)))
9082 : break;
9083 : }
9084 0 : if (src_align > (unsigned int) desired_align)
9085 : src_align = desired_align;
9086 0 : if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
9087 0 : set_mem_align (src, src_align * BITS_PER_UNIT);
9088 : }
9089 0 : if (MEM_SIZE_KNOWN_P (orig_src))
9090 0 : set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
9091 0 : *srcp = src;
9092 : }
9093 :
9094 0 : return dst;
9095 : }
9096 :
9097 : /* Return true if ALG can be used in current context. DST_AS/SRC_AS are the
9098 : address spaces of the operands. Assume we expand memset if MEMSET is true. */
9099 : static bool
9100 844496 : alg_usable_p (enum stringop_alg alg, bool memset,
9101 : addr_space_t dst_as, addr_space_t src_as)
9102 : {
9103 844496 : if (alg == no_stringop)
9104 : return false;
9105 : /* It is not possible to use a library call if we have non-default
9106 : address space. We can do better than the generic byte-at-a-time
9107 : loop, used as a fallback. */
9108 844496 : if (alg == libcall &&
9109 473212 : !(ADDR_SPACE_GENERIC_P (dst_as) && ADDR_SPACE_GENERIC_P (src_as)))
9110 : return false;
9111 844489 : if (alg == vector_loop)
9112 373071 : return TARGET_SSE || TARGET_AVX;
9113 : /* Algorithms using the rep prefix want at least edi and ecx;
9114 : additionally, memset wants eax and memcpy wants esi. Don't
9115 : consider such algorithms if the user has appropriated those
9116 : registers for their own purposes, or if we have the destination
9117 : in the non-default address space, since string insns cannot
9118 : override the destination segment. */
9119 657922 : if (alg == rep_prefix_1_byte
9120 : || alg == rep_prefix_4_byte
9121 657922 : || alg == rep_prefix_8_byte)
9122 : {
9123 34977 : if (fixed_regs[CX_REG]
9124 34973 : || fixed_regs[DI_REG]
9125 34969 : || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG])
9126 34965 : || !ADDR_SPACE_GENERIC_P (dst_as)
9127 69942 : || !(ADDR_SPACE_GENERIC_P (src_as) || Pmode == word_mode))
9128 12 : return false;
9129 : }
9130 : return true;
9131 : }
9132 :
9133 : /* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation. Sets *NOALIGN and *DYNAMIC_CHECK (-1 unless a runtime size check with libcall dispatch is needed). RECUR is true on the recursive call used to pick a fallback algorithm for medium-sized blocks. */
9134 : static enum stringop_alg
9135 167127 : decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
9136 : unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
9137 : bool memset, bool zero_memset, addr_space_t dst_as,
9138 : addr_space_t src_as, int *dynamic_check, bool *noalign, bool recur)
9139 : {
9140 167127 : const struct stringop_algs *algs;
9141 167127 : bool optimize_for_speed;
9142 167127 : int max = 0;
9143 167127 : const struct processor_costs *cost;
9144 167127 : int i;
9145 167127 : bool any_alg_usable_p = false;
9146 :
9147 167127 : *noalign = false;
9148 167127 : *dynamic_check = -1;
9149 :
9150 : /* Even if the string operation call is cold, we still might spend a lot
9151 : of time processing large blocks. */
9152 167127 : if (optimize_function_for_size_p (cfun)
9153 167127 : || (optimize_insn_for_size_p ()
9154 10210 : && (max_size < 256
9155 3753 : || (expected_size != -1 && expected_size < 256))))
9156 : optimize_for_speed = false;
9157 : else
9158 149694 : optimize_for_speed = true;
9159 :
9160 149694 : cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
9161 167127 : if (memset)
9162 49155 : algs = &cost->memset[TARGET_64BIT != 0];
9163 : else
9164 126866 : algs = &cost->memcpy[TARGET_64BIT != 0];
9165 :
9166 : /* See maximal size for user defined algorithm. */
9167 835635 : for (i = 0; i < MAX_STRINGOP_ALGS; i++)
9168 : {
9169 668508 : enum stringop_alg candidate = algs->size[i].alg;
9170 668508 : bool usable = alg_usable_p (candidate, memset, dst_as, src_as);
9171 668508 : any_alg_usable_p |= usable;
9172 :
9173 668508 : if (candidate != libcall && candidate && usable)
9174 316754 : max = algs->size[i].max;
9175 : }
9176 :
9177 : /* If expected size is not known but max size is small enough
9178 : so inline version is a win, set expected size into
9179 : the range. */
9180 167127 : if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
9181 37261 : && expected_size == -1)
9182 19542 : expected_size = min_size / 2 + max_size / 2;
9183 :
9184 : /* If user specified the algorithm, honor it if possible. */
9185 167127 : if (ix86_stringop_alg != no_stringop
9186 167127 : && alg_usable_p (ix86_stringop_alg, memset, dst_as, src_as))
9187 : return ix86_stringop_alg;
9188 : /* rep; movq or rep; movl is the smallest variant. */
9189 167015 : else if (!optimize_for_speed)
9190 : {
9191 17350 : *noalign = true;
9192 17350 : if (!count || (count & 3) || (memset && !zero_memset))
9193 6664 : return alg_usable_p (rep_prefix_1_byte, memset, dst_as, src_as)
9194 6664 : ? rep_prefix_1_byte : loop_1_byte;
9195 : else
9196 10686 : return alg_usable_p (rep_prefix_4_byte, memset, dst_as, src_as)
9197 10686 : ? rep_prefix_4_byte : loop;
9198 : }
9199 : /* Very tiny blocks are best handled via the loop, REP is expensive to
9200 : setup. */
9201 149665 : else if (expected_size != -1 && expected_size < 4)
9202 : return loop_1_byte;
9203 146759 : else if (expected_size != -1)
9204 : {
9205 : enum stringop_alg alg = libcall;
9206 : bool alg_noalign = false;
9207 182271 : for (i = 0; i < MAX_STRINGOP_ALGS; i++)
9208 : {
9209 : /* We get here if the algorithms that were not libcall-based
9210 : were rep-prefix based and we are unable to use rep prefixes
9211 : based on global register usage. Break out of the loop and
9212 : use the heuristic below. */
9213 179338 : if (algs->size[i].max == 0)
9214 : break;
9215 179338 : if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
9216 : {
9217 75802 : enum stringop_alg candidate = algs->size[i].alg;
9218 :
9219 75802 : if (candidate != libcall
9220 75802 : && alg_usable_p (candidate, memset, dst_as, src_as))
9221 : {
9222 21077 : alg = candidate;
9223 21077 : alg_noalign = algs->size[i].noalign;
9224 : }
9225 : /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
9226 : last non-libcall inline algorithm. */
9227 75802 : if (TARGET_INLINE_ALL_STRINGOPS)
9228 : {
9229 : /* When the current size is best to be copied by a libcall,
9230 : but we are still forced to inline, run the heuristic below
9231 : that will pick code for medium sized blocks. */
9232 10992 : if (alg != libcall)
9233 : {
9234 5117 : *noalign = alg_noalign;
9235 5117 : return alg;
9236 : }
9237 5875 : else if (!any_alg_usable_p)
9238 : break;
9239 : }
9240 64810 : else if (alg_usable_p (candidate, memset, dst_as, src_as)
9241 64810 : && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
9242 22 : && candidate == rep_prefix_1_byte
9243 : /* NB: If min_size != max_size, size is
9244 : unknown. */
9245 22 : && min_size != max_size))
9246 : {
9247 64791 : *noalign = algs->size[i].noalign;
9248 64791 : return candidate;
9249 : }
9250 : }
9251 : }
9252 : }
9253 : /* When asked to inline the call anyway, try to pick meaningful choice.
9254 : We look for maximal size of block that is faster to copy by hand and
9255 : take blocks of at most of that size guessing that average size will
9256 : be roughly half of the block.
9257 :
9258 : If this turns out to be bad, we might simply specify the preferred
9259 : choice in ix86_costs. */
9260 72622 : if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
9261 76857 : && (algs->unknown_size == libcall
9262 0 : || !alg_usable_p (algs->unknown_size, memset, dst_as, src_as)))
9263 : {
9264 4235 : enum stringop_alg alg;
9265 4235 : HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;
9266 :
9267 : /* If there aren't any usable algorithms or if recursing already,
9268 : then recursing on smaller sizes or same size isn't going to
9269 : find anything. Just return the simple byte-at-a-time copy loop. */
9270 4235 : if (!any_alg_usable_p || recur)
9271 : {
9272 : /* Pick something reasonable. */
9273 0 : if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
9274 0 : *dynamic_check = 128;
9275 0 : return loop_1_byte;
9276 : }
9277 4235 : alg = decide_alg (count, new_expected_size, min_size, max_size,
9278 : memset, zero_memset, dst_as, src_as,
9279 : dynamic_check, noalign, true);
9280 4235 : gcc_assert (*dynamic_check == -1);
9281 4235 : if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
9282 8 : *dynamic_check = max;
9283 : else
9284 4227 : gcc_assert (alg != libcall);
9285 4235 : return alg;
9286 : }
9287 :
9288 : /* Try to use some reasonable fallback algorithm. Note that for
9289 : non-default address spaces we default to a loop instead of
9290 : a libcall. */
9291 :
9292 72616 : bool have_as = !(ADDR_SPACE_GENERIC_P (dst_as)
9293 : && ADDR_SPACE_GENERIC_P (src_as));
9294 :
9295 72616 : return (alg_usable_p (algs->unknown_size, memset, dst_as, src_as)
9296 72616 : ? algs->unknown_size : have_as ? loop : libcall);
9297 : }
9298 :
9299 : /* Decide on alignment. We know that the operand is already aligned to ALIGN
9300 : (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). Returns 0 for a libcall or a VOIDmode MOVE_MODE. */
9301 : static int
9302 35285 : decide_alignment (int align,
9303 : enum stringop_alg alg,
9304 : int expected_size,
9305 : machine_mode move_mode)
9306 : {
9307 35285 : int desired_align = 0;
9308 :
9309 35285 : gcc_assert (alg != no_stringop);
9310 :
9311 35285 : if (alg == libcall)
9312 : return 0;
9313 35285 : if (move_mode == VOIDmode)
9314 : return 0;
9315 :
9316 35285 : desired_align = GET_MODE_SIZE (move_mode);
9317 : /* PentiumPro has special logic triggering for 8 byte aligned blocks,
9318 : copying whole cacheline at once. */
9319 35285 : if (TARGET_CPU_P (PENTIUMPRO)
9320 0 : && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
9321 35285 : desired_align = 8;
9322 :
9323 35285 : if (optimize_size)
9324 9681 : desired_align = 1;
9325 35285 : if (desired_align < align)
9326 : desired_align = align;
9327 35285 : if (expected_size != -1 && expected_size < 4)
9328 0 : desired_align = align;
9329 :
9330 : return desired_align;
9331 : }
9332 :
9333 :
9334 : /* Helper function for memset. For QImode value 0xXY produce
9335 : 0xXYXYXYXY of the width specified by MODE. This is essentially
9336 : a * 0x10101010, but we can do slightly better than
9337 : synth_mult by unwinding the sequence by hand on CPUs with
9338 : slow multiply. */
9339 : static rtx
9340 16798 : promote_duplicated_reg (machine_mode mode, rtx val)
9341 : {
9342 16798 : if (val == const0_rtx)
9343 14992 : return copy_to_mode_reg (mode, CONST0_RTX (mode));
9344 :
9345 1806 : machine_mode valmode = GET_MODE (val);
9346 1806 : if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
9347 : {
9348 : /* Duplicate the scalar value for integer vector. */
9349 1487 : gcc_assert ((val == const0_rtx || val == constm1_rtx)
9350 : || GET_MODE_INNER (mode) == valmode);
9351 755 : rtx dup = gen_reg_rtx (mode);
9352 755 : bool ok = ix86_expand_vector_init_duplicate (false, mode, dup,
9353 : val);
9354 755 : gcc_assert (ok);
9355 : return dup;
9356 : }
9357 :
9358 1051 : rtx tmp;
9359 1051 : int nops = mode == DImode ? 3 : 2;
9360 :
9361 38 : gcc_assert (mode == SImode || mode == DImode);
9362 1051 : if (CONST_INT_P (val))
9363 : {
9364 762 : HOST_WIDE_INT v = INTVAL (val) & 255;
9365 :
9366 762 : v |= v << 8;
9367 762 : v |= v << 16;
9368 762 : if (mode == DImode)
9369 736 : v |= (v << 16) << 16;
9370 762 : return copy_to_mode_reg (mode, gen_int_mode (v, mode));
9371 : }
9372 :
9373 289 : if (valmode == VOIDmode)
9374 : valmode = QImode;
9375 289 : if (valmode != QImode)
9376 0 : val = gen_lowpart (QImode, val);
9377 289 : if (mode == QImode)
9378 : return val;
9379 289 : if (!TARGET_PARTIAL_REG_STALL)
9380 289 : nops--;
9381 289 : if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
9382 289 : + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
9383 289 : <= (ix86_cost->shift_const + ix86_cost->add) * nops
9384 289 : + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
9385 : {
9386 289 : rtx reg = convert_modes (mode, QImode, val, true);
9387 289 : tmp = promote_duplicated_reg (mode, const1_rtx);
9388 289 : return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
9389 289 : OPTAB_DIRECT);
9390 : }
9391 : else
9392 : {
9393 0 : rtx reg = convert_modes (mode, QImode, val, true);
9394 :
9395 0 : if (!TARGET_PARTIAL_REG_STALL)
9396 0 : emit_insn (gen_insv_1 (mode, reg, reg));
9397 : else
9398 : {
9399 0 : tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
9400 : NULL, 1, OPTAB_DIRECT);
9401 0 : reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
9402 : OPTAB_DIRECT);
9403 : }
9404 0 : tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
9405 : NULL, 1, OPTAB_DIRECT);
9406 0 : reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
9407 0 : if (mode == SImode)
9408 : return reg;
9409 0 : tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
9410 : NULL, 1, OPTAB_DIRECT);
9411 0 : reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
9412 0 : return reg;
9413 : }
9414 : }
9415 :
9416 : /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
9417 : be needed by main loop copying SIZE_NEEDED chunks and prologue getting
9418 : alignment from ALIGN to DESIRED_ALIGN. Return VAL itself if a byte suffices. */
9419 : static rtx
9420 12439 : promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
9421 : int align)
9422 : {
9423 12439 : rtx promoted_val;
9424 :
9425 12439 : if (TARGET_64BIT
9426 10964 : && (size_needed > 4 || (desired_align > align && desired_align > 4)))
9427 4511 : promoted_val = promote_duplicated_reg (DImode, val);
9428 7928 : else if (size_needed > 2 || (desired_align > align && desired_align > 2))
9429 6121 : promoted_val = promote_duplicated_reg (SImode, val);
9430 1807 : else if (size_needed > 1 || (desired_align > align && desired_align > 1))
9431 0 : promoted_val = promote_duplicated_reg (HImode, val);
9432 : else
9433 : promoted_val = val;
9434 :
9435 12439 : return promoted_val;
9436 : }
9437 :
9438 : /* Copy the address to a Pmode register. This is used for x32 to
9439 : truncate DImode TLS address to a SImode register. The result is marked with REG_POINTER. */
9440 :
9441 : static rtx
9442 70367 : ix86_copy_addr_to_reg (rtx addr)
9443 : {
9444 70367 : rtx reg;
9445 74954 : if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
9446 : {
9447 70367 : reg = copy_addr_to_reg (addr);
9448 70367 : REG_POINTER (reg) = 1;
9449 70367 : return reg;
9450 : }
9451 : else
9452 : {
9453 0 : gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
9454 0 : reg = copy_to_mode_reg (DImode, addr);
9455 0 : REG_POINTER (reg) = 1;
9456 0 : return gen_rtx_SUBREG (SImode, reg, 0);
9457 : }
9458 : }
9459 :
9460 : /* Expand string move (memcpy) or store (memset) operation. Use i386 string
9461 : operations when profitable. The code depends upon architecture, block size
9462 : and alignment, but always has one of the following overall structures:
9463 :
9464 : Aligned move sequence:
9465 :
9466 : 1) Prologue guard: Conditional that jumps up to epilogues for small
9467 : blocks that can be handled by epilogue alone. This is faster
9468 : but also needed for correctness, since prologue assume the block
9469 : is larger than the desired alignment.
9470 :
9471 : Optional dynamic check for size and libcall for large
9472 : blocks is emitted here too, with -minline-stringops-dynamically.
9473 :
9474 : 2) Prologue: copy first few bytes in order to get destination
9475 : aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
9476 : than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
9477 : copied. We emit either a jump tree on power of two sized
9478 : blocks, or a byte loop.
9479 :
9480 : 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
9481 : with specified algorithm.
9482 :
9483 : 4) Epilogue: code copying tail of the block that is too small to be
9484 : handled by main body (or up to size guarded by prologue guard).
9485 :
9486 : Misaligned move sequence
9487 :
9488 : 1) missaligned move prologue/epilogue containing:
9489 : a) Prologue handling small memory blocks and jumping to done_label
9490 : (skipped if blocks are known to be large enough)
9491 : b) Signle move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
9492 : needed by single possibly misaligned move
9493 : (skipped if alignment is not needed)
9494 : c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
9495 :
9496 : 2) Zero size guard dispatching to done_label, if needed
9497 :
9498 : 3) dispatch to library call, if needed,
9499 :
9500 : 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
9501 : with specified algorithm. */
bool
ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
			   rtx align_exp, rtx expected_align_exp,
			   rtx expected_size_exp, rtx min_size_exp,
			   rtx max_size_exp, rtx probable_max_size_exp,
			   bool issetmem)
{
  rtx destreg;
  rtx srcreg = NULL;
  rtx_code_label *label = NULL;
  rtx tmp;
  rtx_code_label *jump_around_label = NULL;
  HOST_WIDE_INT align = 1;
  unsigned HOST_WIDE_INT count = 0;
  HOST_WIDE_INT expected_size = -1;
  int size_needed = 0, epilogue_size_needed;
  int desired_align = 0, align_bytes = 0;
  enum stringop_alg alg;
  rtx promoted_val = NULL;
  rtx vec_promoted_val = NULL;
  bool force_loopy_epilogue = false;
  int dynamic_check;
  bool need_zero_guard = false;
  bool noalign;
  machine_mode move_mode = VOIDmode;
  int unroll_factor = 1;
  /* TODO: Once value ranges are available, fill in proper data.  */
  unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
  unsigned HOST_WIDE_INT max_size = HOST_WIDE_INT_M1U;
  unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;
  bool misaligned_prologue_used = false;
  addr_space_t dst_as, src_as = ADDR_SPACE_GENERIC;

  if (CONST_INT_P (align_exp))
    align = INTVAL (align_exp);
  /* i386 can do misaligned access on reasonably increased cost.  */
  if (CONST_INT_P (expected_align_exp)
      && INTVAL (expected_align_exp) > align)
    align = INTVAL (expected_align_exp);
  /* ALIGN is the minimum of destination and source alignment, but we care here
     just about destination alignment.  */
  else if (!issetmem
	   && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
    align = MEM_ALIGN (dst) / BITS_PER_UNIT;

  if (CONST_INT_P (count_exp))
    {
      min_size = max_size = probable_max_size = count = expected_size
	= INTVAL (count_exp);
      /* When COUNT is 0, there is nothing to do.  */
      if (!count)
	return true;
    }
  else
    {
      if (min_size_exp)
	min_size = INTVAL (min_size_exp);
      if (max_size_exp)
	max_size = INTVAL (max_size_exp);
      if (probable_max_size_exp)
	probable_max_size = INTVAL (probable_max_size_exp);
      if (CONST_INT_P (expected_size_exp))
	expected_size = INTVAL (expected_size_exp);
    }

  /* Make sure we don't need to care about overflow later on.  */
  if (count > (HOST_WIDE_INT_1U << 30))
    return false;

  dst_as = MEM_ADDR_SPACE (dst);
  if (!issetmem)
    src_as = MEM_ADDR_SPACE (src);

  /* Step 0: Decide on preferred algorithm, desired alignment and
     size of chunks to be copied by main loop.  */
  alg = decide_alg (count, expected_size, min_size, probable_max_size,
		    issetmem, issetmem && val_exp == const0_rtx,
		    dst_as, src_as, &dynamic_check, &noalign, false);

  if (dump_file)
    fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
	     stringop_alg_names[alg]);

  /* ALG == libcall means inline expansion is not profitable; fall back
     to the library call by telling the caller we did nothing.  */
  if (alg == libcall)
    return false;
  gcc_assert (alg != no_stringop);

  /* A variable count must live in a register for the guards below.  */
  if (!count)
    count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
  destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
  if (!issetmem)
    srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));

  bool aligned_dstmem = false;
  unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
  bool single_insn_p = count && count <= nunits;
  if (single_insn_p)
    {
      /* If it can be done with a single instruction, use vector
	 instruction and don't align destination.  */
      alg = vector_loop;
      noalign = true;
      dynamic_check = -1;
    }

  /* Map the chosen algorithm to the chunk mode and unroll factor of
     its main loop.  */
  unroll_factor = 1;
  move_mode = word_mode;
  switch (alg)
    {
    case libcall:
    case no_stringop:
    case last_alg:
      gcc_unreachable ();
    case loop_1_byte:
      need_zero_guard = true;
      move_mode = QImode;
      break;
    case loop:
      need_zero_guard = true;
      break;
    case unrolled_loop:
      need_zero_guard = true;
      unroll_factor = (TARGET_64BIT ? 4 : 2);
      break;
    case vector_loop:
      need_zero_guard = true;
      unroll_factor = 4;
      /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes.  */
      nunits /= GET_MODE_SIZE (word_mode);
      if (nunits > 1)
	{
	  move_mode = mode_for_vector (word_mode, nunits).require ();
	  gcc_assert (optab_handler (mov_optab, move_mode)
		      != CODE_FOR_nothing);
	}
      break;
    case rep_prefix_8_byte:
      move_mode = DImode;
      break;
    case rep_prefix_4_byte:
      move_mode = SImode;
      break;
    case rep_prefix_1_byte:
      move_mode = QImode;
      break;
    }
  size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
  epilogue_size_needed = size_needed;

  /* If we are going to call any library calls conditionally, make sure any
     pending stack adjustment happen before the first conditional branch,
     otherwise they will be emitted before the library call only and won't
     happen from the other branches.  */
  if (dynamic_check != -1)
    do_pending_stack_adjust ();

  desired_align = decide_alignment (align, alg, expected_size, move_mode);
  if (!TARGET_ALIGN_STRINGOPS || noalign)
    align = desired_align;

  /* Step 1: Prologue guard.  */

  /* Alignment code needs count to be in register.  */
  if (CONST_INT_P (count_exp) && desired_align > align)
    {
      if (INTVAL (count_exp) > desired_align
	  && INTVAL (count_exp) > size_needed)
	{
	  align_bytes
	    = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
	  if (align_bytes <= 0)
	    align_bytes = 0;
	  else
	    align_bytes = desired_align - align_bytes;
	}
      if (align_bytes == 0)
	count_exp = force_reg (counter_mode (count_exp), count_exp);
    }
  gcc_assert (desired_align >= 1 && align >= 1);

  if (!single_insn_p)
    {
      /* Misaligned move sequences handle both prologue and epilogue
	 at once.  Default code generation results in a smaller code
	 for large alignments and also avoids redundant job when sizes
	 are known precisely.  */
      misaligned_prologue_used
	= (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
	   && MAX (desired_align, epilogue_size_needed) <= 32
	   && desired_align <= epilogue_size_needed
	   && ((desired_align > align && !align_bytes)
	       || (!count && epilogue_size_needed > 1)));

      /* Destination is aligned after the misaligned prologue.  */
      aligned_dstmem = misaligned_prologue_used;

      if (noalign && !misaligned_prologue_used)
	{
	  /* Also use misaligned prologue if alignment isn't needed and
	     destination isn't aligned.  Since alignment isn't needed,
	     the destination after prologue won't be aligned.  */
	  aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
			    <= MEM_ALIGN (dst));
	  if (!aligned_dstmem)
	    misaligned_prologue_used = true;
	}
    }

  /* Do the cheap promotion to allow better CSE across the
     main loop and epilogue (ie one load of the big constant in the
     front of all code.
     For now the misaligned move sequences do not have fast path
     without broadcasting.  */
  if (issetmem
      && (alg == vector_loop
	  || CONST_INT_P (val_exp)
	  || misaligned_prologue_used))
    {
      if (alg == vector_loop)
	{
	  promoted_val = promote_duplicated_reg_to_size (val_exp,
							 GET_MODE_SIZE (word_mode),
							 desired_align, align);
	  /* Duplicate the promoted scalar value if not 0 nor -1.  */
	  vec_promoted_val
	    = promote_duplicated_reg (move_mode,
				      (val_exp == const0_rtx
				       || val_exp == constm1_rtx)
				      ? val_exp : promoted_val);
	}
      else
	{
	  promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
							 desired_align, align);
	}
    }
  /* Misaligned move sequences handle both prologues and epilogues at once.
     Default code generation results in smaller code for large alignments and
     also avoids redundant job when sizes are known precisely.  */
  if (misaligned_prologue_used)
    {
      /* Misaligned move prologue handles small blocks by itself.  */
      expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
	   (dst, src, &destreg, &srcreg,
	    move_mode, promoted_val, vec_promoted_val,
	    &count_exp,
	    &jump_around_label,
	    desired_align < align
	    ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
	    desired_align, align, &min_size, dynamic_check, issetmem);
      if (!issetmem)
	src = change_address (src, BLKmode, srcreg);
      dst = change_address (dst, BLKmode, destreg);
      if (aligned_dstmem)
	set_mem_align (dst, desired_align * BITS_PER_UNIT);
      epilogue_size_needed = 0;
      if (need_zero_guard
	  && min_size < (unsigned HOST_WIDE_INT) size_needed)
	{
	  /* It is possible that we copied enough so the main loop will not
	     execute.  */
	  gcc_assert (size_needed > 1);
	  if (jump_around_label == NULL_RTX)
	    jump_around_label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (size_needed),
				   LTU, 0, counter_mode (count_exp), 1, jump_around_label);
	  if (expected_size == -1
	      || expected_size < (desired_align - align) / 2 + size_needed)
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	  else
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
	}
    }
  /* Ensure that alignment prologue won't copy past end of block.  */
  else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
    {
      epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
      /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
	 Make sure it is power of 2.  */
      epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);

      /* To improve performance of small blocks, we jump around the VAL
	 promoting mode.  This means that if the promoted VAL is not constant,
	 we might not use it in the epilogue and have to use byte
	 loop variant.  */
      if (issetmem && epilogue_size_needed > 2 && !promoted_val)
	force_loopy_epilogue = true;
      if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	  || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	{
	  /* If main algorithm works on QImode, no epilogue is needed.
	     For small sizes just don't align anything.  */
	  if (size_needed == 1)
	    desired_align = align;
	  else
	    goto epilogue;
	}
      else if (!count
	       && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
	{
	  /* Runtime guard: blocks smaller than the epilogue threshold
	     skip the prologue and main loop entirely.  */
	  label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (epilogue_size_needed),
				   LTU, 0, counter_mode (count_exp), 1, label);
	  if (expected_size == -1 || expected_size < epilogue_size_needed)
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
	  else
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	}
    }

  /* Emit code to decide on runtime whether library call or inline should be
     used.  */
  if (dynamic_check != -1)
    {
      if (!issetmem && CONST_INT_P (count_exp))
	{
	  if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
	    {
	      emit_block_copy_via_libcall (dst, src, count_exp);
	      count_exp = const0_rtx;
	      goto epilogue;
	    }
	}
      else
	{
	  rtx_code_label *hot_label = gen_label_rtx ();
	  if (jump_around_label == NULL_RTX)
	    jump_around_label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
				   LEU, 0, counter_mode (count_exp),
				   1, hot_label);
	  predict_jump (REG_BR_PROB_BASE * 90 / 100);
	  if (issetmem)
	    set_storage_via_libcall (dst, count_exp, val_exp);
	  else
	    emit_block_copy_via_libcall (dst, src, count_exp);
	  emit_jump (jump_around_label);
	  emit_label (hot_label);
	}
    }

  /* Step 2: Alignment prologue.  */
  /* Do the expensive promotion once we branched off the small blocks.  */
  if (issetmem && !promoted_val)
    promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
						   desired_align, align);

  if (desired_align > align && !misaligned_prologue_used)
    {
      if (align_bytes == 0)
	{
	  /* Except for the first move in prologue, we no longer know
	     constant offset in aliasing info.  It doesn't seem worth
	     the pain to maintain it for the first move, so throw away
	     the info early.  */
	  dst = change_address (dst, BLKmode, destreg);
	  if (!issetmem)
	    src = change_address (src, BLKmode, srcreg);
	  dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
					       promoted_val, vec_promoted_val,
					       count_exp, align, desired_align,
					       issetmem);
	  /* At most desired_align - align bytes are copied.  */
	  if (min_size < (unsigned)(desired_align - align))
	    min_size = 0;
	  else
	    min_size -= desired_align - align;
	}
      else
	{
	  /* If we know how many bytes need to be stored before dst is
	     sufficiently aligned, maintain aliasing info accurately.  */
	  dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
							srcreg,
							promoted_val,
							vec_promoted_val,
							desired_align,
							align_bytes,
							issetmem);

	  count_exp = plus_constant (counter_mode (count_exp),
				     count_exp, -align_bytes);
	  count -= align_bytes;
	  min_size -= align_bytes;
	  max_size -= align_bytes;
	}
      if (need_zero_guard
	  && min_size < (unsigned HOST_WIDE_INT) size_needed
	  && (count < (unsigned HOST_WIDE_INT) size_needed
	      || (align_bytes == 0
		  && count < ((unsigned HOST_WIDE_INT) size_needed
			      + desired_align - align))))
	{
	  /* It is possible that we copied enough so the main loop will not
	     execute.  */
	  gcc_assert (size_needed > 1);
	  if (label == NULL_RTX)
	    label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (count_exp,
				   GEN_INT (size_needed),
				   LTU, 0, counter_mode (count_exp), 1, label);
	  if (expected_size == -1
	      || expected_size < (desired_align - align) / 2 + size_needed)
	    predict_jump (REG_BR_PROB_BASE * 20 / 100);
	  else
	    predict_jump (REG_BR_PROB_BASE * 60 / 100);
	}
    }
  if (label && size_needed == 1)
    {
      /* The whole block fits in the epilogue; byte stores can use the
	 unpromoted value directly.  */
      emit_label (label);
      LABEL_NUSES (label) = 1;
      label = NULL;
      epilogue_size_needed = 1;
      if (issetmem)
	promoted_val = val_exp;
    }
  else if (label == NULL_RTX && !misaligned_prologue_used)
    epilogue_size_needed = size_needed;

  /* Step 3: Main loop.  */

  switch (alg)
    {
    case libcall:
    case no_stringop:
    case last_alg:
      gcc_unreachable ();
    case loop_1_byte:
    case loop:
    case unrolled_loop:
      expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
				     count_exp, move_mode, unroll_factor,
				     expected_size, issetmem);
      break;
    case vector_loop:
      expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
				     vec_promoted_val, count_exp, move_mode,
				     unroll_factor, expected_size, issetmem);
      break;
    case rep_prefix_8_byte:
    case rep_prefix_4_byte:
    case rep_prefix_1_byte:
      expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
				    val_exp, count_exp, move_mode, issetmem);
      break;
    }
  /* Adjust properly the offset of src and dest memory for aliasing.  */
  if (CONST_INT_P (count_exp))
    {
      if (!issetmem)
	src = adjust_automodify_address_nv (src, BLKmode, srcreg,
					    (count / size_needed) * size_needed);
      dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
					  (count / size_needed) * size_needed);
    }
  else
    {
      if (!issetmem)
	src = change_address (src, BLKmode, srcreg);
      dst = change_address (dst, BLKmode, destreg);
    }

  /* Step 4: Epilogue to copy the remaining bytes.  */
 epilogue:
  if (label)
    {
      /* When the main loop is done, COUNT_EXP might hold original count,
	 while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
	 Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
	 bytes.  Compensate if needed.  */

      if (size_needed < epilogue_size_needed)
	{
	  tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
				     GEN_INT (size_needed - 1), count_exp, 1,
				     OPTAB_DIRECT);
	  if (tmp != count_exp)
	    emit_move_insn (count_exp, tmp);
	}
      emit_label (label);
      LABEL_NUSES (label) = 1;
    }

  if (count_exp != const0_rtx && epilogue_size_needed > 1)
    {
      if (force_loopy_epilogue)
	expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
					 epilogue_size_needed);
      else
	{
	  if (issetmem)
	    expand_setmem_epilogue (dst, destreg, promoted_val,
				    vec_promoted_val, count_exp,
				    epilogue_size_needed);
	  else
	    expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
				    epilogue_size_needed);
	}
    }
  if (jump_around_label)
    emit_label (jump_around_label);
  return true;
}
10008 :
/* Fully unroll memmove of known size COUNT with up to 8 registers of
   MODE, plus one extra register for a trailing overlapping move.
   DST/SRC are the block MEMs, DESTREG/SRCREG their addresses in
   registers.  Returns false when COUNT needs more than 8 full-mode
   chunks (caller must use another strategy).  */

static bool
ix86_expand_unroll_movmem (rtx dst, rtx src, rtx destreg, rtx srcreg,
			   unsigned HOST_WIDE_INT count,
			   machine_mode mode)
{
  /* If 8 registers can cover all memory, load them into
     registers and store them together to avoid possible address
     overlap between source and destination.  */
  unsigned HOST_WIDE_INT moves = count / GET_MODE_SIZE (mode);
  if (moves == 0)
    {
      /* COUNT is smaller than one MODE chunk; shrink MODE to the
	 smallest integer mode covering COUNT.  */
      mode = smallest_int_mode_for_size
	(count * BITS_PER_UNIT).require ();
      if (count == GET_MODE_SIZE (mode))
	moves = 1;
      else
	{
	  /* Reduce the smallest move size by half so that MOVES == 1.  */
	  mode = smallest_int_mode_for_size
	    (GET_MODE_BITSIZE (mode) / 2).require ();
	  moves = count / GET_MODE_SIZE (mode);
	  gcc_assert (moves == 1);
	}
    }
  else if (moves > 8)
    return false;

  unsigned int i;
  rtx tmp[9];

  for (i = 0; i < moves; i++)
    tmp[i] = gen_reg_rtx (mode);

  /* Load all full chunks before emitting any store, so a memmove with
     overlapping source and destination stays correct.  */
  rtx srcmem = change_address (src, mode, srcreg);
  for (i = 0; i < moves; i++)
    {
      emit_move_insn (tmp[i], srcmem);
      srcmem = offset_address (srcmem,
			       GEN_INT (GET_MODE_SIZE (mode)),
			       GET_MODE_SIZE (mode));
    }

  unsigned int epilogue_size = count & (GET_MODE_SIZE (mode) - 1);
  machine_mode epilogue_mode = VOIDmode;
  if (epilogue_size)
    {
      /* Handle the remaining bytes with overlapping move.  */
      epilogue_mode = smallest_int_mode_for_size
	(epilogue_size * BITS_PER_UNIT).require ();
      tmp[8] = gen_reg_rtx (epilogue_mode);
      /* Load the last EPILOGUE_MODE-sized chunk ending exactly at
	 COUNT; it may overlap the last full chunk.  */
      srcmem = adjust_address (srcmem, epilogue_mode, 0);
      srcmem = offset_address (srcmem, GEN_INT (epilogue_size), 1);
      srcmem = offset_address (srcmem,
			       GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
			       GET_MODE_SIZE (epilogue_mode));
      emit_move_insn (tmp[8], srcmem);
    }

  rtx destmem = change_address (dst, mode, destreg);
  for (i = 0; i < moves; i++)
    {
      emit_move_insn (destmem, tmp[i]);
      destmem = offset_address (destmem,
				GEN_INT (GET_MODE_SIZE (mode)),
				GET_MODE_SIZE (mode));
    }

  if (epilogue_size)
    {
      /* Use overlapping move.  */
      destmem = adjust_address (destmem, epilogue_mode, 0);
      destmem = offset_address (destmem, GEN_INT (epilogue_size), 1);
      destmem = offset_address (destmem,
				GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
				GET_MODE_SIZE (epilogue_mode));
      emit_move_insn (destmem, tmp[8]);
    }

  return true;
}
10091 :
10092 : /* Expand memmove of size with MOVES * mode size and MOVES <= 4. If
10093 : FORWARD is true, copy forward. Otherwise copy backward. */
10094 :
10095 : static void
10096 2950 : ix86_expand_n_move_movmem (rtx destmem, rtx srcmem, machine_mode mode,
10097 : unsigned int moves, bool forward)
10098 : {
10099 2950 : gcc_assert (moves <= 4);
10100 :
10101 : unsigned int i;
10102 : rtx tmp[8];
10103 :
10104 14750 : for (i = 0; i < moves; i++)
10105 11800 : tmp[i] = gen_reg_rtx (mode);
10106 :
10107 2950 : rtx step;
10108 2950 : if (forward)
10109 2950 : step = GEN_INT (GET_MODE_SIZE (mode));
10110 : else
10111 2950 : step = GEN_INT (-GET_MODE_SIZE (mode));
10112 :
10113 : /* Load MOVES. */
10114 11800 : for (i = 0; i < moves - 1; i++)
10115 : {
10116 8850 : emit_move_insn (tmp[i], srcmem);
10117 17700 : srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
10118 : }
10119 2950 : emit_move_insn (tmp[i], srcmem);
10120 :
10121 : /* Store MOVES. */
10122 14750 : for (i = 0; i < moves - 1; i++)
10123 : {
10124 8850 : emit_move_insn (destmem, tmp[i]);
10125 17700 : destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
10126 : }
10127 2950 : emit_move_insn (destmem, tmp[i]);
10128 2950 : }
10129 :
10130 : /* Load MOVES of mode size into REGS. If LAST is true, load the
10131 : last MOVES. Otherwise, load the first MOVES. */
10132 :
10133 : static void
10134 2950 : ix86_expand_load_movmem (rtx src, rtx srcreg, rtx count_exp,
10135 : machine_mode mode, unsigned int moves,
10136 : rtx regs[], bool last)
10137 : {
10138 2950 : unsigned int i;
10139 :
10140 14750 : for (i = 0; i < moves; i++)
10141 11800 : regs[i] = gen_reg_rtx (mode);
10142 :
10143 2950 : rtx srcmem = change_address (src, mode, srcreg);
10144 2950 : rtx step;
10145 2950 : if (last)
10146 : {
10147 1475 : srcmem = offset_address (srcmem, count_exp, 1);
10148 2950 : step = GEN_INT (-GET_MODE_SIZE (mode));
10149 2950 : srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
10150 : }
10151 : else
10152 2950 : step = GEN_INT (GET_MODE_SIZE (mode));
10153 :
10154 11800 : for (i = 0; i < moves - 1; i++)
10155 : {
10156 8850 : emit_move_insn (regs[i], srcmem);
10157 17700 : srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
10158 : }
10159 2950 : emit_move_insn (regs[i], srcmem);
10160 2950 : }
10161 :
10162 : /* Store MOVES of mode size into REGS. If LAST is true, store the
10163 : last MOVES. Otherwise, store the first MOVES. */
10164 :
10165 : static void
10166 2950 : ix86_expand_store_movmem (rtx dst, rtx destreg, rtx count_exp,
10167 : machine_mode mode, unsigned int moves,
10168 : rtx regs[], bool last)
10169 : {
10170 2950 : unsigned int i;
10171 :
10172 2950 : rtx destmem = change_address (dst, mode, destreg);
10173 2950 : rtx step;
10174 2950 : if (last)
10175 : {
10176 1475 : destmem = offset_address (destmem, count_exp, 1);
10177 2950 : step = GEN_INT (-GET_MODE_SIZE (mode));
10178 2950 : destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
10179 : }
10180 : else
10181 2950 : step = GEN_INT (GET_MODE_SIZE (mode));
10182 :
10183 11800 : for (i = 0; i < moves - 1; i++)
10184 : {
10185 8850 : emit_move_insn (destmem, regs[i]);
10186 17700 : destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
10187 : }
10188 2950 : emit_move_insn (destmem, regs[i]);
10189 2950 : }
10190 :
/* Expand memmove of size between (MOVES / 2) * mode size and
   MOVES * mode size with overlapping load and store.  MOVES is even,
   MOVES >= 2 and MOVES <= 8.  The first half of the chunks is copied
   forward from the start; the second half is copied backward from
   offset COUNT_EXP, so the two halves may overlap in the middle.  All
   loads are emitted before any store, keeping overlapping source and
   destination correct.  */

static void
ix86_expand_n_overlapping_move_movmem (rtx dst, rtx src, rtx destreg,
				       rtx srcreg, rtx count_exp,
				       machine_mode mode,
				       unsigned int moves)
{
  gcc_assert (moves >= 2 && moves <= 8 && (moves & 1) == 0);

  unsigned int half_moves = moves / 2;
  unsigned int i, j;
  rtx tmp[8];

  for (i = 0; i < moves; i++)
    tmp[i] = gen_reg_rtx (mode);

  rtx base_srcmem = change_address (src, mode, srcreg);

  /* Load the first half.  */
  rtx srcmem = base_srcmem;
  for (i = 0; i < half_moves - 1; i++)
    {
      emit_move_insn (tmp[i], srcmem);
      srcmem = offset_address (srcmem,
			       GEN_INT (GET_MODE_SIZE (mode)),
			       GET_MODE_SIZE (mode));
    }
  emit_move_insn (tmp[i], srcmem);

  /* Load the second half, starting at the chunk that ends exactly at
     COUNT_EXP and walking backward.  */
  srcmem = offset_address (base_srcmem, count_exp, 1);
  srcmem = offset_address (srcmem,
			   GEN_INT (-GET_MODE_SIZE (mode)),
			   GET_MODE_SIZE (mode));
  for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
    {
      emit_move_insn (tmp[j], srcmem);
      srcmem = offset_address (srcmem,
			       GEN_INT (-GET_MODE_SIZE (mode)),
			       GET_MODE_SIZE (mode));
    }
  emit_move_insn (tmp[j], srcmem);

  rtx base_destmem = change_address (dst, mode, destreg);

  /* Store the first half.  */
  rtx destmem = base_destmem;
  for (i = 0; i < half_moves - 1; i++)
    {
      emit_move_insn (destmem, tmp[i]);
      destmem = offset_address (destmem,
				GEN_INT (GET_MODE_SIZE (mode)),
				GET_MODE_SIZE (mode));
    }
  emit_move_insn (destmem, tmp[i]);

  /* Store the second half, backward from COUNT_EXP, mirroring the
     second-half loads.  */
  destmem = offset_address (base_destmem, count_exp, 1);
  destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
			    GET_MODE_SIZE (mode));
  for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
    {
      emit_move_insn (destmem, tmp[j]);
      destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
				GET_MODE_SIZE (mode));
    }
  emit_move_insn (destmem, tmp[j]);
}
10262 :
/* Expand memmove of size < mode size which is <= 64.

   Emit a branch ladder that classifies the runtime size COUNT_EXP into
   power-of-two buckets ([32,63], [16,31], [8,15], [4,7], [2,3], [0,1])
   and dispatches each bucket to a two-register overlapping copy in the
   matching integer mode.  DST/SRC are the destination/source MEMs,
   DESTREG/SRCREG their addresses in registers, MIN_SIZE the known lower
   bound on the size (0 if unknown), MODE the widest copy mode, and
   DONE_LABEL where every bucket jumps when finished.

   When MIN_SIZE already proves the size is in the largest applicable
   bucket, the compare is replaced by an unconditional jump and all
   smaller buckets are suppressed (SKIP).  */

static void
ix86_expand_less_move_movmem (rtx dst, rtx src, rtx destreg,
			      rtx srcreg, rtx count_exp,
			      unsigned HOST_WIDE_INT min_size,
			      machine_mode mode,
			      rtx_code_label *done_label)
{
  bool skip = false;
  machine_mode count_mode = counter_mode (count_exp);

  /* Only emit a bucket label when MODE is wide enough that the bucket
     is reachable (size is known to be < mode size on entry).  */
  rtx_code_label *between_32_63_label
    = GET_MODE_SIZE (mode) > 32 ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_32_63_LABEL if size >= 32 and size < 64.  */
  if (between_32_63_label)
    {
      if (min_size && min_size >= 32)
	{
	  /* Size is provably >= 32: no compare needed, and no smaller
	     bucket can be hit.  */
	  emit_jump_insn (gen_jump (between_32_63_label));
	  emit_barrier ();
	  skip = true;
	}
      else
	emit_cmp_and_jump_insns (count_exp, GEN_INT (32), GEU,
				 nullptr, count_mode, 1,
				 between_32_63_label);
    }

  rtx_code_label *between_16_31_label
    = (!skip && GET_MODE_SIZE (mode) > 16) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_16_31_LABEL if size >= 16 and size < 32.  */
  if (between_16_31_label)
    {
      if (min_size && min_size >= 16)
	{
	  emit_jump_insn (gen_jump (between_16_31_label));
	  emit_barrier ();
	  skip = true;
	}
      else
	emit_cmp_and_jump_insns (count_exp, GEN_INT (16), GEU,
				 nullptr, count_mode, 1,
				 between_16_31_label);
    }

  rtx_code_label *between_8_15_label
    = (!skip && GET_MODE_SIZE (mode) > 8) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_8_15_LABEL if size >= 8 and size < 16.  */
  if (between_8_15_label)
    {
      if (min_size && min_size >= 8)
	{
	  emit_jump_insn (gen_jump (between_8_15_label));
	  emit_barrier ();
	  skip = true;
	}
      else
	emit_cmp_and_jump_insns (count_exp, GEN_INT (8), GEU,
				 nullptr, count_mode, 1,
				 between_8_15_label);
    }

  rtx_code_label *between_4_7_label
    = (!skip && GET_MODE_SIZE (mode) > 4) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_4_7_LABEL if size >= 4 and size < 8.  */
  if (between_4_7_label)
    {
      if (min_size && min_size >= 4)
	{
	  emit_jump_insn (gen_jump (between_4_7_label));
	  emit_barrier ();
	  skip = true;
	}
      else
	emit_cmp_and_jump_insns (count_exp, GEN_INT (4), GEU,
				 nullptr, count_mode, 1,
				 between_4_7_label);
    }

  rtx_code_label *between_2_3_label
    = (!skip && GET_MODE_SIZE (mode) > 2) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_2_3_LABEL if size >= 2 and size < 4.  */
  if (between_2_3_label)
    {
      if (min_size && min_size >= 2)
	{
	  emit_jump_insn (gen_jump (between_2_3_label));
	  emit_barrier ();
	  skip = true;
	}
      else
	emit_cmp_and_jump_insns (count_exp, GEN_INT (1), GT,
				 nullptr, count_mode, 1,
				 between_2_3_label);
    }

  /* Fall-through bucket: size is 0 or 1.  Only reachable when no
     larger bucket was taken unconditionally.  */
  if (!skip)
    {
      rtx_code_label *zero_label
	= min_size == 0 ? gen_label_rtx () : nullptr;
      /* Skip if size == 0.  */
      if (zero_label)
	emit_cmp_and_jump_insns (count_exp, GEN_INT (1), LT,
				 nullptr, count_mode, 1,
				 zero_label,
				 profile_probability::unlikely ());

      /* Move 1 byte.  */
      rtx tmp0 = gen_reg_rtx (QImode);
      rtx srcmem = change_address (src, QImode, srcreg);
      emit_move_insn (tmp0, srcmem);
      rtx destmem = change_address (dst, QImode, destreg);
      emit_move_insn (destmem, tmp0);

      if (zero_label)
	emit_label (zero_label);

      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  /* Each bucket copies with two overlapping moves in the mode that is
     half the bucket's upper bound, which covers the whole range.  */
  if (between_32_63_label)
    {
      emit_label (between_32_63_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
					     count_exp, OImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_16_31_label)
    {
      emit_label (between_16_31_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
					     count_exp, TImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_8_15_label)
    {
      emit_label (between_8_15_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
					     count_exp, DImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_4_7_label)
    {
      emit_label (between_4_7_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
					     count_exp, SImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_2_3_label)
    {
      emit_label (between_2_3_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
					     count_exp, HImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }
}
10430 :
10431 : /* Expand movmem with overlapping unaligned loads and stores:
10432 : 1. Load all sources into registers and store them together to avoid
10433 : possible address overlap between source and destination.
10434 : 2. For known size, first try to fully unroll with 8 registers.
10435 : 3. For size <= 2 * MOVE_MAX, load all sources into 2 registers first
10436 : and then store them together.
10437 : 4. For size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX, load all sources
10438 : into 4 registers first and then store them together.
10439 : 5. For size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX, load all sources
10440 : into 8 registers first and then store them together.
10441 : 6. For size > 8 * MOVE_MAX,
10442 : a. If address of destination > address of source, copy backward
10443 : with a 4 * MOVE_MAX loop with unaligned loads and stores. Load
10444 : the first 4 * MOVE_MAX into 4 registers before the loop and
10445 : store them after the loop to support overlapping addresses.
10446 : b. Otherwise, copy forward with a 4 * MOVE_MAX loop with unaligned
10447 : loads and stores. Load the last 4 * MOVE_MAX into 4 registers
10448 : before the loop and store them after the loop to support
10449 : overlapping addresses.
10450 : */
10451 :
bool
/* Expand a memmove described by OPERANDS (movmem pattern operands:
   [0] destination MEM, [1] source MEM, [2] byte count, [5] expected
   size, [6] minimum size, [8] probable maximum size).  Returns true if
   code was emitted inline, false to fall back to a library call.
   The overall strategy is described in the comment above.  */
ix86_expand_movmem (rtx operands[])
{
  /* Since there are much less registers available in 32-bit mode, don't
     inline movmem in 32-bit mode.  */
  if (!TARGET_64BIT)
    return false;

  rtx dst = operands[0];
  rtx src = operands[1];
  rtx count_exp = operands[2];
  rtx expected_size_exp = operands[5];
  rtx min_size_exp = operands[6];
  rtx probable_max_size_exp = operands[8];
  unsigned HOST_WIDE_INT count = HOST_WIDE_INT_0U;
  HOST_WIDE_INT expected_size = HOST_WIDE_INT_M1U;
  unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
  unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;

  if (CONST_INT_P (count_exp))
    {
      /* Known constant size: min, max and expected all collapse to it.  */
      min_size = probable_max_size = count = expected_size
	= INTVAL (count_exp);
      /* When COUNT is 0, there is nothing to do.  */
      if (!count)
	return true;
    }
  else
    {
      if (min_size_exp)
	min_size = INTVAL (min_size_exp);
      if (probable_max_size_exp)
	probable_max_size = INTVAL (probable_max_size_exp);
      if (CONST_INT_P (expected_size_exp))
	expected_size = INTVAL (expected_size_exp);
    }

  /* Make sure we don't need to care about overflow later on.  */
  if (count > (HOST_WIDE_INT_1U << 30))
    return false;

  addr_space_t dst_as = MEM_ADDR_SPACE (dst);
  addr_space_t src_as = MEM_ADDR_SPACE (src);
  int dynamic_check;
  bool noalign;
  /* Let the generic string-op cost model decide whether inlining is
     worthwhile at all; bail out if it prefers the library call.  */
  enum stringop_alg alg = decide_alg (count, expected_size, min_size,
				      probable_max_size, false, false,
				      dst_as, src_as, &dynamic_check,
				      &noalign, false);
  if (alg == libcall)
    return false;

  rtx destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
  rtx srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));

  /* MOVE_MAX is the widest single-move size; MODE is the matching
     integer mode.  */
  unsigned int move_max = MOVE_MAX;
  machine_mode mode = smallest_int_mode_for_size
    (move_max * BITS_PER_UNIT).require ();
  if (probable_max_size && probable_max_size < move_max)
    {
      /* Get a usable MOVE_MAX.  */
      mode = smallest_int_mode_for_size
	(probable_max_size * BITS_PER_UNIT).require ();
      /* Reduce MOVE_MAX by half so that MOVE_MAX can be used.  */
      if (GET_MODE_SIZE (mode) > probable_max_size)
	mode = smallest_int_mode_for_size
	  (GET_MODE_BITSIZE (mode) / 2).require ();
      move_max = GET_MODE_SIZE (mode);
    }

  /* Try to fully unroll memmove of known size first.  */
  if (count
      && ix86_expand_unroll_movmem (dst, src, destreg, srcreg, count,
				    mode))
    return true;

  rtx_code_label *done_label = gen_label_rtx ();

  /* Needed unless MIN_SIZE proves the size is at least one full move.  */
  rtx_code_label *less_vec_label = nullptr;
  if (min_size == 0 || min_size < move_max)
    less_vec_label = gen_label_rtx ();

  machine_mode count_mode = counter_mode (count_exp);

  /* Jump to LESS_VEC_LABEL if size < MOVE_MAX.  */
  if (less_vec_label)
    emit_cmp_and_jump_insns (count_exp, GEN_INT (move_max), LTU,
			     nullptr, count_mode, 1,
			     less_vec_label);

  rtx_code_label *more_2x_vec_label = nullptr;
  if (probable_max_size == 0 || probable_max_size > 2 * move_max)
    more_2x_vec_label = gen_label_rtx ();

  /* Jump to MORE_2X_VEC_LABEL if size > 2 * MOVE_MAX.  */
  if (more_2x_vec_label)
    emit_cmp_and_jump_insns (count_exp, GEN_INT (2 * move_max), GTU,
			     nullptr, count_mode, 1,
			     more_2x_vec_label);

  if (min_size == 0 || min_size <= 2 * move_max)
    {
      /* Size >= MOVE_MAX and size <= 2 * MOVE_MAX.  */
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
					     count_exp, mode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (less_vec_label)
    {
      /* Size < MOVE_MAX.  */
      emit_label (less_vec_label);
      ix86_expand_less_move_movmem (dst, src, destreg, srcreg,
				    count_exp, min_size, mode,
				    done_label);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (more_2x_vec_label)
    {
      /* Size > 2 * MOVE_MAX and destination may overlap with source.  */
      emit_label (more_2x_vec_label);

      rtx_code_label *more_8x_vec_label = nullptr;
      if (probable_max_size == 0 || probable_max_size > 8 * move_max)
	more_8x_vec_label = gen_label_rtx ();

      /* Jump to MORE_8X_VEC_LABEL if size > 8 * MOVE_MAX.  */
      if (more_8x_vec_label)
	emit_cmp_and_jump_insns (count_exp, GEN_INT (8 * move_max), GTU,
				 nullptr, count_mode, 1,
				 more_8x_vec_label);

      rtx_code_label *last_4x_vec_label = nullptr;
      if (min_size == 0 || min_size < 4 * move_max)
	last_4x_vec_label = gen_label_rtx ();

      /* Jump to LAST_4X_VEC_LABEL if size < 4 * MOVE_MAX.  */
      if (last_4x_vec_label)
	emit_cmp_and_jump_insns (count_exp, GEN_INT (4 * move_max), LTU,
				 nullptr, count_mode, 1,
				 last_4x_vec_label);

      if (probable_max_size == 0 || probable_max_size > 4 * move_max)
	{
	  /* Size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX.  */
	  ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
						 srcreg, count_exp,
						 mode, 8);
	  emit_jump_insn (gen_jump (done_label));
	  emit_barrier ();
	}

      if (last_4x_vec_label)
	{
	  /* Size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX.  */
	  emit_label (last_4x_vec_label);
	  ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
						 srcreg, count_exp,
						 mode, 4);
	  emit_jump_insn (gen_jump (done_label));
	  emit_barrier ();
	}

      if (more_8x_vec_label)
	{
	  /* Size > 8 * MOVE_MAX.  NB: this path is only reachable when
	     MOVE_MAX was not reduced above (a reduced move_max implies
	     probable_max_size < 2 * move_max, which contradicts the
	     condition guarding MORE_8X_VEC_LABEL), so move_max ==
	     MOVE_MAX here and DELTA below may use the macro.  */
	  emit_label (more_8x_vec_label);

	  rtx loop_count = gen_reg_rtx (count_mode);
	  emit_move_insn (loop_count, count_exp);

	  /* Jump to MORE_8X_VEC_BACKWARD_LABEL if source address is
	     lower than destination address.  */
	  rtx_code_label *more_8x_vec_backward_label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (srcreg, destreg, LTU, nullptr,
				   GET_MODE (destreg), 1,
				   more_8x_vec_backward_label);

	  /* Skip if source == destination which is less common.  */
	  emit_cmp_and_jump_insns (srcreg, destreg, EQ, nullptr,
				   GET_MODE (destreg), 1, done_label,
				   profile_probability::unlikely ());

	  rtx base_destreg = gen_reg_rtx (GET_MODE (destreg));
	  emit_move_insn (base_destreg, destreg);

	  /* Load the last 4 * MOVE_MAX.  Kept in registers across the
	     loop so the tail store is correct even when the loop's
	     stores overwrite the source tail (overlap support).  */
	  rtx regs[4];
	  ix86_expand_load_movmem (src, srcreg, count_exp, mode,
				   ARRAY_SIZE (regs), regs, true);

	  rtx srcmem = change_address (src, mode, srcreg);
	  rtx destmem = change_address (dst, mode, destreg);

	  /* Copy forward with a 4 * MOVE_MAX loop.  */
	  rtx_code_label *loop_4x_vec_forward_label = gen_label_rtx ();
	  emit_label (loop_4x_vec_forward_label);

	  ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, true);

	  rtx tmp;
	  rtx delta = GEN_INT (4 * MOVE_MAX);

	  /* Decrement LOOP_COUNT by 4 * MOVE_MAX.  */
	  tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
				     loop_count, delta, nullptr, 1,
				     OPTAB_DIRECT);
	  if (tmp != loop_count)
	    emit_move_insn (loop_count, tmp);

	  /* Increment DESTREG and SRCREG by 4 * MOVE_MAX.  */
	  tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
				     destreg, delta, nullptr, 1,
				     OPTAB_DIRECT);
	  if (tmp != destreg)
	    emit_move_insn (destreg, tmp);
	  tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
				     delta, nullptr, 1, OPTAB_DIRECT);
	  if (tmp != srcreg)
	    emit_move_insn (srcreg, tmp);

	  /* Stop if LOOP_COUNT <= 4 * MOVE_MAX.  */
	  emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
				   GET_MODE (loop_count), 1,
				   loop_4x_vec_forward_label);

	  /* Store the last 4 * MOVE_MAX.  */
	  ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
				    ARRAY_SIZE (regs), regs, true);

	  emit_jump_insn (gen_jump (done_label));
	  emit_barrier ();

	  /* Copy backward with a 4 * MOVE_MAX loop.  */
	  emit_label (more_8x_vec_backward_label);

	  base_destreg = gen_reg_rtx (GET_MODE (destreg));
	  emit_move_insn (base_destreg, destreg);

	  /* Load the first 4 * MOVE_MAX (stored after the loop; same
	     overlap-support reasoning as the forward path).  */
	  ix86_expand_load_movmem (src, srcreg, count_exp, mode,
				   ARRAY_SIZE (regs), regs, false);

	  /* Increment DESTREG and SRCREG by COUNT_EXP.  */
	  tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
				     destreg, count_exp, nullptr, 1,
				     OPTAB_DIRECT);
	  if (tmp != destreg)
	    emit_move_insn (destreg, tmp);
	  tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
				     count_exp, nullptr, 1, OPTAB_DIRECT);
	  if (tmp != srcreg)
	    emit_move_insn (srcreg, tmp);

	  /* Point one element below the end so the loop copies the
	     last element first.  */
	  srcmem = change_address (src, mode, srcreg);
	  destmem = change_address (dst, mode, destreg);
	  rtx step = GEN_INT (-GET_MODE_SIZE (mode));
	  srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
	  destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));

	  rtx_code_label *loop_4x_vec_backward_label = gen_label_rtx ();
	  emit_label (loop_4x_vec_backward_label);

	  ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, false);

	  /* Decrement LOOP_COUNT by 4 * MOVE_MAX.  */
	  tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
				     loop_count, delta, nullptr, 1,
				     OPTAB_DIRECT);
	  if (tmp != loop_count)
	    emit_move_insn (loop_count, tmp);

	  /* Decrement DESTREG and SRCREG by 4 * MOVE_MAX.  */
	  tmp = expand_simple_binop (GET_MODE (destreg), MINUS,
				     destreg, delta, nullptr, 1,
				     OPTAB_DIRECT);
	  if (tmp != destreg)
	    emit_move_insn (destreg, tmp);
	  tmp = expand_simple_binop (GET_MODE (srcreg), MINUS, srcreg,
				     delta, nullptr, 1, OPTAB_DIRECT);
	  if (tmp != srcreg)
	    emit_move_insn (srcreg, tmp);

	  /* Stop if LOOP_COUNT <= 4 * MOVE_MAX.  */
	  emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
				   GET_MODE (loop_count), 1,
				   loop_4x_vec_backward_label);

	  /* Store the first 4 * MOVE_MAX.  */
	  ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
				    ARRAY_SIZE (regs), regs, false);

	  emit_jump_insn (gen_jump (done_label));
	  emit_barrier ();
	}
    }

  emit_label (done_label);

  return true;
}
10756 :
/* Expand cmpstrn or memcmp via "repz cmpsb".

   RESULT receives the tri-state comparison result (SImode), SRC1/SRC2
   are the two memory operands, LENGTH the byte count, ALIGN the known
   common alignment, and IS_CMPSTRN selects strncmp semantics (stop at
   the terminating NUL) vs. memcmp semantics.  Returns true if code was
   emitted, false to let the caller fall back to a library call.  */

bool
ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
			       rtx length, rtx align, bool is_cmpstrn)
{
  /* Expand strncmp and memcmp only with -minline-all-stringops since
     "repz cmpsb" can be much slower than strncmp and memcmp functions
     implemented with vector instructions, see

     https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
   */
  if (!TARGET_INLINE_ALL_STRINGOPS)
    return false;

  /* Can't use this if the user has appropriated ecx, esi or edi.  */
  if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
    return false;

  if (is_cmpstrn)
    {
      /* For strncmp, length is the maximum length, which can be larger
	 than actual string lengths.  We can expand the cmpstrn pattern
	 to "repz cmpsb" only if one of the strings is a constant so
	 that expand_builtin_strncmp() can write the length argument to
	 be the minimum of the const string length and the actual length
	 argument.  Otherwise, "repz cmpsb" may pass the 0 byte.  */
      tree t1 = MEM_EXPR (src1);
      tree t2 = MEM_EXPR (src2);
      if (!((t1 && TREE_CODE (t1) == MEM_REF
	     && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
	     && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
		 == STRING_CST))
	    || (t2 && TREE_CODE (t2) == MEM_REF
		&& TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
		&& (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
		    == STRING_CST))))
	return false;
    }

  /* Force both addresses into registers ("repz cmpsb" needs them in
     esi/edi eventually) and rewrite the MEMs to use them.  */
  rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
  rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
  if (addr1 != XEXP (src1, 0))
    src1 = replace_equiv_address_nv (src1, addr1);
  if (addr2 != XEXP (src2, 0))
    src2 = replace_equiv_address_nv (src2, addr2);

  /* NB: Make a copy of the data length to avoid changing the original
     data length by cmpstrnqi patterns.  */
  length = ix86_zero_extend_to_Pmode (length);
  rtx lengthreg = gen_reg_rtx (Pmode);
  emit_move_insn (lengthreg, length);

  /* If we are testing strict equality, we can use known alignment to
     good advantage.  This may be possible with combine, particularly
     once cc0 is dead.  */
  if (CONST_INT_P (length))
    {
      if (length == const0_rtx)
	{
	  /* Zero-length comparison compares equal.  */
	  emit_move_insn (result, const0_rtx);
	  return true;
	}
      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
				     src1, src2));
    }
  else
    {
      /* Runtime length may be zero: pre-test it so the _1 pattern can
	 skip the compare loop entirely.  */
      emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
				  src1, src2));
    }

  /* Convert the flags left by cmpsb into a -1/0/1 integer result.  */
  rtx out = gen_lowpart (QImode, result);
  emit_insn (gen_cmpintqi (out));
  emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));

  return true;
}
10836 :
10837 : /* Expand the appropriate insns for doing strlen if not just doing
10838 : repnz; scasb
10839 :
10840 : out = result, initialized with the start address
10841 : align_rtx = alignment of the address.
10842 : scratch = scratch register, initialized with the startaddress when
10843 : not aligned, otherwise undefined
10844 :
10845 : This is just the body. It needs the initializations mentioned above and
10846 : some address computing at the end. These things are done in i386.md. */
10847 :
static void
ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
{
  int align;
  rtx tmp;
  rtx_code_label *align_2_label = NULL;
  rtx_code_label *align_3_label = NULL;
  rtx_code_label *align_4_label = gen_label_rtx ();
  rtx_code_label *end_0_label = gen_label_rtx ();
  rtx mem;
  rtx tmpreg = gen_reg_rtx (SImode);
  rtx scratch = gen_reg_rtx (SImode);
  rtx cmp;

  align = 0;
  if (CONST_INT_P (align_rtx))
    align = INTVAL (align_rtx);

  /* Loop to check 1..3 bytes for null to get an aligned pointer.  */

  /* Is there a known alignment and is it less than 4?  */
  if (align < 4)
    {
      rtx scratch1 = gen_reg_rtx (Pmode);
      emit_move_insn (scratch1, out);
      /* Is there a known alignment and is it not 2? */
      if (align != 2)
	{
	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */

	  /* Leave just the 3 lower bits.  */
	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
				    NULL_RTX, 0, OPTAB_WIDEN);

	  /* Dispatch on (address & 3): 0 -> already aligned, 2 -> two
	     bytes to check, 3 -> one byte, 1 -> falls through to check
	     up to three bytes.  */
	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
				   Pmode, 1, align_2_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
				   Pmode, 1, align_3_label);
	}
      else
	{
	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
	     check if is aligned to 4 - byte.  */

	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
				    NULL_RTX, 0, OPTAB_WIDEN);

	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	}

      mem = change_address (src, QImode, out);

      /* Now compare the bytes.  */

      /* Compare the first n unaligned byte on a byte per byte basis.  */
      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
			       QImode, 1, end_0_label);

      /* Increment the address.  */
      emit_insn (gen_add2_insn (out, const1_rtx));

      /* Not needed with an alignment of 2 */
      if (align != 2)
	{
	  emit_label (align_2_label);

	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
				   end_0_label);

	  emit_insn (gen_add2_insn (out, const1_rtx));

	  emit_label (align_3_label);
	}

      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
			       end_0_label);

      emit_insn (gen_add2_insn (out, const1_rtx));
    }

  /* Generate loop to check 4 bytes at a time.  It is not a good idea to
     align this loop.  It gives only huge programs, but does not help to
     speed up.  */
  emit_label (align_4_label);

  mem = change_address (src, SImode, out);
  emit_move_insn (scratch, mem);
  emit_insn (gen_add2_insn (out, GEN_INT (4)));

  /* This formula yields a nonzero result iff one of the bytes is zero.
     This saves three branches inside loop and many cycles.  */

  /* tmpreg = (word - 0x01010101) & ~word & 0x80808080.  */
  emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
  emit_insn (gen_one_cmplsi2 (scratch, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg,
			 gen_int_mode (0x80808080, SImode)));
  emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
			   align_4_label);

  if (TARGET_CMOVE)
    {
      /* Branch-free variant: use two conditional moves keyed off one
	 TEST of the low-half zero bits.  Statement order is critical
	 here -- the lea must not clobber the flags consumed by the
	 second cmov.  */
      rtx reg = gen_reg_rtx (SImode);
      rtx reg2 = gen_reg_rtx (Pmode);
      emit_move_insn (reg, tmpreg);
      emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));

      /* If zero is not in the first two bytes, move two bytes forward.  */
      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)))
;
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (tmpreg,
			      gen_rtx_IF_THEN_ELSE (SImode, tmp,
						    reg,
						    tmpreg)));
      /* Emit lea manually to avoid clobbering of flags.  */
      emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));

      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (out,
			      gen_rtx_IF_THEN_ELSE (Pmode, tmp,
						    reg2,
						    out)));
    }
  else
    {
      rtx_code_label *end_2_label = gen_label_rtx ();
      /* Is zero in the first two bytes? */

      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, end_2_label),
				  pc_rtx);
      tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      JUMP_LABEL (tmp) = end_2_label;

      /* Not in the first two.  Move two bytes forward.  */
      emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
      emit_insn (gen_add2_insn (out, const2_rtx));

      emit_label (end_2_label);

    }

  /* Avoid branch in fixing the byte.  The low byte of TMPREG now has
     0x80 set iff the byte at OUT-4 (resp. OUT-3 after the +2 above)
     was the NUL; the add's carry-out selects between subtracting 3 or
     4 via sbb.  */
  tmpreg = gen_lowpart (QImode, tmpreg);
  emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
  tmp = gen_rtx_REG (CCmode, FLAGS_REG);
  cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
  emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));

  emit_label (end_0_label);
}
11008 :
11009 : /* Expand strlen. */
11010 :
11011 : bool
11012 13998 : ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
11013 : {
11014 13998 : if (TARGET_UNROLL_STRLEN
11015 13998 : && TARGET_INLINE_ALL_STRINGOPS
11016 11 : && eoschar == const0_rtx
11017 11 : && optimize > 1)
11018 : {
11019 : /* The generic case of strlen expander is long. Avoid it's
11020 : expanding unless TARGET_INLINE_ALL_STRINGOPS. */
11021 15 : rtx addr = force_reg (Pmode, XEXP (src, 0));
11022 : /* Well it seems that some optimizer does not combine a call like
11023 : foo(strlen(bar), strlen(bar));
11024 : when the move and the subtraction is done here. It does calculate
11025 : the length just once when these instructions are done inside of
11026 : output_strlen_unroll(). But I think since &bar[strlen(bar)] is
11027 : often used and I use one fewer register for the lifetime of
11028 : output_strlen_unroll() this is better. */
11029 :
11030 11 : emit_move_insn (out, addr);
11031 :
11032 11 : ix86_expand_strlensi_unroll_1 (out, src, align);
11033 :
11034 : /* strlensi_unroll_1 returns the address of the zero at the end of
11035 : the string, like memchr(), so compute the length by subtracting
11036 : the start address. */
11037 11 : emit_insn (gen_sub2_insn (out, addr));
11038 11 : return true;
11039 : }
11040 : else
11041 : return false;
11042 : }
11043 :
11044 : /* For given symbol (function) construct code to compute address of it's PLT
11045 : entry in large x86-64 PIC model. */
11046 :
11047 : static rtx
11048 34 : construct_plt_address (rtx symbol)
11049 : {
11050 34 : rtx tmp, unspec;
11051 :
11052 34 : gcc_assert (SYMBOL_REF_P (symbol));
11053 34 : gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
11054 34 : gcc_assert (Pmode == DImode);
11055 :
11056 34 : tmp = gen_reg_rtx (Pmode);
11057 34 : unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
11058 :
11059 34 : emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
11060 34 : emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
11061 34 : return tmp;
11062 : }
11063 :
/* Additional registers that are clobbered by SYSV calls.
   NOTE(review): judging by the name and contents (esi/edi and
   xmm6-xmm15), these are the registers that are call-clobbered in the
   SysV ABI but callee-saved in the MS ABI, so an MS-ABI caller must
   treat them as extra clobbers when calling a SysV-ABI function --
   confirm against the psABI documents.  */

static int const x86_64_ms_sysv_extra_clobbered_registers
				 [NUM_X86_64_MS_CLOBBERED_REGS] =
{
  SI_REG, DI_REG,
  XMM6_REG, XMM7_REG,
  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
};
11074 :
11075 : rtx_insn *
11076 6233353 : ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
11077 : rtx callarg2,
11078 : rtx pop, bool sibcall)
11079 : {
11080 6233353 : rtx vec[3];
11081 6233353 : rtx use = NULL, call;
11082 6233353 : unsigned int vec_len = 0;
11083 6233353 : tree fndecl;
11084 6233353 : bool call_no_callee_saved_registers = false;
11085 :
11086 6233353 : if (SYMBOL_REF_P (XEXP (fnaddr, 0)))
11087 : {
11088 6049030 : fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
11089 6049030 : if (fndecl)
11090 : {
11091 5789114 : if (lookup_attribute ("interrupt",
11092 5789114 : TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
11093 1 : error ("interrupt service routine cannot be called directly");
11094 5789113 : else if (ix86_type_no_callee_saved_registers_p (TREE_TYPE (fndecl)))
11095 5789114 : call_no_callee_saved_registers = true;
11096 5789114 : if (fndecl == current_function_decl
11097 5789114 : && decl_binds_to_current_def_p (fndecl))
11098 11118 : cfun->machine->recursive_function = true;
11099 : }
11100 : }
11101 : else
11102 : {
11103 184323 : if (MEM_P (fnaddr))
11104 : {
11105 184323 : tree mem_expr = MEM_EXPR (fnaddr);
11106 184323 : if (mem_expr != nullptr
11107 184278 : && TREE_CODE (mem_expr) == MEM_REF
11108 368601 : && ix86_type_no_callee_saved_registers_p (TREE_TYPE (mem_expr)))
11109 : call_no_callee_saved_registers = true;
11110 : }
11111 :
11112 : fndecl = NULL_TREE;
11113 : }
11114 :
11115 6233353 : if (pop == const0_rtx)
11116 0 : pop = NULL;
11117 6233353 : gcc_assert (!TARGET_64BIT || !pop);
11118 :
11119 6233353 : rtx addr = XEXP (fnaddr, 0);
11120 6233353 : if (TARGET_MACHO && !TARGET_64BIT)
11121 : {
11122 : #if TARGET_MACHO
11123 : if (flag_pic && SYMBOL_REF_P (XEXP (fnaddr, 0)))
11124 : fnaddr = machopic_indirect_call_target (fnaddr);
11125 : #endif
11126 : }
11127 : else
11128 : {
11129 : /* Static functions and indirect calls don't need the pic register. Also,
11130 : check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
11131 : it an indirect call. */
11132 6233353 : if (flag_pic
11133 529637 : && SYMBOL_REF_P (addr)
11134 6736305 : && ix86_call_use_plt_p (addr))
11135 : {
11136 402347 : if (flag_plt
11137 402347 : && (SYMBOL_REF_DECL (addr) == NULL_TREE
11138 402313 : || !lookup_attribute ("noplt",
11139 402313 : DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
11140 : {
11141 402312 : if (!TARGET_64BIT
11142 223508 : || (ix86_cmodel == CM_LARGE_PIC
11143 : && DEFAULT_ABI != MS_ABI))
11144 : {
11145 536446 : use_reg (&use, gen_rtx_REG (Pmode,
11146 : REAL_PIC_OFFSET_TABLE_REGNUM));
11147 178838 : if (ix86_use_pseudo_pic_reg ())
11148 357642 : emit_move_insn (gen_rtx_REG (Pmode,
11149 178838 : REAL_PIC_OFFSET_TABLE_REGNUM),
11150 : pic_offset_table_rtx);
11151 : }
11152 : }
11153 35 : else if (!TARGET_PECOFF && !TARGET_MACHO)
11154 : {
11155 35 : if (TARGET_64BIT
11156 35 : && ix86_cmodel == CM_LARGE_PIC
11157 : && DEFAULT_ABI != MS_ABI)
11158 : {
11159 1 : fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
11160 : UNSPEC_GOT);
11161 1 : fnaddr = gen_rtx_CONST (Pmode, fnaddr);
11162 1 : fnaddr = force_reg (Pmode, fnaddr);
11163 1 : fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
11164 : }
11165 34 : else if (TARGET_64BIT)
11166 : {
11167 38 : fnaddr = gen_rtx_UNSPEC (Pmode,
11168 : gen_rtvec (1, addr),
11169 : UNSPEC_GOTPCREL);
11170 38 : fnaddr = gen_rtx_CONST (Pmode, fnaddr);
11171 : }
11172 : else
11173 : {
11174 0 : fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
11175 : UNSPEC_GOT);
11176 0 : fnaddr = gen_rtx_CONST (Pmode, fnaddr);
11177 0 : fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
11178 : fnaddr);
11179 : }
11180 39 : fnaddr = gen_const_mem (Pmode, fnaddr);
11181 : /* Pmode may not be the same as word_mode for x32, which
11182 : doesn't support indirect branch via 32-bit memory slot.
11183 : Since x32 GOT slot is 64 bit with zero upper 32 bits,
11184 : indirect branch via x32 GOT slot is OK. */
11185 35 : if (GET_MODE (fnaddr) != word_mode)
11186 4 : fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
11187 35 : fnaddr = gen_rtx_MEM (QImode, fnaddr);
11188 : }
11189 : }
11190 : }
11191 :
11192 : /* Skip setting up RAX register for -mskip-rax-setup when there are no
11193 : parameters passed in vector registers. */
11194 6233353 : if (TARGET_64BIT
11195 5393571 : && (INTVAL (callarg2) > 0
11196 5332411 : || (INTVAL (callarg2) == 0
11197 318214 : && (TARGET_SSE || !flag_skip_rax_setup))))
11198 : {
11199 379372 : rtx al = gen_rtx_REG (QImode, AX_REG);
11200 379372 : emit_move_insn (al, callarg2);
11201 379372 : use_reg (&use, al);
11202 : }
11203 :
11204 6233353 : if (ix86_cmodel == CM_LARGE_PIC
11205 : && !TARGET_PECOFF
11206 45 : && MEM_P (fnaddr)
11207 45 : && SYMBOL_REF_P (XEXP (fnaddr, 0))
11208 6233390 : && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
11209 34 : fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
11210 : /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
11211 : branch via x32 GOT slot is OK. */
11212 6233319 : else if (TARGET_X32
11213 74 : && MEM_P (fnaddr)
11214 74 : && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
11215 8 : && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode)
11216 6233323 : && !TARGET_INDIRECT_BRANCH_REGISTER)
11217 : ;
11218 6233319 : else if (sibcall
11219 6233319 : ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
11220 6104161 : : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
11221 : {
11222 532 : fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
11223 532 : fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
11224 : }
11225 :
11226 : /* PR100665: Hwasan may tag code pointer which is not supported by LAM,
11227 : mask off code pointers here.
11228 : TODO: also need to handle indirect jump. */
11229 6234403 : if (ix86_memtag_can_tag_addresses () && !fndecl
11230 6233377 : && sanitize_flags_p (SANITIZE_HWADDRESS))
11231 : {
11232 24 : rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
11233 : NULL_RTX);
11234 24 : fnaddr = gen_rtx_MEM (QImode, untagged_addr);
11235 : }
11236 :
11237 6233353 : call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
11238 :
11239 6233353 : if (retval)
11240 2464922 : call = gen_rtx_SET (retval, call);
11241 6233353 : vec[vec_len++] = call;
11242 :
11243 6233353 : if (pop)
11244 : {
11245 450458 : pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
11246 225229 : pop = gen_rtx_SET (stack_pointer_rtx, pop);
11247 225229 : vec[vec_len++] = pop;
11248 : }
11249 :
11250 6233353 : static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
11251 :
11252 6233353 : if ((cfun->machine->call_saved_registers
11253 6233353 : == TYPE_NO_CALLER_SAVED_REGISTERS)
11254 6233353 : && (!fndecl
11255 468 : || (!TREE_THIS_VOLATILE (fndecl)
11256 186 : && !lookup_attribute ("no_caller_saved_registers",
11257 186 : TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
11258 : {
11259 182 : bool is_64bit_ms_abi = (TARGET_64BIT
11260 182 : && ix86_function_abi (fndecl) == MS_ABI);
11261 182 : char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
11262 :
11263 : /* If there are no caller-saved registers, add all registers
11264 : that are clobbered by the call which returns. */
11265 16926 : for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
11266 16744 : if (!fixed_regs[i]
11267 3242 : && (ix86_call_used_regs[i] == 1
11268 1506 : || (ix86_call_used_regs[i] & c_mask))
11269 2150 : && !STACK_REGNO_P (i)
11270 2150 : && !MMX_REGNO_P (i))
11271 2150 : clobber_reg (&use,
11272 2150 : gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
11273 : }
11274 5393389 : else if (TARGET_64BIT_MS_ABI
11275 6306574 : && (!callarg2 || INTVAL (callarg2) != -2))
11276 : {
11277 : unsigned i;
11278 :
11279 861718 : for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
11280 : {
11281 795432 : int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
11282 795432 : machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
11283 :
11284 795432 : clobber_reg (&use, gen_rtx_REG (mode, regno));
11285 : }
11286 :
11287 : /* Set here, but it may get cleared later. */
11288 66286 : if (TARGET_CALL_MS2SYSV_XLOGUES)
11289 : {
11290 7046 : if (!TARGET_SSE)
11291 : ;
11292 :
11293 : /* Don't break hot-patched functions. */
11294 7046 : else if (ix86_function_ms_hook_prologue (current_function_decl))
11295 : ;
11296 :
11297 : /* TODO: Cases not yet examined. */
11298 7046 : else if (flag_split_stack)
11299 0 : warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
11300 :
11301 : else
11302 : {
11303 7046 : gcc_assert (!reload_completed);
11304 7046 : cfun->machine->call_ms2sysv = true;
11305 : }
11306 : }
11307 : }
11308 :
11309 6233353 : if (TARGET_MACHO && TARGET_64BIT && !sibcall
11310 : && ((SYMBOL_REF_P (addr) && !SYMBOL_REF_LOCAL_P (addr))
11311 : || !fndecl || TREE_PUBLIC (fndecl)))
11312 : {
11313 : /* We allow public functions defined in a TU to bind locally for PIC
11314 : code (the default) on 64bit Mach-O.
11315 : If such functions are not inlined, we cannot tell at compile-time if
11316 : they will be called via the lazy symbol resolver (this can depend on
11317 : options given at link-time). Therefore, we must assume that the lazy
11318 : resolver could be used which clobbers R11 and R10. */
11319 : clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
11320 : clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
11321 : }
11322 :
11323 6233353 : if (call_no_callee_saved_registers)
11324 : {
11325 : /* After calling a no_callee_saved_registers function, all
11326 : registers may be clobbered. Clobber all registers that are
11327 : not used by the callee. */
11328 59 : bool is_64bit_ms_abi = (TARGET_64BIT
11329 59 : && ix86_function_abi (fndecl) == MS_ABI);
11330 59 : char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
11331 5487 : for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
11332 5428 : if (!fixed_regs[i]
11333 2597 : && i != HARD_FRAME_POINTER_REGNUM
11334 2538 : && !(ix86_call_used_regs[i] == 1
11335 973 : || (ix86_call_used_regs[i] & c_mask))
11336 295 : && !STACK_REGNO_P (i)
11337 295 : && !MMX_REGNO_P (i))
11338 295 : clobber_reg (&use,
11339 295 : gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
11340 : }
11341 :
11342 6233353 : if (vec_len > 1)
11343 225229 : call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
11344 6233353 : rtx_insn *call_insn = emit_call_insn (call);
11345 6233353 : if (use)
11346 598638 : CALL_INSN_FUNCTION_USAGE (call_insn) = use;
11347 :
11348 6233353 : return call_insn;
11349 : }
11350 :
/* Split a simple return that pops POPC bytes from the stack into an
   indirect branch with an explicit stack adjustment.  */

void
ix86_split_simple_return_pop_internal (rtx popc)
{
  struct machine_function *m = cfun->machine;
  rtx ecx = gen_rtx_REG (SImode, CX_REG);
  rtx_insn *insn;

  /* There is no "pascal" calling convention in any 64bit ABI.  */
  gcc_assert (!TARGET_64BIT);

  /* Pop the return address into %ecx, keeping the recorded frame-state
     offsets in sync with the stack pointer change.  */
  insn = emit_insn (gen_pop (ecx));
  m->fs.cfa_offset -= UNITS_PER_WORD;
  m->fs.sp_offset -= UNITS_PER_WORD;

  /* Tell the unwinder about the CFA adjustment and that the return
     address now lives in ECX.  */
  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Release the POPC bytes of on-stack arguments.  */
  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  insn = emit_insn (x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Now return address is in ECX.  */
  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
}
11383 :
11384 : /* Errors in the source file can cause expand_expr to return const0_rtx
11385 : where we expect a vector. To avoid crashing, use one of the vector
11386 : clear instructions. */
11387 :
11388 : static rtx
11389 197989 : safe_vector_operand (rtx x, machine_mode mode)
11390 : {
11391 0 : if (x == const0_rtx)
11392 0 : x = CONST0_RTX (mode);
11393 24 : return x;
11394 : }
11395 :
11396 : /* Subroutine of ix86_expand_builtin to take care of binop insns. */
11397 :
11398 : static rtx
11399 8997 : ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
11400 : {
11401 8997 : rtx pat;
11402 8997 : tree arg0 = CALL_EXPR_ARG (exp, 0);
11403 8997 : tree arg1 = CALL_EXPR_ARG (exp, 1);
11404 8997 : rtx op0 = expand_normal (arg0);
11405 8997 : rtx op1 = expand_normal (arg1);
11406 8997 : machine_mode tmode = insn_data[icode].operand[0].mode;
11407 8997 : machine_mode mode0 = insn_data[icode].operand[1].mode;
11408 8997 : machine_mode mode1 = insn_data[icode].operand[2].mode;
11409 :
11410 8997 : if (VECTOR_MODE_P (mode0))
11411 8986 : op0 = safe_vector_operand (op0, mode0);
11412 8997 : if (VECTOR_MODE_P (mode1))
11413 8850 : op1 = safe_vector_operand (op1, mode1);
11414 :
11415 2852 : if (optimize || !target
11416 2852 : || GET_MODE (target) != tmode
11417 11849 : || !insn_data[icode].operand[0].predicate (target, tmode))
11418 6198 : target = gen_reg_rtx (tmode);
11419 :
11420 8997 : if (GET_MODE (op1) == SImode && mode1 == TImode)
11421 : {
11422 0 : rtx x = gen_reg_rtx (V4SImode);
11423 0 : emit_insn (gen_sse2_loadd (x, op1));
11424 0 : op1 = gen_lowpart (TImode, x);
11425 : }
11426 :
11427 8997 : if (!insn_data[icode].operand[1].predicate (op0, mode0))
11428 1409 : op0 = copy_to_mode_reg (mode0, op0);
11429 8997 : if (!insn_data[icode].operand[2].predicate (op1, mode1))
11430 817 : op1 = copy_to_mode_reg (mode1, op1);
11431 :
11432 8997 : pat = GEN_FCN (icode) (target, op0, op1);
11433 8997 : if (! pat)
11434 : return 0;
11435 :
11436 8997 : emit_insn (pat);
11437 :
11438 8997 : return target;
11439 : }
11440 :
/* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns.
   M_TYPE selects the argument count and the special handling flags
   (trailing immediate, comparison form, XOP "true/false" selector);
   SUB_CODE carries the rtx comparison/selector code for the CMP and TF
   variants.  Returns the result register.  */

static rtx
ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
			       enum ix86_builtin_func_type m_type,
			       enum rtx_code sub_code)
{
  rtx pat;
  unsigned int i, nargs;
  bool comparison_p = false;	/* Insn embeds a comparison rtx.  */
  bool tf_p = false;		/* Insn takes SUB_CODE as an immediate.  */
  bool last_arg_constant = false;
  int num_memory = 0;		/* Memory operands seen so far.  */
  rtx xops[4];

  machine_mode tmode = insn_data[icode].operand[0].mode;

  /* Decode M_TYPE into NARGS plus the handling flags above.  */
  switch (m_type)
    {
    case MULTI_ARG_4_DF2_DI_I:
    case MULTI_ARG_4_DF2_DI_I1:
    case MULTI_ARG_4_SF2_SI_I:
    case MULTI_ARG_4_SF2_SI_I1:
      nargs = 4;
      last_arg_constant = true;
      break;

    case MULTI_ARG_3_SF:
    case MULTI_ARG_3_DF:
    case MULTI_ARG_3_SF2:
    case MULTI_ARG_3_DF2:
    case MULTI_ARG_3_DI:
    case MULTI_ARG_3_SI:
    case MULTI_ARG_3_SI_DI:
    case MULTI_ARG_3_HI:
    case MULTI_ARG_3_HI_SI:
    case MULTI_ARG_3_QI:
    case MULTI_ARG_3_DI2:
    case MULTI_ARG_3_SI2:
    case MULTI_ARG_3_HI2:
    case MULTI_ARG_3_QI2:
      nargs = 3;
      break;

    case MULTI_ARG_2_SF:
    case MULTI_ARG_2_DF:
    case MULTI_ARG_2_DI:
    case MULTI_ARG_2_SI:
    case MULTI_ARG_2_HI:
    case MULTI_ARG_2_QI:
      nargs = 2;
      break;

    case MULTI_ARG_2_DI_IMM:
    case MULTI_ARG_2_SI_IMM:
    case MULTI_ARG_2_HI_IMM:
    case MULTI_ARG_2_QI_IMM:
      nargs = 2;
      last_arg_constant = true;
      break;

    case MULTI_ARG_1_SF:
    case MULTI_ARG_1_DF:
    case MULTI_ARG_1_SF2:
    case MULTI_ARG_1_DF2:
    case MULTI_ARG_1_DI:
    case MULTI_ARG_1_SI:
    case MULTI_ARG_1_HI:
    case MULTI_ARG_1_QI:
    case MULTI_ARG_1_SI_DI:
    case MULTI_ARG_1_HI_DI:
    case MULTI_ARG_1_HI_SI:
    case MULTI_ARG_1_QI_DI:
    case MULTI_ARG_1_QI_SI:
    case MULTI_ARG_1_QI_HI:
      nargs = 1;
      break;

    case MULTI_ARG_2_DI_CMP:
    case MULTI_ARG_2_SI_CMP:
    case MULTI_ARG_2_HI_CMP:
    case MULTI_ARG_2_QI_CMP:
      nargs = 2;
      comparison_p = true;
      break;

    case MULTI_ARG_2_SF_TF:
    case MULTI_ARG_2_DF_TF:
    case MULTI_ARG_2_DI_TF:
    case MULTI_ARG_2_SI_TF:
    case MULTI_ARG_2_HI_TF:
    case MULTI_ARG_2_QI_TF:
      nargs = 2;
      tf_p = true;
      break;

    default:
      gcc_unreachable ();
    }

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);
  else if (memory_operand (target, tmode))
    num_memory++;

  gcc_assert (nargs <= ARRAY_SIZE (xops));

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      /* For comparison insns operand 1 is the embedded comparison rtx,
	 so the real inputs start one slot later.  */
      int adjust = (comparison_p) ? 1 : 0;
      machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;

      if (last_arg_constant && i == nargs - 1)
	{
	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
	    {
	      enum insn_code new_icode = icode;
	      switch (icode)
		{
		case CODE_FOR_xop_vpermil2v2df3:
		case CODE_FOR_xop_vpermil2v4sf3:
		case CODE_FOR_xop_vpermil2v4df3:
		case CODE_FOR_xop_vpermil2v8sf3:
		  error ("the last argument must be a 2-bit immediate");
		  return gen_reg_rtx (tmode);
		case CODE_FOR_xop_rotlv2di3:
		  new_icode = CODE_FOR_rotlv2di3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv4si3:
		  new_icode = CODE_FOR_rotlv4si3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv8hi3:
		  new_icode = CODE_FOR_rotlv8hi3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv16qi3:
		  new_icode = CODE_FOR_rotlv16qi3;
		xop_rotl:
		  /* XOP rotates take the count mod the element width,
		     so a constant count can be canonicalized here.  */
		  if (CONST_INT_P (op))
		    {
		      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
		      op = GEN_INT (INTVAL (op) & mask);
		      gcc_checking_assert
			(insn_data[icode].operand[i + 1].predicate (op, mode));
		    }
		  else
		    {
		      /* Non-constant count: fall back to the generic
			 rotate pattern, which must be fully operand-
			 compatible with the XOP one.  */
		      gcc_checking_assert
			(nargs == 2
			 && insn_data[new_icode].operand[0].mode == tmode
			 && insn_data[new_icode].operand[1].mode == tmode
			 && insn_data[new_icode].operand[2].mode == mode
			 && insn_data[new_icode].operand[0].predicate
			    == insn_data[icode].operand[0].predicate
			 && insn_data[new_icode].operand[1].predicate
			    == insn_data[icode].operand[1].predicate);
		      icode = new_icode;
		      goto non_constant;
		    }
		  break;
		default:
		  gcc_unreachable ();
		}
	    }
	}
      else
	{
	non_constant:
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  /* If we aren't optimizing, only allow one memory operand to be
	     generated.  */
	  if (memory_operand (op, mode))
	    num_memory++;

	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);

	  if (optimize
	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
	      || num_memory > 1)
	    op = force_reg (mode, op);
	}

      xops[i] = op;
    }

  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;

    case 2:
      if (tf_p)
	pat = GEN_FCN (icode) (target, xops[0], xops[1],
			       GEN_INT ((int)sub_code));
      else if (! comparison_p)
	pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      else
	{
	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
				       xops[0], xops[1]);

	  pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
	}
      break;

    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;

    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
      break;

    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  return target;
}
11670 :
11671 : /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
11672 : insns with vec_merge. */
11673 :
11674 : static rtx
11675 52 : ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
11676 : rtx target)
11677 : {
11678 52 : rtx pat;
11679 52 : tree arg0 = CALL_EXPR_ARG (exp, 0);
11680 52 : rtx op1, op0 = expand_normal (arg0);
11681 52 : machine_mode tmode = insn_data[icode].operand[0].mode;
11682 52 : machine_mode mode0 = insn_data[icode].operand[1].mode;
11683 :
11684 16 : if (optimize || !target
11685 16 : || GET_MODE (target) != tmode
11686 68 : || !insn_data[icode].operand[0].predicate (target, tmode))
11687 36 : target = gen_reg_rtx (tmode);
11688 :
11689 52 : if (VECTOR_MODE_P (mode0))
11690 52 : op0 = safe_vector_operand (op0, mode0);
11691 :
11692 36 : if ((optimize && !register_operand (op0, mode0))
11693 88 : || !insn_data[icode].operand[1].predicate (op0, mode0))
11694 0 : op0 = copy_to_mode_reg (mode0, op0);
11695 :
11696 52 : op1 = op0;
11697 52 : if (!insn_data[icode].operand[2].predicate (op1, mode0))
11698 16 : op1 = copy_to_mode_reg (mode0, op1);
11699 :
11700 52 : pat = GEN_FCN (icode) (target, op0, op1);
11701 52 : if (! pat)
11702 : return 0;
11703 52 : emit_insn (pat);
11704 52 : return target;
11705 : }
11706 :
11707 : /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
11708 :
11709 : static rtx
11710 614 : ix86_expand_sse_compare (const struct builtin_description *d,
11711 : tree exp, rtx target, bool swap)
11712 : {
11713 614 : rtx pat;
11714 614 : tree arg0 = CALL_EXPR_ARG (exp, 0);
11715 614 : tree arg1 = CALL_EXPR_ARG (exp, 1);
11716 614 : rtx op0 = expand_normal (arg0);
11717 614 : rtx op1 = expand_normal (arg1);
11718 614 : rtx op2;
11719 614 : machine_mode tmode = insn_data[d->icode].operand[0].mode;
11720 614 : machine_mode mode0 = insn_data[d->icode].operand[1].mode;
11721 614 : machine_mode mode1 = insn_data[d->icode].operand[2].mode;
11722 614 : enum rtx_code comparison = d->comparison;
11723 :
11724 614 : if (VECTOR_MODE_P (mode0))
11725 614 : op0 = safe_vector_operand (op0, mode0);
11726 614 : if (VECTOR_MODE_P (mode1))
11727 614 : op1 = safe_vector_operand (op1, mode1);
11728 :
11729 : /* Swap operands if we have a comparison that isn't available in
11730 : hardware. */
11731 614 : if (swap)
11732 80 : std::swap (op0, op1);
11733 :
11734 202 : if (optimize || !target
11735 202 : || GET_MODE (target) != tmode
11736 816 : || !insn_data[d->icode].operand[0].predicate (target, tmode))
11737 412 : target = gen_reg_rtx (tmode);
11738 :
11739 412 : if ((optimize && !register_operand (op0, mode0))
11740 956 : || !insn_data[d->icode].operand[1].predicate (op0, mode0))
11741 272 : op0 = copy_to_mode_reg (mode0, op0);
11742 412 : if ((optimize && !register_operand (op1, mode1))
11743 972 : || !insn_data[d->icode].operand[2].predicate (op1, mode1))
11744 54 : op1 = copy_to_mode_reg (mode1, op1);
11745 :
11746 614 : op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
11747 614 : pat = GEN_FCN (d->icode) (target, op0, op1, op2);
11748 614 : if (! pat)
11749 : return 0;
11750 614 : emit_insn (pat);
11751 614 : return target;
11752 : }
11753 :
/* Subroutine of ix86_sse_comi and ix86_sse_comi_round: materialize the
   boolean result of COMPARISON on SET_DST into TARGET (a QImode SUBREG
   of an SImode pseudo that was pre-loaded with the NAN-case value).
   When CHECK_UNORDERED is set, emit a PF-based jump that skips the
   setcc for unordered (NAN) inputs, leaving the preset value.  MODE is
   the CC mode to test, a subset of the CCFPmode the compare set.
   Returns the SImode result register.  */

static rtx
ix86_ssecom_setcc (const enum rtx_code comparison,
		   bool check_unordered, machine_mode mode,
		   rtx set_dst, rtx target)
{

  rtx_code_label *label = NULL;

  /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
     with NAN operands.
     Under TARGET_AVX10_2, VCOMX/VUCOMX are generated instead of
     COMI/UCOMI. VCOMX/VUCOMX will not set ZF for NAN operands.  */
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      /* if (PF) goto label;  -- skip the setcc for unordered inputs.  */
      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }

  /* NB: Set CCFPmode and check a different CCmode which is in subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
		  || mode == CCOmode || mode == CCPmode
		  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  /* setcc into the low byte of the result pseudo.  */
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  set_dst,
					  const0_rtx)));

  if (label)
    emit_label (label);

  return SUBREG_REG (target);
}
11802 :
/* Subroutine of ix86_expand_builtin to take care of comi insns.
   Expands a scalar FP ordered compare builtin: canonicalizes LT/LE to
   GT/GE by swapping operands, emits the (U)COMI compare (or the AVX10.2
   VCOMX/VUCOMX form when COMX_OK and available), and converts the
   resulting flags to a 0/1 value via ix86_ssecom_setcc.  */

static rtx
ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
		      rtx target, bool comx_ok)
{
  rtx pat, set_dst;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = d->comparison;
  /* CONST_VAL is what the result must read if the setcc is skipped for
     unordered (NAN) inputs: 0 for EQ, 1 for NE.  */
  rtx const_val = const0_rtx;

  bool check_unordered = false;
  machine_mode mode = CCFPmode;
  switch (comparison)
    {
    case LE:	/* -> GE  */
    case LT:	/* -> GT  */
      std::swap (op0, op1);
      comparison = swap_condition (comparison);
      /* FALLTHRU */
    case GT:
    case GE:
      break;
    case EQ:
      /* Plain (U)COMI needs the PF guard; VCOMX/VUCOMX do not.  */
      if (!TARGET_AVX10_2 || !comx_ok)
	check_unordered = true;
      mode = CCZmode;
      break;
    case NE:
      if (!TARGET_AVX10_2 || !comx_ok)
	check_unordered = true;
      mode = CCZmode;
      const_val = const1_rtx;
      break;
    default:
      gcc_unreachable ();
    }

  /* Preset the result, then setcc into its low byte.  */
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  /* Switch to the AVX10.2 COMX/UCOMX forms for EQ/NE, which handle
     NANs without the extra PF branch.  */
  if ((comparison == EQ || comparison == NE)
      && TARGET_AVX10_2 && comx_ok)
    {
      switch (icode)
	{
	case CODE_FOR_sse_comi:
	  icode = CODE_FOR_avx10_2_comxsf;
	  break;
	case CODE_FOR_sse_ucomi:
	  icode = CODE_FOR_avx10_2_ucomxsf;
	  break;
	case CODE_FOR_sse2_comi:
	  icode = CODE_FOR_avx10_2_comxdf;
	  break;
	case CODE_FOR_sse2_ucomi:
	  icode = CODE_FOR_avx10_2_ucomxdf;
	  break;

	default:
	  gcc_unreachable ();
	}
    }
  pat = GEN_FCN (icode) (op0, op1);
  if (! pat)
    return 0;

  set_dst = SET_DEST (pat);
  emit_insn (pat);
  return ix86_ssecom_setcc (comparison, check_unordered, mode,
			    set_dst, target);
}
11896 :
11897 : /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
11898 :
11899 : static rtx
11900 0 : ix86_expand_sse_round (const struct builtin_description *d, tree exp,
11901 : rtx target)
11902 : {
11903 0 : rtx pat;
11904 0 : tree arg0 = CALL_EXPR_ARG (exp, 0);
11905 0 : rtx op1, op0 = expand_normal (arg0);
11906 0 : machine_mode tmode = insn_data[d->icode].operand[0].mode;
11907 0 : machine_mode mode0 = insn_data[d->icode].operand[1].mode;
11908 :
11909 0 : if (optimize || target == 0
11910 0 : || GET_MODE (target) != tmode
11911 0 : || !insn_data[d->icode].operand[0].predicate (target, tmode))
11912 0 : target = gen_reg_rtx (tmode);
11913 :
11914 0 : if (VECTOR_MODE_P (mode0))
11915 0 : op0 = safe_vector_operand (op0, mode0);
11916 :
11917 0 : if ((optimize && !register_operand (op0, mode0))
11918 0 : || !insn_data[d->icode].operand[0].predicate (op0, mode0))
11919 0 : op0 = copy_to_mode_reg (mode0, op0);
11920 :
11921 0 : op1 = GEN_INT (d->comparison);
11922 :
11923 0 : pat = GEN_FCN (d->icode) (target, op0, op1);
11924 0 : if (! pat)
11925 : return 0;
11926 0 : emit_insn (pat);
11927 0 : return target;
11928 : }
11929 :
11930 : static rtx
11931 12 : ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
11932 : tree exp, rtx target)
11933 : {
11934 12 : rtx pat;
11935 12 : tree arg0 = CALL_EXPR_ARG (exp, 0);
11936 12 : tree arg1 = CALL_EXPR_ARG (exp, 1);
11937 12 : rtx op0 = expand_normal (arg0);
11938 12 : rtx op1 = expand_normal (arg1);
11939 12 : rtx op2;
11940 12 : machine_mode tmode = insn_data[d->icode].operand[0].mode;
11941 12 : machine_mode mode0 = insn_data[d->icode].operand[1].mode;
11942 12 : machine_mode mode1 = insn_data[d->icode].operand[2].mode;
11943 :
11944 0 : if (optimize || target == 0
11945 0 : || GET_MODE (target) != tmode
11946 12 : || !insn_data[d->icode].operand[0].predicate (target, tmode))
11947 12 : target = gen_reg_rtx (tmode);
11948 :
11949 12 : op0 = safe_vector_operand (op0, mode0);
11950 12 : op1 = safe_vector_operand (op1, mode1);
11951 :
11952 12 : if ((optimize && !register_operand (op0, mode0))
11953 12 : || !insn_data[d->icode].operand[0].predicate (op0, mode0))
11954 12 : op0 = copy_to_mode_reg (mode0, op0);
11955 12 : if ((optimize && !register_operand (op1, mode1))
11956 12 : || !insn_data[d->icode].operand[1].predicate (op1, mode1))
11957 12 : op1 = copy_to_mode_reg (mode1, op1);
11958 :
11959 12 : op2 = GEN_INT (d->comparison);
11960 :
11961 12 : pat = GEN_FCN (d->icode) (target, op0, op1, op2);
11962 12 : if (! pat)
11963 : return 0;
11964 12 : emit_insn (pat);
11965 12 : return target;
11966 : }
11967 :
/* Subroutine of ix86_expand_builtin to take care of ptest insns.
   Emits the flag-setting PTEST and converts the flag selected by
   D->comparison into a 0/1 SImode value.  */

static rtx
ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
		       rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  /* ptest reg, reg sets the carry flag.  */
  if (comparison == LTU
      && (d->code == IX86_BUILTIN_PTESTC
	  || d->code == IX86_BUILTIN_PTESTC256)
      && rtx_equal_p (op0, op1))
    {
      /* PTESTC with identical operands always yields 1: fold it away.  */
      if (!target)
	target = gen_reg_rtx (SImode);
      emit_move_insn (target, const1_rtx);
      return target;
    }

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Zero the result pseudo, then setcc into its low byte.  */
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  SET_DEST (pat),
					  const0_rtx)));

  return SUBREG_REG (target);
}
12022 :
12023 : /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
12024 :
static rtx
ix86_expand_sse_pcmpestr (const struct builtin_description *d,
			  tree exp, rtx target)
{
  /* Expand a pcmpestr{i,m} builtin described by D.  EXP is the CALL_EXPR
     with five arguments: two vectors, their two explicit lengths, and an
     8-bit immediate control byte.  Returns the result rtx; const0_rtx
     after diagnosing a non-immediate fifth argument; 0 if the insn
     pattern could not be generated.  */
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  tree arg4 = CALL_EXPR_ARG (exp, 4);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  rtx op4 = expand_normal (arg4);
  machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;

  /* Operand layout of the pattern: operands 0/1 are the two results
     (index and mask), operands 2-5 are vector/length pairs, operand 6
     is the immediate control byte.  */
  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modei3 = insn_data[d->icode].operand[3].mode;
  modev4 = insn_data[d->icode].operand[4].mode;
  modei5 = insn_data[d->icode].operand[5].mode;
  modeimm = insn_data[d->icode].operand[6].mode;

  /* Guard vector operands against unsafe memory references.  */
  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev4))
    op2 = safe_vector_operand (op2, modev4);

  /* Force operands into registers where the insn predicates (or, when
     optimizing, register preference) require it.  */
  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
    op1 = copy_to_mode_reg (modei3, op1);
  if ((optimize && !register_operand (op2, modev4))
      || !insn_data[d->icode].operand[4].predicate (op2, modev4))
    op2 = copy_to_mode_reg (modev4, op2);
  if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
    op3 = copy_to_mode_reg (modei5, op3);

  /* The control byte must be a compile-time constant.  */
  if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
    {
      error ("the fifth argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPESTRI128)
    {
      /* Index variant: result is operand 0; the mask output (operand 1)
	 goes into a dead scratch register.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
    }
  else if (d->code == IX86_BUILTIN_PCMPESTRM128)
    {
      /* Mask variant: result is operand 1; the index output (operand 0)
	 goes into a dead scratch register.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
    }
  else
    {
      /* Flag-reading variant (pcmpestr[acosz]): both insn outputs are
	 dead; the real result is a condition-code test below.  */
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      /* Materialize the requested flag as 0/1: zero an SImode pseudo,
	 then set its low QImode part from a compare of the flags
	 register (d->flag encodes the CC mode to read).  */
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
						   FLAGS_REG),
				      const0_rtx)));
      /* Return the zero-extended SImode pseudo, not the QImode subreg.  */
      return SUBREG_REG (target);
    }
  else
    return target;
}
12126 :
12127 :
12128 : /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
12129 :
static rtx
ix86_expand_sse_pcmpistr (const struct builtin_description *d,
			  tree exp, rtx target)
{
  /* Expand a pcmpistr{i,m} builtin described by D.  EXP is the CALL_EXPR
     with three arguments: two vectors (implicit, NUL-terminated lengths)
     and an 8-bit immediate control byte.  Returns the result rtx;
     const0_rtx after diagnosing a non-immediate third argument; 0 if the
     insn pattern could not be generated.  */
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  machine_mode tmode0, tmode1, modev2, modev3, modeimm;

  /* Operand layout: operands 0/1 are the two results (index and mask),
     operands 2/3 are the source vectors, operand 4 the immediate.  */
  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modev3 = insn_data[d->icode].operand[3].mode;
  modeimm = insn_data[d->icode].operand[4].mode;

  /* Guard vector operands against unsafe memory references.  */
  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev3))
    op1 = safe_vector_operand (op1, modev3);

  /* Force operands into registers where the insn predicates (or, when
     optimizing, register preference) require it.  */
  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if ((optimize && !register_operand (op1, modev3))
      || !insn_data[d->icode].operand[3].predicate (op1, modev3))
    op1 = copy_to_mode_reg (modev3, op1);

  /* The control byte must be a compile-time constant.  */
  if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
    {
      error ("the third argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPISTRI128)
    {
      /* Index variant: result is operand 0; the mask output (operand 1)
	 goes into a dead scratch register.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
    }
  else if (d->code == IX86_BUILTIN_PCMPISTRM128)
    {
      /* Mask variant: result is operand 1; the index output (operand 0)
	 goes into a dead scratch register.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
    }
  else
    {
      /* Flag-reading variant (pcmpistr[acosz]): both insn outputs are
	 dead; the real result is a condition-code test below.  */
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      /* Materialize the requested flag as 0/1: zero an SImode pseudo,
	 then set its low QImode part from a compare of the flags
	 register (d->flag encodes the CC mode to read).  */
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
						   FLAGS_REG),
				      const0_rtx)));
      /* Return the zero-extended SImode pseudo, not the QImode subreg.  */
      return SUBREG_REG (target);
    }
  else
    return target;
}
12221 :
12222 : /* Fixup modeless constants to fit required mode. */
12223 :
12224 : static rtx
12225 260811 : fixup_modeless_constant (rtx x, machine_mode mode)
12226 : {
12227 260811 : if (GET_MODE (x) == VOIDmode)
12228 41463 : x = convert_to_mode (mode, x, 1);
12229 260811 : return x;
12230 : }
12231 :
12232 : /* Expand the outgoing argument ARG to extract unsigned char and short
12233 : integer constants suitable for the predicates and the instruction
12234 : templates which expect the unsigned expanded value. */
12235 :
12236 : static rtx
12237 282055 : ix86_expand_unsigned_small_int_cst_argument (tree arg)
12238 : {
12239 : /* When passing 0xff as an unsigned char function argument with the
12240 : C frontend promotion, expand_normal gets
12241 :
12242 : <integer_cst 0x7fffe6aa23a8 type <integer_type 0x7fffe98225e8 int> constant 255>
12243 :
12244 : and returns the rtx value using the sign-extended representation:
12245 :
12246 : (const_int 255 [0xff])
12247 :
12248 : Without the C frontend promotion, expand_normal gets
12249 :
12250 : <integer_cst 0x7fffe9824018 type <integer_type 0x7fffe9822348 unsigned char > constant 255>
12251 :
12252 : and returns
12253 :
12254 : (const_int -1 [0xffffffffffffffff])
12255 :
12256 : which doesn't work with the predicates nor the instruction templates
12257 : which expect the unsigned expanded value. Extract the unsigned char
12258 : and short integer constants to return
12259 :
12260 : (const_int 255 [0xff])
12261 :
12262 : so that the expanded value is always unsigned, without the C frontend
12263 : promotion. */
12264 :
12265 282055 : if (TREE_CODE (arg) == INTEGER_CST)
12266 : {
12267 60352 : tree type = TREE_TYPE (arg);
12268 60352 : if (INTEGRAL_TYPE_P (type)
12269 60352 : && TYPE_UNSIGNED (type)
12270 82165 : && TYPE_PRECISION (type) < TYPE_PRECISION (integer_type_node))
12271 : {
12272 18326 : HOST_WIDE_INT cst = TREE_INT_CST_LOW (arg);
12273 18326 : return GEN_INT (cst);
12274 : }
12275 : }
12276 :
12277 263729 : return expand_normal (arg);
12278 : }
12279 :
12280 : /* Subroutine of ix86_expand_builtin to take care of insns with
12281 : variable number of operands. */
12282 :
12283 : static rtx
12284 70978 : ix86_expand_args_builtin (const struct builtin_description *d,
12285 : tree exp, rtx target)
12286 : {
12287 70978 : rtx pat, real_target;
12288 70978 : unsigned int i, nargs;
12289 70978 : unsigned int nargs_constant = 0;
12290 70978 : unsigned int mask_pos = 0;
12291 70978 : int num_memory = 0;
12292 70978 : rtx xops[6];
12293 70978 : bool second_arg_count = false;
12294 70978 : enum insn_code icode = d->icode;
12295 70978 : const struct insn_data_d *insn_p = &insn_data[icode];
12296 70978 : machine_mode tmode = insn_p->operand[0].mode;
12297 70978 : machine_mode rmode = VOIDmode;
12298 70978 : bool swap = false;
12299 70978 : enum rtx_code comparison = d->comparison;
12300 :
12301 70978 : switch ((enum ix86_builtin_func_type) d->flag)
12302 : {
12303 0 : case V2DF_FTYPE_V2DF_ROUND:
12304 0 : case V4DF_FTYPE_V4DF_ROUND:
12305 0 : case V8DF_FTYPE_V8DF_ROUND:
12306 0 : case V4SF_FTYPE_V4SF_ROUND:
12307 0 : case V8SF_FTYPE_V8SF_ROUND:
12308 0 : case V16SF_FTYPE_V16SF_ROUND:
12309 0 : case V8HF_FTYPE_V8HF_ROUND:
12310 0 : case V16HF_FTYPE_V16HF_ROUND:
12311 0 : case V32HF_FTYPE_V32HF_ROUND:
12312 0 : case V4SI_FTYPE_V4SF_ROUND:
12313 0 : case V8SI_FTYPE_V8SF_ROUND:
12314 0 : case V16SI_FTYPE_V16SF_ROUND:
12315 0 : return ix86_expand_sse_round (d, exp, target);
12316 12 : case V4SI_FTYPE_V2DF_V2DF_ROUND:
12317 12 : case V8SI_FTYPE_V4DF_V4DF_ROUND:
12318 12 : case V16SI_FTYPE_V8DF_V8DF_ROUND:
12319 12 : return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
12320 235 : case INT_FTYPE_V8SF_V8SF_PTEST:
12321 235 : case INT_FTYPE_V4DI_V4DI_PTEST:
12322 235 : case INT_FTYPE_V4DF_V4DF_PTEST:
12323 235 : case INT_FTYPE_V4SF_V4SF_PTEST:
12324 235 : case INT_FTYPE_V2DI_V2DI_PTEST:
12325 235 : case INT_FTYPE_V2DF_V2DF_PTEST:
12326 235 : return ix86_expand_sse_ptest (d, exp, target);
12327 : case FLOAT128_FTYPE_FLOAT128:
12328 : case FLOAT_FTYPE_FLOAT:
12329 : case FLOAT_FTYPE_BFLOAT16:
12330 : case INT_FTYPE_INT:
12331 : case UINT_FTYPE_UINT:
12332 : case UINT16_FTYPE_UINT16:
12333 : case UINT64_FTYPE_INT:
12334 : case UINT64_FTYPE_UINT64:
12335 : case INT64_FTYPE_INT64:
12336 : case INT64_FTYPE_V4SF:
12337 : case INT64_FTYPE_V2DF:
12338 : case INT_FTYPE_V16QI:
12339 : case INT_FTYPE_V8QI:
12340 : case INT_FTYPE_V8SF:
12341 : case INT_FTYPE_V4DF:
12342 : case INT_FTYPE_V4SF:
12343 : case INT_FTYPE_V2DF:
12344 : case INT_FTYPE_V32QI:
12345 : case V16QI_FTYPE_V16QI:
12346 : case V8SI_FTYPE_V8SF:
12347 : case V8SI_FTYPE_V4SI:
12348 : case V8HI_FTYPE_V8HI:
12349 : case V8HI_FTYPE_V16QI:
12350 : case V8QI_FTYPE_V8QI:
12351 : case V8SF_FTYPE_V8SF:
12352 : case V8SF_FTYPE_V8SI:
12353 : case V8SF_FTYPE_V4SF:
12354 : case V8SF_FTYPE_V8HI:
12355 : case V4SI_FTYPE_V4SI:
12356 : case V4SI_FTYPE_V16QI:
12357 : case V4SI_FTYPE_V4SF:
12358 : case V4SI_FTYPE_V8SI:
12359 : case V4SI_FTYPE_V8HI:
12360 : case V4SI_FTYPE_V4DF:
12361 : case V4SI_FTYPE_V2DF:
12362 : case V4HI_FTYPE_V4HI:
12363 : case V4DF_FTYPE_V4DF:
12364 : case V4DF_FTYPE_V4SI:
12365 : case V4DF_FTYPE_V4SF:
12366 : case V4DF_FTYPE_V2DF:
12367 : case V4SF_FTYPE_V4SF:
12368 : case V4SF_FTYPE_V4SI:
12369 : case V4SF_FTYPE_V8SF:
12370 : case V4SF_FTYPE_V4DF:
12371 : case V4SF_FTYPE_V8HI:
12372 : case V4SF_FTYPE_V2DF:
12373 : case V2DI_FTYPE_V2DI:
12374 : case V2DI_FTYPE_V16QI:
12375 : case V2DI_FTYPE_V8HI:
12376 : case V2DI_FTYPE_V4SI:
12377 : case V2DF_FTYPE_V2DF:
12378 : case V2DF_FTYPE_V4SI:
12379 : case V2DF_FTYPE_V4DF:
12380 : case V2DF_FTYPE_V4SF:
12381 : case V2DF_FTYPE_V2SI:
12382 : case V2SI_FTYPE_V2SI:
12383 : case V2SI_FTYPE_V4SF:
12384 : case V2SI_FTYPE_V2SF:
12385 : case V2SI_FTYPE_V2DF:
12386 : case V2SF_FTYPE_V2SF:
12387 : case V2SF_FTYPE_V2SI:
12388 : case V32QI_FTYPE_V32QI:
12389 : case V32QI_FTYPE_V16QI:
12390 : case V16HI_FTYPE_V16HI:
12391 : case V16HI_FTYPE_V8HI:
12392 : case V8SI_FTYPE_V8SI:
12393 : case V16HI_FTYPE_V16QI:
12394 : case V8SI_FTYPE_V16QI:
12395 : case V4DI_FTYPE_V16QI:
12396 : case V8SI_FTYPE_V8HI:
12397 : case V4DI_FTYPE_V8HI:
12398 : case V4DI_FTYPE_V4SI:
12399 : case V4DI_FTYPE_V2DI:
12400 : case UQI_FTYPE_UQI:
12401 : case UHI_FTYPE_UHI:
12402 : case USI_FTYPE_USI:
12403 : case USI_FTYPE_UQI:
12404 : case USI_FTYPE_UHI:
12405 : case UDI_FTYPE_UDI:
12406 : case UHI_FTYPE_V16QI:
12407 : case USI_FTYPE_V32QI:
12408 : case UDI_FTYPE_V64QI:
12409 : case V16QI_FTYPE_UHI:
12410 : case V32QI_FTYPE_USI:
12411 : case V64QI_FTYPE_UDI:
12412 : case V8HI_FTYPE_UQI:
12413 : case V16HI_FTYPE_UHI:
12414 : case V32HI_FTYPE_USI:
12415 : case V4SI_FTYPE_UQI:
12416 : case V8SI_FTYPE_UQI:
12417 : case V4SI_FTYPE_UHI:
12418 : case V8SI_FTYPE_UHI:
12419 : case UQI_FTYPE_V8HI:
12420 : case UHI_FTYPE_V16HI:
12421 : case USI_FTYPE_V32HI:
12422 : case UQI_FTYPE_V4SI:
12423 : case UQI_FTYPE_V8SI:
12424 : case UHI_FTYPE_V16SI:
12425 : case UQI_FTYPE_V2DI:
12426 : case UQI_FTYPE_V4DI:
12427 : case UQI_FTYPE_V8DI:
12428 : case V16SI_FTYPE_UHI:
12429 : case V2DI_FTYPE_UQI:
12430 : case V4DI_FTYPE_UQI:
12431 : case V16SI_FTYPE_INT:
12432 : case V16SF_FTYPE_V8SF:
12433 : case V16SI_FTYPE_V8SI:
12434 : case V16SF_FTYPE_V4SF:
12435 : case V16SI_FTYPE_V4SI:
12436 : case V16SI_FTYPE_V16SF:
12437 : case V16SI_FTYPE_V16SI:
12438 : case V64QI_FTYPE_V64QI:
12439 : case V32HI_FTYPE_V32HI:
12440 : case V16SF_FTYPE_V16SF:
12441 : case V8DI_FTYPE_UQI:
12442 : case V8DI_FTYPE_V8DI:
12443 : case V8DF_FTYPE_V4DF:
12444 : case V8DF_FTYPE_V2DF:
12445 : case V8DF_FTYPE_V8DF:
12446 : case V4DI_FTYPE_V4DI:
12447 : case V16BF_FTYPE_V16SF:
12448 : case V8BF_FTYPE_V8SF:
12449 : case V8BF_FTYPE_V4SF:
12450 : nargs = 1;
12451 : break;
12452 52 : case V4SF_FTYPE_V4SF_VEC_MERGE:
12453 52 : case V2DF_FTYPE_V2DF_VEC_MERGE:
12454 52 : return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
12455 9531 : case FLOAT128_FTYPE_FLOAT128_FLOAT128:
12456 9531 : case V16QI_FTYPE_V16QI_V16QI:
12457 9531 : case V16QI_FTYPE_V8HI_V8HI:
12458 9531 : case V16HF_FTYPE_V16HF_V16HF:
12459 9531 : case V16SF_FTYPE_V16SF_V16SF:
12460 9531 : case V16SI_FTYPE_V16SI_V16SI:
12461 9531 : case V8QI_FTYPE_V8QI_V8QI:
12462 9531 : case V8QI_FTYPE_V4HI_V4HI:
12463 9531 : case V8HI_FTYPE_V8HI_V8HI:
12464 9531 : case V8HI_FTYPE_V16QI_V16QI:
12465 9531 : case V8HI_FTYPE_V4SI_V4SI:
12466 9531 : case V8HF_FTYPE_V8HF_V8HF:
12467 9531 : case V8SF_FTYPE_V8SF_V8SF:
12468 9531 : case V8SF_FTYPE_V8SF_V8SI:
12469 9531 : case V8DF_FTYPE_V8DF_V8DF:
12470 9531 : case V4SI_FTYPE_V4SI_V4SI:
12471 9531 : case V4SI_FTYPE_V8HI_V8HI:
12472 9531 : case V4SI_FTYPE_V2DF_V2DF:
12473 9531 : case V4HI_FTYPE_V4HI_V4HI:
12474 9531 : case V4HI_FTYPE_V8QI_V8QI:
12475 9531 : case V4HI_FTYPE_V2SI_V2SI:
12476 9531 : case V4DF_FTYPE_V4DF_V4DF:
12477 9531 : case V4DF_FTYPE_V4DF_V4DI:
12478 9531 : case V4SF_FTYPE_V4SF_V4SF:
12479 9531 : case V4SF_FTYPE_V4SF_V4SI:
12480 9531 : case V4SF_FTYPE_V4SF_V2SI:
12481 9531 : case V4SF_FTYPE_V4SF_V2DF:
12482 9531 : case V4SF_FTYPE_V4SF_UINT:
12483 9531 : case V4SF_FTYPE_V4SF_DI:
12484 9531 : case V4SF_FTYPE_V4SF_SI:
12485 9531 : case V4DI_FTYPE_V4DI_V2DI:
12486 9531 : case V2DI_FTYPE_V2DI_V2DI:
12487 9531 : case V2DI_FTYPE_V16QI_V16QI:
12488 9531 : case V2DI_FTYPE_V4SI_V4SI:
12489 9531 : case V2DI_FTYPE_V2DI_V16QI:
12490 9531 : case V2SI_FTYPE_V2SI_V2SI:
12491 9531 : case V2SI_FTYPE_V4HI_V4HI:
12492 9531 : case V2SI_FTYPE_V2SF_V2SF:
12493 9531 : case V2DF_FTYPE_V2DF_V2DF:
12494 9531 : case V2DF_FTYPE_V2DF_V4SF:
12495 9531 : case V2DF_FTYPE_V2DF_V2DI:
12496 9531 : case V2DF_FTYPE_V2DF_DI:
12497 9531 : case V2DF_FTYPE_V2DF_SI:
12498 9531 : case V2DF_FTYPE_V2DF_UINT:
12499 9531 : case V2SF_FTYPE_V2SF_V2SF:
12500 9531 : case V1DI_FTYPE_V1DI_V1DI:
12501 9531 : case V1DI_FTYPE_V8QI_V8QI:
12502 9531 : case V1DI_FTYPE_V2SI_V2SI:
12503 9531 : case V32QI_FTYPE_V16HI_V16HI:
12504 9531 : case V16HI_FTYPE_V8SI_V8SI:
12505 9531 : case V64QI_FTYPE_V64QI_V64QI:
12506 9531 : case V32QI_FTYPE_V32QI_V32QI:
12507 9531 : case V32BF_FTYPE_V32BF_V32BF:
12508 9531 : case V16BF_FTYPE_V16BF_V16BF:
12509 9531 : case V8BF_FTYPE_V8BF_V8BF:
12510 9531 : case V16HI_FTYPE_V32QI_V32QI:
12511 9531 : case V16HI_FTYPE_V16HI_V16HI:
12512 9531 : case V8SI_FTYPE_V4DF_V4DF:
12513 9531 : case V8SI_FTYPE_V8SI_V8SI:
12514 9531 : case V8SI_FTYPE_V16HI_V16HI:
12515 9531 : case V4DI_FTYPE_V4DI_V4DI:
12516 9531 : case V4DI_FTYPE_V8SI_V8SI:
12517 9531 : case V4DI_FTYPE_V32QI_V32QI:
12518 9531 : case V8DI_FTYPE_V64QI_V64QI:
12519 9531 : if (comparison == UNKNOWN)
12520 8997 : return ix86_expand_binop_builtin (icode, exp, target);
12521 : nargs = 2;
12522 : break;
12523 80 : case V4SF_FTYPE_V4SF_V4SF_SWAP:
12524 80 : case V2DF_FTYPE_V2DF_V2DF_SWAP:
12525 80 : gcc_assert (comparison != UNKNOWN);
12526 : nargs = 2;
12527 : swap = true;
12528 : break;
12529 1481 : case V16HI_FTYPE_V16HI_V8HI_COUNT:
12530 1481 : case V16HI_FTYPE_V16HI_SI_COUNT:
12531 1481 : case V8SI_FTYPE_V8SI_V4SI_COUNT:
12532 1481 : case V8SI_FTYPE_V8SI_SI_COUNT:
12533 1481 : case V4DI_FTYPE_V4DI_V2DI_COUNT:
12534 1481 : case V4DI_FTYPE_V4DI_INT_COUNT:
12535 1481 : case V8HI_FTYPE_V8HI_V8HI_COUNT:
12536 1481 : case V8HI_FTYPE_V8HI_SI_COUNT:
12537 1481 : case V4SI_FTYPE_V4SI_V4SI_COUNT:
12538 1481 : case V4SI_FTYPE_V4SI_SI_COUNT:
12539 1481 : case V4HI_FTYPE_V4HI_V4HI_COUNT:
12540 1481 : case V4HI_FTYPE_V4HI_SI_COUNT:
12541 1481 : case V2DI_FTYPE_V2DI_V2DI_COUNT:
12542 1481 : case V2DI_FTYPE_V2DI_SI_COUNT:
12543 1481 : case V2SI_FTYPE_V2SI_V2SI_COUNT:
12544 1481 : case V2SI_FTYPE_V2SI_SI_COUNT:
12545 1481 : case V1DI_FTYPE_V1DI_V1DI_COUNT:
12546 1481 : case V1DI_FTYPE_V1DI_SI_COUNT:
12547 1481 : nargs = 2;
12548 1481 : second_arg_count = true;
12549 1481 : break;
12550 1408 : case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
12551 1408 : case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
12552 1408 : case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
12553 1408 : case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
12554 1408 : case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
12555 1408 : case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
12556 1408 : case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
12557 1408 : case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
12558 1408 : case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
12559 1408 : case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
12560 1408 : case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
12561 1408 : case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
12562 1408 : case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
12563 1408 : case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
12564 1408 : case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
12565 1408 : case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
12566 1408 : case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
12567 1408 : case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
12568 1408 : nargs = 4;
12569 1408 : second_arg_count = true;
12570 1408 : break;
12571 967 : case UINT64_FTYPE_UINT64_UINT64:
12572 967 : case UINT_FTYPE_UINT_UINT:
12573 967 : case UINT_FTYPE_UINT_USHORT:
12574 967 : case UINT_FTYPE_UINT_UCHAR:
12575 967 : case UINT16_FTYPE_UINT16_INT:
12576 967 : case UINT8_FTYPE_UINT8_INT:
12577 967 : case UQI_FTYPE_UQI_UQI:
12578 967 : case UHI_FTYPE_UHI_UHI:
12579 967 : case USI_FTYPE_USI_USI:
12580 967 : case UDI_FTYPE_UDI_UDI:
12581 967 : case V16SI_FTYPE_V8DF_V8DF:
12582 967 : case V32BF_FTYPE_V16SF_V16SF:
12583 967 : case V16BF_FTYPE_V8SF_V8SF:
12584 967 : case V8BF_FTYPE_V4SF_V4SF:
12585 967 : case V16BF_FTYPE_V16SF_UHI:
12586 967 : case V8BF_FTYPE_V8SF_UQI:
12587 967 : case V8BF_FTYPE_V4SF_UQI:
12588 967 : case V16QI_FTYPE_V16QI_V8HF:
12589 967 : nargs = 2;
12590 967 : break;
12591 811 : case V2DI_FTYPE_V2DI_INT_CONVERT:
12592 811 : nargs = 2;
12593 811 : rmode = V1TImode;
12594 811 : nargs_constant = 1;
12595 811 : break;
12596 42 : case V4DI_FTYPE_V4DI_INT_CONVERT:
12597 42 : nargs = 2;
12598 42 : rmode = V2TImode;
12599 42 : nargs_constant = 1;
12600 42 : break;
12601 16 : case V8DI_FTYPE_V8DI_INT_CONVERT:
12602 16 : nargs = 2;
12603 16 : rmode = V4TImode;
12604 16 : nargs_constant = 1;
12605 16 : break;
12606 2422 : case V8HI_FTYPE_V8HI_INT:
12607 2422 : case V8HI_FTYPE_V8SF_INT:
12608 2422 : case V16HI_FTYPE_V16SF_INT:
12609 2422 : case V8HI_FTYPE_V4SF_INT:
12610 2422 : case V8SF_FTYPE_V8SF_INT:
12611 2422 : case V4SF_FTYPE_V16SF_INT:
12612 2422 : case V16SF_FTYPE_V16SF_INT:
12613 2422 : case V4SI_FTYPE_V4SI_INT:
12614 2422 : case V4SI_FTYPE_V8SI_INT:
12615 2422 : case V4HI_FTYPE_V4HI_INT:
12616 2422 : case V4DF_FTYPE_V4DF_INT:
12617 2422 : case V4DF_FTYPE_V8DF_INT:
12618 2422 : case V4SF_FTYPE_V4SF_INT:
12619 2422 : case V4SF_FTYPE_V8SF_INT:
12620 2422 : case V2DI_FTYPE_V2DI_INT:
12621 2422 : case V2DF_FTYPE_V2DF_INT:
12622 2422 : case V2DF_FTYPE_V4DF_INT:
12623 2422 : case V16HI_FTYPE_V16HI_INT:
12624 2422 : case V8SI_FTYPE_V8SI_INT:
12625 2422 : case V16SI_FTYPE_V16SI_INT:
12626 2422 : case V4SI_FTYPE_V16SI_INT:
12627 2422 : case V4DI_FTYPE_V4DI_INT:
12628 2422 : case V2DI_FTYPE_V4DI_INT:
12629 2422 : case V4DI_FTYPE_V8DI_INT:
12630 2422 : case UQI_FTYPE_UQI_UQI_CONST:
12631 2422 : case UHI_FTYPE_UHI_UQI:
12632 2422 : case USI_FTYPE_USI_UQI:
12633 2422 : case UDI_FTYPE_UDI_UQI:
12634 2422 : nargs = 2;
12635 2422 : nargs_constant = 1;
12636 2422 : break;
12637 18718 : case V16QI_FTYPE_V16QI_V16QI_V16QI:
12638 18718 : case V8SF_FTYPE_V8SF_V8SF_V8SF:
12639 18718 : case V4DF_FTYPE_V4DF_V4DF_V4DF:
12640 18718 : case V4SF_FTYPE_V4SF_V4SF_V4SF:
12641 18718 : case V2DF_FTYPE_V2DF_V2DF_V2DF:
12642 18718 : case V32QI_FTYPE_V32QI_V32QI_V32QI:
12643 18718 : case UHI_FTYPE_V16SI_V16SI_UHI:
12644 18718 : case UQI_FTYPE_V8DI_V8DI_UQI:
12645 18718 : case V16HI_FTYPE_V16SI_V16HI_UHI:
12646 18718 : case V16QI_FTYPE_V16SI_V16QI_UHI:
12647 18718 : case V16QI_FTYPE_V8DI_V16QI_UQI:
12648 18718 : case V32HF_FTYPE_V32HF_V32HF_USI:
12649 18718 : case V16SF_FTYPE_V16SF_V16SF_UHI:
12650 18718 : case V16SF_FTYPE_V4SF_V16SF_UHI:
12651 18718 : case V16SI_FTYPE_SI_V16SI_UHI:
12652 18718 : case V16SI_FTYPE_V16HI_V16SI_UHI:
12653 18718 : case V16SI_FTYPE_V16QI_V16SI_UHI:
12654 18718 : case V8SF_FTYPE_V4SF_V8SF_UQI:
12655 18718 : case V4DF_FTYPE_V2DF_V4DF_UQI:
12656 18718 : case V8SI_FTYPE_V4SI_V8SI_UQI:
12657 18718 : case V8SI_FTYPE_SI_V8SI_UQI:
12658 18718 : case V4SI_FTYPE_V4SI_V4SI_UQI:
12659 18718 : case V4SI_FTYPE_SI_V4SI_UQI:
12660 18718 : case V4DI_FTYPE_V2DI_V4DI_UQI:
12661 18718 : case V4DI_FTYPE_DI_V4DI_UQI:
12662 18718 : case V2DI_FTYPE_V2DI_V2DI_UQI:
12663 18718 : case V2DI_FTYPE_DI_V2DI_UQI:
12664 18718 : case V64QI_FTYPE_V64QI_V64QI_UDI:
12665 18718 : case V64QI_FTYPE_V16QI_V64QI_UDI:
12666 18718 : case V64QI_FTYPE_QI_V64QI_UDI:
12667 18718 : case V32QI_FTYPE_V32QI_V32QI_USI:
12668 18718 : case V32QI_FTYPE_V16QI_V32QI_USI:
12669 18718 : case V32QI_FTYPE_QI_V32QI_USI:
12670 18718 : case V16QI_FTYPE_V16QI_V16QI_UHI:
12671 18718 : case V16QI_FTYPE_QI_V16QI_UHI:
12672 18718 : case V32HI_FTYPE_V8HI_V32HI_USI:
12673 18718 : case V32HI_FTYPE_V32BF_V32HI_USI:
12674 18718 : case V32HI_FTYPE_HI_V32HI_USI:
12675 18718 : case V16HI_FTYPE_V8HI_V16HI_UHI:
12676 18718 : case V16HI_FTYPE_V16BF_V16HI_UHI:
12677 18718 : case V16HI_FTYPE_HI_V16HI_UHI:
12678 18718 : case V8HI_FTYPE_V8HI_V8HI_UQI:
12679 18718 : case V8HI_FTYPE_V8BF_V8HI_UQI:
12680 18718 : case V8BF_FTYPE_V8BF_V8BF_UQI:
12681 18718 : case V8HI_FTYPE_HI_V8HI_UQI:
12682 18718 : case V16HF_FTYPE_V16HF_V16HF_UHI:
12683 18718 : case V8SF_FTYPE_V8HI_V8SF_UQI:
12684 18718 : case V4SF_FTYPE_V8HI_V4SF_UQI:
12685 18718 : case V8SI_FTYPE_V8HF_V8SI_UQI:
12686 18718 : case V8SF_FTYPE_V8HF_V8SF_UQI:
12687 18718 : case V8SI_FTYPE_V8SF_V8SI_UQI:
12688 18718 : case V4SI_FTYPE_V4SF_V4SI_UQI:
12689 18718 : case V4SI_FTYPE_V8HF_V4SI_UQI:
12690 18718 : case V4SF_FTYPE_V8HF_V4SF_UQI:
12691 18718 : case V4DI_FTYPE_V8HF_V4DI_UQI:
12692 18718 : case V4DI_FTYPE_V4SF_V4DI_UQI:
12693 18718 : case V2DI_FTYPE_V8HF_V2DI_UQI:
12694 18718 : case V2DI_FTYPE_V4SF_V2DI_UQI:
12695 18718 : case V8HF_FTYPE_V8HF_V8HF_UQI:
12696 18718 : case V8HF_FTYPE_V8HF_V8HF_V8HF:
12697 18718 : case V8HF_FTYPE_V8HI_V8HF_UQI:
12698 18718 : case V8HF_FTYPE_V8SI_V8HF_UQI:
12699 18718 : case V8HF_FTYPE_V8SF_V8HF_UQI:
12700 18718 : case V8HF_FTYPE_V4SI_V8HF_UQI:
12701 18718 : case V8HF_FTYPE_V4SF_V8HF_UQI:
12702 18718 : case V8HF_FTYPE_V4DI_V8HF_UQI:
12703 18718 : case V8HF_FTYPE_V4DF_V8HF_UQI:
12704 18718 : case V8HF_FTYPE_V2DI_V8HF_UQI:
12705 18718 : case V8HF_FTYPE_V2DF_V8HF_UQI:
12706 18718 : case V4SF_FTYPE_V4DI_V4SF_UQI:
12707 18718 : case V4SF_FTYPE_V2DI_V4SF_UQI:
12708 18718 : case V4DF_FTYPE_V4DI_V4DF_UQI:
12709 18718 : case V4DF_FTYPE_V8HF_V4DF_UQI:
12710 18718 : case V2DF_FTYPE_V8HF_V2DF_UQI:
12711 18718 : case V2DF_FTYPE_V2DI_V2DF_UQI:
12712 18718 : case V16QI_FTYPE_V8HI_V16QI_UQI:
12713 18718 : case V16QI_FTYPE_V16HI_V16QI_UHI:
12714 18718 : case V16QI_FTYPE_V4SI_V16QI_UQI:
12715 18718 : case V16QI_FTYPE_V8SI_V16QI_UQI:
12716 18718 : case V8HI_FTYPE_V8HF_V8HI_UQI:
12717 18718 : case V8HI_FTYPE_V4SI_V8HI_UQI:
12718 18718 : case V8HI_FTYPE_V8SI_V8HI_UQI:
12719 18718 : case V16QI_FTYPE_V2DI_V16QI_UQI:
12720 18718 : case V16QI_FTYPE_V4DI_V16QI_UQI:
12721 18718 : case V8HI_FTYPE_V2DI_V8HI_UQI:
12722 18718 : case V8HI_FTYPE_V4DI_V8HI_UQI:
12723 18718 : case V4SI_FTYPE_V2DI_V4SI_UQI:
12724 18718 : case V4SI_FTYPE_V4DI_V4SI_UQI:
12725 18718 : case V32QI_FTYPE_V32HI_V32QI_USI:
12726 18718 : case UHI_FTYPE_V16QI_V16QI_UHI:
12727 18718 : case USI_FTYPE_V32QI_V32QI_USI:
12728 18718 : case UDI_FTYPE_V64QI_V64QI_UDI:
12729 18718 : case UQI_FTYPE_V8HI_V8HI_UQI:
12730 18718 : case UHI_FTYPE_V16HI_V16HI_UHI:
12731 18718 : case USI_FTYPE_V32HI_V32HI_USI:
12732 18718 : case UQI_FTYPE_V4SI_V4SI_UQI:
12733 18718 : case UQI_FTYPE_V8SI_V8SI_UQI:
12734 18718 : case UQI_FTYPE_V2DI_V2DI_UQI:
12735 18718 : case UQI_FTYPE_V4DI_V4DI_UQI:
12736 18718 : case V4SF_FTYPE_V2DF_V4SF_UQI:
12737 18718 : case V4SF_FTYPE_V4DF_V4SF_UQI:
12738 18718 : case V16SI_FTYPE_V16SI_V16SI_UHI:
12739 18718 : case V16SI_FTYPE_V4SI_V16SI_UHI:
12740 18718 : case V2DI_FTYPE_V4SI_V2DI_UQI:
12741 18718 : case V2DI_FTYPE_V8HI_V2DI_UQI:
12742 18718 : case V2DI_FTYPE_V16QI_V2DI_UQI:
12743 18718 : case V4DI_FTYPE_V4DI_V4DI_UQI:
12744 18718 : case V4DI_FTYPE_V4SI_V4DI_UQI:
12745 18718 : case V4DI_FTYPE_V8HI_V4DI_UQI:
12746 18718 : case V4DI_FTYPE_V16QI_V4DI_UQI:
12747 18718 : case V4DI_FTYPE_V4DF_V4DI_UQI:
12748 18718 : case V2DI_FTYPE_V2DF_V2DI_UQI:
12749 18718 : case V4SI_FTYPE_V4DF_V4SI_UQI:
12750 18718 : case V4SI_FTYPE_V2DF_V4SI_UQI:
12751 18718 : case V4SI_FTYPE_V8HI_V4SI_UQI:
12752 18718 : case V4SI_FTYPE_V16QI_V4SI_UQI:
12753 18718 : case V4DI_FTYPE_V4DI_V4DI_V4DI:
12754 18718 : case V8DF_FTYPE_V2DF_V8DF_UQI:
12755 18718 : case V8DF_FTYPE_V4DF_V8DF_UQI:
12756 18718 : case V8DF_FTYPE_V8DF_V8DF_UQI:
12757 18718 : case V8SF_FTYPE_V8SF_V8SF_UQI:
12758 18718 : case V8SF_FTYPE_V8SI_V8SF_UQI:
12759 18718 : case V4DF_FTYPE_V4DF_V4DF_UQI:
12760 18718 : case V4SF_FTYPE_V4SF_V4SF_UQI:
12761 18718 : case V2DF_FTYPE_V2DF_V2DF_UQI:
12762 18718 : case V2DF_FTYPE_V4SF_V2DF_UQI:
12763 18718 : case V2DF_FTYPE_V4SI_V2DF_UQI:
12764 18718 : case V4SF_FTYPE_V4SI_V4SF_UQI:
12765 18718 : case V4DF_FTYPE_V4SF_V4DF_UQI:
12766 18718 : case V4DF_FTYPE_V4SI_V4DF_UQI:
12767 18718 : case V8SI_FTYPE_V8SI_V8SI_UQI:
12768 18718 : case V8SI_FTYPE_V8HI_V8SI_UQI:
12769 18718 : case V8SI_FTYPE_V16QI_V8SI_UQI:
12770 18718 : case V8DF_FTYPE_V8SI_V8DF_UQI:
12771 18718 : case V8DI_FTYPE_DI_V8DI_UQI:
12772 18718 : case V16SF_FTYPE_V8SF_V16SF_UHI:
12773 18718 : case V16SI_FTYPE_V8SI_V16SI_UHI:
12774 18718 : case V16HF_FTYPE_V16HI_V16HF_UHI:
12775 18718 : case V16HF_FTYPE_V16HF_V16HF_V16HF:
12776 18718 : case V16HI_FTYPE_V16HF_V16HI_UHI:
12777 18718 : case V16HI_FTYPE_V16HI_V16HI_UHI:
12778 18718 : case V16BF_FTYPE_V16BF_V16BF_UHI:
12779 18718 : case V8HI_FTYPE_V16QI_V8HI_UQI:
12780 18718 : case V16HI_FTYPE_V16QI_V16HI_UHI:
12781 18718 : case V32HI_FTYPE_V32HI_V32HI_USI:
12782 18718 : case V32BF_FTYPE_V32BF_V32BF_USI:
12783 18718 : case V32HI_FTYPE_V32QI_V32HI_USI:
12784 18718 : case V8DI_FTYPE_V16QI_V8DI_UQI:
12785 18718 : case V8DI_FTYPE_V2DI_V8DI_UQI:
12786 18718 : case V8DI_FTYPE_V4DI_V8DI_UQI:
12787 18718 : case V8DI_FTYPE_V8DI_V8DI_UQI:
12788 18718 : case V8DI_FTYPE_V8HI_V8DI_UQI:
12789 18718 : case V8DI_FTYPE_V8SI_V8DI_UQI:
12790 18718 : case V8HI_FTYPE_V8DI_V8HI_UQI:
12791 18718 : case V8SI_FTYPE_V8DI_V8SI_UQI:
12792 18718 : case V4SI_FTYPE_V4SI_V4SI_V4SI:
12793 18718 : case V4DI_FTYPE_V4DI_V4DI_V2DI:
12794 18718 : case V16SI_FTYPE_V16SI_V16SI_V16SI:
12795 18718 : case V8DI_FTYPE_V8DI_V8DI_V8DI:
12796 18718 : case V32HI_FTYPE_V32HI_V32HI_V32HI:
12797 18718 : case V2DI_FTYPE_V2DI_V2DI_V2DI:
12798 18718 : case V16HI_FTYPE_V16HI_V16HI_V16HI:
12799 18718 : case V8SI_FTYPE_V8SI_V8SI_V8SI:
12800 18718 : case V8HI_FTYPE_V8HI_V8HI_V8HI:
12801 18718 : case V32BF_FTYPE_V16SF_V16SF_USI:
12802 18718 : case V16BF_FTYPE_V8SF_V8SF_UHI:
12803 18718 : case V8BF_FTYPE_V4SF_V4SF_UQI:
12804 18718 : case V16BF_FTYPE_V16SF_V16BF_UHI:
12805 18718 : case V8BF_FTYPE_V8SF_V8BF_UQI:
12806 18718 : case V8BF_FTYPE_V4SF_V8BF_UQI:
12807 18718 : case V16SF_FTYPE_V16SF_V32BF_V32BF:
12808 18718 : case V8SF_FTYPE_V8SF_V16BF_V16BF:
12809 18718 : case V4SF_FTYPE_V4SF_V8BF_V8BF:
12810 18718 : case V16QI_FTYPE_V16QI_V8HF_V8HF:
12811 18718 : case V32QI_FTYPE_V32QI_V16HF_V16HF:
12812 18718 : case V64QI_FTYPE_V64QI_V32HF_V32HF:
12813 18718 : case V16QI_FTYPE_V8HF_V16QI_UQI:
12814 18718 : case V16QI_FTYPE_V16HF_V16QI_UHI:
12815 18718 : case V32QI_FTYPE_V32HF_V32QI_USI:
12816 18718 : case V8HF_FTYPE_V16QI_V8HF_UQI:
12817 18718 : case V16HF_FTYPE_V16QI_V16HF_UHI:
12818 18718 : case V32HF_FTYPE_V32QI_V32HF_USI:
12819 18718 : case V16SI_FTYPE_V16SF_V16SI_UHI:
12820 18718 : case V32HI_FTYPE_V32HF_V32HI_USI:
12821 18718 : case V8DI_FTYPE_V8SF_V8DI_UQI:
12822 18718 : case V8DI_FTYPE_V8DF_V8DI_UQI:
12823 18718 : case V8SI_FTYPE_V8DF_V8SI_UQI:
12824 18718 : nargs = 3;
12825 18718 : break;
12826 1480 : case V32QI_FTYPE_V32QI_V32QI_INT:
12827 1480 : case V16HI_FTYPE_V16HI_V16HI_INT:
12828 1480 : case V16QI_FTYPE_V16QI_V16QI_INT:
12829 1480 : case V4DI_FTYPE_V4DI_V4DI_INT:
12830 1480 : case V8HI_FTYPE_V8HI_V8HI_INT:
12831 1480 : case V8SI_FTYPE_V8SI_V8SI_INT:
12832 1480 : case V8SI_FTYPE_V8SI_V4SI_INT:
12833 1480 : case V8SF_FTYPE_V8SF_V8SF_INT:
12834 1480 : case V8SF_FTYPE_V8SF_V4SF_INT:
12835 1480 : case V4SI_FTYPE_V4SI_V4SI_INT:
12836 1480 : case V4DF_FTYPE_V4DF_V4DF_INT:
12837 1480 : case V16SF_FTYPE_V16SF_V16SF_INT:
12838 1480 : case V16SF_FTYPE_V16SF_V4SF_INT:
12839 1480 : case V16SI_FTYPE_V16SI_V4SI_INT:
12840 1480 : case V4DF_FTYPE_V4DF_V2DF_INT:
12841 1480 : case V4SF_FTYPE_V4SF_V4SF_INT:
12842 1480 : case V2DI_FTYPE_V2DI_V2DI_INT:
12843 1480 : case V4DI_FTYPE_V4DI_V2DI_INT:
12844 1480 : case V2DF_FTYPE_V2DF_V2DF_INT:
12845 1480 : case UQI_FTYPE_V8DI_V8UDI_INT:
12846 1480 : case UQI_FTYPE_V8DF_V8DF_INT:
12847 1480 : case UQI_FTYPE_V2DF_V2DF_INT:
12848 1480 : case UQI_FTYPE_V4SF_V4SF_INT:
12849 1480 : case UHI_FTYPE_V16SI_V16SI_INT:
12850 1480 : case UHI_FTYPE_V16SF_V16SF_INT:
12851 1480 : case V64QI_FTYPE_V64QI_V64QI_INT:
12852 1480 : case V32HI_FTYPE_V32HI_V32HI_INT:
12853 1480 : case V16SI_FTYPE_V16SI_V16SI_INT:
12854 1480 : case V8DI_FTYPE_V8DI_V8DI_INT:
12855 1480 : nargs = 3;
12856 1480 : nargs_constant = 1;
12857 1480 : break;
12858 47 : case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
12859 47 : nargs = 3;
12860 47 : rmode = V4DImode;
12861 47 : nargs_constant = 1;
12862 47 : break;
12863 80 : case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
12864 80 : nargs = 3;
12865 80 : rmode = V2DImode;
12866 80 : nargs_constant = 1;
12867 80 : break;
12868 48 : case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
12869 48 : nargs = 3;
12870 48 : rmode = DImode;
12871 48 : nargs_constant = 1;
12872 48 : break;
12873 20 : case V2DI_FTYPE_V2DI_UINT_UINT:
12874 20 : nargs = 3;
12875 20 : nargs_constant = 2;
12876 20 : break;
12877 8 : case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
12878 8 : nargs = 3;
12879 8 : rmode = V8DImode;
12880 8 : nargs_constant = 1;
12881 8 : break;
12882 16 : case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
12883 16 : nargs = 5;
12884 16 : rmode = V8DImode;
12885 16 : mask_pos = 2;
12886 16 : nargs_constant = 1;
12887 16 : break;
12888 320 : case QI_FTYPE_V8DF_INT_UQI:
12889 320 : case QI_FTYPE_V4DF_INT_UQI:
12890 320 : case QI_FTYPE_V2DF_INT_UQI:
12891 320 : case HI_FTYPE_V16SF_INT_UHI:
12892 320 : case QI_FTYPE_V8SF_INT_UQI:
12893 320 : case QI_FTYPE_V4SF_INT_UQI:
12894 320 : case QI_FTYPE_V8HF_INT_UQI:
12895 320 : case HI_FTYPE_V16HF_INT_UHI:
12896 320 : case SI_FTYPE_V32HF_INT_USI:
12897 320 : case QI_FTYPE_V8BF_INT_UQI:
12898 320 : case HI_FTYPE_V16BF_INT_UHI:
12899 320 : case SI_FTYPE_V32BF_INT_USI:
12900 320 : case V4SI_FTYPE_V4SI_V4SI_UHI:
12901 320 : case V8SI_FTYPE_V8SI_V8SI_UHI:
12902 320 : nargs = 3;
12903 320 : mask_pos = 1;
12904 320 : nargs_constant = 1;
12905 320 : break;
12906 17 : case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
12907 17 : nargs = 5;
12908 17 : rmode = V4DImode;
12909 17 : mask_pos = 2;
12910 17 : nargs_constant = 1;
12911 17 : break;
12912 17 : case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
12913 17 : nargs = 5;
12914 17 : rmode = V2DImode;
12915 17 : mask_pos = 2;
12916 17 : nargs_constant = 1;
12917 17 : break;
12918 17266 : case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
12919 17266 : case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
12920 17266 : case V32BF_FTYPE_V32BF_V32BF_V32BF_USI:
12921 17266 : case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
12922 17266 : case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
12923 17266 : case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
12924 17266 : case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
12925 17266 : case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
12926 17266 : case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
12927 17266 : case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
12928 17266 : case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
12929 17266 : case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
12930 17266 : case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
12931 17266 : case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
12932 17266 : case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
12933 17266 : case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
12934 17266 : case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
12935 17266 : case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
12936 17266 : case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
12937 17266 : case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
12938 17266 : case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
12939 17266 : case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
12940 17266 : case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
12941 17266 : case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
12942 17266 : case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
12943 17266 : case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
12944 17266 : case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
12945 17266 : case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
12946 17266 : case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
12947 17266 : case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
12948 17266 : case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
12949 17266 : case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
12950 17266 : case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
12951 17266 : case V8BF_FTYPE_V8BF_V8BF_V8BF_UQI:
12952 17266 : case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
12953 17266 : case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
12954 17266 : case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
12955 17266 : case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
12956 17266 : case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
12957 17266 : case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
12958 17266 : case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
12959 17266 : case V16BF_FTYPE_V16BF_V16BF_V16BF_UHI:
12960 17266 : case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
12961 17266 : case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
12962 17266 : case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
12963 17266 : case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
12964 17266 : case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
12965 17266 : case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
12966 17266 : case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
12967 17266 : case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
12968 17266 : case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
12969 17266 : case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
12970 17266 : case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
12971 17266 : case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
12972 17266 : case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
12973 17266 : case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
12974 17266 : case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
12975 17266 : case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
12976 17266 : case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
12977 17266 : case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
12978 17266 : case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
12979 17266 : case V32HF_FTYPE_V16SF_V16SF_V32HF_USI:
12980 17266 : case V16HF_FTYPE_V8SF_V8SF_V16HF_UHI:
12981 17266 : case V8HF_FTYPE_V4SF_V4SF_V8HF_UQI:
12982 17266 : case V16QI_FTYPE_V8HF_V8HF_V16QI_UHI:
12983 17266 : case V32QI_FTYPE_V16HF_V16HF_V32QI_USI:
12984 17266 : case V64QI_FTYPE_V32HF_V32HF_V64QI_UDI:
12985 17266 : case V16QI_FTYPE_V16QI_V8HF_V16QI_UHI:
12986 17266 : case V16QI_FTYPE_V32QI_V16HF_V16QI_UHI:
12987 17266 : case V32QI_FTYPE_V64QI_V32HF_V32QI_USI:
12988 17266 : nargs = 4;
12989 17266 : break;
12990 11 : case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
12991 11 : case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
12992 11 : case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
12993 11 : case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
12994 11 : case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
12995 11 : case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
12996 11 : nargs = 4;
12997 11 : nargs_constant = 1;
12998 11 : break;
12999 3718 : case UQI_FTYPE_V4DI_V4DI_INT_UQI:
13000 3718 : case UQI_FTYPE_V8SI_V8SI_INT_UQI:
13001 3718 : case QI_FTYPE_V4DF_V4DF_INT_UQI:
13002 3718 : case QI_FTYPE_V8SF_V8SF_INT_UQI:
13003 3718 : case UHI_FTYPE_V16HF_V16HF_INT_UHI:
13004 3718 : case UQI_FTYPE_V2DI_V2DI_INT_UQI:
13005 3718 : case UQI_FTYPE_V4SI_V4SI_INT_UQI:
13006 3718 : case UQI_FTYPE_V2DF_V2DF_INT_UQI:
13007 3718 : case UQI_FTYPE_V4SF_V4SF_INT_UQI:
13008 3718 : case UQI_FTYPE_V8HF_V8HF_INT_UQI:
13009 3718 : case UDI_FTYPE_V64QI_V64QI_INT_UDI:
13010 3718 : case USI_FTYPE_V32QI_V32QI_INT_USI:
13011 3718 : case UHI_FTYPE_V16QI_V16QI_INT_UHI:
13012 3718 : case USI_FTYPE_V32HI_V32HI_INT_USI:
13013 3718 : case USI_FTYPE_V32BF_V32BF_INT_USI:
13014 3718 : case USI_FTYPE_V32HF_V32HF_INT_USI:
13015 3718 : case UHI_FTYPE_V16HI_V16HI_INT_UHI:
13016 3718 : case UHI_FTYPE_V16BF_V16BF_INT_UHI:
13017 3718 : case UQI_FTYPE_V8HI_V8HI_INT_UQI:
13018 3718 : case UQI_FTYPE_V8BF_V8BF_INT_UQI:
13019 3718 : nargs = 4;
13020 3718 : mask_pos = 1;
13021 3718 : nargs_constant = 1;
13022 3718 : break;
13023 23 : case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
13024 23 : nargs = 4;
13025 23 : nargs_constant = 2;
13026 23 : break;
13027 67 : case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
13028 67 : case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
13029 67 : case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
13030 67 : case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
13031 67 : case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
13032 67 : nargs = 4;
13033 67 : break;
13034 679 : case UQI_FTYPE_V8DI_V8DI_INT_UQI:
13035 679 : case UHI_FTYPE_V16SI_V16SI_INT_UHI:
13036 679 : mask_pos = 1;
13037 679 : nargs = 4;
13038 679 : nargs_constant = 1;
13039 679 : break;
13040 3948 : case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
13041 3948 : case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
13042 3948 : case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
13043 3948 : case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
13044 3948 : case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
13045 3948 : case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
13046 3948 : case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
13047 3948 : case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
13048 3948 : case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
13049 3948 : case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
13050 3948 : case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
13051 3948 : case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
13052 3948 : case V32HI_FTYPE_V32HI_INT_V32HI_USI:
13053 3948 : case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
13054 3948 : case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
13055 3948 : case V32BF_FTYPE_V32BF_INT_V32BF_USI:
13056 3948 : case V16BF_FTYPE_V16BF_INT_V16BF_UHI:
13057 3948 : case V8BF_FTYPE_V8BF_INT_V8BF_UQI:
13058 3948 : case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
13059 3948 : case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
13060 3948 : case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
13061 3948 : case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
13062 3948 : case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
13063 3948 : case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
13064 3948 : case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
13065 3948 : case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
13066 3948 : case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
13067 3948 : case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
13068 3948 : case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
13069 3948 : case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
13070 3948 : case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
13071 3948 : case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
13072 3948 : case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
13073 3948 : case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
13074 3948 : case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
13075 3948 : nargs = 4;
13076 3948 : mask_pos = 2;
13077 3948 : nargs_constant = 1;
13078 3948 : break;
13079 1726 : case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
13080 1726 : case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
13081 1726 : case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
13082 1726 : case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
13083 1726 : case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
13084 1726 : case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
13085 1726 : case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
13086 1726 : case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
13087 1726 : case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
13088 1726 : case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
13089 1726 : case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
13090 1726 : case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
13091 1726 : case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
13092 1726 : case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
13093 1726 : case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
13094 1726 : case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
13095 1726 : case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
13096 1726 : case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
13097 1726 : case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
13098 1726 : case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
13099 1726 : case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
13100 1726 : case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
13101 1726 : case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
13102 1726 : case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
13103 1726 : case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
13104 1726 : case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
13105 1726 : case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
13106 1726 : nargs = 5;
13107 1726 : mask_pos = 2;
13108 1726 : nargs_constant = 1;
13109 1726 : break;
13110 268 : case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
13111 268 : case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
13112 268 : case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
13113 268 : case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
13114 268 : case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
13115 268 : case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
13116 268 : case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
13117 268 : case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
13118 268 : case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
13119 268 : case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
13120 268 : nargs = 5;
13121 268 : mask_pos = 1;
13122 268 : nargs_constant = 1;
13123 268 : break;
13124 732 : case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
13125 732 : case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
13126 732 : case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
13127 732 : case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
13128 732 : case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
13129 732 : case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
13130 732 : case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
13131 732 : case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
13132 732 : case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
13133 732 : case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
13134 732 : case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
13135 732 : case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
13136 732 : case V8BF_FTYPE_V8BF_V8BF_INT_V8BF_UQI:
13137 732 : case V16BF_FTYPE_V16BF_V16BF_INT_V16BF_UHI:
13138 732 : case V32BF_FTYPE_V32BF_V32BF_INT_V32BF_USI:
13139 732 : case V16HF_FTYPE_V16HF_V16HF_INT_V16HF_UHI:
13140 732 : case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI:
13141 732 : nargs = 5;
13142 732 : mask_pos = 1;
13143 732 : nargs_constant = 2;
13144 732 : break;
13145 :
13146 0 : default:
13147 0 : gcc_unreachable ();
13148 : }
13149 :
13150 56356 : gcc_assert (nargs <= ARRAY_SIZE (xops));
13151 :
13152 61682 : if (comparison != UNKNOWN)
13153 : {
13154 614 : gcc_assert (nargs == 2);
13155 614 : return ix86_expand_sse_compare (d, exp, target, swap);
13156 : }
13157 :
13158 61068 : if (rmode == VOIDmode || rmode == tmode)
13159 : {
13160 60883 : if (optimize
13161 17729 : || target == 0
13162 17729 : || GET_MODE (target) != tmode
13163 78410 : || !insn_p->operand[0].predicate (target, tmode))
13164 43444 : target = gen_reg_rtx (tmode);
13165 17439 : else if (memory_operand (target, tmode))
13166 578 : num_memory++;
13167 : real_target = target;
13168 : }
13169 : else
13170 : {
13171 185 : real_target = gen_reg_rtx (tmode);
13172 185 : target = lowpart_subreg (rmode, real_target, tmode);
13173 : }
13174 :
13175 261380 : for (i = 0; i < nargs; i++)
13176 : {
13177 200545 : tree arg = CALL_EXPR_ARG (exp, i);
13178 200545 : rtx op = ix86_expand_unsigned_small_int_cst_argument (arg);
13179 200545 : machine_mode mode = insn_p->operand[i + 1].mode;
13180 : /* Need to fixup modeless constant before testing predicate. */
13181 200545 : op = fixup_modeless_constant (op, mode);
13182 200545 : bool match = insn_p->operand[i + 1].predicate (op, mode);
13183 :
13184 200545 : if (second_arg_count && i == 1)
13185 : {
13186 : /* SIMD shift insns take either an 8-bit immediate or
13187 : register as count. But builtin functions take int as
13188 : count. If count doesn't match, we put it in register.
13189 : The instructions are using 64-bit count, if op is just
13190 : 32-bit, zero-extend it, as negative shift counts
13191 : are undefined behavior and zero-extension is more
13192 : efficient. */
13193 2889 : if (!match)
13194 : {
13195 1750 : if (SCALAR_INT_MODE_P (GET_MODE (op)))
13196 489 : op = convert_modes (mode, GET_MODE (op), op, 1);
13197 : else
13198 1261 : op = lowpart_subreg (mode, op, GET_MODE (op));
13199 1750 : if (!insn_p->operand[i + 1].predicate (op, mode))
13200 190 : op = copy_to_reg (op);
13201 : }
13202 : }
13203 197656 : else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
13204 149600 : (!mask_pos && (nargs - i) <= nargs_constant))
13205 : {
13206 16488 : if (!match)
13207 233 : switch (icode)
13208 : {
13209 2 : case CODE_FOR_avx_vinsertf128v4di:
13210 2 : case CODE_FOR_avx_vextractf128v4di:
13211 2 : error ("the last argument must be an 1-bit immediate");
13212 2 : return const0_rtx;
13213 :
13214 8 : case CODE_FOR_avx512f_cmpv8di3_mask:
13215 8 : case CODE_FOR_avx512f_cmpv16si3_mask:
13216 8 : case CODE_FOR_avx512f_ucmpv8di3_mask:
13217 8 : case CODE_FOR_avx512f_ucmpv16si3_mask:
13218 8 : case CODE_FOR_avx512vl_cmpv4di3_mask:
13219 8 : case CODE_FOR_avx512vl_cmpv8si3_mask:
13220 8 : case CODE_FOR_avx512vl_ucmpv4di3_mask:
13221 8 : case CODE_FOR_avx512vl_ucmpv8si3_mask:
13222 8 : case CODE_FOR_avx512vl_cmpv2di3_mask:
13223 8 : case CODE_FOR_avx512vl_cmpv4si3_mask:
13224 8 : case CODE_FOR_avx512vl_ucmpv2di3_mask:
13225 8 : case CODE_FOR_avx512vl_ucmpv4si3_mask:
13226 8 : error ("the last argument must be a 3-bit immediate");
13227 8 : return const0_rtx;
13228 :
13229 24 : case CODE_FOR_sse4_1_roundsd:
13230 24 : case CODE_FOR_sse4_1_roundss:
13231 :
13232 24 : case CODE_FOR_sse4_1_roundpd:
13233 24 : case CODE_FOR_sse4_1_roundps:
13234 24 : case CODE_FOR_avx_roundpd256:
13235 24 : case CODE_FOR_avx_roundps256:
13236 :
13237 24 : case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
13238 24 : case CODE_FOR_sse4_1_roundps_sfix:
13239 24 : case CODE_FOR_avx_roundpd_vec_pack_sfix256:
13240 24 : case CODE_FOR_avx_roundps_sfix256:
13241 :
13242 24 : case CODE_FOR_sse4_1_blendps:
13243 24 : case CODE_FOR_avx_blendpd256:
13244 24 : case CODE_FOR_avx_vpermilv4df:
13245 24 : case CODE_FOR_avx_vpermilv4df_mask:
13246 24 : case CODE_FOR_avx512f_getmantv8df_mask:
13247 24 : case CODE_FOR_avx512f_getmantv16sf_mask:
13248 24 : case CODE_FOR_avx512vl_getmantv16hf_mask:
13249 24 : case CODE_FOR_avx512vl_getmantv8sf_mask:
13250 24 : case CODE_FOR_avx512vl_getmantv4df_mask:
13251 24 : case CODE_FOR_avx512fp16_getmantv8hf_mask:
13252 24 : case CODE_FOR_avx512vl_getmantv4sf_mask:
13253 24 : case CODE_FOR_avx512vl_getmantv2df_mask:
13254 24 : case CODE_FOR_avx512dq_rangepv8df_mask_round:
13255 24 : case CODE_FOR_avx512dq_rangepv16sf_mask_round:
13256 24 : case CODE_FOR_avx512dq_rangepv4df_mask:
13257 24 : case CODE_FOR_avx512dq_rangepv8sf_mask:
13258 24 : case CODE_FOR_avx512dq_rangepv2df_mask:
13259 24 : case CODE_FOR_avx512dq_rangepv4sf_mask:
13260 24 : case CODE_FOR_avx_shufpd256_mask:
13261 24 : error ("the last argument must be a 4-bit immediate");
13262 24 : return const0_rtx;
13263 :
13264 15 : case CODE_FOR_sha1rnds4:
13265 15 : case CODE_FOR_sse4_1_blendpd:
13266 15 : case CODE_FOR_avx_vpermilv2df:
13267 15 : case CODE_FOR_avx_vpermilv2df_mask:
13268 15 : case CODE_FOR_xop_vpermil2v2df3:
13269 15 : case CODE_FOR_xop_vpermil2v4sf3:
13270 15 : case CODE_FOR_xop_vpermil2v4df3:
13271 15 : case CODE_FOR_xop_vpermil2v8sf3:
13272 15 : case CODE_FOR_avx512f_vinsertf32x4_mask:
13273 15 : case CODE_FOR_avx512f_vinserti32x4_mask:
13274 15 : case CODE_FOR_avx512f_vextractf32x4_mask:
13275 15 : case CODE_FOR_avx512f_vextracti32x4_mask:
13276 15 : case CODE_FOR_sse2_shufpd:
13277 15 : case CODE_FOR_sse2_shufpd_mask:
13278 15 : case CODE_FOR_avx512dq_shuf_f64x2_mask:
13279 15 : case CODE_FOR_avx512dq_shuf_i64x2_mask:
13280 15 : case CODE_FOR_avx512vl_shuf_i32x4_mask:
13281 15 : case CODE_FOR_avx512vl_shuf_f32x4_mask:
13282 15 : error ("the last argument must be a 2-bit immediate");
13283 15 : return const0_rtx;
13284 :
13285 30 : case CODE_FOR_avx_vextractf128v4df:
13286 30 : case CODE_FOR_avx_vextractf128v8sf:
13287 30 : case CODE_FOR_avx_vextractf128v8si:
13288 30 : case CODE_FOR_avx_vinsertf128v4df:
13289 30 : case CODE_FOR_avx_vinsertf128v8sf:
13290 30 : case CODE_FOR_avx_vinsertf128v8si:
13291 30 : case CODE_FOR_avx512f_vinsertf64x4_mask:
13292 30 : case CODE_FOR_avx512f_vinserti64x4_mask:
13293 30 : case CODE_FOR_avx512f_vextractf64x4_mask:
13294 30 : case CODE_FOR_avx512f_vextracti64x4_mask:
13295 30 : case CODE_FOR_avx512dq_vinsertf32x8_mask:
13296 30 : case CODE_FOR_avx512dq_vinserti32x8_mask:
13297 30 : case CODE_FOR_avx512vl_vinsertv4df:
13298 30 : case CODE_FOR_avx512vl_vinsertv4di:
13299 30 : case CODE_FOR_avx512vl_vinsertv8sf:
13300 30 : case CODE_FOR_avx512vl_vinsertv8si:
13301 30 : error ("the last argument must be a 1-bit immediate");
13302 30 : return const0_rtx;
13303 :
13304 16 : case CODE_FOR_avx_vmcmpv2df3:
13305 16 : case CODE_FOR_avx_vmcmpv4sf3:
13306 16 : case CODE_FOR_avx_cmpv2df3:
13307 16 : case CODE_FOR_avx_cmpv4sf3:
13308 16 : if (CONST_INT_P (op) && IN_RANGE (INTVAL (op), 8, 31))
13309 : {
13310 4 : error ("'%s' needs isa option %s", d->name, "-mavx");
13311 4 : return const0_rtx;
13312 : }
13313 : /* FALLTHRU */
13314 18 : case CODE_FOR_avx_cmpv4df3:
13315 18 : case CODE_FOR_avx_cmpv8sf3:
13316 18 : case CODE_FOR_avx512f_cmpv8df3_mask:
13317 18 : case CODE_FOR_avx512f_cmpv16sf3_mask:
13318 18 : case CODE_FOR_avx512f_vmcmpv2df3_mask:
13319 18 : case CODE_FOR_avx512f_vmcmpv4sf3_mask:
13320 18 : case CODE_FOR_avx512bw_cmpv32hf3_mask:
13321 18 : case CODE_FOR_avx512vl_cmpv16hf3_mask:
13322 18 : case CODE_FOR_avx512fp16_cmpv8hf3_mask:
13323 18 : error ("the last argument must be a 5-bit immediate");
13324 18 : return const0_rtx;
13325 :
13326 132 : default:
13327 132 : switch (nargs_constant)
13328 : {
13329 8 : case 2:
13330 8 : if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
13331 8 : (!mask_pos && (nargs - i) == nargs_constant))
13332 : {
13333 4 : error ("the next to last argument must be an 8-bit immediate");
13334 4 : break;
13335 : }
13336 : /* FALLTHRU */
13337 128 : case 1:
13338 128 : error ("the last argument must be an 8-bit immediate");
13339 128 : break;
13340 0 : default:
13341 0 : gcc_unreachable ();
13342 : }
13343 132 : return const0_rtx;
13344 : }
13345 : }
13346 : else
13347 : {
13348 181168 : if (VECTOR_MODE_P (mode))
13349 130544 : op = safe_vector_operand (op, mode);
13350 :
13351 : /* If we aren't optimizing, only allow one memory operand to
13352 : be generated. */
13353 181168 : if (memory_operand (op, mode))
13354 : {
13355 29875 : num_memory++;
13356 29875 : if (!optimize && num_memory > 1)
13357 13613 : op = copy_to_mode_reg (mode, op);
13358 : }
13359 :
13360 181168 : if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
13361 : {
13362 178853 : if (!match)
13363 42591 : op = copy_to_mode_reg (mode, op);
13364 : }
13365 : else
13366 : {
13367 2315 : op = copy_to_reg (op);
13368 2315 : op = lowpart_subreg (mode, op, GET_MODE (op));
13369 : }
13370 : }
13371 :
13372 200312 : xops[i] = op;
13373 : }
13374 :
13375 60835 : switch (nargs)
13376 : {
13377 4712 : case 1:
13378 4712 : pat = GEN_FCN (icode) (real_target, xops[0]);
13379 4712 : break;
13380 5686 : case 2:
13381 5686 : pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
13382 5686 : break;
13383 20631 : case 3:
13384 20631 : pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
13385 20631 : break;
13386 27066 : case 4:
13387 27066 : pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
13388 27066 : xops[2], xops[3]);
13389 27066 : break;
13390 2740 : case 5:
13391 2740 : pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
13392 2740 : xops[2], xops[3], xops[4]);
13393 2740 : break;
13394 : case 6:
13395 : pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
13396 : xops[2], xops[3], xops[4], xops[5]);
13397 : break;
13398 : default:
13399 : gcc_unreachable ();
13400 : }
13401 :
13402 60835 : if (! pat)
13403 : return 0;
13404 :
13405 60835 : emit_insn (pat);
13406 60835 : return target;
13407 : }
13408 :
13409 : /* Transform pattern of following layout:
13410 : (set A
13411 : (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
13412 : )
13413 : into:
13414 : (set (A B)) */
13415 :
13416 : static rtx
13417 4949 : ix86_erase_embedded_rounding (rtx pat)
13418 : {
13419 4949 : if (NONJUMP_INSN_P (pat))
13420 699 : pat = PATTERN (pat);
13421 :
13422 4949 : gcc_assert (GET_CODE (pat) == SET);
13423 4949 : rtx src = SET_SRC (pat);
13424 4949 : gcc_assert (XVECLEN (src, 0) == 2);
13425 4949 : rtx p0 = XVECEXP (src, 0, 0);
13426 4949 : gcc_assert (GET_CODE (src) == UNSPEC
13427 : && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
13428 4949 : rtx res = gen_rtx_SET (SET_DEST (pat), p0);
13429 4949 : return res;
13430 : }
13431 :
/* Subroutine of ix86_expand_round_builtin to take care of comi insns
   with rounding, i.e. the scalar compare-to-EFLAGS builtins
   (V{,U}COMI{SS,SD,SH} and, under AVX10.2, V{,U}COMX*) that take an
   embedded rounding/SAE operand.

   D gives the builtin's icode and name; EXP is the CALL_EXPR with four
   arguments: the two scalar vector operands, the comparison-predicate
   constant (a _CMP_* value, see avxintrin.h) and the rounding immediate.
   TARGET is a suggested result register; COMX_OK says whether the
   AVX10.2 COMX forms may be substituted for COMI.

   Returns an SImode 0/1 result, or const0_rtx after emitting an error
   for invalid constant arguments.  */
static rtx
ix86_expand_sse_comi_round (const struct builtin_description *d,
			    tree exp, rtx target, bool comx_ok)
{
  rtx pat, set_dst;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;

  /* Three parallel tables indexed by the _CMP_* predicate value 0..31;
     see avxintrin.h for the encoding.  Entries 16..31 mirror 0..15 but
     with inverted signaling behavior (captured by non_signalings).  */
  static const enum rtx_code comparisons[32] =
  {
    EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
    UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
    EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
    UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
  };
  /* Whether the predicate is an "ordered" comparison (O suffix).  */
  static const bool ordereds[32] =
  {
    true,  true,  true,  false, false, false, false, true,
    false, false, false, true,  true,  true,  true,  false,
    true,  true,  true,  false, false, false, false, true,
    false, false, false, true,  true,  true,  true,  false
  };
  /* Whether the predicate is non-signaling (Q suffix, quiet on NaN).  */
  static const bool non_signalings[32] =
  {
    true,  false, false, true,  true,  false, false, true,
    true,  false, false, true,  true,  false, false, true,
    false, true,  true,  false, false, true,  true,  false,
    false, true,  true,  false, false, true,  true,  false
  };

  /* The predicate and rounding operands must be compile-time constants
     in range; diagnose bad values rather than ICE.  */
  if (!CONST_INT_P (op2))
    {
      error ("the third argument must be comparison constant");
      return const0_rtx;
    }
  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
    {
      error ("incorrect comparison mode");
      return const0_rtx;
    }

  if (!insn_p->operand[2].predicate (op3, SImode))
    {
      error ("incorrect rounding operand");
      return const0_rtx;
    }

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = comparisons[INTVAL (op2)];
  enum rtx_code orig_comp = comparison;
  bool ordered = ordereds[INTVAL (op2)];
  bool non_signaling = non_signalings[INTVAL (op2)];
  rtx const_val = const0_rtx;

  /* Map the requested comparison onto a CC mode the comi/ucomi EFLAGS
     result can express, possibly swapping operands or flipping between
     the ordered/signaling and unordered/non-signaling insn variants.  */
  bool check_unordered = false;
  machine_mode mode = CCFPmode;
  switch (comparison)
    {
    case ORDERED:
      if (!ordered)
	{
	  if (TARGET_AVX10_2 && comx_ok)
	    {
	      /* Unlike VCOMI{SH,SS,SD}, VCOMX{SH,SS,SD} will set SF
		 differently.  So directly return true here.  */
	      target = gen_reg_rtx (SImode);
	      emit_move_insn (target, const1_rtx);
	      return target;
	    }
	  else
	    {
	      /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
	      if (!non_signaling)
		ordered = true;
	      mode = CCSmode;
	    }
	}
      else
	{
	  /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */
	  if (non_signaling)
	    ordered = false;
	  mode = CCPmode;
	}
      comparison = NE;
      break;
    case UNORDERED:
      if (ordered)
	{
	  if (TARGET_AVX10_2 && comx_ok)
	    {
	      /* Unlike VCOMI{SH,SS,SD}, VCOMX{SH,SS,SD} will set SF
		 differently.  So directly return false here.  */
	      target = gen_reg_rtx (SImode);
	      emit_move_insn (target, const0_rtx);
	      return target;
	    }
	  else
	    {
	      /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */
	      if (non_signaling)
		ordered = false;
	      mode = CCSmode;
	    }
	}
      else
	{
	  /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
	  if (!non_signaling)
	    ordered = true;
	  mode = CCPmode;
	}
      comparison = EQ;
      break;

    case LE:	/* -> GE  */
    case LT:	/* -> GT  */
    case UNGE:	/* -> UNLE  */
    case UNGT:	/* -> UNLT  */
      /* Only the swapped forms are directly expressible; exchange the
	 operands and fall into the supported-comparison handling.  */
      std::swap (op0, op1);
      comparison = swap_condition (comparison);
      /* FALLTHRU */
    case GT:
    case GE:
    case UNEQ:
    case UNLT:
    case UNLE:
    case LTGT:
      /* These are supported by CCFPmode.  NB: Use ordered/signaling
	 COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
	 with NAN operands.  */
      if (ordered == non_signaling)
	ordered = !ordered;
      break;
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
	 _CMP_EQ_OQ/_CMP_EQ_OS.
	 Under TARGET_AVX10_2, VCOMX/VUCOMX are always generated instead
	 of COMI/UCOMI, VCOMX/VUCOMX will not set ZF with NAN.  */
    case EQ:
      if (!TARGET_AVX10_2 || !comx_ok)
	check_unordered = true;
      mode = CCZmode;
      break;
    case NE:
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
	 _CMP_NEQ_UQ/_CMP_NEQ_US.
	 Under TARGET_AVX10_2, VCOMX/VUCOMX are always generated instead
	 of COMI/UCOMI, VCOMX/VUCOMX will not set ZF with NAN.  */
      gcc_assert (!ordered);
      if (!TARGET_AVX10_2 || !comx_ok)
	check_unordered = true;
      mode = CCZmode;
      const_val = const1_rtx;
      break;
    default:
      gcc_unreachable ();
    }

  /* Preload the result with the value for the "unordered" outcome
     (CONST_VAL); ix86_ssecom_setcc overwrites it from the flags.  */
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  /* Generate comx instead of comi when EQ/NE to avoid NAN checks.
     Use orig_comp to exclude ORDERED/UNORDERED cases.  */
  if ((orig_comp == EQ || orig_comp == NE)
      && TARGET_AVX10_2 && comx_ok)
    {
      switch (icode)
	{
	case CODE_FOR_avx512fp16_comi_round:
	  icode = CODE_FOR_avx10_2_comxhf_round;
	  break;
	case CODE_FOR_sse_comi_round:
	  icode = CODE_FOR_avx10_2_comxsf_round;
	  break;
	case CODE_FOR_sse2_comi_round:
	  icode = CODE_FOR_avx10_2_comxdf_round;
	  break;

	default:
	  break;
	}
    }

  /* Generate comi instead of comx when UNEQ/LTGT to avoid NAN checks.  */
  if ((comparison == UNEQ || comparison == LTGT)
      && TARGET_AVX10_2 && comx_ok)
    {
      switch (icode)
	{
	case CODE_FOR_avx10_2_comxhf_round:
	  icode = CODE_FOR_avx512fp16_comi_round;
	  break;
	case CODE_FOR_avx10_2_comxsf_round:
	  icode = CODE_FOR_sse_comi_round;
	  break;
	case CODE_FOR_avx10_2_comxdf_round:
	  icode = CODE_FOR_sse2_comi_round;
	  break;

	default:
	  break;
	}
    }

  /*
     1. COMI/VCOMX: ordered and signaling.
     2. UCOMI/VUCOMX: unordered and non-signaling.
   */
  if (non_signaling)
    switch (icode)
      {
      case CODE_FOR_sse_comi_round:
	icode = CODE_FOR_sse_ucomi_round;
	break;
      case CODE_FOR_sse2_comi_round:
	icode = CODE_FOR_sse2_ucomi_round;
	break;
      case CODE_FOR_avx512fp16_comi_round:
	icode = CODE_FOR_avx512fp16_ucomi_round;
	break;
      case CODE_FOR_avx10_2_comxsf_round:
	icode = CODE_FOR_avx10_2_ucomxsf_round;
	break;
      case CODE_FOR_avx10_2_comxhf_round:
	icode = CODE_FOR_avx10_2_ucomxhf_round;
	break;
      case CODE_FOR_avx10_2_comxdf_round:
	icode = CODE_FOR_avx10_2_ucomxdf_round;
	break;
      default:
	gcc_unreachable ();
      }

  pat = GEN_FCN (icode) (op0, op1, op3);
  if (! pat)
    return 0;

  /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.
     For NO_ROUND, strip the UNSPEC_EMBEDDED_ROUNDING wrapper so the
     plain (non-SAE) insn form is emitted.  */
  if (INTVAL (op3) == NO_ROUND)
    {
      pat = ix86_erase_embedded_rounding (pat);
      if (! pat)
	return 0;

      set_dst = SET_DEST (pat);
    }
  else
    {
      gcc_assert (GET_CODE (pat) == SET);
      set_dst = SET_DEST (pat);
    }

  emit_insn (pat);

  /* Materialize the 0/1 result from the flags register SET_DST.  */
  return ix86_ssecom_setcc (comparison, check_unordered, mode,
			    set_dst, target);
}
13714 :
/* Subroutine of ix86_expand_builtin to expand builtins whose last
   argument is an embedded rounding-mode / SAE immediate.  D describes
   the builtin and its insn pattern, EXP is the CALL_EXPR and TARGET is
   a suggested place for the result.  Returns the result rtx, or
   const0_rtx after diagnosing an invalid immediate or rounding
   operand.  */

static rtx
ix86_expand_round_builtin (const struct builtin_description *d,
			   tree exp, rtx target)
{
  rtx pat;
  unsigned int i, nargs;
  rtx xops[6];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  /* Number of trailing immediate operands the pattern requires (the
     rounding operand itself is validated separately below).  */
  unsigned int nargs_constant = 0;
  /* Set when the rounding operand turns out to be NO_ROUND, in which
     case the embedded rounding must be erased from the generated
     pattern again.  */
  unsigned int redundant_embed_rnd = 0;

  /* Derive the argument count, and how many of the trailing arguments
     must be compile-time constants, from the builtin's function type.  */
  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case UINT64_FTYPE_V2DF_INT:
    case UINT64_FTYPE_V4SF_INT:
    case UINT64_FTYPE_V8HF_INT:
    case UINT_FTYPE_V2DF_INT:
    case UINT_FTYPE_V4SF_INT:
    case UINT_FTYPE_V8HF_INT:
    case INT64_FTYPE_V2DF_INT:
    case INT64_FTYPE_V4SF_INT:
    case INT64_FTYPE_V8HF_INT:
    case INT_FTYPE_V2DF_INT:
    case INT_FTYPE_V4SF_INT:
    case INT_FTYPE_V8HF_INT:
      nargs = 2;
      break;
    case V32HF_FTYPE_V32HF_V32HF_INT:
    case V8HF_FTYPE_V8HF_V8HF_INT:
    case V8HF_FTYPE_V8HF_INT_INT:
    case V8HF_FTYPE_V8HF_UINT_INT:
    case V8HF_FTYPE_V8HF_INT64_INT:
    case V8HF_FTYPE_V8HF_UINT64_INT:
    case V4SF_FTYPE_V4SF_UINT_INT:
    case V4SF_FTYPE_V4SF_UINT64_INT:
    case V2DF_FTYPE_V2DF_UINT64_INT:
    case V4SF_FTYPE_V4SF_INT_INT:
    case V4SF_FTYPE_V4SF_INT64_INT:
    case V2DF_FTYPE_V2DF_INT64_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V2DF_INT:
    case V2DF_FTYPE_V2DF_V4SF_INT:
      nargs = 3;
      break;
    case V8SF_FTYPE_V8DF_V8SF_QI_INT:
    case V8DF_FTYPE_V8DF_V8DF_QI_INT:
    case V32HI_FTYPE_V32HF_V32HI_USI_INT:
    case V32HI_FTYPE_V32BF_V32HI_USI_INT:
    case V8SI_FTYPE_V8DF_V8SI_QI_INT:
    case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
    case V8DI_FTYPE_V8DF_V8DI_QI_INT:
    case V8SF_FTYPE_V8DI_V8SF_QI_INT:
    case V8DF_FTYPE_V8DI_V8DF_QI_INT:
    case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
    case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
    case V32HF_FTYPE_V32HI_V32HF_USI_INT:
    case V32HF_FTYPE_V32HF_V32HF_USI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
    case V16SF_FTYPE_V16SF_V16SF_HI_INT:
    case V8DI_FTYPE_V8SF_V8DI_QI_INT:
    case V16SF_FTYPE_V16SI_V16SF_HI_INT:
    case V16SI_FTYPE_V16SF_V16SI_HI_INT:
    case V16SI_FTYPE_V16SF_V16SI_UHI_INT:
    case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
    case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
    case V8DF_FTYPE_V8SF_V8DF_QI_INT:
    case V16SF_FTYPE_V16HI_V16SF_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
    case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
    case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
    case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
    case V16HI_FTYPE_V16BF_V16HI_UHI_INT:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
      nargs = 4;
      break;
    case V4SF_FTYPE_V4SF_V4SF_INT_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_INT:
      nargs_constant = 2;
      nargs = 4;
      break;
    /* COMI/UCOMI style comparisons are expanded by a dedicated helper.  */
    case INT_FTYPE_V4SF_V4SF_INT_INT:
    case INT_FTYPE_V2DF_V2DF_INT_INT:
      return ix86_expand_sse_comi_round (d, exp, target, true);
    case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
    case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
    case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
    case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
    case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
    case V32HF_FTYPE_V16SF_V16SF_V32HF_USI_INT:
      nargs = 5;
      break;
    case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
    case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
    case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
      nargs_constant = 4;
      nargs = 5;
      break;
    case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
    case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
    case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
    case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
    case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
    case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
      nargs_constant = 3;
      nargs = 5;
      break;
    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
    case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI_INT:
    case V32HF_FTYPE_V32HF_V32HF_INT_V32HF_USI_INT:
    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI_INT:
      nargs = 6;
      nargs_constant = 4;
      break;
    case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
      nargs = 6;
      nargs_constant = 3;
      break;
    default:
      gcc_unreachable ();
    }
  gcc_assert (nargs <= ARRAY_SIZE (xops));

  /* Reuse TARGET only when it is already a suitable register of the
     pattern's destination mode; otherwise (or when optimizing) use a
     fresh pseudo.  */
  if (optimize
      || target == 0
      || GET_MODE (target) != tmode
      || !insn_p->operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = ix86_expand_unsigned_small_int_cst_argument (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (i == nargs - nargs_constant)
	{
	  /* First of the trailing constant operands; diagnose a
	     non-immediate (or out-of-range immediate) with an error
	     message matching the pattern's immediate width.  */
	  if (!match)
	    {
	      switch (icode)
		{
		case CODE_FOR_avx512f_getmantv8df_mask_round:
		case CODE_FOR_avx512f_getmantv16sf_mask_round:
		case CODE_FOR_avx512bw_getmantv32hf_mask_round:
		case CODE_FOR_avx512f_vgetmantv2df_round:
		case CODE_FOR_avx512f_vgetmantv2df_mask_round:
		case CODE_FOR_avx512f_vgetmantv4sf_round:
		case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
		case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
		  error ("the immediate argument must be a 4-bit immediate");
		  return const0_rtx;
		case CODE_FOR_avx512f_cmpv8df3_mask_round:
		case CODE_FOR_avx512f_cmpv16sf3_mask_round:
		case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
		case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
		case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
		case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
		  error ("the immediate argument must be a 5-bit immediate");
		  return const0_rtx;
		default:
		  error ("the immediate argument must be an 8-bit immediate");
		  return const0_rtx;
		}
	    }
	}
      else if (i == nargs-1)
	{
	  /* The last argument is always the rounding-mode immediate.  */
	  if (!insn_p->operand[nargs].predicate (op, SImode))
	    {
	      error ("incorrect rounding operand");
	      return const0_rtx;
	    }

	  /* If there is no rounding use normal version of the pattern.  */
	  if (INTVAL (op) == NO_ROUND)
	    {
	      /* Skip erasing the embedded rounding for the expanders
		 below, which generate multiple insns.  In
		 ix86_erase_embedded_rounding the pattern will be
		 transformed to a single set, and emit_insn appends the
		 set instead of inserting it into the chain.  So the
		 insns emitted inside the define_expand would be
		 ignored.  */
	      switch (icode)
		{
		case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
		case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
		  redundant_embed_rnd = 0;
		  break;
		default:
		  redundant_embed_rnd = 1;
		  break;
		}
	    }
	}
      else
	{
	  /* A normal (register) operand; force it into a register of
	     the mode the pattern expects.  */
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  op = fixup_modeless_constant (op, mode);

	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    {
	      if (optimize || !match)
		op = copy_to_mode_reg (mode, op);
	    }
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      xops[i] = op;
    }

  /* Generate the insn with the operand count the pattern requires.  */
  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3]);
      break;
    case 5:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3], xops[4]);
      break;
    case 6:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3], xops[4], xops[5]);
      break;
    default:
      gcc_unreachable ();
    }

  if (!pat)
    return 0;

  /* NO_ROUND was requested: strip the embedded rounding from the
     generated pattern so the normal (non-rounding) insn is emitted.  */
  if (redundant_embed_rnd)
    pat = ix86_erase_embedded_rounding (pat);

  emit_insn (pat);
  return target;
}
13997 :
/* Subroutine of ix86_expand_builtin to take care of special insns
   with variable number of operands, typically loads and stores
   (possibly masked or non-temporal).  D describes the builtin, EXP is
   the CALL_EXPR and TARGET is a suggested place for the result.
   Returns the result rtx (or 0 for store-class builtins), or
   const0_rtx after diagnosing an invalid constant argument.  */

static rtx
ix86_expand_special_args_builtin (const struct builtin_description *d,
				  tree exp, rtx target)
{
  tree arg;
  rtx pat, op;
  unsigned int i, nargs, arg_adjust, memory;
  /* Index of the argument that must be a constant; 100 is a sentinel
     meaning "no constant argument".  */
  unsigned int constant = 100;
  /* Set for builtins whose underlying insn requires strictly aligned
     memory; used to raise MEM_ALIGN on the memory operand below.  */
  bool aligned_mem = false;
  rtx xops[4];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  /* Whether the builtin produces a value (load) or consumes its first
     argument as the destination (store).  */
  enum { load, store } klass;

  /* Derive NARGS, KLASS and the index MEMORY of the memory operand
     (ARRAY_SIZE (xops), i.e. out of range, means the memory operand is
     the insn's destination) from the builtin's function type.  */
  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case VOID_FTYPE_VOID:
      emit_insn (GEN_FCN (icode) (target));
      return 0;
    case VOID_FTYPE_UINT64:
    case VOID_FTYPE_UNSIGNED:
      nargs = 0;
      klass = store;
      memory = 0;
      break;

    case INT_FTYPE_VOID:
    case USHORT_FTYPE_VOID:
    case UINT64_FTYPE_VOID:
    case UINT_FTYPE_VOID:
    case UINT8_FTYPE_VOID:
    case UNSIGNED_FTYPE_VOID:
      nargs = 0;
      klass = load;
      memory = 0;
      break;
    case CHAR_FTYPE_PCCHAR:
    case SHORT_FTYPE_PCSHORT:
    case INT_FTYPE_PCINT:
    case INT64_FTYPE_PCINT64:
    case UINT64_FTYPE_PUNSIGNED:
    case V2DI_FTYPE_PV2DI:
    case V4DI_FTYPE_PV4DI:
    case V32QI_FTYPE_PCCHAR:
    case V16QI_FTYPE_PCCHAR:
    case V8SF_FTYPE_PCV4SF:
    case V8SF_FTYPE_PCFLOAT:
    case V4SF_FTYPE_PCFLOAT:
    case V4SF_FTYPE_PCFLOAT16:
    case V4SF_FTYPE_PCBFLOAT16:
    case V4SF_FTYPE_PCV8BF:
    case V4SF_FTYPE_PCV8HF:
    case V8SF_FTYPE_PCFLOAT16:
    case V8SF_FTYPE_PCBFLOAT16:
    case V8SF_FTYPE_PCV16HF:
    case V8SF_FTYPE_PCV16BF:
    case V4DF_FTYPE_PCV2DF:
    case V4DF_FTYPE_PCDOUBLE:
    case V2DF_FTYPE_PCDOUBLE:
    case VOID_FTYPE_PVOID:
    case V8DI_FTYPE_PV8DI:
      nargs = 1;
      klass = load;
      memory = 0;
      switch (icode)
	{
	case CODE_FOR_sse4_1_movntdqa:
	case CODE_FOR_avx2_movntdqa:
	case CODE_FOR_avx512f_movntdqa:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      break;
    case VOID_FTYPE_PV2SF_V4SF:
    case VOID_FTYPE_PV8DI_V8DI:
    case VOID_FTYPE_PV4DI_V4DI:
    case VOID_FTYPE_PV2DI_V2DI:
    case VOID_FTYPE_PCHAR_V32QI:
    case VOID_FTYPE_PCHAR_V16QI:
    case VOID_FTYPE_PFLOAT_V16SF:
    case VOID_FTYPE_PFLOAT_V8SF:
    case VOID_FTYPE_PFLOAT_V4SF:
    case VOID_FTYPE_PDOUBLE_V8DF:
    case VOID_FTYPE_PDOUBLE_V4DF:
    case VOID_FTYPE_PDOUBLE_V2DF:
    case VOID_FTYPE_PLONGLONG_LONGLONG:
    case VOID_FTYPE_PULONGLONG_ULONGLONG:
    case VOID_FTYPE_PUNSIGNED_UNSIGNED:
    case VOID_FTYPE_PINT_INT:
      nargs = 1;
      klass = store;
      /* Reserve memory operand for target.  */
      memory = ARRAY_SIZE (xops);
      switch (icode)
	{
	/* These builtins and instructions require the memory
	   to be properly aligned.  */
	case CODE_FOR_avx_movntv4di:
	case CODE_FOR_sse2_movntv2di:
	case CODE_FOR_avx_movntv8sf:
	case CODE_FOR_sse_movntv4sf:
	case CODE_FOR_sse4a_vmmovntv4sf:
	case CODE_FOR_avx_movntv4df:
	case CODE_FOR_sse2_movntv2df:
	case CODE_FOR_sse4a_vmmovntv2df:
	case CODE_FOR_sse2_movntidi:
	case CODE_FOR_sse_movntq:
	case CODE_FOR_sse2_movntisi:
	case CODE_FOR_avx512f_movntv16sf:
	case CODE_FOR_avx512f_movntv8df:
	case CODE_FOR_avx512f_movntv8di:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      break;
    case VOID_FTYPE_PVOID_PCVOID:
      nargs = 1;
      klass = store;
      memory = 0;

      break;
    case V4SF_FTYPE_V4SF_PCV2SF:
    case V2DF_FTYPE_V2DF_PCDOUBLE:
      nargs = 2;
      klass = load;
      memory = 1;
      break;
    case V8SF_FTYPE_PCV8SF_V8SI:
    case V4DF_FTYPE_PCV4DF_V4DI:
    case V4SF_FTYPE_PCV4SF_V4SI:
    case V2DF_FTYPE_PCV2DF_V2DI:
    case V8SI_FTYPE_PCV8SI_V8SI:
    case V4DI_FTYPE_PCV4DI_V4DI:
    case V4SI_FTYPE_PCV4SI_V4SI:
    case V2DI_FTYPE_PCV2DI_V2DI:
    case VOID_FTYPE_INT_INT64:
      nargs = 2;
      klass = load;
      memory = 0;
      break;
    case VOID_FTYPE_PV8DF_V8DF_UQI:
    case VOID_FTYPE_PV4DF_V4DF_UQI:
    case VOID_FTYPE_PV2DF_V2DF_UQI:
    case VOID_FTYPE_PV16SF_V16SF_UHI:
    case VOID_FTYPE_PV8SF_V8SF_UQI:
    case VOID_FTYPE_PV4SF_V4SF_UQI:
    case VOID_FTYPE_PV8DI_V8DI_UQI:
    case VOID_FTYPE_PV4DI_V4DI_UQI:
    case VOID_FTYPE_PV2DI_V2DI_UQI:
    case VOID_FTYPE_PV16SI_V16SI_UHI:
    case VOID_FTYPE_PV8SI_V8SI_UQI:
    case VOID_FTYPE_PV4SI_V4SI_UQI:
    case VOID_FTYPE_PV64QI_V64QI_UDI:
    case VOID_FTYPE_PV32HI_V32HI_USI:
    case VOID_FTYPE_PV32QI_V32QI_USI:
    case VOID_FTYPE_PV16QI_V16QI_UHI:
    case VOID_FTYPE_PV16HI_V16HI_UHI:
    case VOID_FTYPE_PV8HI_V8HI_UQI:
      switch (icode)
	{
	/* These builtins and instructions require the memory
	   to be properly aligned.  */
	case CODE_FOR_avx512f_storev16sf_mask:
	case CODE_FOR_avx512f_storev16si_mask:
	case CODE_FOR_avx512f_storev8df_mask:
	case CODE_FOR_avx512f_storev8di_mask:
	case CODE_FOR_avx512vl_storev8sf_mask:
	case CODE_FOR_avx512vl_storev8si_mask:
	case CODE_FOR_avx512vl_storev4df_mask:
	case CODE_FOR_avx512vl_storev4di_mask:
	case CODE_FOR_avx512vl_storev4sf_mask:
	case CODE_FOR_avx512vl_storev4si_mask:
	case CODE_FOR_avx512vl_storev2df_mask:
	case CODE_FOR_avx512vl_storev2di_mask:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      /* FALLTHRU */
    case VOID_FTYPE_PV8SF_V8SI_V8SF:
    case VOID_FTYPE_PV4DF_V4DI_V4DF:
    case VOID_FTYPE_PV4SF_V4SI_V4SF:
    case VOID_FTYPE_PV2DF_V2DI_V2DF:
    case VOID_FTYPE_PV8SI_V8SI_V8SI:
    case VOID_FTYPE_PV4DI_V4DI_V4DI:
    case VOID_FTYPE_PV4SI_V4SI_V4SI:
    case VOID_FTYPE_PV2DI_V2DI_V2DI:
    case VOID_FTYPE_PV8SI_V8DI_UQI:
    case VOID_FTYPE_PV8HI_V8DI_UQI:
    case VOID_FTYPE_PV16HI_V16SI_UHI:
    case VOID_FTYPE_PUDI_V8DI_UQI:
    case VOID_FTYPE_PV16QI_V16SI_UHI:
    case VOID_FTYPE_PV4SI_V4DI_UQI:
    case VOID_FTYPE_PUDI_V2DI_UQI:
    case VOID_FTYPE_PUDI_V4DI_UQI:
    case VOID_FTYPE_PUSI_V2DI_UQI:
    case VOID_FTYPE_PV8HI_V8SI_UQI:
    case VOID_FTYPE_PUDI_V4SI_UQI:
    case VOID_FTYPE_PUSI_V4DI_UQI:
    case VOID_FTYPE_PUHI_V2DI_UQI:
    case VOID_FTYPE_PUDI_V8SI_UQI:
    case VOID_FTYPE_PUSI_V4SI_UQI:
    case VOID_FTYPE_PCHAR_V64QI_UDI:
    case VOID_FTYPE_PCHAR_V32QI_USI:
    case VOID_FTYPE_PCHAR_V16QI_UHI:
    case VOID_FTYPE_PSHORT_V32HI_USI:
    case VOID_FTYPE_PSHORT_V16HI_UHI:
    case VOID_FTYPE_PSHORT_V8HI_UQI:
    case VOID_FTYPE_PINT_V16SI_UHI:
    case VOID_FTYPE_PINT_V8SI_UQI:
    case VOID_FTYPE_PINT_V4SI_UQI:
    case VOID_FTYPE_PINT64_V8DI_UQI:
    case VOID_FTYPE_PINT64_V4DI_UQI:
    case VOID_FTYPE_PINT64_V2DI_UQI:
    case VOID_FTYPE_PDOUBLE_V8DF_UQI:
    case VOID_FTYPE_PDOUBLE_V4DF_UQI:
    case VOID_FTYPE_PDOUBLE_V2DF_UQI:
    case VOID_FTYPE_PFLOAT_V16SF_UHI:
    case VOID_FTYPE_PFLOAT_V8SF_UQI:
    case VOID_FTYPE_PFLOAT_V4SF_UQI:
    case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
    case VOID_FTYPE_PV32QI_V32HI_USI:
    case VOID_FTYPE_PV16QI_V16HI_UHI:
    case VOID_FTYPE_PUDI_V8HI_UQI:
      nargs = 2;
      klass = store;
      /* Reserve memory operand for target.  */
      memory = ARRAY_SIZE (xops);
      break;
    case V4SF_FTYPE_PCV4SF_V4SF_UQI:
    case V8SF_FTYPE_PCV8SF_V8SF_UQI:
    case V16SF_FTYPE_PCV16SF_V16SF_UHI:
    case V4SI_FTYPE_PCV4SI_V4SI_UQI:
    case V8SI_FTYPE_PCV8SI_V8SI_UQI:
    case V16SI_FTYPE_PCV16SI_V16SI_UHI:
    case V2DF_FTYPE_PCV2DF_V2DF_UQI:
    case V4DF_FTYPE_PCV4DF_V4DF_UQI:
    case V8DF_FTYPE_PCV8DF_V8DF_UQI:
    case V2DI_FTYPE_PCV2DI_V2DI_UQI:
    case V4DI_FTYPE_PCV4DI_V4DI_UQI:
    case V8DI_FTYPE_PCV8DI_V8DI_UQI:
    case V64QI_FTYPE_PCV64QI_V64QI_UDI:
    case V32HI_FTYPE_PCV32HI_V32HI_USI:
    case V32QI_FTYPE_PCV32QI_V32QI_USI:
    case V16QI_FTYPE_PCV16QI_V16QI_UHI:
    case V16HI_FTYPE_PCV16HI_V16HI_UHI:
    case V8HI_FTYPE_PCV8HI_V8HI_UQI:
      switch (icode)
	{
	/* These builtins and instructions require the memory
	   to be properly aligned.  */
	case CODE_FOR_avx512f_loadv16sf_mask:
	case CODE_FOR_avx512f_loadv16si_mask:
	case CODE_FOR_avx512f_loadv8df_mask:
	case CODE_FOR_avx512f_loadv8di_mask:
	case CODE_FOR_avx512vl_loadv8sf_mask:
	case CODE_FOR_avx512vl_loadv8si_mask:
	case CODE_FOR_avx512vl_loadv4df_mask:
	case CODE_FOR_avx512vl_loadv4di_mask:
	case CODE_FOR_avx512vl_loadv4sf_mask:
	case CODE_FOR_avx512vl_loadv4si_mask:
	case CODE_FOR_avx512vl_loadv2df_mask:
	case CODE_FOR_avx512vl_loadv2di_mask:
	case CODE_FOR_avx512bw_loadv64qi_mask:
	case CODE_FOR_avx512vl_loadv32qi_mask:
	case CODE_FOR_avx512vl_loadv16qi_mask:
	case CODE_FOR_avx512bw_loadv32hi_mask:
	case CODE_FOR_avx512vl_loadv16hi_mask:
	case CODE_FOR_avx512vl_loadv8hi_mask:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      /* FALLTHRU */
    case V64QI_FTYPE_PCCHAR_V64QI_UDI:
    case V32QI_FTYPE_PCCHAR_V32QI_USI:
    case V16QI_FTYPE_PCCHAR_V16QI_UHI:
    case V32HI_FTYPE_PCSHORT_V32HI_USI:
    case V16HI_FTYPE_PCSHORT_V16HI_UHI:
    case V8HI_FTYPE_PCSHORT_V8HI_UQI:
    case V16SI_FTYPE_PCINT_V16SI_UHI:
    case V8SI_FTYPE_PCINT_V8SI_UQI:
    case V4SI_FTYPE_PCINT_V4SI_UQI:
    case V8DI_FTYPE_PCINT64_V8DI_UQI:
    case V4DI_FTYPE_PCINT64_V4DI_UQI:
    case V2DI_FTYPE_PCINT64_V2DI_UQI:
    case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
    case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
    case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
    case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
    case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
    case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
    case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
      nargs = 3;
      klass = load;
      memory = 0;
      break;
    case INT_FTYPE_PINT_INT_INT_INT:
    case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
      nargs = 4;
      klass = load;
      memory = 0;
      constant = 3;
      break;
    default:
      gcc_unreachable ();
    }

  gcc_assert (nargs <= ARRAY_SIZE (xops));

  if (klass == store)
    {
      /* For stores, argument 0 is the destination address; the insn's
	 "target" operand becomes the memory (or register) destination.  */
      arg = CALL_EXPR_ARG (exp, 0);
      op = expand_normal (arg);
      gcc_assert (target == 0);
      if (memory)
	{
	  op = ix86_zero_extend_to_Pmode (op);
	  target = gen_rtx_MEM (tmode, op);
	  /* target at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
	  unsigned int align = get_pointer_alignment (arg);
	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
	    align = GET_MODE_ALIGNMENT (tmode);
	  if (MEM_ALIGN (target) < align)
	    set_mem_align (target, align);
	}
      else
	target = force_reg (tmode, op);
      /* Remaining arguments start at CALL_EXPR_ARG index 1.  */
      arg_adjust = 1;
    }
  else
    {
      arg_adjust = 0;
      if (optimize
	  || target == 0
	  || !register_operand (target, tmode)
	  || GET_MODE (target) != tmode)
	target = gen_reg_rtx (tmode);
    }

  for (i = 0; i < nargs; i++)
    {
      machine_mode mode = insn_p->operand[i + 1].mode;

      arg = CALL_EXPR_ARG (exp, i + arg_adjust);
      op = ix86_expand_unsigned_small_int_cst_argument (arg);

      if (i == memory)
	{
	  /* This must be the memory operand.  */
	  op = ix86_zero_extend_to_Pmode (op);
	  op = gen_rtx_MEM (mode, op);
	  /* op at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
	  unsigned int align = get_pointer_alignment (arg);
	  if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
	    align = GET_MODE_ALIGNMENT (mode);
	  if (MEM_ALIGN (op) < align)
	    set_mem_align (op, align);
	}
      else if (i == constant)
	{
	  /* This must be the constant.  */
	  if (!insn_p->operand[nargs].predicate(op, SImode))
	    {
	      error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
	      return const0_rtx;
	    }
	}
      else
	{
	  /* This must be register.  */
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  op = fixup_modeless_constant (op, mode);

	  /* NB: 3-operands load implied it's a mask load or v{p}expand*,
	     and that mask operand should be at the end.
	     Keep all-ones mask which would be simplified by the expander.  */
	  if (nargs == 3 && i == 2 && klass == load
	      && constm1_operand (op, mode)
	      && insn_p->operand[i].predicate (op, mode))
	    ;
	  else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    op = copy_to_mode_reg (mode, op);
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      xops[i]= op;
    }

  /* Generate the insn with the operand count the pattern requires.  */
  switch (nargs)
    {
    case 0:
      pat = GEN_FCN (icode) (target);
      break;
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
      break;
    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  /* Store-class builtins produce no value.  */
  return klass == store ? 0 : target;
}
14440 :
14441 : /* Return the integer constant in ARG. Constrain it to be in the range
14442 : of the subparts of VEC_TYPE; issue an error if not. */
14443 :
14444 : static int
14445 604 : get_element_number (tree vec_type, tree arg)
14446 : {
14447 604 : unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
14448 :
14449 604 : if (!tree_fits_uhwi_p (arg)
14450 604 : || (elt = tree_to_uhwi (arg), elt > max))
14451 : {
14452 0 : error ("selector must be an integer constant in the range "
14453 : "[0, %wi]", max);
14454 0 : return 0;
14455 : }
14456 :
14457 604 : return elt;
14458 : }
14459 :
14460 : /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
14461 : ix86_expand_vector_init. We DO have language-level syntax for this, in
14462 : the form of (type){ init-list }. Except that since we can't place emms
14463 : instructions from inside the compiler, we can't allow the use of MMX
14464 : registers unless the user explicitly asks for it. So we do *not* define
14465 : vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
14466 : we have builtins invoked by mmintrin.h that gives us license to emit
14467 : these sorts of instructions. */
14468 :
14469 : static rtx
14470 229 : ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
14471 : {
14472 229 : machine_mode tmode = TYPE_MODE (type);
14473 229 : machine_mode inner_mode = GET_MODE_INNER (tmode);
14474 229 : int i, n_elt = GET_MODE_NUNITS (tmode);
14475 229 : rtvec v = rtvec_alloc (n_elt);
14476 :
14477 229 : gcc_assert (VECTOR_MODE_P (tmode));
14478 229 : gcc_assert (call_expr_nargs (exp) == n_elt);
14479 :
14480 1203 : for (i = 0; i < n_elt; ++i)
14481 : {
14482 974 : rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
14483 974 : RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
14484 : }
14485 :
14486 229 : if (!target || !register_operand (target, tmode))
14487 0 : target = gen_reg_rtx (tmode);
14488 :
14489 229 : ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
14490 229 : return target;
14491 : }
14492 :
14493 : /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
14494 : ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
14495 : had a language-level syntax for referencing vector elements. */
14496 :
14497 : static rtx
14498 400 : ix86_expand_vec_ext_builtin (tree exp, rtx target)
14499 : {
14500 400 : machine_mode tmode, mode0;
14501 400 : tree arg0, arg1;
14502 400 : int elt;
14503 400 : rtx op0;
14504 :
14505 400 : arg0 = CALL_EXPR_ARG (exp, 0);
14506 400 : arg1 = CALL_EXPR_ARG (exp, 1);
14507 :
14508 400 : op0 = expand_normal (arg0);
14509 400 : elt = get_element_number (TREE_TYPE (arg0), arg1);
14510 :
14511 400 : tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
14512 400 : mode0 = TYPE_MODE (TREE_TYPE (arg0));
14513 400 : gcc_assert (VECTOR_MODE_P (mode0));
14514 :
14515 400 : op0 = force_reg (mode0, op0);
14516 :
14517 400 : if (optimize || !target || !register_operand (target, tmode))
14518 321 : target = gen_reg_rtx (tmode);
14519 :
14520 400 : ix86_expand_vector_extract (true, target, op0, elt);
14521 :
14522 400 : return target;
14523 : }
14524 :
14525 : /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
14526 : ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
14527 : a language-level syntax for referencing vector elements. */
14528 :
14529 : static rtx
14530 204 : ix86_expand_vec_set_builtin (tree exp)
14531 : {
14532 204 : machine_mode tmode, mode1;
14533 204 : tree arg0, arg1, arg2;
14534 204 : int elt;
14535 204 : rtx op0, op1, target;
14536 :
14537 204 : arg0 = CALL_EXPR_ARG (exp, 0);
14538 204 : arg1 = CALL_EXPR_ARG (exp, 1);
14539 204 : arg2 = CALL_EXPR_ARG (exp, 2);
14540 :
14541 204 : tmode = TYPE_MODE (TREE_TYPE (arg0));
14542 204 : mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
14543 204 : gcc_assert (VECTOR_MODE_P (tmode));
14544 :
14545 204 : op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
14546 204 : op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
14547 204 : elt = get_element_number (TREE_TYPE (arg0), arg2);
14548 :
14549 204 : if (GET_MODE (op1) != mode1)
14550 82 : op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
14551 :
14552 204 : op0 = force_reg (tmode, op0);
14553 204 : op1 = force_reg (mode1, op1);
14554 :
14555 : /* OP0 is the source of these builtin functions and shouldn't be
14556 : modified. Create a copy, use it and return it as target. */
14557 204 : target = gen_reg_rtx (tmode);
14558 204 : emit_move_insn (target, op0);
14559 204 : ix86_expand_vector_set (true, target, op1, elt);
14560 :
14561 204 : return target;
14562 : }
14563 :
14564 : /* Return true if the necessary isa options for this builtin exist,
14565 : else false.
14566 : fcode = DECL_MD_FUNCTION_CODE (fndecl); */
14567 : bool
14568 1294707 : ix86_check_builtin_isa_match (unsigned int fcode,
14569 : HOST_WIDE_INT* pbisa,
14570 : HOST_WIDE_INT* pbisa2)
14571 : {
14572 1294707 : HOST_WIDE_INT isa = ix86_isa_flags;
14573 1294707 : HOST_WIDE_INT isa2 = ix86_isa_flags2;
14574 1294707 : HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
14575 1294707 : HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
14576 1294707 : HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
14577 : /* The general case is we require all the ISAs specified in bisa{,2}
14578 : to be enabled.
14579 : The exceptions are:
14580 : OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
14581 : OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
14582 : OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
14583 : (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
14584 : OPTION_MASK_ISA2_AVXVNNI
14585 : (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
14586 : OPTION_MASK_ISA2_AVXIFMA
14587 : (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
14588 : OPTION_MASK_ISA2_AVXNECONVERT
14589 : OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
14590 : OPTION_MASK_ISA2_AVX10_2 or OPTION_MASK_ISA2_AVXVNNIINT8
14591 : OPTION_MASK_ISA2_AVX10_2 or OPTION_MASK_ISA2_AVXVNNIINT16
14592 : where for each such pair it is sufficient if either of the ISAs is
14593 : enabled, plus if it is ored with other options also those others.
14594 : OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE. */
14595 :
14596 : #define SHARE_BUILTIN(A1, A2, B1, B2) \
14597 : if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2)) \
14598 : && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2))) \
14599 : && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2)) \
14600 : || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2)))) \
14601 : { \
14602 : tmp_isa |= (A1) | (B1); \
14603 : tmp_isa2 |= (A2) | (B2); \
14604 : }
14605 :
14606 1294707 : SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
14607 1294707 : SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
14608 1294707 : SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
14609 1294707 : SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
14610 1294707 : OPTION_MASK_ISA2_AVXVNNI);
14611 1294707 : SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
14612 1294707 : OPTION_MASK_ISA2_AVXIFMA);
14613 1294707 : SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
14614 1294707 : OPTION_MASK_ISA2_AVXNECONVERT);
14615 1294707 : SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
14616 1294707 : OPTION_MASK_ISA2_VAES);
14617 1294707 : SHARE_BUILTIN (0, OPTION_MASK_ISA2_AVXVNNIINT8, 0,
14618 1294707 : OPTION_MASK_ISA2_AVX10_2);
14619 1294707 : SHARE_BUILTIN (0, OPTION_MASK_ISA2_AVXVNNIINT16, 0,
14620 1294707 : OPTION_MASK_ISA2_AVX10_2);
14621 1294707 : isa = tmp_isa;
14622 1294707 : isa2 = tmp_isa2;
14623 :
14624 1294707 : if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
14625 : /* __builtin_ia32_maskmovq requires MMX registers. */
14626 4563 : && fcode != IX86_BUILTIN_MASKMOVQ)
14627 : {
14628 4554 : bisa &= ~OPTION_MASK_ISA_MMX;
14629 4554 : bisa |= OPTION_MASK_ISA_SSE2;
14630 : }
14631 :
14632 1294707 : if (pbisa)
14633 173272 : *pbisa = bisa;
14634 1294707 : if (pbisa2)
14635 173272 : *pbisa2 = bisa2;
14636 :
14637 1294707 : return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
14638 : }
14639 :
14640 : /* Emit instructions to set the carry flag from ARG. */
14641 :
14642 : void
14643 13267 : ix86_expand_carry (rtx arg)
14644 : {
14645 13267 : if (!CONST_INT_P (arg) || arg == const0_rtx)
14646 : {
14647 13261 : arg = convert_to_mode (QImode, arg, 1);
14648 13261 : arg = copy_to_mode_reg (QImode, arg);
14649 13261 : emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
14650 : }
14651 : else
14652 6 : emit_insn (gen_x86_stc ());
14653 13267 : }
14654 :
14655 : /* Expand an expression EXP that calls a built-in function,
14656 : with result going to TARGET if that's convenient
14657 : (and in mode MODE if that's convenient).
14658 : SUBTARGET may be used as the target for computing one of EXP's operands.
14659 : IGNORE is nonzero if the value is to be ignored. */
14660 :
14661 : rtx
14662 174055 : ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
14663 : machine_mode mode, int ignore)
14664 : {
14665 174055 : size_t i;
14666 174055 : enum insn_code icode, icode2;
14667 174055 : tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
14668 174055 : tree arg0, arg1, arg2, arg3, arg4;
14669 174055 : rtx op0, op1, op2, op3, op4, pat, pat2, insn;
14670 174055 : machine_mode mode0, mode1, mode2, mode3, mode4;
14671 174055 : unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
14672 174055 : HOST_WIDE_INT bisa, bisa2;
14673 :
14674 : /* For CPU builtins that can be folded, fold first and expand the fold. */
14675 174055 : switch (fcode)
14676 : {
14677 197 : case IX86_BUILTIN_CPU_INIT:
14678 197 : {
14679 : /* Make it call __cpu_indicator_init in libgcc. */
14680 197 : tree call_expr, fndecl, type;
14681 197 : type = build_function_type_list (integer_type_node, NULL_TREE);
14682 197 : fndecl = build_fn_decl ("__cpu_indicator_init", type);
14683 197 : call_expr = build_call_expr (fndecl, 0);
14684 197 : return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
14685 : }
14686 586 : case IX86_BUILTIN_CPU_IS:
14687 586 : case IX86_BUILTIN_CPU_SUPPORTS:
14688 586 : {
14689 586 : tree arg0 = CALL_EXPR_ARG (exp, 0);
14690 586 : tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
14691 586 : gcc_assert (fold_expr != NULL_TREE);
14692 586 : return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
14693 : }
14694 : }
14695 :
14696 173272 : if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
14697 : {
14698 23 : bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
14699 23 : if (TARGET_ABI_X32)
14700 0 : bisa |= OPTION_MASK_ABI_X32;
14701 : else
14702 23 : bisa |= OPTION_MASK_ABI_64;
14703 23 : char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
14704 : (enum fpmath_unit) 0,
14705 : (enum prefer_vector_width) 0,
14706 : PVW_NONE, false, add_abi_p);
14707 23 : if (!opts)
14708 0 : error ("%qE needs unknown isa option", fndecl);
14709 : else
14710 : {
14711 23 : gcc_assert (opts != NULL);
14712 23 : error ("%qE needs isa option %s", fndecl, opts);
14713 23 : free (opts);
14714 : }
14715 23 : return expand_call (exp, target, ignore);
14716 : }
14717 :
14718 173249 : switch (fcode)
14719 : {
14720 35 : case IX86_BUILTIN_MASKMOVQ:
14721 35 : case IX86_BUILTIN_MASKMOVDQU:
14722 34 : icode = (fcode == IX86_BUILTIN_MASKMOVQ
14723 35 : ? CODE_FOR_mmx_maskmovq
14724 : : CODE_FOR_sse2_maskmovdqu);
14725 : /* Note the arg order is different from the operand order. */
14726 35 : arg1 = CALL_EXPR_ARG (exp, 0);
14727 35 : arg2 = CALL_EXPR_ARG (exp, 1);
14728 35 : arg0 = CALL_EXPR_ARG (exp, 2);
14729 35 : op0 = expand_normal (arg0);
14730 35 : op1 = expand_normal (arg1);
14731 35 : op2 = expand_normal (arg2);
14732 35 : mode0 = insn_data[icode].operand[0].mode;
14733 35 : mode1 = insn_data[icode].operand[1].mode;
14734 35 : mode2 = insn_data[icode].operand[2].mode;
14735 :
14736 35 : op0 = ix86_zero_extend_to_Pmode (op0);
14737 35 : op0 = gen_rtx_MEM (mode1, op0);
14738 :
14739 35 : if (!insn_data[icode].operand[0].predicate (op0, mode0))
14740 0 : op0 = copy_to_mode_reg (mode0, op0);
14741 35 : if (!insn_data[icode].operand[1].predicate (op1, mode1))
14742 2 : op1 = copy_to_mode_reg (mode1, op1);
14743 35 : if (!insn_data[icode].operand[2].predicate (op2, mode2))
14744 2 : op2 = copy_to_mode_reg (mode2, op2);
14745 35 : pat = GEN_FCN (icode) (op0, op1, op2);
14746 35 : if (! pat)
14747 56624 : return 0;
14748 35 : emit_insn (pat);
14749 35 : return 0;
14750 :
14751 22008 : case IX86_BUILTIN_LDMXCSR:
14752 22008 : op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
14753 22008 : target = assign_stack_temp (SImode, GET_MODE_SIZE (SImode));
14754 22008 : emit_move_insn (target, op0);
14755 22008 : emit_insn (gen_sse_ldmxcsr (target));
14756 22008 : return 0;
14757 :
14758 14785 : case IX86_BUILTIN_STMXCSR:
14759 14785 : target = assign_stack_temp (SImode, GET_MODE_SIZE (SImode));
14760 14785 : emit_insn (gen_sse_stmxcsr (target));
14761 14785 : return copy_to_mode_reg (SImode, target);
14762 :
14763 11 : case IX86_BUILTIN_CLFLUSH:
14764 11 : arg0 = CALL_EXPR_ARG (exp, 0);
14765 11 : op0 = expand_normal (arg0);
14766 11 : icode = CODE_FOR_sse2_clflush;
14767 11 : if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14768 5 : op0 = ix86_zero_extend_to_Pmode (op0);
14769 :
14770 11 : emit_insn (gen_sse2_clflush (op0));
14771 11 : return 0;
14772 :
14773 19 : case IX86_BUILTIN_CLWB:
14774 19 : arg0 = CALL_EXPR_ARG (exp, 0);
14775 19 : op0 = expand_normal (arg0);
14776 19 : icode = CODE_FOR_clwb;
14777 19 : if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14778 9 : op0 = ix86_zero_extend_to_Pmode (op0);
14779 :
14780 19 : emit_insn (gen_clwb (op0));
14781 19 : return 0;
14782 :
14783 19 : case IX86_BUILTIN_CLFLUSHOPT:
14784 19 : arg0 = CALL_EXPR_ARG (exp, 0);
14785 19 : op0 = expand_normal (arg0);
14786 19 : icode = CODE_FOR_clflushopt;
14787 19 : if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14788 9 : op0 = ix86_zero_extend_to_Pmode (op0);
14789 :
14790 19 : emit_insn (gen_clflushopt (op0));
14791 19 : return 0;
14792 :
14793 47 : case IX86_BUILTIN_MONITOR:
14794 47 : case IX86_BUILTIN_MONITORX:
14795 47 : arg0 = CALL_EXPR_ARG (exp, 0);
14796 47 : arg1 = CALL_EXPR_ARG (exp, 1);
14797 47 : arg2 = CALL_EXPR_ARG (exp, 2);
14798 47 : op0 = expand_normal (arg0);
14799 47 : op1 = expand_normal (arg1);
14800 47 : op2 = expand_normal (arg2);
14801 47 : if (!REG_P (op0))
14802 19 : op0 = ix86_zero_extend_to_Pmode (op0);
14803 47 : if (!REG_P (op1))
14804 22 : op1 = copy_to_mode_reg (SImode, op1);
14805 47 : if (!REG_P (op2))
14806 25 : op2 = copy_to_mode_reg (SImode, op2);
14807 :
14808 47 : emit_insn (fcode == IX86_BUILTIN_MONITOR
14809 26 : ? gen_sse3_monitor (Pmode, op0, op1, op2)
14810 21 : : gen_monitorx (Pmode, op0, op1, op2));
14811 47 : return 0;
14812 :
14813 25 : case IX86_BUILTIN_MWAIT:
14814 25 : arg0 = CALL_EXPR_ARG (exp, 0);
14815 25 : arg1 = CALL_EXPR_ARG (exp, 1);
14816 25 : op0 = expand_normal (arg0);
14817 25 : op1 = expand_normal (arg1);
14818 25 : if (!REG_P (op0))
14819 13 : op0 = copy_to_mode_reg (SImode, op0);
14820 25 : if (!REG_P (op1))
14821 11 : op1 = copy_to_mode_reg (SImode, op1);
14822 25 : emit_insn (gen_sse3_mwait (op0, op1));
14823 25 : return 0;
14824 :
14825 21 : case IX86_BUILTIN_MWAITX:
14826 21 : arg0 = CALL_EXPR_ARG (exp, 0);
14827 21 : arg1 = CALL_EXPR_ARG (exp, 1);
14828 21 : arg2 = CALL_EXPR_ARG (exp, 2);
14829 21 : op0 = expand_normal (arg0);
14830 21 : op1 = expand_normal (arg1);
14831 21 : op2 = expand_normal (arg2);
14832 21 : if (!REG_P (op0))
14833 11 : op0 = copy_to_mode_reg (SImode, op0);
14834 21 : if (!REG_P (op1))
14835 10 : op1 = copy_to_mode_reg (SImode, op1);
14836 21 : if (!REG_P (op2))
14837 11 : op2 = copy_to_mode_reg (SImode, op2);
14838 21 : emit_insn (gen_mwaitx (op0, op1, op2));
14839 21 : return 0;
14840 :
14841 21 : case IX86_BUILTIN_UMONITOR:
14842 21 : arg0 = CALL_EXPR_ARG (exp, 0);
14843 21 : op0 = expand_normal (arg0);
14844 :
14845 21 : op0 = ix86_zero_extend_to_Pmode (op0);
14846 21 : emit_insn (gen_umonitor (Pmode, op0));
14847 21 : return 0;
14848 :
14849 42 : case IX86_BUILTIN_UMWAIT:
14850 42 : case IX86_BUILTIN_TPAUSE:
14851 42 : arg0 = CALL_EXPR_ARG (exp, 0);
14852 42 : arg1 = CALL_EXPR_ARG (exp, 1);
14853 42 : op0 = expand_normal (arg0);
14854 42 : op1 = expand_normal (arg1);
14855 :
14856 42 : if (!REG_P (op0))
14857 20 : op0 = copy_to_mode_reg (SImode, op0);
14858 :
14859 42 : op1 = force_reg (DImode, op1);
14860 :
14861 42 : if (TARGET_64BIT)
14862 : {
14863 42 : op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
14864 : NULL, 1, OPTAB_DIRECT);
14865 42 : switch (fcode)
14866 : {
14867 : case IX86_BUILTIN_UMWAIT:
14868 : icode = CODE_FOR_umwait_rex64;
14869 : break;
14870 21 : case IX86_BUILTIN_TPAUSE:
14871 21 : icode = CODE_FOR_tpause_rex64;
14872 21 : break;
14873 0 : default:
14874 0 : gcc_unreachable ();
14875 : }
14876 :
14877 42 : op2 = gen_lowpart (SImode, op2);
14878 42 : op1 = gen_lowpart (SImode, op1);
14879 42 : pat = GEN_FCN (icode) (op0, op1, op2);
14880 : }
14881 : else
14882 : {
14883 0 : switch (fcode)
14884 : {
14885 : case IX86_BUILTIN_UMWAIT:
14886 : icode = CODE_FOR_umwait;
14887 : break;
14888 0 : case IX86_BUILTIN_TPAUSE:
14889 0 : icode = CODE_FOR_tpause;
14890 0 : break;
14891 0 : default:
14892 0 : gcc_unreachable ();
14893 : }
14894 0 : pat = GEN_FCN (icode) (op0, op1);
14895 : }
14896 :
14897 42 : if (!pat)
14898 : return 0;
14899 :
14900 42 : emit_insn (pat);
14901 :
14902 42 : if (target == 0
14903 42 : || !register_operand (target, QImode))
14904 0 : target = gen_reg_rtx (QImode);
14905 :
14906 42 : pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
14907 : const0_rtx);
14908 42 : emit_insn (gen_rtx_SET (target, pat));
14909 :
14910 42 : return target;
14911 :
14912 20 : case IX86_BUILTIN_TESTUI:
14913 20 : emit_insn (gen_testui ());
14914 :
14915 20 : if (target == 0
14916 20 : || !register_operand (target, QImode))
14917 0 : target = gen_reg_rtx (QImode);
14918 :
14919 20 : pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
14920 : const0_rtx);
14921 20 : emit_insn (gen_rtx_SET (target, pat));
14922 :
14923 20 : return target;
14924 :
14925 19 : case IX86_BUILTIN_CLZERO:
14926 19 : arg0 = CALL_EXPR_ARG (exp, 0);
14927 19 : op0 = expand_normal (arg0);
14928 19 : if (!REG_P (op0))
14929 9 : op0 = ix86_zero_extend_to_Pmode (op0);
14930 19 : emit_insn (gen_clzero (Pmode, op0));
14931 19 : return 0;
14932 :
14933 19 : case IX86_BUILTIN_CLDEMOTE:
14934 19 : arg0 = CALL_EXPR_ARG (exp, 0);
14935 19 : op0 = expand_normal (arg0);
14936 19 : icode = CODE_FOR_cldemote;
14937 19 : if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14938 9 : op0 = ix86_zero_extend_to_Pmode (op0);
14939 :
14940 19 : emit_insn (gen_cldemote (op0));
14941 19 : return 0;
14942 :
14943 11 : case IX86_BUILTIN_LOADIWKEY:
14944 11 : {
14945 11 : arg0 = CALL_EXPR_ARG (exp, 0);
14946 11 : arg1 = CALL_EXPR_ARG (exp, 1);
14947 11 : arg2 = CALL_EXPR_ARG (exp, 2);
14948 11 : arg3 = CALL_EXPR_ARG (exp, 3);
14949 :
14950 11 : op0 = expand_normal (arg0);
14951 11 : op1 = expand_normal (arg1);
14952 11 : op2 = expand_normal (arg2);
14953 11 : op3 = expand_normal (arg3);
14954 :
14955 11 : if (!REG_P (op0))
14956 5 : op0 = copy_to_mode_reg (V2DImode, op0);
14957 11 : if (!REG_P (op1))
14958 5 : op1 = copy_to_mode_reg (V2DImode, op1);
14959 11 : if (!REG_P (op2))
14960 5 : op2 = copy_to_mode_reg (V2DImode, op2);
14961 11 : if (!REG_P (op3))
14962 5 : op3 = copy_to_mode_reg (SImode, op3);
14963 :
14964 11 : emit_insn (gen_loadiwkey (op0, op1, op2, op3));
14965 :
14966 11 : return 0;
14967 : }
14968 :
14969 12 : case IX86_BUILTIN_AESDEC128KLU8:
14970 12 : icode = CODE_FOR_aesdec128klu8;
14971 12 : goto aesdecenc_expand;
14972 :
14973 12 : case IX86_BUILTIN_AESDEC256KLU8:
14974 12 : icode = CODE_FOR_aesdec256klu8;
14975 12 : goto aesdecenc_expand;
14976 :
14977 12 : case IX86_BUILTIN_AESENC128KLU8:
14978 12 : icode = CODE_FOR_aesenc128klu8;
14979 12 : goto aesdecenc_expand;
14980 :
14981 : case IX86_BUILTIN_AESENC256KLU8:
14982 : icode = CODE_FOR_aesenc256klu8;
14983 :
14984 48 : aesdecenc_expand:
14985 :
14986 48 : arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
14987 48 : arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
14988 48 : arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
14989 :
14990 48 : op0 = expand_normal (arg0);
14991 48 : op1 = expand_normal (arg1);
14992 48 : op2 = expand_normal (arg2);
14993 :
14994 48 : if (!address_operand (op0, V2DImode))
14995 : {
14996 16 : op0 = convert_memory_address (Pmode, op0);
14997 16 : op0 = copy_addr_to_reg (op0);
14998 : }
14999 48 : op0 = gen_rtx_MEM (V2DImode, op0);
15000 :
15001 48 : if (!REG_P (op1))
15002 20 : op1 = copy_to_mode_reg (V2DImode, op1);
15003 :
15004 48 : if (!address_operand (op2, VOIDmode))
15005 : {
15006 16 : op2 = convert_memory_address (Pmode, op2);
15007 16 : op2 = copy_addr_to_reg (op2);
15008 : }
15009 48 : op2 = gen_rtx_MEM (BLKmode, op2);
15010 :
15011 48 : emit_insn (GEN_FCN (icode) (op1, op1, op2));
15012 :
15013 48 : if (target == 0)
15014 4 : target = gen_reg_rtx (QImode);
15015 :
15016 : /* NB: For aesenc/aesdec keylocker insn, ZF will be set when runtime
15017 : error occurs. Then the output should be cleared for safety. */
15018 48 : rtx_code_label *ok_label;
15019 48 : rtx tmp;
15020 :
15021 48 : tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
15022 48 : pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
15023 48 : ok_label = gen_label_rtx ();
15024 48 : emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
15025 : true, ok_label);
15026 : /* Usually the runtime error seldom occur, so predict OK path as
15027 : hotspot to optimize it as fallthrough block. */
15028 48 : predict_jump (REG_BR_PROB_BASE * 90 / 100);
15029 :
15030 48 : emit_insn (gen_rtx_SET (op1, const0_rtx));
15031 :
15032 48 : emit_label (ok_label);
15033 48 : emit_insn (gen_rtx_SET (target, pat));
15034 48 : emit_insn (gen_rtx_SET (op0, op1));
15035 :
15036 48 : return target;
15037 :
15038 11 : case IX86_BUILTIN_AESDECWIDE128KLU8:
15039 11 : icode = CODE_FOR_aesdecwide128klu8;
15040 11 : goto wideaesdecenc_expand;
15041 :
15042 11 : case IX86_BUILTIN_AESDECWIDE256KLU8:
15043 11 : icode = CODE_FOR_aesdecwide256klu8;
15044 11 : goto wideaesdecenc_expand;
15045 :
15046 11 : case IX86_BUILTIN_AESENCWIDE128KLU8:
15047 11 : icode = CODE_FOR_aesencwide128klu8;
15048 11 : goto wideaesdecenc_expand;
15049 :
15050 : case IX86_BUILTIN_AESENCWIDE256KLU8:
15051 : icode = CODE_FOR_aesencwide256klu8;
15052 :
15053 44 : wideaesdecenc_expand:
15054 :
15055 44 : rtx xmm_regs[8];
15056 44 : rtx op;
15057 :
15058 44 : arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
15059 44 : arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
15060 44 : arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
15061 :
15062 44 : op0 = expand_normal (arg0);
15063 44 : op1 = expand_normal (arg1);
15064 44 : op2 = expand_normal (arg2);
15065 :
15066 44 : if (GET_MODE (op1) != Pmode)
15067 0 : op1 = convert_to_mode (Pmode, op1, 1);
15068 :
15069 44 : if (!address_operand (op2, VOIDmode))
15070 : {
15071 16 : op2 = convert_memory_address (Pmode, op2);
15072 16 : op2 = copy_addr_to_reg (op2);
15073 : }
15074 44 : op2 = gen_rtx_MEM (BLKmode, op2);
15075 :
15076 440 : for (i = 0; i < 8; i++)
15077 : {
15078 352 : xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
15079 :
15080 352 : op = gen_rtx_MEM (V2DImode,
15081 352 : plus_constant (Pmode, op1, (i * 16)));
15082 :
15083 352 : emit_move_insn (xmm_regs[i], op);
15084 : }
15085 :
15086 44 : emit_insn (GEN_FCN (icode) (op2));
15087 :
15088 44 : if (target == 0)
15089 0 : target = gen_reg_rtx (QImode);
15090 :
15091 44 : tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
15092 44 : pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
15093 44 : ok_label = gen_label_rtx ();
15094 44 : emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
15095 : true, ok_label);
15096 44 : predict_jump (REG_BR_PROB_BASE * 90 / 100);
15097 :
15098 440 : for (i = 0; i < 8; i++)
15099 352 : emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
15100 :
15101 44 : emit_label (ok_label);
15102 44 : emit_insn (gen_rtx_SET (target, pat));
15103 :
15104 44 : if (GET_MODE (op0) != Pmode)
15105 0 : op0 = convert_to_mode (Pmode, op0, 1);
15106 :
15107 396 : for (i = 0; i < 8; i++)
15108 : {
15109 352 : op = gen_rtx_MEM (V2DImode,
15110 352 : plus_constant (Pmode, op0, (i * 16)));
15111 352 : emit_move_insn (op, xmm_regs[i]);
15112 : }
15113 :
15114 : return target;
15115 :
15116 13 : case IX86_BUILTIN_ENCODEKEY128U32:
15117 13 : {
15118 13 : rtx op, xmm_regs[7];
15119 :
15120 13 : arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
15121 13 : arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
15122 13 : arg2 = CALL_EXPR_ARG (exp, 2); // void *h
15123 :
15124 13 : op0 = expand_normal (arg0);
15125 13 : op1 = expand_normal (arg1);
15126 13 : op2 = expand_normal (arg2);
15127 :
15128 13 : if (!REG_P (op0))
15129 7 : op0 = copy_to_mode_reg (SImode, op0);
15130 :
15131 13 : if (GET_MODE (op2) != Pmode)
15132 1 : op2 = convert_to_mode (Pmode, op2, 1);
15133 :
15134 13 : op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
15135 13 : emit_move_insn (op, op1);
15136 :
15137 65 : for (i = 0; i < 3; i++)
15138 39 : xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
15139 :
15140 13 : if (target == 0 || !register_operand (target, SImode))
15141 2 : target = gen_reg_rtx (SImode);
15142 :
15143 13 : emit_insn (gen_encodekey128u32 (target, op0));
15144 :
15145 65 : for (i = 0; i < 3; i++)
15146 : {
15147 39 : op = gen_rtx_MEM (V2DImode,
15148 39 : plus_constant (Pmode, op2, (i * 16)));
15149 39 : emit_move_insn (op, xmm_regs[i]);
15150 : }
15151 :
15152 13 : return target;
15153 : }
15154 13 : case IX86_BUILTIN_ENCODEKEY256U32:
15155 13 : {
15156 13 : rtx op, xmm_regs[7];
15157 :
15158 13 : arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
15159 13 : arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
15160 13 : arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
15161 13 : arg3 = CALL_EXPR_ARG (exp, 3); // void *h
15162 :
15163 13 : op0 = expand_normal (arg0);
15164 13 : op1 = expand_normal (arg1);
15165 13 : op2 = expand_normal (arg2);
15166 13 : op3 = expand_normal (arg3);
15167 :
15168 13 : if (!REG_P (op0))
15169 7 : op0 = copy_to_mode_reg (SImode, op0);
15170 :
15171 13 : if (GET_MODE (op3) != Pmode)
15172 1 : op3 = convert_to_mode (Pmode, op3, 1);
15173 :
15174 : /* Force to use xmm0, xmm1 for keylow, keyhi*/
15175 13 : op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
15176 13 : emit_move_insn (op, op1);
15177 13 : op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
15178 13 : emit_move_insn (op, op2);
15179 :
15180 78 : for (i = 0; i < 4; i++)
15181 52 : xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
15182 :
15183 13 : if (target == 0 || !register_operand (target, SImode))
15184 2 : target = gen_reg_rtx (SImode);
15185 :
15186 13 : emit_insn (gen_encodekey256u32 (target, op0));
15187 :
15188 78 : for (i = 0; i < 4; i++)
15189 : {
15190 52 : op = gen_rtx_MEM (V2DImode,
15191 52 : plus_constant (Pmode, op3, (i * 16)));
15192 52 : emit_move_insn (op, xmm_regs[i]);
15193 : }
15194 :
15195 13 : return target;
15196 : }
15197 :
15198 48 : case IX86_BUILTIN_PREFETCH:
15199 48 : {
15200 48 : arg0 = CALL_EXPR_ARG (exp, 0); // const void *
15201 48 : arg1 = CALL_EXPR_ARG (exp, 1); // const int
15202 48 : arg2 = CALL_EXPR_ARG (exp, 2); // const int
15203 48 : arg3 = CALL_EXPR_ARG (exp, 3); // const int
15204 :
15205 48 : op0 = expand_normal (arg0);
15206 48 : op1 = expand_normal (arg1);
15207 48 : op2 = expand_normal (arg2);
15208 48 : op3 = expand_normal (arg3);
15209 :
15210 48 : if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
15211 : {
15212 0 : error ("second, third and fourth argument must be a const");
15213 0 : return const0_rtx;
15214 : }
15215 :
15216 48 : if (!IN_RANGE (INTVAL (op1), 0, 2))
15217 : {
15218 1 : warning (0, "invalid second argument to"
15219 : " %<__builtin_ia32_prefetch%>; using zero");
15220 1 : op1 = const0_rtx;
15221 : }
15222 :
15223 48 : if (INTVAL (op3) == 1)
15224 : {
15225 4 : if (!IN_RANGE (INTVAL (op2), 2, 3))
15226 : {
15227 1 : error ("invalid third argument");
15228 1 : return const0_rtx;
15229 : }
15230 :
15231 3 : if (TARGET_64BIT && TARGET_PREFETCHI
15232 6 : && local_func_symbolic_operand (op0, GET_MODE (op0)))
15233 2 : emit_insn (gen_prefetchi (op0, op2));
15234 : else
15235 : {
15236 1 : warning (0, "instruction prefetch applies when in 64-bit mode"
15237 : " with RIP-relative addressing and"
15238 : " option %<-mprefetchi%>;"
15239 : " they stay NOPs otherwise");
15240 1 : emit_insn (gen_nop ());
15241 : }
15242 : }
15243 : else
15244 : {
15245 44 : if (INTVAL (op3) != 0)
15246 1 : warning (0, "invalid forth argument to"
15247 : " %<__builtin_ia32_prefetch%>; using zero");
15248 :
15249 44 : if (!address_operand (op0, VOIDmode))
15250 : {
15251 10 : op0 = convert_memory_address (Pmode, op0);
15252 10 : op0 = copy_addr_to_reg (op0);
15253 : }
15254 :
15255 44 : if (!IN_RANGE (INTVAL (op2), 0, 3))
15256 : {
15257 1 : warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
15258 1 : op2 = const0_rtx;
15259 : }
15260 :
15261 44 : if (TARGET_3DNOW
15262 26 : || TARGET_PREFETCH_SSE
15263 0 : || TARGET_PRFCHW
15264 0 : || TARGET_MOVRS)
15265 44 : emit_insn (gen_prefetch (op0, op1, op2));
15266 0 : else if (!MEM_P (op0) && side_effects_p (op0))
15267 : /* Don't do anything with direct references to volatile memory,
15268 : but generate code to handle other side effects. */
15269 0 : emit_insn (op0);
15270 : }
15271 :
15272 : return 0;
15273 : }
15274 :
15275 21 : case IX86_BUILTIN_PREFETCHI:
15276 21 : {
15277 21 : arg0 = CALL_EXPR_ARG (exp, 0); // const void *
15278 21 : arg1 = CALL_EXPR_ARG (exp, 1); // const int
15279 :
15280 21 : op0 = expand_normal (arg0);
15281 21 : op1 = expand_normal (arg1);
15282 :
15283 21 : if (!CONST_INT_P (op1))
15284 : {
15285 0 : error ("second argument must be a const");
15286 0 : return const0_rtx;
15287 : }
15288 :
15289 : /* GOT/PLT_PIC should not be available for instruction prefetch.
15290 : It must be real instruction address. */
15291 21 : if (TARGET_64BIT
15292 21 : && local_func_symbolic_operand (op0, GET_MODE (op0)))
15293 4 : emit_insn (gen_prefetchi (op0, op1));
15294 : else
15295 : {
15296 : /* Ignore the hint. */
15297 17 : warning (0, "instruction prefetch applies when in 64-bit mode"
15298 : " with RIP-relative addressing and"
15299 : " option %<-mprefetchi%>;"
15300 : " they stay NOPs otherwise");
15301 17 : emit_insn (gen_nop ());
15302 : }
15303 :
15304 : return 0;
15305 : }
15306 :
15307 53 : case IX86_BUILTIN_URDMSR:
15308 53 : case IX86_BUILTIN_UWRMSR:
15309 53 : {
15310 53 : arg0 = CALL_EXPR_ARG (exp, 0);
15311 53 : op0 = expand_normal (arg0);
15312 :
15313 53 : if (CONST_INT_P (op0))
15314 : {
15315 12 : unsigned HOST_WIDE_INT val = UINTVAL (op0);
15316 12 : if (val > 0xffffffff)
15317 2 : op0 = force_reg (DImode, op0);
15318 : }
15319 : else
15320 41 : op0 = force_reg (DImode, op0);
15321 :
15322 53 : if (fcode == IX86_BUILTIN_UWRMSR)
15323 : {
15324 26 : arg1 = CALL_EXPR_ARG (exp, 1);
15325 26 : op1 = expand_normal (arg1);
15326 26 : op1 = force_reg (DImode, op1);
15327 26 : icode = CODE_FOR_uwrmsr;
15328 26 : target = 0;
15329 : }
15330 : else
15331 : {
15332 27 : if (target == 0 || !register_operand (target, DImode))
15333 1 : target = gen_reg_rtx (DImode);
15334 : icode = CODE_FOR_urdmsr;
15335 : op1 = op0;
15336 : op0 = target;
15337 : }
15338 53 : emit_insn (GEN_FCN (icode) (op0, op1));
15339 53 : return target;
15340 : }
15341 :
15342 229 : case IX86_BUILTIN_VEC_INIT_V2SI:
15343 229 : case IX86_BUILTIN_VEC_INIT_V4HI:
15344 229 : case IX86_BUILTIN_VEC_INIT_V8QI:
15345 229 : return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
15346 :
15347 400 : case IX86_BUILTIN_VEC_EXT_V2DF:
15348 400 : case IX86_BUILTIN_VEC_EXT_V2DI:
15349 400 : case IX86_BUILTIN_VEC_EXT_V4SF:
15350 400 : case IX86_BUILTIN_VEC_EXT_V4SI:
15351 400 : case IX86_BUILTIN_VEC_EXT_V8HI:
15352 400 : case IX86_BUILTIN_VEC_EXT_V2SI:
15353 400 : case IX86_BUILTIN_VEC_EXT_V4HI:
15354 400 : case IX86_BUILTIN_VEC_EXT_V16QI:
15355 400 : return ix86_expand_vec_ext_builtin (exp, target);
15356 :
15357 204 : case IX86_BUILTIN_VEC_SET_V2DI:
15358 204 : case IX86_BUILTIN_VEC_SET_V4SF:
15359 204 : case IX86_BUILTIN_VEC_SET_V4SI:
15360 204 : case IX86_BUILTIN_VEC_SET_V8HI:
15361 204 : case IX86_BUILTIN_VEC_SET_V4HI:
15362 204 : case IX86_BUILTIN_VEC_SET_V16QI:
15363 204 : return ix86_expand_vec_set_builtin (exp);
15364 :
15365 0 : case IX86_BUILTIN_NANQ:
15366 0 : case IX86_BUILTIN_NANSQ:
15367 0 : return expand_call (exp, target, ignore);
15368 :
15369 18 : case IX86_BUILTIN_RDPID:
15370 :
15371 18 : op0 = gen_reg_rtx (word_mode);
15372 :
15373 18 : if (TARGET_64BIT)
15374 : {
15375 18 : insn = gen_rdpid_rex64 (op0);
15376 18 : op0 = convert_to_mode (SImode, op0, 1);
15377 : }
15378 : else
15379 0 : insn = gen_rdpid (op0);
15380 :
15381 18 : emit_insn (insn);
15382 :
15383 18 : if (target == 0
15384 18 : || !register_operand (target, SImode))
15385 0 : target = gen_reg_rtx (SImode);
15386 :
15387 18 : emit_move_insn (target, op0);
15388 18 : return target;
15389 :
15390 76 : case IX86_BUILTIN_2INTERSECTD512:
15391 76 : case IX86_BUILTIN_2INTERSECTQ512:
15392 76 : case IX86_BUILTIN_2INTERSECTD256:
15393 76 : case IX86_BUILTIN_2INTERSECTQ256:
15394 76 : case IX86_BUILTIN_2INTERSECTD128:
15395 76 : case IX86_BUILTIN_2INTERSECTQ128:
15396 76 : arg0 = CALL_EXPR_ARG (exp, 0);
15397 76 : arg1 = CALL_EXPR_ARG (exp, 1);
15398 76 : arg2 = CALL_EXPR_ARG (exp, 2);
15399 76 : arg3 = CALL_EXPR_ARG (exp, 3);
15400 76 : op0 = expand_normal (arg0);
15401 76 : op1 = expand_normal (arg1);
15402 76 : op2 = expand_normal (arg2);
15403 76 : op3 = expand_normal (arg3);
15404 :
15405 76 : if (!address_operand (op0, VOIDmode))
15406 : {
15407 26 : op0 = convert_memory_address (Pmode, op0);
15408 26 : op0 = copy_addr_to_reg (op0);
15409 : }
15410 76 : if (!address_operand (op1, VOIDmode))
15411 : {
15412 26 : op1 = convert_memory_address (Pmode, op1);
15413 26 : op1 = copy_addr_to_reg (op1);
15414 : }
15415 :
15416 76 : switch (fcode)
15417 : {
15418 : case IX86_BUILTIN_2INTERSECTD512:
15419 : mode4 = P2HImode;
15420 : icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
15421 : break;
15422 : case IX86_BUILTIN_2INTERSECTQ512:
15423 : mode4 = P2QImode;
15424 : icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
15425 : break;
15426 : case IX86_BUILTIN_2INTERSECTD256:
15427 : mode4 = P2QImode;
15428 : icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
15429 : break;
15430 : case IX86_BUILTIN_2INTERSECTQ256:
15431 : mode4 = P2QImode;
15432 : icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
15433 : break;
15434 : case IX86_BUILTIN_2INTERSECTD128:
15435 : mode4 = P2QImode;
15436 : icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
15437 : break;
15438 : case IX86_BUILTIN_2INTERSECTQ128:
15439 : mode4 = P2QImode;
15440 : icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
15441 : break;
15442 0 : default:
15443 0 : gcc_unreachable ();
15444 : }
15445 :
15446 76 : mode2 = insn_data[icode].operand[1].mode;
15447 76 : mode3 = insn_data[icode].operand[2].mode;
15448 76 : if (!insn_data[icode].operand[1].predicate (op2, mode2))
15449 26 : op2 = copy_to_mode_reg (mode2, op2);
15450 76 : if (!insn_data[icode].operand[2].predicate (op3, mode3))
15451 6 : op3 = copy_to_mode_reg (mode3, op3);
15452 :
15453 76 : op4 = gen_reg_rtx (mode4);
15454 76 : emit_insn (GEN_FCN (icode) (op4, op2, op3));
15455 76 : mode0 = mode4 == P2HImode ? HImode : QImode;
15456 76 : emit_move_insn (gen_rtx_MEM (mode0, op0),
15457 76 : gen_lowpart (mode0, op4));
15458 76 : emit_move_insn (gen_rtx_MEM (mode0, op1),
15459 : gen_highpart (mode0, op4));
15460 :
15461 76 : return 0;
15462 :
15463 102 : case IX86_BUILTIN_RDPMC:
15464 102 : case IX86_BUILTIN_RDTSC:
15465 102 : case IX86_BUILTIN_RDTSCP:
15466 102 : case IX86_BUILTIN_XGETBV:
15467 :
15468 102 : op0 = gen_reg_rtx (DImode);
15469 102 : op1 = gen_reg_rtx (DImode);
15470 :
15471 102 : if (fcode == IX86_BUILTIN_RDPMC)
15472 : {
15473 22 : arg0 = CALL_EXPR_ARG (exp, 0);
15474 22 : op2 = expand_normal (arg0);
15475 22 : if (!register_operand (op2, SImode))
15476 11 : op2 = copy_to_mode_reg (SImode, op2);
15477 :
15478 22 : insn = (TARGET_64BIT
15479 22 : ? gen_rdpmc_rex64 (op0, op1, op2)
15480 0 : : gen_rdpmc (op0, op2));
15481 22 : emit_insn (insn);
15482 : }
15483 80 : else if (fcode == IX86_BUILTIN_XGETBV)
15484 : {
15485 22 : arg0 = CALL_EXPR_ARG (exp, 0);
15486 22 : op2 = expand_normal (arg0);
15487 22 : if (!register_operand (op2, SImode))
15488 1 : op2 = copy_to_mode_reg (SImode, op2);
15489 :
15490 22 : insn = (TARGET_64BIT
15491 22 : ? gen_xgetbv_rex64 (op0, op1, op2)
15492 0 : : gen_xgetbv (op0, op2));
15493 22 : emit_insn (insn);
15494 : }
15495 58 : else if (fcode == IX86_BUILTIN_RDTSC)
15496 : {
15497 36 : insn = (TARGET_64BIT
15498 36 : ? gen_rdtsc_rex64 (op0, op1)
15499 2 : : gen_rdtsc (op0));
15500 36 : emit_insn (insn);
15501 : }
15502 : else
15503 : {
15504 22 : op2 = gen_reg_rtx (SImode);
15505 :
15506 22 : insn = (TARGET_64BIT
15507 22 : ? gen_rdtscp_rex64 (op0, op1, op2)
15508 0 : : gen_rdtscp (op0, op2));
15509 22 : emit_insn (insn);
15510 :
15511 22 : arg0 = CALL_EXPR_ARG (exp, 0);
15512 22 : op4 = expand_normal (arg0);
15513 22 : if (!address_operand (op4, VOIDmode))
15514 : {
15515 10 : op4 = convert_memory_address (Pmode, op4);
15516 10 : op4 = copy_addr_to_reg (op4);
15517 : }
15518 22 : emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
15519 : }
15520 :
15521 102 : if (target == 0
15522 102 : || !register_operand (target, DImode))
15523 10 : target = gen_reg_rtx (DImode);
15524 :
15525 102 : if (TARGET_64BIT)
15526 : {
15527 100 : op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
15528 : op1, 1, OPTAB_DIRECT);
15529 100 : op0 = expand_simple_binop (DImode, IOR, op0, op1,
15530 : op0, 1, OPTAB_DIRECT);
15531 : }
15532 :
15533 102 : emit_move_insn (target, op0);
15534 102 : return target;
15535 :
15536 61 : case IX86_BUILTIN_ENQCMD:
15537 61 : case IX86_BUILTIN_ENQCMDS:
15538 61 : case IX86_BUILTIN_MOVDIR64B:
15539 :
15540 61 : arg0 = CALL_EXPR_ARG (exp, 0);
15541 61 : arg1 = CALL_EXPR_ARG (exp, 1);
15542 61 : op0 = expand_normal (arg0);
15543 61 : op1 = expand_normal (arg1);
15544 :
15545 61 : op0 = ix86_zero_extend_to_Pmode (op0);
15546 61 : if (!address_operand (op1, VOIDmode))
15547 : {
15548 28 : op1 = convert_memory_address (Pmode, op1);
15549 28 : op1 = copy_addr_to_reg (op1);
15550 : }
15551 61 : op1 = gen_rtx_MEM (XImode, op1);
15552 :
15553 61 : if (fcode == IX86_BUILTIN_MOVDIR64B)
15554 : {
15555 24 : emit_insn (gen_movdir64b (Pmode, op0, op1));
15556 23 : return 0;
15557 : }
15558 : else
15559 : {
15560 38 : if (target == 0
15561 38 : || !register_operand (target, SImode))
15562 0 : target = gen_reg_rtx (SImode);
15563 :
15564 38 : emit_move_insn (target, const0_rtx);
15565 38 : target = gen_rtx_SUBREG (QImode, target, 0);
15566 :
15567 19 : int unspecv = (fcode == IX86_BUILTIN_ENQCMD
15568 38 : ? UNSPECV_ENQCMD
15569 : : UNSPECV_ENQCMDS);
15570 38 : icode = code_for_enqcmd (unspecv, Pmode);
15571 38 : emit_insn (GEN_FCN (icode) (op0, op1));
15572 :
15573 38 : emit_insn
15574 38 : (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
15575 : gen_rtx_fmt_ee (EQ, QImode,
15576 : gen_rtx_REG (CCZmode, FLAGS_REG),
15577 : const0_rtx)));
15578 38 : return SUBREG_REG (target);
15579 : }
15580 :
15581 14775 : case IX86_BUILTIN_FXSAVE:
15582 14775 : case IX86_BUILTIN_FXRSTOR:
15583 14775 : case IX86_BUILTIN_FXSAVE64:
15584 14775 : case IX86_BUILTIN_FXRSTOR64:
15585 14775 : case IX86_BUILTIN_FNSTENV:
15586 14775 : case IX86_BUILTIN_FLDENV:
15587 14775 : mode0 = BLKmode;
15588 14775 : switch (fcode)
15589 : {
15590 : case IX86_BUILTIN_FXSAVE:
15591 : icode = CODE_FOR_fxsave;
15592 : break;
15593 19 : case IX86_BUILTIN_FXRSTOR:
15594 19 : icode = CODE_FOR_fxrstor;
15595 19 : break;
15596 23 : case IX86_BUILTIN_FXSAVE64:
15597 23 : icode = CODE_FOR_fxsave64;
15598 23 : break;
15599 21 : case IX86_BUILTIN_FXRSTOR64:
15600 21 : icode = CODE_FOR_fxrstor64;
15601 21 : break;
15602 7257 : case IX86_BUILTIN_FNSTENV:
15603 7257 : icode = CODE_FOR_fnstenv;
15604 7257 : break;
15605 7435 : case IX86_BUILTIN_FLDENV:
15606 7435 : icode = CODE_FOR_fldenv;
15607 7435 : break;
15608 0 : default:
15609 0 : gcc_unreachable ();
15610 : }
15611 :
15612 14775 : arg0 = CALL_EXPR_ARG (exp, 0);
15613 14775 : op0 = expand_normal (arg0);
15614 :
15615 14775 : if (!address_operand (op0, VOIDmode))
15616 : {
15617 36 : op0 = convert_memory_address (Pmode, op0);
15618 36 : op0 = copy_addr_to_reg (op0);
15619 : }
15620 14775 : op0 = gen_rtx_MEM (mode0, op0);
15621 :
15622 14775 : pat = GEN_FCN (icode) (op0);
15623 14775 : if (pat)
15624 14775 : emit_insn (pat);
15625 : return 0;
15626 :
15627 21 : case IX86_BUILTIN_XSETBV:
15628 21 : arg0 = CALL_EXPR_ARG (exp, 0);
15629 21 : arg1 = CALL_EXPR_ARG (exp, 1);
15630 21 : op0 = expand_normal (arg0);
15631 21 : op1 = expand_normal (arg1);
15632 :
15633 21 : if (!REG_P (op0))
15634 1 : op0 = copy_to_mode_reg (SImode, op0);
15635 :
15636 21 : op1 = force_reg (DImode, op1);
15637 :
15638 21 : if (TARGET_64BIT)
15639 : {
15640 21 : op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
15641 : NULL, 1, OPTAB_DIRECT);
15642 :
15643 21 : icode = CODE_FOR_xsetbv_rex64;
15644 :
15645 21 : op2 = gen_lowpart (SImode, op2);
15646 21 : op1 = gen_lowpart (SImode, op1);
15647 21 : pat = GEN_FCN (icode) (op0, op1, op2);
15648 : }
15649 : else
15650 : {
15651 0 : icode = CODE_FOR_xsetbv;
15652 :
15653 0 : pat = GEN_FCN (icode) (op0, op1);
15654 : }
15655 21 : if (pat)
15656 21 : emit_insn (pat);
15657 : return 0;
15658 :
15659 232 : case IX86_BUILTIN_XSAVE:
15660 232 : case IX86_BUILTIN_XRSTOR:
15661 232 : case IX86_BUILTIN_XSAVE64:
15662 232 : case IX86_BUILTIN_XRSTOR64:
15663 232 : case IX86_BUILTIN_XSAVEOPT:
15664 232 : case IX86_BUILTIN_XSAVEOPT64:
15665 232 : case IX86_BUILTIN_XSAVES:
15666 232 : case IX86_BUILTIN_XRSTORS:
15667 232 : case IX86_BUILTIN_XSAVES64:
15668 232 : case IX86_BUILTIN_XRSTORS64:
15669 232 : case IX86_BUILTIN_XSAVEC:
15670 232 : case IX86_BUILTIN_XSAVEC64:
15671 232 : arg0 = CALL_EXPR_ARG (exp, 0);
15672 232 : arg1 = CALL_EXPR_ARG (exp, 1);
15673 232 : op0 = expand_normal (arg0);
15674 232 : op1 = expand_normal (arg1);
15675 :
15676 232 : if (!address_operand (op0, VOIDmode))
15677 : {
15678 108 : op0 = convert_memory_address (Pmode, op0);
15679 108 : op0 = copy_addr_to_reg (op0);
15680 : }
15681 232 : op0 = gen_rtx_MEM (BLKmode, op0);
15682 :
15683 232 : op1 = force_reg (DImode, op1);
15684 :
15685 232 : if (TARGET_64BIT)
15686 : {
15687 232 : op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
15688 : NULL, 1, OPTAB_DIRECT);
15689 232 : switch (fcode)
15690 : {
15691 : case IX86_BUILTIN_XSAVE:
15692 : icode = CODE_FOR_xsave_rex64;
15693 : break;
15694 19 : case IX86_BUILTIN_XRSTOR:
15695 19 : icode = CODE_FOR_xrstor_rex64;
15696 19 : break;
15697 21 : case IX86_BUILTIN_XSAVE64:
15698 21 : icode = CODE_FOR_xsave64;
15699 21 : break;
15700 21 : case IX86_BUILTIN_XRSTOR64:
15701 21 : icode = CODE_FOR_xrstor64;
15702 21 : break;
15703 19 : case IX86_BUILTIN_XSAVEOPT:
15704 19 : icode = CODE_FOR_xsaveopt_rex64;
15705 19 : break;
15706 19 : case IX86_BUILTIN_XSAVEOPT64:
15707 19 : icode = CODE_FOR_xsaveopt64;
15708 19 : break;
15709 19 : case IX86_BUILTIN_XSAVES:
15710 19 : icode = CODE_FOR_xsaves_rex64;
15711 19 : break;
15712 19 : case IX86_BUILTIN_XRSTORS:
15713 19 : icode = CODE_FOR_xrstors_rex64;
15714 19 : break;
15715 19 : case IX86_BUILTIN_XSAVES64:
15716 19 : icode = CODE_FOR_xsaves64;
15717 19 : break;
15718 19 : case IX86_BUILTIN_XRSTORS64:
15719 19 : icode = CODE_FOR_xrstors64;
15720 19 : break;
15721 19 : case IX86_BUILTIN_XSAVEC:
15722 19 : icode = CODE_FOR_xsavec_rex64;
15723 19 : break;
15724 19 : case IX86_BUILTIN_XSAVEC64:
15725 19 : icode = CODE_FOR_xsavec64;
15726 19 : break;
15727 0 : default:
15728 0 : gcc_unreachable ();
15729 : }
15730 :
15731 232 : op2 = gen_lowpart (SImode, op2);
15732 232 : op1 = gen_lowpart (SImode, op1);
15733 232 : pat = GEN_FCN (icode) (op0, op1, op2);
15734 : }
15735 : else
15736 : {
15737 0 : switch (fcode)
15738 : {
15739 : case IX86_BUILTIN_XSAVE:
15740 : icode = CODE_FOR_xsave;
15741 : break;
15742 : case IX86_BUILTIN_XRSTOR:
15743 : icode = CODE_FOR_xrstor;
15744 : break;
15745 : case IX86_BUILTIN_XSAVEOPT:
15746 : icode = CODE_FOR_xsaveopt;
15747 : break;
15748 : case IX86_BUILTIN_XSAVES:
15749 : icode = CODE_FOR_xsaves;
15750 : break;
15751 : case IX86_BUILTIN_XRSTORS:
15752 : icode = CODE_FOR_xrstors;
15753 : break;
15754 : case IX86_BUILTIN_XSAVEC:
15755 : icode = CODE_FOR_xsavec;
15756 : break;
15757 0 : default:
15758 0 : gcc_unreachable ();
15759 : }
15760 0 : pat = GEN_FCN (icode) (op0, op1);
15761 : }
15762 :
15763 232 : if (pat)
15764 232 : emit_insn (pat);
15765 : return 0;
15766 :
15767 144 : case IX86_BUILTIN_LDTILECFG:
15768 144 : case IX86_BUILTIN_STTILECFG:
15769 144 : arg0 = CALL_EXPR_ARG (exp, 0);
15770 144 : op0 = expand_normal (arg0);
15771 :
15772 144 : if (!address_operand (op0, VOIDmode))
15773 : {
15774 8 : op0 = convert_memory_address (Pmode, op0);
15775 8 : op0 = copy_addr_to_reg (op0);
15776 : }
15777 144 : op0 = gen_rtx_MEM (BLKmode, op0);
15778 144 : if (fcode == IX86_BUILTIN_LDTILECFG)
15779 : icode = CODE_FOR_ldtilecfg;
15780 : else
15781 93 : icode = CODE_FOR_sttilecfg;
15782 144 : pat = GEN_FCN (icode) (op0);
15783 144 : emit_insn (pat);
15784 144 : return 0;
15785 :
15786 18 : case IX86_BUILTIN_LLWPCB:
15787 18 : arg0 = CALL_EXPR_ARG (exp, 0);
15788 18 : op0 = expand_normal (arg0);
15789 :
15790 18 : if (!register_operand (op0, Pmode))
15791 9 : op0 = ix86_zero_extend_to_Pmode (op0);
15792 18 : emit_insn (gen_lwp_llwpcb (Pmode, op0));
15793 18 : return 0;
15794 :
15795 18 : case IX86_BUILTIN_SLWPCB:
15796 18 : if (!target
15797 18 : || !register_operand (target, Pmode))
15798 0 : target = gen_reg_rtx (Pmode);
15799 18 : emit_insn (gen_lwp_slwpcb (Pmode, target));
15800 18 : return target;
15801 :
15802 51 : case IX86_BUILTIN_LWPVAL32:
15803 51 : case IX86_BUILTIN_LWPVAL64:
15804 51 : case IX86_BUILTIN_LWPINS32:
15805 51 : case IX86_BUILTIN_LWPINS64:
15806 51 : mode = ((fcode == IX86_BUILTIN_LWPVAL32
15807 51 : || fcode == IX86_BUILTIN_LWPINS32)
15808 51 : ? SImode : DImode);
15809 :
15810 51 : if (fcode == IX86_BUILTIN_LWPVAL32
15811 51 : || fcode == IX86_BUILTIN_LWPVAL64)
15812 26 : icode = code_for_lwp_lwpval (mode);
15813 : else
15814 25 : icode = code_for_lwp_lwpins (mode);
15815 :
15816 51 : arg0 = CALL_EXPR_ARG (exp, 0);
15817 51 : arg1 = CALL_EXPR_ARG (exp, 1);
15818 51 : arg2 = CALL_EXPR_ARG (exp, 2);
15819 51 : op0 = expand_normal (arg0);
15820 51 : op1 = expand_normal (arg1);
15821 51 : op2 = expand_normal (arg2);
15822 51 : mode0 = insn_data[icode].operand[0].mode;
15823 :
15824 51 : if (!insn_data[icode].operand[0].predicate (op0, mode0))
15825 13 : op0 = copy_to_mode_reg (mode0, op0);
15826 51 : if (!insn_data[icode].operand[1].predicate (op1, SImode))
15827 0 : op1 = copy_to_mode_reg (SImode, op1);
15828 :
15829 51 : if (!CONST_INT_P (op2))
15830 : {
15831 0 : error ("the last argument must be a 32-bit immediate");
15832 0 : return const0_rtx;
15833 : }
15834 :
15835 51 : emit_insn (GEN_FCN (icode) (op0, op1, op2));
15836 :
15837 51 : if (fcode == IX86_BUILTIN_LWPINS32
15838 51 : || fcode == IX86_BUILTIN_LWPINS64)
15839 : {
15840 25 : if (target == 0
15841 25 : || !nonimmediate_operand (target, QImode))
15842 0 : target = gen_reg_rtx (QImode);
15843 :
15844 25 : pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
15845 : const0_rtx);
15846 25 : emit_insn (gen_rtx_SET (target, pat));
15847 :
15848 25 : return target;
15849 : }
15850 : else
15851 : return 0;
15852 :
15853 18 : case IX86_BUILTIN_BEXTRI32:
15854 18 : case IX86_BUILTIN_BEXTRI64:
15855 18 : mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
15856 :
15857 18 : arg0 = CALL_EXPR_ARG (exp, 0);
15858 18 : arg1 = CALL_EXPR_ARG (exp, 1);
15859 18 : op0 = expand_normal (arg0);
15860 18 : op1 = expand_normal (arg1);
15861 :
15862 18 : if (!CONST_INT_P (op1))
15863 : {
15864 0 : error ("last argument must be an immediate");
15865 0 : return const0_rtx;
15866 : }
15867 : else
15868 : {
15869 18 : unsigned char lsb_index = UINTVAL (op1);
15870 18 : unsigned char length = UINTVAL (op1) >> 8;
15871 :
15872 18 : unsigned char bitsize = GET_MODE_BITSIZE (mode);
15873 :
15874 18 : icode = code_for_tbm_bextri (mode);
15875 :
15876 18 : mode1 = insn_data[icode].operand[1].mode;
15877 18 : if (!insn_data[icode].operand[1].predicate (op0, mode1))
15878 12 : op0 = copy_to_mode_reg (mode1, op0);
15879 :
15880 18 : mode0 = insn_data[icode].operand[0].mode;
15881 18 : if (target == 0
15882 18 : || !register_operand (target, mode0))
15883 0 : target = gen_reg_rtx (mode0);
15884 :
15885 18 : if (length == 0 || lsb_index >= bitsize)
15886 : {
15887 8 : emit_move_insn (target, const0_rtx);
15888 8 : return target;
15889 : }
15890 :
15891 10 : if (length + lsb_index > bitsize)
15892 5 : length = bitsize - lsb_index;
15893 :
15894 10 : op1 = GEN_INT (length);
15895 10 : op2 = GEN_INT (lsb_index);
15896 :
15897 10 : emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
15898 10 : return target;
15899 : }
15900 :
15901 21 : case IX86_BUILTIN_RDRAND16_STEP:
15902 21 : mode = HImode;
15903 21 : goto rdrand_step;
15904 :
15905 42 : case IX86_BUILTIN_RDRAND32_STEP:
15906 42 : mode = SImode;
15907 42 : goto rdrand_step;
15908 :
15909 : case IX86_BUILTIN_RDRAND64_STEP:
15910 : mode = DImode;
15911 :
15912 83 : rdrand_step:
15913 83 : arg0 = CALL_EXPR_ARG (exp, 0);
15914 83 : op1 = expand_normal (arg0);
15915 83 : if (!address_operand (op1, VOIDmode))
15916 : {
15917 29 : op1 = convert_memory_address (Pmode, op1);
15918 29 : op1 = copy_addr_to_reg (op1);
15919 : }
15920 :
15921 83 : op0 = gen_reg_rtx (mode);
15922 83 : emit_insn (gen_rdrand (mode, op0));
15923 :
15924 83 : emit_move_insn (gen_rtx_MEM (mode, op1), op0);
15925 :
15926 83 : op1 = force_reg (SImode, const1_rtx);
15927 :
15928 : /* Emit SImode conditional move. */
15929 83 : if (mode == HImode)
15930 : {
15931 21 : if (TARGET_ZERO_EXTEND_WITH_AND
15932 21 : && optimize_function_for_speed_p (cfun))
15933 : {
15934 0 : op2 = force_reg (SImode, const0_rtx);
15935 :
15936 0 : emit_insn (gen_movstricthi
15937 0 : (gen_lowpart (HImode, op2), op0));
15938 : }
15939 : else
15940 : {
15941 21 : op2 = gen_reg_rtx (SImode);
15942 :
15943 21 : emit_insn (gen_zero_extendhisi2 (op2, op0));
15944 : }
15945 : }
15946 62 : else if (mode == SImode)
15947 : op2 = op0;
15948 : else
15949 20 : op2 = gen_rtx_SUBREG (SImode, op0, 0);
15950 :
15951 83 : if (target == 0
15952 83 : || !register_operand (target, SImode))
15953 7 : target = gen_reg_rtx (SImode);
15954 :
15955 83 : pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
15956 : const0_rtx);
15957 83 : emit_insn (gen_rtx_SET (target,
15958 : gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
15959 83 : return target;
15960 :
15961 19 : case IX86_BUILTIN_RDSEED16_STEP:
15962 19 : mode = HImode;
15963 19 : goto rdseed_step;
15964 :
15965 28 : case IX86_BUILTIN_RDSEED32_STEP:
15966 28 : mode = SImode;
15967 28 : goto rdseed_step;
15968 :
15969 : case IX86_BUILTIN_RDSEED64_STEP:
15970 : mode = DImode;
15971 :
15972 66 : rdseed_step:
15973 66 : arg0 = CALL_EXPR_ARG (exp, 0);
15974 66 : op1 = expand_normal (arg0);
15975 66 : if (!address_operand (op1, VOIDmode))
15976 : {
15977 28 : op1 = convert_memory_address (Pmode, op1);
15978 28 : op1 = copy_addr_to_reg (op1);
15979 : }
15980 :
15981 66 : op0 = gen_reg_rtx (mode);
15982 66 : emit_insn (gen_rdseed (mode, op0));
15983 :
15984 66 : emit_move_insn (gen_rtx_MEM (mode, op1), op0);
15985 :
15986 66 : op2 = gen_reg_rtx (QImode);
15987 :
15988 66 : pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
15989 : const0_rtx);
15990 66 : emit_insn (gen_rtx_SET (op2, pat));
15991 :
15992 66 : if (target == 0
15993 66 : || !register_operand (target, SImode))
15994 1 : target = gen_reg_rtx (SImode);
15995 :
15996 66 : emit_insn (gen_zero_extendqisi2 (target, op2));
15997 66 : return target;
15998 :
15999 38 : case IX86_BUILTIN_SBB32:
16000 38 : icode = CODE_FOR_subborrowsi;
16001 38 : icode2 = CODE_FOR_subborrowsi_0;
16002 38 : mode0 = SImode;
16003 38 : mode1 = DImode;
16004 38 : mode2 = CCmode;
16005 38 : goto handlecarry;
16006 :
16007 44 : case IX86_BUILTIN_SBB64:
16008 44 : icode = CODE_FOR_subborrowdi;
16009 44 : icode2 = CODE_FOR_subborrowdi_0;
16010 44 : mode0 = DImode;
16011 44 : mode1 = TImode;
16012 44 : mode2 = CCmode;
16013 44 : goto handlecarry;
16014 :
16015 69 : case IX86_BUILTIN_ADDCARRYX32:
16016 69 : icode = CODE_FOR_addcarrysi;
16017 69 : icode2 = CODE_FOR_addcarrysi_0;
16018 69 : mode0 = SImode;
16019 69 : mode1 = DImode;
16020 69 : mode2 = CCCmode;
16021 69 : goto handlecarry;
16022 :
16023 : case IX86_BUILTIN_ADDCARRYX64:
16024 : icode = CODE_FOR_addcarrydi;
16025 : icode2 = CODE_FOR_addcarrydi_0;
16026 : mode0 = DImode;
16027 : mode1 = TImode;
16028 : mode2 = CCCmode;
16029 :
16030 213 : handlecarry:
16031 213 : arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
16032 213 : arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
16033 213 : arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
16034 213 : arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
16035 :
16036 213 : op1 = expand_normal (arg0);
16037 :
16038 213 : op2 = expand_normal (arg1);
16039 213 : if (!register_operand (op2, mode0))
16040 118 : op2 = copy_to_mode_reg (mode0, op2);
16041 :
16042 213 : op3 = expand_normal (arg2);
16043 213 : if (!register_operand (op3, mode0))
16044 121 : op3 = copy_to_mode_reg (mode0, op3);
16045 :
16046 213 : op4 = expand_normal (arg3);
16047 213 : if (!address_operand (op4, VOIDmode))
16048 : {
16049 68 : op4 = convert_memory_address (Pmode, op4);
16050 68 : op4 = copy_addr_to_reg (op4);
16051 : }
16052 :
16053 213 : op0 = gen_reg_rtx (mode0);
16054 213 : if (op1 == const0_rtx)
16055 : {
16056 : /* If arg0 is 0, optimize right away into add or sub
16057 : instruction that sets CCCmode flags. */
16058 21 : op1 = gen_rtx_REG (mode2, FLAGS_REG);
16059 21 : emit_insn (GEN_FCN (icode2) (op0, op2, op3));
16060 : }
16061 : else
16062 : {
16063 : /* Generate CF from input operand. */
16064 192 : ix86_expand_carry (op1);
16065 :
16066 : /* Generate instruction that consumes CF. */
16067 192 : op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
16068 192 : pat = gen_rtx_LTU (mode1, op1, const0_rtx);
16069 192 : pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
16070 192 : emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
16071 : }
16072 :
16073 : /* Return current CF value. */
16074 213 : if (target == 0)
16075 14 : target = gen_reg_rtx (QImode);
16076 :
16077 213 : pat = gen_rtx_LTU (QImode, op1, const0_rtx);
16078 213 : emit_insn (gen_rtx_SET (target, pat));
16079 :
16080 : /* Store the result. */
16081 213 : emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
16082 :
16083 213 : return target;
16084 :
16085 24 : case IX86_BUILTIN_READ_FLAGS:
16086 24 : if (ignore)
16087 1 : return const0_rtx;
16088 :
16089 23 : emit_insn (gen_pushfl ());
16090 :
16091 23 : if (optimize
16092 11 : || target == NULL_RTX
16093 11 : || !nonimmediate_operand (target, word_mode)
16094 34 : || GET_MODE (target) != word_mode)
16095 12 : target = gen_reg_rtx (word_mode);
16096 :
16097 23 : emit_insn (gen_pop (target));
16098 23 : return target;
16099 :
16100 21 : case IX86_BUILTIN_WRITE_FLAGS:
16101 :
16102 21 : arg0 = CALL_EXPR_ARG (exp, 0);
16103 21 : op0 = expand_normal (arg0);
16104 21 : if (!general_no_elim_operand (op0, word_mode))
16105 0 : op0 = copy_to_mode_reg (word_mode, op0);
16106 :
16107 21 : emit_insn (gen_push (op0));
16108 21 : emit_insn (gen_popfl ());
16109 21 : return 0;
16110 :
16111 22 : case IX86_BUILTIN_KTESTC8:
16112 22 : icode = CODE_FOR_ktestqi;
16113 22 : mode3 = CCCmode;
16114 22 : goto kortest;
16115 :
16116 22 : case IX86_BUILTIN_KTESTZ8:
16117 22 : icode = CODE_FOR_ktestqi;
16118 22 : mode3 = CCZmode;
16119 22 : goto kortest;
16120 :
16121 22 : case IX86_BUILTIN_KTESTC16:
16122 22 : icode = CODE_FOR_ktesthi;
16123 22 : mode3 = CCCmode;
16124 22 : goto kortest;
16125 :
16126 22 : case IX86_BUILTIN_KTESTZ16:
16127 22 : icode = CODE_FOR_ktesthi;
16128 22 : mode3 = CCZmode;
16129 22 : goto kortest;
16130 :
16131 22 : case IX86_BUILTIN_KTESTC32:
16132 22 : icode = CODE_FOR_ktestsi;
16133 22 : mode3 = CCCmode;
16134 22 : goto kortest;
16135 :
16136 22 : case IX86_BUILTIN_KTESTZ32:
16137 22 : icode = CODE_FOR_ktestsi;
16138 22 : mode3 = CCZmode;
16139 22 : goto kortest;
16140 :
16141 22 : case IX86_BUILTIN_KTESTC64:
16142 22 : icode = CODE_FOR_ktestdi;
16143 22 : mode3 = CCCmode;
16144 22 : goto kortest;
16145 :
16146 22 : case IX86_BUILTIN_KTESTZ64:
16147 22 : icode = CODE_FOR_ktestdi;
16148 22 : mode3 = CCZmode;
16149 22 : goto kortest;
16150 :
16151 22 : case IX86_BUILTIN_KORTESTC8:
16152 22 : icode = CODE_FOR_kortestqi;
16153 22 : mode3 = CCCmode;
16154 22 : goto kortest;
16155 :
16156 76 : case IX86_BUILTIN_KORTESTZ8:
16157 76 : icode = CODE_FOR_kortestqi;
16158 76 : mode3 = CCZmode;
16159 76 : goto kortest;
16160 :
16161 38 : case IX86_BUILTIN_KORTESTC16:
16162 38 : icode = CODE_FOR_kortesthi;
16163 38 : mode3 = CCCmode;
16164 38 : goto kortest;
16165 :
16166 91 : case IX86_BUILTIN_KORTESTZ16:
16167 91 : icode = CODE_FOR_kortesthi;
16168 91 : mode3 = CCZmode;
16169 91 : goto kortest;
16170 :
16171 22 : case IX86_BUILTIN_KORTESTC32:
16172 22 : icode = CODE_FOR_kortestsi;
16173 22 : mode3 = CCCmode;
16174 22 : goto kortest;
16175 :
16176 79 : case IX86_BUILTIN_KORTESTZ32:
16177 79 : icode = CODE_FOR_kortestsi;
16178 79 : mode3 = CCZmode;
16179 79 : goto kortest;
16180 :
16181 22 : case IX86_BUILTIN_KORTESTC64:
16182 22 : icode = CODE_FOR_kortestdi;
16183 22 : mode3 = CCCmode;
16184 22 : goto kortest;
16185 :
16186 : case IX86_BUILTIN_KORTESTZ64:
16187 : icode = CODE_FOR_kortestdi;
16188 : mode3 = CCZmode;
16189 :
16190 610 : kortest:
16191 610 : arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
16192 610 : arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
16193 610 : op0 = expand_normal (arg0);
16194 610 : op1 = expand_normal (arg1);
16195 :
16196 610 : mode0 = insn_data[icode].operand[0].mode;
16197 610 : mode1 = insn_data[icode].operand[1].mode;
16198 :
16199 610 : if (GET_MODE (op0) != VOIDmode)
16200 610 : op0 = force_reg (GET_MODE (op0), op0);
16201 :
16202 610 : op0 = gen_lowpart (mode0, op0);
16203 :
16204 610 : if (!insn_data[icode].operand[0].predicate (op0, mode0))
16205 0 : op0 = copy_to_mode_reg (mode0, op0);
16206 :
16207 610 : if (GET_MODE (op1) != VOIDmode)
16208 609 : op1 = force_reg (GET_MODE (op1), op1);
16209 :
16210 610 : op1 = gen_lowpart (mode1, op1);
16211 :
16212 610 : if (!insn_data[icode].operand[1].predicate (op1, mode1))
16213 1 : op1 = copy_to_mode_reg (mode1, op1);
16214 :
16215 610 : target = gen_reg_rtx (QImode);
16216 :
16217 : /* Emit kortest. */
16218 610 : emit_insn (GEN_FCN (icode) (op0, op1));
16219 : /* And use setcc to return result from flags. */
16220 610 : ix86_expand_setcc (target, EQ,
16221 : gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
16222 610 : return target;
16223 :
16224 24 : case IX86_BUILTIN_GATHERSIV2DF:
16225 24 : icode = CODE_FOR_avx2_gathersiv2df;
16226 24 : goto gather_gen;
16227 18 : case IX86_BUILTIN_GATHERSIV4DF:
16228 18 : icode = CODE_FOR_avx2_gathersiv4df;
16229 18 : goto gather_gen;
16230 21 : case IX86_BUILTIN_GATHERDIV2DF:
16231 21 : icode = CODE_FOR_avx2_gatherdiv2df;
16232 21 : goto gather_gen;
16233 32 : case IX86_BUILTIN_GATHERDIV4DF:
16234 32 : icode = CODE_FOR_avx2_gatherdiv4df;
16235 32 : goto gather_gen;
16236 30 : case IX86_BUILTIN_GATHERSIV4SF:
16237 30 : icode = CODE_FOR_avx2_gathersiv4sf;
16238 30 : goto gather_gen;
16239 37 : case IX86_BUILTIN_GATHERSIV8SF:
16240 37 : icode = CODE_FOR_avx2_gathersiv8sf;
16241 37 : goto gather_gen;
16242 24 : case IX86_BUILTIN_GATHERDIV4SF:
16243 24 : icode = CODE_FOR_avx2_gatherdiv4sf;
16244 24 : goto gather_gen;
16245 18 : case IX86_BUILTIN_GATHERDIV8SF:
16246 18 : icode = CODE_FOR_avx2_gatherdiv8sf;
16247 18 : goto gather_gen;
16248 18 : case IX86_BUILTIN_GATHERSIV2DI:
16249 18 : icode = CODE_FOR_avx2_gathersiv2di;
16250 18 : goto gather_gen;
16251 18 : case IX86_BUILTIN_GATHERSIV4DI:
16252 18 : icode = CODE_FOR_avx2_gathersiv4di;
16253 18 : goto gather_gen;
16254 27 : case IX86_BUILTIN_GATHERDIV2DI:
16255 27 : icode = CODE_FOR_avx2_gatherdiv2di;
16256 27 : goto gather_gen;
16257 29 : case IX86_BUILTIN_GATHERDIV4DI:
16258 29 : icode = CODE_FOR_avx2_gatherdiv4di;
16259 29 : goto gather_gen;
16260 20 : case IX86_BUILTIN_GATHERSIV4SI:
16261 20 : icode = CODE_FOR_avx2_gathersiv4si;
16262 20 : goto gather_gen;
16263 22 : case IX86_BUILTIN_GATHERSIV8SI:
16264 22 : icode = CODE_FOR_avx2_gathersiv8si;
16265 22 : goto gather_gen;
16266 28 : case IX86_BUILTIN_GATHERDIV4SI:
16267 28 : icode = CODE_FOR_avx2_gatherdiv4si;
16268 28 : goto gather_gen;
16269 18 : case IX86_BUILTIN_GATHERDIV8SI:
16270 18 : icode = CODE_FOR_avx2_gatherdiv8si;
16271 18 : goto gather_gen;
16272 20 : case IX86_BUILTIN_GATHERALTSIV4DF:
16273 20 : icode = CODE_FOR_avx2_gathersiv4df;
16274 20 : goto gather_gen;
16275 16 : case IX86_BUILTIN_GATHERALTDIV8SF:
16276 16 : icode = CODE_FOR_avx2_gatherdiv8sf;
16277 16 : goto gather_gen;
16278 4 : case IX86_BUILTIN_GATHERALTSIV4DI:
16279 4 : icode = CODE_FOR_avx2_gathersiv4di;
16280 4 : goto gather_gen;
16281 12 : case IX86_BUILTIN_GATHERALTDIV8SI:
16282 12 : icode = CODE_FOR_avx2_gatherdiv8si;
16283 12 : goto gather_gen;
16284 36 : case IX86_BUILTIN_GATHER3SIV16SF:
16285 36 : icode = CODE_FOR_avx512f_gathersiv16sf;
16286 36 : goto gather_gen;
16287 24 : case IX86_BUILTIN_GATHER3SIV8DF:
16288 24 : icode = CODE_FOR_avx512f_gathersiv8df;
16289 24 : goto gather_gen;
16290 24 : case IX86_BUILTIN_GATHER3DIV16SF:
16291 24 : icode = CODE_FOR_avx512f_gatherdiv16sf;
16292 24 : goto gather_gen;
16293 37 : case IX86_BUILTIN_GATHER3DIV8DF:
16294 37 : icode = CODE_FOR_avx512f_gatherdiv8df;
16295 37 : goto gather_gen;
16296 30 : case IX86_BUILTIN_GATHER3SIV16SI:
16297 30 : icode = CODE_FOR_avx512f_gathersiv16si;
16298 30 : goto gather_gen;
16299 24 : case IX86_BUILTIN_GATHER3SIV8DI:
16300 24 : icode = CODE_FOR_avx512f_gathersiv8di;
16301 24 : goto gather_gen;
16302 24 : case IX86_BUILTIN_GATHER3DIV16SI:
16303 24 : icode = CODE_FOR_avx512f_gatherdiv16si;
16304 24 : goto gather_gen;
16305 37 : case IX86_BUILTIN_GATHER3DIV8DI:
16306 37 : icode = CODE_FOR_avx512f_gatherdiv8di;
16307 37 : goto gather_gen;
16308 16 : case IX86_BUILTIN_GATHER3ALTSIV8DF:
16309 16 : icode = CODE_FOR_avx512f_gathersiv8df;
16310 16 : goto gather_gen;
16311 22 : case IX86_BUILTIN_GATHER3ALTDIV16SF:
16312 22 : icode = CODE_FOR_avx512f_gatherdiv16sf;
16313 22 : goto gather_gen;
16314 14 : case IX86_BUILTIN_GATHER3ALTSIV8DI:
16315 14 : icode = CODE_FOR_avx512f_gathersiv8di;
16316 14 : goto gather_gen;
16317 18 : case IX86_BUILTIN_GATHER3ALTDIV16SI:
16318 18 : icode = CODE_FOR_avx512f_gatherdiv16si;
16319 18 : goto gather_gen;
16320 18 : case IX86_BUILTIN_GATHER3SIV2DF:
16321 18 : icode = CODE_FOR_avx512vl_gathersiv2df;
16322 18 : goto gather_gen;
16323 10 : case IX86_BUILTIN_GATHER3SIV4DF:
16324 10 : icode = CODE_FOR_avx512vl_gathersiv4df;
16325 10 : goto gather_gen;
16326 15 : case IX86_BUILTIN_GATHER3DIV2DF:
16327 15 : icode = CODE_FOR_avx512vl_gatherdiv2df;
16328 15 : goto gather_gen;
16329 16 : case IX86_BUILTIN_GATHER3DIV4DF:
16330 16 : icode = CODE_FOR_avx512vl_gatherdiv4df;
16331 16 : goto gather_gen;
16332 14 : case IX86_BUILTIN_GATHER3SIV4SF:
16333 14 : icode = CODE_FOR_avx512vl_gathersiv4sf;
16334 14 : goto gather_gen;
16335 12 : case IX86_BUILTIN_GATHER3SIV8SF:
16336 12 : icode = CODE_FOR_avx512vl_gathersiv8sf;
16337 12 : goto gather_gen;
16338 22 : case IX86_BUILTIN_GATHER3DIV4SF:
16339 22 : icode = CODE_FOR_avx512vl_gatherdiv4sf;
16340 22 : goto gather_gen;
16341 10 : case IX86_BUILTIN_GATHER3DIV8SF:
16342 10 : icode = CODE_FOR_avx512vl_gatherdiv8sf;
16343 10 : goto gather_gen;
16344 20 : case IX86_BUILTIN_GATHER3SIV2DI:
16345 20 : icode = CODE_FOR_avx512vl_gathersiv2di;
16346 20 : goto gather_gen;
16347 10 : case IX86_BUILTIN_GATHER3SIV4DI:
16348 10 : icode = CODE_FOR_avx512vl_gathersiv4di;
16349 10 : goto gather_gen;
16350 14 : case IX86_BUILTIN_GATHER3DIV2DI:
16351 14 : icode = CODE_FOR_avx512vl_gatherdiv2di;
16352 14 : goto gather_gen;
16353 13 : case IX86_BUILTIN_GATHER3DIV4DI:
16354 13 : icode = CODE_FOR_avx512vl_gatherdiv4di;
16355 13 : goto gather_gen;
16356 14 : case IX86_BUILTIN_GATHER3SIV4SI:
16357 14 : icode = CODE_FOR_avx512vl_gathersiv4si;
16358 14 : goto gather_gen;
16359 12 : case IX86_BUILTIN_GATHER3SIV8SI:
16360 12 : icode = CODE_FOR_avx512vl_gathersiv8si;
16361 12 : goto gather_gen;
16362 24 : case IX86_BUILTIN_GATHER3DIV4SI:
16363 24 : icode = CODE_FOR_avx512vl_gatherdiv4si;
16364 24 : goto gather_gen;
16365 10 : case IX86_BUILTIN_GATHER3DIV8SI:
16366 10 : icode = CODE_FOR_avx512vl_gatherdiv8si;
16367 10 : goto gather_gen;
16368 4 : case IX86_BUILTIN_GATHER3ALTSIV4DF:
16369 4 : icode = CODE_FOR_avx512vl_gathersiv4df;
16370 4 : goto gather_gen;
16371 8 : case IX86_BUILTIN_GATHER3ALTDIV8SF:
16372 8 : icode = CODE_FOR_avx512vl_gatherdiv8sf;
16373 8 : goto gather_gen;
16374 6 : case IX86_BUILTIN_GATHER3ALTSIV4DI:
16375 6 : icode = CODE_FOR_avx512vl_gathersiv4di;
16376 6 : goto gather_gen;
16377 10 : case IX86_BUILTIN_GATHER3ALTDIV8SI:
16378 10 : icode = CODE_FOR_avx512vl_gatherdiv8si;
16379 10 : goto gather_gen;
16380 40 : case IX86_BUILTIN_SCATTERSIV16SF:
16381 40 : icode = CODE_FOR_avx512f_scattersiv16sf;
16382 40 : goto scatter_gen;
16383 27 : case IX86_BUILTIN_SCATTERSIV8DF:
16384 27 : icode = CODE_FOR_avx512f_scattersiv8df;
16385 27 : goto scatter_gen;
16386 24 : case IX86_BUILTIN_SCATTERDIV16SF:
16387 24 : icode = CODE_FOR_avx512f_scatterdiv16sf;
16388 24 : goto scatter_gen;
16389 33 : case IX86_BUILTIN_SCATTERDIV8DF:
16390 33 : icode = CODE_FOR_avx512f_scatterdiv8df;
16391 33 : goto scatter_gen;
16392 30 : case IX86_BUILTIN_SCATTERSIV16SI:
16393 30 : icode = CODE_FOR_avx512f_scattersiv16si;
16394 30 : goto scatter_gen;
16395 24 : case IX86_BUILTIN_SCATTERSIV8DI:
16396 24 : icode = CODE_FOR_avx512f_scattersiv8di;
16397 24 : goto scatter_gen;
16398 24 : case IX86_BUILTIN_SCATTERDIV16SI:
16399 24 : icode = CODE_FOR_avx512f_scatterdiv16si;
16400 24 : goto scatter_gen;
16401 29 : case IX86_BUILTIN_SCATTERDIV8DI:
16402 29 : icode = CODE_FOR_avx512f_scatterdiv8di;
16403 29 : goto scatter_gen;
16404 18 : case IX86_BUILTIN_SCATTERSIV8SF:
16405 18 : icode = CODE_FOR_avx512vl_scattersiv8sf;
16406 18 : goto scatter_gen;
16407 20 : case IX86_BUILTIN_SCATTERSIV4SF:
16408 20 : icode = CODE_FOR_avx512vl_scattersiv4sf;
16409 20 : goto scatter_gen;
16410 16 : case IX86_BUILTIN_SCATTERSIV4DF:
16411 16 : icode = CODE_FOR_avx512vl_scattersiv4df;
16412 16 : goto scatter_gen;
16413 16 : case IX86_BUILTIN_SCATTERSIV2DF:
16414 16 : icode = CODE_FOR_avx512vl_scattersiv2df;
16415 16 : goto scatter_gen;
16416 16 : case IX86_BUILTIN_SCATTERDIV8SF:
16417 16 : icode = CODE_FOR_avx512vl_scatterdiv8sf;
16418 16 : goto scatter_gen;
16419 16 : case IX86_BUILTIN_SCATTERDIV4SF:
16420 16 : icode = CODE_FOR_avx512vl_scatterdiv4sf;
16421 16 : goto scatter_gen;
16422 18 : case IX86_BUILTIN_SCATTERDIV4DF:
16423 18 : icode = CODE_FOR_avx512vl_scatterdiv4df;
16424 18 : goto scatter_gen;
16425 18 : case IX86_BUILTIN_SCATTERDIV2DF:
16426 18 : icode = CODE_FOR_avx512vl_scatterdiv2df;
16427 18 : goto scatter_gen;
16428 22 : case IX86_BUILTIN_SCATTERSIV8SI:
16429 22 : icode = CODE_FOR_avx512vl_scattersiv8si;
16430 22 : goto scatter_gen;
16431 24 : case IX86_BUILTIN_SCATTERSIV4SI:
16432 24 : icode = CODE_FOR_avx512vl_scattersiv4si;
16433 24 : goto scatter_gen;
16434 16 : case IX86_BUILTIN_SCATTERSIV4DI:
16435 16 : icode = CODE_FOR_avx512vl_scattersiv4di;
16436 16 : goto scatter_gen;
16437 16 : case IX86_BUILTIN_SCATTERSIV2DI:
16438 16 : icode = CODE_FOR_avx512vl_scattersiv2di;
16439 16 : goto scatter_gen;
16440 16 : case IX86_BUILTIN_SCATTERDIV8SI:
16441 16 : icode = CODE_FOR_avx512vl_scatterdiv8si;
16442 16 : goto scatter_gen;
16443 16 : case IX86_BUILTIN_SCATTERDIV4SI:
16444 16 : icode = CODE_FOR_avx512vl_scatterdiv4si;
16445 16 : goto scatter_gen;
16446 18 : case IX86_BUILTIN_SCATTERDIV4DI:
16447 18 : icode = CODE_FOR_avx512vl_scatterdiv4di;
16448 18 : goto scatter_gen;
16449 18 : case IX86_BUILTIN_SCATTERDIV2DI:
16450 18 : icode = CODE_FOR_avx512vl_scatterdiv2di;
16451 18 : goto scatter_gen;
16452 16 : case IX86_BUILTIN_SCATTERALTSIV8DF:
16453 16 : icode = CODE_FOR_avx512f_scattersiv8df;
16454 16 : goto scatter_gen;
16455 12 : case IX86_BUILTIN_SCATTERALTDIV16SF:
16456 12 : icode = CODE_FOR_avx512f_scatterdiv16sf;
16457 12 : goto scatter_gen;
16458 8 : case IX86_BUILTIN_SCATTERALTSIV8DI:
16459 8 : icode = CODE_FOR_avx512f_scattersiv8di;
16460 8 : goto scatter_gen;
16461 24 : case IX86_BUILTIN_SCATTERALTDIV16SI:
16462 24 : icode = CODE_FOR_avx512f_scatterdiv16si;
16463 24 : goto scatter_gen;
16464 4 : case IX86_BUILTIN_SCATTERALTSIV4DF:
16465 4 : icode = CODE_FOR_avx512vl_scattersiv4df;
16466 4 : goto scatter_gen;
16467 4 : case IX86_BUILTIN_SCATTERALTDIV8SF:
16468 4 : icode = CODE_FOR_avx512vl_scatterdiv8sf;
16469 4 : goto scatter_gen;
16470 4 : case IX86_BUILTIN_SCATTERALTSIV4DI:
16471 4 : icode = CODE_FOR_avx512vl_scattersiv4di;
16472 4 : goto scatter_gen;
16473 4 : case IX86_BUILTIN_SCATTERALTDIV8SI:
16474 4 : icode = CODE_FOR_avx512vl_scatterdiv8si;
16475 4 : goto scatter_gen;
16476 8 : case IX86_BUILTIN_SCATTERALTSIV2DF:
16477 8 : icode = CODE_FOR_avx512vl_scattersiv2df;
16478 8 : goto scatter_gen;
16479 8 : case IX86_BUILTIN_SCATTERALTDIV4SF:
16480 8 : icode = CODE_FOR_avx512vl_scatterdiv4sf;
16481 8 : goto scatter_gen;
16482 8 : case IX86_BUILTIN_SCATTERALTSIV2DI:
16483 8 : icode = CODE_FOR_avx512vl_scattersiv2di;
16484 8 : goto scatter_gen;
16485 8 : case IX86_BUILTIN_SCATTERALTDIV4SI:
16486 8 : icode = CODE_FOR_avx512vl_scatterdiv4si;
16487 8 : goto scatter_gen;
16488 :
16489 1004 : gather_gen:
16490 1004 : rtx half;
16491 1004 : rtx (*gen) (rtx, rtx);
16492 :
16493 1004 : arg0 = CALL_EXPR_ARG (exp, 0);
16494 1004 : arg1 = CALL_EXPR_ARG (exp, 1);
16495 1004 : arg2 = CALL_EXPR_ARG (exp, 2);
16496 1004 : arg3 = CALL_EXPR_ARG (exp, 3);
16497 1004 : arg4 = CALL_EXPR_ARG (exp, 4);
16498 1004 : op0 = expand_normal (arg0);
16499 1004 : op1 = expand_normal (arg1);
16500 1004 : op2 = expand_normal (arg2);
16501 1004 : op3 = ix86_expand_unsigned_small_int_cst_argument (arg3);
16502 1004 : op4 = expand_normal (arg4);
16503 : /* Note the arg order is different from the operand order. */
16504 1004 : mode0 = insn_data[icode].operand[1].mode;
16505 1004 : mode2 = insn_data[icode].operand[3].mode;
16506 1004 : mode3 = insn_data[icode].operand[4].mode;
16507 1004 : mode4 = insn_data[icode].operand[5].mode;
16508 :
16509 1004 : if (target == NULL_RTX
16510 1004 : || GET_MODE (target) != insn_data[icode].operand[0].mode
16511 1904 : || !insn_data[icode].operand[0].predicate (target,
16512 : GET_MODE (target)))
16513 105 : subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
16514 : else
16515 : subtarget = target;
16516 :
16517 1004 : switch (fcode)
16518 : {
16519 30 : case IX86_BUILTIN_GATHER3ALTSIV8DF:
16520 30 : case IX86_BUILTIN_GATHER3ALTSIV8DI:
16521 30 : half = gen_reg_rtx (V8SImode);
16522 30 : if (!nonimmediate_operand (op2, V16SImode))
16523 0 : op2 = copy_to_mode_reg (V16SImode, op2);
16524 30 : emit_insn (gen_vec_extract_lo_v16si (half, op2));
16525 30 : op2 = half;
16526 30 : break;
16527 34 : case IX86_BUILTIN_GATHER3ALTSIV4DF:
16528 34 : case IX86_BUILTIN_GATHER3ALTSIV4DI:
16529 34 : case IX86_BUILTIN_GATHERALTSIV4DF:
16530 34 : case IX86_BUILTIN_GATHERALTSIV4DI:
16531 34 : half = gen_reg_rtx (V4SImode);
16532 34 : if (!nonimmediate_operand (op2, V8SImode))
16533 0 : op2 = copy_to_mode_reg (V8SImode, op2);
16534 34 : emit_insn (gen_vec_extract_lo_v8si (half, op2));
16535 34 : op2 = half;
16536 34 : break;
16537 40 : case IX86_BUILTIN_GATHER3ALTDIV16SF:
16538 40 : case IX86_BUILTIN_GATHER3ALTDIV16SI:
16539 40 : half = gen_reg_rtx (mode0);
16540 40 : if (mode0 == V8SFmode)
16541 : gen = gen_vec_extract_lo_v16sf;
16542 : else
16543 18 : gen = gen_vec_extract_lo_v16si;
16544 40 : if (!nonimmediate_operand (op0, GET_MODE (op0)))
16545 40 : op0 = copy_to_mode_reg (GET_MODE (op0), op0);
16546 40 : emit_insn (gen (half, op0));
16547 40 : op0 = half;
16548 40 : op3 = lowpart_subreg (QImode, op3, HImode);
16549 40 : break;
16550 46 : case IX86_BUILTIN_GATHER3ALTDIV8SF:
16551 46 : case IX86_BUILTIN_GATHER3ALTDIV8SI:
16552 46 : case IX86_BUILTIN_GATHERALTDIV8SF:
16553 46 : case IX86_BUILTIN_GATHERALTDIV8SI:
16554 46 : half = gen_reg_rtx (mode0);
16555 46 : if (mode0 == V4SFmode)
16556 : gen = gen_vec_extract_lo_v8sf;
16557 : else
16558 22 : gen = gen_vec_extract_lo_v8si;
16559 46 : if (!nonimmediate_operand (op0, GET_MODE (op0)))
16560 46 : op0 = copy_to_mode_reg (GET_MODE (op0), op0);
16561 46 : emit_insn (gen (half, op0));
16562 46 : op0 = half;
16563 46 : if (VECTOR_MODE_P (GET_MODE (op3)))
16564 : {
16565 28 : half = gen_reg_rtx (mode0);
16566 28 : if (!nonimmediate_operand (op3, GET_MODE (op3)))
16567 12 : op3 = copy_to_mode_reg (GET_MODE (op3), op3);
16568 28 : emit_insn (gen (half, op3));
16569 28 : op3 = half;
16570 : }
16571 : break;
16572 : default:
16573 : break;
16574 : }
16575 :
16576 : /* Force memory operand only with base register here. But we
16577 : don't want to do it on memory operand for other builtin
16578 : functions. */
16579 1004 : op1 = ix86_zero_extend_to_Pmode (op1);
16580 :
16581 1004 : if (!insn_data[icode].operand[1].predicate (op0, mode0))
16582 403 : op0 = copy_to_mode_reg (mode0, op0);
16583 1009 : if (!insn_data[icode].operand[2].predicate (op1, Pmode))
16584 0 : op1 = copy_to_mode_reg (Pmode, op1);
16585 1004 : if (!insn_data[icode].operand[3].predicate (op2, mode2))
16586 221 : op2 = copy_to_mode_reg (mode2, op2);
16587 :
16588 1004 : op3 = fixup_modeless_constant (op3, mode3);
16589 :
16590 1004 : if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
16591 : {
16592 1004 : if (!insn_data[icode].operand[4].predicate (op3, mode3))
16593 356 : op3 = copy_to_mode_reg (mode3, op3);
16594 : }
16595 : else
16596 : {
16597 0 : op3 = copy_to_reg (op3);
16598 0 : op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
16599 : }
16600 1004 : if (!insn_data[icode].operand[5].predicate (op4, mode4))
16601 : {
16602 0 : error ("the last argument must be scale 1, 2, 4, 8");
16603 0 : return const0_rtx;
16604 : }
16605 :
16606 : /* Optimize. If mask is known to have all high bits set,
16607 : replace op0 with pc_rtx to signal that the instruction
16608 : overwrites the whole destination and doesn't use its
16609 : previous contents. */
16610 1004 : if (optimize)
16611 : {
16612 914 : if (TREE_CODE (arg3) == INTEGER_CST)
16613 : {
16614 209 : if (integer_all_onesp (arg3))
16615 201 : op0 = pc_rtx;
16616 : }
16617 705 : else if (TREE_CODE (arg3) == VECTOR_CST)
16618 : {
16619 : unsigned int negative = 0;
16620 755 : for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
16621 : {
16622 620 : tree cst = VECTOR_CST_ELT (arg3, i);
16623 620 : if (TREE_CODE (cst) == INTEGER_CST
16624 620 : && tree_int_cst_sign_bit (cst))
16625 286 : negative++;
16626 334 : else if (TREE_CODE (cst) == REAL_CST
16627 334 : && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
16628 306 : negative++;
16629 : }
16630 135 : if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
16631 121 : op0 = pc_rtx;
16632 : }
16633 570 : else if (TREE_CODE (arg3) == SSA_NAME
16634 570 : && VECTOR_TYPE_P (TREE_TYPE (arg3)))
16635 : {
16636 : /* Recognize also when mask is like:
16637 : __v2df src = _mm_setzero_pd ();
16638 : __v2df mask = _mm_cmpeq_pd (src, src);
16639 : or
16640 : __v8sf src = _mm256_setzero_ps ();
16641 : __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
16642 : as that is a cheaper way to load all ones into
16643 : a register than having to load a constant from
16644 : memory. */
16645 259 : gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
16646 259 : if (is_gimple_call (def_stmt))
16647 : {
16648 76 : tree fndecl = gimple_call_fndecl (def_stmt);
16649 76 : if (fndecl
16650 76 : && fndecl_built_in_p (fndecl, BUILT_IN_MD))
16651 67 : switch (DECL_MD_FUNCTION_CODE (fndecl))
16652 : {
16653 24 : case IX86_BUILTIN_CMPPD:
16654 24 : case IX86_BUILTIN_CMPPS:
16655 24 : case IX86_BUILTIN_CMPPD256:
16656 24 : case IX86_BUILTIN_CMPPS256:
16657 24 : if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
16658 : break;
16659 : /* FALLTHRU */
16660 49 : case IX86_BUILTIN_CMPEQPD:
16661 49 : case IX86_BUILTIN_CMPEQPS:
16662 49 : if (initializer_zerop (gimple_call_arg (def_stmt, 0))
16663 49 : && initializer_zerop (gimple_call_arg (def_stmt,
16664 : 1)))
16665 49 : op0 = pc_rtx;
16666 : break;
16667 : default:
16668 : break;
16669 : }
16670 : }
16671 : }
16672 : }
16673 :
16674 1004 : pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
16675 1004 : if (! pat)
16676 0 : return const0_rtx;
16677 1004 : emit_insn (pat);
16678 :
16679 1004 : switch (fcode)
16680 : {
16681 24 : case IX86_BUILTIN_GATHER3DIV16SF:
16682 24 : if (target == NULL_RTX)
16683 0 : target = gen_reg_rtx (V8SFmode);
16684 24 : emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
16685 24 : break;
16686 24 : case IX86_BUILTIN_GATHER3DIV16SI:
16687 24 : if (target == NULL_RTX)
16688 0 : target = gen_reg_rtx (V8SImode);
16689 24 : emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
16690 24 : break;
16691 28 : case IX86_BUILTIN_GATHER3DIV8SF:
16692 28 : case IX86_BUILTIN_GATHERDIV8SF:
16693 28 : if (target == NULL_RTX)
16694 0 : target = gen_reg_rtx (V4SFmode);
16695 28 : emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
16696 28 : break;
16697 28 : case IX86_BUILTIN_GATHER3DIV8SI:
16698 28 : case IX86_BUILTIN_GATHERDIV8SI:
16699 28 : if (target == NULL_RTX)
16700 0 : target = gen_reg_rtx (V4SImode);
16701 28 : emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
16702 28 : break;
16703 : default:
16704 : target = subtarget;
16705 : break;
16706 : }
16707 : return target;
16708 :
16709 623 : scatter_gen:
16710 623 : arg0 = CALL_EXPR_ARG (exp, 0);
16711 623 : arg1 = CALL_EXPR_ARG (exp, 1);
16712 623 : arg2 = CALL_EXPR_ARG (exp, 2);
16713 623 : arg3 = CALL_EXPR_ARG (exp, 3);
16714 623 : arg4 = CALL_EXPR_ARG (exp, 4);
16715 623 : op0 = expand_normal (arg0);
16716 623 : op1 = ix86_expand_unsigned_small_int_cst_argument (arg1);
16717 623 : op2 = expand_normal (arg2);
16718 623 : op3 = expand_normal (arg3);
16719 623 : op4 = expand_normal (arg4);
16720 623 : mode1 = insn_data[icode].operand[1].mode;
16721 623 : mode2 = insn_data[icode].operand[2].mode;
16722 623 : mode3 = insn_data[icode].operand[3].mode;
16723 623 : mode4 = insn_data[icode].operand[4].mode;
16724 :
16725 : /* Scatter instruction stores operand op3 to memory with
16726 : indices from op2 and scale from op4 under writemask op1.
16727 : If index operand op2 has more elements then source operand
16728 : op3 one need to use only its low half. And vice versa. */
16729 623 : switch (fcode)
16730 : {
16731 24 : case IX86_BUILTIN_SCATTERALTSIV8DF:
16732 24 : case IX86_BUILTIN_SCATTERALTSIV8DI:
16733 24 : half = gen_reg_rtx (V8SImode);
16734 24 : if (!nonimmediate_operand (op2, V16SImode))
16735 0 : op2 = copy_to_mode_reg (V16SImode, op2);
16736 24 : emit_insn (gen_vec_extract_lo_v16si (half, op2));
16737 24 : op2 = half;
16738 24 : break;
16739 36 : case IX86_BUILTIN_SCATTERALTDIV16SF:
16740 36 : case IX86_BUILTIN_SCATTERALTDIV16SI:
16741 36 : half = gen_reg_rtx (mode3);
16742 36 : if (mode3 == V8SFmode)
16743 : gen = gen_vec_extract_lo_v16sf;
16744 : else
16745 24 : gen = gen_vec_extract_lo_v16si;
16746 36 : if (!nonimmediate_operand (op3, GET_MODE (op3)))
16747 0 : op3 = copy_to_mode_reg (GET_MODE (op3), op3);
16748 36 : emit_insn (gen (half, op3));
16749 36 : op3 = half;
16750 36 : break;
16751 8 : case IX86_BUILTIN_SCATTERALTSIV4DF:
16752 8 : case IX86_BUILTIN_SCATTERALTSIV4DI:
16753 8 : half = gen_reg_rtx (V4SImode);
16754 8 : if (!nonimmediate_operand (op2, V8SImode))
16755 0 : op2 = copy_to_mode_reg (V8SImode, op2);
16756 8 : emit_insn (gen_vec_extract_lo_v8si (half, op2));
16757 8 : op2 = half;
16758 8 : break;
16759 8 : case IX86_BUILTIN_SCATTERALTDIV8SF:
16760 8 : case IX86_BUILTIN_SCATTERALTDIV8SI:
16761 8 : half = gen_reg_rtx (mode3);
16762 8 : if (mode3 == V4SFmode)
16763 : gen = gen_vec_extract_lo_v8sf;
16764 : else
16765 4 : gen = gen_vec_extract_lo_v8si;
16766 8 : if (!nonimmediate_operand (op3, GET_MODE (op3)))
16767 0 : op3 = copy_to_mode_reg (GET_MODE (op3), op3);
16768 8 : emit_insn (gen (half, op3));
16769 8 : op3 = half;
16770 8 : break;
16771 16 : case IX86_BUILTIN_SCATTERALTSIV2DF:
16772 16 : case IX86_BUILTIN_SCATTERALTSIV2DI:
16773 16 : if (!nonimmediate_operand (op2, V4SImode))
16774 0 : op2 = copy_to_mode_reg (V4SImode, op2);
16775 : break;
16776 16 : case IX86_BUILTIN_SCATTERALTDIV4SF:
16777 16 : case IX86_BUILTIN_SCATTERALTDIV4SI:
16778 16 : if (!nonimmediate_operand (op3, GET_MODE (op3)))
16779 0 : op3 = copy_to_mode_reg (GET_MODE (op3), op3);
16780 : break;
16781 : default:
16782 : break;
16783 : }
16784 :
16785 : /* Force memory operand only with base register here. But we
16786 : don't want to do it on memory operand for other builtin
16787 : functions. */
16788 633 : op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
16789 :
16790 628 : if (!insn_data[icode].operand[0].predicate (op0, Pmode))
16791 0 : op0 = copy_to_mode_reg (Pmode, op0);
16792 :
16793 623 : op1 = fixup_modeless_constant (op1, mode1);
16794 :
16795 623 : if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
16796 : {
16797 607 : if (!insn_data[icode].operand[1].predicate (op1, mode1))
16798 273 : op1 = copy_to_mode_reg (mode1, op1);
16799 : }
16800 : else
16801 : {
16802 16 : op1 = copy_to_reg (op1);
16803 16 : op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
16804 : }
16805 :
16806 623 : if (!insn_data[icode].operand[2].predicate (op2, mode2))
16807 57 : op2 = copy_to_mode_reg (mode2, op2);
16808 :
16809 623 : if (!insn_data[icode].operand[3].predicate (op3, mode3))
16810 82 : op3 = copy_to_mode_reg (mode3, op3);
16811 :
16812 623 : if (!insn_data[icode].operand[4].predicate (op4, mode4))
16813 : {
16814 0 : error ("the last argument must be scale 1, 2, 4, 8");
16815 0 : return const0_rtx;
16816 : }
16817 :
16818 623 : pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
16819 623 : if (! pat)
16820 0 : return const0_rtx;
16821 :
16822 623 : emit_insn (pat);
16823 623 : return 0;
16824 :
16825 23 : case IX86_BUILTIN_XABORT:
16826 23 : icode = CODE_FOR_xabort;
16827 23 : arg0 = CALL_EXPR_ARG (exp, 0);
16828 23 : op0 = expand_normal (arg0);
16829 23 : mode0 = insn_data[icode].operand[0].mode;
16830 23 : if (!insn_data[icode].operand[0].predicate (op0, mode0))
16831 : {
16832 0 : error ("the argument to %<xabort%> intrinsic must "
16833 : "be an 8-bit immediate");
16834 0 : return const0_rtx;
16835 : }
16836 23 : emit_insn (gen_xabort (op0));
16837 23 : return 0;
16838 :
16839 55 : case IX86_BUILTIN_RDSSPD:
16840 55 : case IX86_BUILTIN_RDSSPQ:
16841 55 : mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
16842 :
16843 55 : if (target == 0
16844 55 : || !register_operand (target, mode))
16845 0 : target = gen_reg_rtx (mode);
16846 :
16847 55 : op0 = force_reg (mode, const0_rtx);
16848 :
16849 55 : emit_insn (gen_rdssp (mode, target, op0));
16850 55 : return target;
16851 :
16852 55 : case IX86_BUILTIN_INCSSPD:
16853 55 : case IX86_BUILTIN_INCSSPQ:
16854 55 : mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
16855 :
16856 55 : arg0 = CALL_EXPR_ARG (exp, 0);
16857 55 : op0 = expand_normal (arg0);
16858 :
16859 55 : op0 = force_reg (mode, op0);
16860 :
16861 55 : emit_insn (gen_incssp (mode, op0));
16862 55 : return 0;
16863 :
16864 20 : case IX86_BUILTIN_HRESET:
16865 20 : icode = CODE_FOR_hreset;
16866 20 : arg0 = CALL_EXPR_ARG (exp, 0);
16867 20 : op0 = expand_normal (arg0);
16868 20 : op0 = force_reg (SImode, op0);
16869 20 : emit_insn (gen_hreset (op0));
16870 20 : return 0;
16871 :
16872 38 : case IX86_BUILTIN_RSTORSSP:
16873 38 : case IX86_BUILTIN_CLRSSBSY:
16874 38 : arg0 = CALL_EXPR_ARG (exp, 0);
16875 38 : op0 = expand_normal (arg0);
16876 19 : icode = (fcode == IX86_BUILTIN_RSTORSSP
16877 38 : ? CODE_FOR_rstorssp
16878 : : CODE_FOR_clrssbsy);
16879 :
16880 38 : if (!address_operand (op0, VOIDmode))
16881 : {
16882 18 : op0 = convert_memory_address (Pmode, op0);
16883 18 : op0 = copy_addr_to_reg (op0);
16884 : }
16885 38 : emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
16886 38 : return 0;
16887 :
16888 80 : case IX86_BUILTIN_WRSSD:
16889 80 : case IX86_BUILTIN_WRSSQ:
16890 80 : case IX86_BUILTIN_WRUSSD:
16891 80 : case IX86_BUILTIN_WRUSSQ:
16892 80 : mode = ((fcode == IX86_BUILTIN_WRSSD
16893 80 : || fcode == IX86_BUILTIN_WRUSSD)
16894 80 : ? SImode : DImode);
16895 :
16896 80 : arg0 = CALL_EXPR_ARG (exp, 0);
16897 80 : op0 = expand_normal (arg0);
16898 80 : arg1 = CALL_EXPR_ARG (exp, 1);
16899 80 : op1 = expand_normal (arg1);
16900 :
16901 80 : op0 = force_reg (mode, op0);
16902 :
16903 80 : if (!address_operand (op1, VOIDmode))
16904 : {
16905 36 : op1 = convert_memory_address (Pmode, op1);
16906 36 : op1 = copy_addr_to_reg (op1);
16907 : }
16908 80 : op1 = gen_rtx_MEM (mode, op1);
16909 :
16910 80 : icode = ((fcode == IX86_BUILTIN_WRSSD
16911 80 : || fcode == IX86_BUILTIN_WRSSQ)
16912 80 : ? code_for_wrss (mode)
16913 40 : : code_for_wruss (mode));
16914 80 : emit_insn (GEN_FCN (icode) (op0, op1));
16915 :
16916 80 : return 0;
16917 :
16918 116625 : default:
16919 116625 : break;
16920 : }
16921 :
16922 116625 : if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
16923 116625 : && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
16924 : {
16925 27059 : i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
16926 27059 : return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
16927 27059 : target);
16928 : }
16929 :
16930 89566 : if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
16931 89566 : && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
16932 : {
16933 93 : i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
16934 93 : return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
16935 93 : target);
16936 : }
16937 :
16938 89473 : if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
16939 89473 : && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
16940 : {
16941 71052 : i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
16942 :
16943 71052 : switch (fcode)
16944 : {
16945 0 : case IX86_BUILTIN_RDPID:
16946 0 : return ix86_expand_special_args_builtin (bdesc_args + i, exp,
16947 0 : target);
16948 74 : case IX86_BUILTIN_VCOMISBF16EQ:
16949 74 : case IX86_BUILTIN_VCOMISBF16NE:
16950 74 : case IX86_BUILTIN_VCOMISBF16GT:
16951 74 : case IX86_BUILTIN_VCOMISBF16GE:
16952 74 : case IX86_BUILTIN_VCOMISBF16LT:
16953 74 : case IX86_BUILTIN_VCOMISBF16LE:
16954 74 : return ix86_expand_sse_comi (bdesc_args + i, exp, target, false);
16955 15 : case IX86_BUILTIN_FABSQ:
16956 15 : case IX86_BUILTIN_COPYSIGNQ:
16957 15 : if (!TARGET_SSE)
16958 : /* Emit a normal call if SSE isn't available. */
16959 0 : return expand_call (exp, target, ignore);
16960 : /* FALLTHRU */
16961 70978 : default:
16962 70978 : return ix86_expand_args_builtin (bdesc_args + i, exp, target);
16963 : }
16964 : }
16965 :
16966 18421 : if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
16967 18421 : && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
16968 : {
16969 473 : i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
16970 473 : return ix86_expand_sse_comi (bdesc_comi + i, exp, target, true);
16971 : }
16972 :
16973 17948 : if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
16974 17948 : && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
16975 : {
16976 15604 : i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
16977 15604 : return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
16978 : }
16979 :
16980 2344 : if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
16981 2344 : && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
16982 : {
16983 216 : i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
16984 216 : return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
16985 : }
16986 :
16987 2128 : if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
16988 2128 : && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
16989 : {
16990 275 : i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
16991 275 : return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
16992 : }
16993 :
16994 1853 : if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
16995 1853 : && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
16996 : {
16997 1815 : i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
16998 1815 : const struct builtin_description *d = bdesc_multi_arg + i;
16999 1815 : return ix86_expand_multi_arg_builtin (d->icode, exp, target,
17000 : (enum ix86_builtin_func_type)
17001 1815 : d->flag, d->comparison);
17002 : }
17003 :
17004 38 : if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
17005 38 : && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
17006 : {
17007 38 : i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
17008 38 : return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
17009 38 : target);
17010 : }
17011 :
17012 0 : gcc_unreachable ();
17013 : }
17014 :
/* Strategies for materializing a vector whose elements are all the same
   SImode constant without loading it from memory; see
   ix86_vector_duplicate_simode_const below, where shifts are handled,
   for how each strategy is expanded.  */
enum ix86_vec_bcast_alg
{
  VEC_BCAST_PXOR,	/* All-zeros: move of CONST0 (pxor idiom).  */
  VEC_BCAST_PCMPEQ,	/* All-ones: move of CONSTM1 (pcmpeq idiom).  */
  VEC_BCAST_PABSB,	/* 0x01010101: byte-wise abs of all-ones.  */
  VEC_BCAST_PADDB,	/* 0xfefefefe: byte-wise all-ones + all-ones.  */
  VEC_BCAST_PSRLW,	/* Logical right shift of all-ones words.  */
  VEC_BCAST_PSRLD,	/* Logical right shift of all-ones dwords.  */
  VEC_BCAST_PSLLW,	/* Left shift of all-ones words.  */
  VEC_BCAST_PSLLD	/* Left shift of all-ones dwords.  */
};
17027 :
/* One entry of the lookup table mapping an SImode constant to the
   instruction sequence that broadcasts it.  */
struct ix86_vec_bcast_map_simode_t
{
  unsigned int key;		/* The 32-bit constant to materialize;
				   bsearch key, table is sorted on it.  */
  enum ix86_vec_bcast_alg alg;	/* Strategy used to synthesize it.  */
  unsigned int arg;		/* Shift count for the PSRL*/PSLL*
				   strategies; 0 (unused) otherwise.  */
};
17034 :
/* Map from a 32-bit constant to the broadcast strategy (and shift count)
   that materializes it; consumed by ix86_vector_duplicate_simode_const.
   This table must be kept sorted by increasing key as values are
   looked-up using bsearch.  */
static const ix86_vec_bcast_map_simode_t ix86_vec_bcast_map_simode[] = {
  { 0x00000000, VEC_BCAST_PXOR,    0 },
  { 0x00000001, VEC_BCAST_PSRLD,  31 },
  { 0x00000003, VEC_BCAST_PSRLD,  30 },
  { 0x00000007, VEC_BCAST_PSRLD,  29 },
  { 0x0000000f, VEC_BCAST_PSRLD,  28 },
  { 0x0000001f, VEC_BCAST_PSRLD,  27 },
  { 0x0000003f, VEC_BCAST_PSRLD,  26 },
  { 0x0000007f, VEC_BCAST_PSRLD,  25 },
  { 0x000000ff, VEC_BCAST_PSRLD,  24 },
  { 0x000001ff, VEC_BCAST_PSRLD,  23 },
  { 0x000003ff, VEC_BCAST_PSRLD,  22 },
  { 0x000007ff, VEC_BCAST_PSRLD,  21 },
  { 0x00000fff, VEC_BCAST_PSRLD,  20 },
  { 0x00001fff, VEC_BCAST_PSRLD,  19 },
  { 0x00003fff, VEC_BCAST_PSRLD,  18 },
  { 0x00007fff, VEC_BCAST_PSRLD,  17 },
  { 0x0000ffff, VEC_BCAST_PSRLD,  16 },
  { 0x00010001, VEC_BCAST_PSRLW,  15 },
  { 0x0001ffff, VEC_BCAST_PSRLD,  15 },
  { 0x00030003, VEC_BCAST_PSRLW,  14 },
  { 0x0003ffff, VEC_BCAST_PSRLD,  14 },
  { 0x00070007, VEC_BCAST_PSRLW,  13 },
  { 0x0007ffff, VEC_BCAST_PSRLD,  13 },
  { 0x000f000f, VEC_BCAST_PSRLW,  12 },
  { 0x000fffff, VEC_BCAST_PSRLD,  12 },
  { 0x001f001f, VEC_BCAST_PSRLW,  11 },
  { 0x001fffff, VEC_BCAST_PSRLD,  11 },
  { 0x003f003f, VEC_BCAST_PSRLW,  10 },
  { 0x003fffff, VEC_BCAST_PSRLD,  10 },
  { 0x007f007f, VEC_BCAST_PSRLW,   9 },
  { 0x007fffff, VEC_BCAST_PSRLD,   9 },
  { 0x00ff00ff, VEC_BCAST_PSRLW,   8 },
  { 0x00ffffff, VEC_BCAST_PSRLD,   8 },
  { 0x01010101, VEC_BCAST_PABSB,   0 },
  { 0x01ff01ff, VEC_BCAST_PSRLW,   7 },
  { 0x01ffffff, VEC_BCAST_PSRLD,   7 },
  { 0x03ff03ff, VEC_BCAST_PSRLW,   6 },
  { 0x03ffffff, VEC_BCAST_PSRLD,   6 },
  { 0x07ff07ff, VEC_BCAST_PSRLW,   5 },
  { 0x07ffffff, VEC_BCAST_PSRLD,   5 },
  { 0x0fff0fff, VEC_BCAST_PSRLW,   4 },
  { 0x0fffffff, VEC_BCAST_PSRLD,   4 },
  { 0x1fff1fff, VEC_BCAST_PSRLW,   3 },
  { 0x1fffffff, VEC_BCAST_PSRLD,   3 },
  { 0x3fff3fff, VEC_BCAST_PSRLW,   2 },
  { 0x3fffffff, VEC_BCAST_PSRLD,   2 },
  { 0x7fff7fff, VEC_BCAST_PSRLW,   1 },
  { 0x7fffffff, VEC_BCAST_PSRLD,   1 },
  { 0x80000000, VEC_BCAST_PSLLD,  31 },
  { 0x80008000, VEC_BCAST_PSLLW,  15 },
  { 0xc0000000, VEC_BCAST_PSLLD,  30 },
  { 0xc000c000, VEC_BCAST_PSLLW,  14 },
  { 0xe0000000, VEC_BCAST_PSLLD,  29 },
  { 0xe000e000, VEC_BCAST_PSLLW,  13 },
  { 0xf0000000, VEC_BCAST_PSLLD,  28 },
  { 0xf000f000, VEC_BCAST_PSLLW,  12 },
  { 0xf8000000, VEC_BCAST_PSLLD,  27 },
  { 0xf800f800, VEC_BCAST_PSLLW,  11 },
  { 0xfc000000, VEC_BCAST_PSLLD,  26 },
  { 0xfc00fc00, VEC_BCAST_PSLLW,  10 },
  { 0xfe000000, VEC_BCAST_PSLLD,  25 },
  { 0xfe00fe00, VEC_BCAST_PSLLW,   9 },
  { 0xfefefefe, VEC_BCAST_PADDB,   0 },
  { 0xff000000, VEC_BCAST_PSLLD,  24 },
  { 0xff00ff00, VEC_BCAST_PSLLW,   8 },
  { 0xff800000, VEC_BCAST_PSLLD,  23 },
  { 0xff80ff80, VEC_BCAST_PSLLW,   7 },
  { 0xffc00000, VEC_BCAST_PSLLD,  22 },
  { 0xffc0ffc0, VEC_BCAST_PSLLW,   6 },
  { 0xffe00000, VEC_BCAST_PSLLD,  21 },
  { 0xffe0ffe0, VEC_BCAST_PSLLW,   5 },
  { 0xfff00000, VEC_BCAST_PSLLD,  20 },
  { 0xfff0fff0, VEC_BCAST_PSLLW,   4 },
  { 0xfff80000, VEC_BCAST_PSLLD,  19 },
  { 0xfff8fff8, VEC_BCAST_PSLLW,   3 },
  { 0xfffc0000, VEC_BCAST_PSLLD,  18 },
  { 0xfffcfffc, VEC_BCAST_PSLLW,   2 },
  { 0xfffe0000, VEC_BCAST_PSLLD,  17 },
  { 0xfffefffe, VEC_BCAST_PSLLW,   1 },
  { 0xffff0000, VEC_BCAST_PSLLD,  16 },
  { 0xffff8000, VEC_BCAST_PSLLD,  15 },
  { 0xffffc000, VEC_BCAST_PSLLD,  14 },
  { 0xffffe000, VEC_BCAST_PSLLD,  13 },
  { 0xfffff000, VEC_BCAST_PSLLD,  12 },
  { 0xfffff800, VEC_BCAST_PSLLD,  11 },
  { 0xfffffc00, VEC_BCAST_PSLLD,  10 },
  { 0xfffffe00, VEC_BCAST_PSLLD,   9 },
  { 0xffffff00, VEC_BCAST_PSLLD,   8 },
  { 0xffffff80, VEC_BCAST_PSLLD,   7 },
  { 0xffffffc0, VEC_BCAST_PSLLD,   6 },
  { 0xffffffe0, VEC_BCAST_PSLLD,   5 },
  { 0xfffffff0, VEC_BCAST_PSLLD,   4 },
  { 0xfffffff8, VEC_BCAST_PSLLD,   3 },
  { 0xfffffffc, VEC_BCAST_PSLLD,   2 },
  { 0xfffffffe, VEC_BCAST_PSLLD,   1 },
  { 0xffffffff, VEC_BCAST_PCMPEQ,  0 }
};
17134 :
17135 : /* Comparator for bsearch on ix86_vec_bcast_map. */
17136 : static int
17137 292957 : ix86_vec_bcast_map_simode_cmp (const void *key, const void *entry)
17138 : {
17139 292957 : return (*(const unsigned int*)key)
17140 292957 : - ((const ix86_vec_bcast_map_simode_t*)entry)->key;
17141 : }
17142 :
/* A subroutine of ix86_vector_duplicate_value.  Tries to efficiently
   materialize V4SImode, V8SImode and V16SImode vectors whose elements
   are all the SImode constant VAL, without loading the constant from
   memory.  MODE is the vector mode of TARGET.  On success the
   instructions are emitted into the current sequence and true is
   returned; returns false when VAL has no entry in the strategy table
   or the ISA required for MODE (SSE2/AVX2/AVX512F/AVX512BW) is not
   enabled.  */
static bool
ix86_vector_duplicate_simode_const (machine_mode mode, rtx target,
				    unsigned int val)
{
  const ix86_vec_bcast_map_simode_t *entry;
  rtx tmp1, tmp2;

  /* Binary-search the sorted strategy table for VAL.  */
  entry = (const ix86_vec_bcast_map_simode_t*)
	  bsearch(&val, ix86_vec_bcast_map_simode,
		  ARRAY_SIZE (ix86_vec_bcast_map_simode),
		  sizeof (ix86_vec_bcast_map_simode_t),
		  ix86_vec_bcast_map_simode_cmp);
  if (!entry)
    return false;

  switch (entry->alg)
    {
    case VEC_BCAST_PXOR:
      /* VAL == 0: emit a move of vector zero.  */
      if ((mode == V8SImode && !TARGET_AVX2)
	  || (mode == V16SImode && !TARGET_AVX512F))
	return false;
      emit_move_insn (target, CONST0_RTX (mode));
      return true;

    case VEC_BCAST_PCMPEQ:
      /* VAL == 0xffffffff: emit a move of vector all-ones.  */
      if ((mode == V4SImode && !TARGET_SSE2)
	  || (mode == V8SImode && !TARGET_AVX2)
	  || (mode == V16SImode && !TARGET_AVX512F))
	return false;
      emit_move_insn (target, CONSTM1_RTX (mode));
      return true;

    case VEC_BCAST_PABSB:
      /* VAL == 0x01010101: byte-wise absolute value of all-ones
	 yields 0x01 in every byte.  */
      if (mode == V4SImode && TARGET_SSE2)
	{
	  tmp1 = gen_reg_rtx (V16QImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V16QImode));
	  tmp2 = gen_reg_rtx (V16QImode);
	  emit_insn (gen_absv16qi2 (tmp2, tmp1));
	}
      else if (mode == V8SImode && TARGET_AVX2)
	{
	  tmp1 = gen_reg_rtx (V32QImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V32QImode));
	  tmp2 = gen_reg_rtx (V32QImode);
	  emit_insn (gen_absv32qi2 (tmp2, tmp1));
	}
      else if (mode == V16SImode && TARGET_AVX512BW)
	{
	  tmp1 = gen_reg_rtx (V64QImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V64QImode));
	  tmp2 = gen_reg_rtx (V64QImode);
	  emit_insn (gen_absv64qi2 (tmp2, tmp1));
	}
      else
	return false;
      break;

    case VEC_BCAST_PADDB:
      /* VAL == 0xfefefefe: byte-wise -1 + -1 == -2 == 0xfe in every
	 byte.  */
      if (mode == V4SImode && TARGET_SSE2)
	{
	  tmp1 = gen_reg_rtx (V16QImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V16QImode));
	  tmp2 = gen_reg_rtx (V16QImode);
	  emit_insn (gen_addv16qi3 (tmp2, tmp1, tmp1));
	}
      else if (mode == V8SImode && TARGET_AVX2)
	{
	  tmp1 = gen_reg_rtx (V32QImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V32QImode));
	  tmp2 = gen_reg_rtx (V32QImode);
	  emit_insn (gen_addv32qi3 (tmp2, tmp1, tmp1));
	}
      else if (mode == V16SImode && TARGET_AVX512BW)
	{
	  tmp1 = gen_reg_rtx (V64QImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V64QImode));
	  tmp2 = gen_reg_rtx (V64QImode);
	  emit_insn (gen_addv64qi3 (tmp2, tmp1, tmp1));
	}
      else
	return false;
      break;

    case VEC_BCAST_PSRLW:
      /* Logical right shift of all-ones HImode elements by
	 entry->arg bits.  */
      if (mode == V4SImode && TARGET_SSE2)
	{
	  tmp1 = gen_reg_rtx (V8HImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V8HImode));
	  tmp2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_lshrv8hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
	}
      else if (mode == V8SImode && TARGET_AVX2)
	{
	  tmp1 = gen_reg_rtx (V16HImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V16HImode));
	  tmp2 = gen_reg_rtx (V16HImode);
	  emit_insn (gen_lshrv16hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
	}
      else if (mode == V16SImode && TARGET_AVX512BW)
	{
	  tmp1 = gen_reg_rtx (V32HImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V32HImode));
	  tmp2 = gen_reg_rtx (V32HImode);
	  emit_insn (gen_lshrv32hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
	}
      else
	return false;
      break;

    case VEC_BCAST_PSRLD:
      /* Logical right shift of all-ones SImode elements; the shift
	 already operates in MODE, so write TARGET directly.  */
      if (mode == V4SImode && TARGET_SSE2)
	{
	  tmp1 = gen_reg_rtx (V4SImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V4SImode));
	  emit_insn (gen_lshrv4si3 (target, tmp1, GEN_INT (entry->arg)));
	  return true;
	}
      else if (mode == V8SImode && TARGET_AVX2)
	{
	  tmp1 = gen_reg_rtx (V8SImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V8SImode));
	  emit_insn (gen_lshrv8si3 (target, tmp1, GEN_INT (entry->arg)));
	  return true;
	}
      else if (mode == V16SImode && TARGET_AVX512F)
	{
	  tmp1 = gen_reg_rtx (V16SImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V16SImode));
	  emit_insn (gen_lshrv16si3 (target, tmp1, GEN_INT (entry->arg)));
	  return true;
	}
      else
	return false;
      break;

    case VEC_BCAST_PSLLW:
      /* Left shift of all-ones HImode elements by entry->arg bits.  */
      if (mode == V4SImode && TARGET_SSE2)
	{
	  tmp1 = gen_reg_rtx (V8HImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V8HImode));
	  tmp2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_ashlv8hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
	}
      else if (mode == V8SImode && TARGET_AVX2)
	{
	  tmp1 = gen_reg_rtx (V16HImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V16HImode));
	  tmp2 = gen_reg_rtx (V16HImode);
	  emit_insn (gen_ashlv16hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
	}
      else if (mode == V16SImode && TARGET_AVX512BW)
	{
	  tmp1 = gen_reg_rtx (V32HImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V32HImode));
	  tmp2 = gen_reg_rtx (V32HImode);
	  emit_insn (gen_ashlv32hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
	}
      else
	return false;
      break;

    case VEC_BCAST_PSLLD:
      /* Left shift of all-ones SImode elements; operates directly in
	 MODE, so write TARGET and return.  */
      if (mode == V4SImode && TARGET_SSE2)
	{
	  tmp1 = gen_reg_rtx (V4SImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V4SImode));
	  emit_insn (gen_ashlv4si3 (target, tmp1, GEN_INT (entry->arg)));
	  return true;
	}
      else if (mode == V8SImode && TARGET_AVX2)
	{
	  tmp1 = gen_reg_rtx (V8SImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V8SImode));
	  emit_insn (gen_ashlv8si3 (target, tmp1, GEN_INT (entry->arg)));
	  return true;
	}
      else if (mode == V16SImode && TARGET_AVX512F)
	{
	  tmp1 = gen_reg_rtx (V16SImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V16SImode));
	  emit_insn (gen_ashlv16si3 (target, tmp1, GEN_INT (entry->arg)));
	  return true;
	}
      else
	return false;

    default:
      return false;
    }

  /* The QImode/HImode vector in TMP2 already holds the desired bit
     pattern; reinterpret it in MODE and copy it to TARGET.  */
  emit_move_insn (target, gen_lowpart (mode, tmp2));
  return true;
}
17340 :
/* A subroutine of ix86_expand_vector_init_duplicate.  Tries to
   fill target with val via vec_duplicate.  Except for the SImode
   constant shortcut below, this always succeeds (and asserts that
   the final broadcast insn is recognized).  */

static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* For SImode-element vectors with a constant element, first try the
     dedicated constant expander, which may emit a cheaper sequence
     (e.g. the shift-of-all-ones tricks).  If it declines, fall through
     to the generic vec_duplicate path.  */
  if ((mode == V4SImode || mode == V8SImode || mode == V16SImode)
      && CONST_INT_P (val)
      && ix86_vector_duplicate_simode_const (mode, target, INTVAL (val)))
    return true;

  /* Save/restore recog_data in case this is called from splitters
     or other routines where recog_data needs to stay valid across
     force_reg.  See PR106577.  */
  recog_data_d recog_data_save = recog_data;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register or mem.  */

      start_sequence ();

      /* For a wide (>= 128-bit) vector with a constant element, when the
	 tuning says not to prefer broadcasting from an integer register,
	 put the scalar constant in the constant pool and broadcast from
	 memory instead.  */
      if (!TARGET_PREFER_BCST_FROM_INTEGER && CONST_INT_P (val)
	  && GET_MODE_BITSIZE (innermode) <= HOST_BITS_PER_WIDE_INT
	  && GET_MODE_BITSIZE(mode) >= 128)
	reg = validize_mem (force_const_mem (innermode, val));
      else
	{
	  reg = force_reg (innermode, val);
	  if (GET_MODE (reg) != innermode)
	    reg = gen_lowpart (innermode, reg);
	}

      /* Patch the already-emitted insn in place to broadcast the new
	 operand, and emit the sequence that computes it just before.  */
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = end_sequence ();
      if (seq)
	emit_insn_before (seq, insn);

      /* The patched broadcast must now match an insn pattern.  */
      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }
  recog_data = recog_data_save;
  return true;
}
17396 :
17397 : /* Get a vector mode of the same size as the original but with elements
17398 : twice as wide. This is only guaranteed to apply to integral vectors. */
17399 :
17400 : static machine_mode
17401 18868 : get_mode_wider_vector (machine_mode o)
17402 : {
17403 : /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
17404 18868 : machine_mode n = GET_MODE_NEXT_MODE (o).require ();
17405 56604 : gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
17406 56604 : gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
17407 18868 : return n;
17408 : }
17409 :
17410 : static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
17411 : static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
17412 :
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   with all elements equal to VAR.  Return true if successful.

   MMX_OK says whether plain MMX modes (V2SI etc.) may be used.  The
   strategy per mode is: broadcast directly via ix86_vector_duplicate_value
   when an insn exists; otherwise either pack two narrow elements into a
   wider scalar and recurse on the wider-element vector ("widen"), build a
   one-element vector and permute-broadcast it ("permute"), or recurse on
   the half-width vector and VEC_CONCAT it with itself.  */

bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
				   rtx target, rtx val)
{
  bool ok;

  switch (mode)
    {
    case E_V2DImode:
      /* If the 64-bit constant is two identical 32-bit halves, it is
	 cheaper to broadcast the 32-bit half as V4SI and reinterpret.  */
      if (CONST_INT_P (val))
	{
	  int tmp = (int)INTVAL (val);
	  if (tmp == (int)(INTVAL (val) >> 32))
	    {
	      rtx reg = gen_reg_rtx (V4SImode);
	      ok = ix86_vector_duplicate_value (V4SImode, reg,
						GEN_INT (tmp));
	      if (ok)
		{
		  emit_move_insn (target, gen_lowpart (V2DImode, reg));
		  return true;
		}
	    }
	}
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V4DImode:
      /* Same 32-bit-halves trick as V2DImode, via V8SImode.  */
      if (CONST_INT_P (val))
	{
	  int tmp = (int)INTVAL (val);
	  if (tmp == (int)(INTVAL (val) >> 32))
	    {
	      rtx reg = gen_reg_rtx (V8SImode);
	      ok = ix86_vector_duplicate_value (V8SImode, reg,
						GEN_INT (tmp));
	      if (ok)
		{
		  emit_move_insn (target, gen_lowpart (V4DImode, reg));
		  return true;
		}
	    }
	}
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V8DImode:
      /* Same 32-bit-halves trick as V2DImode, via V16SImode.  */
      if (CONST_INT_P (val))
	{
	  int tmp = (int)INTVAL (val);
	  if (tmp == (int)(INTVAL (val) >> 32))
	    {
	      rtx reg = gen_reg_rtx (V16SImode);
	      ok = ix86_vector_duplicate_value (V16SImode, reg,
						GEN_INT (tmp));
	      if (ok)
		{
		  emit_move_insn (target, gen_lowpart (V8DImode, reg));
		  return true;
		}
	    }
	}
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V2SImode:
    case E_V2SFmode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V4DFmode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V2DFmode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DFmode:
      /* These modes have a direct broadcast expansion.  */
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V4HImode:
      if (!mmx_ok)
	return false;
      if (TARGET_SSE || TARGET_3DNOW_A)
	{
	  rtx x;

	  /* Duplicate the low HImode part of VAL; constants are
	     rejected here and handled by the widen path instead.  */
	  val = gen_lowpart (SImode, val);
	  if (CONST_INT_P (val))
	    return false;
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      goto widen;

    case E_V4HFmode:
    case E_V4BFmode:
      if (TARGET_MMX_WITH_SSE)
	{
	  val = force_reg (GET_MODE_INNER (mode), val);
	  rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      return false;

    case E_V2HImode:
      if (TARGET_SSE2)
	{
	  rtx x;

	  /* As for V4HImode: truncate-and-duplicate a register value;
	     constants are not handled on this path.  */
	  val = gen_lowpart (SImode, val);
	  if (CONST_INT_P (val))
	    return false;
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      return false;

    case E_V2HFmode:
    case E_V2BFmode:
      if (TARGET_SSE2)
	{
	  val = force_reg (GET_MODE_INNER (mode), val);
	  rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      return false;

    case E_V8QImode:
    case E_V4QImode:
      if (!mmx_ok)
	return false;
      goto widen;

    case E_V8HImode:
      /* Constants are cheaper to widen into an SImode constant.  */
      if (CONST_INT_P (val))
	goto widen;
      /* FALLTHRU */

    case E_V8HFmode:
    case E_V8BFmode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	{
	  struct expand_vec_perm_d dperm;
	  rtx tmp1, tmp2;

	  /* Broadcast by inserting VAL into element 0 of a zero vector
	     and expanding a one-operand broadcast permutation.  Also
	     reached via "goto permute" from the V16QImode case.  */
	permute:
	  memset (&dperm, 0, sizeof (dperm));
	  dperm.target = target;
	  dperm.vmode = mode;
	  dperm.nelt = GET_MODE_NUNITS (mode);
	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
	  dperm.one_operand_p = true;

	  if (mode == V8HFmode || mode == V8BFmode)
	    {
	      tmp1 = force_reg (GET_MODE_INNER (mode), val);
	      tmp2 = gen_reg_rtx (mode);
	      emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
	      tmp1 = gen_lowpart (mode, tmp2);
	    }
	  else
	    {
	      /* Extend to SImode using a paradoxical SUBREG.  */
	      tmp1 = gen_reg_rtx (SImode);
	      emit_move_insn (tmp1, gen_lowpart (SImode, val));

	      /* Insert the SImode value as
		 low element of a V4SImode vector.  */
	      tmp2 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
	      tmp1 = gen_lowpart (mode, tmp2);
	    }

	  emit_move_insn (dperm.op0, tmp1);
	  ok = (expand_vec_perm_1 (&dperm)
		|| expand_vec_perm_broadcast_1 (&dperm));
	  gcc_assert (ok);
	  return ok;
	}
      goto widen;

    case E_V16QImode:
      if (CONST_INT_P (val))
	goto widen;
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	goto permute;
      goto widen;

    widen:
      /* Replicate the value once into the next wider mode and recurse.  */
      {
	machine_mode smode, wsmode, wvmode;
	rtx x;

	smode = GET_MODE_INNER (mode);
	wvmode = get_mode_wider_vector (mode);
	wsmode = GET_MODE_INNER (wvmode);

	val = convert_modes (wsmode, smode, val, true);

	/* Build a wider scalar holding VAL in both halves:
	   (val << bits(smode)) | val.  */
	if (CONST_INT_P (val))
	  {
	    x = simplify_binary_operation (ASHIFT, wsmode, val,
					   GEN_INT (GET_MODE_BITSIZE (smode)));
	    val = simplify_binary_operation (IOR, wsmode, val, x);
	  }
	else if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
	  /* For bytes, duplicate the low byte into the high byte with
	     a single insert (insv) when partial-register stalls are
	     not a concern.  */
	  emit_insn (gen_insv_1 (wsmode, val, val));
	else
	  {
	    x = expand_simple_binop (wsmode, ASHIFT, val,
				     GEN_INT (GET_MODE_BITSIZE (smode)),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	    val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
				       OPTAB_LIB_WIDEN);
	  }

	x = gen_reg_rtx (wvmode);
	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
	if (!ok)
	  return false;
	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
	return true;
      }

    case E_V16HImode:
    case E_V32QImode:
      if (CONST_INT_P (val))
	goto widen;
      /* FALLTHRU */

    case E_V16HFmode:
    case E_V16BFmode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  /* Without AVX2, broadcast into the half-width vector and
	     concatenate it with itself.  */
	  machine_mode hvmode;
	  switch (mode)
	    {
	    case V16HImode:
	      hvmode = V8HImode;
	      break;
	    case V16HFmode:
	      hvmode = V8HFmode;
	      break;
	    case V16BFmode:
	      hvmode = V8BFmode;
	      break;
	    case V32QImode:
	      hvmode = V16QImode;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  if (!ok)
	    return false;

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    case E_V32HImode:
    case E_V64QImode:
      if (CONST_INT_P (val))
	goto widen;
      /* FALLTHRU */

    case E_V32HFmode:
    case E_V32BFmode:
      if (TARGET_AVX512BW)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  /* Without AVX512BW, broadcast into the half-width vector and
	     concatenate it with itself.  */
	  machine_mode hvmode;
	  switch (mode)
	    {
	    case V32HImode:
	      hvmode = V16HImode;
	      break;
	    case V32HFmode:
	      hvmode = V16HFmode;
	      break;
	    case V32BFmode:
	      hvmode = V16BFmode;
	      break;
	    case V64QImode:
	      hvmode = V32QImode;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  if (!ok)
	    return false;

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    default:
      return false;
    }
}
17739 :
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   whose ONE_VAR element is VAR, and other elements are zero.  Return true
   if successful.

   The first switch decides, per mode and ISA, whether a vector-set
   strategy applies (optionally with a dedicated set-element-0 expander
   in GEN_VEC_SET_0).  If not, the second switch falls back to
   VEC_CONCAT-with-zero, duplicate-and-merge plus shuffle, or widening
   recursion.  */

bool
ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
				     rtx target, rtx var, int one_var)
{
  machine_mode vsimode;
  rtx new_target;
  rtx x, tmp;
  bool use_vector_set = false;
  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;

  switch (mode)
    {
    case E_V2DImode:
      /* For SSE4.1, we normally use vector set.  But if the second
	 element is zero and inter-unit moves are OK, we use movq
	 instead.  */
      use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
			     && one_var == 0));
      break;
    case E_V16QImode:
    case E_V4SImode:
    case E_V4SFmode:
      use_vector_set = TARGET_SSE4_1;
      break;
    case E_V8HImode:
      use_vector_set = TARGET_SSE2;
      /* Only use the dedicated element-0 setter when inserting at 0.  */
      gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
		      ? gen_vec_setv8hi_0 : NULL;
      break;
    case E_V8QImode:
      use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
      use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
      break;
    case E_V4QImode:
      use_vector_set = TARGET_SSE4_1;
      break;
    case E_V32QImode:
      use_vector_set = TARGET_AVX;
      break;
    case E_V16HImode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
		      ? gen_vec_setv16hi_0 : NULL;
      break;
    case E_V8SImode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8si_0;
      break;
    case E_V8SFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8sf_0;
      break;
    case E_V4DFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv4df_0;
      break;
    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX && TARGET_64BIT;
      gen_vec_set_0 = gen_vec_setv4di_0;
      break;
    case E_V16SImode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16si_0;
      break;
    case E_V16SFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16sf_0;
      break;
    case E_V8DFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8df_0;
      break;
    case E_V8DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8di_0;
      break;
    case E_V8HFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8hf_0;
      break;
    case E_V16HFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16hf_0;
      break;
    case E_V32HFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv32hf_0;
      break;
    case E_V8BFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8bf_0;
      break;
    case E_V16BFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16bf_0;
      break;
    case E_V32BFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv32bf_0;
      break;
    case E_V32HImode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv32hi_0;
      /* NOTE(review): no break here, so this falls into default — harmless
	 since default only breaks, but looks unintentional; confirm.  */
    default:
      break;
    }

  if (use_vector_set)
    {
      /* Fast path: dedicated insert-into-element-0 pattern.  */
      if (gen_vec_set_0 && one_var == 0)
	{
	  var = force_reg (GET_MODE_INNER (mode), var);
	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
	  return true;
	}
      /* Otherwise zero the target and insert VAR at ONE_VAR.  */
      emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
      var = force_reg (GET_MODE_INNER (mode), var);
      ix86_expand_vector_set (mmx_ok, target, var, one_var);
      return true;
    }

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
      /* Two-element vectors: only element 0 is handled, as
	 (vec_concat var 0).  */
      if (one_var != 0)
	return false;
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
      emit_insn (gen_rtx_SET (target, x));
      return true;

    case E_V4SFmode:
    case E_V4SImode:
      /* Work in a pseudo if TARGET is not a pseudo register, since the
	 shuffle below reads and writes the same register.  */
      if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
	new_target = gen_reg_rtx (mode);
      else
	new_target = target;
      /* Broadcast VAR and merge with zero, keeping only element 0.  */
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_DUPLICATE (mode, var);
      x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
      emit_insn (gen_rtx_SET (new_target, x));
      if (one_var != 0)
	{
	  /* We need to shuffle the value to the correct position, so
	     create a new pseudo to store the intermediate result.  */

	  /* With SSE2, we can use the integer shuffle insns.  */
	  if (mode != V4SFmode && TARGET_SSE2)
	    {
	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
					    const1_rtx,
					    GEN_INT (one_var == 1 ? 0 : 1),
					    GEN_INT (one_var == 2 ? 0 : 1),
					    GEN_INT (one_var == 3 ? 0 : 1)));
	      if (target != new_target)
		emit_move_insn (target, new_target);
	      return true;
	    }

	  /* Otherwise convert the intermediate result to V4SFmode and
	     use the SSE1 shuffle instructions.  */
	  if (mode != V4SFmode)
	    {
	      tmp = gen_reg_rtx (V4SFmode);
	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
	    }
	  else
	    tmp = new_target;

	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
					  const1_rtx,
					  GEN_INT (one_var == 1 ? 0 : 1),
					  GEN_INT (one_var == 2 ? 0+4 : 1+4),
					  GEN_INT (one_var == 3 ? 0+4 : 1+4)));

	  if (mode != V4SFmode)
	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
	  else if (tmp != target)
	    emit_move_insn (target, tmp);
	}
      else if (target != new_target)
	emit_move_insn (target, new_target);
      return true;

    case E_V8HImode:
    case E_V16QImode:
      vsimode = V4SImode;
      goto widen;
    case E_V4HImode:
    case E_V8QImode:
      if (!mmx_ok)
	return false;
      vsimode = V2SImode;
      goto widen;
    widen:
      /* Widening only works for element 0.  */
      if (one_var != 0)
	return false;

      /* Zero extend the variable element to SImode and recurse.  */
      var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);

      x = gen_reg_rtx (vsimode);
      if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
						var, one_var))
	gcc_unreachable ();

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }
}
17971 :
/* A subroutine of ix86_expand_vector_init.  Store into TARGET a vector
   consisting of the values in VALS.  It is known that all elements
   except ONE_VAR are constants.  Return true if successful.

   Strategy: materialize VALS with the variable slot zeroed as a
   CONST_VECTOR, then insert the variable element with
   ix86_expand_vector_set.  QImode vectors without SSE4.1 are promoted
   to an HImode-element set by pairing the variable byte with its
   neighboring constant byte.  */

static bool
ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals, int one_var)
{
  rtx var = XVECEXP (vals, 0, one_var);
  machine_mode wmode;
  rtx const_vec, x;

  /* Copy VALS with the variable element replaced by zero, giving the
     constant part of the vector.  */
  const_vec = copy_rtx (vals);
  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));

  switch (mode)
    {
    case E_V2DFmode:
    case E_V2DImode:
    case E_V2SFmode:
    case E_V2SImode:
      /* For the two element vectors, it's just as easy to use
	 the general case.  */
      return false;

    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      if (!TARGET_64BIT)
	return false;
      /* FALLTHRU */
    case E_V8HFmode:
    case E_V16HFmode:
    case E_V8BFmode:
    case E_V16BFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V16HImode:
    case E_V32QImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V8HImode:
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
      break;

    case E_V16QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V8HImode;
      goto widen;
    case E_V8QImode:
      if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
	break;
      wmode = V4HImode;
      goto widen;
    case E_V4QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V2HImode;
      /* Falls directly into the widen label below.  */
    widen:
      /* There's no way to set one QImode entry easily.  Combine
	 the variable value with its adjacent constant value, and
	 promote to an HImode set.  */
      x = XVECEXP (vals, 0, one_var ^ 1);
      if (one_var & 1)
	{
	  /* Variable byte is the high half: shift it up and mask the
	     constant neighbor into the low byte.  */
	  var = convert_modes (HImode, QImode, var, true);
	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	  x = GEN_INT (INTVAL (x) & 0xff);
	}
      else
	{
	  /* Variable byte is the low half: put the constant neighbor
	     into the high byte.  */
	  var = convert_modes (HImode, QImode, var, true);
	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
	}
      if (x != const0_rtx)
	var = expand_simple_binop (HImode, IOR, var, x, var,
				   1, OPTAB_LIB_WIDEN);

      /* Set the combined HImode element in the reinterpreted vector,
	 then copy the result back in the original mode.  */
      x = gen_reg_rtx (wmode);
      emit_move_insn (x, gen_lowpart (wmode, const_vec));
      ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }

  /* Common tail: load the constant vector, then insert the variable
     element in place.  */
  emit_move_insn (target, const_vec);
  ix86_expand_vector_set (mmx_ok, target, var, one_var);
  return true;
}
18070 :
/* A subroutine of ix86_expand_vector_init_general.  Use vector
   concatenate to handle the most general case: all values variable,
   and none identical.

   OPS holds the N element values.  For N == 2 the two halves are
   concatenated directly; for N == 4/8/16 each half of OPS is first
   built recursively via ix86_expand_vector_init into a half-width
   vector, then the two halves are concatenated (again via the N == 2
   path).  */

static void
ix86_expand_vector_init_concat (machine_mode mode,
				rtx target, rtx *ops, int n)
{
  machine_mode half_mode = VOIDmode;
  rtx half[2];
  rtvec v;
  int i, j;

  switch (n)
    {
    case 2:
      /* Map MODE to the mode of one of its two halves.  */
      switch (mode)
	{
	case E_V32HFmode:
	  half_mode = V16HFmode;
	  break;
	case E_V32BFmode:
	  half_mode = V16BFmode;
	  break;
	case E_V16SImode:
	  half_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  half_mode = V8SFmode;
	  break;
	case E_V8DImode:
	  half_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  half_mode = V4DFmode;
	  break;
	case E_V16HFmode:
	  half_mode = V8HFmode;
	  break;
	case E_V16BFmode:
	  half_mode = V8BFmode;
	  break;
	case E_V8SImode:
	  half_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  half_mode = V4SFmode;
	  break;
	case E_V4DImode:
	  half_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  half_mode = V2DFmode;
	  break;
	case E_V4SImode:
	  half_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  half_mode = V2SFmode;
	  break;
	case E_V2DImode:
	  half_mode = DImode;
	  break;
	case E_V2SImode:
	  half_mode = SImode;
	  break;
	case E_V2DFmode:
	  half_mode = DFmode;
	  break;
	case E_V2SFmode:
	  half_mode = SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}

      /* VEC_CONCAT operands must be registers.  */
      if (!register_operand (ops[1], half_mode))
	ops[1] = force_reg (half_mode, ops[1]);
      if (!register_operand (ops[0], half_mode))
	ops[0] = force_reg (half_mode, ops[0]);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
							  ops[1])));
      break;

    case 4:
      switch (mode)
	{
	case E_V4DImode:
	  half_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  half_mode = V2DFmode;
	  break;
	case E_V4SImode:
	  half_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  half_mode = V2SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 8:
      switch (mode)
	{
	case E_V8DImode:
	  half_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  half_mode = V4DFmode;
	  break;
	case E_V8SImode:
	  half_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  half_mode = V4SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 16:
      switch (mode)
	{
	case E_V16SImode:
	  half_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  half_mode = V8SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    half:
      /* FIXME: We process inputs backward to help RA.  PR 36222.  */
      i = n - 1;
      for (j = 1; j != -1; j--)
	{
	  /* Build half J (upper half first) from the next N/2 ops.  */
	  half[j] = gen_reg_rtx (half_mode);
	  switch (n >> 1)
	    {
	    case 2:
	      v = gen_rtvec (2, ops[i-1], ops[i]);
	      i -= 2;
	      break;
	    case 4:
	      v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
	      i -= 4;
	      break;
	    case 8:
	      v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
			     ops[i-3], ops[i-2], ops[i-1], ops[i]);
	      i -= 8;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  ix86_expand_vector_init (false, half[j],
				   gen_rtx_PARALLEL (half_mode, v));
	}

      /* Concatenate the two halves (recursing into the N == 2 case).  */
      ix86_expand_vector_init_concat (mode, target, half, 2);
      break;

    default:
      gcc_unreachable ();
    }
}
18244 :
18245 : /* A subroutine of ix86_expand_vector_init_general. Use vector
18246 : interleave to handle the most general case: all values variable,
18247 : and none identical. */
18248 :
18249 : static void
18250 3881 : ix86_expand_vector_init_interleave (machine_mode mode,
18251 : rtx target, rtx *ops, int n)
18252 : {
18253 3881 : machine_mode first_imode, second_imode, third_imode, inner_mode;
18254 3881 : int i, j;
18255 3881 : rtx op, op0, op1;
18256 3881 : rtx (*gen_load_even) (rtx, rtx, rtx);
18257 3881 : rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
18258 3881 : rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
18259 :
18260 3881 : switch (mode)
18261 : {
18262 : case E_V8HFmode:
18263 : gen_load_even = gen_vec_interleave_lowv8hf;
18264 : gen_interleave_first_low = gen_vec_interleave_lowv4si;
18265 : gen_interleave_second_low = gen_vec_interleave_lowv2di;
18266 : inner_mode = HFmode;
18267 : first_imode = V4SImode;
18268 : second_imode = V2DImode;
18269 : third_imode = VOIDmode;
18270 : break;
18271 487 : case E_V8BFmode:
18272 487 : gen_load_even = gen_vec_interleave_lowv8bf;
18273 487 : gen_interleave_first_low = gen_vec_interleave_lowv4si;
18274 487 : gen_interleave_second_low = gen_vec_interleave_lowv2di;
18275 487 : inner_mode = BFmode;
18276 487 : first_imode = V4SImode;
18277 487 : second_imode = V2DImode;
18278 487 : third_imode = VOIDmode;
18279 487 : break;
18280 793 : case E_V8HImode:
18281 793 : gen_load_even = gen_vec_setv8hi;
18282 793 : gen_interleave_first_low = gen_vec_interleave_lowv4si;
18283 793 : gen_interleave_second_low = gen_vec_interleave_lowv2di;
18284 793 : inner_mode = HImode;
18285 793 : first_imode = V4SImode;
18286 793 : second_imode = V2DImode;
18287 793 : third_imode = VOIDmode;
18288 793 : break;
18289 374 : case E_V16QImode:
18290 374 : gen_load_even = gen_vec_setv16qi;
18291 374 : gen_interleave_first_low = gen_vec_interleave_lowv8hi;
18292 374 : gen_interleave_second_low = gen_vec_interleave_lowv4si;
18293 374 : inner_mode = QImode;
18294 374 : first_imode = V8HImode;
18295 374 : second_imode = V4SImode;
18296 374 : third_imode = V2DImode;
18297 374 : break;
18298 0 : default:
18299 0 : gcc_unreachable ();
18300 : }
18301 :
18302 20901 : for (i = 0; i < n; i++)
18303 : {
18304 17020 : op = ops [i + i];
18305 17020 : if (inner_mode == HFmode || inner_mode == BFmode)
18306 : {
18307 10856 : rtx even, odd;
18308 : /* Use vpuncklwd to pack 2 HFmode or BFmode. */
18309 1948 : machine_mode vec_mode =
18310 10856 : (inner_mode == HFmode) ? V8HFmode : V8BFmode;
18311 10856 : op0 = gen_reg_rtx (vec_mode);
18312 10856 : even = lowpart_subreg (vec_mode,
18313 : force_reg (inner_mode, op), inner_mode);
18314 10856 : odd = lowpart_subreg (vec_mode,
18315 10856 : force_reg (inner_mode, ops[i + i + 1]),
18316 : inner_mode);
18317 10856 : emit_insn (gen_load_even (op0, even, odd));
18318 : }
18319 : else
18320 : {
18321 : /* Extend the odd elment to SImode using a paradoxical SUBREG. */
18322 6164 : op0 = gen_reg_rtx (SImode);
18323 6164 : emit_move_insn (op0, gen_lowpart (SImode, op));
18324 :
18325 : /* Insert the SImode value as low element of V4SImode vector. */
18326 6164 : op1 = gen_reg_rtx (V4SImode);
18327 6164 : op0 = gen_rtx_VEC_MERGE (V4SImode,
18328 : gen_rtx_VEC_DUPLICATE (V4SImode,
18329 : op0),
18330 : CONST0_RTX (V4SImode),
18331 : const1_rtx);
18332 6164 : emit_insn (gen_rtx_SET (op1, op0));
18333 :
18334 : /* Cast the V4SImode vector back to a vector in orignal mode. */
18335 6164 : op0 = gen_reg_rtx (mode);
18336 6164 : emit_move_insn (op0, gen_lowpart (mode, op1));
18337 :
18338 : /* Load even elements into the second position. */
18339 6164 : emit_insn (gen_load_even (op0,
18340 : force_reg (inner_mode,
18341 6164 : ops[i + i + 1]),
18342 : const1_rtx));
18343 : }
18344 :
18345 : /* Cast vector to FIRST_IMODE vector. */
18346 17020 : ops[i] = gen_reg_rtx (first_imode);
18347 17020 : emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
18348 : }
18349 :
18350 : /* Interleave low FIRST_IMODE vectors. */
18351 12391 : for (i = j = 0; i < n; i += 2, j++)
18352 : {
18353 8510 : op0 = gen_reg_rtx (first_imode);
18354 8510 : emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
18355 :
18356 : /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
18357 8510 : ops[j] = gen_reg_rtx (second_imode);
18358 8510 : emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
18359 : }
18360 :
18361 : /* Interleave low SECOND_IMODE vectors. */
18362 3881 : switch (second_imode)
18363 : {
18364 : case E_V4SImode:
18365 1122 : for (i = j = 0; i < n / 2; i += 2, j++)
18366 : {
18367 748 : op0 = gen_reg_rtx (second_imode);
18368 748 : emit_insn (gen_interleave_second_low (op0, ops[i],
18369 748 : ops[i + 1]));
18370 :
18371 : /* Cast the SECOND_IMODE vector to the THIRD_IMODE
18372 : vector. */
18373 748 : ops[j] = gen_reg_rtx (third_imode);
18374 748 : emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
18375 : }
18376 : second_imode = V2DImode;
18377 : gen_interleave_second_low = gen_vec_interleave_lowv2di;
18378 : /* FALLTHRU */
18379 :
18380 3881 : case E_V2DImode:
18381 3881 : op0 = gen_reg_rtx (second_imode);
18382 3881 : emit_insn (gen_interleave_second_low (op0, ops[0],
18383 : ops[1]));
18384 :
18385 : /* Cast the SECOND_IMODE vector back to a vector on original
18386 : mode. */
18387 3881 : emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
18388 3881 : break;
18389 :
18390 : default:
18391 : gcc_unreachable ();
18392 : }
18393 3881 : }
18394 :
18395 : /* A subroutine of ix86_expand_vector_init. Handle the most general case:
18396 : all values variable, and none identical. */
18397 :
static void
ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals)
{
  rtx ops[64], op0, op1, op2, op3, op4, op5;
  machine_mode half_mode = VOIDmode;
  machine_mode quarter_mode = VOIDmode;
  machine_mode int_inner_mode = VOIDmode;
  int n, i;

  /* Strategy, visible in the cases below: wide element modes are built by
     recursive concatenation (ix86_expand_vector_init_concat); 256/512-bit
     narrow-element modes are built by interleaving their halves/quarters;
     the remaining small modes break out of the switch and are assembled
     element-by-element into scalar words at the bottom.  */
  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok && !TARGET_SSE)
	break;
      /* FALLTHRU */

    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DFmode:
    case E_V8DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V4DFmode:
    case E_V4DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V2DFmode:
    case E_V2DImode:
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_concat (mode, target, ops, n);
      return;

    case E_V2TImode:
      /* Build as V4DI from the TImode halves viewed as V2DI pairs.  */
      for (i = 0; i < 2; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      op0 = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V4TImode:
      /* Concatenate pairwise into two V4DI, then those into V8DI.  */
      for (i = 0; i < 4; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      ops[4] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
      ops[5] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
      op0 = gen_reg_rtx (V8DImode);
      ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V32QImode:
      half_mode = V16QImode;
      goto half;

    case E_V16HImode:
      half_mode = V8HImode;
      goto half;

    case E_V16HFmode:
      half_mode = V8HFmode;
      goto half;

    case E_V16BFmode:
      half_mode = V8BFmode;
      goto half;

    half:
      /* Build each 128-bit half by interleaving, then concatenate.  */
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (half_mode);
      op1 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (half_mode, op0, ops,
					  n >> 2);
      ix86_expand_vector_init_interleave (half_mode, op1,
					  &ops [n >> 1], n >> 2);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
      return;

    case E_V64QImode:
      quarter_mode = V16QImode;
      half_mode = V32QImode;
      goto quarter;

    case E_V32HImode:
      quarter_mode = V8HImode;
      half_mode = V16HImode;
      goto quarter;

    case E_V32HFmode:
      quarter_mode = V8HFmode;
      half_mode = V16HFmode;
      goto quarter;

    case E_V32BFmode:
      quarter_mode = V8BFmode;
      half_mode = V16BFmode;
      goto quarter;

    quarter:
      /* Build four 128-bit quarters by interleaving, concatenate them
	 pairwise into halves, then concatenate the halves.  */
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (quarter_mode);
      op1 = gen_reg_rtx (quarter_mode);
      op2 = gen_reg_rtx (quarter_mode);
      op3 = gen_reg_rtx (quarter_mode);
      op4 = gen_reg_rtx (half_mode);
      op5 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
					  n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op1,
					  &ops [n >> 2], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op2,
					  &ops [n >> 1], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op3,
					  &ops [(n >> 1) | (n >> 2)], n >> 3);
      emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
      emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
      return;

    case E_V16QImode:
      if (!TARGET_SSE4_1)
	break;
      /* FALLTHRU */

    case E_V8HImode:
      if (!TARGET_SSE2)
	break;

      /* Don't use ix86_expand_vector_init_interleave if we can't
	 move from GPR to SSE register directly.  */
      if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
	break;
      /* FALLTHRU */

    case E_V8HFmode:
    case E_V8BFmode:

      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
      return;

    case E_V4HFmode:
    case E_V4BFmode:
    case E_V2HFmode:
    case E_V2BFmode:
      /* Small FP-element modes: route each element through an integer
	 inner mode (HImode) in the word-packing loop below.  */
      int_inner_mode = HImode;
      break;

    case E_V4HImode:
    case E_V8QImode:

    case E_V2HImode:
    case E_V4QImode:
      break;

    default:
      gcc_unreachable ();
    }

  /* Fallback: pack N_ELT_PER_WORD elements into each scalar word with
     shift/IOR, then recursively assemble the words into the vector.  */
  {
    int i, j, n_elts, n_words, n_elt_per_word;
    machine_mode tmp_mode, inner_mode;
    rtx words[4], shift;

    tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;

    inner_mode = GET_MODE_INNER (mode);
    n_elts = GET_MODE_NUNITS (mode);
    n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
    n_elt_per_word = n_elts / n_words;
    shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));

    for (i = 0; i < n_words; ++i)
      {
	rtx word = NULL_RTX;

	/* Elements are folded in high-to-low order, so each shift/IOR
	   leaves earlier elements in the higher bits of WORD.  */
	for (j = 0; j < n_elt_per_word; ++j)
	  {
	    rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
	    if (int_inner_mode != E_VOIDmode)
	      {
		gcc_assert (TARGET_SSE2 && int_inner_mode == HImode);
		rtx tmp = gen_reg_rtx (int_inner_mode);
		/* Bitcast the FP element to HImode before widening.  */
		elt = lowpart_subreg (int_inner_mode,
				      force_reg (inner_mode, elt),
				      inner_mode);
		emit_move_insn (tmp, elt);
		elt = tmp;
	      }
	    elt = convert_modes (tmp_mode, inner_mode, elt, true);

	    if (j == 0)
	      word = elt;
	    else
	      {
		word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
					    NULL_RTX, 1, OPTAB_LIB_WIDEN);
		word = expand_simple_binop (tmp_mode, IOR, word, elt,
					    NULL_RTX, 1, OPTAB_LIB_WIDEN);
	      }
	  }

	words[i] = word;
      }

    if (n_words == 1)
      emit_move_insn (target, gen_lowpart (mode, words[0]));
    else if (n_words == 2)
      {
	gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
	machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
	rtx tmp = gen_reg_rtx (concat_mode);
	vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
	/* Recurse to build a two-word vector from the packed words.  */
	ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
	emit_move_insn (target, gen_lowpart (mode, tmp));
      }
    else if (n_words == 4)
      {
	rtx tmp = gen_reg_rtx (V4SImode);
	gcc_assert (tmp_mode == SImode);
	vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
	ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
	emit_move_insn (target, gen_lowpart (mode, tmp));
      }
    else
      gcc_unreachable ();
  }
}
18637 :
18638 : /* Initialize vector TARGET via VALS. Suppress the use of MMX
18639 : instructions unless MMX_OK is true. */
18640 :
void
ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  int n_elts = GET_MODE_NUNITS (mode);
  int n_var = 0, one_var = -1;
  bool all_same = true, all_const_zero = true;
  int i;
  rtx x;

  /* Handle first initialization from vector elts.  */
  if (n_elts != XVECLEN (vals, 0))
    {
      /* VALS holds two half-width vectors rather than N_ELTS scalars;
	 concatenate them (possibly after punning narrow-element halves
	 to an integer vector mode first).  */
      rtx subtarget = target;
      x = XVECEXP (vals, 0, 0);
      gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
      if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
	{
	  rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
	  if (inner_mode == QImode
	      || inner_mode == HImode
	      || inner_mode == TImode
	      || inner_mode == HFmode
	      || inner_mode == BFmode)
	    {
	      /* Re-express the concat in SImode (DImode for TImode
		 elements) vector modes and fix up with a lowpart move.  */
	      unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
	      scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
	      n_bits /= GET_MODE_SIZE (elt_mode);
	      mode = mode_for_vector (elt_mode, n_bits).require ();
	      inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
	      ops[0] = gen_lowpart (inner_mode, ops[0]);
	      ops[1] = gen_lowpart (inner_mode, ops[1]);
	      subtarget = gen_reg_rtx (mode);
	    }
	  ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
	  if (subtarget != target)
	    emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
	  return;
	}
      gcc_unreachable ();
    }

  /* Scan the elements: count non-constant ones (remembering the index of
     the last), and track whether all elements are identical and whether
     all are constant zero.  */
  for (i = 0; i < n_elts; ++i)
    {
      x = XVECEXP (vals, 0, i);
      if (!(CONST_SCALAR_INT_P (x)
	    || CONST_DOUBLE_P (x)
	    || CONST_FIXED_P (x)))
	n_var++, one_var = i;
      else if (x != CONST0_RTX (inner_mode))
	all_const_zero = false;
      if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
	all_same = false;
    }

  /* Handle the zero vector as special case.  */
  if (n_var == 0 && all_const_zero)
    {
      emit_move_insn (target, CONST0_RTX (mode));
      return;
    }

  /* If all values are identical, broadcast the value.  */
  if (all_same
      && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
					    XVECEXP (vals, 0, 0)))
    return;

  /* Constants are best loaded from the constant pool.  */
  if (n_var == 0)
    {
      emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
      return;
    }

  /* Values where only one field is non-constant are best loaded from
     the pool and overwritten via move later.  */
  if (n_var == 1)
    {
      if (all_const_zero
	  && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
						  XVECEXP (vals, 0, one_var),
						  one_var))
	return;

      if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
	return;
    }

  /* No special case applied; fall back to the general expander.  */
  ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
}
18733 :
18734 : /* Implemented as
18735 : V setg (V v, int idx, T val)
18736 : {
18737 : V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
18738 : V valv = (V){val, val, val, val, val, val, val, val};
18739 : V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
18740 : v = (v & ~mask) | (valv & mask);
18741 : return v;
18742 : }. */
void
ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
{
  rtx vec[64];
  machine_mode mode = GET_MODE (target);
  machine_mode cmp_mode = mode;
  int n_elts = GET_MODE_NUNITS (mode);
  rtx valv,idxv,constv,idx_tmp;
  bool ok = false;

  /* 512-bits vector byte/word broadcast and comparison only available
     under TARGET_AVX512BW, break 512-bits vector into two 256-bits vector
     when without TARGET_AVX512BW.  */
  if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
       || mode == V64QImode)
      && !TARGET_AVX512BW)
    {
      gcc_assert (TARGET_AVX512F);
      rtx vhi, vlo, idx_hi;
      machine_mode half_mode;
      rtx (*extract_hi)(rtx, rtx);
      rtx (*extract_lo)(rtx, rtx);

      if (mode == V32HImode)
	{
	  half_mode = V16HImode;
	  extract_hi = gen_vec_extract_hi_v32hi;
	  extract_lo = gen_vec_extract_lo_v32hi;
	}
      else if (mode == V32HFmode)
	{
	  half_mode = V16HFmode;
	  extract_hi = gen_vec_extract_hi_v32hf;
	  extract_lo = gen_vec_extract_lo_v32hf;
	}
      else if (mode == V32BFmode)
	{
	  half_mode = V16BFmode;
	  extract_hi = gen_vec_extract_hi_v32bf;
	  extract_lo = gen_vec_extract_lo_v32bf;
	}
      else
	{
	  half_mode = V32QImode;
	  extract_hi = gen_vec_extract_hi_v64qi;
	  extract_lo = gen_vec_extract_lo_v64qi;
	}

      /* Recurse on each 256-bit half.  The high half gets a rebased
	 index (idx - n_elts/2); only the half whose rebased index is in
	 range is actually modified by the masked store below.  */
      vhi = gen_reg_rtx (half_mode);
      vlo = gen_reg_rtx (half_mode);
      idx_hi = gen_reg_rtx (GET_MODE (idx));
      emit_insn (extract_hi (vhi, target));
      emit_insn (extract_lo (vlo, target));
      vec[0] = idx_hi;
      vec[1] = idx;
      vec[2] = GEN_INT (n_elts/2);
      ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
      ix86_expand_vector_set_var (vhi, val, idx_hi);
      ix86_expand_vector_set_var (vlo, val, idx);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
      return;
    }

  /* Vector integer compares need an integer element mode; map FP vector
     modes to the same-sized integer vector mode for the comparison.  */
  if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
    {
      switch (mode)
	{
	case E_V2DFmode:
	  cmp_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  cmp_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  cmp_mode = V8DImode;
	  break;
	case E_V2SFmode:
	  cmp_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  cmp_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  cmp_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  cmp_mode = V16SImode;
	  break;
	case E_V2HFmode:
	case E_V2BFmode:
	  cmp_mode = V2HImode;
	  break;
	case E_V4HFmode:
	case E_V4BFmode:
	  cmp_mode = V4HImode;
	  break;
	case E_V8HFmode:
	  cmp_mode = V8HImode;
	  break;
	case E_V16HFmode:
	  cmp_mode = V16HImode;
	  break;
	case E_V32HFmode:
	  cmp_mode = V32HImode;
	  break;
	case E_V8BFmode:
	  cmp_mode = V8HImode;
	  break;
	case E_V16BFmode:
	  cmp_mode = V16HImode;
	  break;
	case E_V32BFmode:
	  cmp_mode = V32HImode;
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  /* Broadcast VAL and the (converted) index, then compare the index
     vector against the constant {0, 1, ..., n_elts-1} to build a mask.  */
  for (int i = 0; i != n_elts; i++)
    vec[i] = GEN_INT (i);
  constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
  valv = gen_reg_rtx (mode);
  idxv = gen_reg_rtx (cmp_mode);
  idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);

  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
					  mode, valv, val);
  gcc_assert (ok);
  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
					  cmp_mode, idxv, idx_tmp);
  gcc_assert (ok);
  /* Operands for ix86_expand_int_vcond:
     target = (idxv == constv) ? valv : target.  */
  vec[0] = target;
  vec[1] = valv;
  vec[2] = target;
  vec[3] = gen_rtx_EQ (mode, idxv, constv);
  vec[4] = idxv;
  vec[5] = constv;
  ok = ix86_expand_int_vcond (vec);
  gcc_assert (ok);
}
18884 :
/* Store VAL into element ELT of vector TARGET.  MMX_OK indicates whether
   plain-MMX code paths may be used for the 64-bit vector modes.  Depending
   on mode and ISA level this emits a vec_merge insert, a blendm-based
   insert (AVX-512), a half/quarter extract-modify-insert, or as a last
   resort a spill to the stack.  */
void
ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  machine_mode half_mode;
  bool use_vec_merge = false;
  bool blendm_const = false;
  rtx tmp;
  /* Extract/insert generators for the 256-bit modes, indexed by [j][i]
     where J is set per-mode below and I selects the low/high half.  */
  static rtx (*gen_extract[8][2]) (rtx, rtx)
    = {
	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
	{ gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
	{ gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
      };
  static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
    = {
	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
	{ gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
	{ gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
      };
  int i, j, n;
  machine_mode mmode = VOIDmode;
  rtx (*gen_blendm) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case E_V2SImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_merge)
	break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (mmx_ok)
	{
	  /* Extract the other element and rebuild via VEC_CONCAT.  */
	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
	  if (elt == 0)
	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
	  else
	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
	  emit_insn (gen_rtx_SET (target, tmp));
	  return;
	}
      break;

    case E_V2DImode:
      use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
      if (use_vec_merge)
	break;

      tmp = gen_reg_rtx (GET_MODE_INNER (mode));
      ix86_expand_vector_extract (false, tmp, target, 1 - elt);
      if (elt == 0)
	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
      else
	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
      emit_insn (gen_rtx_SET (target, tmp));
      return;

    case E_V2DFmode:
      /* NB: For ELT == 0, use standard scalar operation patterns which
	 preserve the rest of the vector for combiner:

	 (vec_merge:V2DF
	   (vec_duplicate:V2DF (reg:DF))
	   (reg:V2DF)
	   (const_int 1))
       */
      if (elt == 0)
	goto do_vec_merge;

      {
	rtx op0, op1;

	/* For the two element vectors, we implement a VEC_CONCAT with
	   the extraction of the other element.  */

	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);

	if (elt == 0)
	  op0 = val, op1 = tmp;
	else
	  op0 = tmp, op1 = val;

	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
	emit_insn (gen_rtx_SET (target, tmp));
      }
      return;

    case E_V4SFmode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      /* Pre-SSE4.1: shuffle the target element into position 0, insert
	 there, and shuffle back with shufps.  */
      switch (elt)
	{
	case 0:
	  use_vec_merge = true;
	  break;

	case 1:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* target = A A B B */
	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
	  /* target = X A B B */
	  ix86_expand_vector_set (false, target, val, 0);
	  /* target = A X C D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const1_rtx, const0_rtx,
					  GEN_INT (2+4), GEN_INT (3+4)));
	  return;

	case 2:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B X D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (0+4), GEN_INT (3+4)));
	  return;

	case 3:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B X D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (2+4), GEN_INT (0+4)));
	  return;

	default:
	  gcc_unreachable ();
	}
      break;

    case E_V4SImode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      /* Element 0 handled by vec_merge below.  */
      if (elt == 0)
	{
	  use_vec_merge = true;
	  break;
	}

      if (TARGET_SSE2)
	{
	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
	     store into element 0, then shuffle them back.  */

	  rtx order[4];

	  order[0] = GEN_INT (elt);
	  order[1] = const1_rtx;
	  order[2] = const2_rtx;
	  order[3] = GEN_INT (3);
	  order[elt] = const0_rtx;

	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));

	  ix86_expand_vector_set (false, target, val, 0);

	  /* The same permutation is its own inverse here, since it only
	     swaps element 0 and ELT.  */
	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  rtx t = gen_reg_rtx (V4SFmode);
	  emit_move_insn (t, gen_lowpart (V4SFmode, target));
	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
	  emit_move_insn (target, gen_lowpart (mode, t));
	}
      return;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V2HImode:
    case E_V2HFmode:
    case E_V2BFmode:
      use_vec_merge = TARGET_SSE2;
      break;
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
      use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
    case E_V4QImode:
      use_vec_merge = TARGET_SSE4_1;
      break;

    case E_V8QImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;

    case E_V32QImode:
      half_mode = V16QImode;
      j = 0;
      n = 16;
      goto half;

    case E_V16HFmode:
    case E_V16BFmode:
      /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw.  */
      if (TARGET_AVX2 && elt != 0)
	{
	  mmode = SImode;
	  gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
					      : gen_avx2_pblendbf_1);
	  blendm_const = true;
	  break;
	}
      else
	{
	  half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
	  j = ((mode == E_V16HFmode) ? 6 : 7);
	  n = 8;
	  goto half;
	}

    case E_V16HImode:
      half_mode = V8HImode;
      j = 1;
      n = 8;
      goto half;

    case E_V8SImode:
      half_mode = V4SImode;
      j = 2;
      n = 4;
      goto half;

    case E_V4DImode:
      half_mode = V2DImode;
      j = 3;
      n = 2;
      goto half;

    case E_V8SFmode:
      half_mode = V4SFmode;
      j = 4;
      n = 4;
      goto half;

    case E_V4DFmode:
      half_mode = V2DFmode;
      j = 5;
      n = 2;
      goto half;

    half:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 1);

      /* Extract the half.  */
      tmp = gen_reg_rtx (half_mode);
      emit_insn (gen_extract[j][i] (tmp, target));

      /* Put val in tmp at elt.  */
      ix86_expand_vector_set (false, tmp, val, elt);

      /* Put it back.  */
      emit_insn (gen_insert[j][i] (target, target, tmp));
      return;

    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8df;
	}
      break;

    case E_V8DImode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8di;
	}
      break;

    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16sf;
	}
      break;

    case E_V16SImode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16si;
	}
      break;

    case E_V32HFmode:
      if (TARGET_AVX512BW)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx512bw_blendmv32hf;
	}
      break;
    case E_V32BFmode:
      if (TARGET_AVX512BW)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx512bw_blendmv32bf;
	}
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx512bw_blendmv32hi;
	}
      else if (TARGET_AVX512F)
	{
	  /* No 512-bit word blendm without AVX512BW; work on a 128-bit
	     quarter instead.  */
	  half_mode = E_V8HImode;
	  n = 8;
	  goto quarter;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  mmode = DImode;
	  gen_blendm = gen_avx512bw_blendmv64qi;
	}
      else if (TARGET_AVX512F)
	{
	  half_mode = E_V16QImode;
	  n = 16;
	  goto quarter;
	}
      break;

    quarter:
      /* Compute offset.  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 3);

      {
	/* Extract the quarter.  */
	tmp = gen_reg_rtx (V4SImode);
	rtx tmp2 = gen_lowpart (V16SImode, target);
	rtx mask = gen_reg_rtx (QImode);

	emit_move_insn (mask, constm1_rtx);
	emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
						   tmp, mask));

	tmp2 = gen_reg_rtx (half_mode);
	emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
	tmp = tmp2;

	/* Put val in tmp at elt.  */
	ix86_expand_vector_set (false, tmp, val, elt);

	/* Put it back.  */
	tmp2 = gen_reg_rtx (V16SImode);
	rtx tmp3 = gen_lowpart (V16SImode, target);
	mask = gen_reg_rtx (HImode);
	emit_move_insn (mask, constm1_rtx);
	tmp = gen_lowpart (V4SImode, tmp);
	emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
						  tmp3, mask));
	emit_move_insn (target, gen_lowpart (mode, tmp2));
      }
      return;

    default:
      break;
    }

  if (mmode != VOIDmode)
    {
      /* AVX-512 style: broadcast VAL and blend it in under a one-bit
	 element mask.  */
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
      rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
      /* The avx512*_blendm<mode> expanders have different operand order
	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
	 elements where the mask is set and second input operand otherwise,
	 in {sse,avx}*_*blend* the first input operand is used for elements
	 where the mask is clear and second input operand otherwise.  */
      if (!blendm_const)
	merge_mask = force_reg (mmode, merge_mask);
      emit_insn (gen_blendm (target, target, tmp, merge_mask));
    }
  else if (use_vec_merge)
    {
do_vec_merge:
      if (!nonimmediate_operand (val, inner_mode))
	val = force_reg (inner_mode, val);
      tmp = gen_rtx_VEC_DUPLICATE (mode, val);
      tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
			       GEN_INT (HOST_WIDE_INT_1U << elt));
      emit_insn (gen_rtx_SET (target, tmp));
    }
  else
    {
      /* Last resort: spill the vector to the stack, store the scalar
	 element in place, and reload the whole vector.  */
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, target);

      tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
      emit_move_insn (tmp, val);

      emit_move_insn (target, mem);
    }
}
19327 :
/* Extract element ELT of vector VEC into scalar register TARGET.
   MMX_OK is true if MMX registers may be used (without SSE); when a
   required ISA feature is missing we fall back to extracting through
   a stack temporary at the end of the function.  */

void
ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
{
  machine_mode mode = GET_MODE (vec);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  bool use_vec_extr = false;
  rtx tmp;

  switch (mode)
    {
    case E_V2SImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_extr)
	break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (!mmx_ok)
	break;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
    case E_V2TImode:
    case E_V4TImode:
      use_vec_extr = true;
      break;

    case E_V4SFmode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      /* Without SSE4.1, shuffle the wanted element into lane 0 and
	 then extract lane 0 via the common VEC_SELECT path below.  */
      switch (elt)
	{
	case 0:
	  tmp = vec;
	  break;

	case 1:
	case 3:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
					  GEN_INT (elt), GEN_INT (elt),
					  GEN_INT (elt+4), GEN_INT (elt+4)));
	  break;

	case 2:
	  /* unpckhps moves element 2 into lane 0.  */
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
	  break;

	default:
	  gcc_unreachable ();
	}
      vec = tmp;
      use_vec_extr = true;
      elt = 0;
      break;

    case E_V4SImode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      if (TARGET_SSE2)
	{
	  /* Same trick as for V4SF: rotate the wanted element into
	     lane 0 first.  */
	  switch (elt)
	    {
	    case 0:
	      tmp = vec;
	      break;

	    case 1:
	    case 3:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
					    GEN_INT (elt), GEN_INT (elt),
					    GEN_INT (elt), GEN_INT (elt)));
	      break;

	    case 2:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
	      break;

	    default:
	      gcc_unreachable ();
	    }
	  vec = tmp;
	  use_vec_extr = true;
	  elt = 0;
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
				      gen_lowpart (V4SFmode, vec), elt);
	  return;
	}
      break;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V2HImode:
    case E_V2HFmode:
    case E_V2BFmode:
      use_vec_extr = TARGET_SSE2;
      break;
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
      use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_extr = TARGET_SSE4_1;
      /* Without SSE4.1, element 0 is just the low byte of a SImode
	 extract, which SSE2 can do.  */
      if (!use_vec_extr
	  && TARGET_SSE2
	  && elt == 0
	  && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
	{
	  tmp = gen_reg_rtx (SImode);
	  ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
				      0);
	  emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
	  return;
	}
      break;
    case E_V4QImode:
      use_vec_extr = TARGET_SSE4_1;
      break;

    /* For the 256-bit modes below, extract the containing 128-bit
       half and recurse on it with the element index masked down.  */
    case E_V8SFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SFmode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DFmode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32QImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V16QImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V8HImode);
	  if (elt < 8)
	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
	  return;
	}
      break;

    case E_V8SImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SImode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DImode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    /* 512-bit modes: extract the containing 256-bit half and recurse.  */
    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V16HImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V32QImode);
	  if (elt < 32)
	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
	  return;
	}
      break;

    case E_V16SFmode:
      tmp = gen_reg_rtx (V8SFmode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DFmode:
      tmp = gen_reg_rtx (V4DFmode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V16SImode:
      tmp = gen_reg_rtx (V8SImode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DImode:
      tmp = gen_reg_rtx (V4DImode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V32HFmode:
    case E_V32BFmode:
      if (TARGET_AVX512BW)
	{
	  tmp = (mode == E_V32HFmode
		 ? gen_reg_rtx (V16HFmode)
		 : gen_reg_rtx (V16BFmode));
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo (mode, tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi (mode, tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HFmode:
    case E_V16BFmode:
      if (TARGET_AVX)
	{
	  tmp = (mode == E_V16HFmode
		 ? gen_reg_rtx (V8HFmode)
		 : gen_reg_rtx (V8BFmode));
	  if (elt < 8)
	    emit_insn (gen_vec_extract_lo (mode, tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi (mode, tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
	  return;
	}
      break;

    case E_V8QImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      /* ??? Could extract the appropriate HImode element and shift.  */
      break;

    default:
      break;
    }

  if (use_vec_extr)
    {
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
      tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);

      /* Let the rtl optimizers know about the zero extension performed.  */
      if (inner_mode == QImode || inner_mode == HImode)
	{
	  rtx reg = gen_reg_rtx (SImode);
	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
	  emit_move_insn (reg, tmp);
	  /* Mark the narrowing subreg as promoted so later passes can
	     elide redundant extensions.  */
	  tmp = gen_lowpart (inner_mode, reg);
	  SUBREG_PROMOTED_VAR_P (tmp) = 1;
	  SUBREG_PROMOTED_SET (tmp, 1);
	}

      emit_move_insn (target, tmp);
    }
  else
    {
      /* Last resort: spill the vector to the stack and load the
	 element back as a scalar.  */
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, vec);

      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
      emit_move_insn (target, tmp);
    }
}
19671 :
/* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
   to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
   The upper bits of DEST are undefined, though they shouldn't cause
   exceptions (some bits from src or all zeros are ok).  */

static void
emit_reduc_half (rtx dest, rtx src, int i)
{
  /* D is the actual destination of the shuffle/shift pattern; when it
     must have a different mode than DEST, a fresh pseudo is used and
     the result is copied into DEST via a lowpart at the end.  */
  rtx tem, d = dest;
  switch (GET_MODE (src))
    {
    case E_V4SFmode:
      if (i == 128)
	tem = gen_sse_movhlps (dest, src, src);
      else
	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
				   GEN_INT (1 + 4), GEN_INT (1 + 4));
      break;
    case E_V2DFmode:
      tem = gen_vec_interleave_highv2df (dest, src, src);
      break;
    /* The MMX-sized modes are handled by shifting the whole vector
       right by half its remaining width, viewed as a single element.  */
    case E_V4QImode:
      d = gen_reg_rtx (V1SImode);
      tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
			       GEN_INT (i / 2));
      break;
    case E_V8QImode:
    case E_V4HImode:
      d = gen_reg_rtx (V1DImode);
      tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
			       GEN_INT (i / 2));
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V4SImode:
    case E_V2DImode:
      /* On some tunings a pshufd/pshuflw is preferable to a full
	 V1TImode shift (see TARGET_SSE_REDUCTION_PREFER_PSHUF).  */
      if (TARGET_SSE_REDUCTION_PREFER_PSHUF)
	{
	  if (i == 128)
	    {
	      d = gen_reg_rtx (V4SImode);
	      tem = gen_sse2_pshufd_1 (
		d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
		GEN_INT (2), GEN_INT (3), GEN_INT (2), GEN_INT (3));
	      break;
	    }
	  else if (i == 64)
	    {
	      d = gen_reg_rtx (V4SImode);
	      tem = gen_sse2_pshufd_1 (
		d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
		GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
	      break;
	    }
	  else if (i == 32)
	    {
	      d = gen_reg_rtx (V8HImode);
	      tem = gen_sse2_pshuflw_1 (
		d, force_reg (V8HImode, gen_lowpart (V8HImode, src)),
		GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
	      break;
	    }
	}
      d = gen_reg_rtx (V1TImode);
      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
				GEN_INT (i / 2));
      break;
    case E_V8SFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufps256 (dest, src, src,
				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
      break;
    case E_V4DFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (i == 256)
	{
	  /* Swap the 128-bit lanes; reuse DEST directly when it already
	     has V4DImode.  */
	  if (GET_MODE (dest) != V4DImode)
	    d = gen_reg_rtx (V4DImode);
	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
				   gen_lowpart (V4DImode, src),
				   const1_rtx);
	}
      else
	{
	  d = gen_reg_rtx (V2TImode);
	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
				    GEN_INT (i / 2));
	}
      break;
    case E_V64QImode:
    case E_V32HImode:
    case E_V32HFmode:
      if (i < 64)
	{
	  d = gen_reg_rtx (V4TImode);
	  tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
					GEN_INT (i / 2));
	  break;
	}
      /* FALLTHRU */
    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DImode:
    case E_V8DFmode:
      if (i > 128)
	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
					gen_lowpart (V16SImode, src),
					gen_lowpart (V16SImode, src),
					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
					GEN_INT (0xC), GEN_INT (0xD),
					GEN_INT (0xE), GEN_INT (0xF),
					GEN_INT (0x10), GEN_INT (0x11),
					GEN_INT (0x12), GEN_INT (0x13),
					GEN_INT (0x14), GEN_INT (0x15),
					GEN_INT (0x16), GEN_INT (0x17));
      else
	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
				    gen_lowpart (V16SImode, src),
				    GEN_INT (i == 128 ? 0x2 : 0x1),
				    GEN_INT (0x3),
				    GEN_INT (0x3),
				    GEN_INT (0x3),
				    GEN_INT (i == 128 ? 0x6 : 0x5),
				    GEN_INT (0x7),
				    GEN_INT (0x7),
				    GEN_INT (0x7),
				    GEN_INT (i == 128 ? 0xA : 0x9),
				    GEN_INT (0xB),
				    GEN_INT (0xB),
				    GEN_INT (0xB),
				    GEN_INT (i == 128 ? 0xE : 0xD),
				    GEN_INT (0xF),
				    GEN_INT (0xF),
				    GEN_INT (0xF));
      break;
    default:
      gcc_unreachable ();
    }
  emit_insn (tem);
  /* Copy back when the pattern wrote to a differently-moded pseudo.  */
  if (d != dest)
    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
}
19829 :
/* Expand a vector reduction.  FN is the binary pattern to reduce;
   DEST is the destination; IN is the input vector.  Repeatedly halves
   the vector with emit_reduc_half and combines the halves with FN
   until a single element remains in lane 0 of DEST.  */

void
ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
{
  rtx half, dst, vec = in;
  machine_mode mode = GET_MODE (in);
  int i;

  /* SSE4 has a special instruction for V8HImode UMIN reduction.  */
  if (TARGET_SSE4_1
      && mode == V8HImode
      && fn == gen_uminv8hi3)
    {
      emit_insn (gen_sse4_1_phminposuw (dest, in));
      return;
    }

  /* I is the width in bits still being reduced; stop once it reaches
     the width of a single element.  */
  for (i = GET_MODE_BITSIZE (mode);
       i > GET_MODE_UNIT_BITSIZE (mode);
       i >>= 1)
    {
      half = gen_reg_rtx (mode);
      emit_reduc_half (half, vec, i);
      /* The final combining step writes straight into DEST.  */
      if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
	dst = dest;
      else
	dst = gen_reg_rtx (mode);
      emit_insn (fn (dst, half, vec));
      vec = dst;
    }
}
19863 :
/* Output code to perform a conditional jump to LABEL, if C2 flag in
   FP status register is set.  The status word is fetched with fnstsw
   and tested either via sahf (when available/profitable) or via a
   test of the C2 bit (0x04) in the high byte.  */

void
ix86_emit_fp_unordered_jump (rtx label)
{
  rtx reg = gen_reg_rtx (HImode);
  rtx_insn *insn;
  rtx temp;

  /* reg = FP status word.  */
  emit_insn (gen_x86_fnstsw_1 (reg));

  if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
    {
      /* Load the flags from AH and branch on the unordered result.  */
      emit_insn (gen_x86_sahf_1 (reg));

      temp = gen_rtx_REG (CCmode, FLAGS_REG);
      temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
    }
  else
    {
      /* Test the C2 bit directly in the high byte of the status word.  */
      emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));

      temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
    }

  temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
			       gen_rtx_LABEL_REF (VOIDmode, label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
  /* The unordered case is expected to be rare.  */
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  JUMP_LABEL (insn) = label;
}
19898 :
/* Output code to perform an sinh XFmode calculation.  Uses the identity
   sinh(x) = sign(x) * 0.5 * (e + e / (e + 1)) with e = expm1(|x|),
   which stays accurate for small |x|.  */

void
ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|) */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
19947 :
/* Output code to perform an cosh XFmode calculation.  Uses the
   identity cosh(x) = 0.5 * (exp(x) + 1/exp(x)); no sign handling is
   needed since cosh is even.  */

void
ix86_emit_i387_cosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1;

  /* e1 = exp (op1) */
  emit_insn (gen_expxf2 (e1, op1));

  /* e2 = e1 + 1.0 / e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_divxf3 (e2, cst1, e1));
  emit_insn (gen_addxf3 (e2, e1, e2));

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
19970 :
/* Output code to perform an tanh XFmode calculation.  Uses the
   identity tanh(x) = -sign(x) * e / (e + 2) with e = expm1(-|2x|),
   which stays accurate near zero.  */

void
ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|) */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
20017 :
/* Output code to perform an asinh XFmode calculation.  Uses the
   identity asinh(x) = sign(x) * log1p (|x| + x^2 / (1 + sqrt(x^2 + 1))),
   which stays accurate for small |x|.  */

void
ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  /* e1 = e1 / e2 */
  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1| */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
20070 :
/* Output code to perform an acosh XFmode calculation.  Uses the
   identity acosh(x) = log (x + sqrt(x - 1) * sqrt(x + 1)); no sign
   handling is needed since acosh's domain is x >= 1.  */

void
ix86_emit_i387_acosh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));

  /* e2 = sqrt (op1 + 1.0) */
  emit_insn (gen_addxf3 (e2, op1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));

  /* e1 = sqrt (op1 - 1.0) */
  emit_insn (gen_subxf3 (e1, op1, cst1));
  emit_insn (gen_sqrtxf2 (e1, e1));

  /* e1 = e1 * e2 */
  emit_insn (gen_mulxf3 (e1, e1, e2));

  /* e1 = e1 + op1 */
  emit_insn (gen_addxf3 (e1, e1, op1));

  /* op0 = log (e1) */
  emit_insn (gen_logxf2 (op0, e1));
}
20097 :
/* Output code to perform an atanh XFmode calculation.  Uses the
   identity atanh(x) = -sign(x) * 0.5 * log1p (-2|x| / (|x| + 1)),
   which stays accurate near zero.  */

void
ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
20149 :
/* Output code to perform a log1p XFmode calculation.  Uses fyl2xp1
   for |op1| below the 1 - sqrt(2)/2 threshold (where it is accurate)
   and falls back to fyl2x on 1 + op1 otherwise.  */

void
ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  /* The emit_jump call emits pending stack adjust, make sure it is emitted
     before the conditional jump, otherwise the stack adjustment will be
     only conditional.  */
  do_pending_stack_adjust ();

  /* Threshold 1 - sqrt(2)/2, the limit of fyl2xp1's accurate range.  */
  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  emit_insn (gen_absxf2 (tmp, op1));

  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  /* Small |op1|: res = fyl2xp1 (op1) * ln2.  */
  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  /* Large |op1|: res = fyl2x (op1 + 1) * ln2.  */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
20195 :
/* Emit code for round calculation: OP0 = round (OP1), with OP1 in
   SFmode, DFmode or XFmode and OP0 in a floating or integral mode.
   Implements round(a) = sgn(a) * floor(fabs(a) + 0.5) on the i387.  */
void
ix86_emit_i387_round (rtx op0, rtx op1)
{
  machine_mode inmode = GET_MODE (op1);
  machine_mode outmode = GET_MODE (op0);
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx res = gen_reg_rtx (outmode);
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx (*floor_insn) (rtx, rtx);
  rtx (*neg_insn) (rtx, rtx);
  rtx_insn *insn;
  rtx tmp;

  /* Widen the input to XFmode so all computation happens there.  */
  switch (inmode)
    {
    case E_SFmode:
    case E_DFmode:
      tmp = gen_reg_rtx (XFmode);

      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
      op1 = tmp;
      break;
    case E_XFmode:
      break;
    default:
      gcc_unreachable ();
    }

  /* Select floor and negate patterns by the output mode; integral
     output modes use the lfloor patterns that round and convert.  */
  switch (outmode)
    {
    case E_SFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negsf2;
      break;
    case E_DFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negdf2;
      break;
    case E_XFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negxf2;
      break;
    case E_HImode:
      floor_insn = gen_lfloorxfhi2;
      neg_insn = gen_neghi2;
      break;
    case E_SImode:
      floor_insn = gen_lfloorxfsi2;
      neg_insn = gen_negsi2;
      break;
    case E_DImode:
      floor_insn = gen_lfloorxfdi2;
      neg_insn = gen_negdi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */

  /* scratch = fxam(op1) */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = fabs(op1) */
  emit_insn (gen_absxf2 (e1, op1));

  /* e2 = e1 + 0.5 */
  half = force_reg (XFmode, half);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));

  /* res = floor(e2) */
  switch (outmode)
    {
    case E_SFmode:
    case E_DFmode:
      {
	/* Round in XFmode, then truncate to the narrower float mode.  */
	tmp = gen_reg_rtx (XFmode);

	emit_insn (floor_insn (tmp, e2));
	emit_insn (gen_rtx_SET (res,
				gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
						UNSPEC_TRUNC_NOOP)));
      }
      break;
    default:
      emit_insn (floor_insn (res, e2));
    }

  /* flags = signbit(a) */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then res = -res */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (neg_insn (res, res));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, res);
}
20308 :
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].  */
20311 :
void
ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
  /* RES = A / B, approximated via the hardware reciprocal estimate of B
     refined by one Newton-Raphson step.  Insns are emitted into the
     current sequence; the order of the emit_insn calls is significant.  */
  rtx x0, x1, e0, e1;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  x1 = gen_reg_rtx (mode);

  b = force_reg (mode, b);

  /* x0 = rcp(b) estimate */
  if (mode == V16SFmode || mode == V8DFmode)
    {
      /* The 512-bit modes only have the AVX-512 RCP14 estimate.  */
      emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						  UNSPEC_RCP14)));
    }
  else
    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						UNSPEC_RCP)));

  unsigned vector_size = GET_MODE_SIZE (mode);

  /* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a
     N-R step with 2 fma implementation.  */
  if (TARGET_FMA
      || (TARGET_AVX512F && vector_size == 64)
      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    {
      /* e0 = x0 * a */
      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
      /* e1 = e0 * b - a */
      emit_insn (gen_rtx_SET (e1, gen_rtx_FMA (mode, e0, b,
					       gen_rtx_NEG (mode, a))));
      /* res = - e1 * x0 + e0 */
      emit_insn (gen_rtx_SET (res, gen_rtx_FMA (mode,
						gen_rtx_NEG (mode, e1),
						x0, e0)));
    }
  else
    /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
    {
      /* e0 = x0 * b */
      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));

      /* e1 = x0 + x0 */
      emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));

      /* e0 = x0 * e0 */
      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));

      /* x1 = e1 - e0 */
      emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));

      /* res = a * x1 */
      emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
    }
}
20371 :
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.  */
20374 :
void
ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
  /* RES = sqrt(A) (RECIP false) or RES = 1/sqrt(A) (RECIP true),
     computed from the hardware rsqrt estimate refined by one
     Newton-Raphson step.  */
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  /* Build the scalar constants -3.0 and -0.5 in SFmode; for vector
     modes they are broadcast below.  */
  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
	unspec = UNSPEC_RSQRT14;
    }

  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
					      unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX(mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
	{
	  mask = gen_reg_rtx (HImode);
	  /* Imm value 0x4 corresponds to not-equal comparison.  */
	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
	}
      else
	{
	  /* Zero out the estimate where a == 0.0 via a compare mask.  */
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
	}
    }

  mthree = force_reg (mode, mthree);

  /* e0 = x0 * a */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));

  unsigned vector_size = GET_MODE_SIZE (mode);
  if (TARGET_FMA
      || (TARGET_AVX512F && vector_size == 64)
      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    /* e2 = e0 * x0 + (-3.0) in a single fma.  */
    emit_insn (gen_rtx_SET (e2,
			    gen_rtx_FMA (mode, e0, x0, mthree)));
  else
    {
      /* e1 = e0 * x0 */
      emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

      /* e2 = e1 - 3. */
      emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
    }

  mhalf = force_reg (mode, mhalf);
  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}
20465 :
20466 : /* Expand fabs (OP0) and return a new rtx that holds the result. The
20467 : mask for masking out the sign-bit is stored in *SMASK, if that is
20468 : non-null. */
20469 :
static rtx
ix86_expand_sse_fabs (rtx op0, rtx *smask)
{
  machine_mode vmode, mode = GET_MODE (op0);
  rtx xa, mask;

  xa = gen_reg_rtx (mode);
  /* The sign-clearing mask is built in the corresponding vector mode;
     scalar SFmode/DFmode extract element 0 from it below.  */
  if (mode == SFmode)
    vmode = V4SFmode;
  else if (mode == DFmode)
    vmode = V2DFmode;
  else
    vmode = mode;
  mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
  if (!VECTOR_MODE_P (mode))
    {
      /* We need to generate a scalar mode mask in this case.  */
      rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
      tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
      mask = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (mask, tmp));
    }
  /* xa = op0 & mask clears the sign bit, i.e. xa = fabs (op0).  */
  emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));

  if (smask)
    *smask = mask;

  return xa;
}
20499 :
20500 : /* Expands a comparison of OP0 with OP1 using comparison code CODE,
20501 : swapping the operands if SWAP_OPERANDS is true. The expanded
20502 : code is a forward jump to a newly created label in case the
20503 : comparison is true. The generated label rtx is returned. */
static rtx_code_label *
ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
				  bool swap_operands)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx_code_label *label;
  rtx tmp, reg;

  if (swap_operands)
    std::swap (op0, op1);

  label = gen_label_rtx ();
  /* Set the FP flags from a CCFPmode compare of op0 with op1.  */
  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    /* Wrap an unordered compare in UNSPEC_NOTRAP so it does not trap
       on NaN operands.  */
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
  reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
  emit_insn (gen_rtx_SET (reg, tmp));
  /* Conditional jump to LABEL when CODE holds on the flags.  */
  tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
			      gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
  tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  JUMP_LABEL (tmp) = label;

  return label;
}
20529 :
20530 : /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
20531 : using comparison code CODE. Operands are swapped for the comparison if
20532 : SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
static rtx
ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
			      bool swap_operands)
{
  rtx (*insn)(rtx, rtx, rtx, rtx);
  machine_mode mode = GET_MODE (op0);
  rtx mask = gen_reg_rtx (mode);

  if (swap_operands)
    std::swap (op0, op1);

  /* Pick the scalar SSE setcc pattern matching the operand mode.  */
  insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;

  emit_insn (insn (mask, op0, op1,
		   gen_rtx_fmt_ee (code, mode, op0, op1)));
  return mask;
}
20550 :
20551 : /* Expand copysign from SIGN to the positive value ABS_VALUE
20552 : storing in RESULT. If MASK is non-null, it shall be a mask to mask out
20553 : the sign-bit. */
20554 :
static void
ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
{
  machine_mode mode = GET_MODE (sign);
  rtx sgn = gen_reg_rtx (mode);
  if (mask == NULL_RTX)
    {
      machine_mode vmode;

      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
      else if (mode == HFmode)
	vmode = V8HFmode;
      else
	vmode = mode;

      /* Build a mask selecting only the sign bit.  */
      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
      if (!VECTOR_MODE_P (mode))
	{
	  /* We need to generate a scalar mode mask in this case.  */
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, tmp));
	}
    }
  else
    /* A caller-provided MASK comes from ix86_expand_sse_fabs and has the
       sign bit clear, so invert it to select only the sign bit.  */
    mask = gen_rtx_NOT (mode, mask);
  /* sgn = sign bit of SIGN; result = abs_value | sgn.  */
  emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
  emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
}
20588 :
20589 : /* Expand SSE sequence for computing lround from OP1 storing
20590 : into OP0. */
20591 :
void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
	tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
	return (long)tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0), i.e. 0.5 - 2**(-p-1), the largest
     representable value below 0.5 in MODE.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj — truncating fix conversion.  */
  expand_fix (op0, adj, 0);
}
20619 :
/* Expand SSE2 sequence for computing lfloor or lceil from OPERAND1
   storing into OPERAND0.  */
20622 :
void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
	xi = (long)op1;
	xi -= (double)xi > op1 ? 1 : 0;
	return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg
     For ceil the compare operands are swapped (!do_floor) and 1 is
     added instead of subtracted.  */
  label = ix86_expand_sse_compare_and_jump (UNLE,
					    freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}
20656 :
20657 : /* Generate and return a rtx of mode MODE for 2**n where n is the number
20658 : of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
20659 :
20660 : static rtx
20661 996 : ix86_gen_TWO52 (machine_mode mode)
20662 : {
20663 996 : const struct real_format *fmt;
20664 996 : REAL_VALUE_TYPE TWO52r;
20665 996 : rtx TWO52;
20666 :
20667 996 : fmt = REAL_MODE_FORMAT (mode);
20668 996 : real_2expN (&TWO52r, fmt->p - 1, mode);
20669 996 : TWO52 = const_double_from_real_value (TWO52r, mode);
20670 996 : TWO52 = force_reg (mode, TWO52);
20671 :
20672 996 : return TWO52;
20673 : }
20674 :
20675 : /* Expand rint rounding OPERAND1 and storing the result in OPERAND0. */
20676 :
void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	xa = fabs (operand1);
	if (!isless (xa, 2**52))
	  return operand1;
	two52 = 2**52;
	if (flag_rounding_math)
	  {
	    two52 = copysign (two52, operand1);
	    xa = operand1;
	  }
	xa = xa + two52 - two52;
	return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; — already integral, skip.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  if (flag_rounding_math)
    {
      /* Honor the dynamic rounding mode: give TWO52 the sign of the
	 input and round operand1 itself, not its absolute value.  */
      ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
      xa = res;
    }

  /* xa = xa + TWO52 - TWO52 rounds xa to integer.  */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    xa = ix86_expand_sse_fabs (xa, NULL);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
20729 :
20730 : /* Expand SSE2 sequence for computing floor or ceil
20731 : from OPERAND1 storing into OPERAND0. */
void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;

     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;

	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; — already integral, skip.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x — round via fix/float round trip.  */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0)
     The compare mask selects 1.0 or 0.0; ceil swaps the operands and
     adds instead of subtracting.  */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
	tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
20796 :
20797 : /* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
20798 : into OPERAND0 without relying on DImode truncation via cvttsd2siq
20799 : that is only available on 64bit targets. */
void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	xa = xa + TWO52 - TWO52;
	x2 = copysign (xa, x);

     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;

	if (HONOR_SIGNED_ZEROS (mode))
	  x2 = copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; — already integral, skip.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; rounds to integer without a DImode fix.  */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0)
     Ceil swaps the compare operands and adds instead.  */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
	tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
20867 :
20868 : /* Expand SSE sequence for computing trunc
20869 : from OPERAND1 storing into OPERAND0. */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
	if (!isless (xa, TWO52))
	  return x;
	x2 = (double)(long)x;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; — already integral, skip.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x — truncating fix then float back.  */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* Restore the sign so trunc (-0.x) yields -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (xa, xa, res, mask);

  emit_move_insn (res, xa);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
20913 :
20914 : /* Expand SSE sequence for computing trunc from OPERAND1 storing
20915 : into OPERAND0 without relying on DImode truncation via cvttsd2siq
20916 : that is only available on 64bit targets. */
20917 : void
20918 0 : ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
20919 : {
20920 0 : machine_mode mode = GET_MODE (operand0);
20921 0 : rtx xa, xa2, TWO52, tmp, one, res, mask;
20922 0 : rtx_code_label *label;
20923 :
20924 : /* C code for SSE variant we expand below.
20925 : double xa = fabs (x), x2;
20926 : if (!isless (xa, TWO52))
20927 : return x;
20928 : xa2 = xa + TWO52 - TWO52;
20929 : Compensate:
20930 : if (xa2 > xa)
20931 : xa2 -= 1.0;
20932 : x2 = copysign (xa2, x);
20933 : return x2;
20934 : */
20935 :
20936 0 : TWO52 = ix86_gen_TWO52 (mode);
20937 :
20938 : /* Temporary for holding the result, initialized to the input
20939 : operand to ease control flow. */
20940 0 : res =copy_to_reg (operand1);
20941 :
20942 : /* xa = abs (operand1) */
20943 0 : xa = ix86_expand_sse_fabs (res, &mask);
20944 :
20945 : /* if (!isless (xa, TWO52)) goto label; */
20946 0 : label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
20947 :
20948 : /* xa2 = xa + TWO52 - TWO52; */
20949 0 : xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
20950 0 : xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
20951 :
20952 : /* generate 1.0 */
20953 0 : one = force_reg (mode, const_double_from_real_value (dconst1, mode));
20954 :
20955 : /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
20956 0 : tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
20957 0 : emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
20958 0 : tmp = expand_simple_binop (mode, MINUS,
20959 : xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
20960 : /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
20961 0 : if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
20962 0 : tmp = ix86_expand_sse_fabs (tmp, NULL);
20963 :
20964 : /* res = copysign (xa2, operand1) */
20965 0 : ix86_sse_copysign_to_positive (res, tmp, res, mask);
20966 :
20967 0 : emit_label (label);
20968 0 : LABEL_NUSES (label) = 1;
20969 :
20970 0 : emit_move_insn (operand0, res);
20971 0 : }
20972 :
20973 : /* Expand SSE sequence for computing round
20974 : from OPERAND1 storing into OPERAND0. */
void
ix86_expand_round (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	double xa = fabs (x);
	if (!isless (xa, TWO52))
	  return x;
	xa = (double)(long)(xa + nextafter (0.5, 0.0));
	return copysign (xa, x);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, TWO52, xa, xi, half, mask;
  rtx_code_label *label;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  TWO52 = ix86_gen_TWO52 (mode);
  xa = ix86_expand_sse_fabs (res, &mask);
  /* if (!isless (xa, TWO52)) goto label; — already integral, skip.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* load nextafter (0.5, 0.0), i.e. 0.5 - 2**(-p-1), the largest
     representable value below 0.5 in MODE.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* xa = xa + 0.5 */
  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa = (double)(int64_t)xa — truncating fix then float back.  */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, xa, 0);
  expand_float (xa, xi, 0);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
21021 :
21022 : /* Expand SSE sequence for computing round from OPERAND1 storing
21023 : into OPERAND0 without relying on DImode truncation via cvttsd2siq
21024 : that is only available on 64bit targets. */
void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), xa2, x2;
	if (!isless (xa, TWO52))
	  return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
	xa2 = xa + TWO52 - TWO52;
     Compensate.
	dxa = xa2 - xa;
	if (dxa <= -0.5)
	  xa2 += 1;
	else if (dxa > 0.5)
	  xa2 -= 1;
	x2 = copysign (xa2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label; — already integral, skip.  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; rounds to nearest integer.  */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; — the rounding error of the step above.  */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
			       0, OPTAB_DIRECT);

  /* Compensate.  */
  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, xa2, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
21091 :
21092 : /* Expand SSE sequence for computing round
21093 : from OP1 storing into OP0 using sse4 round insn. */
void
ix86_expand_round_sse4 (rtx op0, rtx op1)
{
  machine_mode mode = GET_MODE (op0);
  rtx e1, e2, res, half;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx (*gen_copysign) (rtx, rtx, rtx);
  rtx (*gen_round) (rtx, rtx, rtx);

  /* Pick the copysign and round patterns matching the operand mode.  */
  switch (mode)
    {
    case E_HFmode:
      gen_copysign = gen_copysignhf3;
      gen_round = gen_sse4_1_roundhf2;
      break;
    case E_SFmode:
      gen_copysign = gen_copysignsf3;
      gen_round = gen_sse4_1_roundsf2;
      break;
    case E_DFmode:
      gen_copysign = gen_copysigndf3;
      gen_round = gen_sse4_1_rounddf2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round (a) = trunc (a + copysign (0.5, a)) */

  /* load nextafter (0.5, 0.0), i.e. 0.5 - 2**(-p-1), the largest
     representable value below 0.5 in MODE.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
  half = const_double_from_real_value (pred_half, mode);

  /* e1 = copysign (0.5, op1) */
  e1 = gen_reg_rtx (mode);
  emit_insn (gen_copysign (e1, half, op1));

  /* e2 = op1 + e1 */
  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = trunc (e2) via the SSE4.1 round insn.  */
  res = gen_reg_rtx (mode);
  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));

  emit_move_insn (op0, res);
}
21143 :
21144 : /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
21145 : insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
21146 : insn every time. */
21147 :
21148 : static GTY(()) rtx_insn *vselect_insn;
21149 :
21150 : /* Initialize vselect_insn. */
21151 :
static void
init_vselect_insn (void)
{
  unsigned i;
  rtx x;

  /* Build a placeholder (set (nil) (vec_select (vec_concat …) (parallel)))
     whose operands are patched in by expand_vselect{,_vconcat}.  */
  x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
  for (i = 0; i < MAX_VECT_LEN; ++i)
    XVECEXP (x, 0, i) = const0_rtx;
  x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
							const0_rtx), x);
  x = gen_rtx_SET (const0_rtx, x);
  /* Emit into a throwaway sequence so the placeholder does not land in
     the instruction stream currently being expanded.  */
  start_sequence ();
  vselect_insn = emit_insn (x);
  end_sequence ();
}
21168 :
21169 : /* Construct (set target (vec_select op0 (parallel perm))) and
21170 : return true if that's a valid instruction in the active ISA. */
21171 :
static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
		unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  /* Patch the cached insn in place: fill the PARALLEL with the NELT
     permutation indices, install OP0 and TARGET, then ask recog
     whether the result matches a pattern.  */
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  /* Emit a copy of the pattern only when it recognized and we are not
     merely testing.  */
  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  /* Restore the cached insn to its neutral state for the next call.  */
  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}
21202 :
/* Similar, but generate a vec_concat from op0 and op1 as well, i.e.
   try (set target (vec_select (vec_concat op0 op1) (parallel perm)))
   where PERM indexes the concatenated 2*NELT-element vector.  */

static bool
expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
			const unsigned char *perm, unsigned nelt,
			bool testing_p)
{
  machine_mode v2mode;
  rtx x;
  bool ok;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  /* The concatenation needs a vector mode twice as wide; bail out if
     no such mode exists.  */
  if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
    return false;
  /* Patch the cached vec_concat with the real operands, delegate to
     expand_vselect, then neutralize the operands again so the cached
     insn holds no stale references.  */
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  PUT_MODE (x, v2mode);
  XEXP (x, 0) = op0;
  XEXP (x, 1) = op1;
  ok = expand_vselect (target, x, perm, nelt, testing_p);
  XEXP (x, 0) = const0_rtx;
  XEXP (x, 1) = const0_rtx;
  return ok;
}
21228 :
21229 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
21230 : using movss or movsd. */
21231 : static bool
21232 317242 : expand_vec_perm_movs (struct expand_vec_perm_d *d)
21233 : {
21234 317242 : machine_mode vmode = d->vmode;
21235 317242 : unsigned i, nelt = d->nelt;
21236 317242 : rtx x;
21237 :
21238 317242 : if (d->one_operand_p)
21239 : return false;
21240 :
21241 290387 : if (!(TARGET_SSE && (vmode == V4SFmode || vmode == V4SImode))
21242 140485 : && !(TARGET_MMX_WITH_SSE && (vmode == V2SFmode || vmode == V2SImode))
21243 85383 : && !(TARGET_SSE2 && (vmode == V2DFmode || vmode == V2DImode)))
21244 : return false;
21245 :
21246 : /* Only the first element is changed. */
21247 214071 : if (d->perm[0] != nelt && d->perm[0] != 0)
21248 : return false;
21249 160124 : for (i = 1; i < nelt; ++i)
21250 124987 : if (d->perm[i] != i + nelt - d->perm[0])
21251 : return false;
21252 :
21253 35137 : if (d->testing_p)
21254 : return true;
21255 :
21256 6531 : if (d->perm[0] == nelt)
21257 0 : x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
21258 : else
21259 6531 : x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));
21260 :
21261 6531 : emit_insn (gen_rtx_SET (d->target, x));
21262 :
21263 6531 : return true;
21264 : }
21265 :
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   using insertps, i.e. when exactly one destination element differs
   from a plain copy of one of the operands.  */
static bool
expand_vec_perm_insertps (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, cnt_s, nelt = d->nelt;
  /* Index of the single element that is not an identity copy,
     or -1 while none (or more than one) has been found.  */
  int cnt_d = -1;
  rtx src, dst;

  if (d->one_operand_p)
    return false;

  /* insertps is SSE4.1; narrow 2-element modes additionally need
     MMX-in-SSE.  */
  if (!(TARGET_SSE4_1
	&& (vmode == V4SFmode || vmode == V4SImode
	    || (TARGET_MMX_WITH_SSE
		&& (vmode == V2SFmode || vmode == V2SImode)))))
    return false;

  /* First try: all elements but one copied from op0 (perm[i] == i).  */
  for (i = 0; i < nelt; ++i)
    {
      if (d->perm[i] == i)
	continue;
      if (cnt_d != -1)
	{
	  /* Second mismatch against op0; reset and fall through to
	     retry against op1 below.  */
	  cnt_d = -1;
	  break;
	}
      cnt_d = i;
    }

  if (cnt_d == -1)
    {
      /* Second try: all elements but one copied from op1
	 (perm[i] == i + nelt).  */
      for (i = 0; i < nelt; ++i)
	{
	  if (d->perm[i] == i + nelt)
	    continue;
	  if (cnt_d != -1)
	    return false;
	  cnt_d = i;
	}

      /* A full identity permutation is not insertps material.  */
      if (cnt_d == -1)
	return false;
    }

  if (d->testing_p)
    return true;

  gcc_assert (cnt_d != -1);

  /* The odd element comes from SRC; everything else from DST.  */
  cnt_s = d->perm[cnt_d];
  if (cnt_s < nelt)
    {
      src = d->op0;
      dst = d->op1;
    }
  else
    {
      cnt_s -= nelt;
      src = d->op1;
      dst = d->op0;
    }
  gcc_assert (cnt_s < nelt);

  /* insertps immediate: bits 7:6 = source element, bits 5:4 = dest
     element.  */
  rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
			       GEN_INT (cnt_s << 6 | cnt_d << 4));
  emit_insn (x);

  return true;
}
21337 :
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of blendp[sd] / pblendw / pblendvb / vpblendd, i.e. as a
   per-lane selection between the two operands where every element
   keeps its position.  */

static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
  machine_mode mmode, vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  unsigned HOST_WIDE_INT mask;
  rtx target, op0, op1, maskop, x;
  rtx rperm[32], vperm;

  if (d->one_operand_p)
    return false;
  /* ISA gates: 64-byte vectors need AVX512F (and AVX512BW for sub-word
     elements), 32-byte need AVX2 or AVX (float modes), smaller need
     SSE4.1 (plus MMX-in-SSE for 8-byte vectors).  */
  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
      && (TARGET_AVX512BW
	  || GET_MODE_UNIT_SIZE (vmode) >= 4))
    ;
  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
	   && (GET_MODE_SIZE (vmode) == 16
	       || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
	       || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* This is a blend, not a permute.  Elements must stay in their
     respective lanes.  */
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (!(e == i || e == i + nelt))
	return false;
    }

  if (d->testing_p)
    return true;

  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
     decision should be extracted elsewhere, so that we only try that
     sequence once all budget==3 options have been tried.  */
  target = d->target;
  op0 = d->op0;
  op1 = d->op1;
  mask = 0;

  /* Build the immediate (or vector) blend mask; bit i set means take
     element i from op1.  Modes with no native element-granular blend
     are retargeted to a wider-element mode via do_subreg.  */
  switch (vmode)
    {
    case E_V8DFmode:
    case E_V16SFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2SFmode:
    case E_V2HImode:
    case E_V4HImode:
    case E_V8HImode:
    case E_V8SImode:
    case E_V32HImode:
    case E_V64QImode:
    case E_V16SImode:
    case E_V8DImode:
      /* Direct one-bit-per-element mask.  */
      for (i = 0; i < nelt; ++i)
	mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
      break;

    case E_V2DImode:
      /* No integer DI blend; express each DI element as 4 HI ones.  */
      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
      vmode = V8HImode;
      goto do_subreg;

    case E_V2SImode:
      /* Likewise, SI elements as pairs of HI ones.  */
      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
      vmode = V4HImode;
      goto do_subreg;

    case E_V4SImode:
      if (TARGET_AVX2)
	{
	  /* Use vpblendd instead of vpblendw.  */
	  for (i = 0; i < nelt; ++i)
	    mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
	  break;
	}
      else
	{
	  for (i = 0; i < 4; ++i)
	    mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
	  vmode = V8HImode;
	  goto do_subreg;
	}

    case E_V16QImode:
      /* See if bytes move in pairs so we can use pblendw with
	 an immediate argument, rather than pblendvb with a vector
	 argument.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  {
	  use_pblendvb:
	    /* Byte-granular blend needs a constant selector vector:
	       -1 picks op1's byte, 0 keeps op0's.  */
	    for (i = 0; i < nelt; ++i)
	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);

	  finish_pblendvb:
	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
	    vperm = force_reg (vmode, vperm);

	    if (GET_MODE_SIZE (vmode) == 4)
	      emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
	    else if (GET_MODE_SIZE (vmode) == 8)
	      emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
	    else if (GET_MODE_SIZE (vmode) == 16)
	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
	    else
	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
	    if (target != d->target)
	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	    return true;
	  }

      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8HImode;
      /* FALLTHRU */

    do_subreg:
      /* Redo the blend in the retargeted (wider-element) mode on
	 lowpart subregs, then copy back to the real target.  */
      target = gen_reg_rtx (vmode);
      op0 = gen_lowpart (vmode, op0);
      op1 = gen_lowpart (vmode, op1);
      break;

    case E_V8QImode:
      /* Paired bytes can use pblendw on V4HI; otherwise pblendvb.  */
      for (i = 0; i < 8; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;

      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i * 2] >= 8) << i;
      vmode = V4HImode;
      goto do_subreg;

    case E_V4QImode:
      /* Likewise for 4-byte vectors, via V2HI.  */
      for (i = 0; i < 4; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;

      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i * 2] >= 4) << i;
      vmode = V2HImode;
      goto do_subreg;

    case E_V32QImode:
      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
      for (i = 0; i < 32; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;
      /* See if bytes move in quadruplets.  If yes, vpblendd
	 with immediate can be used.  */
      for (i = 0; i < 32; i += 4)
	if (d->perm[i] + 2 != d->perm[i + 2])
	  break;
      if (i < 32)
	{
	  /* See if bytes move the same in both lanes.  If yes,
	     vpblendw with immediate can be used.  */
	  for (i = 0; i < 16; i += 2)
	    if (d->perm[i] + 16 != d->perm[i + 16])
	      goto use_pblendvb;

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i * 2] >= 32) << i;
	  vmode = V16HImode;
	  goto do_subreg;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 4] >= 32) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V16HImode:
      /* See if words move in pairs.  If yes, vpblendd can be used.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  break;
      if (i < 16)
	{
	  /* See if words move the same in both lanes.  If not,
	     vpblendvb must be used.  */
	  for (i = 0; i < 8; i++)
	    if (d->perm[i] + 8 != d->perm[i + 8])
	      {
		/* Use vpblendvb.  */
		for (i = 0; i < 32; ++i)
		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);

		vmode = V32QImode;
		nelt = 32;
		target = gen_reg_rtx (vmode);
		op0 = gen_lowpart (vmode, op0);
		op1 = gen_lowpart (vmode, op1);
		goto finish_pblendvb;
	      }

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i] >= 16) << i;
	  break;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V4DImode:
      /* Use vpblendd.  */
      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8SImode;
      goto do_subreg;

    default:
      gcc_unreachable ();
    }

  /* AVX512 blends take the mask in a mask register of an integer mode
     sized to the element count; everything else uses an immediate.  */
  switch (vmode)
    {
    case E_V8DFmode:
    case E_V8DImode:
      mmode = QImode;
      break;
    case E_V16SFmode:
    case E_V16SImode:
      mmode = HImode;
      break;
    case E_V32HImode:
      mmode = SImode;
      break;
    case E_V64QImode:
      mmode = DImode;
      break;
    default:
      mmode = VOIDmode;
    }

  /* Canonicalize vec_merge.  */
  if (swap_commutative_operands_p (op1, op0)
      /* Two operands have same precedence, then
	 first bit of mask select first operand.  */
      || (!swap_commutative_operands_p (op0, op1)
	  && !(mask & 1)))
    {
      unsigned n_elts = GET_MODE_NUNITS (vmode);
      std::swap (op0, op1);
      unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
      /* Complement the mask within the mode's element count, avoiding
	 an undefined full-width shift when n_elts fills the word.  */
      if (n_elts == HOST_BITS_PER_WIDE_INT)
	mask_all = -1;
      else
	mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
      mask = ~mask & mask_all;
    }

  if (mmode != VOIDmode)
    maskop = force_reg (mmode, gen_int_mode (mask, mmode));
  else
    maskop = GEN_INT (mask);

  /* This matches five different patterns with the different modes.  */
  x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
  x = gen_rtx_SET (target, x);
  emit_insn (x);
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
21625 :
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of the variable form of vpermilps.

   Note that we will have already failed the immediate input vpermilps,
   which requires that the high and low part shuffle be identical; the
   variable form doesn't require that.  */

static bool
expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
{
  rtx rperm[8], vperm;
  unsigned i;

  /* Variable vpermilps handles one-operand V8SI/V8SF on AVX only.  */
  if (!TARGET_AVX || !d->one_operand_p
      || (d->vmode != V8SImode && d->vmode != V8SFmode))
    return false;

  /* We can only permute within the 128-bit lane.  */
  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];
      if (i < 4 ? e >= 4 : e < 4)
	return false;
    }

  if (d->testing_p)
    return true;

  /* Build the per-element selector, reduced to lane-local 0..3.  */
  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];

      /* Within each 128-bit lane, the elements of op0 are numbered
	 from 0 and the elements of op1 are numbered from 4.  */
      if (e >= 8 + 4)
	e -= 8;
      else if (e >= 4)
	e -= 4;

      rperm[i] = GEN_INT (e);
    }

  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
  vperm = force_reg (V8SImode, vperm);
  rtx target = d->target;
  rtx op0 = d->op0;
  /* The pattern is written for V8SF; view integer operands as float.  */
  if (d->vmode == V8SImode)
    {
      target = lowpart_subreg (V8SFmode, target, V8SImode);
      op0 = lowpart_subreg (V8SFmode, op0, V8SImode);
    }

  emit_insn (gen_avx_vpermilvarv8sf3 (target, op0, vperm));

  return true;
}
21682 :
/* For V*[QHS]Imode permutations, check if the same permutation
   can't be performed in a 2x, 4x or 8x wider inner mode.  If it can,
   fill *ND with the equivalent wider-element permutation (recursing
   to the widest applicable mode) and return true.  ND may alias D.  */

static bool
canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
			      struct expand_vec_perm_d *nd)
{
  int i;
  machine_mode mode = VOIDmode;

  /* Map each integer vector mode to the mode with elements twice as
     wide; anything else cannot be widened here.  */
  switch (d->vmode)
    {
    case E_V8QImode: mode = V4HImode; break;
    case E_V16QImode: mode = V8HImode; break;
    case E_V32QImode: mode = V16HImode; break;
    case E_V64QImode: mode = V32HImode; break;
    case E_V4HImode: mode = V2SImode; break;
    case E_V8HImode: mode = V4SImode; break;
    case E_V16HImode: mode = V8SImode; break;
    case E_V32HImode: mode = V16SImode; break;
    case E_V4SImode: mode = V2DImode; break;
    case E_V8SImode: mode = V4DImode; break;
    case E_V16SImode: mode = V8DImode; break;
    default: return false;
    }
  /* Widening is only valid if every even/odd element pair selects an
     aligned, consecutive pair from the source.  */
  for (i = 0; i < d->nelt; i += 2)
    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
      return false;
  nd->vmode = mode;
  nd->nelt = d->nelt / 2;
  for (i = 0; i < nd->nelt; i++)
    nd->perm[i] = d->perm[2 * i] / 2;
  /* Keep widening until DImode elements or until pairs stop aligning;
     the recursive call updates ND in place (nd == d there).  */
  if (GET_MODE_INNER (mode) != DImode)
    canonicalize_vector_int_perm (nd, nd);
  /* When writing into a separate descriptor, also translate the
     operands/target into the new mode.  */
  if (nd != d)
    {
      nd->one_operand_p = d->one_operand_p;
      nd->testing_p = d->testing_p;
      if (d->op0 == d->op1)
	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
      else
	{
	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
	}
      /* While testing, avoid allocating a real pseudo.  */
      if (d->testing_p)
	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
      else
	nd->target = gen_reg_rtx (nd->vmode);
    }
  return true;
}
21735 :
21736 : /* Return true if permutation D can be performed as VMODE permutation
21737 : instead. */
21738 :
21739 : static bool
21740 7580 : valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
21741 : {
21742 7580 : unsigned int i, j, chunk;
21743 :
21744 7580 : if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
21745 7580 : || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
21746 18636 : || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
21747 : return false;
21748 :
21749 11056 : if (GET_MODE_NUNITS (vmode) >= d->nelt)
21750 : return true;
21751 :
21752 5236 : chunk = d->nelt / GET_MODE_NUNITS (vmode);
21753 7186 : for (i = 0; i < d->nelt; i += chunk)
21754 6939 : if (d->perm[i] & (chunk - 1))
21755 : return false;
21756 : else
21757 12694 : for (j = 1; j < chunk; ++j)
21758 10744 : if (d->perm[i] + j != d->perm[i + j])
21759 : return false;
21760 :
21761 : return true;
21762 : }
21763 :
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.  */

static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
  unsigned i, nelt, eltsz, mask;
  unsigned char perm[64];
  machine_mode vmode;
  struct expand_vec_perm_d nd;
  rtx rperm[64], vperm, target, op0, op1;

  nelt = d->nelt;

  /* Select the byte-shuffle mode (VMODE) this permutation will be
     carried out in, gated on vector size, operand count, and ISA.
     Two-operand byte shuffles need XOP's vpperm (or vperm2i128 for
     whole-lane 32-byte swaps); one-operand shuffles use SSSE3+
     pshufb and friends.  */
  if (!d->one_operand_p)
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
	if (!TARGET_XOP)
	  return false;
	vmode = V4QImode;
	break;

      case 8:
	if (!TARGET_XOP)
	  return false;
	vmode = V8QImode;
	break;

      case 16:
	if (!TARGET_XOP)
	  return false;
	vmode = V16QImode;
	break;

      case 32:
	if (!TARGET_AVX2)
	  return false;

	/* Whole-128-bit-lane permutations map to vperm2i128.  */
	if (valid_perm_using_mode_p (V2TImode, d))
	  {
	    if (d->testing_p)
	      return true;

	    /* Use vperm2i128 insn.  The pattern uses
	       V4DImode instead of V2TImode.  */
	    target = d->target;
	    if (d->vmode != V4DImode)
	      target = gen_reg_rtx (V4DImode);
	    op0 = gen_lowpart (V4DImode, d->op0);
	    op1 = gen_lowpart (V4DImode, d->op1);
	    /* Immediate: low nibble selects the low lane's source
	       lane, bits 4+ the high lane's.  */
	    rperm[0]
	      = GEN_INT ((d->perm[0] / (nelt / 2))
			 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
	    emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
	    if (target != d->target)
	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	    return true;
	  }
	/* FALLTHRU */

      default:
	return false;
      }
  else
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V4QImode;
	break;

      case 8:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V8QImode;
	break;

      case 16:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V16QImode;
	break;

      case 32:
	if (!TARGET_AVX2)
	  return false;

	/* V4DImode should be already handled through
	   expand_vselect by vpermq instruction.  */
	gcc_assert (d->vmode != V4DImode);

	vmode = V32QImode;
	if (d->vmode == V8SImode
	    || d->vmode == V16HImode
	    || d->vmode == V32QImode)
	  {
	    /* First see if vpermq can be used for
	       V8SImode/V16HImode/V32QImode.  */
	    if (valid_perm_using_mode_p (V4DImode, d))
	      {
		/* Scale the permutation down to 4 DI elements.  */
		for (i = 0; i < 4; i++)
		  perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
		if (d->testing_p)
		  return true;
		target = gen_reg_rtx (V4DImode);
		if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
				    perm, 4, false))
		  {
		    emit_move_insn (d->target,
				    gen_lowpart (d->vmode, target));
		    return true;
		  }
		return false;
	      }

	    /* Next see if vpermd can be used.  */
	    if (valid_perm_using_mode_p (V8SImode, d))
	      vmode = V8SImode;
	  }
	/* Or if vpermps can be used.  */
	else if (d->vmode == V8SFmode)
	  vmode = V8SImode;

	if (vmode == V32QImode)
	  {
	    /* vpshufb only works intra lanes, it is not
	       possible to shuffle bytes in between the lanes.  */
	    for (i = 0; i < nelt; ++i)
	      if ((d->perm[i] ^ i) & (nelt / 2))
		return false;
	  }
	break;

      case 64:
	if (!TARGET_AVX512BW)
	  return false;

	/* If vpermq didn't work, vpshufb won't work either.  */
	if (d->vmode == V8DFmode || d->vmode == V8DImode)
	  return false;

	vmode = V64QImode;
	if (d->vmode == V16SImode
	    || d->vmode == V32HImode
	    || d->vmode == V64QImode)
	  {
	    /* First see if vpermq can be used for
	       V16SImode/V32HImode/V64QImode.  */
	    if (valid_perm_using_mode_p (V8DImode, d))
	      {
		/* Scale the permutation down to 8 DI elements.  */
		for (i = 0; i < 8; i++)
		  perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
		if (d->testing_p)
		  return true;
		target = gen_reg_rtx (V8DImode);
		if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
				    perm, 8, false))
		  {
		    emit_move_insn (d->target,
				    gen_lowpart (d->vmode, target));
		    return true;
		  }
		return false;
	      }

	    /* Next see if vpermd can be used.  */
	    if (valid_perm_using_mode_p (V16SImode, d))
	      vmode = V16SImode;
	  }
	/* Or if vpermps can be used.  */
	else if (d->vmode == V16SFmode)
	  vmode = V16SImode;

	if (vmode == V64QImode)
	  {
	    /* vpshufb only works intra lanes, it is not
	       possible to shuffle bytes in between the lanes.  */
	    for (i = 0; i < nelt; ++i)
	      if ((d->perm[i] ^ i) & (3 * nelt / 4))
		return false;
	  }
	break;

      default:
	return false;
      }

  if (d->testing_p)
    return true;

  /* Try to avoid variable permutation instruction.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Build the selector constant.  vpermd/vpermps take one dword index
     per element; byte shuffles take one byte index per byte.  */
  if (vmode == V8SImode)
    for (i = 0; i < 8; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
  else if (vmode == V16SImode)
    for (i = 0; i < 16; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
  else
    {
      eltsz = GET_MODE_UNIT_SIZE (d->vmode);
      /* MASK wraps indices to what the instruction can address: the
	 full two-operand range for vpperm, a single 128-bit lane's
	 worth for vpshufb on wider vectors.  */
      if (!d->one_operand_p)
	mask = 2 * nelt - 1;
      else if (vmode == V64QImode)
	mask = nelt / 4 - 1;
      else if (vmode == V32QImode)
	mask = nelt / 2 - 1;
      else
	mask = nelt - 1;

      /* Expand each element index into ELTSZ consecutive byte
	 indices.  */
      for (i = 0; i < nelt; ++i)
	{
	  unsigned j, e = d->perm[i] & mask;
	  for (j = 0; j < eltsz; ++j)
	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
	}
    }

  machine_mode vpmode = vmode;

  /* From here on NELT counts bytes of the shuffle mode.  */
  nelt = GET_MODE_SIZE (vmode);

  /* Emulate narrow modes with V16QI instructions.  */
  if (nelt < 16)
    {
      rtx m128 = GEN_INT (-128);

      /* Remap elements from the second operand, as we have to
	 account for inactive top elements from the first operand.  */
      if (!d->one_operand_p)
	{
	  for (i = 0; i < nelt; ++i)
	    {
	      unsigned ival = UINTVAL (rperm[i]);
	      if (ival >= nelt)
		rperm[i] = GEN_INT (ival + 16 - nelt);
	    }
	}

      /* Fill inactive elements in the top positions with zeros.  */
      for (i = nelt; i < 16; ++i)
	rperm[i] = m128;

      vpmode = V16QImode;
    }

  vperm = gen_rtx_CONST_VECTOR (vpmode,
				gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
  vperm = force_reg (vpmode, vperm);

  if (vmode == d->vmode)
    target = d->target;
  else
    target = gen_reg_rtx (vmode);

  op0 = gen_lowpart (vmode, d->op0);

  /* Dispatch to the generator matching the chosen shuffle mode.  */
  if (d->one_operand_p)
    {
      rtx (*gen) (rtx, rtx, rtx);

      if (vmode == V4QImode)
	gen = gen_mmx_pshufbv4qi3;
      else if (vmode == V8QImode)
	gen = gen_mmx_pshufbv8qi3;
      else if (vmode == V16QImode)
	gen = gen_ssse3_pshufbv16qi3;
      else if (vmode == V32QImode)
	gen = gen_avx2_pshufbv32qi3;
      else if (vmode == V64QImode)
	gen = gen_avx512bw_pshufbv64qi3;
      else if (vmode == V8SFmode)
	gen = gen_avx2_permvarv8sf;
      else if (vmode == V8SImode)
	gen = gen_avx2_permvarv8si;
      else if (vmode == V16SFmode)
	gen = gen_avx512f_permvarv16sf;
      else if (vmode == V16SImode)
	gen = gen_avx512f_permvarv16si;
      else
	gcc_unreachable ();

      emit_insn (gen (target, op0, vperm));
    }
  else
    {
      rtx (*gen) (rtx, rtx, rtx, rtx);

      op1 = gen_lowpart (vmode, d->op1);

      if (vmode == V4QImode)
	gen = gen_mmx_ppermv32;
      else if (vmode == V8QImode)
	gen = gen_mmx_ppermv64;
      else if (vmode == V16QImode)
	gen = gen_xop_pperm;
      else
	gcc_unreachable ();

      emit_insn (gen (target, op0, op1, vperm));
    }

  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
22078 :
/* Try to expand one-operand permutation with constant mask using an
   AVX-512 vperm{b,w,d,q,ps,pd} variable-permute instruction whose
   selector is a forced-to-register constant vector.  */

static bool
ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
{
  machine_mode mode = GET_MODE (d->op0);
  /* Mask vector mode; differs from MODE for float vectors, whose
     permvar patterns take an integer selector.  */
  machine_mode maskmode = mode;
  unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
  rtx (*gen) (rtx, rtx, rtx) = NULL;
  rtx target, op0, mask;
  rtx vec[64];

  /* One-operand only.  */
  if (!rtx_equal_p (d->op0, d->op1))
    return false;

  if (!TARGET_AVX512F)
    return false;

  /* Accept VNxHImode and VNxQImode now.  */
  if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
    return false;

  /* vpermw.  */
  if (!TARGET_AVX512BW && inner_size == 2)
    return false;

  /* vpermb.  */
  if (!TARGET_AVX512VBMI && inner_size == 1)
    return false;

  switch (mode)
    {
    case E_V16SImode:
      gen = gen_avx512f_permvarv16si;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_permvarv16sf;
      maskmode = V16SImode;
      break;
    case E_V8DImode:
      gen = gen_avx512f_permvarv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_permvarv8df;
      maskmode = V8DImode;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_permvarv32hi;
      break;
    case E_V16HImode:
      gen = gen_avx512vl_permvarv16hi;
      break;
    case E_V8HImode:
      gen = gen_avx512vl_permvarv8hi;
      break;
    case E_V64QImode:
      gen = gen_avx512bw_permvarv64qi;
      break;
    case E_V32QImode:
      gen = gen_avx512vl_permvarv32qi;
      break;
    case E_V16QImode:
      gen = gen_avx512vl_permvarv16qi;
      break;

    default:
      return false;
    }

  if (d->testing_p)
    return true;

  /* Materialize the permutation as a constant selector vector and
     emit the variable-permute instruction.  */
  target = d->target;
  op0 = d->op0;
  for (int i = 0; i < d->nelt; ++i)
    vec[i] = GEN_INT (d->perm[i]);
  mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
  emit_insn (gen (target, op0, force_reg (maskmode, mask)));
  return true;
}
22159 :
22160 : static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
22161 :
22162 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
22163 : in a single instruction. */
22164 :
22165 : static bool
22166 355334 : expand_vec_perm_1 (struct expand_vec_perm_d *d)
22167 : {
22168 355334 : unsigned i, nelt = d->nelt;
22169 355334 : struct expand_vec_perm_d nd;
22170 :
22171 : /* Check plain VEC_SELECT first, because AVX has instructions that could
22172 : match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
22173 : input where SEL+CONCAT may not. */
22174 355334 : if (d->one_operand_p)
22175 : {
22176 : int mask = nelt - 1;
22177 : bool identity_perm = true;
22178 : bool broadcast_perm = true;
22179 :
      :   /* Fold every index into the single operand and classify the
      :      permutation: identity (every element stays put) or broadcast
      :      (every index selects element 0).  */
22180 529502 : for (i = 0; i < nelt; i++)
22181 : {
22182 466142 : nd.perm[i] = d->perm[i] & mask;
22183 466142 : if (nd.perm[i] != i)
22184 349773 : identity_perm = false;
22185 466142 : if (nd.perm[i])
22186 386944 : broadcast_perm = false;
22187 : }
22188 :
22189 63360 : if (identity_perm)
22190 : {
22191 59 : if (!d->testing_p)
22192 5 : emit_move_insn (d->target, d->op0)_;
22193 59 : return true;
22194 : }
22195 63301 : else if (broadcast_perm && TARGET_AVX2)
22196 : {
22197 : /* Use vpbroadcast{b,w,d}. */
22198 390 : rtx (*gen) (rtx, rtx) = NULL;
22199 390 : switch (d->vmode)
22200 : {
22201 1 : case E_V64QImode:
22202 1 : if (TARGET_AVX512BW)
22203 : gen = gen_avx512bw_vec_dupv64qi_1;
22204 : break;
22205 4 : case E_V32QImode:
22206 4 : gen = gen_avx2_pbroadcastv32qi_1;
22207 4 : break;
22208 1 : case E_V32HImode:
22209 1 : if (TARGET_AVX512BW)
22210 : gen = gen_avx512bw_vec_dupv32hi_1;
22211 : break;
22212 4 : case E_V16HImode:
22213 4 : gen = gen_avx2_pbroadcastv16hi_1;
22214 4 : break;
22215 1 : case E_V16SImode:
22216 1 : if (TARGET_AVX512F)
22217 : gen = gen_avx512f_vec_dupv16si_1;
22218 : break;
22219 4 : case E_V8SImode:
22220 4 : gen = gen_avx2_pbroadcastv8si_1;
22221 4 : break;
22222 4 : case E_V16QImode:
22223 4 : gen = gen_avx2_pbroadcastv16qi;
22224 4 : break;
22225 5 : case E_V8HImode:
22226 5 : gen = gen_avx2_pbroadcastv8hi;
22227 5 : break;
22228 0 : case E_V16SFmode:
22229 0 : if (TARGET_AVX512F)
22230 : gen = gen_avx512f_vec_dupv16sf_1;
22231 : break;
22232 : case E_V8SFmode:
22233 : gen = gen_avx2_vec_dupv8sf_1;
22234 : break;
22235 0 : case E_V8DFmode:
22236 0 : if (TARGET_AVX512F)
22237 : gen = gen_avx512f_vec_dupv8df_1;
22238 : break;
22239 0 : case E_V8DImode:
22240 0 : if (TARGET_AVX512F)
22241 : gen = gen_avx512f_vec_dupv8di_1;
22242 : break;
22243 : /* For other modes prefer other shuffles this function creates. */
22244 : default: break;
22245 : }
22246 21 : if (gen != NULL)
22247 : {
22248 24 : if (!d->testing_p)
22249 24 : emit_insn (gen (d->target, d->op0));
22250 24 : return true;
22251 : }
22252 : }
22253 :
22254 63277 : if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
22255 : return true;
22256 :
22257 : /* There are plenty of patterns in sse.md that are written for
22258 : SEL+CONCAT and are not replicated for a single op. Perhaps
22259 : that should be changed, to avoid the nastiness here. */
22260 :
22261 : /* Recognize interleave style patterns, which means incrementing
22262 : every other permutation operand. */
22263 211028 : for (i = 0; i < nelt; i += 2)
22264 : {
22265 173616 : nd.perm[i] = d->perm[i] & mask;
22266 173616 : nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
22267 : }
22268 37412 : if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
22269 37412 : d->testing_p))
22270 : return true;
22271 :
22272 : /* Recognize shufps, which means adding {0, 0, nelt, nelt}. */
22273 32528 : if (nelt >= 4)
22274 : {
22275 113548 : for (i = 0; i < nelt; i += 4)
22276 : {
22277 81020 : nd.perm[i + 0] = d->perm[i + 0] & mask;
22278 81020 : nd.perm[i + 1] = d->perm[i + 1] & mask;
22279 81020 : nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
22280 81020 : nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
22281 : }
22282 :
22283 32528 : if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
22284 32528 : d->testing_p))
22285 : return true;
22286 : }
22287 : }
22288 :
22289 : /* Try the SSE4.1 blend variable merge instructions. */
22290 318829 : if (expand_vec_perm_blend (d))
22291 : return true;
22292 :
22293 : /* Try movss/movsd instructions. */
22294 317242 : if (expand_vec_perm_movs (d))
22295 : return true;
22296 :
22297 : /* Try the SSE4.1 insertps instruction. */
22298 282105 : if (expand_vec_perm_insertps (d))
22299 : return true;
22300 :
22301 : /* Try the fully general two operand permute. */
22302 277925 : if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
22303 277925 : d->testing_p))
22304 : return true;
22305 :
22306 : /* Recognize interleave style patterns with reversed operands. */
22307 137513 : if (!d->one_operand_p)
22308 : {
      :   /* Flip each index to the other operand (e -> e +- nelt) and retry
      :      with op0/op1 swapped.  */
22309 900030 : for (i = 0; i < nelt; ++i)
22310 : {
22311 789264 : unsigned e = d->perm[i];
22312 789264 : if (e >= nelt)
22313 386356 : e -= nelt;
22314 : else
22315 402908 : e += nelt;
22316 789264 : nd.perm[i] = e;
22317 : }
22318 :
22319 110766 : if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
22320 110766 : d->testing_p))
22321 : return true;
22322 : }
22323 :
22324 : /* Try one of the AVX vpermil variable permutations. */
22325 137501 : if (expand_vec_perm_vpermil (d))
22326 : return true;
22327 :
22328 : /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
22329 : vpshufb, vpermd, vpermps or vpermq variable permutation. */
22330 136563 : if (expand_vec_perm_pshufb (d))
22331 : return true;
22332 :
22333 : /* Try the AVX2 vpalignr instruction. */
22334 124121 : if (expand_vec_perm_palignr (d, true))
22335 : return true;
22336 :
22337 : /* Try the AVX512F vperm{w,b,s,d} instructions */
22338 124001 : if (ix86_expand_vec_one_operand_perm_avx512 (d))
22339 : return true;
22340 :
22341 : /* Try the AVX512F vpermt2/vpermi2 instructions. */
22342 123800 : if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
22343 : return true;
22344 :
22345 : /* See if we can get the same permutation in different vector integer
22346 : mode. */
      :   /* NOTE: this recurses into expand_vec_perm_1 with ND describing the
      :      same shuffle in a wider-element integer mode; presumably
      :      canonicalize_vector_int_perm only succeeds when the mode actually
      :      changes, which bounds the recursion -- confirm in its definition.  */
22347 122844 : if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
22348 : {
22349 6616 : if (!d->testing_p)
22350 1207 : emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
22351 6616 : return true;
22352 : }
22353 : return false;
22354 : }
22355 :
22356 : /* Canonicalize the vec_perm index so that the first index
22357 : always comes from the first vector. */
22358 : static void
22359 8189 : ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
22360 : {
22361 8189 : unsigned nelt = d->nelt;
      :   /* Already canonical: index 0 selects from the first operand.  */
22362 8189 : if (d->perm[0] < nelt)
22363 : return;
22364 :
      :   /* Adding nelt modulo 2*nelt flips every index to refer to the
      :      other operand; swapping op0/op1 then preserves the shuffle.  */
22365 5 : for (unsigned i = 0; i != nelt; i++)
22366 4 : d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
22367 :
22368 1 : std::swap (d->op0, d->op1);
22369 1 : return;
22370 : }
22371 :
22372 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
22373 : in terms of a pair of shufps+ shufps/pshufd instructions. */
22374 : static bool
22375 83619 : expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
22376 : {
22377 83619 : unsigned char perm1[4];
22378 83619 : machine_mode vmode = d->vmode;
22379 83619 : bool ok;
22380 83619 : unsigned i, j, k, count = 0;
22381 :
      :   /* Only two-operand shuffles of 4-element 32-bit vectors apply.  */
22382 83619 : if (d->one_operand_p
22383 78339 : || (vmode != V4SImode && vmode != V4SFmode))
22384 : return false;
22385 :
22386 34762 : if (d->testing_p)
22387 : return true;
22388 :
22389 8189 : ix86_vec_perm_index_canon (d);
      :   /* COUNT = how many of the four indices select from the second
      :      operand (indices 4..7).  */
22390 49134 : for (i = 0; i < 4; ++i)
22391 51259 : count += d->perm[i] > 3 ? 1 : 0;
22392 :
      :   /* A pure one-operand shuffle (count 0 or 4) was rejected above,
      :      so count must be 1, 2 or 3 here.  */
22393 8189 : gcc_assert (count & 3);
22394 :
22395 8189 : rtx tmp = gen_reg_rtx (vmode);
22396 : /* 2 from op0 and 2 from op1. */
22397 8189 : if (count == 2)
22398 : {
22399 : unsigned char perm2[4];
      :       /* Gather op0 elements into the low half of TMP and op1
      :          elements into the high half (perm1), remembering in perm2
      :          where each original lane ended up.  */
22400 18240 : for (i = 0, j = 0, k = 2; i < 4; ++i)
22401 14592 : if (d->perm[i] & 4)
22402 : {
22403 7296 : perm1[k++] = d->perm[i];
22404 7296 : perm2[i] = k - 1;
22405 : }
22406 : else
22407 : {
22408 7296 : perm1[j++] = d->perm[i];
22409 7296 : perm2[i] = j - 1;
22410 : }
22411 :
22412 : /* shufps. */
22413 7296 : ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
22414 3648 : perm1, d->nelt, false);
22415 3648 : gcc_assert (ok);
22416 3648 : if (vmode == V4SImode && TARGET_SSE2)
22417 : /* pshufd. */
22418 2092 : ok = expand_vselect (d->target, tmp,
22419 2092 : perm2, d->nelt, false);
22420 : else
22421 : {
22422 : /* shufps. */
22423 1556 : perm2[2] += 4;
22424 1556 : perm2[3] += 4;
22425 1556 : ok = expand_vselect_vconcat (d->target, tmp, tmp,
22426 1556 : perm2, d->nelt, false);
22427 : }
22428 3648 : gcc_assert (ok);
22429 : }
22430 : /* 3 from one op and 1 from another. */
22431 : else
22432 : {
22433 22705 : unsigned pair_idx = 8, lone_idx = 8, shift;
22434 :
22435 : /* Find the lone index. */
22436 22705 : for (i = 0; i < 4; ++i)
22437 18164 : if ((d->perm[i] > 3 && count == 1)
22438 14831 : || (d->perm[i] < 4 && count == 3))
22439 18164 : lone_idx = i;
22440 :
22441 : /* When lone_idx is not 0, it must from second op(count == 1). */
22442 5749 : gcc_assert (count == (lone_idx ? 1 : 3));
22443 :
22444 : /* Find the pair index that sits in the same half as the lone index. */
22445 4541 : shift = lone_idx & 2;
22446 4541 : pair_idx = 1 - lone_idx + 2 * shift;
22447 :
22448 : /* First permutate lone index and pair index into the same vector as
22449 : [ lone, lone, pair, pair ]. */
22450 9082 : perm1[1] = perm1[0]
22451 4541 : = (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
22452 9082 : perm1[3] = perm1[2]
22453 4541 : = (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;
22454 :
22455 : /* Alway put the vector contains lone indx at the first. */
22456 4541 : if (count == 1)
22457 3333 : std::swap (d->op0, d->op1);
22458 :
22459 : /* shufps. */
22460 9082 : ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
22461 4541 : perm1, d->nelt, false);
22462 4541 : gcc_assert (ok);
22463 :
22464 : /* Refine lone and pair index to original order. */
22465 4541 : perm1[shift] = lone_idx << 1;
22466 4541 : perm1[shift + 1] = pair_idx << 1;
22467 :
22468 : /* Select the remaining 2 elements in another vector. */
22469 13623 : for (i = 2 - shift; i < 4 - shift; ++i)
22470 9082 : perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];
22471 :
22472 : /* Adjust to original selector. */
22473 4541 : if (lone_idx > 1)
22474 2246 : std::swap (tmp, d->op1);
22475 :
22476 : /* shufps. */
22477 9082 : ok = expand_vselect_vconcat (d->target, tmp, d->op1,
22478 4541 : perm1, d->nelt, false);
22479 :
22480 4541 : gcc_assert (ok);
22481 : }
22482 :
22483 : return true;
22484 : }
22485 :
22486 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
22487 : in terms of a pair of pshuflw + pshufhw instructions. */
22488 :
22489 : static bool
22490 100982 : expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
22491 : {
22492 100982 : unsigned char perm2[MAX_VECT_LEN];
22493 100982 : unsigned i;
22494 100982 : bool ok;
22495 :
22496 100982 : if (d->vmode != V8HImode || !d->one_operand_p)
22497 : return false;
22498 :
22499 : /* The two permutations only operate in 64-bit lanes. */
      :   /* The low four indices must stay in the low half and the high
      :      four in the high half for pshuflw/pshufhw to apply.  */
22500 12859 : for (i = 0; i < 4; ++i)
22501 10382 : if (d->perm[i] >= 4)
22502 : return false;
22503 12329 : for (i = 4; i < 8; ++i)
22504 9866 : if (d->perm[i] < 4)
22505 : return false;
22506 :
22507 2463 : if (d->testing_p)
22508 : return true;
22509 :
22510 : /* Emit the pshuflw. */
      :   /* perm[] elements are unsigned char, so these 4-byte memcpys copy
      :      exactly the four low (resp. high) shuffle indices.  */
22511 134 : memcpy (perm2, d->perm, 4);
22512 670 : for (i = 4; i < 8; ++i)
22513 536 : perm2[i] = i;
22514 134 : ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
22515 134 : gcc_assert (ok);
22516 :
22517 : /* Emit the pshufhw. */
22518 134 : memcpy (perm2 + 4, d->perm + 4, 4);
22519 670 : for (i = 0; i < 4; ++i)
22520 536 : perm2[i] = i;
22521 134 : ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
22522 134 : gcc_assert (ok);
22523 :
22524 : return true;
22525 : }
22526 :
22527 : /* Try to permute 2 64-bit vectors by punpckldq + 128-bit vector shuffle. */
22528 : static bool
22529 48857 : expand_vec_perm_punpckldq_pshuf (struct expand_vec_perm_d *d)
22530 : {
22531 48857 : if (GET_MODE_BITSIZE (d->vmode) != 64
22532 15098 : || !TARGET_MMX_WITH_SSE
22533 63955 : || d->one_operand_p)
22534 : return false;
22535 :
      :   /* Pick the 128-bit mode that holds both 64-bit operands; the
      :      single-insn shuffle used afterwards is pshufd for 32-bit
      :      elements and pshufb (SSSE3 required) for 8/16-bit elements.  */
22536 13703 : machine_mode widen_vmode;
22537 13703 : switch (d->vmode)
22538 : {
22539 : /* pshufd. */
22540 : case E_V2SImode:
22541 : widen_vmode = V4SImode;
22542 : break;
22543 :
22544 : /* pshufd. */
22545 1101 : case E_V2SFmode:
22546 1101 : widen_vmode = V4SFmode;
22547 1101 : break;
22548 :
22549 4663 : case E_V4HImode:
22550 4663 : widen_vmode = V8HImode;
22551 : /* pshufb. */
22552 4663 : if (!TARGET_SSSE3)
22553 : return false;
22554 : break;
22555 :
22556 5562 : case E_V8QImode:
22557 : /* pshufb. */
22558 5562 : widen_vmode = V16QImode;
22559 5562 : if (!TARGET_SSSE3)
22560 : return false;
22561 : break;
22562 :
22563 : default:
22564 : return false;
22565 : }
22566 :
22567 5274 : if (d->testing_p)
22568 : return true;
22569 :
      :   /* Concatenate op0:op1 into one 128-bit register, then perform the
      :      whole permutation as a one-operand shuffle of that register.  */
22570 379 : struct expand_vec_perm_d dperm;
22571 379 : dperm.target = gen_reg_rtx (widen_vmode);
22572 379 : rtx op0 = gen_reg_rtx (widen_vmode);
22573 379 : emit_move_insn (op0, gen_rtx_VEC_CONCAT (widen_vmode, d->op0, d->op1));
22574 379 : dperm.op0 = op0;
22575 379 : dperm.op1 = op0;
22576 379 : dperm.vmode = widen_vmode;
22577 379 : unsigned nelt = GET_MODE_NUNITS (widen_vmode);
22578 379 : dperm.nelt = nelt;
22579 379 : dperm.one_operand_p = true;
22580 379 : dperm.testing_p = false;
22581 :
      :   /* Only the low half of the widened result is used; duplicate the
      :      requested indices into the high half.  */
22582 2009 : for (unsigned i = 0; i != nelt / 2; i++)
22583 : {
22584 1630 : dperm.perm[i] = d->perm[i];
22585 1630 : dperm.perm[i + nelt / 2] = d->perm[i];
22586 : }
22587 :
22588 379 : gcc_assert (expand_vec_perm_1 (&dperm));
22589 379 : emit_move_insn (d->target, lowpart_subreg (d->vmode,
22590 : dperm.target,
22591 : dperm.vmode));
22592 379 : return true;
22593 : }
22594 :
22595 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
22596 : the permutation using the SSSE3 palignr instruction. This succeeds
22597 : when all of the elements in PERM fit within one vector and we merely
22598 : need to shift them down so that a single vector permutation has a
22599 : chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
22600 : the vpalignr instruction itself can perform the requested permutation. */
22601 :
22602 : static bool
22603 222640 : expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
22604 : {
22605 222640 : unsigned i, nelt = d->nelt;
22606 222640 : unsigned min, max, minswap, maxswap;
22607 222640 : bool in_order, ok, swap = false;
22608 222640 : rtx shift, target;
22609 222640 : struct expand_vec_perm_d dcopy;
22610 :
22611 : /* Even with AVX, palignr only operates on 128-bit vectors,
22612 : in AVX2 palignr operates on both 128-bit lanes. */
22613 120848 : if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
22614 267468 : && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
22615 : return false;
22616 :
      :   /* Compute the span [min, max] of the indices both as given and
      :      with the operands conceptually swapped (eswap); palignr applies
      :      when one of the two spans fits within a single vector (per
      :      128-bit lane for 32-byte modes).  */
22617 35543 : min = 2 * nelt;
22618 35543 : max = 0;
22619 35543 : minswap = 2 * nelt;
22620 35543 : maxswap = 0;
22621 259515 : for (i = 0; i < nelt; ++i)
22622 : {
22623 223972 : unsigned e = d->perm[i];
22624 223972 : unsigned eswap = d->perm[i] ^ nelt;
22625 447944 : if (GET_MODE_SIZE (d->vmode) == 32)
22626 : {
22627 89592 : e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
22628 89592 : eswap = e ^ (nelt / 2);
22629 : }
22630 223972 : if (e < min)
22631 : min = e;
22632 223972 : if (e > max)
22633 : max = e;
22634 223972 : if (eswap < minswap)
22635 : minswap = eswap;
22636 223972 : if (eswap > maxswap)
22637 : maxswap = eswap;
22638 : }
22639 35543 : if (min == 0
22640 51597 : || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
22641 : {
      :       /* The direct span does not work; fall back to the swapped-operand
      :          span if it does (two-operand case only).  */
22642 32347 : if (d->one_operand_p
22643 32078 : || minswap == 0
22644 68519 : || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
22645 18086 : ? nelt / 2 : nelt))
22646 : return false;
22647 : swap = true;
22648 : min = minswap;
22649 6416 : max = maxswap;
22650 : }
22651 :
22652 : /* Given that we have SSSE3, we know we'll be able to implement the
22653 : single operand permutation after the palignr with pshufb for
22654 : 128-bit vectors. If SINGLE_INSN_ONLY_P, in_order has to be computed
22655 : first. */
22656 6466 : if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
22657 : return true;
22658 :
22659 6416 : dcopy = *d;
22660 6416 : if (swap)
22661 : {
22662 3220 : dcopy.op0 = d->op1;
22663 3220 : dcopy.op1 = d->op0;
22664 16172 : for (i = 0; i < nelt; ++i)
22665 12952 : dcopy.perm[i] ^= nelt;
22666 : }
22667 :
      :   /* Rebase every index by MIN; if the rebased permutation is the
      :      identity, palignr alone implements D.  */
22668 : in_order = true;
22669 32632 : for (i = 0; i < nelt; ++i)
22670 : {
22671 26216 : unsigned e = dcopy.perm[i];
22672 26216 : if (GET_MODE_SIZE (d->vmode) == 32
22673 1120 : && e >= nelt
22674 26466 : && (e & (nelt / 2 - 1)) < min)
22675 250 : e = e - min - (nelt / 2);
22676 : else
22677 25966 : e = e - min;
22678 26216 : if (e != i)
22679 19394 : in_order = false;
22680 26216 : dcopy.perm[i] = e;
22681 : }
22682 6416 : dcopy.one_operand_p = true;
22683 :
22684 6416 : if (single_insn_only_p && !in_order)
22685 : return false;
22686 :
22687 : /* For AVX2, test whether we can permute the result in one instruction. */
22688 3267 : if (d->testing_p)
22689 : {
22690 50 : if (in_order)
22691 : return true;
22692 0 : dcopy.op1 = dcopy.op0;
22693 0 : return expand_vec_perm_1 (&dcopy);
22694 : }
22695 :
22696 6434 : shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
22697 6434 : if (GET_MODE_SIZE (d->vmode) == 16)
22698 : {
22699 3145 : target = gen_reg_rtx (V1TImode);
22700 3145 : emit_insn (gen_ssse3_palignrv1ti (target,
22701 3145 : gen_lowpart (V1TImode, dcopy.op1),
22702 3145 : gen_lowpart (V1TImode, dcopy.op0),
22703 : shift));
22704 : }
22705 : else
22706 : {
22707 72 : target = gen_reg_rtx (V2TImode);
22708 72 : emit_insn (gen_avx2_palignrv2ti (target,
22709 72 : gen_lowpart (V2TImode, dcopy.op1),
22710 72 : gen_lowpart (V2TImode, dcopy.op0),
22711 : shift));
22712 : }
22713 :
22714 3217 : dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);
22715 :
22716 : /* Test for the degenerate case where the alignment by itself
22717 : produces the desired permutation. */
22718 3217 : if (in_order)
22719 : {
22720 70 : emit_move_insn (d->target, dcopy.op0);
22721 70 : return true;
22722 : }
22723 :
22724 3147 : ok = expand_vec_perm_1 (&dcopy);
22725 3159 : gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);
22726 :
22727 : return ok;
22728 : }
22729 :
22730 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
22731 : the permutation using the SSE4_1 pblendv instruction. Potentially
22732 : reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
22733 :
22734 : static bool
22735 88832 : expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
22736 : {
22737 88832 : unsigned i, which, nelt = d->nelt;
22738 88832 : struct expand_vec_perm_d dcopy, dcopy1;
22739 88832 : machine_mode vmode = d->vmode;
22740 88832 : bool ok;
22741 :
22742 : /* Use the same checks as in expand_vec_perm_blend. */
22743 88832 : if (d->one_operand_p)
22744 : return false;
22745 87603 : if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
22746 : ;
22747 81373 : else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
22748 : ;
22749 77602 : else if (TARGET_SSE4_1
22750 87783 : && (GET_MODE_SIZE (vmode) == 16
22751 8484 : || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
22752 2559 : || GET_MODE_SIZE (vmode) == 4))
22753 : ;
22754 : else
22755 : return false;
22756 :
22757 : /* Figure out where permutation elements stay not in their
22758 : respective lanes. */
      :   /* WHICH accumulates bit 1 for an out-of-place element taken from
      :      op0 and bit 2 for one taken from op1.  */
22759 119408 : for (i = 0, which = 0; i < nelt; ++i)
22760 : {
22761 103648 : unsigned e = d->perm[i];
22762 103648 : if (e != i)
22763 141880 : which |= (e < nelt ? 1 : 2);
22764 : }
22765 : /* We can pblend the part where elements stay not in their
22766 : respective lanes only when these elements are all in one
22767 : half of a permutation.
22768 : {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
22769 : lanes, but both 8 and 9 >= 8
22770 : {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
22771 : respective lanes and 8 >= 8, but 2 not. */
22772 15760 : if (which != 1 && which != 2)
22773 : return false;
22774 3361 : if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
22775 : return true;
22776 :
22777 : /* First we apply one operand permutation to the part where
22778 : elements stay not in their respective lanes. */
22779 2051 : dcopy = *d;
22780 2051 : if (which == 2)
22781 2051 : dcopy.op0 = dcopy.op1 = d->op1;
22782 : else
22783 0 : dcopy.op0 = dcopy.op1 = d->op0;
22784 2051 : if (!d->testing_p)
22785 741 : dcopy.target = gen_reg_rtx (vmode);
22786 2051 : dcopy.one_operand_p = true;
22787 :
22788 16603 : for (i = 0; i < nelt; ++i)
22789 14552 : dcopy.perm[i] = d->perm[i] & (nelt - 1);
22790 :
22791 2051 : ok = expand_vec_perm_1 (&dcopy);
22792 4102 : if (GET_MODE_SIZE (vmode) != 16 && !ok)
22793 : return false;
22794 : else
22795 1756 : gcc_assert (ok);
22796 1756 : if (d->testing_p)
22797 : return true;
22798 :
22799 : /* Next we put permuted elements into their positions. */
22800 679 : dcopy1 = *d;
22801 679 : if (which == 2)
22802 679 : dcopy1.op1 = dcopy.target;
22803 : else
22804 0 : dcopy1.op0 = dcopy.target;
22805 :
      :   /* Build the blend selector: position i comes from the permuted
      :      vector when the original index crossed operands, else from i.  */
22806 5751 : for (i = 0; i < nelt; ++i)
22807 5072 : dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);
22808 :
22809 679 : ok = expand_vec_perm_blend (&dcopy1);
22810 679 : gcc_assert (ok);
22811 :
22812 : return true;
22813 : }
22814 :
22815 : static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
22816 :
22817 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
22818 : a two vector permutation into a single vector permutation by using
22819 : an interleave operation to merge the vectors. */
22820 :
22821 : static bool
22822 95384 : expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
22823 : {
22824 95384 : struct expand_vec_perm_d dremap, dfinal;
22825 95384 : unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
22826 95384 : unsigned HOST_WIDE_INT contents;
22827 95384 : unsigned char remap[2 * MAX_VECT_LEN];
22828 95384 : rtx_insn *seq;
22829 95384 : bool ok, same_halves = false;
22830 :
22831 95384 : if (GET_MODE_SIZE (d->vmode) == 4
22832 171866 : || GET_MODE_SIZE (d->vmode) == 8
22833 233334 : || GET_MODE_SIZE (d->vmode) == 16)
22834 : {
22835 87983 : if (d->one_operand_p)
22836 : return false;
22837 : }
22838 14802 : else if (GET_MODE_SIZE (d->vmode) == 32)
22839 : {
22840 7049 : if (!TARGET_AVX)
22841 : return false;
22842 : /* For 32-byte modes allow even d->one_operand_p.
22843 : The lack of cross-lane shuffling in some instructions
22844 : might prevent a single insn shuffle. */
22845 7049 : dfinal = *d;
22846 7049 : dfinal.testing_p = true;
22847 : /* If expand_vec_perm_interleave3 can expand this into
22848 : a 3 insn sequence, give up and let it be expanded as
22849 : 3 insn sequence. While that is one insn longer,
22850 : it doesn't need a memory operand and in the common
22851 : case that both interleave low and high permutations
22852 : with the same operands are adjacent needs 4 insns
22853 : for both after CSE. */
22854 7049 : if (expand_vec_perm_interleave3 (&dfinal))
22855 : return false;
22856 : }
22857 : else
22858 : return false;
22859 :
22860 : /* Examine from whence the elements come. */
      :   /* CONTENTS is a bitmask over the 2*nelt possible source indices;
      :      bit e is set iff some output element reads input element e.  */
22861 : contents = 0;
22862 680829 : for (i = 0; i < nelt; ++i)
22863 591216 : contents |= HOST_WIDE_INT_1U << d->perm[i];
22864 :
22865 89613 : memset (remap, 0xff, sizeof (remap));
22866 89613 : dremap = *d;
22867 :
22868 89613 : if (GET_MODE_SIZE (d->vmode) == 4
22869 171440 : || GET_MODE_SIZE (d->vmode) == 8)
22870 : {
22871 23345 : unsigned HOST_WIDE_INT h1, h2, h3, h4;
22872 :
22873 : /* Split the two input vectors into 4 halves. */
22874 23345 : h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
22875 23345 : h2 = h1 << nelt2;
22876 23345 : h3 = h2 << nelt2;
22877 23345 : h4 = h3 << nelt2;
22878 :
22879 : /* If the elements from the low halves use interleave low,
22880 : and similarly for interleave high. */
22881 23345 : if ((contents & (h1 | h3)) == contents)
22882 : {
22883 : /* punpckl* */
22884 3247 : for (i = 0; i < nelt2; ++i)
22885 : {
22886 2292 : remap[i] = i * 2;
22887 2292 : remap[i + nelt] = i * 2 + 1;
22888 2292 : dremap.perm[i * 2] = i;
22889 2292 : dremap.perm[i * 2 + 1] = i + nelt;
22890 : }
22891 : }
22892 22390 : else if ((contents & (h2 | h4)) == contents)
22893 : {
22894 : /* punpckh* */
22895 2836 : for (i = 0; i < nelt2; ++i)
22896 : {
22897 2000 : remap[i + nelt2] = i * 2;
22898 2000 : remap[i + nelt + nelt2] = i * 2 + 1;
22899 2000 : dremap.perm[i * 2] = i + nelt2;
22900 2000 : dremap.perm[i * 2 + 1] = i + nelt + nelt2;
22901 : }
22902 : }
22903 : else
22904 : return false;
22905 : }
22906 132536 : else if (GET_MODE_SIZE (d->vmode) == 16)
22907 : {
22908 59437 : unsigned HOST_WIDE_INT h1, h2, h3, h4;
22909 :
22910 : /* Split the two input vectors into 4 halves. */
22911 59437 : h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
22912 59437 : h2 = h1 << nelt2;
22913 59437 : h3 = h2 << nelt2;
22914 59437 : h4 = h3 << nelt2;
22915 :
22916 : /* If the elements from the low halves use interleave low, and similarly
22917 : for interleave high. If the elements are from mis-matched halves, we
22918 : can use shufps for V4SF/V4SI or do a DImode shuffle. */
22919 59437 : if ((contents & (h1 | h3)) == contents)
22920 : {
22921 : /* punpckl* */
22922 5923 : for (i = 0; i < nelt2; ++i)
22923 : {
22924 4382 : remap[i] = i * 2;
22925 4382 : remap[i + nelt] = i * 2 + 1;
22926 4382 : dremap.perm[i * 2] = i;
22927 4382 : dremap.perm[i * 2 + 1] = i + nelt;
22928 : }
22929 1541 : if (!TARGET_SSE2 && d->vmode == V4SImode)
22930 0 : dremap.vmode = V4SFmode;
22931 : }
22932 57896 : else if ((contents & (h2 | h4)) == contents)
22933 : {
22934 : /* punpckh* */
22935 5130 : for (i = 0; i < nelt2; ++i)
22936 : {
22937 3762 : remap[i + nelt2] = i * 2;
22938 3762 : remap[i + nelt + nelt2] = i * 2 + 1;
22939 3762 : dremap.perm[i * 2] = i + nelt2;
22940 3762 : dremap.perm[i * 2 + 1] = i + nelt + nelt2;
22941 : }
22942 1368 : if (!TARGET_SSE2 && d->vmode == V4SImode)
22943 0 : dremap.vmode = V4SFmode;
22944 : }
22945 56528 : else if ((contents & (h1 | h4)) == contents)
22946 : {
22947 : /* shufps */
22948 2537 : for (i = 0; i < nelt2; ++i)
22949 : {
22950 1828 : remap[i] = i;
22951 1828 : remap[i + nelt + nelt2] = i + nelt2;
22952 1828 : dremap.perm[i] = i;
22953 1828 : dremap.perm[i + nelt2] = i + nelt + nelt2;
22954 : }
22955 709 : if (nelt != 4)
22956 : {
22957 : /* shufpd */
22958 69 : dremap.vmode = V2DImode;
22959 69 : dremap.nelt = 2;
22960 69 : dremap.perm[0] = 0;
22961 69 : dremap.perm[1] = 3;
22962 : }
22963 : }
22964 55819 : else if ((contents & (h2 | h3)) == contents)
22965 : {
22966 : /* shufps */
22967 3483 : for (i = 0; i < nelt2; ++i)
22968 : {
22969 2458 : remap[i + nelt2] = i;
22970 2458 : remap[i + nelt] = i + nelt2;
22971 2458 : dremap.perm[i] = i + nelt2;
22972 2458 : dremap.perm[i + nelt2] = i + nelt;
22973 : }
22974 1025 : if (nelt != 4)
22975 : {
22976 : /* shufpd */
22977 76 : dremap.vmode = V2DImode;
22978 76 : dremap.nelt = 2;
22979 76 : dremap.perm[0] = 1;
22980 76 : dremap.perm[1] = 2;
22981 : }
22982 : }
22983 : else
22984 : return false;
22985 : }
22986 : else
22987 : {
22988 6831 : unsigned int nelt4 = nelt / 4, nzcnt = 0;
22989 6831 : unsigned HOST_WIDE_INT q[8];
22990 6831 : unsigned int nonzero_halves[4];
22991 :
22992 : /* Split the two input vectors into 8 quarters. */
22993 6831 : q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
22994 54648 : for (i = 1; i < 8; ++i)
22995 47817 : q[i] = q[0] << (nelt4 * i);
      :       /* Count the 128-bit halves (of the 2 input vectors) that
      :          contribute at least one element.  */
22996 34155 : for (i = 0; i < 4; ++i)
22997 27324 : if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
22998 : {
22999 24487 : nonzero_halves[nzcnt] = i;
23000 24487 : ++nzcnt;
23001 : }
23002 :
23003 6831 : if (nzcnt == 1)
23004 : {
23005 221 : gcc_assert (d->one_operand_p);
23006 221 : nonzero_halves[1] = nonzero_halves[0];
23007 221 : same_halves = true;
23008 : }
23009 6610 : else if (d->one_operand_p)
23010 : {
23011 23 : gcc_assert (nonzero_halves[0] == 0);
23012 23 : gcc_assert (nonzero_halves[1] == 1);
23013 : }
23014 :
23015 6831 : if (nzcnt <= 2)
23016 : {
23017 544 : if (d->perm[0] / nelt2 == nonzero_halves[1])
23018 : {
23019 : /* Attempt to increase the likelihood that dfinal
23020 : shuffle will be intra-lane. */
23021 229 : std::swap (nonzero_halves[0], nonzero_halves[1]);
23022 : }
23023 :
23024 : /* vperm2f128 or vperm2i128. */
23025 3526 : for (i = 0; i < nelt2; ++i)
23026 : {
23027 2982 : remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
23028 2982 : remap[i + nonzero_halves[0] * nelt2] = i;
23029 2982 : dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
23030 2982 : dremap.perm[i] = i + nonzero_halves[0] * nelt2;
23031 : }
23032 :
23033 544 : if (d->vmode != V8SFmode
23034 : && d->vmode != V4DFmode
23035 : && d->vmode != V8SImode)
23036 : {
23037 132 : dremap.vmode = V8SImode;
23038 132 : dremap.nelt = 8;
23039 660 : for (i = 0; i < 4; ++i)
23040 : {
23041 528 : dremap.perm[i] = i + nonzero_halves[0] * 4;
23042 528 : dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
23043 : }
23044 : }
23045 : }
23046 6287 : else if (d->one_operand_p)
23047 5822 : return false;
23048 6287 : else if (TARGET_AVX2
23049 2600 : && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
23050 : {
23051 : /* vpunpckl* */
23052 491 : for (i = 0; i < nelt4; ++i)
23053 : {
23054 247 : remap[i] = i * 2;
23055 247 : remap[i + nelt] = i * 2 + 1;
23056 247 : remap[i + nelt2] = i * 2 + nelt2;
23057 247 : remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
23058 247 : dremap.perm[i * 2] = i;
23059 247 : dremap.perm[i * 2 + 1] = i + nelt;
23060 247 : dremap.perm[i * 2 + nelt2] = i + nelt2;
23061 247 : dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
23062 : }
23063 : }
23064 6043 : else if (TARGET_AVX2
23065 2356 : && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
23066 : {
23067 : /* vpunpckh* */
23068 445 : for (i = 0; i < nelt4; ++i)
23069 : {
23070 224 : remap[i + nelt4] = i * 2;
23071 224 : remap[i + nelt + nelt4] = i * 2 + 1;
23072 224 : remap[i + nelt2 + nelt4] = i * 2 + nelt2;
23073 224 : remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
23074 224 : dremap.perm[i * 2] = i + nelt4;
23075 224 : dremap.perm[i * 2 + 1] = i + nelt + nelt4;
23076 224 : dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
23077 224 : dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
23078 : }
23079 : }
23080 : else
23081 : return false;
23082 : }
23083 :
23084 : /* Use the remapping array set up above to move the elements from their
23085 : swizzled locations into their final destinations. */
23086 7443 : dfinal = *d;
23087 48735 : for (i = 0; i < nelt; ++i)
23088 : {
23089 41292 : unsigned e = remap[d->perm[i]];
23090 41292 : gcc_assert (e < nelt);
23091 : /* If same_halves is true, both halves of the remapped vector are the
23092 : same. Avoid cross-lane accesses if possible. */
23093 41292 : if (same_halves && i >= nelt2)
23094 : {
23095 816 : gcc_assert (e < nelt2);
23096 816 : dfinal.perm[i] = e + nelt2;
23097 : }
23098 : else
23099 40476 : dfinal.perm[i] = e;
23100 : }
23101 7443 : if (!d->testing_p)
23102 : {
23103 2775 : dremap.target = gen_reg_rtx (dremap.vmode);
23104 2775 : dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
23105 : }
23106 7443 : dfinal.op1 = dfinal.op0;
23107 7443 : dfinal.one_operand_p = true;
23108 :
23109 : /* Test if the final remap can be done with a single insn. For V4SFmode or
23110 : V4SImode this *will* succeed. For V8HImode or V16QImode it may not. */
      :   /* The final shuffle is expanded into a pending sequence first so it
      :      can be discarded if it turns out not to be a single insn.  */
23111 7443 : start_sequence ();
23112 7443 : ok = expand_vec_perm_1 (&dfinal);
23113 7443 : seq = end_sequence ();
23114 :
23115 7443 : if (!ok)
23116 : return false;
23117 :
23118 6387 : if (d->testing_p)
23119 : return true;
23120 :
23121 2736 : if (dremap.vmode != dfinal.vmode)
23122 : {
23123 55 : dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
23124 55 : dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
23125 : }
23126 :
23127 2736 : ok = expand_vec_perm_1 (&dremap);
23128 2736 : gcc_assert (ok);
23129 :
23130 2736 : emit_insn (seq);
23131 2736 : return true;
23132 : }
23133 :
23134 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
23135 : a single vector cross-lane permutation into vpermq followed
23136 : by any of the single insn permutations. */
23137 :
23138 : static bool
23139 88896 : expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
23140 : {
23141 88896 : struct expand_vec_perm_d dremap, dfinal;
23142 88896 : unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
23143 88896 : unsigned contents[2];
23144 88896 : bool ok;
23145 :
23146 88896 : if (!(TARGET_AVX2
23147 4277 : && (d->vmode == V32QImode || d->vmode == V16HImode)
23148 495 : && d->one_operand_p))
23149 : return false;
23150 :
      :   /* contents[k] is a 4-bit mask of which source quarters (64-bit
      :      chunks) feed the k-th half of the result.  */
23151 7 : contents[0] = 0;
23152 7 : contents[1] = 0;
23153 103 : for (i = 0; i < nelt2; ++i)
23154 : {
23155 96 : contents[0] |= 1u << (d->perm[i] / nelt4);
23156 96 : contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
23157 : }
23158 :
      :   /* vpermq can supply at most two distinct quarters per half.  */
23159 7 : for (i = 0; i < 2; ++i)
23160 : {
23161 : unsigned int cnt = 0;
23162 21 : for (j = 0; j < 4; ++j)
23163 21 : if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
23164 : return false;
23165 : }
23166 :
23167 0 : if (d->testing_p)
23168 : return true;
23169 :
      :   /* DREMAP: vpermq moves the needed 64-bit quarters into position.  */
23170 0 : dremap = *d;
23171 0 : dremap.vmode = V4DImode;
23172 0 : dremap.nelt = 4;
23173 0 : dremap.target = gen_reg_rtx (V4DImode);
23174 0 : dremap.op0 = gen_lowpart (V4DImode, d->op0);
23175 0 : dremap.op1 = dremap.op0;
23176 0 : dremap.one_operand_p = true;
23177 0 : for (i = 0; i < 2; ++i)
23178 : {
23179 : unsigned int cnt = 0;
23180 0 : for (j = 0; j < 4; ++j)
23181 0 : if ((contents[i] & (1u << j)) != 0)
23182 0 : dremap.perm[2 * i + cnt++] = j;
23183 0 : for (; cnt < 2; ++cnt)
23184 0 : dremap.perm[2 * i + cnt] = 0;
23185 : }
23186 :
      :   /* DFINAL: an in-lane shuffle of the vpermq result reproduces D.  */
23187 0 : dfinal = *d;
23188 0 : dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
23189 0 : dfinal.op1 = dfinal.op0;
23190 0 : dfinal.one_operand_p = true;
23191 0 : for (i = 0, j = 0; i < nelt; ++i)
23192 : {
23193 0 : if (i == nelt2)
23194 0 : j = 2;
23195 0 : dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
23196 0 : if ((d->perm[i] / nelt4) == dremap.perm[j])
23197 : ;
23198 0 : else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
23199 0 : dfinal.perm[i] |= nelt4;
23200 : else
23201 0 : gcc_unreachable ();
23202 : }
23203 :
23204 0 : ok = expand_vec_perm_1 (&dremap);
23205 0 : gcc_assert (ok);
23206 :
23207 0 : ok = expand_vec_perm_1 (&dfinal);
23208 0 : gcc_assert (ok);
23209 :
23210 : return true;
23211 : }
23212 :
23213 : static bool canonicalize_perm (struct expand_vec_perm_d *d);
23214 :
23215 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
23216 : a vector permutation using two instructions, vperm2f128 resp.
23217 : vperm2i128 followed by any single in-lane permutation. */
23218 :
23219 : static bool
23220 88896 : expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
23221 : {
23222 88896 : struct expand_vec_perm_d dfirst, dsecond;
23223 88896 : unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
23224 88896 : bool ok;
23225 :
 : /* Requires AVX and a 32-byte mode; integer 32-byte modes additionally
 : need AVX2 for the in-lane follow-up shuffles. */
23226 88896 : if (!TARGET_AVX
23227 23226 : || GET_MODE_SIZE (d->vmode) != 32
23228 95109 : || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
23229 : return false;
23230 :
 : /* dsecond probes candidate in-lane shuffles; testing_p plus the
 : start/end_sequence below keep the probes from emitting anything. */
23231 6029 : dsecond = *d;
23232 6029 : dsecond.one_operand_p = false;
23233 6029 : dsecond.testing_p = true;
23234 :
23235 : /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
23236 : immediate. For perm < 16 the second permutation uses
23237 : d->op0 as first operand, for perm >= 16 it uses d->op1
23238 : as first operand. The second operand is the result of
23239 : vperm2[fi]128. */
23240 197735 : for (perm = 0; perm < 32; perm++)
23241 : {
23242 : /* Ignore permutations which do not move anything cross-lane. */
23243 191785 : if (perm < 16)
23244 : {
23245 : /* The second shuffle for e.g. V4DFmode has
23246 : 0123 and ABCD operands.
23247 : Ignore AB23, as 23 is already in the second lane
23248 : of the first operand. */
23249 96126 : if ((perm & 0xc) == (1 << 2)) continue;
23250 : /* And 01CD, as 01 is in the first lane of the first
23251 : operand. */
23252 72086 : if ((perm & 3) == 0) continue;
23253 : /* And 4567, as then the vperm2[fi]128 doesn't change
23254 : anything on the original 4567 second operand. */
23255 54049 : if ((perm & 0xf) == ((3 << 2) | 2)) continue;
23256 : }
23257 : else
23258 : {
23259 : /* The second shuffle for e.g. V4DFmode has
23260 : 4567 and ABCD operands.
23261 : Ignore AB67, as 67 is already in the second lane
23262 : of the first operand. */
23263 95659 : if ((perm & 0xc) == (3 << 2)) continue;
23264 : /* And 45CD, as 45 is in the first lane of the first
23265 : operand. */
23266 71859 : if ((perm & 3) == 2) continue;
23267 : /* And 0123, as then the vperm2[fi]128 doesn't change
23268 : anything on the original 0123 first operand. */
23269 53918 : if ((perm & 0xf) == (1 << 2)) continue;
23270 : }
23271 :
 : /* Check every element is reachable either from the vperm2[fi]128
 : result (mapped to the second operand slot, nelt + ...) or from
 : the untouched original operand; break on the first misfit. */
23272 277596 : for (i = 0; i < nelt; i++)
23273 : {
23274 275777 : j = d->perm[i] / nelt2;
23275 510607 : if (j == ((perm >> (2 * (i >= nelt2))) & 3))
23276 67089 : dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
23277 349601 : else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
23278 114534 : dsecond.perm[i] = d->perm[i] & (nelt - 1);
23279 : else
23280 : break;
23281 : }
23282 :
 : /* All elements matched this lane selector: verify the in-lane
 : shuffle is itself a single insn before committing. */
23283 95973 : if (i == nelt)
23284 : {
23285 1819 : start_sequence ();
23286 1819 : ok = expand_vec_perm_1 (&dsecond);
23287 1819 : end_sequence ();
23288 : }
23289 : else
23290 : ok = false;
23291 :
23292 1819 : if (ok)
23293 : {
23294 64 : if (d->testing_p)
23295 : return true;
23296 :
23297 : /* Found a usable second shuffle. dfirst will be
23298 : vperm2f128 on d->op0 and d->op1. */
23299 46 : dsecond.testing_p = false;
23300 46 : dfirst = *d;
23301 46 : dfirst.target = gen_reg_rtx (d->vmode);
23302 270 : for (i = 0; i < nelt; i++)
23303 448 : dfirst.perm[i] = (i & (nelt2 - 1))
23304 336 : + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;
23305 :
23306 46 : canonicalize_perm (&dfirst);
23307 46 : ok = expand_vec_perm_1 (&dfirst);
23308 46 : gcc_assert (ok);
23309 :
23310 : /* And dsecond is some single insn shuffle, taking
23311 : d->op0 and result of vperm2f128 (if perm < 16) or
23312 : d->op1 and result of vperm2f128 (otherwise). */
23313 46 : if (perm >= 16)
23314 46 : dsecond.op0 = dsecond.op1;
23315 46 : dsecond.op1 = dfirst.target;
23316 :
23317 46 : ok = expand_vec_perm_1 (&dsecond);
23318 46 : gcc_assert (ok);
23319 :
23320 : return true;
23321 : }
23322 :
23323 : /* For one operand, the only useful vperm2f128 permutation is 0x01
23324 : aka lanes swap. */
23325 95909 : if (d->one_operand_p)
23326 : return false;
23327 : }
23328 :
23329 : return false;
23330 : }
23331 :
23332 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
23333 : a two vector permutation using 2 intra-lane interleave insns
23334 : and cross-lane shuffle for 32-byte vectors. */
23335 :
23336 : static bool
23337 34475 : expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
23338 : {
23339 34475 : unsigned i, nelt;
23340 34475 : rtx (*gen) (rtx, rtx, rtx);
23341 :
 : /* Two-operand only; mode must be a 32-byte vector on AVX2, or one of
 : the float 32-byte modes already available on plain AVX. */
23342 34475 : if (d->one_operand_p)
23343 : return false;
23344 33191 : if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
23345 : ;
23346 24984 : else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
23347 : ;
23348 : else
23349 : return false;
23350 :
 : /* Accept exactly the interleave-low (perm[0] == 0) or interleave-high
 : (perm[0] == nelt/2) pattern: pairs { p0+i/2, p0+i/2+nelt }. */
23351 9717 : nelt = d->nelt;
23352 9717 : if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
23353 : return false;
23354 9877 : for (i = 0; i < nelt; i += 2)
23355 9521 : if (d->perm[i] != d->perm[0] + i / 2
23356 8648 : || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
23357 : return false;
23358 :
23359 356 : if (d->testing_p)
23360 : return true;
23361 :
 : /* Pick the mode-specific interleave generator; perm[0] nonzero means
 : the high-half variant. */
23362 56 : switch (d->vmode)
23363 : {
23364 32 : case E_V32QImode:
23365 32 : if (d->perm[0])
23366 : gen = gen_vec_interleave_highv32qi;
23367 : else
23368 16 : gen = gen_vec_interleave_lowv32qi;
23369 : break;
23370 18 : case E_V16HImode:
23371 18 : if (d->perm[0])
23372 : gen = gen_vec_interleave_highv16hi;
23373 : else
23374 9 : gen = gen_vec_interleave_lowv16hi;
23375 : break;
23376 0 : case E_V8SImode:
23377 0 : if (d->perm[0])
23378 : gen = gen_vec_interleave_highv8si;
23379 : else
23380 0 : gen = gen_vec_interleave_lowv8si;
23381 : break;
23382 4 : case E_V4DImode:
23383 4 : if (d->perm[0])
23384 : gen = gen_vec_interleave_highv4di;
23385 : else
23386 2 : gen = gen_vec_interleave_lowv4di;
23387 : break;
23388 2 : case E_V8SFmode:
23389 2 : if (d->perm[0])
23390 : gen = gen_vec_interleave_highv8sf;
23391 : else
23392 1 : gen = gen_vec_interleave_lowv8sf;
23393 : break;
23394 0 : case E_V4DFmode:
23395 0 : if (d->perm[0])
23396 : gen = gen_vec_interleave_highv4df;
23397 : else
23398 0 : gen = gen_vec_interleave_lowv4df;
23399 : break;
23400 0 : default:
23401 0 : gcc_unreachable ();
23402 : }
23403 :
23404 56 : emit_insn (gen (d->target, d->op0, d->op1));
23405 56 : return true;
23406 : }
23407 :
23408 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
23409 : a single vector permutation using a single intra-lane vector
23410 : permutation, vperm2f128 swapping the lanes and vblend* insn blending
23411 : the non-swapped and swapped vectors together. */
23412 :
23413 : static bool
23414 27288 : expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
23415 : {
23416 27288 : struct expand_vec_perm_d dfirst, dsecond;
23417 27288 : unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
23418 27288 : rtx_insn *seq;
23419 27288 : bool ok;
23420 27288 : rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
23421 :
 : /* AVX-only fallback (AVX2 has better alternatives), restricted to the
 : float 32-byte modes with a single input operand. */
23422 27288 : if (!TARGET_AVX
23423 3508 : || TARGET_AVX2
23424 2030 : || (d->vmode != V8SFmode && d->vmode != V4DFmode)
23425 1846 : || !d->one_operand_p)
23426 : return false;
23427 :
 : /* Place each requested element into the slot of its own lane (j flips
 : the lane bit when the source element lives in the other lane); 0xff
 : marks still-unused slots. msk records positions that must instead
 : come from the lane-swapped copy built below. */
23428 0 : dfirst = *d;
23429 0 : for (i = 0; i < nelt; i++)
23430 0 : dfirst.perm[i] = 0xff;
23431 0 : for (i = 0, msk = 0; i < nelt; i++)
23432 : {
23433 0 : j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
23434 0 : if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
23435 : return false;
23436 0 : dfirst.perm[j] = d->perm[i];
23437 0 : if (j != i)
23438 0 : msk |= (1 << i);
23439 : }
 : /* Fill remaining wildcard slots with the identity. */
23440 0 : for (i = 0; i < nelt; i++)
23441 0 : if (dfirst.perm[i] == 0xff)
23442 0 : dfirst.perm[i] = i;
23443 :
23444 0 : if (!d->testing_p)
23445 0 : dfirst.target = gen_reg_rtx (dfirst.vmode);
23446 :
23447 0 : start_sequence ();
23448 0 : ok = expand_vec_perm_1 (&dfirst);
23449 0 : seq = end_sequence ();
23450 :
23451 0 : if (!ok)
23452 : return false;
23453 :
23454 0 : if (d->testing_p)
23455 : return true;
23456 :
23457 0 : emit_insn (seq);
23458 :
 : /* dsecond is the lane swap (i ^ nelt2) of the intra-lane result. */
23459 0 : dsecond = *d;
23460 0 : dsecond.op0 = dfirst.target;
23461 0 : dsecond.op1 = dfirst.target;
23462 0 : dsecond.one_operand_p = true;
23463 0 : dsecond.target = gen_reg_rtx (dsecond.vmode);
23464 0 : for (i = 0; i < nelt; i++)
23465 0 : dsecond.perm[i] = i ^ nelt2;
23466 :
23467 0 : ok = expand_vec_perm_1 (&dsecond);
23468 0 : gcc_assert (ok);
23469 :
 : /* Blend the unswapped and swapped copies; msk selects the swapped
 : source per element. */
23470 0 : blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
23471 0 : emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
23472 0 : return true;
23473 : }
23474 :
23475 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
23476 : a two vector permutation using two single vector permutations and
23477 : {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
23478 : of dfirst or dsecond is identity permutation. */
23479 :
23480 : static bool
23481 114364 : expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
23482 : {
23483 114364 : unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
23484 114364 : struct expand_vec_perm_d dfirst, dsecond, dfinal;
23485 114364 : bool ident1 = true, ident2 = true;
23486 :
23487 114364 : if (d->one_operand_p)
23488 : return false;
23489 :
 : /* 16-byte vectors interleave across the whole vector (lane == nelt);
 : 32-byte vectors interleave within each 128-bit lane (lane == nelt2). */
23490 207694 : if (GET_MODE_SIZE (d->vmode) == 16)
23491 : {
23492 62423 : if (!TARGET_SSE)
23493 : return false;
23494 62423 : if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
23495 : return false;
23496 : }
23497 82848 : else if (GET_MODE_SIZE (d->vmode) == 32)
23498 : {
23499 8599 : if (!TARGET_AVX)
23500 : return false;
23501 8599 : if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
23502 : return false;
23503 : lane = nelt2;
23504 : }
23505 : else
23506 : return false;
23507 :
 : /* The result must strictly alternate between op0 and op1 elements
 : (an interleave), whichever operand supplies element 0. */
23508 232099 : for (i = 1; i < nelt; i++)
23509 199167 : if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
23510 : return false;
23511 :
23512 32932 : dfirst = *d;
23513 32932 : dsecond = *d;
23514 32932 : dfinal = *d;
23515 32932 : dfirst.op1 = dfirst.op0;
23516 32932 : dfirst.one_operand_p = true;
23517 32932 : dsecond.op0 = dsecond.op1;
23518 32932 : dsecond.one_operand_p = true;
23519 :
 : /* Split d into one intra-operand permutation per input, arranging each
 : operand so a plain interleave produces the requested order; ident1/2
 : note when an operand is already in position. */
23520 217684 : for (i = 0; i < nelt; i++)
23521 184752 : if (d->perm[i] >= nelt)
23522 : {
23523 92376 : dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
23524 92376 : if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
23525 83899 : ident2 = false;
23526 92376 : dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
23527 92376 : = d->perm[i] - nelt;
23528 : }
23529 : else
23530 : {
23531 92376 : dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
23532 92376 : if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
23533 75522 : ident1 = false;
23534 92376 : dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
23535 : }
23536 :
 : /* In two-insn mode at least one side must need no pre-shuffle. */
23537 32932 : if (two_insn && !ident1 && !ident2)
23538 : return false;
23539 :
23540 3957 : if (!d->testing_p)
23541 : {
23542 214 : if (!ident1)
23543 144 : dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
23544 214 : if (!ident2)
23545 148 : dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
23546 214 : if (d->perm[0] >= nelt)
23547 0 : std::swap (dfinal.op0, dfinal.op1);
23548 : }
23549 :
23550 3957 : bool ok;
23551 3957 : rtx_insn *seq1 = NULL, *seq2 = NULL;
23552 :
23553 3957 : if (!ident1)
23554 : {
23555 2645 : start_sequence ();
23556 2645 : ok = expand_vec_perm_1 (&dfirst);
23557 2645 : seq1 = end_sequence ();
23558 :
23559 2645 : if (!ok)
23560 : return false;
23561 : }
23562 :
23563 2168 : if (!ident2)
23564 : {
23565 2074 : start_sequence ();
23566 2074 : ok = expand_vec_perm_1 (&dsecond);
23567 2074 : seq2 = end_sequence ();
23568 :
23569 2074 : if (!ok)
23570 : return false;
23571 : }
23572 :
23573 602 : if (d->testing_p)
23574 : return true;
23575 :
 : /* Final interleave of the (possibly pre-shuffled) operands via
 : vec_select + vec_concat. */
23576 680 : for (i = 0; i < nelt; i++)
23577 : {
23578 544 : dfinal.perm[i] = i / 2;
23579 544 : if (i >= lane)
23580 4 : dfinal.perm[i] += lane / 2;
23581 544 : if ((i & 1) != 0)
23582 272 : dfinal.perm[i] += nelt;
23583 : }
23584 136 : emit_insn (seq1);
23585 136 : emit_insn (seq2);
23586 136 : ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
23587 : dfinal.perm, dfinal.nelt, false);
23588 136 : gcc_assert (ok);
23589 : return true;
23590 : }
23591 :
23592 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
23593 : the permutation using two single vector permutations and the SSE4_1 pblendv
23594 : instruction. If two_insn, succeed only if one of dfirst or dsecond is
23595 : identity permutation. */
23596 :
23597 : static bool
23598 113762 : expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
23599 : {
23600 113762 : unsigned i, nelt = d->nelt;
23601 113762 : struct expand_vec_perm_d dfirst, dsecond, dfinal;
23602 113762 : machine_mode vmode = d->vmode;
23603 113762 : bool ident1 = true, ident2 = true;
23604 :
23605 : /* Use the same checks as in expand_vec_perm_blend. */
23606 113762 : if (d->one_operand_p)
23607 : return false;
23608 107775 : if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
23609 : ;
23610 100503 : else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
23611 : ;
23612 95262 : else if (TARGET_SSE4_1
23613 104678 : && (GET_MODE_SIZE (vmode) == 16
23614 8190 : || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
23615 2482 : || GET_MODE_SIZE (vmode) == 4))
23616 : ;
23617 : else
23618 : return false;
23619 :
23620 16657 : dfirst = *d;
23621 16657 : dsecond = *d;
23622 16657 : dfinal = *d;
23623 16657 : dfirst.op1 = dfirst.op0;
23624 16657 : dfirst.one_operand_p = true;
23625 16657 : dsecond.op0 = dsecond.op1;
23626 16657 : dsecond.one_operand_p = true;
23627 :
 : /* Partition the indices by source operand; 0xff marks slots the
 : other operand owns. ident1/2 track whether either side is already
 : the identity (i.e. needs no pre-shuffle). */
23628 137649 : for (i = 0; i < nelt; ++i)
23629 120992 : if (d->perm[i] >= nelt)
23630 : {
23631 60832 : dfirst.perm[i] = 0xff;
23632 60832 : dsecond.perm[i] = d->perm[i] - nelt;
23633 60832 : if (d->perm[i] != i + nelt)
23634 120992 : ident2 = false;
23635 : }
23636 : else
23637 : {
23638 60160 : dsecond.perm[i] = 0xff;
23639 60160 : dfirst.perm[i] = d->perm[i];
23640 60160 : if (d->perm[i] != i)
23641 120992 : ident1 = false;
23642 : }
23643 :
23644 16657 : if (two_insn && !ident1 && !ident2)
23645 : return false;
23646 :
23647 : /* For now. Ideally treat 0xff as a wildcard. */
23648 57247 : for (i = 0; i < nelt; ++i)
23649 51036 : if (dfirst.perm[i] == 0xff)
23650 : {
 : /* For 32-byte modes mirror the cross-lane partner so the
 : pre-shuffle stays expressible as an in-lane operation. */
23651 26620 : if (GET_MODE_SIZE (vmode) == 32
23652 26620 : && dfirst.perm[i ^ (nelt / 2)] != 0xff)
23653 14868 : dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
23654 : else
23655 11752 : dfirst.perm[i] = i;
23656 : }
23657 : else
23658 : {
23659 24416 : if (GET_MODE_SIZE (vmode) == 32
23660 24416 : && dsecond.perm[i ^ (nelt / 2)] != 0xff)
23661 13292 : dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
23662 : else
23663 11124 : dsecond.perm[i] = i;
23664 : }
23665 :
23666 6211 : if (!d->testing_p)
23667 : {
23668 2403 : if (!ident1)
23669 2279 : dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
23670 2403 : if (!ident2)
23671 1091 : dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
23672 : }
23673 :
23674 6211 : bool ok;
23675 6211 : rtx_insn *seq1 = NULL, *seq2 = NULL;
23676 :
23677 6211 : if (!ident1)
23678 : {
23679 5622 : start_sequence ();
23680 5622 : ok = expand_vec_perm_1 (&dfirst);
23681 5622 : seq1 = end_sequence ();
23682 :
23683 5622 : if (!ok)
23684 : return false;
23685 : }
23686 :
23687 4584 : if (!ident2)
23688 : {
23689 1489 : start_sequence ();
23690 1489 : ok = expand_vec_perm_1 (&dsecond);
23691 1489 : seq2 = end_sequence ();
23692 :
23693 1489 : if (!ok)
23694 : return false;
23695 : }
23696 :
23697 3995 : if (d->testing_p)
23698 : return true;
23699 :
 : /* Blend mask: element i comes from the (pre-shuffled) second operand
 : exactly where the original permutation selected from d->op1. */
23700 21825 : for (i = 0; i < nelt; ++i)
23701 19764 : dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
23702 :
23703 2061 : emit_insn (seq1);
23704 2061 : emit_insn (seq2);
23705 2061 : ok = expand_vec_perm_blend (&dfinal);
23706 2061 : gcc_assert (ok);
23707 : return true;
23708 : }
23709 :
23710 : /* A subroutine of ix86_expand_vec_perm_const_1.
23711 : Implement a permutation with psrlw, psllw and por.
23712 : It handles case:
23713 : __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
23714 : __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
23715 :
23716 : static bool
23717 26148 : expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
23718 : {
23719 26148 : unsigned i;
23720 26148 : rtx (*gen_shr) (rtx, rtx, rtx);
23721 26148 : rtx (*gen_shl) (rtx, rtx, rtx);
23722 26148 : rtx (*gen_or) (rtx, rtx, rtx);
23723 26148 : machine_mode mode = VOIDmode;
23724 :
23725 26148 : if (!TARGET_SSE2 || !d->one_operand_p)
23726 : return false;
23727 :
 : /* Pick the 16-bit-element mode and shift/or generators matching the
 : byte-vector width. */
23728 5237 : switch (d->vmode)
23729 : {
23730 1395 : case E_V8QImode:
23731 1395 : if (!TARGET_MMX_WITH_SSE)
23732 : return false;
23733 : mode = V4HImode;
23734 : gen_shr = gen_lshrv4hi3;
23735 : gen_shl = gen_ashlv4hi3;
23736 : gen_or = gen_iorv4hi3;
23737 : break;
23738 : case E_V16QImode:
23739 : mode = V8HImode;
23740 : gen_shr = gen_lshrv8hi3;
23741 : gen_shl = gen_ashlv8hi3;
23742 : gen_or = gen_iorv8hi3;
23743 : break;
23744 : default: return false;
23745 : }
23746 :
23747 3126 : if (!rtx_equal_p (d->op0, d->op1))
23748 : return false;
23749 :
 : /* Accept only the adjacent-byte-pair swap 1,0,3,2,... pattern. */
23750 12166 : for (i = 0; i < d->nelt; i += 2)
23751 10728 : if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
23752 : return false;
23753 :
23754 1438 : if (d->testing_p)
23755 : return true;
23756 :
 : /* Viewing the vector as 16-bit lanes, (x >> 8) | (x << 8) swaps the
 : two bytes inside every lane. */
23757 26 : rtx tmp1 = gen_reg_rtx (mode);
23758 26 : rtx tmp2 = gen_reg_rtx (mode);
23759 26 : rtx op0 = force_reg (d->vmode, d->op0);
23760 :
23761 26 : emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
23762 26 : emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
23763 26 : emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
23764 26 : emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
23765 26 : emit_insn (gen_or (tmp1, tmp1, tmp2));
23766 26 : emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
23767 :
23768 26 : return true;
23769 : }
23770 :
23771 : /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
23772 : permutation using two vperm2f128, followed by a vshufpd insn blending
23773 : the two vectors together. */
23774 :
23775 : static bool
23776 29997 : expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
23777 : {
23778 29997 : struct expand_vec_perm_d dfirst, dsecond, dthird;
23779 29997 : bool ok;
23780 :
23781 29997 : if (!TARGET_AVX || (d->vmode != V4DFmode))
23782 : return false;
23783 :
23784 1277 : if (d->testing_p)
23785 : return true;
23786 :
23787 206 : dfirst = *d;
23788 206 : dsecond = *d;
23789 206 : dthird = *d;
23790 :
 : /* dfirst/dsecond each gather the even-aligned 128-bit pair containing
 : the wanted element (index rounded down to even, plus its partner);
 : dthird then selects the even or odd member of each pair from the
 : two intermediate results. */
23791 206 : dfirst.perm[0] = (d->perm[0] & ~1);
23792 206 : dfirst.perm[1] = (d->perm[0] & ~1) + 1;
23793 206 : dfirst.perm[2] = (d->perm[2] & ~1);
23794 206 : dfirst.perm[3] = (d->perm[2] & ~1) + 1;
23795 206 : dsecond.perm[0] = (d->perm[1] & ~1);
23796 206 : dsecond.perm[1] = (d->perm[1] & ~1) + 1;
23797 206 : dsecond.perm[2] = (d->perm[3] & ~1);
23798 206 : dsecond.perm[3] = (d->perm[3] & ~1) + 1;
23799 206 : dthird.perm[0] = (d->perm[0] % 2);
23800 206 : dthird.perm[1] = (d->perm[1] % 2) + 4;
23801 206 : dthird.perm[2] = (d->perm[2] % 2) + 2;
23802 206 : dthird.perm[3] = (d->perm[3] % 2) + 6;
23803 :
23804 206 : dfirst.target = gen_reg_rtx (dfirst.vmode);
23805 206 : dsecond.target = gen_reg_rtx (dsecond.vmode);
23806 206 : dthird.op0 = dfirst.target;
23807 206 : dthird.op1 = dsecond.target;
23808 206 : dthird.one_operand_p = false;
23809 :
23810 206 : canonicalize_perm (&dfirst);
23811 206 : canonicalize_perm (&dsecond);
23812 :
23813 206 : ok = expand_vec_perm_1 (&dfirst)
23814 206 : && expand_vec_perm_1 (&dsecond)
23815 412 : && expand_vec_perm_1 (&dthird);
23816 :
23817 0 : gcc_assert (ok);
23818 :
23819 : return true;
23820 : }
23821 :
23822 : static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
23823 :
23824 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
23825 : a two vector permutation using two intra-lane vector
23826 : permutations, vperm2f128 swapping the lanes and vblend* insn blending
23827 : the non-swapped and swapped vectors together. */
23828 :
23829 : static bool
23830 15790 : expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
23831 : {
23832 15790 : struct expand_vec_perm_d dfirst, dsecond, dthird;
23833 15790 : unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
23834 15790 : rtx_insn *seq1, *seq2;
23835 15790 : bool ok;
23836 15790 : rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;
23837 :
 : /* AVX-only fallback (AVX2 has better alternatives) for the two-operand
 : float 32-byte modes. */
23838 15790 : if (!TARGET_AVX
23839 990 : || TARGET_AVX2
23840 722 : || (d->vmode != V8SFmode && d->vmode != V4DFmode)
23841 595 : || d->one_operand_p)
23842 : return false;
23843 :
 : /* Classify every element: dfirst keeps those whose target slot is in
 : the same lane as the source (j == i), dsecond those needing the lane
 : swap. which1/which2 record which operand(s) each side reads; msk is
 : the eventual blend mask of lane-swapped positions. */
23844 595 : dfirst = *d;
23845 595 : dsecond = *d;
23846 5355 : for (i = 0; i < nelt; i++)
23847 : {
23848 4760 : dfirst.perm[i] = 0xff;
23849 4760 : dsecond.perm[i] = 0xff;
23850 : }
23851 5355 : for (i = 0, msk = 0; i < nelt; i++)
23852 : {
23853 4760 : j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
23854 4760 : if (j == i)
23855 : {
23856 3458 : dfirst.perm[j] = d->perm[i];
23857 5858 : which1 |= (d->perm[i] < nelt ? 1 : 2);
23858 : }
23859 : else
23860 : {
23861 1302 : dsecond.perm[j] = d->perm[i];
23862 1302 : which2 |= (d->perm[i] < nelt ? 1 : 2);
23863 1302 : msk |= (1U << i);
23864 : }
23865 : }
 : /* If every element falls on one side, there is nothing to blend;
 : other strategies handle those cases. */
23866 595 : if (msk == 0 || msk == (1U << nelt) - 1)
23867 : return false;
23868 :
23869 595 : if (!d->testing_p)
23870 : {
23871 40 : dfirst.target = gen_reg_rtx (dfirst.vmode);
23872 40 : dsecond.target = gen_reg_rtx (dsecond.vmode);
23873 : }
23874 :
 : /* Fill unused slots with the identity from whichever operand that
 : side exclusively reads (which == 2 means only op1). */
23875 5355 : for (i = 0; i < nelt; i++)
23876 : {
23877 4760 : if (dfirst.perm[i] == 0xff)
23878 1302 : dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
23879 4760 : if (dsecond.perm[i] == 0xff)
23880 3458 : dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
23881 : }
23882 595 : canonicalize_perm (&dfirst);
23883 595 : start_sequence ();
23884 595 : ok = ix86_expand_vec_perm_const_1 (&dfirst);
23885 595 : seq1 = end_sequence ();
23886 :
23887 595 : if (!ok)
23888 : return false;
23889 :
23890 595 : canonicalize_perm (&dsecond);
23891 595 : start_sequence ();
23892 595 : ok = ix86_expand_vec_perm_const_1 (&dsecond);
23893 595 : seq2 = end_sequence ();
23894 :
23895 595 : if (!ok)
23896 : return false;
23897 :
23898 595 : if (d->testing_p)
23899 : return true;
23900 :
23901 40 : emit_insn (seq1);
23902 40 : emit_insn (seq2);
23903 :
 : /* dthird lane-swaps dsecond's result so a final vblend can merge it
 : with dfirst's in-lane result. */
23904 40 : dthird = *d;
23905 40 : dthird.op0 = dsecond.target;
23906 40 : dthird.op1 = dsecond.target;
23907 40 : dthird.one_operand_p = true;
23908 40 : dthird.target = gen_reg_rtx (dthird.vmode);
23909 360 : for (i = 0; i < nelt; i++)
23910 320 : dthird.perm[i] = i ^ nelt2;
23911 :
23912 40 : ok = expand_vec_perm_1 (&dthird);
23913 40 : gcc_assert (ok);
23914 :
23915 40 : blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
23916 40 : emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
23917 40 : return true;
23918 : }
23919 :
23920 : /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
23921 : permutation with two pshufb insns and an ior. We should have already
23922 : failed all two instruction sequences. */
23923 :
23924 : static bool
23925 28741 : expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
23926 : {
23927 28741 : rtx rperm[2][16], vperm, l, h, op, m128;
23928 28741 : unsigned int i, nelt, eltsz;
23929 28741 : machine_mode mode;
23930 28741 : rtx (*gen) (rtx, rtx, rtx);
23931 :
23932 33459 : if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
23933 9346 : && GET_MODE_SIZE (d->vmode) != 8
23934 9306 : && GET_MODE_SIZE (d->vmode) != 4))
23935 : return false;
23936 1072 : gcc_assert (!d->one_operand_p);
23937 :
23938 1072 : if (d->testing_p)
23939 : return true;
23940 :
 : /* Select the pshufb variant matching the vector width. */
23941 202 : switch (GET_MODE_SIZE (d->vmode))
23942 : {
23943 : case 4:
23944 : mode = V4QImode;
23945 : gen = gen_mmx_pshufbv4qi3;
23946 : break;
23947 20 : case 8:
23948 20 : mode = V8QImode;
23949 20 : gen = gen_mmx_pshufbv8qi3;
23950 20 : break;
23951 45 : case 16:
23952 45 : mode = V16QImode;
23953 45 : gen = gen_ssse3_pshufbv16qi3;
23954 45 : break;
23955 0 : default:
23956 0 : gcc_unreachable ();
23957 : }
23958 :
23959 101 : nelt = d->nelt;
23960 101 : eltsz = GET_MODE_UNIT_SIZE (d->vmode);
23961 :
23962 : /* Generate two permutation masks. If the required element is within
23963 : the given vector it is shuffled into the proper lane. If the required
23964 : element is in the other vector, force a zero into the lane by setting
23965 : bit 7 in the permutation mask. */
23966 101 : m128 = GEN_INT (-128);
23967 1029 : for (i = 0; i < nelt; ++i)
23968 : {
23969 928 : unsigned j, k, e = d->perm[i];
23970 928 : unsigned which = (e >= nelt);
23971 928 : if (e >= nelt)
23972 480 : e -= nelt;
23973 :
23974 1952 : for (j = 0; j < eltsz; ++j)
23975 : {
23976 1024 : rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
23977 1024 : rperm[1-which][i*eltsz + j] = m128;
23978 : }
23979 :
 : /* Masks are built as full 16-byte constants; pad the tail with the
 : zeroing value when the vector is narrower than 16 bytes. */
23980 9024 : for (k = i*eltsz + j; k < 16; ++k)
23981 8096 : rperm[0][k] = rperm[1][k] = m128;
23982 : }
23983 :
23984 101 : vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
23985 101 : vperm = force_reg (V16QImode, vperm);
23986 :
23987 101 : l = gen_reg_rtx (mode);
23988 101 : op = gen_lowpart (mode, d->op0);
23989 101 : emit_insn (gen (l, op, vperm));
23990 :
23991 101 : vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
23992 101 : vperm = force_reg (V16QImode, vperm);
23993 :
23994 101 : h = gen_reg_rtx (mode);
23995 101 : op = gen_lowpart (mode, d->op1);
23996 101 : emit_insn (gen (h, op, vperm));
23997 :
 : /* Combine the two zero-masked halves; go through a temp register when
 : the target mode differs from the byte-vector working mode. */
23998 101 : op = d->target;
23999 101 : if (d->vmode != mode)
24000 22 : op = gen_reg_rtx (mode);
24001 101 : ix86_emit_vec_binop (IOR, mode, op, l, h);
24002 101 : if (op != d->target)
24003 22 : emit_move_insn (d->target, gen_lowpart (d->vmode, op));
24004 :
24005 : return true;
24006 : }
24007 :
24008 : /* Implement arbitrary permutation of one V32QImode and V16QImode operand
24009 : with two vpshufb insns, vpermq and vpor. We should have already failed
24010 : all two or three instruction sequences. */
24011 :
24012 : static bool
24013 23624 : expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
24014 : {
24015 23624 : rtx rperm[2][32], vperm, l, h, hp, op, m128;
24016 23624 : unsigned int i, nelt, eltsz;
24017 :
24018 23624 : if (!TARGET_AVX2
24019 401 : || !d->one_operand_p
24020 172 : || (d->vmode != V32QImode && d->vmode != V16HImode))
24021 : return false;
24022 :
24023 7 : if (d->testing_p)
24024 : return true;
24025 :
24026 7 : nelt = d->nelt;
24027 7 : eltsz = GET_MODE_UNIT_SIZE (d->vmode);
24028 :
24029 : /* Generate two permutation masks. If the required element is within
24030 : the same lane, it is shuffled in. If the required element from the
24031 : other lane, force a zero by setting bit 7 in the permutation mask.
24032 : In the other mask the mask has non-negative elements if element
24033 : is requested from the other lane, but also moved to the other lane,
24034 : so that the result of vpshufb can have the two V2TImode halves
24035 : swapped. */
24036 7 : m128 = GEN_INT (-128);
24037 199 : for (i = 0; i < nelt; ++i)
24038 : {
 : /* 'which' is nonzero (the lane offset in bytes) exactly when the
 : element must cross 128-bit lanes. */
24039 192 : unsigned j, e = d->perm[i] & (nelt / 2 - 1);
24040 192 : unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
24041 :
24042 416 : for (j = 0; j < eltsz; ++j)
24043 : {
24044 224 : rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
24045 224 : rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
24046 : }
24047 : }
24048 :
24049 7 : vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
24050 7 : vperm = force_reg (V32QImode, vperm);
24051 :
24052 7 : h = gen_reg_rtx (V32QImode);
24053 7 : op = gen_lowpart (V32QImode, d->op0);
24054 7 : emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
24055 :
24056 : /* Swap the 128-byte lanes of h into hp. */
24057 7 : hp = gen_reg_rtx (V4DImode);
24058 7 : op = gen_lowpart (V4DImode, h);
24059 7 : emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
24060 : const1_rtx));
24061 :
24062 7 : vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
24063 7 : vperm = force_reg (V32QImode, vperm);
24064 :
24065 7 : l = gen_reg_rtx (V32QImode);
24066 7 : op = gen_lowpart (V32QImode, d->op0);
24067 7 : emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
24068 :
 : /* Merge the in-lane and cross-lane halves; use a temp when the target
 : mode is not V32QImode. */
24069 7 : op = d->target;
24070 7 : if (d->vmode != V32QImode)
24071 2 : op = gen_reg_rtx (V32QImode);
24072 7 : emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
24073 7 : if (op != d->target)
24074 2 : emit_move_insn (d->target, gen_lowpart (d->vmode, op));
24075 :
24076 : return true;
24077 : }
24078 :
24079 : /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
24080 : and extract-odd permutations of two V32QImode and V16QImode operand
24081 : with two vpshufb insns, vpor and vpermq. We should have already
24082 : failed all two or three instruction sequences. */
24083 :
24084 : static bool
24085 23617 : expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
24086 : {
24087 23617 : rtx rperm[2][32], vperm, l, h, ior, op, m128;
24088 23617 : unsigned int i, nelt, eltsz;
24089 :
24090 23617 : if (!TARGET_AVX2
24091 394 : || d->one_operand_p
24092 229 : || (d->vmode != V32QImode && d->vmode != V16HImode))
24093 : return false;
24094 :
 : /* Accept only extract-even/odd patterns: modulo the operand choice,
 : index i must map to element 2*i (or 2*i+1 uniformly). */
24095 112 : for (i = 0; i < d->nelt; ++i)
24096 112 : if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
24097 : return false;
24098 :
24099 0 : if (d->testing_p)
24100 : return true;
24101 :
24102 0 : nelt = d->nelt;
24103 0 : eltsz = GET_MODE_UNIT_SIZE (d->vmode);
24104 :
24105 : /* Generate two permutation masks. In the first permutation mask
24106 : the first quarter will contain indexes for the first half
24107 : of the op0, the second quarter will contain bit 7 set, third quarter
24108 : will contain indexes for the second half of the op0 and the
24109 : last quarter bit 7 set. In the second permutation mask
24110 : the first quarter will contain bit 7 set, the second quarter
24111 : indexes for the first half of the op1, the third quarter bit 7 set
24112 : and last quarter indexes for the second half of the op1.
24113 : I.e. the first mask e.g. for V32QImode extract even will be:
24114 : 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
24115 : (all values masked with 0xf except for -128) and second mask
24116 : for extract even will be
24117 : -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe. */
24118 0 : m128 = GEN_INT (-128);
24119 0 : for (i = 0; i < nelt; ++i)
24120 : {
24121 0 : unsigned j, e = d->perm[i] & (nelt / 2 - 1);
24122 0 : unsigned which = d->perm[i] >= nelt;
 : /* xorv shifts the middle half of the indices into the quarter
 : layout described above. */
24123 0 : unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;
24124 :
24125 0 : for (j = 0; j < eltsz; ++j)
24126 : {
24127 0 : rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
24128 0 : rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
24129 : }
24130 : }
24131 :
24132 0 : vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
24133 0 : vperm = force_reg (V32QImode, vperm);
24134 :
24135 0 : l = gen_reg_rtx (V32QImode);
24136 0 : op = gen_lowpart (V32QImode, d->op0);
24137 0 : emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));
24138 :
24139 0 : vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
24140 0 : vperm = force_reg (V32QImode, vperm);
24141 :
24142 0 : h = gen_reg_rtx (V32QImode);
24143 0 : op = gen_lowpart (V32QImode, d->op1);
24144 0 : emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));
24145 :
 : /* The zero-masked halves are disjoint, so a plain OR merges them. */
24146 0 : ior = gen_reg_rtx (V32QImode);
24147 0 : emit_insn (gen_iorv32qi3 (ior, l, h));
24148 :
24149 : /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation. */
24150 0 : op = gen_reg_rtx (V4DImode);
24151 0 : ior = gen_lowpart (V4DImode, ior);
24152 0 : emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
24153 : const1_rtx, GEN_INT (3)));
24154 0 : emit_move_insn (d->target, gen_lowpart (d->vmode, op));
24155 :
24156 0 : return true;
24157 : }
24158 :
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement a
   permutation (which is a blend) with and, andnot and or when pshufb is not
   available.
24161 :
24162 : It handles case:
24163 : __builtin_shufflevector (v1, v2, 0, 9, 2, 11, 4, 13, 6, 15);
24164 : __builtin_shufflevector (v1, v2, 8, 1, 2, 11, 4, 13, 6, 15);
24165 :
24166 : An element[i] must be chosen between op0[i] and op1[i] to satisfy the
24167 : requirement.
24168 : */
24169 :
static bool
expand_vec_perm_pand_pandn_por (struct expand_vec_perm_d *d)
{
  rtx rperm[16], vperm;
  unsigned int i, nelt = d->nelt;

  /* pand/pandn/por need SSE2, two distinct operands, and a 128-bit
     QI/HI vector mode.  */
  if (!TARGET_SSE2
      || d->one_operand_p
      || (d->vmode != V16QImode && d->vmode != V8HImode))
    return false;

  /* Element 0 must be taken from op0[0].  */
  if (d->perm[0] != 0)
    return false;

  /* The dest[i] must select an element between op0[i] and op1[i].  */
  for (i = 1; i < nelt; i++)
    if ((d->perm[i] % nelt) != i)
      return false;

  if (d->testing_p)
    return true;

  /* Generates a blend mask for the operators AND and ANDNOT:
     all-ones lane selects op0, all-zeros lane selects op1.  */
  machine_mode inner_mode = GET_MODE_INNER (d->vmode);
  for (i = 0; i < nelt; i++)
    rperm[i] = (d->perm[i] < nelt) ? CONSTM1_RTX (inner_mode)
	       : CONST0_RTX (inner_mode);

  vperm = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (nelt, rperm));
  vperm = force_reg (d->vmode, vperm);

  /* Emit the blend as a vector conditional move on the constant mask.  */
  ix86_expand_sse_movcc (d->target, vperm, d->op0, d->op1);

  return true;
}
24205 :
24206 : /* Implement permutation with pslldq + psrldq + por when pshufb is not
24207 : available. */
static bool
expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
{
  unsigned i, nelt = d->nelt;
  /* The permutation must consist of (at most) two runs of consecutive
     indices: [start1 .. end1] coming from one shift of op0 and
     [start2 ..] coming from one shift of op1 (or op0 again).  */
  unsigned start1, end1 = -1;
  machine_mode vmode = d->vmode, imode;
  int start2 = -1;
  bool clear_op0, clear_op1;
  unsigned inner_size;
  rtx op0, op1, dop1;
  rtx (*gen_vec_shr) (rtx, rtx, rtx);
  rtx (*gen_vec_shl) (rtx, rtx, rtx);

  /* pshufd can be used for V4SI/V2DI under TARGET_SSE2.  */
  if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
    return false;

  /* Find the single break in consecutiveness; more than one break
     cannot be expressed with one shl + one shr.  */
  start1 = d->perm[0];
  for (i = 1; i < nelt; i++)
    {
      if (d->perm[i] != d->perm[i-1] + 1
	  || d->perm[i] == nelt)
	{
	  if (start2 == -1)
	    {
	      start2 = d->perm[i];
	      end1 = d->perm[i-1];
	    }
	  else
	    return false;
	}
    }

  clear_op0 = end1 != nelt - 1;
  clear_op1 = start2 % nelt != 0;
  /* pandn/pand is needed to clear upper/lower bits of op0/op1.  */
  if (!pandn && (clear_op0 || clear_op1))
    return false;

  if (d->testing_p)
    return true;

  gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
  gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
  imode = GET_MODE_INNER (vmode);
  inner_size = GET_MODE_BITSIZE (imode);
  op0 = gen_reg_rtx (vmode);
  op1 = gen_reg_rtx (vmode);

  /* psrldq op0 so its first selected element lands in lane 0.  */
  if (start1)
    emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
  else
    emit_move_insn (op0, d->op0);

  dop1 = d->op1;
  if (d->one_operand_p)
    dop1 = d->op0;

  /* pslldq the second operand so its run starts right after end1.  */
  int shl_offset = end1 - start1 + 1 - start2 % nelt;
  if (shl_offset)
    emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
  else
    emit_move_insn (op1, dop1);

  /* Clear lower/upper bits for op0/op1.  */
  if (clear_op0 || clear_op1)
    {
      rtx vec[16];
      rtx const_vec;
      rtx clear;
      /* Mask is all-ones for the lanes kept from op0, zero elsewhere;
	 op1 uses the complement via pandn.  */
      for (i = 0; i != nelt; i++)
	{
	  if (i < (end1 - start1 + 1))
	    vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
	  else
	    vec[i] = CONST0_RTX (imode);
	}
      const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
      const_vec = validize_mem (force_const_mem (vmode, const_vec));
      clear = force_reg (vmode, const_vec);

      if (clear_op0)
	emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
      if (clear_op1)
	emit_move_insn (op1, gen_rtx_AND (vmode,
					  gen_rtx_NOT (vmode, clear),
					  op1));
    }

  /* por combines the two disjoint shifted halves.  */
  emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
  return true;
}
24300 :
24301 : /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
24302 : and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
24303 : operands with two "and" and "pack" or two "shift" and "pack" insns.
24304 : We should have already failed all two instruction sequences. */
24305 :
static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  unsigned i, odd, c, s, nelt = d->nelt;
  int pblendw_i = 0;
  bool end_perm = false;
  machine_mode half_mode;
  /* Per-mode insn generators selected by the switch below:
     gen_and masks out the high half of each wide element,
     gen_pack narrows two wide vectors into one,
     gen_shift brings the odd elements into the low half.  */
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  switch (d->vmode)
    {
    case E_V4HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V2SImode;
      gen_and = gen_andv2si3;
      gen_pack = gen_mmx_packusdw;
      gen_shift = gen_lshrv2si3;
      pblendw_i = 0x5;
      break;
    case E_V8HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      pblendw_i = 0x55;
      break;
    case E_V8QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V4HImode;
      gen_and = gen_andv4hi3;
      gen_pack = gen_mmx_packuswb;
      gen_shift = gen_lshrv4hi3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      pblendw_i = 0x5555;
      /* 256-bit packs work per 128-bit lane, so a final vpermq
	 is needed to restore element order.  */
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
	 are more profitable than general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (odd == 0)
    {
      /* Use pblendw since const_vector 0 should be cheaper than
	 const_vector 0xffff.  */
      if (d->vmode == V4HImode
	  || d->vmode == E_V8HImode
	  || d->vmode == E_V16HImode)
	{
	  rtx dop0_t = gen_reg_rtx (d->vmode);
	  rtx dop1_t = gen_reg_rtx (d->vmode);
	  t = gen_reg_rtx (d->vmode);
	  emit_move_insn (t, CONST0_RTX (d->vmode));

	  emit_move_insn (dop0_t, gen_rtx_VEC_MERGE (d->vmode, d->op0, t,
						     GEN_INT (pblendw_i)));
	  emit_move_insn (dop1_t, gen_rtx_VEC_MERGE (d->vmode, d->op1, t,
						     GEN_INT (pblendw_i)));

	  emit_move_insn (dop0, gen_lowpart (half_mode, dop0_t));
	  emit_move_insn (dop1, gen_lowpart (half_mode, dop1_t));
	}
      else
	{
	  /* Even extraction: mask each wide element down to its low half.  */
	  t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
	  t = force_reg (half_mode, t);
	  emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
	  emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
	}
    }
  else
    {
      /* Odd extraction: logical shift right brings the odd elements
	 into the low half of each wide element.  */
      emit_insn (gen_shift (dop0,
			    gen_lowpart (half_mode, d->op0),
			    GEN_INT (s)));
      emit_insn (gen_shift (dop1,
			    gen_lowpart (half_mode, d->op1),
			    GEN_INT (s)));
    }
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      /* vpermq { 0, 2, 1, 3 } undoes the per-lane interleaving of pack.  */
      emit_insn (gen_avx2_permv4di_1 (t,
				      gen_lowpart (V4DImode, op),
				      const0_rtx,
				      const2_rtx,
				      const1_rtx,
				      GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
24465 :
24466 : /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
24467 : and extract-odd permutations of two V64QI operands
24468 : with two "shifts", two "truncs" and one "concat" insns for "odd"
24469 : and two "truncs" and one concat insn for "even."
24470 : Have already failed all two instruction sequences. */
24471 :
static bool
expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
{
  rtx t1, t2, t3, t4;
  unsigned i, odd, nelt = d->nelt;

  /* vpmovwb (truncatev32hiv32qi) requires AVX512BW; only the two-operand
     V64QI case is profitable here.  */
  if (!TARGET_AVX512BW
      || d->one_operand_p
      || d->vmode != V64QImode)
    return false;

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;


  if (odd)
    {
      /* For extract-odd, shift each word right by 8 so the odd bytes
	 become the low byte of every word before truncation.  */
      t1 = gen_reg_rtx (V32HImode);
      t2 = gen_reg_rtx (V32HImode);
      emit_insn (gen_lshrv32hi3 (t1,
				 gen_lowpart (V32HImode, d->op0),
				 GEN_INT (8)));
      emit_insn (gen_lshrv32hi3 (t2,
				 gen_lowpart (V32HImode, d->op1),
				 GEN_INT (8)));
    }
  else
    {
      /* For extract-even, the even bytes are already the low byte of
	 each word.  */
      t1 = gen_lowpart (V32HImode, d->op0);
      t2 = gen_lowpart (V32HImode, d->op1);
    }

  /* Truncate each word vector to bytes, then concatenate the halves.  */
  t3 = gen_reg_rtx (V32QImode);
  t4 = gen_reg_rtx (V32QImode);
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));

  return true;
}
24521 :
24522 : /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
24523 : and extract-odd permutations. */
24524 :
static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  /* Dispatch on mode; each arm emits a mode-specific insn sequence.
     Arms that "break" with d->testing_p set report success without
     emitting anything.  */
  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
	break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
	int mask = odd ? 0xdd : 0x88;

	if (d->testing_p)
	  break;
	t1 = gen_reg_rtx (V8SFmode);
	t2 = gen_reg_rtx (V8SFmode);
	t3 = gen_reg_rtx (V8SFmode);

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
				      GEN_INT (mask)));

	/* Shuffle the lanes around to produce:
	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
					    GEN_INT (0x3)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

	/* Shuffle the lanes around to produce:
	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
					    GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V2SFmode:
      gcc_assert (TARGET_MMX_WITH_SSE);
      /* We have no suitable instructions.  */
      if (d->testing_p)
	return false;
      break;

    case E_V4QImode:
      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V4QImode);
	  emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V4HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V4HImode);
	  emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V8HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V8HImode);
	  t2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
	  if (odd)
	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
	  else
	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
	  emit_insn (t3);
	}
      break;

    case E_V8QImode:
    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
	{
	  /* Without AVX2, retry the permutation punned to V4DF, which
	     the V4DFmode arm above can handle with AVX1 insns.  */
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V4DFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V4DFmode);
	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V4DImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an vpunpck[lh]qdq will produce the result required.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
	{
	  /* Without AVX2, retry the permutation punned to V8SF, handled
	     by the V8SFmode arm above.  */
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V8SFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V8SFmode);
	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V8SImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now an vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
					   gen_lowpart (V4DImode, t2));
      else
	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
					  gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}
24782 :
24783 : /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
24784 : extract-even and extract-odd permutations. */
24785 :
24786 : static bool
24787 23545 : expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
24788 : {
24789 23545 : unsigned i, odd, nelt = d->nelt;
24790 :
24791 23545 : odd = d->perm[0];
24792 23545 : if (odd != 0 && odd != 1)
24793 : return false;
24794 :
24795 63645 : for (i = 1; i < nelt; ++i)
24796 56143 : if (d->perm[i] != 2 * i + odd)
24797 : return false;
24798 :
24799 7502 : if (d->vmode == E_V32HImode
24800 12 : && d->testing_p
24801 12 : && !TARGET_AVX512BW)
24802 : return false;
24803 :
24804 7490 : return expand_vec_perm_even_odd_1 (d, odd);
24805 : }
24806 :
24807 : /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
24808 : permutations. We assume that expand_vec_perm_1 has already failed. */
24809 :
24810 : static bool
24811 1039 : expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
24812 : {
24813 1039 : unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
24814 1039 : machine_mode vmode = d->vmode;
24815 1039 : rtx (*gen) (rtx, rtx, rtx);
24816 1039 : unsigned char perm2[4];
24817 1039 : rtx op0 = d->op0, dest;
24818 1039 : bool ok;
24819 :
24820 1039 : switch (vmode)
24821 : {
24822 0 : case E_V4DFmode:
24823 0 : case E_V8SFmode:
24824 : /* These are special-cased in sse.md so that we can optionally
24825 : use the vbroadcast instruction. They expand to two insns
24826 : if the input happens to be in a register. */
24827 0 : gcc_unreachable ();
24828 :
24829 0 : case E_V2DFmode:
24830 0 : case E_V2SFmode:
24831 0 : case E_V4SFmode:
24832 0 : case E_V2DImode:
24833 0 : case E_V2SImode:
24834 0 : case E_V4SImode:
24835 0 : case E_V2HImode:
24836 0 : case E_V4HImode:
24837 : /* These are always implementable using standard shuffle patterns. */
24838 0 : gcc_unreachable ();
24839 :
24840 16 : case E_V4QImode:
24841 : /* This can be implemented via interleave and pshuflw. */
24842 16 : if (d->testing_p)
24843 : return true;
24844 :
24845 8 : if (elt >= nelt2)
24846 : {
24847 4 : gen = gen_mmx_punpckhbw_low;
24848 4 : elt -= nelt2;
24849 : }
24850 : else
24851 : gen = gen_mmx_punpcklbw_low;
24852 :
24853 8 : dest = gen_reg_rtx (vmode);
24854 8 : emit_insn (gen (dest, op0, op0));
24855 8 : vmode = get_mode_wider_vector (vmode);
24856 8 : op0 = gen_lowpart (vmode, dest);
24857 :
24858 8 : memset (perm2, elt, 2);
24859 8 : dest = gen_reg_rtx (vmode);
24860 8 : ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
24861 8 : gcc_assert (ok);
24862 :
24863 8 : emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
24864 8 : return true;
24865 :
24866 4 : case E_V8QImode:
24867 : /* This can be implemented via interleave. We save one insn by
24868 : stopping once we have promoted to V2SImode and then use pshufd. */
24869 4 : if (d->testing_p)
24870 : return true;
24871 4 : do
24872 : {
24873 4 : if (elt >= nelt2)
24874 : {
24875 1 : gen = vmode == V8QImode ? gen_mmx_punpckhbw
24876 : : gen_mmx_punpckhwd;
24877 1 : elt -= nelt2;
24878 : }
24879 : else
24880 3 : gen = vmode == V8QImode ? gen_mmx_punpcklbw
24881 : : gen_mmx_punpcklwd;
24882 4 : nelt2 /= 2;
24883 :
24884 4 : dest = gen_reg_rtx (vmode);
24885 4 : emit_insn (gen (dest, op0, op0));
24886 4 : vmode = get_mode_wider_vector (vmode);
24887 4 : op0 = gen_lowpart (vmode, dest);
24888 : }
24889 4 : while (vmode != V2SImode);
24890 :
24891 2 : memset (perm2, elt, 2);
24892 2 : dest = gen_reg_rtx (vmode);
24893 2 : ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
24894 2 : gcc_assert (ok);
24895 :
24896 2 : emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
24897 2 : return true;
24898 :
24899 1010 : case E_V8HImode:
24900 1010 : case E_V16QImode:
24901 : /* These can be implemented via interleave. We save one insn by
24902 : stopping once we have promoted to V4SImode and then use pshufd. */
24903 1010 : if (d->testing_p)
24904 : return true;
24905 1550 : do
24906 : {
24907 1550 : if (elt >= nelt2)
24908 : {
24909 16 : gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
24910 : : gen_vec_interleave_highv8hi;
24911 16 : elt -= nelt2;
24912 : }
24913 : else
24914 1534 : gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
24915 : : gen_vec_interleave_lowv8hi;
24916 1550 : nelt2 /= 2;
24917 :
24918 1550 : dest = gen_reg_rtx (vmode);
24919 1550 : emit_insn (gen (dest, op0, op0));
24920 1550 : vmode = get_mode_wider_vector (vmode);
24921 1550 : op0 = gen_lowpart (vmode, dest);
24922 : }
24923 1550 : while (vmode != V4SImode);
24924 :
24925 945 : memset (perm2, elt, 4);
24926 945 : dest = gen_reg_rtx (vmode);
24927 945 : ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
24928 945 : gcc_assert (ok);
24929 :
24930 945 : emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
24931 945 : return true;
24932 :
24933 1 : case E_V8HFmode:
24934 1 : case E_V8BFmode:
24935 : /* This can be implemented via interleave and pshufd. */
24936 1 : if (d->testing_p)
24937 : return true;
24938 :
24939 1 : rtx (*gen_interleave) (machine_mode, rtx, rtx, rtx);
24940 1 : if (elt >= nelt2)
24941 : {
24942 0 : gen_interleave = gen_vec_interleave_high;
24943 0 : elt -= nelt2;
24944 : }
24945 : else
24946 : gen_interleave = gen_vec_interleave_low;
24947 1 : nelt2 /= 2;
24948 :
24949 1 : dest = gen_reg_rtx (vmode);
24950 1 : emit_insn (gen_interleave (vmode, dest, op0, op0));
24951 :
24952 1 : vmode = V4SImode;
24953 1 : op0 = gen_lowpart (vmode, dest);
24954 :
24955 1 : memset (perm2, elt, 4);
24956 1 : dest = gen_reg_rtx (vmode);
24957 1 : ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
24958 1 : gcc_assert (ok);
24959 :
24960 1 : emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
24961 1 : return true;
24962 :
24963 0 : case E_V32QImode:
24964 0 : case E_V16HImode:
24965 0 : case E_V8SImode:
24966 0 : case E_V4DImode:
24967 : /* For AVX2 broadcasts of the first element vpbroadcast* or
24968 : vpermq should be used by expand_vec_perm_1. */
24969 0 : gcc_assert (!TARGET_AVX2 || d->perm[0]);
24970 : return false;
24971 :
24972 6 : case E_V64QImode:
24973 6 : gcc_assert (!TARGET_AVX512BW || d->perm[0]);
24974 : return false;
24975 :
24976 2 : case E_V32HImode:
24977 2 : gcc_assert (!TARGET_AVX512BW);
24978 : return false;
24979 :
24980 0 : default:
24981 0 : gcc_unreachable ();
24982 : }
24983 : }
24984 :
24985 : /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
24986 : broadcast permutations. */
24987 :
24988 : static bool
24989 88997 : expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
24990 : {
24991 88997 : unsigned i, elt, nelt = d->nelt;
24992 :
24993 88997 : if (!d->one_operand_p)
24994 : return false;
24995 :
24996 5385 : elt = d->perm[0];
24997 8271 : for (i = 1; i < nelt; ++i)
24998 8162 : if (d->perm[i] != elt)
24999 : return false;
25000 :
25001 109 : return expand_vec_perm_broadcast_1 (d);
25002 : }
25003 :
25004 : /* Implement arbitrary permutations of two V64QImode operands
25005 : with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  /* Only handles two-operand V64QI permutations under AVX512BW.  */
  if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
    return false;

  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  /* Set up two word-level (V32HI) sub-permutations over the same
     byte operands viewed as words.  */
  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  for (i = 0; i < nelt; i++)
    {
      /* Word index of the source byte, assigned to the even (ds[0])
	 or odd (ds[1]) sub-permutation by destination parity.  */
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
	{
	  /* Odd destination byte: zero it in the first pshufb mask
	     (constm1 => bit 7 set) and select it in the second.  */
	  rperm[i] = constm1_rtx;
	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	}
      else
	{
	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	  rperm[i + 64] = constm1_rtx;
	}
    }

  /* Both word-level permutations must be expressible in one insn
     (vperm[it]2w); expand_vec_perm_1 is asserted to succeed.  */
  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  /* Fix up the byte order within each word with vpshufb, then merge
     the disjoint even/odd results with vpor.  */
  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}
25075 :
/* Implement arbitrary permutation of two V32QImode and V16QImode operands
   with 4 vpshufb insns, 2 vpermq and 3 vpor.  We should have already failed
   all the shorter instruction sequences.  */

static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  /* rperm[0]/rperm[2]: in-lane masks for op0/op1;
     rperm[1]/rperm[3]: cross-lane masks for op0/op1.  */
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4];

  /* Two-operand AVX2 V32QI/V16HI permutations only.  */
  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  /* Initialize all four masks to "produce zero" (bit 7 set).  */
  for (i = 0; i < 32; ++i)
    {
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }
  used[0] = false;
  used[1] = false;
  used[2] = false;
  used[3] = false;
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      /* Nonzero iff the element has to cross the 128-bit lane.  */
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      /* Mask selector: bit 1 = which operand, bit 0 = cross-lane.  */
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  /* Emit a vpshufb for each cross-lane mask actually referenced.  */
  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
	{
	  h[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
				    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X] with vpermq.  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] == NULL_RTX)
	continue;
      op = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
				      const2_rtx, GEN_INT (3), const0_rtx,
				      const1_rtx));
      h[i] = gen_lowpart (V32QImode, op);
    }

  /* Emit a vpshufb for each in-lane mask actually referenced.  */
  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
	{
	  l[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  /* Combine in-lane and cross-lane contributions per operand.  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] && l[i])
	{
	  op = gen_reg_rtx (V32QImode);
	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
	  l[i] = op;
	}
      else if (h[i])
	l[i] = h[i];
    }

  /* Final vpor merges the two operands' contributions; go through a
     temporary when the result mode is not V32QImode.  */
  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
  return true;
}
25190 :
/* The guts of ix86_vectorize_vec_perm_const.  With all of the interface bits
   taken care of, perform the expansion in D and return true on success.

   The strategies below are tried strictly in order of increasing insn
   count, so the first one that matches yields the cheapest sequence.  */

static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* Try a single instruction expansion.  */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions.  */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  if (expand_vec_perm_2perm_interleave (d, true))
    return true;

  if (expand_vec_perm_2perm_pblendv (d, true))
    return true;

  if (expand_vec_perm_shufps_shufps (d))
    return true;

  if (expand_vec_perm_punpckldq_pshuf (d))
    return true;

  /* Try sequences of three instructions.  */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_pslldq_psrldq_por (d, false))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  if (expand_vec_perm_2perm_interleave (d, false))
    return true;

  if (expand_vec_perm_2perm_pblendv (d, false))
    return true;

  if (expand_vec_perm_psrlw_psllw_por (d))
    return true;

  if (expand_vec_perm_pand_pandn_por (d))
    return true;

  /* Try sequences of four instructions.  */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;
  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode.  */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps.  */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly... */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Generate four or five instructions.  */
  if (expand_vec_perm_pslldq_psrldq_por (d, true))
    return true;

  /* Even longer sequences.  */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Even longer, including recursion to ix86_expand_vec_perm_const_1.  */
  if (expand_vec_perm2_vperm2f128_vblend (d))
    return true;

  return false;
}
25318 :
25319 : /* If a permutation only uses one operand, make it clear. Returns true
25320 : if the permutation references both operands. */
25321 :
25322 : static bool
25323 74796 : canonicalize_perm (struct expand_vec_perm_d *d)
25324 : {
25325 74796 : int i, which, nelt = d->nelt;
25326 :
25327 450922 : for (i = which = 0; i < nelt; ++i)
25328 511191 : which |= (d->perm[i] < nelt ? 1 : 2);
25329 :
25330 74796 : d->one_operand_p = true;
25331 74796 : switch (which)
25332 : {
25333 0 : default:
25334 0 : gcc_unreachable();
25335 :
25336 55750 : case 3:
25337 55750 : if (!rtx_equal_p (d->op0, d->op1))
25338 : {
25339 55699 : d->one_operand_p = false;
25340 55699 : break;
25341 : }
25342 : /* The elements of PERM do not suggest that only the first operand
25343 : is used, but both operands are identical. Allow easier matching
25344 : of the permutation by folding the permutation into the single
25345 : input vector. */
25346 : /* FALLTHRU */
25347 :
25348 : case 2:
25349 2913 : for (i = 0; i < nelt; ++i)
25350 2576 : d->perm[i] &= nelt - 1;
25351 337 : d->op0 = d->op1;
25352 337 : break;
25353 :
25354 18760 : case 1:
25355 18760 : d->op1 = d->op0;
25356 18760 : break;
25357 : }
25358 :
25359 74796 : return (which == 3);
25360 : }
25361 :
/* Implement TARGET_VECTORIZE_VEC_PERM_CONST.

   A null TARGET requests a dry run: report whether the permutation SEL
   on (OP0, OP1) is implementable without emitting any insns.  */

bool
ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
			       rtx target, rtx op0, rtx op1,
			       const vec_perm_indices &sel)
{
  if (vmode != op_mode)
    return false;

  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  /* For HF and BF mode vector, convert it to HI using subreg.  */
  if (GET_MODE_INNER (vmode) == HFmode || GET_MODE_INNER (vmode) == BFmode)
    {
      machine_mode orig_mode = vmode;
      vmode = mode_for_vector (HImode,
			       GET_MODE_NUNITS (vmode)).require ();
      /* In a dry run target/op0/op1 may be null; only wrap what exists.  */
      if (target)
	target = lowpart_subreg (vmode, target, orig_mode);
      if (op0)
	op0 = lowpart_subreg (vmode, op0, orig_mode);
      if (op1)
	op1 = lowpart_subreg (vmode, op1, orig_mode);
    }

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes.  */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
	return false;
      /* All implementable with a single vperm[it]2 insn.  */
      if (d.testing_p)
	return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512F)
	return false;
      if (d.testing_p && TARGET_AVX512BW)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512F)
	return false;
      if (d.testing_p && TARGET_AVX512BW)
	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn.  */
	return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (!TARGET_AVX)
	return false;
      if (d.testing_p && TARGET_AVX512VL)
	/* All implementable with a single vperm[it]2 insn.  */
	return true;
      break;
    case E_V16HImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V32QImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns.  */
	return true;
      break;
    case E_V8HImode:
    case E_V16QImode:
      if (!TARGET_SSE2)
	return false;
      /* Fall through.  */
    case E_V4SImode:
    case E_V4SFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with a single vpperm insn.  */
      if (d.testing_p && TARGET_XOP)
	return true;
      /* All implementable with 2 pshufb + 1 ior.  */
      if (d.testing_p && TARGET_SSSE3)
	return true;
      break;
    case E_V2SFmode:
    case E_V2SImode:
    case E_V4HImode:
    case E_V8QImode:
      if (!TARGET_MMX_WITH_SSE)
	return false;
      break;
    case E_V2HImode:
      if (!TARGET_SSE2)
	return false;
      /* All implementable with *punpckwd.  */
      if (d.testing_p)
	return true;
      break;
    case E_V4QImode:
      if (!TARGET_SSE2)
	return false;
      break;
    case E_V2DImode:
    case E_V2DFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with shufpd or unpck[lh]pd.  */
      if (d.testing_p)
	return true;
      break;
    default:
      return false;
    }

  /* Copy the selector into D, remembering which operands it uses
     (bit 0: first operand, bit 1: second).  PERM keeps an unmodified
     copy for the retry at the bottom.  */
  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first.  */
      if (which == 2)
	for (i = 0; i < nelt; ++i)
	  d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type.  */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps, pshufd or pshuflw.  */
      if (d.one_operand_p
	  && (d.vmode == V4SFmode || d.vmode == V2SFmode
	      || d.vmode == V4SImode || d.vmode == V2SImode
	      || d.vmode == V4HImode || d.vmode == V2HImode))
	return true;

      /* Otherwise we have to go through the motions and see if we can
	 figure out how to generate the requested permutation.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      /* Expand into a scratch sequence that is thrown away; only the
	 success/failure result matters.  */
      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  two_args = canonicalize_perm (&d);

  /* If one of the operands is a zero vector, try to match pmovzx.  */
  if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
    {
      struct expand_vec_perm_d dzero = d;
      if (d.op0 == CONST0_RTX (vmode))
	{
	  d.op1 = dzero.op1 = force_reg (vmode, d.op1);
	  std::swap (dzero.op0, dzero.op1);
	  for (i = 0; i < nelt; ++i)
	    dzero.perm[i] ^= nelt;
	}
      else
	d.op0 = dzero.op0 = force_reg (vmode, d.op0);

      if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
				  dzero.perm, nelt, dzero.testing_p))
	return true;
    }

  /* Force operands into registers.  */
  rtx nop0 = force_reg (vmode, d.op0);
  if (d.op0 == d.op1)
    d.op1 = nop0;
  d.op0 = nop0;
  d.op1 = force_reg (vmode, d.op1);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}
25585 :
25586 : void
25587 8214 : ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
25588 : {
25589 8214 : struct expand_vec_perm_d d;
25590 8214 : unsigned i, nelt;
25591 :
25592 8214 : d.target = targ;
25593 8214 : d.op0 = op0;
25594 8214 : d.op1 = op1;
25595 8214 : d.vmode = GET_MODE (targ);
25596 8214 : d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
25597 8214 : d.one_operand_p = false;
25598 8214 : d.testing_p = false;
25599 :
25600 78090 : for (i = 0; i < nelt; ++i)
25601 69876 : d.perm[i] = i * 2 + odd;
25602 :
25603 : /* We'll either be able to implement the permutation directly... */
25604 8214 : if (expand_vec_perm_1 (&d))
25605 3185 : return;
25606 :
25607 : /* ... or we use the special-case patterns. */
25608 5029 : expand_vec_perm_even_odd_1 (&d, odd);
25609 : }
25610 :
25611 : static void
25612 924 : ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
25613 : {
25614 924 : struct expand_vec_perm_d d;
25615 924 : unsigned i, nelt, base;
25616 924 : bool ok;
25617 :
25618 924 : d.target = targ;
25619 924 : d.op0 = op0;
25620 924 : d.op1 = op1;
25621 924 : d.vmode = GET_MODE (targ);
25622 924 : d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
25623 924 : d.one_operand_p = false;
25624 924 : d.testing_p = false;
25625 :
25626 924 : base = high_p ? nelt / 2 : 0;
25627 3652 : for (i = 0; i < nelt / 2; ++i)
25628 : {
25629 2728 : d.perm[i * 2] = i + base;
25630 2728 : d.perm[i * 2 + 1] = i + base + nelt;
25631 : }
25632 :
25633 : /* Note that for AVX this isn't one instruction. */
25634 924 : ok = ix86_expand_vec_perm_const_1 (&d);
25635 924 : gcc_assert (ok);
25636 924 : }
25637 :
/* Expand a vector operation shift by constant for a V*QImode in terms of the
   same operation on V*HImode.  Return true if success.  */
static bool
ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
				     rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode, himode;
  HOST_WIDE_INT and_constant, xor_constant;
  HOST_WIDE_INT shift_amount;
  rtx vec_const_and, vec_const_xor;
  rtx tmp, op1_subreg;
  rtx (*gen_shift) (rtx, rtx, rtx);
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_xor) (rtx, rtx, rtx);
  rtx (*gen_sub) (rtx, rtx, rtx);

  /* Only optimize shift by constant.  */
  if (!CONST_INT_P (op2))
    return false;

  qimode = GET_MODE (dest);
  shift_amount = INTVAL (op2);
  /* Do nothing when shift amount greater equal 8.  */
  if (shift_amount > 7)
    return false;

  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);


  /* Arithmetic shift right by 7 replicates the sign bit, which is
     equivalent to comparing against zero.  */
  if (shift_amount == 7
      && code == ASHIFTRT)
    {
      if (qimode == V16QImode
	  || qimode == V32QImode)
	{
	  /* dest = 0 > op1, i.e. all-ones where op1 is negative.  */
	  rtx zero = gen_reg_rtx (qimode);
	  emit_move_insn (zero, CONST0_RTX (qimode));
	  emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
	}
      else
	{
	  /* V64QI: go through a mask register instead of a compare.  */
	  gcc_assert (qimode == V64QImode);
	  rtx kmask = gen_reg_rtx (DImode);
	  emit_insn (gen_avx512bw_cvtb2maskv64qi (kmask, op1));
	  emit_insn (gen_avx512bw_cvtmask2bv64qi (dest, kmask));
	}
      return true;
    }

  /* Record sign bit.  */
  xor_constant = 1 << (8 - shift_amount - 1);

  /* Zero upper/lower bits shift from left/right element.  */
  and_constant
    = (code == ASHIFT ? 256 - (1 << shift_amount)
       : (1 << (8 - shift_amount)) - 1);

  /* Pick the HImode shift and the QImode logical-op generators matching
     the vector width.  */
  switch (qimode)
    {
    case V16QImode:
      himode = V8HImode;
      gen_shift =
	((code == ASHIFT)
	 ? gen_ashlv8hi3
	 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
      gen_and = gen_andv16qi3;
      gen_xor = gen_xorv16qi3;
      gen_sub = gen_subv16qi3;
      break;
    case V32QImode:
      himode = V16HImode;
      gen_shift =
	((code == ASHIFT)
	 ? gen_ashlv16hi3
	 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
      gen_and = gen_andv32qi3;
      gen_xor = gen_xorv32qi3;
      gen_sub = gen_subv32qi3;
      break;
    case V64QImode:
      himode = V32HImode;
      gen_shift =
	((code == ASHIFT)
	 ? gen_ashlv32hi3
	 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
      gen_and = gen_andv64qi3;
      gen_xor = gen_xorv64qi3;
      gen_sub = gen_subv64qi3;
      break;
    default:
      gcc_unreachable ();
    }

  tmp = gen_reg_rtx (himode);
  vec_const_and = gen_reg_rtx (qimode);
  op1_subreg = lowpart_subreg (himode, op1, qimode);

  /* For ASHIFT and LSHIFTRT, perform operation like
     vpsllw/vpsrlw $shift_amount, %op1, %dest.
     vpand %vec_const_and, %dest.  */
  emit_insn (gen_shift (tmp, op1_subreg, op2));
  emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
  emit_move_insn (vec_const_and,
		  ix86_build_const_vector (qimode, true,
					   gen_int_mode (and_constant, QImode)));
  emit_insn (gen_and (dest, dest, vec_const_and));

  /* For ASHIFTRT, perform extra operation like
     vpxor %vec_const_xor, %dest, %dest
     vpsubb %vec_const_xor, %dest, %dest  */
  if (code == ASHIFTRT)
    {
      vec_const_xor = gen_reg_rtx (qimode);
      emit_move_insn (vec_const_xor,
		      ix86_build_const_vector (qimode, true,
					       gen_int_mode (xor_constant, QImode)));
      emit_insn (gen_xor (dest, dest, vec_const_xor));
      emit_insn (gen_sub (dest, dest, vec_const_xor));
    }
  return true;
}
25759 :
/* Expand a vector operation CODE for a partial (V4QI/V8QI) QImode vector
   DEST = OP1 CODE OP2 by widening into a full V16QImode vector, doing the
   work in V8HImode, and narrowing the result back.  */

void
ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  rtx qop1, qop2, hop1, hop2, qdest, hdest;
  /* OP2 is a vector for mult and the variable-count shifts, a scalar
     (possibly CONST_INT) count otherwise.  */
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  bool uns_p = code != ASHIFTRT;

  switch (qimode)
    {
    case E_V4QImode:
    case E_V8QImode:
      break;
    default:
      gcc_unreachable ();
    }

  /* Widen the partial vectors to full V16QImode registers.  */
  qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);

  if (op2vec)
    qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
  else
    qop2 = op2;

  qdest = gen_reg_rtx (V16QImode);

  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      /* With AVX512 it's cheaper to do vpmovsxbw/op/vpmovwb.
	 Even with SSE4.1 the alternative is better.  */
      && !TARGET_SSE4_1
      && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
    {
      emit_move_insn (dest, gen_lowpart (qimode, qdest));
      return;
    }

  /* Arithmetic shift right by 7 is a sign test: dest = 0 > op1.  */
  if (CONST_INT_P (op2)
      && code == ASHIFTRT
      && INTVAL (op2) == 7)
    {
      rtx zero = gen_reg_rtx (qimode);
      emit_move_insn (zero, CONST0_RTX (qimode));
      emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
      return;
    }

  switch (code)
    {
    case MULT:
      gcc_assert (op2vec);
      if (!TARGET_SSE4_1)
	{
	  /* Unpack data such that we've got a source byte in each low byte
	     of each word.  We don't care what goes into the high byte of
	     each word.  Rather than trying to get zero in there, most
	     convenient is to let it be a copy of the low byte.  */
	  hop1 = copy_to_reg (qop1);
	  hop2 = copy_to_reg (qop2);
	  emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
	  emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
	  break;
	}
      /* FALLTHRU */
    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      /* Sign/zero-extend the bytes into words.  */
      hop1 = gen_reg_rtx (V8HImode);
      ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
      /* mult/vashr/vlshr/vashl */
      if (op2vec)
	{
	  hop2 = gen_reg_rtx (V8HImode);
	  ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
	}
      else
	hop2 = qop2;

      break;
    default:
      gcc_unreachable ();
    }

  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      hdest = gen_reg_rtx (V8HImode);
      emit_insn (gen_rtx_SET (hdest,
			      simplify_gen_binary (code, V8HImode,
						   hop1, hop2)));
    }
  else
    /* Expand mult/ashr/lshr/ashl.  */
    hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
				 NULL_RTX, 1, OPTAB_DIRECT);

  /* Narrow the word result back to bytes, with vpmovwb if available,
     otherwise via an even-byte permutation.  */
  if (TARGET_AVX512BW && TARGET_AVX512VL)
    {
      if (qimode == V8QImode)
	qdest = dest;
      else
	qdest = gen_reg_rtx (V8QImode);

      emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
    }
  else
    {
      struct expand_vec_perm_d d;
      rtx qres = gen_lowpart (V16QImode, hdest);
      bool ok;
      int i;

      /* Merge the data back into the right place.  */
      d.target = qdest;
      d.op0 = d.op1 = qres;
      d.vmode = V16QImode;
      d.nelt = 16;
      d.one_operand_p = TARGET_SSSE3;
      d.testing_p = false;

      /* Keep only the low byte of each word: elements 0, 2, 4, ...  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;

      ok = ix86_expand_vec_perm_const_1 (&d);
      gcc_assert (ok);
    }

  if (qdest != dest)
    emit_move_insn (dest, gen_lowpart (qimode, qdest));
}
25890 :
/* Emit instruction in 2x wider mode.  For example, optimize
   vector MUL generation like

   vpmovzxbw ymm2, xmm0
   vpmovzxbw ymm3, xmm1
   vpmullw ymm4, ymm2, ymm3
   vpmovwb xmm0, ymm4

   it would take less instructions than ix86_expand_vecop_qihi.
   Return true if success.  */

static bool
ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode himode, qimode = GET_MODE (dest);
  machine_mode wqimode;
  rtx qop1, qop2, hop1, hop2, hdest;
  rtx (*gen_truncate)(rtx, rtx) = NULL;
  /* OP2 is a vector for mult and the variable-count shifts, a scalar
     count otherwise.  */
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  bool uns_p = code != ASHIFTRT;

  /* Without VPMOVWB (provided by AVX512BW ISA), the expansion uses the
     generic permutation to merge the data back into the right place.  This
     permutation results in VPERMQ, which is slow, so better fall back to
     ix86_expand_vecop_qihi.  */
  if (!TARGET_AVX512BW
      || (qimode == V16QImode && !TARGET_AVX512VL)
      /* There are no V64HImode instructions.  */
      || qimode == V64QImode)
    return false;

  /* Do not generate ymm/zmm instructions when
     target prefers 128/256 bit vector width.  */
  if ((qimode == V16QImode && TARGET_PREFER_AVX128)
      || (qimode == V32QImode && TARGET_PREFER_AVX256))
    return false;

  /* Select the word mode and the matching narrowing insn.  */
  switch (qimode)
    {
    case E_V16QImode:
      himode = V16HImode;
      gen_truncate = gen_truncv16hiv16qi2;
      break;
    case E_V32QImode:
      himode = V32HImode;
      gen_truncate = gen_truncv32hiv32qi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* View the inputs in the 2x wider QI vector mode for unpacking.  */
  wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
  qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);

  if (op2vec)
    qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
  else
    qop2 = op2;

  /* Sign/zero-extend the bytes into words.  */
  hop1 = gen_reg_rtx (himode);
  ix86_expand_sse_unpack (hop1, qop1, uns_p, false);

  if (op2vec)
    {
      hop2 = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
    }
  else
    hop2 = qop2;

  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      hdest = gen_reg_rtx (himode);
      emit_insn (gen_rtx_SET (hdest,
			      simplify_gen_binary (code, himode,
						   hop1, hop2)));
    }
  else
    /* Expand mult/ashr/lshr/ashl.  */
    hdest = expand_simple_binop (himode, code, hop1, hop2,
				 NULL_RTX, 1, OPTAB_DIRECT);

  /* Narrow the word result back to bytes (vpmovwb).  */
  emit_insn (gen_truncate (dest, hdest));
  return true;
}
25977 :
/* Expand a vector operation CODE for a V*QImode in terms of the
   same operation on V*HImode.  DEST, OP1 and OP2 are the destination
   and source operands; OP2 may be a vector (MULT and the variable
   vashr/vlshr/vashl shifts) or a scalar shift count.  */

void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  /* Interleave-low/high insn generators, used only on the MULT path.  */
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  /* True when OP2 is itself an integer vector.  */
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  struct expand_vec_perm_d d;
  bool full_interleave = true;
  /* Only arithmetic right shift needs sign-extending unpacks.  */
  bool uns_p = code != ASHIFTRT;
  bool ok;
  int i;

  /* A shift by an immediate can sometimes be done on the QI elements
     directly, without widening.  */
  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
    return;

  /* Prefer the single widen/operate/truncate sequence when available.  */
  if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
    return;

  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      break;
    case E_V32QImode:
      himode = V16HImode;
      break;
    case E_V64QImode:
      himode = V32HImode;
      break;
    default:
      gcc_unreachable ();
    }

  switch (code)
    {
    case MULT:
      gcc_assert (op2vec);
      /* Unpack data such that we've got a source byte in each low byte of
	 each word.  We don't care what goes into the high byte of each word.
	 Rather than trying to get zero in there, most convenient is to let
	 it be a copy of the low byte.  */
      switch (qimode)
	{
	case E_V16QImode:
	  gen_il = gen_vec_interleave_lowv16qi;
	  gen_ih = gen_vec_interleave_highv16qi;
	  break;
	case E_V32QImode:
	  gen_il = gen_avx2_interleave_lowv32qi;
	  gen_ih = gen_avx2_interleave_highv32qi;
	  /* AVX2/AVX512BW interleaves work within 128-bit lanes only,
	     so the final merge permutation differs (see below).  */
	  full_interleave = false;
	  break;
	case E_V64QImode:
	  gen_il = gen_avx512bw_interleave_lowv64qi;
	  gen_ih = gen_avx512bw_interleave_highv64qi;
	  full_interleave = false;
	  break;
	default:
	  gcc_unreachable ();
	}

      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      break;

    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      /* For shifts, widen OP1 into two proper HImode halves.  */
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      /* vashr/vlshr/vashl */
      if (op2vec)
	{
	  rtx tmp = force_reg (qimode, op2);
	  op2_l = gen_reg_rtx (himode);
	  op2_h = gen_reg_rtx (himode);
	  ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
	  ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
	}
      else
	/* A scalar shift count is shared by both halves.  */
	op2_l = op2_h = op2;

      break;
    default:
      gcc_unreachable ();
    }

  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      res_l = gen_reg_rtx (himode);
      res_h = gen_reg_rtx (himode);
      emit_insn (gen_rtx_SET (res_l,
			      simplify_gen_binary (code, himode,
						   op1_l, op2_l)));
      emit_insn (gen_rtx_SET (res_h,
			      simplify_gen_binary (code, himode,
						   op1_h, op2_h)));
    }
  else
    {
      /* Expand mult/ashr/lshr/ashl.  */
      res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
				   1, OPTAB_DIRECT);
      res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
				   1, OPTAB_DIRECT);
    }

  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* We used the full interleave, the desired
	 results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
	 always first from the first and then from the second source operand,
	 the index bits above the low 4 bits remains the same.
	 Thus, for d.nelt == 32 we want permutation
	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
	 and for d.nelt == 64 we want permutation
	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
26141 :
26142 : /* Helper function of ix86_expand_mul_widen_evenodd. Return true
26143 : if op is CONST_VECTOR with all odd elements equal to their
26144 : preceding element. */
26145 :
26146 : static bool
26147 8772 : const_vector_equal_evenodd_p (rtx op)
26148 : {
26149 8772 : machine_mode mode = GET_MODE (op);
26150 8772 : int i, nunits = GET_MODE_NUNITS (mode);
26151 8772 : if (!CONST_VECTOR_P (op)
26152 8772 : || nunits != CONST_VECTOR_NUNITS (op))
26153 : return false;
26154 3579 : for (i = 0; i < nunits; i += 2)
26155 2886 : if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
26156 : return false;
26157 : return true;
26158 : }
26159 :
/* Expand a widening multiply of the even (ODD_P false) or odd (ODD_P
   true) SImode elements of OP1 and OP2 into the double-width elements
   of DEST.  UNS_P selects unsigned vs. signed multiplication.  */

void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
			       bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  /* Keep the originals around: constants with equal even/odd elements
     let us skip the element shift below.  */
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
	 signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      /* The shift is unnecessary for constants whose odd elements
	 already equal the even ones.  */
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  /* Pick the widening-multiply-even pattern for the vector width.  */
  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }
  emit_insn (x);
}
26254 :
/* Expand a widening multiply of the high (HIGH_P true) or low halves
   of OP1 and OP2 into DEST, whose mode WMODE has elements twice the
   width of OP1's mode.  UNS_P selects unsigned vs. signed multiply.  */

void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  /* The interleave already selected the half; take evens below.  */
	  high_p = false;
	}
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      /* HImode: compute low and high product halves separately, then
	 interleave them into the requested half of the result.  */
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V16SImode:
    case E_V64QImode:
      /* Widen both operands first, then multiply in the wide mode.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
26344 :
/* Expand a V4SImode element-wise multiply OP0 = OP1 * OP2 using only
   SSE2 widening even/odd multiplies (no PMULLD).  */

void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  /* Products of the even and odd SImode elements as DImode values;
     the low half of each holds the SImode product we want.  */
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  /* Tell the optimizers the whole sequence computes a plain multiply.  */
  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
26379 :
/* Expand a V2DI/V4DI/V8DI element-wise multiply OP0 = OP1 * OP2.
   Uses a native 64-bit multiply when AVX512DQ allows, an XOP sequence
   for V2DImode, and otherwise synthesizes the product from 32x32
   widening multiplies, shifts and adds.  */

void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1),
				    GEN_INT (0),
				    GEN_INT (3),
				    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
						gen_lowpart (V4SImode, op1),
						gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      /* Select the widening-multiply-even pattern and the SImode
	 vector mode matching this DImode vector width.  */
      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();


      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  /* Record the whole computation as a multiply for the optimizers.  */
  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}
26476 :
26477 : /* Return 1 if control tansfer instruction INSN
26478 : should be encoded with notrack prefix. */
26479 :
26480 : bool
26481 14876764 : ix86_notrack_prefixed_insn_p (rtx_insn *insn)
26482 : {
26483 14876764 : if (!insn || !((flag_cf_protection & CF_BRANCH)))
26484 : return false;
26485 :
26486 3967893 : if (CALL_P (insn))
26487 : {
26488 1395229 : rtx call = get_call_rtx_from (insn);
26489 1395229 : gcc_assert (call != NULL_RTX);
26490 1395229 : rtx addr = XEXP (call, 0);
26491 :
26492 : /* Do not emit 'notrack' if it's not an indirect call. */
26493 1395229 : if (MEM_P (addr)
26494 1395229 : && SYMBOL_REF_P (XEXP (addr, 0)))
26495 : return false;
26496 : else
26497 65130 : return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
26498 : }
26499 :
26500 2572664 : if (JUMP_P (insn) && !flag_cet_switch)
26501 : {
26502 2559116 : rtx target = JUMP_LABEL (insn);
26503 2559116 : if (target == NULL_RTX || ANY_RETURN_P (target))
26504 : return false;
26505 :
26506 : /* Check the jump is a switch table. */
26507 2559078 : rtx_insn *label = as_a<rtx_insn *> (target);
26508 2559078 : rtx_insn *table = next_insn (label);
26509 2559078 : if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
26510 : return false;
26511 : else
26512 : return true;
26513 : }
26514 : return false;
26515 : }
26516 :
/* Calculate integer abs() of INPUT into TARGET using only SSE2
   instructions (plus SSE4.2/AVX2 compares for 64-bit elements when
   available).  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
	 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
	 32 and use logical instead of arithmetic right shift (which is
	 unimplemented) and subtract.  */
      if (TARGET_SSE4_2)
	{
	  /* tmp0 = (0 > X) ? all-ones : 0, i.e. the sign mask.  */
	  tmp0 = gen_reg_rtx (mode);
	  tmp1 = gen_reg_rtx (mode);
	  emit_move_insn (tmp1, CONST0_RTX (mode));
	  if (mode == E_V2DImode)
	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
	  else
	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
	}
      else
	{
	  /* tmp0 = -(X >>u 63): all-ones exactly when X is negative.  */
	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
					       - 1), NULL, 0, OPTAB_DIRECT);
	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
	}

      /* abs(X) = (X ^ mask) - mask.  */
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
				  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
	 as SSE2 provides the PMINUB insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  /* expand_simple_binop may not have used TARGET directly.  */
  if (x != target)
    emit_move_insn (target, x);
}
26596 :
/* Expand an extract from a vector register through pextr insn.
   OPERANDS[0] is the destination, OPERANDS[1] the vector source,
   OPERANDS[2] the field size in bits and OPERANDS[3] the bit
   position.  Return true if successful.  */

bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  /* Bit size and bit position of the extracted field.  */
  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
	return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      /* Fold a source subreg into an adjusted bit position.  */
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx d, pat;

	if (!int_mode_for_size (size, 0).exists (&dstmode))
	  return false;

	/* Pick the vector mode matching the element size, checking
	   that the required ISA extension is enabled.  */
	switch (dstmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V16QImode;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    srcmode = V8HImode;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V4SImode;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V2DImode;
	    break;

	  default:
	    return false;
	  }

	/* Reject extractions from misaligned positions.  */
	if (pos & (size-1))
	  return false;

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* Construct insn pattern.  */
	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

	/* Let the rtl optimizers know about the zero extension performed.  */
	if (dstmode == QImode || dstmode == HImode)
	  {
	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
	    d = gen_lowpart (SImode, d);
	  }

	emit_insn (gen_rtx_SET (d, pat));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
26699 :
/* Expand an insert into a vector register through pinsr insn.
   OPERANDS[0] is the destination vector, OPERANDS[1] the field size
   in bits, OPERANDS[2] the bit position and OPERANDS[3] the value to
   insert.  Return true if successful.  */

bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  /* Bit size and bit position of the inserted field.  */
  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      /* Fold a destination subreg into an adjusted bit position.  */
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx (*pinsr)(rtx, rtx, rtx, rtx);
	rtx d;

	if (!int_mode_for_size (size, 0).exists (&srcmode))
	  return false;

	/* Pick the vector mode and pinsr generator for the element
	   size, checking the required ISA extension is enabled.  */
	switch (srcmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V16QImode;
	    pinsr = gen_sse4_1_pinsrb;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    dstmode = V8HImode;
	    pinsr = gen_sse2_pinsrw;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V4SImode;
	    pinsr = gen_sse4_1_pinsrd;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V2DImode;
	    pinsr = gen_sse4_1_pinsrq;
	    break;

	  default:
	    return false;
	  }

	/* Reject insertions to misaligned positions.  */
	if (pos & (size-1))
	  return false;

	if (SUBREG_P (src))
	  {
	    unsigned int srcpos = SUBREG_BYTE (src);

	    if (srcpos > 0)
	      {
		/* Non-lowpart source subreg: extract the field first.  */
		rtx extr_ops[4];

		extr_ops[0] = gen_reg_rtx (srcmode);
		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
		extr_ops[2] = GEN_INT (size);
		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

		if (!ix86_expand_pextr (extr_ops))
		  return false;

		src = extr_ops[0];
	      }
	    else
	      src = gen_lowpart (srcmode, SUBREG_REG (src));
	  }

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* pinsr takes the element index as a one-hot immediate.  */
	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
			  gen_lowpart (srcmode, src),
			  GEN_INT (1 << (pos / size))));
	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
26811 :
26812 : /* All CPUs prefer to avoid cross-lane operations so perform reductions
26813 : upper against lower halves up to SSE reg size. */
26814 :
26815 : machine_mode
26816 1894 : ix86_split_reduction (machine_mode mode)
26817 : {
26818 : /* Reduce lowpart against highpart until we reach SSE reg width to
26819 : avoid cross-lane operations. */
26820 1894 : switch (mode)
26821 : {
26822 : case E_V8DImode:
26823 : case E_V4DImode:
26824 : return V2DImode;
26825 9 : case E_V16SImode:
26826 9 : case E_V8SImode:
26827 9 : return V4SImode;
26828 8 : case E_V32HImode:
26829 8 : case E_V16HImode:
26830 8 : return V8HImode;
26831 4 : case E_V64QImode:
26832 4 : case E_V32QImode:
26833 4 : return V16QImode;
26834 5 : case E_V16SFmode:
26835 5 : case E_V8SFmode:
26836 5 : return V4SFmode;
26837 16 : case E_V8DFmode:
26838 16 : case E_V4DFmode:
26839 16 : return V2DFmode;
26840 1847 : default:
26841 1847 : return mode;
26842 : }
26843 : }
26844 :
26845 : /* Generate call to __divmoddi4. */
26846 :
26847 : void
26848 896 : ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
26849 : rtx op0, rtx op1,
26850 : rtx *quot_p, rtx *rem_p)
26851 : {
26852 1792 : rtx rem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
26853 :
26854 896 : rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
26855 : mode, op0, mode, op1, mode,
26856 896 : XEXP (rem, 0), Pmode);
26857 896 : *quot_p = quot;
26858 896 : *rem_p = rem;
26859 896 : }
26860 :
/* Expand a compare-and-swap loop implementing an atomic fetch-op
   (AFTER false) or op-fetch (AFTER true) of operation CODE on MEM
   with VAL, leaving the result in TARGET.  CODE == NOT encodes a
   NAND-style update (AND with VAL, then invert).  DOUBLEWORD selects
   the double-word cmpxchg variant.  */

void
ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
				  enum rtx_code code, bool after,
				  bool doubleword)
{
  rtx old_reg, new_reg, old_mem, success;
  machine_mode mode = GET_MODE (target);
  rtx_code_label *loop_label = NULL;

  old_reg = gen_reg_rtx (mode);
  new_reg = old_reg;
  /* Cached copy of *MEM; refreshed by the cmpxchg on failure.  */
  old_mem = copy_to_reg (mem);
  loop_label = gen_label_rtx ();
  emit_label (loop_label);
  emit_move_insn (old_reg, old_mem);

  /* return value for atomic_fetch_op.  */
  if (!after)
    emit_move_insn (target, old_reg);

  if (code == NOT)
    {
      /* AND with VAL, then invert the result.  */
      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
				     true, OPTAB_LIB_WIDEN);
      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
    }
  else
    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
				   true, OPTAB_LIB_WIDEN);

  /* return value for atomic_op_fetch.  */
  if (after)
    emit_move_insn (target, new_reg);

  success = NULL_RTX;

  /* Attempt the exchange; branch back to LOOP_LABEL on failure.  */
  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
			    gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
					  SImode),
			    doubleword, loop_label);
}
26902 :
26903 : /* Relax cmpxchg instruction, param loop_label indicates whether
26904 : the instruction should be relaxed with a pause loop. If not,
26905 : it will be relaxed to an atomic load + compare, and skip
26906 : cmpxchg instruction if mem != exp_input. */
      :
      : /* PTARGET_BOOL - in/out QImode success flag; a fresh register is
      : allocated here when *PTARGET_BOOL is NULL.
      : TARGET_VAL - receives the value observed in MEM.
      : MEM - the atomic memory location.
      : EXP_INPUT / NEW_INPUT - expected and replacement values.
      : MEM_MODEL - memory model operand forwarded to the cmpxchg pattern.
      : DOUBLEWORD - true when the compare is split into two half-mode
      : (HMODE) word compares.
      : LOOP_LABEL - label to loop back to (with a PAUSE) on failure,
      : or NULL for the load+compare-only relaxation.  */
26907 :
26908 : void
26909 72 : ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
26910 : rtx mem, rtx exp_input, rtx new_input,
26911 : rtx mem_model, bool doubleword,
26912 : rtx_code_label *loop_label)
26913 : {
26914 72 : rtx_code_label *cmp_label = NULL;
26915 72 : rtx_code_label *done_label = NULL;
26916 72 : rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
26917 72 : rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
26918 72 : rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
26919 72 : machine_mode mode = GET_MODE (target_val), hmode = mode;
26920 :
26921 72 : if (*ptarget_bool == NULL)
26922 64 : target_bool = gen_reg_rtx (QImode);
26923 : else
26924 : target_bool = *ptarget_bool;
26925 :
26926 72 : cmp_label = gen_label_rtx ();
26927 72 : done_label = gen_label_rtx ();
26928 :
26929 72 : new_mem = gen_reg_rtx (mode);
26930 : /* Load memory first. */
26931 72 : expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);
26932 :
      : /* Pick the single-word (GEN) or doubleword (GENDW) cmpxchg
      : expander for MODE; HMODE is the half mode used to split
      : doubleword operands.  */
26933 72 : switch (mode)
26934 : {
26935 : case E_TImode:
26936 : gendw = gen_atomic_compare_and_swapti_doubleword;
26937 : hmode = DImode;
26938 : break;
26939 18 : case E_DImode:
26940 18 : if (doubleword)
26941 : {
26942 : gendw = gen_atomic_compare_and_swapdi_doubleword;
26943 : hmode = SImode;
26944 : }
26945 : else
26946 : gen = gen_atomic_compare_and_swapdi_1;
26947 : break;
26948 18 : case E_SImode:
26949 18 : gen = gen_atomic_compare_and_swapsi_1;
26950 18 : break;
26951 18 : case E_HImode:
26952 18 : gen = gen_atomic_compare_and_swaphi_1;
26953 18 : break;
26954 18 : case E_QImode:
26955 18 : gen = gen_atomic_compare_and_swapqi_1;
26956 18 : break;
26957 0 : default:
26958 0 : gcc_unreachable ();
26959 : }
26960 :
26961 : /* Compare mem value with expected value. */
26962 54 : if (doubleword)
26963 : {
26964 0 : rtx low_new_mem = gen_lowpart (hmode, new_mem);
26965 0 : rtx low_exp_input = gen_lowpart (hmode, exp_input);
26966 0 : rtx high_new_mem = gen_highpart (hmode, new_mem);
26967 0 : rtx high_exp_input = gen_highpart (hmode, exp_input);
26968 0 : emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
26969 : hmode, 1, cmp_label,
26970 : profile_probability::guessed_never ());
26971 0 : emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
26972 : hmode, 1, cmp_label,
26973 : profile_probability::guessed_never ());
26974 : }
26975 : else
26976 72 : emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
26977 72 : GET_MODE (exp_input), 1, cmp_label,
26978 : profile_probability::guessed_never ());
26979 :
26980 : /* Directly emits cmpxchg here. */
26981 72 : if (doubleword)
26982 0 : emit_insn (gendw (target_val, mem, exp_input,
26983 0 : gen_lowpart (hmode, new_input),
26984 : gen_highpart (hmode, new_input),
26985 : mem_model));
26986 : else
26987 72 : emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));
26988 :
      : /* Without LOOP_LABEL, a mismatch just reports the loaded value;
      : with LOOP_LABEL, a mismatch pauses and retries from the caller's
      : loop head.  Either way the setcc reads the ZF produced above.  */
26989 72 : if (!loop_label)
26990 : {
26991 8 : emit_jump_insn (gen_jump (done_label));
26992 8 : emit_barrier ();
26993 8 : emit_label (cmp_label);
26994 8 : emit_move_insn (target_val, new_mem);
26995 8 : emit_label (done_label);
26996 8 : ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
26997 : const0_rtx);
26998 : }
26999 : else
27000 : {
27001 64 : ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
27002 : const0_rtx);
27003 64 : emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
27004 64 : GET_MODE (target_bool), 1, loop_label,
27005 : profile_probability::guessed_never ());
27006 64 : emit_jump_insn (gen_jump (done_label));
27007 64 : emit_barrier ();
27008 :
27009 : /* If mem is not expected, pause and loop back. */
27010 64 : emit_label (cmp_label);
27011 64 : emit_move_insn (target_val, new_mem);
27012 64 : emit_insn (gen_pause ())
27013 64 : emit_jump_insn (gen_jump (loop_label));
27014 64 : emit_barrier ();
27015 64 : emit_label (done_label);
27016 : }
27017 :
27018 72 : *ptarget_bool = target_bool;
27019 72 : }
27020 :
27021 : /* Convert a BFmode VAL to SFmode without signaling sNaNs.
27022 : This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16. */
27023 :
27024 : rtx
27025 2832 : ix86_expand_fast_convert_bf_to_sf (rtx val)
27026 : {
27027 2832 : rtx op = gen_lowpart (HImode, val), ret;
27028 2832 : if (CONST_INT_P (op))
27029 : {
      : /* For a constant, try to fold the extension at compile time.  */
27030 514 : ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
27031 : val, BFmode);
27032 514 : if (ret)
27033 : return ret;
27034 : /* FLOAT_EXTEND simplification will fail if VAL is a sNaN. */
      : /* Materialize the constant bits shifted into the high half of an
      : SImode register, which is the SFmode representation of the
      : BFmode value - this never signals.  */
27035 1 : ret = gen_reg_rtx (SImode);
27036 1 : emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
27037 1 : emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
27038 1 : return gen_lowpart (SFmode, ret);
27039 : }
27040 :
27041 2318 : ret = gen_reg_rtx (SFmode);
27042 2318 : emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
27043 2318 : return ret;
27044 : }
27045 :
      : /* Expand the first comparison of an APX CCMP conditional-compare
      : sequence for CODE applied to TREEOP0/TREEOP1.  On success, store
      : the operand-preparation insns in *PREP_SEQ and the compare insns
      : in *GEN_SEQ and return the comparison rtx; return NULL_RTX when
      : ccmp cannot be used (no APX CCMP, unsupported mode or code).  */
27046 : rtx
27047 65576 : ix86_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
27048 : rtx_code code, tree treeop0, tree treeop1)
27049 : {
27050 65576 : if (!TARGET_APX_CCMP)
27051 : return NULL_RTX;
27052 :
27053 65576 : rtx op0, op1, res;
27054 65576 : machine_mode op_mode;
27055 :
27056 65576 : start_sequence ();
27057 65576 : expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27058 :
27059 65576 : op_mode = GET_MODE (op0);
27060 65576 : if (op_mode == VOIDmode)
27061 0 : op_mode = GET_MODE (op1);
27062 :
27063 : /* We only support the following scalar comparisons that use just 1
27064 : instruction: DI/SI/QI/HI/DF/SF/HF.
27065 : Unordered/Ordered compare cannot be correctly identified by
27066 : ccmp so they are not supported. */
27067 98348 : if (!(op_mode == DImode || op_mode == SImode || op_mode == HImode
27068 65576 : || op_mode == QImode || op_mode == DFmode || op_mode == SFmode
27069 32772 : || op_mode == HFmode)
27070 32806 : || code == ORDERED
27071 32806 : || code == UNORDERED)
27072 : {
27073 32770 : end_sequence ();
27074 32770 : return NULL_RTX;
27075 : }
27076 :
27077 : /* Canonicalize the operands according to mode. */
27078 32806 : if (SCALAR_INT_MODE_P (op_mode))
27079 : {
27080 32799 : if (!nonimmediate_operand (op0, op_mode))
27081 0 : op0 = force_reg (op_mode, op0);
27082 32799 : if (!x86_64_general_operand (op1, op_mode))
27083 0 : op1 = force_reg (op_mode, op1);
27084 : }
27085 : else
27086 : {
27087 : /* op0/op1 can be canonicalized from expand_fp_compare, so
27088 : just adjust the code to make it generate supported fp
27089 : condition. */
27090 7 : if (ix86_fp_compare_code_to_integer (code) == UNKNOWN)
27091 : {
27092 : /* First try to split condition if we don't need to honor
27093 : NaNs, as the ORDERED/UNORDERED check always fall
27094 : through. */
27095 6 : if (!HONOR_NANS (op_mode))
27096 : {
27097 6 : rtx_code first_code;
27098 6 : split_comparison (code, op_mode, &first_code, &code);
27099 : }
27100 : /* Otherwise try to swap the operand order and check if
27101 : the comparison is supported. */
27102 : else
27103 : {
27104 0 : code = swap_condition (code);
27105 0 : std::swap (op0, op1);
27106 : }
27107 :
27108 6 : if (ix86_fp_compare_code_to_integer (code) == UNKNOWN)
27109 : {
27110 0 : end_sequence ();
27111 0 : return NULL_RTX;
27112 : }
27113 : }
27114 : }
27115 :
27116 32806 : *prep_seq = end_sequence ();
27117 :
27118 32806 : start_sequence ();
27119 :
27120 32806 : res = ix86_expand_compare (code, op0, op1);
27121 :
27122 32806 : if (!res)
27123 : {
27124 : end_sequence ();
27125 : return NULL_RTX;
27126 : }
27127 32806 : *gen_seq = end_sequence ();
27128 :
27129 32806 : return res;
27130 : }
27131 :
      : /* Expand a subsequent conditional compare (APX CCMP) of
      : TREEOP0/TREEOP1 under CMP_CODE, combined with the previous
      : comparison PREV by BIT_CODE (AND vs. non-AND; non-AND reverses
      : PREV's condition).  Appends to *PREP_SEQ and *GEN_SEQ.  Returns
      : the resulting comparison rtx against the flags register, or
      : NULL_RTX when the compare cannot be expressed as ccmp.  */
27132 : rtx
27133 32809 : ix86_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
27134 : rtx_code cmp_code, tree treeop0, tree treeop1,
27135 : rtx_code bit_code)
27136 : {
27137 32809 : if (!TARGET_APX_CCMP)
27138 : return NULL_RTX;
27139 :
27140 32809 : rtx op0, op1, target;
27141 32809 : machine_mode op_mode, cmp_mode, cc_mode = CCmode;
27142 32809 : int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
27143 32809 : insn_code icode;
27144 32809 : rtx_code prev_code;
27145 32809 : struct expand_operand ops[5];
27146 32809 : int dfv;
27147 :
27148 : /* Exit early for non integer modes to avoid O(n^2) part of expand_operands. */
27149 32809 : cmp_mode = op_mode = TYPE_MODE (TREE_TYPE (treeop0));
27150 :
27151 32809 : if (!(op_mode == DImode || op_mode == SImode || op_mode == HImode
27152 : || op_mode == QImode))
27153 : return NULL_RTX;
27154 :
27155 32 : push_to_sequence (*prep_seq);
27156 32 : expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);
27157 :
27158 32 : icode = code_for_ccmp (op_mode);
27159 :
27160 32 : op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
27161 32 : op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
27162 32 : if (!op0 || !op1)
27163 : {
27164 0 : end_sequence ();
27165 0 : return NULL_RTX;
27166 : }
27167 :
27168 32 : *prep_seq = end_sequence ();
27169 :
27170 32 : target = gen_rtx_REG (cc_mode, FLAGS_REG);
      : /* DFV is the default flags value the ccmp pattern supplies when
      : the previous condition makes it skip the compare.  */
27171 32 : dfv = ix86_get_flags_cc ((rtx_code) cmp_code);
27172 :
27173 32 : prev_code = GET_CODE (prev);
27174 : /* Fixup FP compare code here. */
27175 32 : if (GET_MODE (XEXP (prev, 0)) == CCFPmode)
27176 7 : prev_code = ix86_fp_compare_code_to_integer (prev_code);
27177 :
27178 32 : if (bit_code != AND)
27179 17 : prev_code = reverse_condition (prev_code);
27180 : else
27181 15 : dfv = (int)(dfv ^ 1);
27182 :
27183 32 : prev = gen_rtx_fmt_ee (prev_code, VOIDmode, XEXP (prev, 0),
27184 : const0_rtx);
27185 :
27186 32 : create_fixed_operand (&ops[0], target);
27187 32 : create_fixed_operand (&ops[1], prev);
27188 32 : create_fixed_operand (&ops[2], op0);
27189 32 : create_fixed_operand (&ops[3], op1);
27190 32 : create_fixed_operand (&ops[4], GEN_INT (dfv));
27191 :
27192 32 : push_to_sequence (*gen_seq);
27193 32 : if (!maybe_expand_insn (icode, 5, ops))
27194 : {
27195 0 : end_sequence ();
27196 0 : return NULL_RTX;
27197 : }
27198 :
27199 32 : *gen_seq = end_sequence ();
27200 :
27201 32 : return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
27202 : }
27203 :
27204 : /* Attempt to convert a CONST_VECTOR into a bcst_mem_operand.
27205 : Returns NULL_RTX if X cannot be expressed as a suitable
27206 : VEC_DUPLICATE in mode MODE. */
27207 :
27208 : static rtx
27209 48 : ix86_gen_bcst_mem (machine_mode mode, rtx x)
27210 : {
      : /* Broadcasts from memory require AVX512F (and AVX512VL for
      : narrower than 512-bit vectors) and a >= 4-byte element.  */
27211 48 : if (!TARGET_AVX512F
27212 48 : || !CONST_VECTOR_P (x)
27213 64 : || (!TARGET_AVX512VL && GET_MODE_SIZE (mode) != 64)
27214 147 : || !VALID_BCST_MODE_P (GET_MODE_INNER (mode))
27215 : /* Disallow HFmode broadcast. */
27216 126 : || GET_MODE_SIZE (GET_MODE_INNER (mode)) < 4)
27217 : return NULL_RTX;
27218 :
27219 21 : rtx cst = CONST_VECTOR_ELT (x, 0);
27220 21 : if (!CONST_SCALAR_INT_P (cst)
27221 15 : && !CONST_DOUBLE_P (cst)
27222 0 : && !CONST_FIXED_P (cst))
27223 : return NULL_RTX;
27224 :
27225 21 : int n_elts = GET_MODE_NUNITS (mode);
27226 42 : if (CONST_VECTOR_NUNITS (x) != n_elts)
27227 : return NULL_RTX;
27228 :
      : /* All elements must equal the first for a VEC_DUPLICATE.  */
27229 150 : for (int i = 1; i < n_elts; i++)
27230 129 : if (!rtx_equal_p (cst, CONST_VECTOR_ELT (x, i)))
27231 : return NULL_RTX;
27232 :
27233 42 : rtx mem = force_const_mem (GET_MODE_INNER (mode), cst);
27234 21 : return gen_rtx_VEC_DUPLICATE (mode, validize_mem (mem));
27235 : }
27236 :
27237 : /* Determine the ternlog immediate index that implements 3-operand
27238 : ternary logic expression OP. This uses and modifies the 3 element
27239 : array ARGS to record and check the leaves, either 3 REGs, or 2 REGs
27240 : and MEM. Returns an index between 0 and 255 for a valid ternlog,
27241 : or -1 if the expression isn't suitable. */
      :
      : /* The truth-table encoding assigns 0xf0 to the first leaf (a),
      : 0xcc to the second (b) and 0xaa to the third (c); logic
      : operations then combine these masks bitwise, so the final value
      : is the vpternlog immediate for the whole expression.  */
27242 :
27243 : int
27244 7282473 : ix86_ternlog_idx (rtx op, rtx *args)
27245 : {
27246 7282473 : int idx0, idx1;
27247 :
27248 7282473 : if (!op)
27249 : return -1;
27250 :
27251 7282473 : switch (GET_CODE (op))
27252 : {
27253 751339 : case SUBREG:
27254 751339 : if (!register_operand (op, GET_MODE (op)))
27255 : return -1;
27256 : /* FALLTHRU */
27257 :
27258 3567619 : case REG:
      : /* Match against already-recorded leaves, or record OP in the
      : first free ARGS slot.  */
27259 3567619 : if (!args[0])
27260 : {
27261 1848745 : args[0] = op;
27262 1848745 : return 0xf0;
27263 : }
27264 1718874 : if (rtx_equal_p (op, args[0]))
27265 : return 0xf0;
27266 1692241 : if (!args[1])
27267 : {
27268 1426444 : args[1] = op;
27269 1426444 : return 0xcc;
27270 : }
27271 265797 : if (rtx_equal_p (op, args[1]))
27272 : return 0xcc;
27273 249217 : if (!args[2])
27274 : {
27275 226621 : args[2] = op;
27276 226621 : return 0xaa;
27277 : }
27278 22596 : if (rtx_equal_p (op, args[2]))
27279 : return 0xaa;
27280 : return -1;
27281 :
27282 18451 : case VEC_DUPLICATE:
27283 18451 : if (!bcst_mem_operand (op, GET_MODE (op)))
27284 : return -1;
27285 302 : goto do_mem_operand;
27286 :
27287 363580 : case MEM:
27288 363580 : if (!memory_operand (op, GET_MODE (op)))
27289 : return -1;
27290 363416 : if (MEM_P (op)
27291 363416 : && MEM_VOLATILE_P (op)
27292 363510 : && !volatile_ok)
27293 : return -1;
27294 : /* FALLTHRU */
27295 :
27296 471644 : case CONST_VECTOR:
27297 471644 : do_mem_operand:
      : /* Memory/constant leaves prefer the third (c) slot, which is
      : the operand position that vpternlog allows to be a memory or
      : broadcast operand.  */
27298 471644 : if (!args[2])
27299 : {
27300 424466 : args[2] = op;
27301 424466 : return 0xaa;
27302 : }
27303 : /* Maximum of one volatile memory reference per expression. */
27304 47178 : if (side_effects_p (op))
27305 : return -1;
27306 47178 : if (rtx_equal_p (op, args[2]))
27307 : return 0xaa;
27308 : /* Check if CONST_VECTOR is the ones-complement of args[2]. */
27309 47127 : if (CONST_VECTOR_P (op)
27310 3421 : && CONST_VECTOR_P (args[2])
27311 47372 : && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
27312 245 : op, GET_MODE (op)),
27313 : args[2]))
27314 : return 0x55;
27315 46940 : if (!args[0])
27316 : {
27317 45138 : args[0] = op;
27318 45138 : return 0xf0;
27319 : }
27320 1802 : if (rtx_equal_p (op, args[0]))
27321 : return 0xf0;
27322 : /* Check if CONST_VECTOR is the ones-complement of args[0]. */
27323 1802 : if (CONST_VECTOR_P (op)
27324 101 : && CONST_VECTOR_P (args[0])
27325 1844 : && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
27326 42 : op, GET_MODE (op)),
27327 : args[0]))
27328 : return 0x0f;
27329 1760 : if (!args[1])
27330 : {
27331 1748 : args[1] = op;
27332 1748 : return 0xcc;
27333 : }
27334 12 : if (rtx_equal_p (op, args[1]))
27335 : return 0xcc;
27336 : /* Check if CONST_VECTOR is the ones-complement of args[1]. */
27337 12 : if (CONST_VECTOR_P (op)
27338 0 : && CONST_VECTOR_P (args[1])
27339 12 : && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
27340 0 : op, GET_MODE (op)),
27341 : args[1]))
27342 : return 0x33;
27343 : return -1;
27344 :
      : /* Logic nodes recurse and combine the children's truth tables
      : with the corresponding bitwise operation.  */
27345 186535 : case NOT:
27346 186535 : idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
27347 186535 : return (idx0 >= 0) ? idx0 ^ 0xff : -1;
27348 :
27349 1295489 : case AND:
27350 1295489 : idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
27351 1295489 : if (idx0 < 0)
27352 : return -1;
27353 1067102 : idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
27354 1067102 : return (idx1 >= 0) ? idx0 & idx1 : -1;
27355 :
27356 956695 : case IOR:
27357 956695 : idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
27358 956695 : if (idx0 < 0)
27359 : return -1;
27360 711658 : idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
27361 711658 : return (idx1 >= 0) ? idx0 | idx1 : -1;
27362 :
27363 405401 : case XOR:
27364 405401 : idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
27365 405401 : if (idx0 < 0)
27366 : return -1;
27367 385728 : if (vector_all_ones_operand (XEXP (op, 1), GET_MODE (op)))
27368 6726 : return idx0 ^ 0xff;
27369 379002 : idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
27370 379002 : return (idx1 >= 0) ? idx0 ^ idx1 : -1;
27371 :
27372 7498 : case UNSPEC:
27373 7498 : if (XINT (op, 1) != UNSPEC_VTERNLOG
27374 0 : || XVECLEN (op, 0) != 4
27375 0 : || !CONST_INT_P (XVECEXP (op, 0, 3)))
27376 : return -1;
27377 :
27378 : /* TODO: Handle permuted operands. */
27379 0 : if (ix86_ternlog_idx (XVECEXP (op, 0, 0), args) != 0xf0
27380 0 : || ix86_ternlog_idx (XVECEXP (op, 0, 1), args) != 0xcc
27381 0 : || ix86_ternlog_idx (XVECEXP (op, 0, 2), args) != 0xaa)
27382 0 : return -1;
27383 0 : return INTVAL (XVECEXP (op, 0, 3));
27384 :
27385 : default:
27386 : return -1;
27387 : }
27388 : }
27389 :
27390 : /* Return TRUE if OP (in mode MODE) is the leaf of a ternary logic
27391 : expression, such as a register or a memory reference. */
27392 :
27393 : bool
27394 3375006 : ix86_ternlog_leaf_p (rtx op, machine_mode mode)
27395 : {
27396 : /* We can't use memory_operand here, as it may return a different
27397 : value before and after reload (for volatile MEMs) which creates
27398 : problems splitting instructions. */
27399 3375006 : return register_operand (op, mode)
27400 734365 : || MEM_P (op)
27401 384826 : || CONST_VECTOR_P (op)
27402 3657777 : || bcst_mem_operand (op, mode);
27403 : }
27404 :
27405 : /* Test whether OP is a 3-operand ternary logic expression suitable
27406 : for use in a ternlog instruction. */
      :
      : /* Rejects expressions that are better expanded as a single two
      : operand logic instruction (pand/pandn/por/pxor) even when a
      : valid ternlog immediate exists for them.  */
27407 :
27408 : bool
27409 2244378 : ix86_ternlog_operand_p (rtx op)
27410 : {
27411 2244378 : rtx op0, op1;
27412 2244378 : rtx args[3];
27413 :
27414 2244378 : args[0] = NULL_RTX;
27415 2244378 : args[1] = NULL_RTX;
27416 2244378 : args[2] = NULL_RTX;
27417 2244378 : int idx = ix86_ternlog_idx (op, args);
27418 2244378 : if (idx < 0)
27419 : return false;
27420 :
27421 : /* Don't match simple (binary or unary) expressions. */
27422 1824141 : machine_mode mode = GET_MODE (op);
27423 1824141 : switch (GET_CODE (op))
27424 : {
27425 837736 : case AND:
27426 837736 : op0 = XEXP (op, 0);
27427 837736 : op1 = XEXP (op, 1);
27428 :
27429 : /* Prefer pand. */
27430 837736 : if (ix86_ternlog_leaf_p (op0, mode)
27431 837736 : && ix86_ternlog_leaf_p (op1, mode))
27432 : return false;
27433 : /* Prefer pandn. */
27434 109360 : if (GET_CODE (op0) == NOT
27435 77778 : && register_operand (XEXP (op0, 0), mode)
27436 183516 : && ix86_ternlog_leaf_p (op1, mode))
27437 : return false;
27438 : break;
27439 :
27440 624658 : case IOR:
27441 : /* Prefer por. */
27442 624658 : if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
27443 624658 : && ix86_ternlog_leaf_p (XEXP (op, 1), mode))
27444 : return false;
27445 : break;
27446 :
27447 328561 : case XOR:
27448 328561 : op1 = XEXP (op, 1);
27449 : /* Prefer pxor, or one_cmpl<vmode>2. */
27450 328561 : if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
27451 328561 : && ix86_ternlog_leaf_p (XEXP (op, 1), mode))
27452 : return false;
27453 : break;
27454 :
27455 : default:
27456 : break;
27457 : }
27458 : return true;
27459 : }
27460 :
27461 : /* Helper function for ix86_expand_ternlog. */
      :
      : /* Emit a two-operand logic operation CODE (AND/IOR/XOR) in MODE on
      : OP0 and OP1 into TARGET and return TARGET, forcing operands into
      : the forms ix86_expand_vector_logical_operator accepts.  */
27462 : static rtx
27463 0 : ix86_expand_ternlog_binop (enum rtx_code code, machine_mode mode,
27464 : rtx op0, rtx op1, rtx target)
27465 : {
27466 0 : if (GET_MODE (op0) != mode)
27467 0 : op0 = gen_lowpart (mode, op0);
27468 0 : if (GET_MODE (op1) != mode)
27469 0 : op1 = gen_lowpart (mode, op1);
27470 :
27471 0 : if (CONST_VECTOR_P (op0))
27472 0 : op0 = validize_mem (force_const_mem (mode, op0));
27473 0 : if (CONST_VECTOR_P (op1))
27474 0 : op1 = validize_mem (force_const_mem (mode, op1));
27475 :
27476 0 : if (!register_operand (op0, mode))
27477 : {
27478 0 : if (!register_operand (op1, mode))
27479 : {
27480 : /* We can't use force_reg (op0, mode). */
27481 0 : rtx reg = gen_reg_rtx (mode);
27482 0 : emit_move_insn (reg, op0);
27483 0 : op0 = reg;
27484 : }
27485 : else
27486 : std::swap (op0, op1);
27487 : }
27488 0 : rtx ops[3] = { target, op0, op1 };
27489 0 : ix86_expand_vector_logical_operator (code, mode, ops);
27490 0 : return target;
27491 : }
27492 :
27493 :
27494 : /* Helper function for ix86_expand_ternlog. */
      :
      : /* Emit (~OP0 & OP1) in MODE into TARGET and return TARGET, i.e. the
      : pandn form; OP0 must already be a register per the callers.  */
27495 : static rtx
27496 0 : ix86_expand_ternlog_andnot (machine_mode mode, rtx op0, rtx op1, rtx target)
27497 : {
27498 0 : if (GET_MODE (op0) != mode)
27499 0 : op0 = gen_lowpart (mode, op0);
27500 0 : op0 = gen_rtx_NOT (mode, op0);
27501 0 : if (GET_MODE (op1) != mode)
27502 0 : op1 = gen_lowpart (mode, op1);
27503 0 : if (CONST_VECTOR_P (op1))
27504 0 : op1 = validize_mem (force_const_mem (mode, op1));
27505 0 : emit_move_insn (target, gen_rtx_AND (mode, op0, op1));
27506 0 : return target;
27507 : }
27508 :
27509 : /* Expand a 3-operand ternary logic expression. Return TARGET. */
      :
      : /* IDX is the vpternlog truth-table immediate over operands
      : OP0 (0xf0), OP1 (0xcc) and OP2 (0xaa).  Degenerate indices that
      : depend on fewer than three operands are expanded as moves or
      : two-operand logic instead of a vpternlog.  TARGET may be NULL,
      : in which case a fresh register is allocated.  */
27510 : rtx
27511 2429 : ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx,
27512 : rtx target)
27513 : {
27514 2429 : rtx tmp0, tmp1, tmp2;
27515 :
27516 2429 : if (!target)
27517 3 : target = gen_reg_rtx (mode);
27518 :
27519 : /* Canonicalize ternlog index for degenerate (duplicated) operands. */
27520 2429 : if (rtx_equal_p (op0, op1) && rtx_equal_p (op0, op2))
27521 0 : switch (idx & 0x81)
27522 : {
27523 : case 0x00:
27524 : idx = 0x00;
27525 : break;
27526 : case 0x01:
27527 : idx = 0x0f;
27528 : break;
27529 : case 0x80:
27530 : idx = 0xf0;
27531 : break;
27532 : case 0x81:
27533 : idx = 0xff;
27534 : break;
27535 : }
27536 :
      : /* Special-case indices that need at most two live operands; each
      : case guards against losing side effects in dropped operands.  */
27537 2429 : switch (idx & 0xff)
27538 : {
27539 0 : case 0x00:
27540 0 : if ((!op0 || !side_effects_p (op0))
27541 0 : && (!op1 || !side_effects_p (op1))
27542 0 : && (!op2 || !side_effects_p (op2)))
27543 : {
27544 0 : emit_move_insn (target, CONST0_RTX (mode));
27545 0 : return target;
27546 : }
27547 : break;
27548 :
27549 0 : case 0x0a: /* ~a&c */
27550 0 : if ((!op1 || !side_effects_p (op1))
27551 0 : && op0 && register_operand (op0, mode)
27552 0 : && op2 && ix86_ternlog_leaf_p (op2, mode))
27553 0 : return ix86_expand_ternlog_andnot (mode, op0, op2, target);
27554 : break;
27555 :
27556 0 : case 0x0c: /* ~a&b */
27557 0 : if ((!op2 || !side_effects_p (op2))
27558 0 : && op0 && register_operand (op0, mode)
27559 0 : && op1 && ix86_ternlog_leaf_p (op1, mode))
27560 0 : return ix86_expand_ternlog_andnot (mode, op0, op1, target);
27561 : break;
27562 :
27563 81 : case 0x0f: /* ~a */
27564 0 : if ((!op1 || !side_effects_p (op1))
27565 81 : && (!op2 || !side_effects_p (op2))
27566 162 : && op0)
27567 : {
27568 81 : emit_move_insn (target, gen_rtx_XOR (mode, op0, CONSTM1_RTX (mode)));
27569 81 : return target;
27570 : }
27571 : break;
27572 :
27573 0 : case 0x22: /* ~b&c */
27574 0 : if ((!op0 || !side_effects_p (op0))
27575 0 : && op1 && register_operand (op1, mode)
27576 0 : && op2 && ix86_ternlog_leaf_p (op2, mode))
27577 0 : return ix86_expand_ternlog_andnot (mode, op1, op2, target);
27578 : break;
27579 :
27580 0 : case 0x30: /* ~b&a */
27581 0 : if ((!op2 || !side_effects_p (op2))
27582 0 : && op0 && ix86_ternlog_leaf_p (op0, mode)
27583 0 : && op1 && register_operand (op1, mode))
27584 0 : return ix86_expand_ternlog_andnot (mode, op1, op0, target);
27585 : break;
27586 :
27587 0 : case 0x33: /* ~b */
27588 0 : if ((!op0 || !side_effects_p (op0))
27589 0 : && (!op2 || !side_effects_p (op2))
27590 0 : && op1)
27591 : {
27592 0 : emit_move_insn (target, gen_rtx_XOR (mode, op1, CONSTM1_RTX (mode)));
27593 0 : return target;
27594 : }
27595 : break;
27596 :
27597 0 : case 0x3c: /* a^b */
27598 0 : if (op0 && ix86_ternlog_leaf_p (op0, mode)
27599 0 : && op1 && ix86_ternlog_leaf_p (op1, mode)
27600 0 : && (!op2 || !side_effects_p (op2)))
27601 0 : return ix86_expand_ternlog_binop (XOR, mode, op0, op1, target);
27602 : break;
27603 :
27604 0 : case 0x44: /* ~c&b */
27605 0 : if ((!op0 || !side_effects_p (op0))
27606 0 : && op1 && ix86_ternlog_leaf_p (op1, mode)
27607 0 : && op2 && register_operand (op2, mode))
27608 0 : return ix86_expand_ternlog_andnot (mode, op2, op1, target);
27609 : break;
27610 :
27611 2 : case 0x50: /* ~c&a */
27612 0 : if ((!op1 || !side_effects_p (op1))
27613 2 : && op0 && ix86_ternlog_leaf_p (op0, mode)
27614 4 : && op2 && register_operand (op2, mode))
27615 0 : return ix86_expand_ternlog_andnot (mode, op2, op0, target);
27616 : break;
27617 :
27618 4 : case 0x55: /* ~c */
27619 1 : if ((!op0 || !side_effects_p (op0))
27620 4 : && (!op1 || !side_effects_p (op1))
27621 8 : && op2)
27622 : {
27623 4 : emit_move_insn (target, gen_rtx_XOR (mode, op2, CONSTM1_RTX (mode)));
27624 4 : return target;
27625 : }
27626 : break;
27627 :
27628 0 : case 0x5a: /* a^c */
27629 0 : if (op0 && ix86_ternlog_leaf_p (op0, mode)
27630 0 : && op2 && ix86_ternlog_leaf_p (op2, mode)
27631 0 : && (!op1 || !side_effects_p (op1)))
27632 0 : return ix86_expand_ternlog_binop (XOR, mode, op0, op2, target);
27633 : break;
27634 :
27635 0 : case 0x66: /* b^c */
27636 0 : if ((!op0 || !side_effects_p (op0))
27637 0 : && op1 && ix86_ternlog_leaf_p (op1, mode)
27638 0 : && op2 && ix86_ternlog_leaf_p (op2, mode))
27639 0 : return ix86_expand_ternlog_binop (XOR, mode, op1, op2, target);
27640 : break;
27641 :
27642 0 : case 0x88: /* b&c */
27643 0 : if ((!op0 || !side_effects_p (op0))
27644 0 : && op1 && ix86_ternlog_leaf_p (op1, mode)
27645 0 : && op2 && ix86_ternlog_leaf_p (op2, mode))
27646 0 : return ix86_expand_ternlog_binop (AND, mode, op1, op2, target);
27647 : break;
27648 :
27649 0 : case 0xa0: /* a&c */
27650 0 : if ((!op1 || !side_effects_p (op1))
27651 0 : && op0 && ix86_ternlog_leaf_p (op0, mode)
27652 0 : && op2 && ix86_ternlog_leaf_p (op2, mode))
27653 0 : return ix86_expand_ternlog_binop (AND, mode, op0, op2, target);
27654 : break;
27655 :
27656 0 : case 0xaa: /* c */
27657 0 : if ((!op0 || !side_effects_p (op0))
27658 0 : && (!op1 || !side_effects_p (op1))
27659 0 : && op2)
27660 : {
27661 0 : if (GET_MODE (op2) != mode)
27662 0 : op2 = gen_lowpart (mode, op2);
27663 0 : emit_move_insn (target, op2);
27664 0 : return target;
27665 : }
27666 : break;
27667 :
27668 0 : case 0xc0: /* a&b */
27669 0 : if (op0 && ix86_ternlog_leaf_p (op0, mode)
27670 0 : && op1 && ix86_ternlog_leaf_p (op1, mode)
27671 0 : && (!op2 || !side_effects_p (op2)))
27672 0 : return ix86_expand_ternlog_binop (AND, mode, op0, op1, target);
27673 : break;
27674 :
27675 0 : case 0xcc: /* b */
27676 0 : if ((!op0 || !side_effects_p (op0))
27677 0 : && op1
27678 0 : && (!op2 || !side_effects_p (op2)))
27679 : {
27680 0 : if (GET_MODE (op1) != mode)
27681 0 : op1 = gen_lowpart (mode, op1);
27682 0 : emit_move_insn (target, op1);
27683 0 : return target;
27684 : }
27685 : break;
27686 :
27687 0 : case 0xee: /* b|c */
27688 0 : if ((!op0 || !side_effects_p (op0))
27689 0 : && op1 && ix86_ternlog_leaf_p (op1, mode)
27690 0 : && op2 && ix86_ternlog_leaf_p (op2, mode))
27691 0 : return ix86_expand_ternlog_binop (IOR, mode, op1, op2, target);
27692 : break;
27693 :
27694 6 : case 0xf0: /* a */
27695 6 : if (op0
27696 6 : && (!op1 || !side_effects_p (op1))
27697 12 : && (!op2 || !side_effects_p (op2)))
27698 : {
27699 6 : if (GET_MODE (op0) != mode)
27700 0 : op0 = gen_lowpart (mode, op0);
27701 6 : emit_move_insn (target, op0);
27702 6 : return target;
27703 : }
27704 : break;
27705 :
27706 0 : case 0xfa: /* a|c */
27707 0 : if (op0 && ix86_ternlog_leaf_p (op0, mode)
27708 0 : && op2 && ix86_ternlog_leaf_p (op2, mode)
27709 0 : && (!op1 || !side_effects_p (op1)))
27710 0 : return ix86_expand_ternlog_binop (IOR, mode, op0, op2, target);
27711 : break;
27712 :
27713 0 : case 0xfc: /* a|b */
27714 0 : if (op0 && ix86_ternlog_leaf_p (op0, mode)
27715 0 : && op1 && ix86_ternlog_leaf_p (op1, mode)
27716 0 : && (!op2 || !side_effects_p (op2)))
27717 0 : return ix86_expand_ternlog_binop (IOR, mode, op0, op1, target);
27718 : break;
27719 :
27720 0 : case 0xff:
27721 0 : if ((!op0 || !side_effects_p (op0))
27722 0 : && (!op1 || !side_effects_p (op1))
27723 0 : && (!op2 || !side_effects_p (op2)))
27724 : {
27725 0 : emit_move_insn (target, CONSTM1_RTX (mode));
27726 0 : return target;
27727 : }
27728 : break;
27729 : }
27730 :
      : /* General case: force the three operands into the forms the
      : UNSPEC_VTERNLOG pattern accepts (TMP0/TMP1 registers, TMP2
      : register, memory or broadcast) and emit the vpternlog.  */
27731 2338 : if (!register_operand (op0, mode))
27732 : {
27733 : /* We can't use force_reg (mode, op0). */
27734 12 : tmp0 = gen_reg_rtx (GET_MODE (op0));
27735 12 : emit_move_insn (tmp0,op0);
27736 : }
27737 : else
27738 : tmp0 = op0;
27739 2338 : if (GET_MODE (tmp0) != mode)
27740 0 : tmp0 = gen_lowpart (mode, tmp0);
27741 :
27742 2338 : if (!op1 || rtx_equal_p (op0, op1))
27743 6 : tmp1 = copy_rtx (tmp0);
27744 2332 : else if (!register_operand (op1, mode))
27745 : {
27746 : /* We can't use force_reg (mode, op1). */
27747 28 : tmp1 = gen_reg_rtx (GET_MODE (op1));
27748 28 : emit_move_insn (tmp1, op1);
27749 : }
27750 : else
27751 : tmp1 = op1;
27752 2338 : if (GET_MODE (tmp1) != mode)
27753 0 : tmp1 = gen_lowpart (mode, tmp1);
27754 :
27755 2338 : if (!op2 || rtx_equal_p (op0, op2))
27756 79 : tmp2 = copy_rtx (tmp0);
27757 2259 : else if (rtx_equal_p (op1, op2))
27758 0 : tmp2 = copy_rtx (tmp1);
27759 2259 : else if (CONST_VECTOR_P (op2))
27760 : {
      : /* Prefer an embedded-broadcast memory operand for the constant,
      : retrying in the 32-bit and 64-bit element views before
      : falling back to a full constant-pool load.  */
27761 43 : if (GET_MODE (op2) != mode)
27762 0 : op2 = gen_lowpart (mode, op2);
27763 43 : tmp2 = ix86_gen_bcst_mem (mode, op2);
27764 43 : if (!tmp2)
27765 : {
27766 25 : machine_mode bcst32_mode = mode;
27767 25 : machine_mode bcst64_mode = mode;
27768 25 : switch (mode)
27769 : {
27770 1 : case V1TImode:
27771 1 : case V4SImode:
27772 1 : case V4SFmode:
27773 1 : case V8HImode:
27774 1 : case V16QImode:
27775 1 : bcst32_mode = V4SImode;
27776 1 : bcst64_mode = V2DImode;
27777 1 : break;
27778 :
27779 0 : case V2TImode:
27780 0 : case V8SImode:
27781 0 : case V8SFmode:
27782 0 : case V16HImode:
27783 0 : case V32QImode:
27784 0 : bcst32_mode = V8SImode;
27785 0 : bcst64_mode = V4DImode;
27786 0 : break;
27787 :
27788 3 : case V4TImode:
27789 3 : case V16SImode:
27790 3 : case V16SFmode:
27791 3 : case V32HImode:
27792 3 : case V64QImode:
27793 3 : bcst32_mode = V16SImode;
27794 3 : bcst64_mode = V8DImode;
27795 3 : break;
27796 :
27797 : default:
27798 : break;
27799 : }
27800 :
27801 25 : if (bcst32_mode != mode)
27802 : {
27803 4 : tmp2 = gen_lowpart (bcst32_mode, op2);
27804 4 : if (ix86_gen_bcst_mem (bcst32_mode, tmp2))
27805 : {
27806 3 : tmp2 = ix86_expand_ternlog (bcst32_mode,
27807 3 : gen_lowpart (bcst32_mode, tmp0),
27808 3 : gen_lowpart (bcst32_mode, tmp1),
27809 : tmp2, idx, NULL_RTX);
27810 3 : emit_move_insn (target, gen_lowpart (mode, tmp2));
27811 3 : return target;
27812 : }
27813 : }
27814 :
27815 22 : if (bcst64_mode != mode)
27816 : {
27817 1 : tmp2 = gen_lowpart (bcst64_mode, op2);
27818 1 : if (ix86_gen_bcst_mem (bcst64_mode, tmp2))
27819 : {
27820 0 : tmp2 = ix86_expand_ternlog (bcst64_mode,
27821 0 : gen_lowpart (bcst64_mode, tmp0),
27822 0 : gen_lowpart (bcst64_mode, tmp1),
27823 : tmp2, idx, NULL_RTX);
27824 0 : emit_move_insn (target, gen_lowpart (mode, tmp2));
27825 0 : return target;
27826 : }
27827 : }
27828 :
27829 22 : tmp2 = force_const_mem (mode, op2);
27830 22 : rtx bcast = ix86_broadcast_from_constant (mode, tmp2);
27831 22 : tmp2 = validize_mem (tmp2);
27832 22 : if (bcast)
27833 : {
27834 12 : rtx reg2 = gen_reg_rtx (mode);
27835 12 : bool ok = ix86_expand_vector_init_duplicate (false, mode,
27836 : reg2, bcast);
27837 12 : if (ok)
27838 2335 : tmp2 = reg2;
27839 : }
27840 : }
27841 : }
27842 : else
27843 : tmp2 = op2;
27844 2335 : if (GET_MODE (tmp2) != mode)
27845 0 : tmp2 = gen_lowpart (mode, tmp2);
27846 : /* Some memory_operands are not vector_memory_operands. */
27847 2335 : if (!bcst_vector_operand (tmp2, mode))
27848 0 : tmp2 = force_reg (mode, tmp2);
27849 :
27850 2335 : rtvec vec = gen_rtvec (4, tmp0, tmp1, tmp2, GEN_INT (idx));
27851 2335 : emit_move_insn (target, gen_rtx_UNSPEC (mode, vec, UNSPEC_VTERNLOG));
27852 2335 : return target;
27853 : }
27854 :
27855 : /* GF2P8AFFINEQB matrices to implement shift and rotate. */
      :
      : /* Each table is indexed by shift count 1..7 (index 0 is unused);
      : each 64-bit value is the GF(2) affine matrix, one byte per
      : matrix row, consumed by ix86_vgf2p8affine_shift_matrix below.  */
27856 :
27857 : static const uint64_t matrix_ashift[8] =
27858 : {
27859 : 0,
27860 : 0x0001020408102040, /* 1 l */
27861 : 0x0000010204081020, /* 2 l */
27862 : 0x0000000102040810, /* 3 l */
27863 : 0x0000000001020408, /* 4 l */
27864 : 0x0000000000010204, /* 5 l */
27865 : 0x0000000000000102, /* 6 l */
27866 : 0x0000000000000001 /* 7 l */
27867 : };
27868 :
27869 : static const uint64_t matrix_lshiftrt[8] =
27870 : {
27871 : 0,
27872 : 0x0204081020408000, /* 1 r */
27873 : 0x0408102040800000, /* 2 r */
27874 : 0x0810204080000000, /* 3 r */
27875 : 0x1020408000000000, /* 4 r */
27876 : 0x2040800000000000, /* 5 r */
27877 : 0x4080000000000000, /* 6 r */
27878 : 0x8000000000000000 /* 7 r */
27879 : };
27880 :
      : /* Arithmetic right shift: like matrix_lshiftrt but with the sign
      : row replicated into the vacated high positions.  */
27881 : static const uint64_t matrix_ashiftrt[8] =
27882 : {
27883 : 0,
27884 : 0x0204081020408080, /* 1 r */
27885 : 0x0408102040808080, /* 2 r */
27886 : 0x0810204080808080, /* 3 r */
27887 : 0x1020408080808080, /* 4 r */
27888 : 0x2040808080808080, /* 5 r */
27889 : 0x4080808080808080, /* 6 r */
27890 : 0x8080808080808080 /* 7 r */
27891 : };
27892 :
27893 : static const uint64_t matrix_rotate[8] =
27894 : {
27895 : 0,
27896 : 0x8001020408102040, /* 1 rol8 */
27897 : 0x4080010204081020, /* 2 rol8 */
27898 : 0x2040800102040810, /* 3 rol8 */
27899 : 0x1020408001020408, /* 4 rol8 */
27900 : 0x0810204080010204, /* 5 rol8 */
27901 : 0x0408102040800102, /* 6 rol8 */
27902 : 0x0204081020408001 /* 7 rol8 */
27903 : };
27904 :
27905 : static const uint64_t matrix_rotatert[8] =
27906 : {
27907 : 0,
27908 : 0x0204081020408001, /* 1 ror8 */
27909 : 0x0408102040800102, /* 2 ror8 */
27910 : 0x0810204080010204, /* 3 ror8 */
27911 : 0x1020408001020408, /* 4 ror8 */
27912 : 0x2040800102040810, /* 5 ror8 */
27913 : 0x4080010204081020, /* 6 ror8 */
27914 : 0x8001020408102040 /* 7 ror8 */
27915 : };
27916 :
27917 : /* Return rtx to load a 64bit GF2P8AFFINE GP(2) matrix implementing a shift
27918 : for CODE and shift count COUNT into register with vector of size of SRC. */
      :
      : /* COUNT must be a CONST_INT whose low 3 bits are non-zero; the
      : matrix is replicated across every 8-byte lane of SRC's mode.  */
27919 :
27920 : rtx
27921 189 : ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code)
27922 : {
27923 189 : machine_mode mode = GET_MODE (src);
27924 189 : const uint64_t *matrix;
27925 189 : unsigned shift = INTVAL (count) & 7;
27926 189 : gcc_assert (shift > 0 && shift < 8);
27927 :
27928 189 : switch (code)
27929 : {
27930 : case ASHIFT:
27931 : matrix = matrix_ashift;
27932 : break;
27933 26 : case ASHIFTRT:
27934 26 : matrix = matrix_ashiftrt;
27935 26 : break;
27936 28 : case LSHIFTRT:
27937 28 : matrix = matrix_lshiftrt;
27938 28 : break;
27939 32 : case ROTATE:
27940 32 : matrix = matrix_rotate;
27941 32 : break;
27942 33 : case ROTATERT:
27943 33 : matrix = matrix_rotatert;
27944 33 : break;
27945 0 : default:
27946 0 : gcc_unreachable ();
27947 : }
27948 :
      : /* Broadcast the 8 matrix bytes across all QImode elements.  */
27949 189 : int nelts = GET_MODE_NUNITS (mode);
27950 189 : rtvec vec = rtvec_alloc (nelts);
27951 189 : uint64_t ma = matrix[shift];
27952 7741 : for (int i = 0; i < nelts; i++)
27953 7552 : RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode);
27954 :
27955 189 : return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec));
27956 : }
27957 :
27958 : /* Trunc a vector to a narrow vector, like v4di -> v4si. */
27959 :
27960 : void
27961 63 : ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input, machine_mode cvt_mode)
27962 : {
27963 63 : machine_mode out_mode = GET_MODE (output);
27964 63 : machine_mode in_mode = GET_MODE (input);
27965 63 : int len = GET_MODE_SIZE (in_mode);
27966 252 : gcc_assert (len == GET_MODE_SIZE (cvt_mode)
27967 : && GET_MODE_INNER (out_mode) == GET_MODE_INNER (cvt_mode)
27968 : && (REG_P (input) || SUBREG_P (input)));
27969 63 : scalar_mode inner_out_mode = GET_MODE_INNER (out_mode);
27970 126 : int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
27971 63 : int out_innersize = GET_MODE_SIZE (inner_out_mode);
27972 :
27973 63 : struct expand_vec_perm_d d;
27974 63 : d.target = gen_reg_rtx (cvt_mode);
27975 63 : d.op0 = lowpart_subreg (cvt_mode, force_reg(in_mode, input), in_mode);
27976 63 : d.op1 = d.op0;
27977 63 : d.vmode = cvt_mode;
27978 63 : d.nelt = GET_MODE_NUNITS (cvt_mode);
27979 63 : d.testing_p = false;
27980 63 : d.one_operand_p = true;
27981 :
27982 : /* Init perm. Put the needed bits of input in order and
27983 : fill the rest of bits by default. */
27984 687 : for (int i = 0; i < d.nelt; ++i)
27985 : {
27986 624 : d.perm[i] = i;
27987 1248 : if (i < GET_MODE_NUNITS (out_mode))
27988 246 : d.perm[i] = i * (in_innersize / out_innersize);
27989 : }
27990 :
27991 63 : bool ok = ix86_expand_vec_perm_const_1(&d);
27992 63 : gcc_assert (ok);
27993 63 : emit_move_insn (output, gen_lowpart (out_mode, d.target));
27994 63 : }
27995 :
27996 : /* Implement truncv8sfv8bf2 with vector permutation. */
27997 : void
27998 8 : ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx src)
27999 : {
28000 8 : machine_mode vperm_mode, src_mode = GET_MODE (src);
28001 8 : switch (src_mode)
28002 : {
28003 : case V16SFmode:
28004 : vperm_mode = V32BFmode;
28005 : break;
28006 2 : case V8SFmode:
28007 2 : vperm_mode = V16BFmode;
28008 2 : break;
28009 4 : case V4SFmode:
28010 4 : vperm_mode = V8BFmode;
28011 4 : break;
28012 0 : default:
28013 0 : gcc_unreachable ();
28014 : }
28015 :
28016 8 : int nelt = GET_MODE_NUNITS (vperm_mode);
28017 8 : vec_perm_builder sel (nelt, nelt, 1);
28018 8 : sel.quick_grow (nelt);
28019 136 : for (int i = 0; i != nelt; i++)
28020 128 : sel[i] = (2 * i + 1) % nelt;
28021 16 : vec_perm_indices indices (sel, 1, nelt);
28022 :
28023 8 : rtx target = gen_reg_rtx (vperm_mode);
28024 8 : rtx op0 = lowpart_subreg (vperm_mode,
28025 : force_reg (src_mode, src),
28026 : src_mode);
28027 8 : bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
28028 : target, op0, op0, indices);
28029 8 : gcc_assert (ok);
28030 8 : emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
28031 8 : }
28032 :
28033 : /* Implement extendv8bf2v8sf2 with vector permutation. */
28034 : void
28035 8 : ix86_expand_vector_bf2sf_with_vec_perm (rtx dest, rtx src)
28036 : {
28037 8 : machine_mode vperm_mode, src_mode = GET_MODE (src);
28038 8 : switch (src_mode)
28039 : {
28040 : case V16BFmode:
28041 : vperm_mode = V32BFmode;
28042 : break;
28043 2 : case V8BFmode:
28044 2 : vperm_mode = V16BFmode;
28045 2 : break;
28046 4 : case V4BFmode:
28047 4 : vperm_mode = V8BFmode;
28048 4 : break;
28049 0 : default:
28050 0 : gcc_unreachable ();
28051 : }
28052 :
28053 8 : int nelt = GET_MODE_NUNITS (vperm_mode);
28054 8 : vec_perm_builder sel (nelt, nelt, 1);
28055 8 : sel.quick_grow (nelt);
28056 136 : for (int i = 0, k = 0, j = nelt; i != nelt; i++)
28057 128 : sel[i] = i & 1 ? j++ : k++;
28058 :
28059 16 : vec_perm_indices indices (sel, 2, nelt);
28060 :
28061 8 : rtx target = gen_reg_rtx (vperm_mode);
28062 8 : rtx op1 = lowpart_subreg (vperm_mode,
28063 : force_reg (src_mode, src),
28064 : src_mode);
28065 8 : rtx op0 = CONST0_RTX (vperm_mode);
28066 8 : bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
28067 : target, op0, op1, indices);
28068 8 : gcc_assert (ok);
28069 8 : emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
28070 8 : }
28071 :
28072 :
28073 : #include "gt-i386-expand.h"
|