Line data Source code
1 : /* Copyright (C) 1988-2026 Free Software Foundation, Inc.
2 :
3 : This file is part of GCC.
4 :
5 : GCC is free software; you can redistribute it and/or modify
6 : it under the terms of the GNU General Public License as published by
7 : the Free Software Foundation; either version 3, or (at your option)
8 : any later version.
9 :
10 : GCC is distributed in the hope that it will be useful,
11 : but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : GNU General Public License for more details.
14 :
15 : You should have received a copy of the GNU General Public License
16 : along with GCC; see the file COPYING3. If not see
17 : <http://www.gnu.org/licenses/>. */
18 :
19 : #define IN_TARGET_CODE 1
20 :
21 : #include "config.h"
22 : #include "system.h"
23 : #include "coretypes.h"
24 : #include "backend.h"
25 : #include "rtl.h"
26 : #include "tree.h"
27 : #include "memmodel.h"
28 : #include "gimple.h"
29 : #include "cfghooks.h"
30 : #include "cfgloop.h"
31 : #include "df.h"
32 : #include "tm_p.h"
33 : #include "stringpool.h"
34 : #include "expmed.h"
35 : #include "optabs.h"
36 : #include "regs.h"
37 : #include "emit-rtl.h"
38 : #include "recog.h"
39 : #include "cgraph.h"
40 : #include "diagnostic.h"
41 : #include "cfgbuild.h"
42 : #include "alias.h"
43 : #include "fold-const.h"
44 : #include "attribs.h"
45 : #include "calls.h"
46 : #include "stor-layout.h"
47 : #include "varasm.h"
48 : #include "output.h"
49 : #include "insn-attr.h"
50 : #include "flags.h"
51 : #include "except.h"
52 : #include "explow.h"
53 : #include "expr.h"
54 : #include "cfgrtl.h"
55 : #include "common/common-target.h"
56 : #include "langhooks.h"
57 : #include "reload.h"
58 : #include "gimplify.h"
59 : #include "dwarf2.h"
60 : #include "tm-constrs.h"
61 : #include "cselib.h"
62 : #include "sched-int.h"
63 : #include "opts.h"
64 : #include "tree-pass.h"
65 : #include "context.h"
66 : #include "pass_manager.h"
67 : #include "target-globals.h"
68 : #include "gimple-iterator.h"
69 : #include "shrink-wrap.h"
70 : #include "builtins.h"
71 : #include "rtl-iter.h"
72 : #include "tree-iterator.h"
73 : #include "dbgcnt.h"
74 : #include "case-cfn-macros.h"
75 : #include "dojump.h"
76 : #include "fold-const-call.h"
77 : #include "tree-vrp.h"
78 : #include "tree-ssanames.h"
79 : #include "selftest.h"
80 : #include "selftest-rtl.h"
81 : #include "print-rtl.h"
82 : #include "intl.h"
83 : #include "ifcvt.h"
84 : #include "symbol-summary.h"
85 : #include "sreal.h"
86 : #include "ipa-cp.h"
87 : #include "ipa-prop.h"
88 : #include "ipa-fnsummary.h"
89 : #include "wide-int-bitmask.h"
90 : #include "tree-vector-builder.h"
91 : #include "debug.h"
92 : #include "dwarf2out.h"
93 : #include "i386-options.h"
94 : #include "i386-builtins.h"
95 : #include "i386-expand.h"
96 : #include "asan.h"
97 :
98 : /* Split one or more double-mode RTL references into pairs of half-mode
99 : references. The RTL can be REG, offsettable MEM, integer constant, or
100 : CONST_DOUBLE. "operands" is a pointer to an array of double-mode RTLs to
101 : split and "num" is its length. lo_half and hi_half are output arrays
102 : that parallel "operands". */
103 :
104 : void
105 4150663 : split_double_mode (machine_mode mode, rtx operands[],
106 : int num, rtx lo_half[], rtx hi_half[])
107 : {
108 4150663 : machine_mode half_mode;
109 4150663 : unsigned int byte;
 : /* Cache the most recently split MEM operand so that another operand
 : which is rtx_equal_p to it reuses the same half references. */
110 4150663 : rtx mem_op = NULL_RTX;
111 4150663 : int mem_num = 0;
112 :
 : /* Select the half-width mode; only the doubleword integer modes and
 : the partial-int pair modes P2HI/P2QI are supported. */
113 4150663 : switch (mode)
114 : {
115 : case E_TImode:
116 : half_mode = DImode;
117 : break;
118 605755 : case E_DImode:
119 605755 : half_mode = SImode;
120 605755 : break;
121 6 : case E_P2HImode:
122 6 : half_mode = HImode;
123 6 : break;
124 30 : case E_P2QImode:
125 30 : half_mode = QImode;
126 30 : break;
127 0 : default:
128 0 : gcc_unreachable ();
129 : }
130 :
 : /* Offset (in bytes) of the high half within the double-mode value. */
131 4150663 : byte = GET_MODE_SIZE (half_mode);
132 :
133 8515876 : while (num--)
134 : {
135 4365213 : rtx op = operands[num];
136 :
137 : /* simplify_subreg refuse to split volatile memory addresses,
138 : but we still have to handle it. */
139 4365213 : if (MEM_P (op))
140 : {
141 1729593 : if (mem_op && rtx_equal_p (op, mem_op))
142 : {
143 2426 : lo_half[num] = lo_half[mem_num];
144 2426 : hi_half[num] = hi_half[mem_num];
145 : }
146 : else
147 : {
148 1727167 : mem_op = op;
149 1727167 : mem_num = num;
150 1727167 : lo_half[num] = adjust_address (op, half_mode, 0);
151 1727167 : hi_half[num] = adjust_address (op, half_mode, byte);
152 : }
153 : }
154 : else
155 : {
 : /* REG / constant case: constants carry VOIDmode, so substitute
 : the requested double mode when asking for the subreg. */
156 2635620 : lo_half[num] = simplify_gen_subreg (half_mode, op,
157 2635620 : GET_MODE (op) == VOIDmode
158 : ? mode : GET_MODE (op), 0);
159 :
160 2635620 : rtx tmp = simplify_gen_subreg (half_mode, op,
161 2635620 : GET_MODE (op) == VOIDmode
162 2635620 : ? mode : GET_MODE (op), byte);
163 : /* simplify_gen_subreg will return NULL RTX for the
164 : high half of the paradoxical subreg. */
165 2635620 : hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
166 : }
167 : }
168 4150663 : }
169 :
170 : /* Emit the double word assignment DST = { LO, HI }. */
171 :
172 : void
173 101248 : split_double_concat (machine_mode mode, rtx dst, rtx lo, rtx hi)
174 : {
175 101248 : rtx dlo, dhi;
176 101248 : int deleted_move_count = 0;
177 101248 : split_double_mode (mode, &dst, 1, &dlo, &dhi);
178 : /* Constraints ensure that if both lo and hi are MEMs, then
179 : dst has early-clobber and thus addresses of MEMs don't use
180 : dlo/dhi registers. Otherwise if at least one of lo and hi are MEMs,
181 : dlo/dhi are registers. */
182 101248 : if (MEM_P (lo)
183 5524 : && rtx_equal_p (dlo, hi)
184 102237 : && reg_overlap_mentioned_p (dhi, lo))
185 : {
186 : /* If dlo is same as hi and lo's address uses dhi register,
187 : code below would first emit_move_insn (dhi, hi)
188 : and then emit_move_insn (dlo, lo). But the former
189 : would invalidate lo's address. Load into dhi first,
190 : then swap. */
191 193 : emit_move_insn (dhi, lo);
192 193 : lo = dhi;
193 : }
194 101055 : else if (MEM_P (hi)
195 9657 : && !MEM_P (lo)
196 6800 : && !rtx_equal_p (dlo, lo)
197 102618 : && reg_overlap_mentioned_p (dlo, hi))
198 : {
199 : /* In this case, code below would first emit_move_insn (dlo, lo)
200 : and then emit_move_insn (dhi, hi). But the former would
201 : invalidate hi's address. */
202 15 : if (rtx_equal_p (dhi, lo))
203 : {
204 : /* We can't load into dhi first, so load into dlo
205 : first and we'll swap. */
206 9 : emit_move_insn (dlo, hi);
207 9 : hi = dlo;
208 : }
209 : else
210 : {
211 : /* Load into dhi first. */
212 6 : emit_move_insn (dhi, hi);
213 6 : hi = dhi;
214 : }
215 : }
 : /* Normal order: low half first, skipping halves already in place. */
216 101248 : if (!rtx_equal_p (dlo, hi))
217 : {
218 87460 : if (!rtx_equal_p (dlo, lo))
219 38644 : emit_move_insn (dlo, lo);
220 : else
221 : deleted_move_count++;
222 87460 : if (!rtx_equal_p (dhi, hi))
223 81378 : emit_move_insn (dhi, hi);
224 : else
225 6082 : deleted_move_count++;
226 : }
 : /* dlo == hi: emit the high half first so hi isn't clobbered. */
227 13788 : else if (!rtx_equal_p (lo, dhi))
228 : {
229 6857 : if (!rtx_equal_p (dhi, hi))
230 6857 : emit_move_insn (dhi, hi);
231 : else
232 : deleted_move_count++;
233 6857 : if (!rtx_equal_p (dlo, lo))
234 6761 : emit_move_insn (dlo, lo);
235 : else
236 96 : deleted_move_count++;
237 : }
 : /* dlo == hi and dhi == lo: the halves are crossed, exchange them. */
238 6931 : else if (mode == TImode)
239 6913 : emit_insn (gen_swapdi (dlo, dhi));
240 : else
241 18 : emit_insn (gen_swapsi (dlo, dhi));
242 :
 : /* If both moves were omitted, emit a deleted-insn note — NOTE(review):
 : presumably so this splitter still emits at least one insn; confirm. */
243 101248 : if (deleted_move_count == 2)
244 3202 : emit_note (NOTE_INSN_DELETED);
245 101248 : }
246 :
247 :
248 : /* Generate either "mov $0, reg" or "xor reg, reg", as appropriate
249 : for the target. */
250 :
251 : void
252 112711 : ix86_expand_clear (rtx dest)
253 : {
254 112711 : rtx tmp;
255 :
256 : /* We play register width games, which are only valid after reload. */
257 112711 : gcc_assert (reload_completed);
258 :
259 : /* Avoid HImode and its attendant prefix byte. */
260 225422 : if (GET_MODE_SIZE (GET_MODE (dest)) < 4)
261 991 : dest = gen_rtx_REG (SImode, REGNO (dest));
262 112711 : tmp = gen_rtx_SET (dest, const0_rtx);
263 :
 : /* When the xor form will be used, it clobbers the flags, so wrap
 : the set together with a FLAGS_REG clobber in a PARALLEL. */
264 112711 : if (!TARGET_USE_MOV0 || optimize_insn_for_size_p ())
265 : {
266 112711 : rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
267 112711 : tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
268 : }
269 :
270 112711 : emit_insn (tmp);
271 112711 : }
272 :
273 : /* Return true if V can be broadcasted from an integer of WIDTH bits
274 : which is returned in VAL_BROADCAST. Otherwise, return false. */
275 :
276 : static bool
277 4851 : ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
278 : HOST_WIDE_INT &val_broadcast)
279 : {
280 4851 : wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
 : /* The candidate broadcast value is the lowest WIDTH-bit chunk. */
281 4851 : val_broadcast = wi::extract_uhwi (val, 0, width);
 : /* Every other WIDTH-bit chunk of V must match the lowest one. */
282 6543 : for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
283 : {
284 5089 : HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
285 5089 : if (val_broadcast != each)
286 : return false;
287 : }
 : /* Sign-extend from WIDTH bits so the result is a canonical
 : HOST_WIDE_INT value. */
288 1454 : val_broadcast = sext_hwi (val_broadcast, width);
289 1454 : return true;
290 4851 : }
291 :
292 : /* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */
293 :
294 : rtx
295 35293 : ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
296 : {
297 : /* Don't use integer vector broadcast if we can't move from GPR to SSE
298 : register directly. */
299 35293 : if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
300 : return nullptr;
301 :
302 35293 : unsigned int msize = GET_MODE_SIZE (mode);
303 :
304 : /* Only optimized for vpbroadcast[bwsd]/vbroadcastss with xmm/ymm/zmm. */
305 35293 : if (msize != 16 && msize != 32 && msize != 64)
306 : return nullptr;
307 :
308 : /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
309 : broadcast only if vector broadcast is available. */
310 35293 : if (!TARGET_AVX
311 1610 : || !CONST_WIDE_INT_P (op)
312 1603 : || standard_sse_constant_p (op, mode)
313 36896 : || (CONST_WIDE_INT_NUNITS (op) * HOST_BITS_PER_WIDE_INT
314 1603 : != GET_MODE_BITSIZE (mode)))
315 33698 : return nullptr;
316 :
 : /* Try successively wider element widths, smallest first. */
317 1595 : HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
318 1595 : HOST_WIDE_INT val_broadcast;
319 1595 : scalar_int_mode broadcast_mode;
320 : /* vpbroadcastb zmm requires TARGET_AVX512BW. */
321 712 : if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
322 2089 : && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
323 : val_broadcast))
324 : broadcast_mode = QImode;
325 654 : else if ((msize == 64 ? TARGET_AVX512BW : TARGET_AVX2)
326 1968 : && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
327 : val_broadcast))
328 : broadcast_mode = HImode;
329 : /* vbroadcasts[sd] only support memory operand w/o AVX2.
330 : When msize == 16, pshufs is used for vec_duplicate.
331 : when msize == 64, vpbroadcastd is used, and TARGET_AVX512F must exist. */
332 412 : else if ((msize != 32 || TARGET_AVX2)
333 1768 : && ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
334 : val_broadcast))
335 : broadcast_mode = SImode;
336 1391 : else if (TARGET_64BIT && (msize != 32 || TARGET_AVX2)
337 2641 : && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
338 : val_broadcast))
339 : broadcast_mode = DImode;
340 : else
341 141 : return nullptr;
342 :
343 : /* Check if OP can be broadcasted from VAL. */
344 1776 : for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
345 1561 : if (val != CONST_WIDE_INT_ELT (op, i))
346 : return nullptr;
347 :
 : /* Build the broadcast in a vector of BROADCAST_MODE elements, then
 : view the result as MODE. */
348 215 : unsigned int nunits = (GET_MODE_SIZE (mode)
349 215 : / GET_MODE_SIZE (broadcast_mode));
350 215 : machine_mode vector_mode;
351 215 : if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
352 0 : gcc_unreachable ();
353 215 : rtx target = gen_reg_rtx (vector_mode);
354 215 : bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
355 : target,
356 : GEN_INT (val_broadcast));
357 215 : if (!ok)
358 : return nullptr;
359 215 : target = lowpart_subreg (mode, target, vector_mode);
360 215 : return target;
361 : }
362 :
363 : void
364 73032809 : ix86_expand_move (machine_mode mode, rtx operands[])
365 : {
366 73032809 : rtx op0, op1;
367 73032809 : rtx tmp, addend = NULL_RTX;
368 73032809 : enum tls_model model;
369 :
370 73032809 : op0 = operands[0];
371 73032809 : op1 = operands[1];
372 :
373 : /* Avoid complex sets of likely spilled hard registers before reload. */
374 73032809 : if (!ix86_hardreg_mov_ok (op0, op1))
375 : {
 : /* Move the source into a fresh pseudo first (recursively), then
 : fall through to move that pseudo into the hard register. */
376 139360 : tmp = gen_reg_rtx (mode);
377 139360 : operands[0] = tmp;
378 139360 : ix86_expand_move (mode, operands);
379 139360 : operands[0] = op0;
380 139360 : operands[1] = tmp;
381 139360 : op1 = tmp;
382 : }
383 :
 : /* Canonicalize symbolic sources: TLS symbols, forced GOT loads,
 : and symbol+offset CONST expressions. */
384 73032809 : switch (GET_CODE (op1))
385 : {
386 347483 : case CONST:
387 347483 : tmp = XEXP (op1, 0);
388 :
389 347483 : if (GET_CODE (tmp) != PLUS
390 335812 : || !SYMBOL_REF_P (XEXP (tmp, 0)))
391 : break;
392 :
 : /* (const (plus (symbol_ref) (const_int))): peel off the addend
 : and handle the bare symbol below. */
393 333149 : op1 = XEXP (tmp, 0);
394 333149 : addend = XEXP (tmp, 1);
395 : /* FALLTHRU */
396 :
397 4881389 : case SYMBOL_REF:
398 4881389 : model = SYMBOL_REF_TLS_MODEL (op1);
399 :
400 4881389 : if (model)
401 10126 : op1 = legitimize_tls_address (op1, model, true);
402 4871263 : else if (ix86_force_load_from_GOT_p (op1))
403 : {
404 : /* Load the external function address via GOT slot to avoid PLT. */
405 24 : op1 = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, op1),
406 : (TARGET_64BIT
407 : ? UNSPEC_GOTPCREL
408 : : UNSPEC_GOT));
409 24 : op1 = gen_rtx_CONST (Pmode, op1);
410 24 : op1 = gen_const_mem (Pmode, op1);
411 20 : set_mem_alias_set (op1, GOT_ALIAS_SET);
412 : }
413 : else
414 : {
415 : #if TARGET_PECOFF
416 : tmp = legitimize_pe_coff_symbol (op1, addend != NULL_RTX);
417 :
418 : if (tmp)
419 : {
420 : op1 = tmp;
421 : if (!addend)
422 : break;
423 : }
424 : else
425 : #endif
426 4871243 : {
 : /* Plain symbol needing no special handling: restore the
 : original operand (including any addend). */
427 4871243 : op1 = operands[1];
428 4871243 : break;
429 : }
430 : }
431 :
 : /* Re-apply the addend that was peeled off above. */
432 10146 : if (addend)
433 : {
434 2786 : op1 = force_operand (op1, NULL_RTX);
435 2795 : op1 = expand_simple_binop (Pmode, PLUS, op1, addend,
436 : op0, 1, OPTAB_DIRECT);
437 : }
438 : else
439 7360 : op1 = force_operand (op1, op0);
440 :
441 10146 : if (op1 == op0)
442 : return;
443 :
444 1147 : op1 = convert_to_mode (mode, op1, 1);
445 :
 : /* FALLTHRU into the common path below. */
446 : default:
447 : break;
448 :
449 1491586 : case SUBREG:
450 : /* Transform TImode paradoxical SUBREG into zero_extendditi2. */
451 1491586 : if (TARGET_64BIT
452 1264538 : && mode == TImode
453 : && SUBREG_P (op1)
454 74465 : && GET_MODE (SUBREG_REG (op1)) == DImode
455 1537622 : && SUBREG_BYTE (op1) == 0)
456 46036 : op1 = gen_rtx_ZERO_EXTEND (TImode, SUBREG_REG (op1));
457 : /* As not all values in XFmode are representable in real_value,
458 : we might be called with unfoldable SUBREGs of constants. */
459 1491586 : if (mode == XFmode
460 3130 : && CONSTANT_P (SUBREG_REG (op1))
461 0 : && can_create_pseudo_p ())
462 : {
463 0 : machine_mode imode = GET_MODE (SUBREG_REG (op1));
464 0 : rtx r = force_const_mem (imode, SUBREG_REG (op1));
465 0 : if (r)
466 0 : r = validize_mem (r);
467 : else
468 0 : r = force_reg (imode, SUBREG_REG (op1));
469 0 : op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
470 : }
471 : break;
472 : }
473 :
 : /* Under PIC, symbolic sources must be legitimized (or loaded via a
 : register when the destination is memory). */
474 73023810 : if ((flag_pic || MACHOPIC_INDIRECT)
475 73023810 : && symbolic_operand (op1, mode))
476 : {
477 : #if TARGET_MACHO
478 : if (TARGET_MACHO && !TARGET_64BIT)
479 : {
480 : /* dynamic-no-pic */
481 : if (MACHOPIC_INDIRECT)
482 : {
483 : tmp = (op0 && REG_P (op0) && mode == Pmode)
484 : ? op0 : gen_reg_rtx (Pmode);
485 : op1 = machopic_indirect_data_reference (op1, tmp);
486 : if (MACHOPIC_PURE)
487 : op1 = machopic_legitimize_pic_address (op1, mode,
488 : tmp == op1 ? 0 : tmp);
489 : }
490 : if (op0 != op1 && !MEM_P (op0))
491 : {
492 : rtx insn = gen_rtx_SET (op0, op1);
493 : emit_insn (insn);
494 : return;
495 : }
496 : }
497 : #endif
498 :
499 333440 : if (MEM_P (op0))
500 87244 : op1 = force_reg (mode, op1);
501 246196 : else if (!(TARGET_64BIT && x86_64_movabs_operand (op1, DImode)))
502 : {
503 246156 : rtx reg = can_create_pseudo_p () ? NULL_RTX : op0;
504 246156 : op1 = legitimize_pic_address (op1, reg);
505 246156 : if (op0 == op1)
506 : return;
507 246156 : op1 = convert_to_mode (mode, op1, 1);
508 : }
509 : }
510 : else
511 : {
 : /* Mem-to-mem moves (other than pushes) need an intermediate
 : register. */
512 72690370 : if (MEM_P (op0)
513 99097018 : && (PUSH_ROUNDING (GET_MODE_SIZE (mode)) != GET_MODE_SIZE (mode)
514 10673619 : || !push_operand (op0, mode))
515 84895560 : && MEM_P (op1))
516 2159040 : op1 = force_reg (mode, op1);
517 :
518 72690370 : if (push_operand (op0, mode)
519 72690370 : && ! general_no_elim_operand (op1, mode))
520 1004 : op1 = copy_to_mode_reg (mode, op1);
521 :
522 : /* Force large constants in 64bit compilation into register
523 : to get them CSEed. */
524 72690370 : if (can_create_pseudo_p ()
525 66999401 : && (mode == DImode) && TARGET_64BIT
526 34834846 : && immediate_operand (op1, mode)
527 7863976 : && !x86_64_zext_immediate_operand (op1, VOIDmode)
528 717045 : && !register_operand (op0, mode)
529 72864013 : && optimize)
530 121927 : op1 = copy_to_mode_reg (mode, op1);
531 :
532 72690370 : if (can_create_pseudo_p ())
533 : {
534 66999401 : if (CONST_DOUBLE_P (op1))
535 : {
536 : /* If we are loading a floating point constant to a
537 : register, force the value to memory now, since we'll
538 : get better code out the back end. */
539 :
540 896725 : op1 = validize_mem (force_const_mem (mode, op1));
541 896725 : if (!register_operand (op0, mode))
542 : {
543 129652 : tmp = gen_reg_rtx (mode);
544 129652 : emit_insn (gen_rtx_SET (tmp, op1));
545 129652 : emit_move_insn (op0, tmp);
546 129652 : return;
547 : }
548 : }
549 : }
550 : }
551 :
552 : /* Special case inserting 64-bit values into a TImode register. */
553 72894158 : if (TARGET_64BIT
554 : /* Disable for -O0 (see PR110587) unless naked (PR110533). */
555 63213413 : && (optimize || ix86_function_naked (current_function_decl))
556 43463148 : && (mode == DImode || mode == DFmode)
557 29547109 : && SUBREG_P (op0)
558 479374 : && GET_MODE (SUBREG_REG (op0)) == TImode
559 396601 : && REG_P (SUBREG_REG (op0))
560 73290759 : && REG_P (op1))
561 : {
562 : /* Use *insvti_lowpart_1 to set lowpart. */
563 180336 : if (SUBREG_BYTE (op0) == 0)
564 : {
 : /* dst = (dst & ~low64) | zext (src). */
565 53951 : wide_int mask = wi::mask (64, true, 128);
566 53951 : tmp = immed_wide_int_const (mask, TImode);
567 53951 : op0 = SUBREG_REG (op0);
568 53951 : tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
569 53951 : if (mode == DFmode)
570 355 : op1 = gen_lowpart (DImode, op1);
571 53951 : op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
572 53951 : op1 = gen_rtx_IOR (TImode, tmp, op1);
573 53951 : }
574 : /* Use *insvti_highpart_1 to set highpart. */
575 126385 : else if (SUBREG_BYTE (op0) == 8)
576 : {
 : /* dst = (dst & low64) | (zext (src) << 64). */
577 126385 : wide_int mask = wi::mask (64, false, 128);
578 126385 : tmp = immed_wide_int_const (mask, TImode);
579 126385 : op0 = SUBREG_REG (op0);
580 126385 : tmp = gen_rtx_AND (TImode, copy_rtx (op0), tmp);
581 126385 : if (mode == DFmode)
582 206 : op1 = gen_lowpart (DImode, op1);
583 126385 : op1 = gen_rtx_ZERO_EXTEND (TImode, op1);
584 126385 : op1 = gen_rtx_ASHIFT (TImode, op1, GEN_INT (64));
585 126385 : op1 = gen_rtx_IOR (TImode, tmp, op1);
586 126385 : }
587 : }
588 :
589 72894158 : emit_insn (gen_rtx_SET (op0, op1));
590 : }
591 :
592 : /* OP is a memref of CONST_VECTOR, return scalar constant mem
593 : if CONST_VECTOR is a vec_duplicate, else return NULL. */
594 : rtx
595 2461728 : ix86_broadcast_from_constant (machine_mode mode, rtx op)
596 : {
597 2461728 : int nunits = GET_MODE_NUNITS (mode);
 : /* A single-element "vector" has nothing to broadcast. */
598 2461728 : if (nunits < 2)
599 : return nullptr;
600 :
601 : /* Don't use integer vector broadcast if we can't move from GPR to SSE
602 : register directly. */
603 2323341 : if (!TARGET_INTER_UNIT_MOVES_TO_VEC
604 8172 : && INTEGRAL_MODE_P (mode))
605 : return nullptr;
606 :
607 : /* Convert CONST_VECTOR to a non-standard SSE constant integer
608 : broadcast only if vector broadcast is available. */
609 2317779 : if (standard_sse_constant_p (op, mode))
610 : return nullptr;
611 :
612 4635552 : if (GET_MODE_INNER (mode) == TImode)
613 : return nullptr;
614 :
 : /* OP is a MEM referencing the constant pool; fetch the pooled
 : constant itself. */
615 2317666 : rtx constant = get_pool_constant (XEXP (op, 0));
616 2317666 : if (!CONST_VECTOR_P (constant))
617 : return nullptr;
618 :
619 : /* There could be some rtx like
620 : (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
621 : but with "*.LC1" refer to V2DI constant vector. */
622 2317666 : if (GET_MODE (constant) != mode)
623 : {
624 609 : constant = simplify_subreg (mode, constant, GET_MODE (constant),
625 : 0);
626 609 : if (constant == nullptr || !CONST_VECTOR_P (constant))
627 : return nullptr;
628 : }
629 :
630 2317666 : rtx first = XVECEXP (constant, 0, 0);
631 :
 : /* All elements must equal the first for this to be a broadcast. */
632 7628262 : for (int i = 1; i < nunits; ++i)
633 : {
634 7016480 : rtx tmp = XVECEXP (constant, 0, i);
635 : /* Vector duplicate value. */
636 7016480 : if (!rtx_equal_p (tmp, first))
637 : return nullptr;
638 : }
639 :
640 : return first;
641 : }
642 :
643 : void
644 4731949 : ix86_expand_vector_move (machine_mode mode, rtx operands[])
645 : {
646 4731949 : rtx op0 = operands[0], op1 = operands[1];
647 : /* Use GET_MODE_BITSIZE instead of GET_MODE_ALIGNMENT for IA MCU
648 : psABI since the biggest alignment is 4 byte for IA MCU psABI. */
649 4731949 : unsigned int align = (TARGET_IAMCU
650 4731949 : ? GET_MODE_BITSIZE (mode)
651 4731949 : : GET_MODE_ALIGNMENT (mode))
652 :
653 4731949 : if (push_operand (op0, VOIDmode))
654 2875 : op0 = emit_move_resolve_push (mode, op0);
655 :
656 : /* Force constants other than zero into memory. We do not know how
657 : the instructions used to build constants modify the upper 64 bits
658 : of the register, once we have that information we may be able
659 : to handle some of them more efficiently. */
660 4731949 : if (can_create_pseudo_p ()
661 4537479 : && (CONSTANT_P (op1)
662 4227665 : || (SUBREG_P (op1)
663 305643 : && CONSTANT_P (SUBREG_REG (op1))))
664 5041777 : && ((register_operand (op0, mode)
665 256345 : && !standard_sse_constant_p (op1, mode))
666 : /* ix86_expand_vector_move_misalign() does not like constants. */
667 : || (SSE_REG_MODE_P (mode)
668 251859 : && MEM_P (op0)
669 37505 : && MEM_ALIGN (op0) < align)))
670 : {
671 2065 : if (SUBREG_P (op1))
672 : {
673 14 : machine_mode imode = GET_MODE (SUBREG_REG (op1));
674 14 : rtx r = force_const_mem (imode, SUBREG_REG (op1));
675 14 : if (r)
676 14 : r = validize_mem (r);
677 : else
678 0 : r = force_reg (imode, SUBREG_REG (op1));
679 14 : op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
680 : }
681 : else
682 : {
 : /* Prefer a register broadcast over a constant-pool load when
 : the wide-int constant is a duplicated element. */
683 2051 : machine_mode mode = GET_MODE (op0);
684 2051 : rtx tmp = ix86_convert_const_wide_int_to_broadcast
685 2051 : (mode, op1);
686 2051 : if (tmp == nullptr)
687 2030 : op1 = validize_mem (force_const_mem (mode, op1));
688 : else
689 : op1 = tmp;
690 : }
691 : }
692 :
 : /* If the source is a constant-pool vector that duplicates a single
 : scalar, materialize it with a broadcast instead of a full load. */
693 4731949 : if (can_create_pseudo_p ()
694 4537479 : && GET_MODE_SIZE (mode) >= 16
695 3828947 : && VECTOR_MODE_P (mode)
696 8346178 : && (MEM_P (op1)
697 865749 : && SYMBOL_REF_P (XEXP (op1, 0))
698 488856 : && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
699 : {
700 472235 : rtx first = ix86_broadcast_from_constant (mode, op1);
701 472235 : if (first != nullptr)
702 : {
703 : /* Broadcast to XMM/YMM/ZMM register from an integer
704 : constant or scalar mem. */
705 120637 : rtx tmp = gen_reg_rtx (mode);
706 120637 : if (FLOAT_MODE_P (mode))
707 29290 : first = force_const_mem (GET_MODE_INNER (mode), first);
708 120637 : bool ok = ix86_expand_vector_init_duplicate (false, mode,
709 : tmp, first);
710 120637 : if (!ok && !TARGET_64BIT && GET_MODE_INNER (mode) == DImode)
711 : {
 : /* Retry with the scalar forced to memory. */
712 0 : first = force_const_mem (GET_MODE_INNER (mode), first);
713 0 : ok = ix86_expand_vector_init_duplicate (false, mode,
714 : tmp, first);
715 : }
716 120637 : if (ok)
717 : {
718 120637 : emit_move_insn (op0, tmp);
719 120637 : return;
720 : }
721 : }
722 : }
723 :
724 : /* We need to check memory alignment for SSE mode since attribute
725 : can make operands unaligned. */
726 4611312 : if (can_create_pseudo_p ()
727 : && SSE_REG_MODE_P (mode)
728 9349283 : && ((MEM_P (op0) && (MEM_ALIGN (op0) < align))
729 4157112 : || (MEM_P (op1) && (MEM_ALIGN (op1) < align))))
730 : {
731 488057 : rtx tmp[2];
732 :
733 : /* ix86_expand_vector_move_misalign() does not like both
734 : arguments in memory. */
735 488057 : if (!register_operand (op0, mode)
736 488057 : && !register_operand (op1, mode))
737 : {
738 153252 : rtx scratch = gen_reg_rtx (mode);
739 153252 : emit_move_insn (scratch, op1);
740 153252 : op1 = scratch;
741 : }
742 :
743 488057 : tmp[0] = op0; tmp[1] = op1;
744 488057 : ix86_expand_vector_move_misalign (mode, tmp);
745 488057 : return;
746 : }
747 :
748 : /* Special case TImode to 128-bit vector conversions via V2DI. */
749 1133337 : if (VECTOR_MODE_P (mode)
750 4072315 : && GET_MODE_SIZE (mode) == 16
751 2866126 : && SUBREG_P (op1)
752 236940 : && GET_MODE (SUBREG_REG (op1)) == TImode
753 3180 : && TARGET_64BIT && TARGET_SSE
754 4125782 : && ix86_pre_reload_split ())
755 : {
 : /* Move the two DImode halves separately and re-concatenate. */
756 2425 : rtx tmp = gen_reg_rtx (V2DImode);
757 2425 : rtx lo = gen_reg_rtx (DImode);
758 2425 : rtx hi = gen_reg_rtx (DImode);
759 2425 : emit_move_insn (lo, gen_lowpart (DImode, SUBREG_REG (op1)));
760 2425 : emit_move_insn (hi, gen_highpart (DImode, SUBREG_REG (op1)));
761 2425 : emit_insn (gen_vec_concatv2di (tmp, lo, hi));
762 2425 : emit_move_insn (op0, gen_lowpart (mode, tmp));
763 2425 : return;
764 : }
765 :
766 : /* If operand0 is a hard register, make operand1 a pseudo. */
767 4120830 : if (can_create_pseudo_p ()
768 8047190 : && !ix86_hardreg_mov_ok (op0, op1))
769 : {
770 125 : rtx tmp = gen_reg_rtx (GET_MODE (op0));
771 125 : emit_move_insn (tmp, op1);
772 125 : emit_move_insn (op0, tmp);
773 125 : return;
774 : }
775 :
776 : /* Make operand1 a register if it isn't already. */
777 4120705 : if (can_create_pseudo_p ()
778 3926235 : && !register_operand (op0, mode)
779 5220383 : && !register_operand (op1, mode))
780 : {
781 212831 : rtx tmp = gen_reg_rtx (GET_MODE (op0));
782 212831 : emit_move_insn (tmp, op1);
783 212831 : emit_move_insn (op0, tmp);
784 212831 : return;
785 : }
786 :
787 3907874 : emit_insn (gen_rtx_SET (op0, op1));
788 : }
789 :
790 : /* Split 32-byte AVX unaligned load and store if needed. */
791 :
792 : static void
793 13457 : ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
794 : {
795 13457 : rtx m;
796 13457 : rtx (*extract) (rtx, rtx, rtx);
797 13457 : machine_mode mode;
798 :
 : /* Unless tuning asks for the split, emit a single unaligned
 : 32-byte move. */
799 13457 : if ((MEM_P (op1) && !TARGET_AVX256_SPLIT_UNALIGNED_LOAD)
800 4752 : || (MEM_P (op0) && !TARGET_AVX256_SPLIT_UNALIGNED_STORE))
801 : {
802 13405 : emit_insn (gen_rtx_SET (op0, op1));
803 13405 : return;
804 : }
805 :
 : /* Canonicalize integer modes other than V32QI to V32QI so a single
 : extract pattern handles them; remember the original register
 : destination to copy back at the end. */
806 52 : rtx orig_op0 = NULL_RTX;
807 52 : mode = GET_MODE (op0);
808 52 : switch (GET_MODE_CLASS (mode))
809 : {
810 35 : case MODE_VECTOR_INT:
811 35 : case MODE_INT:
812 35 : if (mode != V32QImode)
813 : {
814 7 : if (!MEM_P (op0))
815 : {
816 3 : orig_op0 = op0;
817 3 : op0 = gen_reg_rtx (V32QImode);
818 : }
819 : else
820 4 : op0 = gen_lowpart (V32QImode, op0);
821 7 : op1 = gen_lowpart (V32QImode, op1);
822 7 : mode = V32QImode;
823 : }
824 : break;
825 : case MODE_VECTOR_FLOAT:
826 : break;
827 0 : default:
828 0 : gcc_unreachable ();
829 : }
830 :
 : /* Pick the vextractf128 variant and the 16-byte half mode. */
831 52 : switch (mode)
832 : {
833 0 : default:
834 0 : gcc_unreachable ();
835 : case E_V32QImode:
836 : extract = gen_avx_vextractf128v32qi;
837 : mode = V16QImode;
838 : break;
839 1 : case E_V16BFmode:
840 1 : extract = gen_avx_vextractf128v16bf;
841 1 : mode = V8BFmode;
842 1 : break;
843 0 : case E_V16HFmode:
844 0 : extract = gen_avx_vextractf128v16hf;
845 0 : mode = V8HFmode;
846 0 : break;
847 8 : case E_V8SFmode:
848 8 : extract = gen_avx_vextractf128v8sf;
849 8 : mode = V4SFmode;
850 8 : break;
851 8 : case E_V4DFmode:
852 8 : extract = gen_avx_vextractf128v4df;
853 8 : mode = V2DFmode;
854 8 : break;
855 : }
856 :
 : /* Split load: load the low half, then concatenate with the high
 : half loaded from memory. */
857 52 : if (MEM_P (op1))
858 : {
859 9 : rtx r = gen_reg_rtx (mode);
860 9 : m = adjust_address (op1, mode, 0);
861 9 : emit_move_insn (r, m);
862 9 : m = adjust_address (op1, mode, 16);
863 9 : r = gen_rtx_VEC_CONCAT (GET_MODE (op0), r, m);
864 9 : emit_move_insn (op0, r);
865 : }
 : /* Split store: extract each 128-bit half to memory. */
866 43 : else if (MEM_P (op0))
867 : {
868 43 : m = adjust_address (op0, mode, 0);
869 43 : emit_insn (extract (m, op1, const0_rtx));
870 43 : m = adjust_address (op0, mode, 16);
871 43 : emit_insn (extract (m, copy_rtx (op1), const1_rtx));
872 : }
873 : else
874 0 : gcc_unreachable ();
875 :
876 52 : if (orig_op0)
877 3 : emit_move_insn (orig_op0, gen_lowpart (GET_MODE (orig_op0), op0));
878 : }
879 :
880 : /* Implement the movmisalign patterns for SSE. Non-SSE modes go
881 : straight to ix86_expand_vector_move. */
882 : /* Code generation for scalar reg-reg moves of single and double precision data:
883 : if (x86_sse_partial_reg_dependency == true | x86_sse_split_regs == true)
884 : movaps reg, reg
885 : else
886 : movss reg, reg
887 : if (x86_sse_partial_reg_dependency == true)
888 : movapd reg, reg
889 : else
890 : movsd reg, reg
891 :
892 : Code generation for scalar loads of double precision data:
893 : if (x86_sse_split_regs == true)
894 : movlpd mem, reg (gas syntax)
895 : else
896 : movsd mem, reg
897 :
898 : Code generation for unaligned packed loads of single precision data
899 : (x86_sse_unaligned_move_optimal overrides x86_sse_partial_reg_dependency):
900 : if (x86_sse_unaligned_move_optimal)
901 : movups mem, reg
902 :
903 : if (x86_sse_partial_reg_dependency == true)
904 : {
905 : xorps reg, reg
906 : movlps mem, reg
907 : movhps mem+8, reg
908 : }
909 : else
910 : {
911 : movlps mem, reg
912 : movhps mem+8, reg
913 : }
914 :
915 : Code generation for unaligned packed loads of double precision data
916 : (x86_sse_unaligned_move_optimal overrides x86_sse_split_regs):
917 : if (x86_sse_unaligned_move_optimal)
918 : movupd mem, reg
919 :
920 : if (x86_sse_split_regs == true)
921 : {
922 : movlpd mem, reg
923 : movhpd mem+8, reg
924 : }
925 : else
926 : {
927 : movsd mem, reg
928 : movhpd mem+8, reg
929 : }
930 : */
931 :
932 : void
933 812728 : ix86_expand_vector_move_misalign (machine_mode mode, rtx operands[])
934 : {
935 812728 : rtx op0, op1, m;
936 :
937 812728 : op0 = operands[0];
938 812728 : op1 = operands[1];
939 :
940 : /* Use unaligned load/store for AVX512 or when optimizing for size. */
941 1625456 : if (GET_MODE_SIZE (mode) == 64 || optimize_insn_for_size_p ())
942 : {
943 24002 : emit_insn (gen_rtx_SET (op0, op1));
944 24002 : return;
945 : }
946 :
947 788726 : if (TARGET_AVX)
948 : {
 : /* 32-byte AVX moves may need to be split per tuning. */
949 62696 : if (GET_MODE_SIZE (mode) == 32)
950 13457 : ix86_avx256_split_vector_move_misalign (op0, op1);
951 : else
952 : /* Always use 128-bit mov<mode>_internal pattern for AVX. */
953 17891 : emit_insn (gen_rtx_SET (op0, op1));
954 31348 : return;
955 : }
956 :
957 757378 : if (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
958 95 : || TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL)
959 : {
960 757283 : emit_insn (gen_rtx_SET (op0, op1));
961 757283 : return;
962 : }
963 :
964 : /* ??? If we have typed data, then it would appear that using
965 : movdqu is the only way to get unaligned data loaded with
966 : integer type. */
967 95 : if (TARGET_SSE2 && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
968 : {
969 81 : emit_insn (gen_rtx_SET (op0, op1));
970 81 : return;
971 : }
972 :
 : /* Legacy SSE fallback: perform the unaligned access as two
 : 8-byte half loads/stores. */
973 14 : if (MEM_P (op1))
974 : {
975 6 : if (TARGET_SSE2 && mode == V2DFmode)
976 : {
977 2 : rtx zero;
978 :
979 : /* When SSE registers are split into halves, we can avoid
980 : writing to the top half twice. */
981 2 : if (TARGET_SSE_SPLIT_REGS)
982 : {
983 2 : emit_clobber (op0);
984 2 : zero = op0;
985 : }
986 : else
987 : {
988 : /* ??? Not sure about the best option for the Intel chips.
989 : The following would seem to satisfy; the register is
990 : entirely cleared, breaking the dependency chain. We
991 : then store to the upper half, with a dependency depth
992 : of one. A rumor has it that Intel recommends two movsd
993 : followed by an unpacklpd, but this is unconfirmed. And
994 : given that the dependency depth of the unpacklpd would
995 : still be one, I'm not sure why this would be better. */
996 0 : zero = CONST0_RTX (V2DFmode);
997 : }
998 :
999 2 : m = adjust_address (op1, DFmode, 0);
1000 2 : emit_insn (gen_sse2_loadlpd (op0, zero, m));
1001 2 : m = adjust_address (op1, DFmode, 8);
1002 2 : emit_insn (gen_sse2_loadhpd (op0, op0, m));
1003 2 : }
1004 : else
1005 : {
1006 4 : rtx t;
1007 :
1008 4 : if (mode != V4SFmode)
1009 0 : t = gen_reg_rtx (V4SFmode);
1010 : else
1011 : t = op0;
1012 :
1013 4 : if (TARGET_SSE_PARTIAL_REG_DEPENDENCY)
1014 2 : emit_move_insn (t, CONST0_RTX (V4SFmode));
1015 : else
1016 2 : emit_clobber (t);
1017 :
1018 4 : m = adjust_address (op1, V2SFmode, 0);
1019 4 : emit_insn (gen_sse_loadlps (t, t, m));
1020 4 : m = adjust_address (op1, V2SFmode, 8);
1021 4 : emit_insn (gen_sse_loadhps (t, t, m));
1022 4 : if (mode != V4SFmode)
1023 0 : emit_move_insn (op0, gen_lowpart (mode, t));
1024 : }
1025 : }
1026 8 : else if (MEM_P (op0))
1027 : {
1028 8 : if (TARGET_SSE2 && mode == V2DFmode)
1029 : {
1030 2 : m = adjust_address (op0, DFmode, 0);
1031 2 : emit_insn (gen_sse2_storelpd (m, op1));
1032 2 : m = adjust_address (op0, DFmode, 8);
1033 2 : emit_insn (gen_sse2_storehpd (m, op1));
1034 : }
1035 : else
1036 : {
1037 6 : if (mode != V4SFmode)
1038 0 : op1 = gen_lowpart (V4SFmode, op1);
1039 :
1040 6 : m = adjust_address (op0, V2SFmode, 0);
1041 6 : emit_insn (gen_sse_storelps (m, op1));
1042 6 : m = adjust_address (op0, V2SFmode, 8);
1043 6 : emit_insn (gen_sse_storehps (m, copy_rtx (op1)));
1044 : }
1045 : }
1046 : else
1047 0 : gcc_unreachable ();
1048 : }
1049 :
1050 : /* Move bits 64:95 to bits 32:63. */
1051 :
1052 : void
1053 868 : ix86_move_vector_high_sse_to_mmx (rtx op)
1054 : {
1055 868 : rtx mask = gen_rtx_PARALLEL (VOIDmode,
1056 : gen_rtvec (4, GEN_INT (0), GEN_INT (2),
1057 : GEN_INT (0), GEN_INT (0)));
1058 868 : rtx dest = lowpart_subreg (V4SImode, op, GET_MODE (op));
1059 868 : op = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
1060 868 : rtx insn = gen_rtx_SET (dest, op);
1061 868 : emit_insn (insn);
1062 868 : }
1063 :
1064 : /* Split MMX pack with signed/unsigned saturation with SSE/SSE2. */
1065 :
void
ix86_split_mmx_pack (rtx operands[], enum rtx_code code)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  rtx src;

  machine_mode dmode = GET_MODE (op0);
  machine_mode smode = GET_MODE (op1);
  machine_mode inner_dmode = GET_MODE_INNER (dmode);
  machine_mode inner_smode = GET_MODE_INNER (smode);

  /* Get the corresponding SSE mode for destination.  */
  int nunits = 16 / GET_MODE_SIZE (inner_dmode);
  machine_mode sse_dmode = mode_for_vector (GET_MODE_INNER (dmode),
					    nunits).require ();
  machine_mode sse_half_dmode = mode_for_vector (GET_MODE_INNER (dmode),
						 nunits / 2).require ();

  /* Get the corresponding SSE mode for source.  */
  nunits = 16 / GET_MODE_SIZE (inner_smode);
  machine_mode sse_smode = mode_for_vector (GET_MODE_INNER (smode),
					    nunits).require ();

  /* Generate SSE pack with signed/unsigned saturation.  */
  rtx dest = lowpart_subreg (sse_dmode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_smode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_smode, op2, GET_MODE (op2));

  /* paskusdw/packuswb does unsigned saturation of a signed source
     which is different from generic us_truncate RTX.  */
  if (code == US_TRUNCATE)
    src = gen_rtx_UNSPEC (sse_dmode,
			  gen_rtvec (2, op1, op2),
			  UNSPEC_US_TRUNCATE);
  else
    {
      /* For signed saturation, express the pack as a vec_concat of
	 two truncations into the half-width SSE mode.  */
      op1 = gen_rtx_fmt_e (code, sse_half_dmode, op1);
      op2 = gen_rtx_fmt_e (code, sse_half_dmode, op2);
      src = gen_rtx_VEC_CONCAT (sse_dmode, op1, op2);
    }

  emit_move_insn (dest, src);

  /* The SSE pack put OP2's result in the high half of the 128-bit
     register; shuffle bits 64:95 down to bits 32:63 so the full MMX
     result sits in the low 64 bits.  */
  ix86_move_vector_high_sse_to_mmx (op0);
}
1113 :
1114 : /* Split MMX punpcklXX/punpckhXX with SSE punpcklXX. This is also used
1115 : for a full unpack of OPERANDS[1] and OPERANDS[2] into a wider
1116 : OPERANDS[0]. */
1117 :
void
ix86_split_mmx_punpck (rtx operands[], bool high_p)
{
  rtx op0 = operands[0];
  rtx op1 = operands[1];
  rtx op2 = operands[2];
  machine_mode mode = GET_MODE (op1);
  rtx mask;
  /* The corresponding SSE mode.  */
  machine_mode sse_mode, double_sse_mode;

  /* Pick the 128-bit SSE mode matching the element type, and the
     interleave mask taking alternate low elements from OP1 and OP2.  */
  switch (mode)
    {
    case E_V8QImode:
    case E_V4QImode:
    case E_V2QImode:
      sse_mode = V16QImode;
      double_sse_mode = V32QImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (16,
					  GEN_INT (0), GEN_INT (16),
					  GEN_INT (1), GEN_INT (17),
					  GEN_INT (2), GEN_INT (18),
					  GEN_INT (3), GEN_INT (19),
					  GEN_INT (4), GEN_INT (20),
					  GEN_INT (5), GEN_INT (21),
					  GEN_INT (6), GEN_INT (22),
					  GEN_INT (7), GEN_INT (23)));
      break;

    case E_V4HImode:
    case E_V2HImode:
      sse_mode = V8HImode;
      double_sse_mode = V16HImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (8,
					  GEN_INT (0), GEN_INT (8),
					  GEN_INT (1), GEN_INT (9),
					  GEN_INT (2), GEN_INT (10),
					  GEN_INT (3), GEN_INT (11)));
      break;

    case E_V2SImode:
      sse_mode = V4SImode;
      double_sse_mode = V8SImode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4,
					  GEN_INT (0), GEN_INT (4),
					  GEN_INT (1), GEN_INT (5)));
      break;

    case E_V2SFmode:
      sse_mode = V4SFmode;
      double_sse_mode = V8SFmode;
      mask = gen_rtx_PARALLEL (VOIDmode,
			       gen_rtvec (4,
					  GEN_INT (0), GEN_INT (4),
					  GEN_INT (1), GEN_INT (5)));
      break;

    default:
      gcc_unreachable ();
    }

  /* Generate SSE punpcklXX.  */
  rtx dest = lowpart_subreg (sse_mode, op0, GET_MODE (op0));
  op1 = lowpart_subreg (sse_mode, op1, GET_MODE (op1));
  op2 = lowpart_subreg (sse_mode, op2, GET_MODE (op2));

  op1 = gen_rtx_VEC_CONCAT (double_sse_mode, op1, op2);
  op2 = gen_rtx_VEC_SELECT (sse_mode, op1, mask);
  rtx insn = gen_rtx_SET (dest, op2);
  emit_insn (insn);

  /* Move high bits to low bits.  */
  if (high_p)
    {
      if (sse_mode == V4SFmode)
	{
	  /* movhlps-style shuffle; lanes 2-3 of the mask (indices 4-5
	     into the concatenation) are don't-care.  */
	  mask = gen_rtx_PARALLEL (VOIDmode,
				   gen_rtvec (4, GEN_INT (2), GEN_INT (3),
					      GEN_INT (4), GEN_INT (5)));
	  op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
	  op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
	}
      else
	{
	  int sz = GET_MODE_SIZE (mode);

	  /* In the V4SImode view: a 32-bit MMX vector only needs
	     element 1 moved to lane 0, a 64-bit vector needs elements
	     2-3 moved to lanes 0-1; the remaining lanes are
	     don't-care.  */
	  if (sz == 4)
	    mask = gen_rtx_PARALLEL (VOIDmode,
				     gen_rtvec (4, GEN_INT (1), GEN_INT (0),
						GEN_INT (0), GEN_INT (1)));
	  else if (sz == 8)
	    mask = gen_rtx_PARALLEL (VOIDmode,
				     gen_rtvec (4, GEN_INT (2), GEN_INT (3),
						GEN_INT (0), GEN_INT (1)));
	  else
	    gcc_unreachable ();

	  dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
	  op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
	}

      insn = gen_rtx_SET (dest, op1);
      emit_insn (insn);
    }
}
1226 :
1227 : /* Helper function of ix86_fixup_binary_operands to canonicalize
1228 : operand order. Returns true if the operands should be swapped. */
1229 :
1230 : static bool
1231 174495999 : ix86_swap_binary_operands_p (enum rtx_code code, machine_mode mode,
1232 : rtx operands[])
1233 : {
1234 174495999 : rtx dst = operands[0];
1235 174495999 : rtx src1 = operands[1];
1236 174495999 : rtx src2 = operands[2];
1237 :
1238 : /* If the operation is not commutative, we can't do anything. */
1239 174495999 : if (GET_RTX_CLASS (code) != RTX_COMM_ARITH
1240 26817930 : && GET_RTX_CLASS (code) != RTX_COMM_COMPARE)
1241 : return false;
1242 :
1243 : /* Highest priority is that src1 should match dst. */
1244 147689515 : if (rtx_equal_p (dst, src1))
1245 : return false;
1246 107333356 : if (rtx_equal_p (dst, src2))
1247 : return true;
1248 :
1249 : /* Next highest priority is that immediate constants come second. */
1250 107248612 : if (immediate_operand (src2, mode))
1251 : return false;
1252 25900369 : if (immediate_operand (src1, mode))
1253 : return true;
1254 :
1255 : /* Lowest priority is that memory references should come second. */
1256 25900369 : if (MEM_P (src2))
1257 : return false;
1258 24480677 : if (MEM_P (src1))
1259 : return true;
1260 :
1261 : return false;
1262 : }
1263 :
1264 : /* Fix up OPERANDS to satisfy ix86_binary_operator_ok. Return the
1265 : destination to use for the operation. If different from the true
1266 : destination in operands[0], a copy operation will be required except
1267 : under TARGET_APX_NDD. */
1268 :
rtx
ix86_fixup_binary_operands (enum rtx_code code, machine_mode mode,
			    rtx operands[], bool use_ndd)
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Canonicalize operand order.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    {
      /* It is invalid to swap operands of different modes.  */
      gcc_assert (GET_MODE (src1) == GET_MODE (src2));

      /* Only the local copies are swapped here; the canonical order
	 is written back into OPERANDS at the end.  */
      std::swap (src1, src2);
    }

  /* Both source operands cannot be in memory.  */
  if (MEM_P (src1) && MEM_P (src2))
    {
      /* Optimization: Only read from memory once.  */
      if (rtx_equal_p (src1, src2))
	{
	  src2 = force_reg (mode, src2);
	  src1 = src2;
	}
      else if (rtx_equal_p (dst, src1))
	src2 = force_reg (mode, src2);
      else
	src1 = force_reg (mode, src1);
    }

  /* If the destination is memory, and we do not have matching source
     operands, do things in registers.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    dst = gen_reg_rtx (mode);

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    src1 = force_reg (mode, src1);

  /* Source 1 cannot be a non-matching memory.  */
  if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
    src1 = force_reg (mode, src1);

  /* Improve address combine.  */
  if (code == PLUS
      && GET_MODE_CLASS (mode) == MODE_INT
      && MEM_P (src2))
    src2 = force_reg (mode, src2);

  /* Write back the legitimized sources.  DST may differ from
     operands[0]; in that case the caller must emit the final copy.  */
  operands[1] = src1;
  operands[2] = src2;
  return dst;
}
1324 :
1325 : /* Similarly, but assume that the destination has already been
1326 : set up properly. */
1327 :
1328 : void
1329 288852 : ix86_fixup_binary_operands_no_copy (enum rtx_code code,
1330 : machine_mode mode, rtx operands[],
1331 : bool use_ndd)
1332 : {
1333 288852 : rtx dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
1334 288852 : gcc_assert (dst == operands[0]);
1335 288852 : }
1336 :
1337 : /* Attempt to expand a binary operator. Make the expansion closer to the
1338 : actual machine, then just general_operand, which will allow 3 separate
1339 : memory references (one output, two input) in a single insn. */
1340 :
void
ix86_expand_binary_operator (enum rtx_code code, machine_mode mode,
			     rtx operands[], bool use_ndd)
{
  rtx src1, src2, dst, op, clob;

  /* Legitimize the operands; DST may become a fresh register when
     operands[0] is a non-matching memory.  */
  dst = ix86_fixup_binary_operands (code, mode, operands, use_ndd);
  src1 = operands[1];
  src2 = operands[2];

  /* Emit the instruction.  */

  op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));

  if (reload_completed
      && code == PLUS
      && !rtx_equal_p (dst, src1)
      && !use_ndd)
    {
      /* This is going to be an LEA; avoid splitting it later.  */
      emit_insn (op);
    }
  else
    {
      /* The generic two-operand form clobbers the flags register.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
    }

  /* Fix up the destination if needed.  */
  if (dst != operands[0])
    emit_move_insn (operands[0], dst);
}
1373 :
1374 : /* Expand vector logical operation CODE (AND, IOR, XOR) in MODE with
1375 : the given OPERANDS. */
1376 :
void
ix86_expand_vector_logical_operator (enum rtx_code code, machine_mode mode,
				     rtx operands[])
{
  /* If either input is a SUBREG, put it in OP1 and the other in OP2.  */
  rtx op1 = NULL_RTX, op2 = NULL_RTX;
  if (SUBREG_P (operands[1]))
    {
      op1 = operands[1];
      op2 = operands[2];
    }
  else if (SUBREG_P (operands[2]))
    {
      op1 = operands[2];
      op2 = operands[1];
    }
  /* Optimize (__m128i) d | (__m128i) e and similar code
     when d and e are float vectors into float vector logical
     insn.  In C/C++ without using intrinsics there is no other way
     to express vector logical operation on float vectors than
     to cast them temporarily to integer vectors.  */
  if (op1
      && !TARGET_SSE_PACKED_SINGLE_INSN_OPTIMAL
      && (SUBREG_P (op2) || CONST_VECTOR_P (op2))
      && GET_MODE_CLASS (GET_MODE (SUBREG_REG (op1))) == MODE_VECTOR_FLOAT
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op1))) == GET_MODE_SIZE (mode)
      && SUBREG_BYTE (op1) == 0
      && (CONST_VECTOR_P (op2)
	  || (GET_MODE (SUBREG_REG (op1)) == GET_MODE (SUBREG_REG (op2))
	      && SUBREG_BYTE (op2) == 0))
      && can_create_pseudo_p ())
    {
      rtx dst;
      switch (GET_MODE (SUBREG_REG (op1)))
	{
	case E_V4SFmode:
	case E_V8SFmode:
	case E_V16SFmode:
	case E_V2DFmode:
	case E_V4DFmode:
	case E_V8DFmode:
	  /* Perform the operation in the float vector mode, then
	     move the result back to MODE.  */
	  dst = gen_reg_rtx (GET_MODE (SUBREG_REG (op1)));
	  if (CONST_VECTOR_P (op2))
	    {
	      op2 = gen_lowpart (GET_MODE (dst), op2);
	      op2 = force_reg (GET_MODE (dst), op2);
	    }
	  else
	    {
	      /* Restore original operand order; strip OP2's SUBREG.  */
	      op1 = operands[1];
	      op2 = SUBREG_REG (operands[2]);
	      if (!vector_operand (op2, GET_MODE (dst)))
		op2 = force_reg (GET_MODE (dst), op2);
	    }
	  op1 = SUBREG_REG (op1);
	  if (!vector_operand (op1, GET_MODE (dst)))
	    op1 = force_reg (GET_MODE (dst), op1);
	  emit_insn (gen_rtx_SET (dst,
				  gen_rtx_fmt_ee (code, GET_MODE (dst),
						  op1, op2)));
	  emit_move_insn (operands[0], gen_lowpart (mode, dst));
	  return;
	default:
	  break;
	}
    }
  /* Fall back to the plain expansion in MODE.  */
  if (!vector_operand (operands[1], mode))
    operands[1] = force_reg (mode, operands[1]);
  if (!vector_operand (operands[2], mode))
    operands[2] = force_reg (mode, operands[2]);
  ix86_fixup_binary_operands_no_copy (code, mode, operands);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_fmt_ee (code, mode, operands[1],
					  operands[2])));
}
1451 :
1452 : /* Return TRUE or FALSE depending on whether the binary operator meets the
1453 : appropriate constraints. */
1454 :
bool
ix86_binary_operator_ok (enum rtx_code code, machine_mode mode,
			 rtx operands[3], bool use_ndd)
{
  rtx dst = operands[0];
  rtx src1 = operands[1];
  rtx src2 = operands[2];

  /* Both source operands cannot be in memory.  */
  if ((MEM_P (src1) || bcst_mem_operand (src1, mode))
      && (MEM_P (src2) || bcst_mem_operand (src2, mode)))
    return false;

  /* Canonicalize operand order for commutative operators.  */
  if (ix86_swap_binary_operands_p (code, mode, operands))
    std::swap (src1, src2);

  /* If the destination is memory, we must have a matching source operand.  */
  if (MEM_P (dst) && !rtx_equal_p (dst, src1))
    return false;

  /* Source 1 cannot be a constant.  */
  if (CONSTANT_P (src1))
    return false;

  /* Source 1 cannot be a non-matching memory.  */
  if (!use_ndd && MEM_P (src1) && !rtx_equal_p (dst, src1))
    /* Support "andhi/andsi/anddi" as a zero-extending move.  */
    return (code == AND
	    && (mode == HImode
		|| mode == SImode
		|| (TARGET_64BIT && mode == DImode))
	    && satisfies_constraint_L (src2));

  return true;
}
1491 :
1492 : /* Attempt to expand a unary operator. Make the expansion closer to the
1493 : actual machine, then just general_operand, which will allow 2 separate
1494 : memory references (one output, one input) in a single insn. */
1495 :
1496 : void
1497 118215 : ix86_expand_unary_operator (enum rtx_code code, machine_mode mode,
1498 : rtx operands[], bool use_ndd)
1499 : {
1500 118215 : bool matching_memory = false;
1501 118215 : rtx src, dst, op, clob;
1502 :
1503 118215 : dst = operands[0];
1504 118215 : src = operands[1];
1505 :
1506 : /* If the destination is memory, and we do not have matching source
1507 : operands, do things in registers. */
1508 118215 : if (MEM_P (dst))
1509 : {
1510 3205 : if (rtx_equal_p (dst, src))
1511 : matching_memory = true;
1512 : else
1513 2890 : dst = gen_reg_rtx (mode);
1514 : }
1515 :
1516 : /* When source operand is memory, destination must match. */
1517 118215 : if (!use_ndd && MEM_P (src) && !matching_memory)
1518 4680 : src = force_reg (mode, src);
1519 :
1520 : /* Emit the instruction. */
1521 :
1522 118215 : op = gen_rtx_SET (dst, gen_rtx_fmt_e (code, mode, src));
1523 :
1524 118215 : if (code == NOT)
1525 67749 : emit_insn (op);
1526 : else
1527 : {
1528 50466 : clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1529 50466 : emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1530 : }
1531 :
1532 : /* Fix up the destination if needed. */
1533 118215 : if (dst != operands[0])
1534 2890 : emit_move_insn (operands[0], dst);
1535 118215 : }
1536 :
1537 : /* Return TRUE or FALSE depending on whether the unary operator meets the
1538 : appropriate constraints. */
1539 :
1540 : bool
1541 1710713 : ix86_unary_operator_ok (enum rtx_code,
1542 : machine_mode,
1543 : rtx operands[2],
1544 : bool use_ndd)
1545 : {
1546 : /* If one of operands is memory, source and destination must match. */
1547 1710713 : if ((MEM_P (operands[0])
1548 1667959 : || (!use_ndd && MEM_P (operands[1])))
1549 1739573 : && ! rtx_equal_p (operands[0], operands[1]))
1550 : return false;
1551 : return true;
1552 : }
1553 :
1554 : /* Predict just emitted jump instruction to be taken with probability PROB. */
1555 :
1556 : static void
1557 66018 : predict_jump (int prob)
1558 : {
1559 66018 : rtx_insn *insn = get_last_insn ();
1560 66018 : gcc_assert (JUMP_P (insn));
1561 66018 : add_reg_br_prob_note (insn, profile_probability::from_reg_br_prob_base (prob));
1562 66018 : }
1563 :
1564 : /* Split 32bit/64bit divmod with 8bit unsigned divmod if dividend and
1565 : divisor are within the range [0-255]. */
1566 :
void
ix86_split_idivmod (machine_mode mode, rtx operands[],
		    bool unsigned_p)
{
  rtx_code_label *end_label, *qimode_label;
  rtx div, mod;
  rtx_insn *insn;
  rtx scratch, tmp0, tmp1, tmp2;
  rtx (*gen_divmod4_1) (rtx, rtx, rtx, rtx);

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);

  /* Pick the full-width divmod generator, accounting for any zero
     extension of the quotient (operands[0]) or remainder
     (operands[1]) destinations.  */
  switch (mode)
    {
    case E_SImode:
      if (GET_MODE (operands[0]) == SImode)
	{
	  if (GET_MODE (operands[1]) == SImode)
	    gen_divmod4_1 = unsigned_p ? gen_udivmodsi4_1 : gen_divmodsi4_1;
	  else
	    gen_divmod4_1
	      = unsigned_p ? gen_udivmodsi4_zext_2 : gen_divmodsi4_zext_2;
	}
      else
	gen_divmod4_1
	  = unsigned_p ? gen_udivmodsi4_zext_1 : gen_divmodsi4_zext_1;
      break;

    case E_DImode:
      gen_divmod4_1 = unsigned_p ? gen_udivmoddi4_1 : gen_divmoddi4_1;
      break;

    default:
      gcc_unreachable ();
    }

  end_label = gen_label_rtx ();
  qimode_label = gen_label_rtx ();

  scratch = gen_reg_rtx (mode);

  /* Use 8bit unsigned divimod if dividend and divisor are within
     the range [0-255].  */
  emit_move_insn (scratch, operands[2]);
  scratch = expand_simple_binop (mode, IOR, scratch, operands[3],
				 scratch, 1, OPTAB_DIRECT);
  /* Testing (dividend | divisor) against -0x100 verifies that no bit
     above the low eight is set in either value.  */
  emit_insn (gen_test_ccno_1 (mode, scratch, GEN_INT (-0x100)));
  tmp0 = gen_rtx_REG (CCNOmode, FLAGS_REG);
  tmp0 = gen_rtx_EQ (VOIDmode, tmp0, const0_rtx);
  tmp0 = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp0,
			       gen_rtx_LABEL_REF (VOIDmode, qimode_label),
			       pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp0));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = qimode_label;

  /* Generate original signed/unsigned divimod.  */
  emit_insn (gen_divmod4_1 (operands[0], operands[1],
			    operands[2], operands[3]));

  /* Branch to the end.  */
  emit_jump_insn (gen_jump (end_label));
  emit_barrier ();

  /* Generate 8bit unsigned divide.  */
  emit_label (qimode_label);
  /* Don't use operands[0] for result of 8bit divide since not all
     registers support QImode ZERO_EXTRACT.  */
  tmp0 = lowpart_subreg (HImode, scratch, mode);
  tmp1 = lowpart_subreg (HImode, operands[2], mode);
  tmp2 = lowpart_subreg (QImode, operands[3], mode);
  emit_insn (gen_udivmodhiqi3 (tmp0, tmp1, tmp2));

  /* Build REG_EQUAL notes describing the full-width operation.  */
  if (unsigned_p)
    {
      div = gen_rtx_UDIV (mode, operands[2], operands[3]);
      mod = gen_rtx_UMOD (mode, operands[2], operands[3]);
    }
  else
    {
      div = gen_rtx_DIV (mode, operands[2], operands[3]);
      mod = gen_rtx_MOD (mode, operands[2], operands[3]);
    }
  if (mode == SImode)
    {
      if (GET_MODE (operands[0]) != SImode)
	div = gen_rtx_ZERO_EXTEND (DImode, div);
      if (GET_MODE (operands[1]) != SImode)
	mod = gen_rtx_ZERO_EXTEND (DImode, mod);
    }

  /* Extract remainder from AH.  */
  scratch = gen_lowpart (GET_MODE (operands[1]), scratch);
  tmp1 = gen_rtx_ZERO_EXTRACT (GET_MODE (operands[1]), scratch,
			       GEN_INT (8), GEN_INT (8));
  insn = emit_move_insn (operands[1], tmp1);
  set_unique_reg_note (insn, REG_EQUAL, mod);

  /* Zero extend quotient from AL.  */
  tmp1 = gen_lowpart (QImode, tmp0);
  insn = emit_insn (gen_extend_insn
		    (operands[0], tmp1,
		     GET_MODE (operands[0]), QImode, 1));
  set_unique_reg_note (insn, REG_EQUAL, div);

  emit_label (end_label);
}
1675 :
1676 : /* Emit x86 binary operand CODE in mode MODE, where the first operand
1677 : matches destination. RTX includes clobber of FLAGS_REG. */
1678 :
1679 : void
1680 7890 : ix86_emit_binop (enum rtx_code code, machine_mode mode,
1681 : rtx dst, rtx src)
1682 : {
1683 7890 : rtx op, clob;
1684 :
1685 7890 : op = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, dst, src));
1686 7890 : clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
1687 :
1688 7890 : emit_insn (gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, op, clob)));
1689 7890 : }
1690 :
1691 : /* Return true if regno1 def is nearest to the insn. */
1692 :
1693 : static bool
1694 15 : find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
1695 : {
1696 15 : rtx_insn *prev = insn;
1697 15 : rtx_insn *start = BB_HEAD (BLOCK_FOR_INSN (insn));
1698 :
1699 15 : if (insn == start)
1700 : return false;
1701 40 : while (prev && prev != start)
1702 : {
1703 30 : if (!INSN_P (prev) || !NONDEBUG_INSN_P (prev))
1704 : {
1705 10 : prev = PREV_INSN (prev);
1706 10 : continue;
1707 : }
1708 20 : if (insn_defines_reg (regno1, INVALID_REGNUM, prev))
1709 : return true;
1710 15 : else if (insn_defines_reg (regno2, INVALID_REGNUM, prev))
1711 : return false;
1712 15 : prev = PREV_INSN (prev);
1713 : }
1714 :
1715 : /* None of the regs is defined in the bb. */
1716 : return false;
1717 : }
1718 :
1719 : /* INSN_UID of the last insn emitted by zero store peephole2s. */
1720 : int ix86_last_zero_store_uid;
1721 :
1722 : /* Split lea instructions into a sequence of instructions
1723 : which are executed on ALU to avoid AGU stalls.
1724 : It is assumed that it is allowed to clobber flags register
1725 : at lea position. */
1726 :
void
ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
{
  unsigned int regno0, regno1, regno2;
  struct ix86_address parts;
  rtx target, tmp;
  int ok, adds;

  ok = ix86_decompose_address (operands[1], &parts);
  gcc_assert (ok);

  target = gen_lowpart (mode, operands[0]);

  /* regno0 = destination, regno1 = base, regno2 = index.  */
  regno0 = true_regnum (target);
  regno1 = INVALID_REGNUM;
  regno2 = INVALID_REGNUM;

  /* Narrow base, index and displacement to MODE.  */
  if (parts.base)
    {
      parts.base = gen_lowpart (mode, parts.base);
      regno1 = true_regnum (parts.base);
    }

  if (parts.index)
    {
      parts.index = gen_lowpart (mode, parts.index);
      regno2 = true_regnum (parts.index);
    }

  if (parts.disp)
    parts.disp = gen_lowpart (mode, parts.disp);

  if (parts.scale > 1)
    {
      /* Case r1 = r1 + ...  */
      if (regno1 == regno0)
	{
	  /* If we have a case r1 = r1 + C * r2 then we
	     should use multiplication which is very
	     expensive.  Assume cost model is wrong if we
	     have such case here.  */
	  gcc_assert (regno2 != regno0);

	  /* Replace the scaled index by repeated additions.  */
	  for (adds = parts.scale; adds > 0; adds--)
	    ix86_emit_binop (PLUS, mode, target, parts.index);
	}
      else
	{
	  /* r1 = r2 + r3 * C case.  Need to move r3 into r1.  */
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));

	  /* Use shift for scaling, but emit it as MULT instead
	     to avoid it being immediately peephole2 optimized back
	     into lea.  */
	  ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));

	  if (parts.base)
	    ix86_emit_binop (PLUS, mode, target, parts.base);

	  if (parts.disp && parts.disp != const0_rtx)
	    ix86_emit_binop (PLUS, mode, target, parts.disp);
	}
    }
  else if (!parts.base && !parts.index)
    {
      /* Displacement only: a plain constant load.  */
      gcc_assert(parts.disp);
      emit_insn (gen_rtx_SET (target, parts.disp));
    }
  else
    {
      if (!parts.base)
	{
	  if (regno0 != regno2)
	    emit_insn (gen_rtx_SET (target, parts.index));
	}
      else if (!parts.index)
	{
	  if (regno0 != regno1)
	    emit_insn (gen_rtx_SET (target, parts.base));
	}
      else
	{
	  /* Both base and index: add whichever one is not already in
	     the destination register.  */
	  if (regno0 == regno1)
	    tmp = parts.index;
	  else if (regno0 == regno2)
	    tmp = parts.base;
	  else
	    {
	      rtx tmp1;

	      /* Find better operand for SET instruction, depending
		 on which definition is farther from the insn.  */
	      if (find_nearest_reg_def (insn, regno1, regno2))
		tmp = parts.index, tmp1 = parts.base;
	      else
		tmp = parts.base, tmp1 = parts.index;

	      emit_insn (gen_rtx_SET (target, tmp));

	      if (parts.disp && parts.disp != const0_rtx)
		ix86_emit_binop (PLUS, mode, target, parts.disp);

	      ix86_emit_binop (PLUS, mode, target, tmp1);
	      return;
	    }

	  ix86_emit_binop (PLUS, mode, target, tmp);
	}

      if (parts.disp && parts.disp != const0_rtx)
	ix86_emit_binop (PLUS, mode, target, parts.disp);
    }
}
1841 :
1842 : /* Post-reload splitter for converting an SF or DFmode value in an
1843 : SSE register into an unsigned SImode. */
1844 :
void
ix86_split_convert_uns_si_sse (rtx operands[])
{
  machine_mode vecmode;
  rtx value, large, zero_or_two31, input, two31, x;

  large = operands[1];
  zero_or_two31 = operands[2];
  input = operands[3];
  two31 = operands[4];
  vecmode = GET_MODE (large);
  value = gen_rtx_REG (vecmode, REGNO (operands[0]));

  /* Load up the value into the low element.  We must ensure that the other
     elements are valid floats -- zero is the easiest such value.  */
  if (MEM_P (input))
    {
      if (vecmode == V4SFmode)
	emit_insn (gen_vec_setv4sf_0 (value, CONST0_RTX (V4SFmode), input));
      else
	emit_insn (gen_sse2_loadlpd (value, CONST0_RTX (V2DFmode), input));
    }
  else
    {
      input = gen_rtx_REG (vecmode, REGNO (input));
      emit_move_insn (value, CONST0_RTX (vecmode));
      if (vecmode == V4SFmode)
	emit_insn (gen_sse_movss_v4sf (value, value, input));
      else
	emit_insn (gen_sse2_movsd_v2df (value, value, input));
    }

  emit_move_insn (large, two31);
  emit_move_insn (zero_or_two31, MEM_P (two31) ? large : two31);

  /* large = (2**31 <= value) ? all-ones : 0, per element.  */
  x = gen_rtx_fmt_ee (LE, vecmode, large, value);
  emit_insn (gen_rtx_SET (large, x));

  /* zero_or_two31 = large ? 2**31 : 0.  */
  x = gen_rtx_AND (vecmode, zero_or_two31, large);
  emit_insn (gen_rtx_SET (zero_or_two31, x));

  /* Bring the value into signed range before the conversion.  */
  x = gen_rtx_MINUS (vecmode, value, zero_or_two31);
  emit_insn (gen_rtx_SET (value, x));

  /* Turn the mask into 0x80000000 per element, for the final XOR.  */
  large = gen_rtx_REG (V4SImode, REGNO (large));
  emit_insn (gen_ashlv4si3 (large, large, GEN_INT (31)));

  x = gen_rtx_REG (V4SImode, REGNO (value));
  if (vecmode == V4SFmode)
    emit_insn (gen_fix_truncv4sfv4si2 (x, value));
  else
    emit_insn (gen_sse2_cvttpd2dq (x, value));
  value = x;

  /* Add 2**31 back into the result by flipping the sign bit.  */
  emit_insn (gen_xorv4si3 (value, value, large));
}
1901 :
1902 : /* Convert an unsigned DImode value into a DFmode, using only SSE.
1903 : Expects the 64-bit DImode to be supplied in a pair of integral
1904 : registers. Requires SSE2; will use SSE3 if available. For x86_32,
1905 : -mfpmath=sse, !optimize_size only. */
1906 :
void
ix86_expand_convert_uns_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE bias_lo_rvt, bias_hi_rvt;
  rtx int_xmm, fp_xmm;
  rtx biases, exponents;
  rtx x;

  /* Get the 64-bit input into the low half of an SSE register,
     choosing the cheapest transfer the target supports.  */
  int_xmm = gen_reg_rtx (V4SImode);
  if (TARGET_INTER_UNIT_MOVES_TO_VEC)
    emit_insn (gen_movdi_to_sse (int_xmm, input));
  else if (TARGET_SSE_SPLIT_REGS)
    {
      emit_clobber (int_xmm);
      emit_move_insn (gen_lowpart (DImode, int_xmm), input);
    }
  else
    {
      x = gen_reg_rtx (V2DImode);
      ix86_expand_vector_init_one_nonzero (false, V2DImode, x, input, 0);
      emit_move_insn (int_xmm, gen_lowpart (V4SImode, x));
    }

  x = gen_rtx_CONST_VECTOR (V4SImode,
			    gen_rtvec (4, GEN_INT (0x43300000UL),
				       GEN_INT (0x45300000UL),
				       const0_rtx, const0_rtx));
  exponents = validize_mem (force_const_mem (V4SImode, x));

  /* int_xmm = {0x45300000UL, fp_xmm/hi, 0x43300000, fp_xmm/lo } */
  emit_insn (gen_vec_interleave_lowv4si (int_xmm, int_xmm, exponents));

  /* Concatenating (juxtaposing) (0x43300000UL ## fp_value_low_xmm)
     yields a valid DF value equal to (0x1.0p52 + double(fp_value_lo_xmm)).
     Similarly (0x45300000UL ## fp_value_hi_xmm) yields
     (0x1.0p84 + double(fp_value_hi_xmm)).
     Note these exponents differ by 32.  */

  fp_xmm = copy_to_mode_reg (V2DFmode, gen_lowpart (V2DFmode, int_xmm));

  /* Subtract off those 0x1.0p52 and 0x1.0p84 biases, to produce values
     in [0,2**32-1] and [0]+[2**32,2**64-1] respectively.  */
  real_ldexp (&bias_lo_rvt, &dconst1, 52);
  real_ldexp (&bias_hi_rvt, &dconst1, 84);
  biases = const_double_from_real_value (bias_lo_rvt, DFmode);
  x = const_double_from_real_value (bias_hi_rvt, DFmode);
  biases = gen_rtx_CONST_VECTOR (V2DFmode, gen_rtvec (2, biases, x));
  biases = validize_mem (force_const_mem (V2DFmode, biases));
  emit_insn (gen_subv2df3 (fp_xmm, fp_xmm, biases));

  /* Add the upper and lower DFmode values together.  */
  if (TARGET_SSE3)
    emit_insn (gen_sse3_haddv2df3 (fp_xmm, fp_xmm, fp_xmm));
  else
    {
      x = copy_to_mode_reg (V2DFmode, fp_xmm);
      emit_insn (gen_vec_interleave_highv2df (fp_xmm, fp_xmm, fp_xmm));
      emit_insn (gen_addv2df3 (fp_xmm, fp_xmm, x));
    }

  /* The sum in element 0 of fp_xmm is the converted result.  */
  ix86_expand_vector_extract (false, target, fp_xmm, 0);
}
1969 :
1970 : /* Not used, but eases macroization of patterns. */
void
ix86_expand_convert_uns_sixf_sse (rtx, rtx)
{
  /* Stub only referenced so expander macroization stays uniform;
     it must never be reached at expand time.  */
  gcc_unreachable ();
}
1976 :
1977 : static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
1978 :
1979 : /* Convert an unsigned SImode value into a DFmode. Only currently used
1980 : for SSE, but applicable anywhere. */
1981 :
void
ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO31r;
  rtx x, fp;

  /* Bias INPUT by -2**31 so it fits a signed SImode, convert, then
     add 2**31.0 back in DFmode (exact: DFmode has > 32 mantissa bits).  */
  x = expand_simple_binop (SImode, PLUS, input, GEN_INT (-2147483647 - 1),
			   NULL, 1, OPTAB_DIRECT);

  fp = gen_reg_rtx (DFmode);
  emit_insn (gen_floatsidf2 (fp, x));

  real_ldexp (&TWO31r, &dconst1, 31);
  x = const_double_from_real_value (TWO31r, DFmode);

  x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
    x = ix86_expand_sse_fabs (x, NULL);

  if (x != target)
    emit_move_insn (target, x);
}
2006 :
2007 : /* Convert a signed DImode value into a DFmode. Only used for SSE in
2008 : 32-bit mode; otherwise we have a direct convert instruction. */
2009 :
void
ix86_expand_convert_sign_didf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE TWO32r;
  rtx fp_lo, fp_hi, x;

  fp_lo = gen_reg_rtx (DFmode);
  fp_hi = gen_reg_rtx (DFmode);

  /* Convert the signed high word ...  */
  emit_insn (gen_floatsidf2 (fp_hi, gen_highpart (SImode, input)));

  /* ... scale it by 2**32 ...  */
  real_ldexp (&TWO32r, &dconst1, 32);
  x = const_double_from_real_value (TWO32r, DFmode);
  fp_hi = expand_simple_binop (DFmode, MULT, fp_hi, x, fp_hi, 0, OPTAB_DIRECT);

  /* ... convert the low word as unsigned ...  */
  ix86_expand_convert_uns_sidf_sse (fp_lo, gen_lowpart (SImode, input));

  /* ... and combine the halves.  */
  x = expand_simple_binop (DFmode, PLUS, fp_hi, fp_lo, target,
			   0, OPTAB_DIRECT);
  if (x != target)
    emit_move_insn (target, x);
}
2032 :
2033 : /* Convert an unsigned SImode value into a SFmode, using only SSE.
2034 : For x86_32, -mfpmath=sse, !optimize_size only. */
void
ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
{
  REAL_VALUE_TYPE ONE16r;
  rtx fp_hi, fp_lo, int_hi, int_lo, x;

  /* Split INPUT into 16-bit halves (each is exactly representable in
     SFmode), convert both, then recombine as fp_hi * 2**16 + fp_lo.  */
  real_ldexp (&ONE16r, &dconst1, 16);
  x = const_double_from_real_value (ONE16r, SFmode);
  int_lo = expand_simple_binop (SImode, AND, input, GEN_INT(0xffff),
				NULL, 0, OPTAB_DIRECT);
  int_hi = expand_simple_binop (SImode, LSHIFTRT, input, GEN_INT(16),
				NULL, 0, OPTAB_DIRECT);
  fp_hi = gen_reg_rtx (SFmode);
  fp_lo = gen_reg_rtx (SFmode);
  emit_insn (gen_floatsisf2 (fp_hi, int_hi));
  emit_insn (gen_floatsisf2 (fp_lo, int_lo));
  if (TARGET_FMA)
    {
      /* One fused multiply-add: fp_hi * 2**16 + fp_lo.  */
      x = validize_mem (force_const_mem (SFmode, x));
      fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
      emit_move_insn (target, fp_hi);
    }
  else
    {
      fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
				   0, OPTAB_DIRECT);
      fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
				   0, OPTAB_DIRECT);
      if (!rtx_equal_p (target, fp_hi))
	emit_move_insn (target, fp_hi);
    }
}
2067 :
2068 : /* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
2069 : a vector of unsigned ints VAL to vector of floats TARGET. */
2070 :
void
ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
{
  rtx tmp[8];
  REAL_VALUE_TYPE TWO16r;
  machine_mode intmode = GET_MODE (val);
  machine_mode fltmode = GET_MODE (target);
  rtx (*cvt) (rtx, rtx);

  /* Same scheme as ix86_expand_convert_uns_sisf_sse, vectorized:
     convert the low and high 16-bit halves of each lane separately
     and recombine as hi * 2**16 + lo.  */
  if (intmode == V4SImode)
    cvt = gen_floatv4siv4sf2;
  else
    cvt = gen_floatv8siv8sf2;
  /* tmp[1] = val & 0xffff, tmp[2] = val >> 16 (per lane).  */
  tmp[0] = ix86_build_const_vector (intmode, 1, GEN_INT (0xffff));
  tmp[0] = force_reg (intmode, tmp[0]);
  tmp[1] = expand_simple_binop (intmode, AND, val, tmp[0], NULL_RTX, 1,
				OPTAB_DIRECT);
  tmp[2] = expand_simple_binop (intmode, LSHIFTRT, val, GEN_INT (16),
				NULL_RTX, 1, OPTAB_DIRECT);
  /* tmp[3] = float (lo), tmp[4] = float (hi).  */
  tmp[3] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[3], tmp[1]));
  tmp[4] = gen_reg_rtx (fltmode);
  emit_insn (cvt (tmp[4], tmp[2]));
  /* tmp[5] = vector of 2**16.  */
  real_ldexp (&TWO16r, &dconst1, 16);
  tmp[5] = const_double_from_real_value (TWO16r, SFmode);
  tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
  if (TARGET_FMA)
    {
      tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
      emit_move_insn (target, tmp[6]);
    }
  else
    {
      tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
				    NULL_RTX, 1, OPTAB_DIRECT);
      tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
				    target, 1, OPTAB_DIRECT);
      if (tmp[7] != target)
	emit_move_insn (target, tmp[7]);
    }
}
2112 :
2113 : /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
2114 : pattern can be used on it instead of fixuns_trunc*.
2115 : This is done by doing just signed conversion if < 0x1p31, and otherwise by
2116 : subtracting 0x1p31 first and xoring in 0x80000000 from *XORP afterwards. */
2117 :
rtx
ix86_expand_adjust_ufix_to_sfix_si (rtx val, rtx *xorp)
{
  REAL_VALUE_TYPE TWO31r;
  rtx two31r, tmp[4];
  machine_mode mode = GET_MODE (val);
  machine_mode scalarmode = GET_MODE_INNER (mode);
  machine_mode intmode = GET_MODE_SIZE (mode) == 32 ? V8SImode : V4SImode;
  rtx (*cmp) (rtx, rtx, rtx, rtx);
  int i;

  for (i = 0; i < 3; i++)
    tmp[i] = gen_reg_rtx (mode);
  /* two31r = vector of 0x1.0p31 in the element mode.  */
  real_ldexp (&TWO31r, &dconst1, 31);
  two31r = const_double_from_real_value (TWO31r, scalarmode);
  two31r = ix86_build_const_vector (mode, 1, two31r);
  two31r = force_reg (mode, two31r);
  /* Pick the mask-compare pattern for the vector mode.  */
  switch (mode)
    {
    case E_V8SFmode: cmp = gen_avx_maskcmpv8sf3; break;
    case E_V4SFmode: cmp = gen_sse_maskcmpv4sf3; break;
    case E_V4DFmode: cmp = gen_avx_maskcmpv4df3; break;
    case E_V2DFmode: cmp = gen_sse2_maskcmpv2df3; break;
    default: gcc_unreachable ();
    }
  /* tmp[0] = all-ones mask in lanes where 0x1.0p31 <= val.  */
  tmp[3] = gen_rtx_LE (mode, two31r, val);
  emit_insn (cmp (tmp[0], two31r, val, tmp[3]));
  /* tmp[1] = 0x1.0p31 in those lanes, 0.0 elsewhere.  */
  tmp[1] = expand_simple_binop (mode, AND, tmp[0], two31r, tmp[1],
				0, OPTAB_DIRECT);
  /* *XORP = 0x80000000 in the adjusted lanes (mask shifted left 31),
     so the caller can flip the sign bit back after the signed
     conversion.  */
  if (intmode == V4SImode || TARGET_AVX2)
    *xorp = expand_simple_binop (intmode, ASHIFT,
				 gen_lowpart (intmode, tmp[0]),
				 GEN_INT (31), NULL_RTX, 0,
				 OPTAB_DIRECT);
  else
    {
      /* No V8SImode shift available here; AND with the constant
	 0x80000000 vector instead.  */
      rtx two31 = gen_int_mode (HOST_WIDE_INT_1U << 31, SImode);
      two31 = ix86_build_const_vector (intmode, 1, two31);
      *xorp = expand_simple_binop (intmode, AND,
				   gen_lowpart (intmode, tmp[0]),
				   two31, NULL_RTX, 0,
				   OPTAB_DIRECT);
    }
  /* Return VAL with 0x1.0p31 subtracted in the too-large lanes.  */
  return expand_simple_binop (mode, MINUS, val, tmp[1], tmp[2],
			      0, OPTAB_DIRECT);
}
2164 :
2165 : /* Generate code for floating point ABS or NEG. */
2166 :
void
ix86_expand_fp_absneg_operator (enum rtx_code code, machine_mode mode,
				rtx operands[])
{
  rtx set, dst, src;
  bool use_sse = false;
  bool vector_mode = VECTOR_MODE_P (mode);
  machine_mode vmode = mode;
  rtvec par;

  /* Decide whether SSE is used, and which vector mode will carry the
     sign-bit mask for a scalar operation.  */
  switch (mode)
    {
    case E_HFmode:
      use_sse = true;
      vmode = V8HFmode;
      break;
    case E_BFmode:
      use_sse = true;
      vmode = V8BFmode;
      break;
    case E_SFmode:
      use_sse = TARGET_SSE_MATH && TARGET_SSE;
      vmode = V4SFmode;
      break;
    case E_DFmode:
      use_sse = TARGET_SSE_MATH && TARGET_SSE2;
      vmode = V2DFmode;
      break;
    default:
      use_sse = vector_mode || mode == TFmode;
      break;
    }

  dst = operands[0];
  src = operands[1];

  set = gen_rtx_fmt_e (code, mode, src);
  set = gen_rtx_SET (dst, set);

  if (use_sse)
    {
      rtx mask, use, clob;

      /* NEG and ABS performed with SSE use bitwise mask operations.
	 Create the appropriate mask now.  */
      mask = ix86_build_signbit_mask (vmode, vector_mode, code == ABS);
      use = gen_rtx_USE (VOIDmode, mask);
      if (vector_mode || mode == TFmode)
	par = gen_rtvec (2, set, use);
      else
	{
	  /* NOTE(review): the scalar forms also carry a flags clobber —
	     presumably to allow integer-unit insn alternatives; confirm
	     against the corresponding *absneg patterns in i386.md.  */
	  clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
	  par = gen_rtvec (3, set, use, clob);
	}
    }
  else
    {
      rtx clob;

      /* Changing of sign for FP values is doable using integer unit too.  */
      clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
      par = gen_rtvec (2, set, clob);
    }

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
2233 :
2234 : /* Deconstruct a floating point ABS or NEG operation
2235 : with integer registers into integer operations. */
2236 :
void
ix86_split_fp_absneg_operator (enum rtx_code code, machine_mode mode,
			       rtx operands[])
{
  enum rtx_code absneg_op;
  rtx dst, set;

  /* Two-address form: destination and source must already match.  */
  gcc_assert (operands_match_p (operands[0], operands[1]));

  switch (mode)
    {
    case E_SFmode:
      /* Sign bit is bit 31 of the single SImode word: AND clears it
	 (ABS), XOR flips it (NEG).  */
      dst = gen_lowpart (SImode, operands[0]);

      if (code == ABS)
	{
	  set = gen_int_mode (0x7fffffff, SImode);
	  absneg_op = AND;
	}
      else
	{
	  set = gen_int_mode (0x80000000, SImode);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    case E_DFmode:
      if (TARGET_64BIT)
	{
	  /* Address bit 63 directly via a one-bit ZERO_EXTRACT:
	     store 0 for ABS, the bit's complement for NEG.  */
	  dst = gen_lowpart (DImode, operands[0]);
	  dst = gen_rtx_ZERO_EXTRACT (DImode, dst, const1_rtx, GEN_INT (63));

	  if (code == ABS)
	    set = const0_rtx;
	  else
	    set = gen_rtx_NOT (DImode, dst);
	}
      else
	{
	  /* In 32-bit mode the sign bit lives in the high SImode word.  */
	  dst = gen_highpart (SImode, operands[0]);

	  if (code == ABS)
	    {
	      set = gen_int_mode (0x7fffffff, SImode);
	      absneg_op = AND;
	    }
	  else
	    {
	      set = gen_int_mode (0x80000000, SImode);
	      absneg_op = XOR;
	    }
	  set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
	}
      break;

    case E_XFmode:
      /* The sign+exponent word of the 80-bit value is in the second
	 SImode subword on 64-bit targets, the third on 32-bit.  */
      dst = gen_rtx_REG (SImode,
			 REGNO (operands[0]) + (TARGET_64BIT ? 1 : 2));
      if (code == ABS)
	{
	  set = GEN_INT (0x7fff);
	  absneg_op = AND;
	}
      else
	{
	  set = GEN_INT (0x8000);
	  absneg_op = XOR;
	}
      set = gen_rtx_fmt_ee (absneg_op, SImode, dst, set);
      break;

    default:
      gcc_unreachable ();
    }

  set = gen_rtx_SET (dst, set);

  /* The integer AND/XOR clobbers the flags.  */
  rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
  rtvec par = gen_rtvec (2, set, clob);

  emit_insn (gen_rtx_PARALLEL (VOIDmode, par));
}
2320 :
/* Expand a copysign operation.  Special case operand 2 (the sign
   source) being a constant.  */
2322 :
void
ix86_expand_copysign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, op2, op3;

  mode = GET_MODE (operands[0]);

  /* Vector mode that carries the sign-bit masks for this scalar mode.  */
  switch (mode)
    {
    case E_HFmode:
      vmode = V8HFmode;
      break;
    case E_BFmode:
      vmode = V8BFmode;
      break;
    case E_SFmode:
      vmode = V4SFmode;
      break;
    case E_DFmode:
      vmode = V2DFmode;
      break;
    case E_TFmode:
      vmode = mode;
      break;
    default:
      gcc_unreachable();
    }

  /* copysign (x, x) is just x.  */
  if (rtx_equal_p (operands[1], operands[2]))
    {
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* Compute through a vector-mode view of DEST when a subreg exists;
     otherwise into a fresh vector register, copied out at the end.  */
  dest = operands[0];
  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);
  else
    dest = NULL_RTX;
  op1 = lowpart_subreg (vmode, force_reg (mode, operands[1]), mode);
  mask = ix86_build_signbit_mask (vmode, TARGET_AVX512F && mode != HFmode, 0);

  if (CONST_DOUBLE_P (operands[2]))
    {
      if (real_isneg (CONST_DOUBLE_REAL_VALUE (operands[2])))
	/* Simplify b = copysign (a, negative) to b = mask | a.  */
	op1 = gen_rtx_IOR (vmode, mask, op1);
      else
	{
	  /* Simplify b = copysign (a, positive) to b = invert_mask & a.  */
	  rtx invert_mask
	    = ix86_build_signbit_mask (vmode,
				       TARGET_AVX512F && mode != HFmode,
				       true);
	  op1 = gen_rtx_AND (vmode, invert_mask, op1);
	}
      emit_move_insn (vdest, op1);
      if (dest)
	emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
      return;
    }
  else
    op0 = lowpart_subreg (vmode, force_reg (mode, operands[2]), mode);

  /* General case: (op1 & ~signbit) | (op2 & signbit).  */
  op2 = gen_reg_rtx (vmode);
  op3 = gen_reg_rtx (vmode);
  rtx invert_mask;
  /* NB: Generate vmovdqa, vpandn, vpand, vpor for AVX and generate pand,
     pand, por for SSE.  */
  if (TARGET_AVX)
    invert_mask = gen_rtx_NOT (vmode, mask);
  else
    invert_mask = ix86_build_signbit_mask (vmode,
					   TARGET_AVX512F && mode != HFmode,
					   true);
  emit_move_insn (op2, gen_rtx_AND (vmode, invert_mask, op1));
  emit_move_insn (op3, gen_rtx_AND (vmode, mask, op0));
  emit_move_insn (vdest, gen_rtx_IOR (vmode, op2, op3));
  if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
}
2406 :
2407 : /* Expand an xorsign operation. */
2408 :
void
ix86_expand_xorsign (rtx operands[])
{
  machine_mode mode, vmode;
  rtx dest, vdest, op0, op1, mask, x, temp;

  dest = operands[0];
  op0 = operands[1];
  op1 = operands[2];

  mode = GET_MODE (dest);

  /* Vector mode carrying the sign-bit mask for this scalar mode.  */
  switch (mode)
    {
    case E_HFmode:
      vmode = V8HFmode;
      break;
    case E_BFmode:
      vmode = V8BFmode;
      break;
    case E_SFmode:
      vmode = V4SFmode;
      break;
    case E_DFmode:
      vmode = V2DFmode;
      break;
    default:
      gcc_unreachable ();
      break;
    }

  /* temp = op1 & signbit; result = op0 ^ temp.  */
  temp = gen_reg_rtx (vmode);
  mask = ix86_build_signbit_mask (vmode, 0, 0);

  op1 = lowpart_subreg (vmode, force_reg (mode, op1), mode);
  x = gen_rtx_AND (vmode, op1, mask);
  emit_insn (gen_rtx_SET (temp, x));

  op0 = lowpart_subreg (vmode, force_reg (mode, op0), mode);
  x = gen_rtx_XOR (vmode, temp, op0);

  /* Write through a vector-mode view of DEST when a subreg exists;
     otherwise compute into a fresh register and copy out.  */
  vdest = lowpart_subreg (vmode, dest, mode);
  if (vdest == NULL_RTX)
    vdest = gen_reg_rtx (vmode);
  else
    dest = NULL_RTX;
  emit_insn (gen_rtx_SET (vdest, x));

  if (dest)
    emit_move_insn (dest, lowpart_subreg (mode, vdest, vmode));
}
2460 :
2461 : static rtx ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1);
2462 :
void
ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
{
  machine_mode mode = GET_MODE (op0);
  rtx tmp;

  /* Handle special case - vector comparison with boolean result, transform
     it using ptest instruction or vpcmpeq + kortest.  */
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT
      || (mode == TImode && !TARGET_64BIT)
      || mode == OImode
      || GET_MODE_SIZE (mode) == 64)
    {
      unsigned msize = GET_MODE_SIZE (mode);
      machine_mode p_mode
	= msize == 64 ? V16SImode : msize == 32 ? V4DImode : V2DImode;
      /* kortest set CF when result is 0xFFFF (op0 == op1).  */
      rtx flag = gen_rtx_REG (msize == 64 ? CCCmode : CCZmode, FLAGS_REG);

      gcc_assert (code == EQ || code == NE);

      /* Using vpcmpeq zmm zmm k + kortest for 512-bit vectors.  */
      if (msize == 64)
	{
	  if (mode != V16SImode)
	    {
	      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
	      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
	    }

	  tmp = gen_reg_rtx (HImode);
	  emit_insn (gen_avx512f_cmpv16si3 (tmp, op0, op1, GEN_INT (0)));
	  emit_insn (gen_kortesthi_ccc (tmp, tmp));
	}
      /* Using ptest for 128/256-bit vectors.  */
      else
	{
	  if (GET_MODE_CLASS (mode) != MODE_VECTOR_INT)
	    {
	      op0 = lowpart_subreg (p_mode, force_reg (mode, op0), mode);
	      op1 = lowpart_subreg (p_mode, force_reg (mode, op1), mode);
	      mode = p_mode;
	    }

	  /* Generate XOR since we can't check that one operand is zero
	     vector.  */
	  tmp = gen_reg_rtx (mode);
	  rtx ops[3] = { tmp, op0, op1 };
	  ix86_expand_vector_logical_operator (XOR, mode, ops);
	  tmp = gen_lowpart (p_mode, tmp);
	  emit_insn (gen_rtx_SET (gen_rtx_REG (CCZmode, FLAGS_REG),
				  gen_rtx_UNSPEC (CCZmode,
						  gen_rtvec (2, tmp, tmp),
						  UNSPEC_PTEST)));
	}
      /* Branch on the flag register set above.  */
      tmp = gen_rtx_fmt_ee (code, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;
    }

  switch (mode)
    {
    case E_HFmode:
    case E_SFmode:
    case E_DFmode:
    case E_XFmode:
    case E_QImode:
    case E_HImode:
    case E_SImode:
      simple:
      /* A single compare followed by a conditional jump.  */
      tmp = ix86_expand_compare (code, op0, op1);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      return;

    case E_BFmode:
      /* Direct BFmode compares need AVX10.2 and non-trapping math;
	 otherwise they were extended to SFmode before reaching here.  */
      gcc_assert (TARGET_AVX10_2 && !flag_trapping_math);
      goto simple;

    case E_DImode:
      if (TARGET_64BIT)
	goto simple;
      /* FALLTHRU */
    case E_TImode:
      /* DI and TI mode equality/inequality comparisons may be performed
	 on SSE registers.  Avoid splitting them, except when optimizing
	 for size.  */
      if ((code == EQ || code == NE)
	  && !optimize_insn_for_size_p ())
	goto simple;

      /* Expand DImode branch into multiple compare+branch.  */
      {
	rtx lo[2], hi[2];
	rtx_code_label *label2;
	enum rtx_code code1, code2, code3;
	machine_mode submode;

	/* Canonicalize so any constant is in op1.  */
	if (CONSTANT_P (op0) && !CONSTANT_P (op1))
	  {
	    std::swap (op0, op1);
	    code = swap_condition (code);
	  }

	split_double_mode (mode, &op0, 1, lo+0, hi+0);
	split_double_mode (mode, &op1, 1, lo+1, hi+1);

	submode = mode == DImode ? SImode : DImode;

	/* If we are doing less-than or greater-or-equal-than,
	   op1 is a constant and the low word is zero, then we can just
	   examine the high word.  Similarly for low word -1 and
	   less-or-equal-than or greater-than.  */

	if (CONST_INT_P (hi[1]))
	  switch (code)
	    {
	    case LT: case LTU: case GE: case GEU:
	      if (lo[1] == const0_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    case LE: case LEU: case GT: case GTU:
	      if (lo[1] == constm1_rtx)
		{
		  ix86_expand_branch (code, hi[0], hi[1], label);
		  return;
		}
	      break;
	    default:
	      break;
	    }

	/* Emulate comparisons that do not depend on Zero flag with
	   double-word subtraction.  Note that only Overflow, Sign
	   and Carry flags are valid, so swap arguments and condition
	   of comparisons that would otherwise test Zero flag.  */

	switch (code)
	  {
	  case LE: case LEU: case GT: case GTU:
	    std::swap (lo[0], lo[1]);
	    std::swap (hi[0], hi[1]);
	    code = swap_condition (code);
	    /* FALLTHRU */

	  case LT: case LTU: case GE: case GEU:
	    {
	      bool uns = (code == LTU || code == GEU);
	      rtx (*sbb_insn) (machine_mode, rtx, rtx, rtx)
		= uns ? gen_sub3_carry_ccc : gen_sub3_carry_ccgz;

	      /* Legitimize operands for cmp and sbb patterns.  */
	      if (!nonimmediate_operand (lo[0], submode))
		lo[0] = force_reg (submode, lo[0]);
	      if (!x86_64_general_operand (lo[1], submode))
		lo[1] = force_reg (submode, lo[1]);

	      if (!register_operand (hi[0], submode))
		hi[0] = force_reg (submode, hi[0]);
	      if ((uns && !nonimmediate_operand (hi[1], submode))
		  || (!uns && !x86_64_general_operand (hi[1], submode)))
		hi[1] = force_reg (submode, hi[1]);

	      /* Compare the low words, then subtract-with-borrow the
		 high words into a scratch: the flags then describe the
		 whole double-word comparison.  */
	      emit_insn (gen_cmp_1 (submode, lo[0], lo[1]));

	      tmp = gen_rtx_SCRATCH (submode);
	      emit_insn (sbb_insn (submode, tmp, hi[0], hi[1]));

	      tmp = gen_rtx_REG (uns ? CCCmode : CCGZmode, FLAGS_REG);
	      ix86_expand_branch (code, tmp, const0_rtx, label);
	      return;
	    }

	  default:
	    break;
	  }

	/* Otherwise, we need two or three jumps.  */

	label2 = gen_label_rtx ();

	code1 = code;
	code2 = swap_condition (code);
	code3 = unsigned_condition (code);

	switch (code)
	  {
	  case LT: case GT: case LTU: case GTU:
	    break;

	  case LE:   code1 = LT;  code2 = GT;  break;
	  case GE:   code1 = GT;  code2 = LT;  break;
	  case LEU:  code1 = LTU; code2 = GTU; break;
	  case GEU:  code1 = GTU; code2 = LTU; break;

	  case EQ:   code1 = UNKNOWN; code2 = NE;  break;
	  case NE:   code2 = UNKNOWN; break;

	  default:
	    gcc_unreachable ();
	  }

	/*
	 * a < b =>
	 *    if (hi(a) < hi(b)) goto true;
	 *    if (hi(a) > hi(b)) goto false;
	 *    if (lo(a) < lo(b)) goto true;
	 *  false:
	 */

	if (code1 != UNKNOWN)
	  ix86_expand_branch (code1, hi[0], hi[1], label);
	if (code2 != UNKNOWN)
	  ix86_expand_branch (code2, hi[0], hi[1], label2);

	ix86_expand_branch (code3, lo[0], lo[1], label);

	if (code2 != UNKNOWN)
	  emit_label (label2);
	return;
      }

    default:
      /* Only flags-register compares remain; branch on them directly.  */
      gcc_assert (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC);
      goto simple;
    }
}
2697 :
2698 : /* Figure out whether to use unordered fp comparisons. */
2699 :
2700 : static bool
2701 1148001 : ix86_unordered_fp_compare (enum rtx_code code)
2702 : {
2703 1148001 : if (!TARGET_IEEE_FP)
2704 : return false;
2705 :
2706 1143619 : switch (code)
2707 : {
2708 : case LT:
2709 : case LE:
2710 : case GT:
2711 : case GE:
2712 : case LTGT:
2713 : return false;
2714 :
2715 : case EQ:
2716 : case NE:
2717 :
2718 : case UNORDERED:
2719 : case ORDERED:
2720 : case UNLT:
2721 : case UNLE:
2722 : case UNGT:
2723 : case UNGE:
2724 : case UNEQ:
2725 : return true;
2726 :
2727 0 : default:
2728 0 : gcc_unreachable ();
2729 : }
2730 : }
2731 :
2732 : /* Return a comparison we can do and that it is equivalent to
2733 : swap_condition (code) apart possibly from orderedness.
2734 : But, never change orderedness if TARGET_IEEE_FP, returning
2735 : UNKNOWN in that case if necessary. */
2736 :
2737 : static enum rtx_code
2738 37454 : ix86_fp_swap_condition (enum rtx_code code)
2739 : {
2740 37454 : switch (code)
2741 : {
2742 1847 : case GT: /* GTU - CF=0 & ZF=0 */
2743 1847 : return TARGET_IEEE_FP ? UNKNOWN : UNLT;
2744 533 : case GE: /* GEU - CF=0 */
2745 533 : return TARGET_IEEE_FP ? UNKNOWN : UNLE;
2746 446 : case UNLT: /* LTU - CF=1 */
2747 446 : return TARGET_IEEE_FP ? UNKNOWN : GT;
2748 6315 : case UNLE: /* LEU - CF=1 | ZF=1 */
2749 6315 : return TARGET_IEEE_FP ? UNKNOWN : GE;
2750 28313 : default:
2751 28313 : return swap_condition (code);
2752 : }
2753 : }
2754 :
2755 : /* Return cost of comparison CODE using the best strategy for performance.
2756 : All following functions do use number of instructions as a cost metrics.
2757 : In future this should be tweaked to compute bytes for optimize_size and
2758 : take into account performance of various instructions on various CPUs. */
2759 :
2760 : static int
2761 1146866 : ix86_fp_comparison_cost (enum rtx_code code)
2762 : {
2763 1146866 : int arith_cost;
2764 :
2765 : /* The cost of code using bit-twiddling on %ah. */
2766 1146866 : switch (code)
2767 : {
2768 : case UNLE:
2769 : case UNLT:
2770 : case LTGT:
2771 : case GT:
2772 : case GE:
2773 : case UNORDERED:
2774 : case ORDERED:
2775 : case UNEQ:
2776 : arith_cost = 4;
2777 : break;
2778 84147 : case LT:
2779 84147 : case NE:
2780 84147 : case EQ:
2781 84147 : case UNGE:
2782 84147 : arith_cost = TARGET_IEEE_FP ? 5 : 4;
2783 : break;
2784 25472 : case LE:
2785 25472 : case UNGT:
2786 1063530 : arith_cost = TARGET_IEEE_FP ? 6 : 4;
2787 : break;
2788 0 : default:
2789 0 : gcc_unreachable ();
2790 : }
2791 :
2792 1146866 : switch (ix86_fp_comparison_strategy (code))
2793 : {
2794 1146866 : case IX86_FPCMP_COMI:
2795 1146866 : return arith_cost > 4 ? 3 : 2;
2796 0 : case IX86_FPCMP_SAHF:
2797 0 : return arith_cost > 4 ? 4 : 3;
2798 : default:
2799 : return arith_cost;
2800 : }
2801 : }
2802 :
2803 : /* Swap, force into registers, or otherwise massage the two operands
2804 : to a fp comparison. The operands are updated in place; the new
2805 : comparison code is returned. */
2806 :
static enum rtx_code
ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  rtx op0 = *pop0, op1 = *pop1;
  machine_mode op_mode = GET_MODE (op0);
  bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (op_mode);

  /* Without a usable native BFmode compare, widen both operands to
     SFmode by shifting the 16 BFmode bits into the top half of an
     SImode value (BFmode is the high half of SFmode), then recurse
     to prepare the SFmode comparison.  */
  if (op_mode == BFmode && (!TARGET_AVX10_2 || flag_trapping_math))
    {
      rtx op = gen_lowpart (HImode, op0);
      if (CONST_INT_P (op))
	op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
					     op0, BFmode);
      else
	{
	  rtx t1 = gen_reg_rtx (SImode);
	  emit_insn (gen_zero_extendhisi2 (t1, op));
	  emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
	  op = gen_lowpart (SFmode, t1);
	}
      *pop0 = op;
      op = gen_lowpart (HImode, op1);
      if (CONST_INT_P (op))
	op = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
					     op1, BFmode);
      else
	{
	  rtx t1 = gen_reg_rtx (SImode);
	  emit_insn (gen_zero_extendhisi2 (t1, op));
	  emit_insn (gen_ashlsi3 (t1, t1, GEN_INT (16)));
	  op = gen_lowpart (SFmode, t1);
	}
      *pop1 = op;
      return ix86_prepare_fp_compare_args (code, pop0, pop1);
    }

  /* All of the unordered compare instructions only work on registers.
     The same is true of the fcomi compare instructions.  The XFmode
     compare instructions require registers except when comparing
     against zero or when converting operand 1 from fixed point to
     floating point.  */

  if (!is_sse
      && (unordered_compare
	  || (op_mode == XFmode
	      && ! (standard_80387_constant_p (op0) == 1
		    || standard_80387_constant_p (op1) == 1)
	      && GET_CODE (op1) != FLOAT)
	  || ix86_fp_comparison_strategy (code) == IX86_FPCMP_COMI))
    {
      op0 = force_reg (op_mode, op0);
      op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* %%% We only allow op1 in memory; op0 must be st(0).  So swap
	 things around if they appear profitable, otherwise force op0
	 into a register.  */

      if (standard_80387_constant_p (op0) == 0
	  || (MEM_P (op0)
	      && ! (standard_80387_constant_p (op1) == 0
		    || MEM_P (op1))))
	{
	  enum rtx_code new_code = ix86_fp_swap_condition (code);
	  if (new_code != UNKNOWN)
	    {
	      std::swap (op0, op1);
	      code = new_code;
	    }
	}

      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);

      if (CONSTANT_P (op1))
	{
	  int tmp = standard_80387_constant_p (op1);
	  if (tmp == 0)
	    /* Not a loadable 387 constant: materialize in memory.  */
	    op1 = validize_mem (force_const_mem (op_mode, op1));
	  else if (tmp == 1)
	    {
	      /* fld1/fldz-style constant.  NOTE(review): forced into a
		 register only when cmove is available — presumably for
		 later if-conversion; confirm against cmove expanders.  */
	      if (TARGET_CMOVE)
		op1 = force_reg (op_mode, op1);
	    }
	  else
	    op1 = force_reg (op_mode, op1);
	}
    }

  /* Try to rearrange the comparison to make it cheaper.  */
  if (ix86_fp_comparison_cost (code)
      > ix86_fp_comparison_cost (swap_condition (code))
      && (REG_P (op1) || can_create_pseudo_p ()))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
      if (!REG_P (op0))
	op0 = force_reg (op_mode, op0);
    }

  *pop0 = op0;
  *pop1 = op1;
  return code;
}
2913 :
/* Generate insn patterns to do a floating point compare of OPERANDS.

   CODE is the rtx comparison code; OP0 and OP1 are the operands, which
   may be legitimized (and possibly swapped, with CODE adjusted) by
   ix86_prepare_fp_compare_args.  Emit the comparison insn(s) setting the
   flags register and return the flags test rtx that the flags consumer
   (bcc, scc or cmov) should use.  */

static rtx
ix86_expand_fp_compare (enum rtx_code code, rtx op0, rtx op1)
{
  bool unordered_compare = ix86_unordered_fp_compare (code);
  machine_mode cmp_mode;
  rtx tmp, scratch;

  code = ix86_prepare_fp_compare_args (code, &op0, &op1);

  tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
  if (unordered_compare)
    /* Wrap in UNSPEC_NOTRAP so a quiet comparison that does not trap
       on NaNs is emitted.  */
    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);

  /* Do fcomi/sahf based test when profitable.  */
  switch (ix86_fp_comparison_strategy (code))
    {
    case IX86_FPCMP_COMI:
      /* Rebuild the bare COMPARE; the UNSPEC_NOTRAP wrapper is
	 re-applied below only when the mode supports it.  */
      tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
      /* We only have vcomisbf16; no vcomubf16 nor vcomxbf16.  */
      if (GET_MODE (op0) != E_BFmode)
	{
	  if (TARGET_AVX10_2 && (code == EQ || code == NE))
	    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_OPTCOMX);
	  if (unordered_compare)
	    tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
	}
      cmp_mode = CCFPmode;
      emit_insn (gen_rtx_SET (gen_rtx_REG (CCFPmode, FLAGS_REG), tmp));
      break;

    case IX86_FPCMP_SAHF:
      /* Store the FP status word into a scratch register with fnstsw,
	 then copy it into the flags with sahf.  */
      cmp_mode = CCFPmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));
      emit_insn (gen_x86_sahf_1 (scratch));
      break;

    case IX86_FPCMP_ARITH:
      /* Store the FP status word with fnstsw and test its condition
	 code bits with integer arithmetic.  The masks below select the
	 x87 condition bits in the high byte of the status word:
	 0x01 = C0, 0x04 = C2, 0x40 = C3, 0x45 = C3|C2|C0 — NOTE(review):
	 bit assignments per the x87 status-word layout; confirm against
	 the architecture manual.  */
      cmp_mode = CCNOmode;
      tmp = gen_rtx_UNSPEC (HImode, gen_rtvec (1, tmp), UNSPEC_FNSTSW);
      scratch = gen_reg_rtx (HImode);
      emit_insn (gen_rtx_SET (scratch, tmp));

      /* In the unordered case, we have to check C2 for NaN's, which
	 doesn't happen to work out to anything nice combination-wise.
	 So do some bit twiddling on the value we've got in AH to come
	 up with an appropriate set of condition codes.  */

      switch (code)
	{
	case GT:
	case UNGT:
	  if (code == GT || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x44)));
	      cmp_mode = CCmode;
	      code = GEU;
	    }
	  break;
	case LT:
	case UNLT:
	  if (code == LT && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, const1_rtx));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case GE:
	case UNGE:
	  if (code == GE || !TARGET_IEEE_FP)
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x05)));
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch, const1_rtx));
	      code = NE;
	    }
	  break;
	case LE:
	case UNLE:
	  if (code == LE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_addqi_ext_1 (scratch, scratch, constm1_rtx));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = LTU;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x45)));
	      code = NE;
	    }
	  break;
	case EQ:
	case UNEQ:
	  if (code == EQ && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_cmpqi_ext_3 (scratch, GEN_INT (0x40)));
	      cmp_mode = CCmode;
	      code = EQ;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = NE;
	    }
	  break;
	case NE:
	case LTGT:
	  if (code == NE && TARGET_IEEE_FP)
	    {
	      emit_insn (gen_andqi_ext_1 (scratch, scratch, GEN_INT (0x45)));
	      emit_insn (gen_xorqi_ext_1_cc (scratch, scratch,
					     GEN_INT (0x40)));
	      code = NE;
	    }
	  else
	    {
	      emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x40)));
	      code = EQ;
	    }
	  break;

	case UNORDERED:
	  /* Unordered iff C2 (0x04) is set.  */
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = NE;
	  break;
	case ORDERED:
	  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x04)));
	  code = EQ;
	  break;

	default:
	  gcc_unreachable ();
	}
      break;

    default:
      gcc_unreachable();
    }

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode,
			 gen_rtx_REG (cmp_mode, FLAGS_REG),
			 const0_rtx);
}
3083 :
/* Generate insn patterns to do an integer compare of OPERANDS.

   CODE is the comparison code, OP0 and OP1 the operands.  Emit the insn
   setting the flags register and return the flags test rtx the flags
   consumer (bcc, scc or cmov) should use.  */

static rtx
ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
{
  machine_mode cmpmode;
  rtx tmp, flags;

  /* Swap operands to emit carry flag comparison.  */
  if ((code == GTU || code == LEU)
      && nonimmediate_operand (op1, VOIDmode))
    {
      std::swap (op0, op1);
      code = swap_condition (code);
    }

  cmpmode = SELECT_CC_MODE (code, op0, op1);
  flags = gen_rtx_REG (cmpmode, FLAGS_REG);

  /* Attempt to use PTEST, if available, when testing vector modes for
     equality/inequality against zero.  The operand must be a zero-offset
     TImode paradoxical view of a 16-byte vector register.  */
  if (op1 == const0_rtx
      && SUBREG_P (op0)
      && cmpmode == CCZmode
      && SUBREG_BYTE (op0) == 0
      && REG_P (SUBREG_REG (op0))
      && VECTOR_MODE_P (GET_MODE (SUBREG_REG (op0)))
      && TARGET_SSE4_1
      && GET_MODE (op0) == TImode
      && GET_MODE_SIZE (GET_MODE (SUBREG_REG (op0))) == 16)
    {
      tmp = SUBREG_REG (op0);
      /* PTEST patterns have no FP-element variants; view half/bfloat
	 vectors as V8HI.  */
      if (GET_MODE (tmp) == V8HFmode || GET_MODE (tmp) == V8BFmode)
	tmp = gen_lowpart (V8HImode, tmp);
      tmp = gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, tmp, tmp), UNSPEC_PTEST);
    }
  else
    tmp = gen_rtx_COMPARE (cmpmode, op0, op1);

  /* This is very simple, but making the interface the same as in the
     FP case makes the rest of the code easier.  */
  emit_insn (gen_rtx_SET (flags, tmp));

  /* Return the test that should be put into the flags user, i.e.
     the bcc, scc, or cmov instruction.  */
  return gen_rtx_fmt_ee (code, VOIDmode, flags, const0_rtx);
}
3131 :
3132 : static rtx
3133 7688388 : ix86_expand_compare (enum rtx_code code, rtx op0, rtx op1)
3134 : {
3135 7688388 : rtx ret;
3136 :
3137 7688388 : if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_CC)
3138 133111 : ret = gen_rtx_fmt_ee (code, VOIDmode, op0, op1);
3139 :
3140 7555277 : else if (SCALAR_FLOAT_MODE_P (GET_MODE (op0)))
3141 : {
3142 571347 : gcc_assert (!DECIMAL_FLOAT_MODE_P (GET_MODE (op0)));
3143 571347 : ret = ix86_expand_fp_compare (code, op0, op1);
3144 : }
3145 : else
3146 6983930 : ret = ix86_expand_int_compare (code, op0, op1);
3147 :
3148 7688388 : return ret;
3149 : }
3150 :
3151 : void
3152 586372 : ix86_expand_setcc (rtx dest, enum rtx_code code, rtx op0, rtx op1)
3153 : {
3154 586372 : rtx ret;
3155 :
3156 586372 : gcc_assert (GET_MODE (dest) == QImode);
3157 :
3158 586372 : ret = ix86_expand_compare (code, op0, op1);
3159 586372 : PUT_MODE (ret, QImode);
3160 586372 : emit_insn (gen_rtx_SET (dest, ret));
3161 586372 : }
3162 :
/* Expand floating point op0 <=> op1, i.e.
   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : op0 > op1 ? 1 : -128.

   DEST receives the result.  OP2 selects the unordered-result handling:
   when OP2 is const0_rtx the branchy -1/0/1/-128 form above is emitted,
   otherwise OP2 is the value stored for an unordered comparison and a
   setcc/subtract sequence is used.  */

void
ix86_expand_fp_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
{
  /* The FNSTSW-based arithmetic strategy is not supported here.  */
  gcc_checking_assert (ix86_fp_comparison_strategy (GT) != IX86_FPCMP_ARITH);
  rtx zero = NULL_RTX;
  /* Pre-clear an SImode scratch before the compare when the setcc
     path below would otherwise need to zero-extend after it.  */
  if (op2 != const0_rtx
      && (TARGET_IEEE_FP || TARGET_ZERO_EXTEND_WITH_AND)
      && GET_MODE (dest) == SImode)
    zero = force_reg (SImode, const0_rtx);
  rtx gt = ix86_expand_fp_compare (GT, op0, op1);
  /* Labels: l0 = equal, l1 = greater (branchy form only),
     l2 = unordered (IEEE only), lend = join point.  */
  rtx l0 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
  rtx l1 = op2 == const0_rtx ? gen_label_rtx () : NULL_RTX;
  rtx l2 = TARGET_IEEE_FP ? gen_label_rtx () : NULL_RTX;
  rtx lend = gen_label_rtx ();
  rtx tmp;
  rtx_insn *jmp;
  if (l2)
    {
      /* Branch to the unordered label first; NaN operands are rare.  */
      rtx un = gen_rtx_fmt_ee (UNORDERED, VOIDmode,
			       gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, un,
				  gen_rtx_LABEL_REF (VOIDmode, l2), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability:: very_unlikely ());
    }
  if (op2 == const0_rtx)
    {
      /* Branchy form: test equal, then greater, falling through to the
	 less-than case; each arm stores its constant and joins at lend.  */
      rtx eq = gen_rtx_fmt_ee (UNEQ, VOIDmode,
			       gen_rtx_REG (CCFPmode, FLAGS_REG), const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, eq,
				  gen_rtx_LABEL_REF (VOIDmode, l0), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability::unlikely ());
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, gt,
				  gen_rtx_LABEL_REF (VOIDmode, l1), pc_rtx);
      jmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      add_reg_br_prob_note (jmp, profile_probability::even ());
      emit_move_insn (dest, constm1_rtx);
      emit_jump (lend);
      emit_label (l0);
      emit_move_insn (dest, const0_rtx);
      emit_jump (lend);
      emit_label (l1);
      emit_move_insn (dest, const1_rtx);
    }
  else
    {
      /* Branchless form: dest = (op0 > op1) - (op0 < op1), computed via
	 two setcc results that are zero-extended and subtracted.  */
      rtx lt_tmp = NULL_RTX;
      if (GET_MODE (dest) != SImode || !TARGET_ZERO_EXTEND_WITH_AND)
	{
	  lt_tmp = gen_reg_rtx (QImode);
	  ix86_expand_setcc (lt_tmp, UNLT, gen_rtx_REG (CCFPmode, FLAGS_REG),
			     const0_rtx);
	  if (GET_MODE (dest) != QImode)
	    {
	      tmp = gen_reg_rtx (GET_MODE (dest));
	      emit_insn (gen_rtx_SET (tmp,
				      gen_rtx_ZERO_EXTEND (GET_MODE (dest),
							   lt_tmp)));
	      lt_tmp = tmp;
	    }
	}
      rtx gt_tmp;
      if (zero)
	{
	  /* If TARGET_IEEE_FP and dest has SImode, emit SImode clear
	     before the floating point comparison and use setcc_si_slp
	     pattern to hide it from the combiner, so that it doesn't
	     undo it.  Similarly for TARGET_ZERO_EXTEND_WITH_AND, where
	     the ZERO_EXTEND normally emitted would need to be AND
	     with flags clobber.  */
	  tmp = ix86_expand_compare (GT, XEXP (gt, 0), const0_rtx);
	  PUT_MODE (tmp, QImode);
	  emit_insn (gen_setcc_si_slp (zero, tmp, zero));
	  gt_tmp = zero;
	}
      else
	{
	  gt_tmp = gen_reg_rtx (QImode);
	  ix86_expand_setcc (gt_tmp, GT, XEXP (gt, 0), const0_rtx);
	  if (GET_MODE (dest) != QImode)
	    {
	      tmp = gen_reg_rtx (GET_MODE (dest));
	      emit_insn (gen_rtx_SET (tmp,
				      gen_rtx_ZERO_EXTEND (GET_MODE (dest),
							   gt_tmp)));
	      gt_tmp = tmp;
	    }
	}
      if (lt_tmp)
	{
	  tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp,
				     dest, 0, OPTAB_DIRECT);
	  if (!rtx_equal_p (tmp, dest))
	    emit_move_insn (dest, tmp);
	}
      else
	{
	  /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't
	     do ZERO_EXTEND without clobbering flags.  */
	  tmp = ix86_expand_compare (UNLT, XEXP (gt, 0), const0_rtx);
	  PUT_MODE (tmp, SImode);
	  emit_insn (gen_subsi3_carry (dest, gt_tmp,
				       force_reg (GET_MODE (dest), const0_rtx),
				       XEXP (gt, 0), tmp));
	}
    }
  emit_jump (lend);
  if (l2)
    {
      /* Unordered result: -128 in the branchy form, otherwise OP2.  */
      emit_label (l2);
      emit_move_insn (dest, op2 == const0_rtx ? GEN_INT (-128) : op2);
    }
  emit_label (lend);
}
3281 :
/* Expand integral op0 <=> op1, i.e.
   dest = op0 == op1 ? 0 : op0 < op1 ? -1 : 1.

   DEST receives the result.  OP2 is a nonzero CONST_INT selecting the
   comparison signedness: 1 means unsigned (LTU/GTU with CCmode),
   any other value means signed (LT/GT with CCGCmode).  The result is
   computed branchlessly as (op0 > op1) - (op0 < op1).  */

void
ix86_expand_int_spaceship (rtx dest, rtx op0, rtx op1, rtx op2)
{
  gcc_assert (INTVAL (op2));
  /* zero1/zero2 are pre-cleared SImode scratches used only on
     TARGET_ZERO_EXTEND_WITH_AND targets (see comments below).  */
  rtx zero1 = NULL_RTX, zero2 = NULL_RTX;
  if (TARGET_ZERO_EXTEND_WITH_AND && GET_MODE (dest) == SImode)
    {
      zero1 = force_reg (SImode, const0_rtx);
      if (INTVAL (op2) != 1)
	zero2 = force_reg (SImode, const0_rtx);
    }

  /* Not using ix86_expand_int_compare here, so that it doesn't swap
     operands nor optimize CC mode - we need a mode usable for both
     LT and GT resp. LTU and GTU comparisons with the same unswapped
     operands.  */
  rtx flags = gen_rtx_REG (INTVAL (op2) != 1 ? CCGCmode : CCmode, FLAGS_REG);
  rtx tmp = gen_rtx_COMPARE (GET_MODE (flags), op0, op1);
  emit_insn (gen_rtx_SET (flags, tmp));
  /* Materialize the less-than result (0 or 1) in LT_TMP.  */
  rtx lt_tmp = NULL_RTX;
  if (zero2)
    {
      /* For TARGET_ZERO_EXTEND_WITH_AND, emit setcc_si_slp to avoid
	 ZERO_EXTEND.  */
      tmp = ix86_expand_compare (LT, flags, const0_rtx);
      PUT_MODE (tmp, QImode);
      emit_insn (gen_setcc_si_slp (zero2, tmp, zero2));
      lt_tmp = zero2;
    }
  else if (!zero1)
    {
      lt_tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (lt_tmp, INTVAL (op2) != 1 ? LT : LTU, flags,
			 const0_rtx);
      if (GET_MODE (dest) != QImode)
	{
	  tmp = gen_reg_rtx (GET_MODE (dest));
	  emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
							    lt_tmp)));
	  lt_tmp = tmp;
	}
    }
  /* Materialize the greater-than result (0 or 1) in GT_TMP.  */
  rtx gt_tmp;
  if (zero1)
    {
      /* For TARGET_ZERO_EXTEND_WITH_AND, emit setcc_si_slp to avoid
	 ZERO_EXTEND.  */
      tmp = ix86_expand_compare (INTVAL (op2) != 1 ? GT : GTU, flags,
				 const0_rtx);
      PUT_MODE (tmp, QImode);
      emit_insn (gen_setcc_si_slp (zero1, tmp, zero1));
      gt_tmp = zero1;
    }
  else
    {
      gt_tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (gt_tmp, INTVAL (op2) != 1 ? GT : GTU, flags,
			 const0_rtx);
      if (GET_MODE (dest) != QImode)
	{
	  tmp = gen_reg_rtx (GET_MODE (dest));
	  emit_insn (gen_rtx_SET (tmp, gen_rtx_ZERO_EXTEND (GET_MODE (dest),
							    gt_tmp)));
	  gt_tmp = tmp;
	}
    }
  if (lt_tmp)
    {
      /* dest = gt_tmp - lt_tmp yields 1, 0 or -1.  */
      tmp = expand_simple_binop (GET_MODE (dest), MINUS, gt_tmp, lt_tmp, dest,
				 0, OPTAB_DIRECT);
      if (!rtx_equal_p (tmp, dest))
	emit_move_insn (dest, tmp);
    }
  else
    {
      /* For TARGET_ZERO_EXTEND_WITH_AND emit sbb directly, as we can't
	 do ZERO_EXTEND without clobbering flags.  */
      tmp = ix86_expand_compare (LTU, flags, const0_rtx);
      PUT_MODE (tmp, SImode);
      emit_insn (gen_subsi3_carry (dest, gt_tmp,
				   force_reg (GET_MODE (dest), const0_rtx),
				   flags, tmp));
    }
}
3369 :
3370 : /* Expand comparison setting or clearing carry flag. Return true when
3371 : successful and set pop for the operation. */
3372 : static bool
3373 33919 : ix86_expand_carry_flag_compare (enum rtx_code code, rtx op0, rtx op1, rtx *pop)
3374 : {
3375 67838 : machine_mode mode
3376 33919 : = GET_MODE (op0) != VOIDmode ? GET_MODE (op0) : GET_MODE (op1);
3377 :
3378 : /* Do not handle double-mode compares that go through special path. */
3379 36154 : if (mode == (TARGET_64BIT ? TImode : DImode))
3380 : return false;
3381 :
3382 33909 : if (SCALAR_FLOAT_MODE_P (mode))
3383 : {
3384 1844 : rtx compare_op;
3385 1844 : rtx_insn *compare_seq;
3386 :
3387 1844 : gcc_assert (!DECIMAL_FLOAT_MODE_P (mode));
3388 :
3389 : /* Shortcut: following common codes never translate
3390 : into carry flag compares. */
3391 1844 : if (code == EQ || code == NE || code == UNEQ || code == LTGT
3392 : || code == ORDERED || code == UNORDERED)
3393 : return false;
3394 :
3395 : /* These comparisons require zero flag; swap operands so they won't. */
3396 : if ((code == GT || code == UNLE || code == LE || code == UNGT)
3397 1779 : && !TARGET_IEEE_FP)
3398 : {
3399 2 : std::swap (op0, op1);
3400 2 : code = swap_condition (code);
3401 : }
3402 :
3403 : /* Try to expand the comparison and verify that we end up with
3404 : carry flag based comparison. This fails to be true only when
3405 : we decide to expand comparison using arithmetic that is not
3406 : too common scenario. */
3407 1842 : start_sequence ();
3408 1842 : compare_op = ix86_expand_fp_compare (code, op0, op1);
3409 1842 : compare_seq = end_sequence ();
3410 :
3411 1842 : if (GET_MODE (XEXP (compare_op, 0)) == CCFPmode)
3412 1842 : code = ix86_fp_compare_code_to_integer (GET_CODE (compare_op));
3413 : else
3414 0 : code = GET_CODE (compare_op);
3415 :
3416 1842 : if (code != LTU && code != GEU)
3417 : return false;
3418 :
3419 63 : emit_insn (compare_seq);
3420 63 : *pop = compare_op;
3421 63 : return true;
3422 : }
3423 :
3424 32065 : if (!INTEGRAL_MODE_P (mode))
3425 : return false;
3426 :
3427 31929 : switch (code)
3428 : {
3429 : case LTU:
3430 : case GEU:
3431 : break;
3432 :
3433 : /* Convert a==0 into (unsigned)a<1. */
3434 28274 : case EQ:
3435 28274 : case NE:
3436 28274 : if (op1 != const0_rtx)
3437 : return false;
3438 10129 : op1 = const1_rtx;
3439 10129 : code = (code == EQ ? LTU : GEU);
3440 : break;
3441 :
3442 : /* Convert a>b into b<a or a>=b-1. */
3443 827 : case GTU:
3444 827 : case LEU:
3445 827 : if (CONST_INT_P (op1))
3446 : {
3447 785 : op1 = gen_int_mode (INTVAL (op1) + 1, GET_MODE (op0));
3448 : /* Bail out on overflow. We still can swap operands but that
3449 : would force loading of the constant into register. */
3450 785 : if (op1 == const0_rtx
3451 785 : || !x86_64_immediate_operand (op1, GET_MODE (op1)))
3452 0 : return false;
3453 785 : code = (code == GTU ? GEU : LTU);
3454 : }
3455 : else
3456 : {
3457 42 : std::swap (op0, op1);
3458 42 : code = (code == GTU ? LTU : GEU);
3459 : }
3460 : break;
3461 :
3462 : /* Convert a>=0 into (unsigned)a<0x80000000. */
3463 1300 : case LT:
3464 1300 : case GE:
3465 1300 : if (mode == DImode || op1 != const0_rtx)
3466 : return false;
3467 204 : op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3468 102 : code = (code == LT ? GEU : LTU);
3469 : break;
3470 833 : case LE:
3471 833 : case GT:
3472 833 : if (mode == DImode || op1 != constm1_rtx)
3473 : return false;
3474 0 : op1 = gen_int_mode (1 << (GET_MODE_BITSIZE (mode) - 1), mode);
3475 0 : code = (code == LE ? GEU : LTU);
3476 : break;
3477 :
3478 : default:
3479 : return false;
3480 : }
3481 : /* Swapping operands may cause constant to appear as first operand. */
3482 11753 : if (!nonimmediate_operand (op0, VOIDmode))
3483 : {
3484 0 : if (!can_create_pseudo_p ())
3485 : return false;
3486 0 : op0 = force_reg (mode, op0);
3487 : }
3488 11753 : *pop = ix86_expand_compare (code, op0, op1);
3489 11753 : gcc_assert (GET_CODE (*pop) == LTU || GET_CODE (*pop) == GEU);
3490 : return true;
3491 : }
3492 :
3493 : /* Expand conditional increment or decrement using adb/sbb instructions.
3494 : The default case using setcc followed by the conditional move can be
3495 : done by generic code. */
3496 : bool
3497 6816 : ix86_expand_int_addcc (rtx operands[])
3498 : {
3499 6816 : enum rtx_code code = GET_CODE (operands[1]);
3500 6816 : rtx flags;
3501 6816 : rtx (*insn) (machine_mode, rtx, rtx, rtx, rtx, rtx);
3502 6816 : rtx compare_op;
3503 6816 : rtx val = const0_rtx;
3504 6816 : bool fpcmp = false;
3505 6816 : machine_mode mode;
3506 6816 : rtx op0 = XEXP (operands[1], 0);
3507 6816 : rtx op1 = XEXP (operands[1], 1);
3508 :
3509 6816 : if (operands[3] != const1_rtx
3510 2809 : && operands[3] != constm1_rtx)
3511 : return false;
3512 4728 : if (!ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3513 : return false;
3514 1274 : code = GET_CODE (compare_op);
3515 :
3516 1274 : flags = XEXP (compare_op, 0);
3517 :
3518 1274 : if (GET_MODE (flags) == CCFPmode)
3519 : {
3520 4 : fpcmp = true;
3521 4 : code = ix86_fp_compare_code_to_integer (code);
3522 : }
3523 :
3524 1274 : if (code != LTU)
3525 : {
3526 733 : val = constm1_rtx;
3527 733 : if (fpcmp)
3528 4 : PUT_CODE (compare_op,
3529 : reverse_condition_maybe_unordered
3530 : (GET_CODE (compare_op)));
3531 : else
3532 729 : PUT_CODE (compare_op, reverse_condition (GET_CODE (compare_op)));
3533 : }
3534 :
3535 1274 : mode = GET_MODE (operands[0]);
3536 :
3537 : /* Construct either adc or sbb insn. */
3538 1274 : if ((code == LTU) == (operands[3] == constm1_rtx))
3539 : insn = gen_sub3_carry;
3540 : else
3541 521 : insn = gen_add3_carry;
3542 :
3543 1274 : emit_insn (insn (mode, operands[0], operands[2], val, flags, compare_op));
3544 :
3545 1274 : return true;
3546 : }
3547 :
3548 : bool
3549 438315 : ix86_expand_int_movcc (rtx operands[])
3550 : {
3551 438315 : enum rtx_code code = GET_CODE (operands[1]), compare_code;
3552 438315 : rtx_insn *compare_seq;
3553 438315 : rtx compare_op;
3554 438315 : machine_mode mode = GET_MODE (operands[0]);
3555 438315 : bool sign_bit_compare_p = false;
3556 438315 : bool negate_cc_compare_p = false;
3557 438315 : rtx op0 = XEXP (operands[1], 0);
3558 438315 : rtx op1 = XEXP (operands[1], 1);
3559 438315 : rtx op2 = operands[2];
3560 438315 : rtx op3 = operands[3];
3561 :
3562 438315 : if (GET_MODE (op0) == TImode
3563 422897 : || (GET_MODE (op0) == DImode
3564 102538 : && !TARGET_64BIT))
3565 : return false;
3566 :
3567 421801 : if (GET_MODE (op0) == BFmode
3568 421801 : && !ix86_fp_comparison_operator (operands[1], VOIDmode))
3569 : return false;
3570 :
3571 421801 : start_sequence ();
3572 421801 : compare_op = ix86_expand_compare (code, op0, op1);
3573 421801 : compare_seq = end_sequence ();
3574 :
3575 421801 : compare_code = GET_CODE (compare_op);
3576 :
3577 421801 : if ((op1 == const0_rtx && (code == GE || code == LT))
3578 379820 : || (op1 == constm1_rtx && (code == GT || code == LE)))
3579 : sign_bit_compare_p = true;
3580 :
3581 : /* op0 == op1 ? op0 : op3 is equivalent to op0 == op1 ? op1 : op3,
3582 : but if op1 is a constant, the latter form allows more optimizations,
3583 : either through the last 2 ops being constant handling, or the one
3584 : constant and one variable cases. On the other side, for cmov the
3585 : former might be better as we don't need to load the constant into
3586 : another register. */
3587 379820 : if (code == EQ && CONST_INT_P (op1) && rtx_equal_p (op0, op2))
3588 : op2 = op1;
3589 : /* Similarly for op0 != op1 ? op2 : op0 and op0 != op1 ? op2 : op1. */
3590 421287 : else if (code == NE && CONST_INT_P (op1) && rtx_equal_p (op0, op3))
3591 : op3 = op1;
3592 :
3593 : /* Don't attempt mode expansion here -- if we had to expand 5 or 6
3594 : HImode insns, we'd be swallowed in word prefix ops. */
3595 :
3596 4882 : if ((mode != HImode || TARGET_FAST_PREFIX)
3597 451124 : && (mode != (TARGET_64BIT ? TImode : DImode))
3598 421801 : && CONST_INT_P (op2)
3599 458743 : && CONST_INT_P (op3))
3600 : {
3601 29973 : rtx out = operands[0];
3602 29973 : HOST_WIDE_INT ct = INTVAL (op2);
3603 29973 : HOST_WIDE_INT cf = INTVAL (op3);
3604 29973 : HOST_WIDE_INT diff;
3605 :
3606 29973 : if ((mode == SImode
3607 16375 : || (TARGET_64BIT && mode == DImode))
3608 18435 : && (GET_MODE (op0) == SImode
3609 14370 : || (TARGET_64BIT && GET_MODE (op0) == DImode)))
3610 : {
3611 : /* Special case x != 0 ? -1 : y. */
3612 13176 : if (code == NE && op1 == const0_rtx && ct == -1)
3613 : {
3614 : negate_cc_compare_p = true;
3615 : std::swap (ct, cf);
3616 : code = EQ;
3617 : }
3618 13077 : else if (code == EQ && op1 == const0_rtx && cf == -1)
3619 29973 : negate_cc_compare_p = true;
3620 : }
3621 :
3622 29973 : diff = (unsigned HOST_WIDE_INT) ct - cf;
3623 : /* Make sure we can represent the difference between the two values. */
3624 29973 : if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
3625 438315 : return false;
3626 :
3627 : /* Sign bit compares are better done using shifts than we do by using
3628 : sbb. */
3629 29825 : if (sign_bit_compare_p
3630 29825 : || negate_cc_compare_p
3631 29825 : || ix86_expand_carry_flag_compare (code, op0, op1, &compare_op))
3632 : {
3633 : /* Detect overlap between destination and compare sources. */
3634 11176 : rtx tmp = out;
3635 :
3636 11176 : if (negate_cc_compare_p)
3637 : {
3638 280 : if (GET_MODE (op0) == DImode)
3639 106 : emit_insn (gen_x86_negdi_ccc (gen_reg_rtx (DImode), op0));
3640 : else
3641 174 : emit_insn (gen_x86_negsi_ccc (gen_reg_rtx (SImode),
3642 174 : gen_lowpart (SImode, op0)));
3643 :
3644 280 : tmp = gen_reg_rtx (mode);
3645 280 : if (mode == DImode)
3646 123 : emit_insn (gen_x86_movdicc_0_m1_neg (tmp));
3647 : else
3648 157 : emit_insn (gen_x86_movsicc_0_m1_neg (gen_lowpart (SImode,
3649 : tmp)));
3650 : }
3651 10896 : else if (!sign_bit_compare_p)
3652 : {
3653 10542 : rtx flags;
3654 10542 : bool fpcmp = false;
3655 :
3656 10542 : compare_code = GET_CODE (compare_op);
3657 :
3658 10542 : flags = XEXP (compare_op, 0);
3659 :
3660 10542 : if (GET_MODE (flags) == CCFPmode)
3661 : {
3662 59 : fpcmp = true;
3663 59 : compare_code
3664 59 : = ix86_fp_compare_code_to_integer (compare_code);
3665 : }
3666 :
3667 : /* To simplify rest of code, restrict to the GEU case. */
3668 10542 : if (compare_code == LTU)
3669 : {
3670 6001 : std::swap (ct, cf);
3671 6001 : compare_code = reverse_condition (compare_code);
3672 6001 : code = reverse_condition (code);
3673 : }
3674 : else
3675 : {
3676 4541 : if (fpcmp)
3677 59 : PUT_CODE (compare_op,
3678 : reverse_condition_maybe_unordered
3679 : (GET_CODE (compare_op)));
3680 : else
3681 4482 : PUT_CODE (compare_op,
3682 : reverse_condition (GET_CODE (compare_op)));
3683 : }
3684 :
3685 10542 : diff = (unsigned HOST_WIDE_INT) ct - cf;
3686 : /* Make sure we can represent the difference
3687 : between the two values. */
3688 10542 : if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
3689 : return false;
3690 :
3691 10541 : if (reg_overlap_mentioned_p (out, compare_op))
3692 0 : tmp = gen_reg_rtx (mode);
3693 :
3694 10541 : if (mode == DImode)
3695 2182 : emit_insn (gen_x86_movdicc_0_m1 (tmp, flags, compare_op));
3696 : else
3697 8359 : emit_insn (gen_x86_movsicc_0_m1 (gen_lowpart (SImode, tmp),
3698 : flags, compare_op));
3699 : }
3700 : else
3701 : {
3702 354 : if (code == GT || code == GE)
3703 153 : code = reverse_condition (code);
3704 : else
3705 : {
3706 201 : std::swap (ct, cf);
3707 :
3708 201 : diff = (unsigned HOST_WIDE_INT) ct - cf;
3709 : /* Make sure we can represent the difference
3710 : between the two values. */
3711 201 : if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
3712 : return false;
3713 : }
3714 349 : tmp = emit_store_flag (tmp, code, op0, op1, VOIDmode, 0, -1);
3715 : }
3716 :
3717 11170 : if (diff == 1)
3718 : {
3719 : /*
3720 : * cmpl op0,op1
3721 : * sbbl dest,dest
3722 : * [addl dest, ct]
3723 : *
3724 : * Size 5 - 8.
3725 : */
3726 1138 : if (ct)
3727 965 : tmp = expand_simple_binop (mode, PLUS,
3728 : tmp, GEN_INT (ct),
3729 : copy_rtx (tmp), 1, OPTAB_DIRECT);
3730 : }
3731 10032 : else if (cf == -1)
3732 : {
3733 : /*
3734 : * cmpl op0,op1
3735 : * sbbl dest,dest
3736 : * orl $ct, dest
3737 : *
3738 : * Size 8.
3739 : */
3740 595 : tmp = expand_simple_binop (mode, IOR,
3741 : tmp, GEN_INT (ct),
3742 : copy_rtx (tmp), 1, OPTAB_DIRECT);
3743 : }
3744 9437 : else if (diff == -1 && ct)
3745 : {
3746 : /*
3747 : * cmpl op0,op1
3748 : * sbbl dest,dest
3749 : * notl dest
3750 : * [addl dest, cf]
3751 : *
3752 : * Size 8 - 11.
3753 : */
3754 687 : tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3755 687 : if (cf)
3756 669 : tmp = expand_simple_binop (mode, PLUS,
3757 : copy_rtx (tmp), GEN_INT (cf),
3758 : copy_rtx (tmp), 1, OPTAB_DIRECT);
3759 : }
3760 : else
3761 : {
3762 : /*
3763 : * cmpl op0,op1
3764 : * sbbl dest,dest
3765 : * [notl dest]
3766 : * andl cf - ct, dest
3767 : * [addl dest, ct]
3768 : *
3769 : * Size 8 - 11.
3770 : */
3771 :
3772 8750 : if (cf == 0)
3773 : {
3774 895 : cf = ct;
3775 895 : ct = 0;
3776 895 : tmp = expand_simple_unop (mode, NOT, tmp, copy_rtx (tmp), 1);
3777 : }
3778 :
3779 8750 : HOST_WIDE_INT ival = (unsigned HOST_WIDE_INT) cf - ct;
3780 : /* Make sure we can represent the difference
3781 : between the two values. */
3782 8750 : if ((ival > 0) != ((ct < 0) != (cf < 0) ? ct < 0 : ct < cf))
3783 16668 : return false;
3784 :
3785 8750 : tmp = expand_simple_binop (mode, AND,
3786 : copy_rtx (tmp),
3787 8750 : gen_int_mode (ival, mode),
3788 : copy_rtx (tmp), 1, OPTAB_DIRECT);
3789 8750 : if (ct)
3790 7072 : tmp = expand_simple_binop (mode, PLUS,
3791 : copy_rtx (tmp), GEN_INT (ct),
3792 : copy_rtx (tmp), 1, OPTAB_DIRECT);
3793 : }
3794 :
3795 11170 : if (!rtx_equal_p (tmp, out))
3796 470 : emit_move_insn (copy_rtx (out), copy_rtx (tmp));
3797 :
3798 11170 : return true;
3799 : }
3800 :
3801 18649 : if (diff < 0)
3802 : {
3803 8848 : machine_mode cmp_mode = GET_MODE (op0);
3804 8848 : enum rtx_code new_code;
3805 :
3806 8848 : if (SCALAR_FLOAT_MODE_P (cmp_mode))
3807 : {
3808 54 : gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3809 :
3810 : /* We may be reversing a non-trapping
3811 : comparison to a trapping comparison. */
3812 104 : if (HONOR_NANS (cmp_mode) && flag_trapping_math
3813 41 : && code != EQ && code != NE
3814 95 : && code != ORDERED && code != UNORDERED)
3815 : new_code = UNKNOWN;
3816 : else
3817 13 : new_code = reverse_condition_maybe_unordered (code);
3818 : }
3819 : else
3820 8794 : new_code = ix86_reverse_condition (code, cmp_mode);
3821 8807 : if (new_code != UNKNOWN)
3822 : {
3823 8807 : std::swap (ct, cf);
3824 :
3825 8807 : diff = (unsigned HOST_WIDE_INT) ct - cf;
3826 : /* Make sure we can represent the difference
3827 : between the two values. */
3828 8807 : if ((diff > 0) != ((cf < 0) != (ct < 0) ? cf < 0 : cf < ct))
3829 : return false;
3830 :
3831 : code = new_code;
3832 : }
3833 : }
3834 :
3835 18649 : compare_code = UNKNOWN;
3836 18649 : if (GET_MODE_CLASS (GET_MODE (op0)) == MODE_INT
3837 16878 : && CONST_INT_P (op1))
3838 : {
3839 11029 : if (op1 == const0_rtx
3840 214 : && (code == LT || code == GE))
3841 : compare_code = code;
3842 11029 : else if (op1 == constm1_rtx)
3843 : {
3844 295 : if (code == LE)
3845 : compare_code = LT;
3846 295 : else if (code == GT)
3847 : compare_code = GE;
3848 : }
3849 : }
3850 :
3851 : /* Optimize dest = (op0 < 0) ? -1 : cf. */
3852 : if (compare_code != UNKNOWN
3853 0 : && GET_MODE (op0) == GET_MODE (out)
3854 0 : && (cf == -1 || ct == -1))
3855 : {
3856 : /* If lea code below could be used, only optimize
3857 : if it results in a 2 insn sequence. */
3858 :
3859 0 : if (! (diff == 1 || diff == 2 || diff == 4 || diff == 8
3860 0 : || diff == 3 || diff == 5 || diff == 9)
3861 0 : || (compare_code == LT && ct == -1)
3862 0 : || (compare_code == GE && cf == -1))
3863 : {
3864 : /*
3865 : * notl op1 (if necessary)
3866 : * sarl $31, op1
3867 : * orl cf, op1
3868 : */
3869 0 : if (ct != -1)
3870 : {
3871 0 : cf = ct;
3872 0 : ct = -1;
3873 0 : code = reverse_condition (code);
3874 : }
3875 :
3876 0 : out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
3877 :
3878 0 : out = expand_simple_binop (mode, IOR,
3879 : out, GEN_INT (cf),
3880 : out, 1, OPTAB_DIRECT);
3881 0 : if (out != operands[0])
3882 0 : emit_move_insn (operands[0], out);
3883 :
3884 0 : return true;
3885 : }
3886 : }
3887 :
3888 :
3889 29746 : if ((diff == 1 || diff == 2 || diff == 4 || diff == 8
3890 11097 : || diff == 3 || diff == 5 || diff == 9)
3891 7895 : && ((mode != QImode && mode != HImode) || !TARGET_PARTIAL_REG_STALL)
3892 26544 : && (mode != DImode
3893 1922 : || x86_64_immediate_operand (GEN_INT (cf), VOIDmode)))
3894 : {
3895 : /*
3896 : * xorl dest,dest
3897 : * cmpl op1,op2
3898 : * setcc dest
3899 : * lea cf(dest*(ct-cf)),dest
3900 : *
3901 : * Size 14.
3902 : *
3903 : * This also catches the degenerate setcc-only case.
3904 : */
3905 :
3906 7895 : rtx tmp;
3907 7895 : int nops;
3908 :
3909 7895 : out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
3910 :
3911 7895 : nops = 0;
3912 : /* On x86_64 the lea instruction operates on Pmode, so we need
3913 : to get arithmetics done in proper mode to match. */
3914 7895 : if (diff == 1)
3915 6671 : tmp = copy_rtx (out);
3916 : else
3917 : {
3918 1224 : rtx out1;
3919 1224 : out1 = copy_rtx (out);
3920 1224 : tmp = gen_rtx_MULT (mode, out1, GEN_INT (diff & ~1));
3921 1224 : nops++;
3922 1224 : if (diff & 1)
3923 : {
3924 254 : tmp = gen_rtx_PLUS (mode, tmp, out1);
3925 254 : nops++;
3926 : }
3927 : }
3928 7895 : if (cf != 0)
3929 : {
3930 6925 : tmp = plus_constant (mode, tmp, cf);
3931 6925 : nops++;
3932 : }
3933 7895 : if (!rtx_equal_p (tmp, out))
3934 : {
3935 7165 : if (nops == 1)
3936 6039 : out = force_operand (tmp, copy_rtx (out));
3937 : else
3938 1126 : emit_insn (gen_rtx_SET (copy_rtx (out), copy_rtx (tmp)));
3939 : }
3940 7895 : if (!rtx_equal_p (out, operands[0]))
3941 894 : emit_move_insn (operands[0], copy_rtx (out));
3942 :
3943 7895 : return true;
3944 : }
3945 :
3946 : /*
3947 : * General case: Jumpful:
3948 : * xorl dest,dest cmpl op1, op2
3949 : * cmpl op1, op2 movl ct, dest
3950 : * setcc dest jcc 1f
3951 : * decl dest movl cf, dest
3952 : * andl (cf-ct),dest 1:
3953 : * addl ct,dest
3954 : *
3955 : * Size 20. Size 14.
3956 : *
3957 : * This is reasonably steep, but branch mispredict costs are
3958 : * high on modern cpus, so consider failing only if optimizing
3959 : * for space.
3960 : */
3961 :
3962 10754 : if ((!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
3963 10754 : && BRANCH_COST (optimize_insn_for_speed_p (),
3964 : false) >= 2)
3965 : {
3966 0 : if (cf == 0)
3967 : {
3968 0 : machine_mode cmp_mode = GET_MODE (op0);
3969 0 : enum rtx_code new_code;
3970 :
3971 0 : if (SCALAR_FLOAT_MODE_P (cmp_mode))
3972 : {
3973 0 : gcc_assert (!DECIMAL_FLOAT_MODE_P (cmp_mode));
3974 :
3975 : /* We may be reversing a non-trapping
3976 : comparison to a trapping comparison. */
3977 0 : if (HONOR_NANS (cmp_mode) && flag_trapping_math
3978 0 : && code != EQ && code != NE
3979 0 : && code != ORDERED && code != UNORDERED)
3980 : new_code = UNKNOWN;
3981 : else
3982 0 : new_code = reverse_condition_maybe_unordered (code);
3983 :
3984 : }
3985 : else
3986 : {
3987 0 : new_code = ix86_reverse_condition (code, cmp_mode);
3988 0 : if (compare_code != UNKNOWN && new_code != UNKNOWN)
3989 0 : compare_code = reverse_condition (compare_code);
3990 : }
3991 :
3992 0 : if (new_code != UNKNOWN)
3993 : {
3994 0 : cf = ct;
3995 0 : ct = 0;
3996 0 : code = new_code;
3997 : }
3998 : }
3999 :
4000 0 : if (compare_code != UNKNOWN)
4001 : {
4002 : /* notl op1 (if needed)
4003 : sarl $31, op1
4004 : andl (cf-ct), op1
4005 : addl ct, op1
4006 :
4007 : For x < 0 (resp. x <= -1) there will be no notl,
4008 : so if possible swap the constants to get rid of the
4009 : complement.
4010 : True/false will be -1/0 while code below (store flag
4011 : followed by decrement) is 0/-1, so the constants need
4012 : to be exchanged once more. */
4013 :
4014 0 : if (compare_code == GE || !cf)
4015 : {
4016 0 : code = reverse_condition (code);
4017 0 : compare_code = LT;
4018 : }
4019 : else
4020 : std::swap (ct, cf);
4021 :
4022 0 : out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, -1);
4023 : }
4024 : else
4025 : {
4026 0 : out = emit_store_flag (out, code, op0, op1, VOIDmode, 0, 1);
4027 :
4028 0 : out = expand_simple_binop (mode, PLUS, copy_rtx (out),
4029 : constm1_rtx,
4030 : copy_rtx (out), 1, OPTAB_DIRECT);
4031 : }
4032 :
4033 0 : HOST_WIDE_INT ival = (unsigned HOST_WIDE_INT) cf - ct;
4034 : /* Make sure we can represent the difference
4035 : between the two values. */
4036 0 : if ((ival > 0) != ((ct < 0) != (cf < 0) ? ct < 0 : ct < cf))
4037 : return false;
4038 :
4039 0 : out = expand_simple_binop (mode, AND, copy_rtx (out),
4040 0 : gen_int_mode (ival, mode),
4041 : copy_rtx (out), 1, OPTAB_DIRECT);
4042 0 : if (ct)
4043 0 : out = expand_simple_binop (mode, PLUS, copy_rtx (out), GEN_INT (ct),
4044 : copy_rtx (out), 1, OPTAB_DIRECT);
4045 0 : if (!rtx_equal_p (out, operands[0]))
4046 0 : emit_move_insn (operands[0], copy_rtx (out));
4047 :
4048 0 : return true;
4049 : }
4050 : }
4051 :
4052 402582 : if (!TARGET_CMOVE || (mode == QImode && TARGET_PARTIAL_REG_STALL))
4053 : {
4054 : /* Try a few things more with specific constants and a variable. */
4055 :
4056 0 : optab op;
4057 0 : rtx var, orig_out, out, tmp;
4058 :
4059 0 : if (BRANCH_COST (optimize_insn_for_speed_p (), false) <= 2)
4060 : return false;
4061 :
4062 0 : operands[2] = op2;
4063 0 : operands[3] = op3;
4064 :
4065 : /* If one of the two operands is an interesting constant, load a
4066 : constant with the above and mask it in with a logical operation. */
4067 :
4068 0 : if (CONST_INT_P (operands[2]))
4069 : {
4070 0 : var = operands[3];
4071 0 : if (INTVAL (operands[2]) == 0 && operands[3] != constm1_rtx)
4072 0 : operands[3] = constm1_rtx, op = and_optab;
4073 0 : else if (INTVAL (operands[2]) == -1 && operands[3] != const0_rtx)
4074 0 : operands[3] = const0_rtx, op = ior_optab;
4075 : else
4076 : return false;
4077 : }
4078 0 : else if (CONST_INT_P (operands[3]))
4079 : {
4080 0 : var = operands[2];
4081 0 : if (INTVAL (operands[3]) == 0 && operands[2] != constm1_rtx)
4082 : {
4083 : /* For smin (x, 0), expand as "x < 0 ? x : 0" instead of
4084 : "x <= 0 ? x : 0" to enable sign_bit_compare_p. */
4085 0 : if (code == LE && op1 == const0_rtx && rtx_equal_p (op0, var))
4086 0 : operands[1] = simplify_gen_relational (LT, VOIDmode,
4087 0 : GET_MODE (op0),
4088 : op0, const0_rtx);
4089 :
4090 0 : operands[2] = constm1_rtx;
4091 0 : op = and_optab;
4092 : }
4093 0 : else if (INTVAL (operands[3]) == -1 && operands[3] != const0_rtx)
4094 0 : operands[2] = const0_rtx, op = ior_optab;
4095 : else
4096 : return false;
4097 : }
4098 : else
4099 : return false;
4100 :
4101 0 : orig_out = operands[0];
4102 0 : tmp = gen_reg_rtx (mode);
4103 0 : operands[0] = tmp;
4104 :
4105 : /* Recurse to get the constant loaded. */
4106 0 : if (!ix86_expand_int_movcc (operands))
4107 : return false;
4108 :
4109 : /* Mask in the interesting variable. */
4110 0 : out = expand_binop (mode, op, var, tmp, orig_out, 0,
4111 : OPTAB_WIDEN);
4112 0 : if (!rtx_equal_p (out, orig_out))
4113 0 : emit_move_insn (copy_rtx (orig_out), copy_rtx (out));
4114 :
4115 0 : return true;
4116 : }
4117 :
4118 : /*
4119 : * For comparison with above,
4120 : *
4121 : * movl cf,dest
4122 : * movl ct,tmp
4123 : * cmpl op1,op2
4124 : * cmovcc tmp,dest
4125 : *
4126 : * Size 15.
4127 : */
4128 :
4129 402582 : if (! nonimmediate_operand (operands[2], mode))
4130 27677 : operands[2] = force_reg (mode, operands[2]);
4131 402582 : if (! nonimmediate_operand (operands[3], mode))
4132 178197 : operands[3] = force_reg (mode, operands[3]);
4133 :
4134 402582 : if (! register_operand (operands[2], VOIDmode)
4135 402582 : && (mode == QImode
4136 1093 : || ! register_operand (operands[3], VOIDmode)))
4137 1564 : operands[2] = force_reg (mode, operands[2]);
4138 :
4139 402582 : if (mode == QImode
4140 402582 : && ! register_operand (operands[3], VOIDmode))
4141 592 : operands[3] = force_reg (mode, operands[3]);
4142 :
4143 402582 : emit_insn (compare_seq);
4144 402582 : emit_insn (gen_rtx_SET (operands[0],
4145 : gen_rtx_IF_THEN_ELSE (mode,
4146 : compare_op, operands[2],
4147 : operands[3])));
4148 402582 : return true;
4149 : }
4150 :
4151 : /* Detect conditional moves that exactly match min/max operational
4152 : semantics. Note that this is IEEE safe, as long as we don't
4153 : interchange the operands.
4154 :
4155 : Returns FALSE if this conditional move doesn't match a MIN/MAX,
4156 : and TRUE if the operation is successful and instructions are emitted. */
4157 :
static bool
ix86_expand_sse_fp_minmax (rtx dest, enum rtx_code code, rtx cmp_op0,
			   rtx cmp_op1, rtx if_true, rtx if_false)
{
  machine_mode mode = GET_MODE (dest);
  bool is_min;
  rtx tmp;

  /* Canonicalize towards LT: only LT (directly), LE (when NaNs cannot
     occur) and UNGE are recognized; anything else is not a min/max
     pattern we can emit.  */
  if (code == LT)
    ;
  else if (code == LE && !HONOR_NANS (mode))
    {
      /* We can swap LE to GE and then invert to LT. */
      std::swap (cmp_op0, cmp_op1);
      std::swap (if_true, if_false);
    }
  else if (code == UNGE)
    /* x UNGE y ? a : b selects the same value as x LT y ? b : a.  */
    std::swap (if_true, if_false);
  else
    return false;

  /* The operation is a min exactly when the comparison operands line up
     with the selected arms (a < b ? a : b); the mirror image is a max.  */
  if (rtx_equal_p (cmp_op0, if_true) && rtx_equal_p (cmp_op1, if_false))
    is_min = true;
  else if (rtx_equal_p (cmp_op1, if_true) && rtx_equal_p (cmp_op0, if_false))
    is_min = false;
  else
    return false;

  if (immediate_operand (if_false, mode))
    if_false = force_reg (mode, if_false);
  if (immediate_operand (if_true, mode))
    if_true = force_reg (mode, if_true);

  /* We want to check HONOR_NANS and HONOR_SIGNED_ZEROS here,
     but MODE may be a vector mode and thus not appropriate. */
  if (!flag_finite_math_only || flag_signed_zeros)
    {
      /* IEEE-safe form: wrap the selection in an UNSPEC (opaque to the
	 optimizers) so the operand order cannot be commuted.  */
      int u = is_min ? UNSPEC_IEEE_MIN : UNSPEC_IEEE_MAX;
      rtvec v;

      if_true = force_reg (mode, if_true);
      v = gen_rtvec (2, if_true, if_false);
      tmp = gen_rtx_UNSPEC (mode, v, u);
    }
  else
    {
      /* Relaxed FP semantics: a plain SMIN/SMAX is acceptable.  */
      code = is_min ? SMIN : SMAX;
      if (MEM_P (if_true) && MEM_P (if_false))
	if_true = force_reg (mode, if_true);
      tmp = gen_rtx_fmt_ee (code, mode, if_true, if_false);
    }

  emit_insn (gen_rtx_SET (dest, tmp));
  return true;
}
4213 :
/* Return true if MODE is valid for vector compare to mask register;
   same result for conditional vector move with mask register.  */
4216 : static bool
4217 14242 : ix86_valid_mask_cmp_mode (machine_mode mode)
4218 : {
4219 : /* XOP has its own vector conditional movement. */
4220 14242 : if (TARGET_XOP && !TARGET_AVX512F)
4221 : return false;
4222 :
4223 : /* HFmode only supports vcmpsh whose dest is mask register. */
4224 14236 : if (TARGET_AVX512FP16 && mode == HFmode)
4225 : return true;
4226 :
4227 : /* AVX512F is needed for mask operation. */
4228 14144 : if (!(TARGET_AVX512F && VECTOR_MODE_P (mode)))
4229 : return false;
4230 :
4231 : /* AVX512BW is needed for vector QI/HImode,
4232 : AVX512VL is needed for 128/256-bit vector. */
4233 182 : machine_mode inner_mode = GET_MODE_INNER (mode);
4234 182 : int vector_size = GET_MODE_SIZE (mode);
4235 182 : if ((inner_mode == QImode || inner_mode == HImode) && !TARGET_AVX512BW)
4236 : return false;
4237 :
4238 162 : return vector_size == 64 || TARGET_AVX512VL;
4239 : }
4240 :
4241 : /* Return true if integer mask comparison should be used. */
4242 : static bool
4243 50571 : ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
4244 : rtx op_true, rtx op_false)
4245 : {
4246 50571 : int vector_size = GET_MODE_SIZE (mode);
4247 :
4248 50571 : if (cmp_mode == HFmode)
4249 : return true;
4250 50479 : else if (vector_size < 16)
4251 : return false;
4252 44083 : else if (vector_size == 64)
4253 : return true;
4254 88050 : else if (GET_MODE_INNER (cmp_mode) == HFmode)
4255 : return true;
4256 88050 : else if (GET_MODE_INNER (cmp_mode) == BFmode)
4257 : return true;
4258 :
4259 : /* When op_true is NULL, op_false must be NULL, or vice versa. */
4260 44025 : gcc_assert (!op_true == !op_false);
4261 :
4262 : /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
4263 : vector dest is required. */
4264 44025 : if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
4265 : return false;
4266 :
4267 : /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
4268 48 : if (op_false == CONST0_RTX (mode)
4269 48 : || op_true == CONST0_RTX (mode)
4270 48 : || (INTEGRAL_MODE_P (mode)
4271 40 : && (op_true == CONSTM1_RTX (mode)
4272 40 : || op_false == CONSTM1_RTX (mode))))
4273 0 : return false;
4274 :
4275 : return true;
4276 : }
4277 :
4278 : /* Expand an SSE comparison. Return the register with the result. */
4279 :
static rtx
ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
		     rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmp_ops_mode = GET_MODE (cmp_op0);

  /* In general case result of comparison can differ from operands' type. */
  machine_mode cmp_mode;

  /* In AVX512F the result of comparison is an integer mask. */
  bool maskcmp = false;
  rtx x;

  if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
    {
      /* Mask result: pick an integer mode with one bit per element
	 (QImode minimum).  */
      unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
      maskcmp = true;
      cmp_mode = nbits > 8 ? int_mode_for_size (nbits, 0).require () : E_QImode;
    }
  else
    cmp_mode = cmp_ops_mode;

  cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);

  /* The second operand may stay in memory if the mode's predicate
     accepts it; otherwise load it into a register too.  */
  bool (*op1_predicate)(rtx, machine_mode)
    = VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;

  if (!op1_predicate (cmp_op1, cmp_ops_mode))
    cmp_op1 = force_reg (cmp_ops_mode, cmp_op1);

  /* Use a fresh pseudo when DEST cannot safely hold the comparison
     result: when optimizing (keep DEST unclobbered), when a mask result
     has a different mode, or when DEST overlaps one of the arms.  */
  if (optimize
      || (maskcmp && cmp_mode != mode)
      || (op_true && reg_overlap_mentioned_p (dest, op_true))
      || (op_false && reg_overlap_mentioned_p (dest, op_false)))
    dest = gen_reg_rtx (maskcmp ? cmp_mode : mode);

  if (maskcmp)
    {
      bool ok = ix86_expand_mask_vec_cmp (dest, code, cmp_op0, cmp_op1);
      gcc_assert (ok);
      return dest;
    }

  x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);

  if (cmp_mode != mode)
    {
      /* Result mode differs from DEST's mode: compute into a register
	 of the comparison mode and convert.  */
      x = force_reg (cmp_ops_mode, x);
      convert_move (dest, x, false);
    }
  else
    emit_insn (gen_rtx_SET (dest, x));

  return dest;
}
4336 :
4337 : /* Emit x86 binary operand CODE in mode MODE for SSE vector
4338 : instructions that can be performed using GP registers. */
4339 :
4340 : static void
4341 7055 : ix86_emit_vec_binop (enum rtx_code code, machine_mode mode,
4342 : rtx dst, rtx src1, rtx src2)
4343 : {
4344 7055 : rtx tmp;
4345 :
4346 7055 : tmp = gen_rtx_SET (dst, gen_rtx_fmt_ee (code, mode, src1, src2));
4347 :
4348 7055 : if (GET_MODE_SIZE (mode) <= GET_MODE_SIZE (SImode)
4349 7055 : && GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
4350 : {
4351 94 : rtx clob = gen_rtx_CLOBBER (VOIDmode, gen_rtx_REG (CCmode, FLAGS_REG));
4352 94 : tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, tmp, clob));
4353 : }
4354 :
4355 7055 : emit_insn (tmp);
4356 7055 : }
4357 :
4358 : /* Expand DEST = CMP ? OP_TRUE : OP_FALSE into a sequence of logical
4359 : operations. This is used for both scalar and vector conditional moves. */
4360 :
void
ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
{
  machine_mode mode = GET_MODE (dest);
  machine_mode cmpmode = GET_MODE (cmp);
  rtx x;

  /* Simplify trivial VEC_COND_EXPR to avoid ICE in pr97506. */
  if (rtx_equal_p (op_true, op_false))
    {
      emit_move_insn (dest, op_true);
      return;
    }

  /* If we have an integer mask and FP value then we need
     to cast mask to FP mode. */
  if (mode != cmpmode && VECTOR_MODE_P (cmpmode))
    {
      cmp = force_reg (cmpmode, cmp);
      cmp = gen_rtx_SUBREG (mode, cmp, 0);
    }

  /* In AVX512F the result of comparison is an integer mask. */
  if (mode != cmpmode
      && GET_MODE_CLASS (cmpmode) == MODE_INT)
    {
      gcc_assert (ix86_valid_mask_cmp_mode (mode));
      /* Using scalar/vector move with mask register. */
      cmp = force_reg (cmpmode, cmp);
      /* Optimize for mask zero. */
      op_true = (op_true != CONST0_RTX (mode)
		 ? force_reg (mode, op_true) : op_true);
      op_false = (op_false != CONST0_RTX (mode)
		  ? force_reg (mode, op_false) : op_false);
      if (op_true == CONST0_RTX (mode))
	{
	  /* Invert the mask so the zero arm becomes the false arm.  */
	  if (cmpmode == E_DImode && !TARGET_64BIT)
	    {
	      /* No 64-bit GP not on 32-bit targets; use kNOT directly.  */
	      x = gen_reg_rtx (cmpmode);
	      emit_insn (gen_knotdi (x, cmp));
	    }
	  else
	    x = expand_simple_unop (cmpmode, NOT, cmp, NULL, 1);
	  cmp = x;
	  /* Reverse op_true op_false. */
	  std::swap (op_true, op_false);
	}

      if (mode == HFmode)
	emit_insn (gen_movhf_mask (dest, op_true, op_false, cmp));
      else
	emit_insn (gen_rtx_SET (dest,
				gen_rtx_VEC_MERGE (mode,
						   op_true, op_false, cmp)));
      return;
    }

  /* From here on CMP is a vector of -1/0 elements; special-case the
     constant arms that reduce to pure logic ops.  */
  if (vector_all_ones_operand (op_true, mode)
      && op_false == CONST0_RTX (mode))
    {
      /* cmp ? -1 : 0 is the mask itself.  */
      emit_move_insn (dest, cmp);
      return;
    }
  else if (op_false == CONST0_RTX (mode))
    {
      /* cmp ? op_true : 0 --> cmp & op_true.  */
      x = expand_simple_binop (mode, AND, cmp, op_true,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }
  else if (op_true == CONST0_RTX (mode))
    {
      /* cmp ? 0 : op_false --> ~cmp & op_false.  */
      op_false = force_reg (mode, op_false);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, dest, x, op_false);
      return;
    }
  else if (vector_all_ones_operand (op_true, mode))
    {
      /* cmp ? -1 : op_false --> cmp | op_false.  */
      x = expand_simple_binop (mode, IOR, cmp, op_false,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
      return;
    }

  if (TARGET_XOP)
    {
      /* XOP's vpcmov performs the select in one instruction.  */
      op_true = force_reg (mode, op_true);

      if (GET_MODE_SIZE (mode) < 16
	  || !nonimmediate_operand (op_false, mode))
	op_false = force_reg (mode, op_false);

      emit_insn (gen_rtx_SET (dest,
			      gen_rtx_IF_THEN_ELSE (mode, cmp,
						    op_true, op_false)));
      return;
    }

  /* Otherwise select a blend insn for MODE, if any.  BLEND_MODE is the
     mode that insn operates in (a byte blend for the integer modes).  */
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;
  machine_mode blend_mode = mode;

  if (GET_MODE_SIZE (mode) < 16
      || !vector_operand (op_true, mode))
    op_true = force_reg (mode, op_true);

  op_false = force_reg (mode, op_false);

  switch (mode)
    {
    case E_V2SFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_mmx_blendvps;
      break;
    case E_V4SFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_sse4_1_blendvps;
      break;
    case E_V2DFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_sse4_1_blendvpd;
      break;
    case E_SFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_sse4_1_blendvss;
      break;
    case E_DFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_sse4_1_blendvsd;
      break;
    case E_V8QImode:
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
    case E_V2SImode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v8qi;
	  blend_mode = V8QImode;
	}
      break;
    case E_V4QImode:
    case E_V2HImode:
    case E_V2HFmode:
    case E_V2BFmode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	{
	  gen = gen_mmx_pblendvb_v4qi;
	  blend_mode = V4QImode;
	}
      break;
    case E_V2QImode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	gen = gen_mmx_pblendvb_v2qi;
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      if (TARGET_SSE_MOVCC_USE_BLENDV && TARGET_SSE4_1)
	{
	  gen = gen_sse4_1_pblendvb;
	  blend_mode = V16QImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
	gen = gen_avx_blendvps256;
      break;
    case E_V4DFmode:
      if (TARGET_AVX && TARGET_SSE_MOVCC_USE_BLENDV)
	gen = gen_avx_blendvpd256;
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V16BFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (TARGET_AVX2 && TARGET_SSE_MOVCC_USE_BLENDV)
	{
	  gen = gen_avx2_pblendvb;
	  blend_mode = V32QImode;
	}
      break;

    case E_V64QImode:
      gen = gen_avx512bw_blendmv64qi;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_blendmv32hi;
      break;
    case E_V32HFmode:
      gen = gen_avx512bw_blendmv32hf;
      break;
    case E_V32BFmode:
      gen = gen_avx512bw_blendmv32bf;
      break;
    case E_V16SImode:
      gen = gen_avx512f_blendmv16si;
      break;
    case E_V8DImode:
      gen = gen_avx512f_blendmv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_blendmv8df;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_blendmv16sf;
      break;

    default:
      break;
    }

  if (gen != NULL)
    {
      /* Blend in BLEND_MODE, lowpart-punning the operands when the
	 chosen pattern's mode differs from MODE.  */
      if (blend_mode == mode)
	x = dest;
      else
	{
	  x = gen_reg_rtx (blend_mode);
	  op_false = gen_lowpart (blend_mode, op_false);
	  op_true = gen_lowpart (blend_mode, op_true);
	  cmp = gen_lowpart (blend_mode, cmp);
	}

      emit_insn (gen (x, op_false, op_true, cmp));

      if (x != dest)
	emit_move_insn (dest, gen_lowpart (mode, x));
    }
  else
    {
      /* No blend insn: dest = (op_true & cmp) | (op_false & ~cmp).  */
      rtx t2, t3;

      t2 = expand_simple_binop (mode, AND, op_true, cmp,
				NULL, 1, OPTAB_DIRECT);

      t3 = gen_reg_rtx (mode);
      x = gen_rtx_NOT (mode, cmp);
      ix86_emit_vec_binop (AND, mode, t3, x, op_false);

      x = expand_simple_binop (mode, IOR, t3, t2,
			       dest, 1, OPTAB_DIRECT);
      if (x != dest)
	emit_move_insn (dest, x);
    }
}
4615 :
4616 : /* Swap, force into registers, or otherwise massage the two operands
4617 : to an sse comparison with a mask result. Thus we differ a bit from
4618 : ix86_prepare_fp_compare_args which expects to produce a flags result.
4619 :
4620 : The DEST operand exists to help determine whether to commute commutative
4621 : operators. The POP0/POP1 operands are updated in place. The new
4622 : comparison code is returned, or UNKNOWN if not implementable. */
4623 :
static enum rtx_code
ix86_prepare_sse_fp_compare_args (rtx dest, enum rtx_code code,
				  rtx *pop0, rtx *pop1)
{
  switch (code)
    {
    case LTGT:
    case UNEQ:
      /* AVX supports all the needed comparisons. */
      if (TARGET_AVX)
	break;
      /* We have no LTGT as an operator. We could implement it with
	 NE & ORDERED, but this requires an extra temporary. It's
	 not clear that it's worth it. */
      return UNKNOWN;

    case LT:
    case LE:
    case UNGT:
    case UNGE:
      /* These are supported directly. */
      break;

    case EQ:
    case NE:
    case UNORDERED:
    case ORDERED:
      /* AVX has 3 operand comparisons, no need to swap anything. */
      if (TARGET_AVX)
	break;
      /* For commutative operators, try to canonicalize the destination
	 operand to be first in the comparison - this helps reload to
	 avoid extra moves. */
      if (!dest || !rtx_equal_p (dest, *pop1))
	break;
      /* FALLTHRU */

    case GE:
    case GT:
    case UNLE:
    case UNLT:
      /* These are not supported directly before AVX, and furthermore
	 ix86_expand_sse_fp_minmax only optimizes LT/UNGE. Swap the
	 comparison operands to transform into something that is
	 supported. */
      /* For the symmetric codes that fall through from above,
	 swap_condition leaves CODE unchanged; only the operand swap
	 (putting DEST first) matters.  */
      std::swap (*pop0, *pop1);
      code = swap_condition (code);
      break;

    default:
      gcc_unreachable ();
    }

  return code;
}
4679 :
4680 : /* Expand a floating-point conditional move. Return true if successful. */
4681 :
bool
ix86_expand_fp_movcc (rtx operands[])
{
  machine_mode mode = GET_MODE (operands[0]);
  enum rtx_code code = GET_CODE (operands[1]);
  rtx tmp, compare_op;
  rtx op0 = XEXP (operands[1], 0);
  rtx op1 = XEXP (operands[1], 1);

  /* BFmode compares are only handled for the comparison codes that
     ix86_fp_comparison_operator accepts.  */
  if (GET_MODE (op0) == BFmode
      && !ix86_fp_comparison_operator (operands[1], VOIDmode))
    return false;

  if (SSE_FLOAT_MODE_SSEMATH_OR_HFBF_P (mode))
    {
      machine_mode cmode;

      /* Since we've no cmove for sse registers, don't force bad register
	 allocation just to gain access to it. Deny movcc when the
	 comparison mode doesn't match the move mode. */
      cmode = GET_MODE (op0);
      if (cmode == VOIDmode)
	cmode = GET_MODE (op1);
      if (cmode != mode)
	return false;

      code = ix86_prepare_sse_fp_compare_args (operands[0], code, &op0, &op1);
      if (code == UNKNOWN)
	return false;

      /* First try the min/max idiom; otherwise fall back to a full
	 compare followed by a logic/blend select.  */
      if (ix86_expand_sse_fp_minmax (operands[0], code, op0, op1,
				     operands[2], operands[3]))
	return true;

      tmp = ix86_expand_sse_cmp (operands[0], code, op0, op1,
				 operands[2], operands[3]);
      ix86_expand_sse_movcc (operands[0], tmp, operands[2], operands[3]);
      return true;
    }

  /* No fcmov path for TImode comparisons, nor for DImode ones on
     32-bit targets.  */
  if (GET_MODE (op0) == TImode
      || (GET_MODE (op0) == DImode
	  && !TARGET_64BIT))
    return false;

  /* The floating point conditional move instructions don't directly
     support conditions resulting from a signed integer comparison. */

  compare_op = ix86_expand_compare (code, op0, op1);
  if (!fcmov_comparison_operator (compare_op, VOIDmode))
    {
      /* Materialize the condition into a byte register and test that
	 against zero instead.  */
      tmp = gen_reg_rtx (QImode);
      ix86_expand_setcc (tmp, code, op0, op1);

      compare_op = ix86_expand_compare (NE, tmp, const0_rtx);
    }

  operands[2] = force_reg (mode, operands[2]);
  operands[3] = force_reg (mode, operands[3]);
  emit_insn (gen_rtx_SET (operands[0],
			  gen_rtx_IF_THEN_ELSE (mode, compare_op,
						operands[2], operands[3])));

  return true;
}
4747 :
4748 : /* Helper for ix86_cmp_code_to_pcmp_immediate for int modes. */
4749 :
4750 : static int
4751 4854 : ix86_int_cmp_code_to_pcmp_immediate (enum rtx_code code)
4752 : {
4753 4854 : switch (code)
4754 : {
4755 : case EQ:
4756 : return 0;
4757 377 : case LT:
4758 377 : case LTU:
4759 377 : return 1;
4760 212 : case LE:
4761 212 : case LEU:
4762 212 : return 2;
4763 3051 : case NE:
4764 3051 : return 4;
4765 307 : case GE:
4766 307 : case GEU:
4767 307 : return 5;
4768 498 : case GT:
4769 498 : case GTU:
4770 498 : return 6;
4771 0 : default:
4772 0 : gcc_unreachable ();
4773 : }
4774 : }
4775 :
4776 : /* Helper for ix86_cmp_code_to_pcmp_immediate for fp modes. */
4777 :
4778 : static int
4779 1781 : ix86_fp_cmp_code_to_pcmp_immediate (enum rtx_code code)
4780 : {
4781 1781 : switch (code)
4782 : {
4783 : case EQ:
4784 : return 0x00;
4785 354 : case NE:
4786 354 : return 0x04;
4787 514 : case GT:
4788 514 : return 0x0e;
4789 88 : case LE:
4790 88 : return 0x02;
4791 53 : case GE:
4792 53 : return 0x0d;
4793 620 : case LT:
4794 620 : return 0x01;
4795 2 : case UNLE:
4796 2 : return 0x0a;
4797 2 : case UNLT:
4798 2 : return 0x09;
4799 11 : case UNGE:
4800 11 : return 0x05;
4801 44 : case UNGT:
4802 44 : return 0x06;
4803 2 : case UNEQ:
4804 2 : return 0x18;
4805 0 : case LTGT:
4806 0 : return 0x0c;
4807 2 : case ORDERED:
4808 2 : return 0x07;
4809 2 : case UNORDERED:
4810 2 : return 0x03;
4811 0 : default:
4812 0 : gcc_unreachable ();
4813 : }
4814 : }
4815 :
4816 : /* Return immediate value to be used in UNSPEC_PCMP
4817 : for comparison CODE in MODE. */
4818 :
4819 : static int
4820 6635 : ix86_cmp_code_to_pcmp_immediate (enum rtx_code code, machine_mode mode)
4821 : {
4822 6635 : if (FLOAT_MODE_P (mode))
4823 1781 : return ix86_fp_cmp_code_to_pcmp_immediate (code);
4824 4854 : return ix86_int_cmp_code_to_pcmp_immediate (code);
4825 : }
4826 :
4827 : /* Expand AVX-512 vector comparison. */
4828 :
4829 : bool
4830 6635 : ix86_expand_mask_vec_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1)
4831 : {
4832 6635 : machine_mode mask_mode = GET_MODE (dest);
4833 6635 : machine_mode cmp_mode = GET_MODE (cmp_op0);
4834 6635 : rtx imm = GEN_INT (ix86_cmp_code_to_pcmp_immediate (code, cmp_mode));
4835 6635 : int unspec_code;
4836 6635 : rtx unspec;
4837 :
4838 6635 : switch (code)
4839 : {
4840 : case LEU:
4841 : case GTU:
4842 : case GEU:
4843 : case LTU:
4844 : unspec_code = UNSPEC_UNSIGNED_PCMP;
4845 : break;
4846 :
4847 6221 : default:
4848 6221 : unspec_code = UNSPEC_PCMP;
4849 : }
4850 :
4851 6635 : unspec = gen_rtx_UNSPEC (mask_mode, gen_rtvec (3, cmp_op0, cmp_op1, imm),
4852 : unspec_code);
4853 6635 : emit_insn (gen_rtx_SET (dest, unspec));
4854 :
4855 6635 : return true;
4856 : }
4857 :
4858 : /* Expand fp vector comparison. */
4859 :
4860 : bool
4861 6681 : ix86_expand_fp_vec_cmp (rtx operands[])
4862 : {
4863 6681 : enum rtx_code code = GET_CODE (operands[1]);
4864 6681 : rtx cmp;
4865 :
4866 6681 : code = ix86_prepare_sse_fp_compare_args (operands[0], code,
4867 : &operands[2], &operands[3]);
4868 6681 : if (code == UNKNOWN)
4869 : {
4870 20 : rtx temp;
4871 20 : switch (GET_CODE (operands[1]))
4872 : {
4873 2 : case LTGT:
4874 2 : temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[2],
4875 : operands[3], NULL, NULL);
4876 2 : cmp = ix86_expand_sse_cmp (operands[0], NE, operands[2],
4877 : operands[3], NULL, NULL);
4878 2 : code = AND;
4879 2 : break;
4880 18 : case UNEQ:
4881 18 : temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[2],
4882 : operands[3], NULL, NULL);
4883 18 : cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[2],
4884 : operands[3], NULL, NULL);
4885 18 : code = IOR;
4886 18 : break;
4887 0 : default:
4888 0 : gcc_unreachable ();
4889 : }
4890 20 : cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
4891 : OPTAB_DIRECT);
4892 : }
4893 : else
4894 6661 : cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
4895 : NULL, NULL);
4896 :
4897 6681 : if (operands[0] != cmp)
4898 6598 : emit_move_insn (operands[0], cmp);
4899 :
4900 6681 : return true;
4901 : }
4902 :
/* Expand an integer vector comparison COP0 CODE COP1 into DEST, optionally
   as the condition of a vcond with arms OP_TRUE/OP_FALSE (both NULL for a
   plain compare).  On return *NEGATE tells the caller whether the emitted
   comparison computes the INVERSE of the requested condition and must be
   negated (or its vcond arms swapped).  Returns the result rtx, or NULL
   if the comparison cannot be done (V2DImode without SSE4.1/4.2).  */

static rtx
ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
			 rtx op_true, rtx op_false, bool *negate)
{
  machine_mode data_mode = GET_MODE (dest);
  machine_mode mode = GET_MODE (cop0);
  rtx x;

  *negate = false;

  /* XOP supports all of the comparisons on all 128-bit vector int types.  */
  if (TARGET_XOP
      && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
      && GET_MODE_SIZE (mode) <= 16)
    ;
  /* AVX512F supports all of the comparisons
     on all 128/256/512-bit vector int types.  */
  else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
    ;
  else
    {
      /* Canonicalize the comparison to EQ, GT, GTU.  */
      switch (code)
	{
	case EQ:
	case GT:
	case GTU:
	  break;

	case LE:
	case LEU:
	  /* x <= cst can be handled as x < cst + 1 unless there is
	     wrap around in cst + 1.  */
	  if (CONST_VECTOR_P (cop1)
	      && GET_MODE_INNER (mode) != TImode)
	    {
	      unsigned int n_elts = GET_MODE_NUNITS (mode), i;
	      machine_mode eltmode = GET_MODE_INNER (mode);
	      for (i = 0; i < n_elts; ++i)
		{
		  rtx elt = CONST_VECTOR_ELT (cop1, i);
		  if (!CONST_INT_P (elt))
		    break;
		  if (code == LE)
		    {
		      /* For LE punt if some element is signed maximum.  */
		      if ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
			  == (GET_MODE_MASK (eltmode) >> 1))
			break;
		    }
		  /* For LEU punt if some element is unsigned maximum.  */
		  else if (elt == constm1_rtx)
		    break;
		}
	      if (i == n_elts)
		{
		  /* All elements are safe to increment: rewrite
		     x <= cst as cst + 1 > x (GT/GTU with swapped
		     operands).  */
		  rtvec v = rtvec_alloc (n_elts);
		  for (i = 0; i < n_elts; ++i)
		    RTVEC_ELT (v, i)
		      = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) + 1,
				      eltmode);
		  cop1 = gen_rtx_CONST_VECTOR (mode, v);
		  std::swap (cop0, cop1);
		  code = code == LE ? GT : GTU;
		  break;
		}
	    }
	  /* FALLTHRU */
	case NE:
	  /* Otherwise implement LE/LEU/NE as the negation of
	     GT/GTU/EQ.  */
	  code = reverse_condition (code);
	  *negate = true;
	  break;

	case GE:
	case GEU:
	  /* x >= cst can be handled as x > cst - 1 unless there is
	     wrap around in cst - 1.  */
	  if (CONST_VECTOR_P (cop1)
	      && GET_MODE_INNER (mode) != TImode)
	    {
	      unsigned int n_elts = GET_MODE_NUNITS (mode), i;
	      machine_mode eltmode = GET_MODE_INNER (mode);
	      for (i = 0; i < n_elts; ++i)
		{
		  rtx elt = CONST_VECTOR_ELT (cop1, i);
		  if (!CONST_INT_P (elt))
		    break;
		  if (code == GE)
		    {
		      /* For GE punt if some element is signed minimum.  */
		      if (INTVAL (elt) < 0
			  && ((INTVAL (elt) & (GET_MODE_MASK (eltmode) >> 1))
			      == 0))
			break;
		    }
		  /* For GEU punt if some element is zero.  */
		  else if (elt == const0_rtx)
		    break;
		}
	      if (i == n_elts)
		{
		  rtvec v = rtvec_alloc (n_elts);
		  for (i = 0; i < n_elts; ++i)
		    RTVEC_ELT (v, i)
		      = gen_int_mode (INTVAL (CONST_VECTOR_ELT (cop1, i)) - 1,
				      eltmode);
		  cop1 = gen_rtx_CONST_VECTOR (mode, v);
		  code = code == GE ? GT : GTU;
		  break;
		}
	    }
	  /* Otherwise GE/GEU becomes negated LT/LTU, which then falls
	     through to be swapped into GT/GTU below.  */
	  code = reverse_condition (code);
	  *negate = true;
	  /* FALLTHRU */

	case LT:
	case LTU:
	  /* x < y is y > x.  */
	  std::swap (cop0, cop1);
	  code = swap_condition (code);
	  break;

	default:
	  gcc_unreachable ();
	}

      /* Only SSE4.1/SSE4.2 supports V2DImode.  */
      if (mode == V2DImode)
	{
	  switch (code)
	    {
	    case EQ:
	      /* SSE4.1 supports EQ.  */
	      if (!TARGET_SSE4_1)
		return NULL;
	      break;

	    case GT:
	    case GTU:
	      /* SSE4.2 supports GT/GTU.  */
	      if (!TARGET_SSE4_2)
		return NULL;
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}

      if (CONST_VECTOR_P (cop0))
	cop0 = force_reg (mode, cop0);
      else if (CONST_VECTOR_P (cop1))
	cop1 = force_reg (mode, cop1);

      rtx optrue = op_true ? op_true : CONSTM1_RTX (data_mode);
      rtx opfalse = op_false ? op_false : CONST0_RTX (data_mode);
      if (*negate)
	std::swap (optrue, opfalse);

      /* Transform x > y ? 0 : -1 (i.e. x <= y ? -1 : 0 or x <= y) when
	 not using integer masks into min (x, y) == x ? -1 : 0 (i.e.
	 min (x, y) == x).  While we add one instruction (the minimum),
	 we remove the need for two instructions in the negation, as the
	 result is done this way.
	 When using masks, do it for SI/DImode element types, as it is shorter
	 than the two subtractions.  */
      if ((code != EQ
	   && GET_MODE_SIZE (mode) != 64
	   && vector_all_ones_operand (opfalse, data_mode)
	   && optrue == CONST0_RTX (data_mode))
	  || (code == GTU
	      && GET_MODE_SIZE (GET_MODE_INNER (mode)) >= 4
	      /* Don't do it if not using integer masks and we'd end up with
		 the right values in the registers though.  */
	      && (GET_MODE_SIZE (mode) == 64
		  || !vector_all_ones_operand (optrue, data_mode)
		  || opfalse != CONST0_RTX (data_mode))))
	{
	  /* Pick the umin/smin expander for MODE, when the target has
	     one for this mode and signedness; NULL means no rewrite.  */
	  rtx (*gen) (rtx, rtx, rtx) = NULL;

	  switch (mode)
	    {
	    case E_V16SImode:
	      gen = (code == GTU) ? gen_uminv16si3 : gen_sminv16si3;
	      break;
	    case E_V8DImode:
	      gen = (code == GTU) ? gen_uminv8di3 : gen_sminv8di3;
	      cop0 = force_reg (mode, cop0);
	      cop1 = force_reg (mode, cop1);
	      break;
	    case E_V32QImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv32qi3 : gen_sminv32qi3;
	      break;
	    case E_V16HImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv16hi3 : gen_sminv16hi3;
	      break;
	    case E_V8SImode:
	      if (TARGET_AVX2)
		gen = (code == GTU) ? gen_uminv8si3 : gen_sminv8si3;
	      break;
	    case E_V4DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv4di3 : gen_sminv4di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    case E_V16QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv16qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv16qi3;
	      break;
	    case E_V8QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv8qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv8qi3;
	      break;
	    case E_V4QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv4qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv4qi3;
	      break;
	    case E_V2QImode:
	      if (code == GTU && TARGET_SSE2)
		gen = gen_uminv2qi3;
	      else if (code == GT && TARGET_SSE4_1)
		gen = gen_sminv2qi3;
	      break;
	    case E_V8HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv8hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv8hi3;
	      break;
	    case E_V4HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv4hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv4hi3;
	      break;
	    case E_V2HImode:
	      if (code == GTU && TARGET_SSE4_1)
		gen = gen_uminv2hi3;
	      else if (code == GT && TARGET_SSE2)
		gen = gen_sminv2hi3;
	      break;
	    case E_V4SImode:
	      if (TARGET_SSE4_1)
		gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
	      break;
	    case E_V2SImode:
	      if (TARGET_SSE4_1)
		gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
	      break;
	    case E_V2DImode:
	      if (TARGET_AVX512VL)
		{
		  gen = (code == GTU) ? gen_uminv2di3 : gen_sminv2di3;
		  cop0 = force_reg (mode, cop0);
		  cop1 = force_reg (mode, cop1);
		}
	      break;
	    default:
	      break;
	    }

	  if (gen)
	    {
	      /* Emit min (cop0, cop1) and compare it for equality with
		 cop0, flipping *negate to account for the inversion.  */
	      rtx tem = gen_reg_rtx (mode);
	      if (!vector_operand (cop0, mode))
		cop0 = force_reg (mode, cop0);
	      if (!vector_operand (cop1, mode))
		cop1 = force_reg (mode, cop1);
	      *negate = !*negate;
	      emit_insn (gen (tem, cop0, cop1));
	      cop1 = tem;
	      code = EQ;
	    }
	}

      /* Unsigned parallel compare is not supported by the hardware.
	 Play some tricks to turn this into a signed comparison
	 against 0.  */
      if (code == GTU)
	{
	  cop0 = force_reg (mode, cop0);

	  switch (mode)
	    {
	    case E_V16SImode:
	    case E_V8DImode:
	    case E_V8SImode:
	    case E_V4DImode:
	    case E_V4SImode:
	    case E_V2SImode:
	    case E_V2DImode:
	      {
		rtx t1, t2, mask;

		/* Subtract (-(INT MAX) - 1) from both operands to make
		   them signed.  */
		mask = ix86_build_signbit_mask (mode, true, false);
		t1 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t1, cop0, mask));

		t2 = gen_reg_rtx (mode);
		emit_insn (gen_sub3_insn (t2, cop1, mask));

		cop0 = t1;
		cop1 = t2;
		code = GT;
	      }
	      break;

	    case E_V64QImode:
	    case E_V32HImode:
	    case E_V32QImode:
	    case E_V16HImode:
	    case E_V16QImode:
	    case E_V8QImode:
	    case E_V4QImode:
	    case E_V2QImode:
	    case E_V8HImode:
	    case E_V4HImode:
	    case E_V2HImode:
	      /* Perform a parallel unsigned saturating subtraction.
		 x GTU y iff x -us y is nonzero, i.e. NOT (x -us y == 0),
		 hence the *negate flip.  */
	      x = gen_reg_rtx (mode);
	      emit_insn (gen_rtx_SET
			 (x, gen_rtx_US_MINUS (mode, cop0, cop1)));
	      cop0 = x;
	      cop1 = CONST0_RTX (mode);
	      code = EQ;
	      *negate = !*negate;
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}
    }

  if (*negate)
    std::swap (op_true, op_false);

  if (CONST_VECTOR_P (cop1))
    cop1 = force_reg (mode, cop1);

  /* Allow the comparison to be done in one mode, but the movcc to
     happen in another mode.  */
  if (data_mode == mode)
    x = ix86_expand_sse_cmp (dest, code, cop0, cop1, op_true, op_false);
  else
    {
      gcc_assert (GET_MODE_SIZE (data_mode) == GET_MODE_SIZE (mode));
      x = ix86_expand_sse_cmp (gen_reg_rtx (mode), code, cop0, cop1,
			       op_true, op_false);
      if (GET_MODE (x) == mode)
	x = gen_lowpart (data_mode, x);
    }

  return x;
}
5270 :
5271 : /* Expand integer vector comparison. */
5272 :
5273 : bool
5274 9592 : ix86_expand_int_vec_cmp (rtx operands[])
5275 : {
5276 9592 : rtx_code code = GET_CODE (operands[1]);
5277 9592 : bool negate = false;
5278 9592 : rtx cmp = ix86_expand_int_sse_cmp (operands[0], code, operands[2],
5279 : operands[3], NULL, NULL, &negate);
5280 :
5281 9592 : if (!cmp)
5282 : return false;
5283 :
5284 9592 : if (negate)
5285 3716 : cmp = ix86_expand_int_sse_cmp (operands[0], EQ, cmp,
5286 3716 : CONST0_RTX (GET_MODE (cmp)),
5287 : NULL, NULL, &negate);
5288 :
5289 9592 : gcc_assert (!negate);
5290 :
5291 9592 : if (operands[0] != cmp)
5292 9298 : emit_move_insn (operands[0], cmp);
5293 :
5294 : return true;
5295 : }
5296 :
5297 : /* Expand a floating-point vector conditional move; a vcond operation
5298 : rather than a movcc operation. */
5299 :
5300 : bool
5301 0 : ix86_expand_fp_vcond (rtx operands[])
5302 : {
5303 0 : enum rtx_code code = GET_CODE (operands[3]);
5304 0 : rtx cmp;
5305 :
5306 0 : code = ix86_prepare_sse_fp_compare_args (operands[0], code,
5307 : &operands[4], &operands[5]);
5308 0 : if (code == UNKNOWN)
5309 : {
5310 0 : rtx temp;
5311 0 : switch (GET_CODE (operands[3]))
5312 : {
5313 0 : case LTGT:
5314 0 : temp = ix86_expand_sse_cmp (operands[0], ORDERED, operands[4],
5315 : operands[5], operands[0], operands[0]);
5316 0 : cmp = ix86_expand_sse_cmp (operands[0], NE, operands[4],
5317 : operands[5], operands[1], operands[2]);
5318 0 : code = AND;
5319 0 : break;
5320 0 : case UNEQ:
5321 0 : temp = ix86_expand_sse_cmp (operands[0], UNORDERED, operands[4],
5322 : operands[5], operands[0], operands[0]);
5323 0 : cmp = ix86_expand_sse_cmp (operands[0], EQ, operands[4],
5324 : operands[5], operands[1], operands[2]);
5325 0 : code = IOR;
5326 0 : break;
5327 0 : default:
5328 0 : gcc_unreachable ();
5329 : }
5330 0 : cmp = expand_simple_binop (GET_MODE (cmp), code, temp, cmp, cmp, 1,
5331 : OPTAB_DIRECT);
5332 0 : ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
5333 0 : return true;
5334 : }
5335 :
5336 0 : if (ix86_expand_sse_fp_minmax (operands[0], code, operands[4],
5337 : operands[5], operands[1], operands[2]))
5338 : return true;
5339 :
5340 0 : cmp = ix86_expand_sse_cmp (operands[0], code, operands[4], operands[5],
5341 : operands[1], operands[2]);
5342 0 : ix86_expand_sse_movcc (operands[0], cmp, operands[1], operands[2]);
5343 0 : return true;
5344 : }
5345 :
5346 : /* Expand a signed/unsigned integral vector conditional move. */
5347 :
/* Expand a signed/unsigned integral vector conditional move:
   operands[0] = operands[3](operands[4], operands[5]) ? operands[1]
						       : operands[2].
   Returns false if the underlying comparison cannot be expanded.  */

bool
ix86_expand_int_vcond (rtx operands[])
{
  machine_mode data_mode = GET_MODE (operands[0]);
  machine_mode mode = GET_MODE (operands[4]);
  enum rtx_code code = GET_CODE (operands[3]);
  bool negate = false;
  rtx x, cop0, cop1;

  cop0 = operands[4];
  cop1 = operands[5];

  /* Try to optimize x < 0 ? -1 : 0 into (signed) x >> 31
     and x < 0 ? 1 : 0 into (unsigned) x >> 31.
     The index trick: for LT the false arm (operands[2]) must be zero,
     for GE the true arm (operands[1]) must be zero, hence
     operands[1 + (code == LT)].  */
  if ((code == LT || code == GE)
      && data_mode == mode
      && cop1 == CONST0_RTX (mode)
      && operands[1 + (code == LT)] == CONST0_RTX (data_mode)
      && GET_MODE_UNIT_SIZE (data_mode) > 1
      && GET_MODE_UNIT_SIZE (data_mode) <= 8
      && (GET_MODE_SIZE (data_mode) == 16
	  || (TARGET_AVX2 && GET_MODE_SIZE (data_mode) == 32)))
    {
      /* NEGOP is the arm selected when the sign bit is set.  */
      rtx negop = operands[2 - (code == LT)];
      int shift = GET_MODE_UNIT_BITSIZE (data_mode) - 1;
      if (negop == CONST1_RTX (data_mode))
	{
	  /* x < 0 ? 1 : 0 --> logical shift of the sign bit down.  */
	  rtx res = expand_simple_binop (mode, LSHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 1, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
      else if (GET_MODE_INNER (data_mode) != DImode
	       && vector_all_ones_operand (negop, data_mode))
	{
	  /* x < 0 ? -1 : 0 --> arithmetic shift replicating the sign
	     bit across the element.  */
	  rtx res = expand_simple_binop (mode, ASHIFTRT, cop0, GEN_INT (shift),
					 operands[0], 0, OPTAB_DIRECT);
	  if (res != operands[0])
	    emit_move_insn (operands[0], res);
	  return true;
	}
    }

  /* Legitimize the operands before handing off to the comparison
     expander.  */
  if (!nonimmediate_operand (cop1, mode))
    cop1 = force_reg (mode, cop1);
  if (!general_operand (operands[1], data_mode))
    operands[1] = force_reg (data_mode, operands[1]);
  if (!general_operand (operands[2], data_mode))
    operands[2] = force_reg (data_mode, operands[2]);

  x = ix86_expand_int_sse_cmp (operands[0], code, cop0, cop1,
			       operands[1], operands[2], &negate);

  if (!x)
    return false;

  /* If the comparison was inverted, swap the vcond arms rather than
     negating the mask: operands[1+negate]/operands[2-negate].  */
  ix86_expand_sse_movcc (operands[0], x, operands[1+negate],
			 operands[2-negate]);
  return true;
}
5409 :
/* Try to expand a (possibly two-operand) variable permutation with the
   AVX-512 vpermt2 instructions.  Called either with a filled-in D (const
   expander; honors d->testing_p) or with TARGET/MASK/OP0/OP1 and D null
   (non-const expander).  Returns false when no vpermt2 pattern exists
   for the mode on this target.  */

static bool
ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
			      struct expand_vec_perm_d *d)
{
  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  machine_mode mode = GET_MODE (d ? d->op0 : op0);
  machine_mode maskmode = mode;
  rtx (*gen) (rtx, rtx, rtx, rtx) = NULL;

  /* Map the vector mode to the matching vpermt2var builder; for float
     modes the index vector uses the same-width integer mode.  */
  switch (mode)
    {
    case E_V16QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv16qi3;
      break;
    case E_V32QImode:
      if (TARGET_AVX512VL && TARGET_AVX512VBMI)
	gen = gen_avx512vl_vpermt2varv32qi3;
      break;
    case E_V64QImode:
      if (TARGET_AVX512VBMI)
	gen = gen_avx512bw_vpermt2varv64qi3;
      break;
    case E_V8HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv8hi3;
      break;
    case E_V16HImode:
      if (TARGET_AVX512VL && TARGET_AVX512BW)
	gen = gen_avx512vl_vpermt2varv16hi3;
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
	gen = gen_avx512bw_vpermt2varv32hi3;
      break;
    case E_V4SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4si3;
      break;
    case E_V8SImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv8si3;
      break;
    case E_V16SImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv16si3;
      break;
    case E_V4SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4sf3;
	  maskmode = V4SImode;
	}
      break;
    case E_V8SFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv8sf3;
	  maskmode = V8SImode;
	}
      break;
    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv16sf3;
	  maskmode = V16SImode;
	}
      break;
    case E_V2DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv2di3;
      break;
    case E_V4DImode:
      if (TARGET_AVX512VL)
	gen = gen_avx512vl_vpermt2varv4di3;
      break;
    case E_V8DImode:
      if (TARGET_AVX512F)
	gen = gen_avx512f_vpermt2varv8di3;
      break;
    case E_V2DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv2df3;
	  maskmode = V2DImode;
	}
      break;
    case E_V4DFmode:
      if (TARGET_AVX512VL)
	{
	  gen = gen_avx512vl_vpermt2varv4df3;
	  maskmode = V4DImode;
	}
      break;
    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  gen = gen_avx512f_vpermt2varv8df3;
	  maskmode = V8DImode;
	}
      break;
    default:
      break;
    }

  if (gen == NULL)
    return false;

  /* In the const expander's testing phase just report feasibility.  */
  if (d && d->testing_p)
    return true;

  /* ix86_expand_vec_perm_vpermt2 is called from both const and non-const
     expander, so args are either in d, or in op0, op1 etc.  */
  if (d)
    {
      /* Materialize the constant selector from d->perm.  */
      rtx vec[64];
      target = d->target;
      op0 = d->op0;
      op1 = d->op1;
      for (int i = 0; i < d->nelt; ++i)
	vec[i] = GEN_INT (d->perm[i]);
      mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
    }

  emit_insn (gen (target, force_reg (maskmode, mask), op0, op1));
  return true;
}
5538 :
5539 : /* Expand a variable vector permutation. */
5540 :
5541 : void
5542 10 : ix86_expand_vec_perm (rtx operands[])
5543 : {
5544 10 : rtx target = operands[0];
5545 10 : rtx op0 = operands[1];
5546 10 : rtx op1 = operands[2];
5547 10 : rtx mask = operands[3];
5548 10 : rtx t1, t2, t3, t4, t5, t6, t7, t8, vt, vt2, vec[32];
5549 10 : machine_mode mode = GET_MODE (op0);
5550 10 : machine_mode maskmode = GET_MODE (mask);
5551 10 : int w, e, i;
5552 10 : bool one_operand_shuffle = rtx_equal_p (op0, op1);
5553 :
5554 : /* Number of elements in the vector. */
5555 10 : w = GET_MODE_NUNITS (mode);
5556 10 : e = GET_MODE_UNIT_SIZE (mode);
5557 10 : gcc_assert (w <= 64);
5558 :
5559 : /* For HF mode vector, convert it to HI using subreg. */
5560 20 : if (GET_MODE_INNER (mode) == HFmode)
5561 : {
5562 6 : machine_mode orig_mode = mode;
5563 6 : mode = mode_for_vector (HImode, w).require ();
5564 6 : target = lowpart_subreg (mode, target, orig_mode);
5565 6 : op0 = lowpart_subreg (mode, op0, orig_mode);
5566 6 : op1 = lowpart_subreg (mode, op1, orig_mode);
5567 : }
5568 :
5569 10 : if (TARGET_AVX512F && one_operand_shuffle)
5570 : {
5571 5 : rtx (*gen) (rtx, rtx, rtx) = NULL;
5572 5 : switch (mode)
5573 : {
5574 : case E_V16SImode:
5575 : gen =gen_avx512f_permvarv16si;
5576 : break;
5577 0 : case E_V16SFmode:
5578 0 : gen = gen_avx512f_permvarv16sf;
5579 0 : break;
5580 0 : case E_V8DImode:
5581 0 : gen = gen_avx512f_permvarv8di;
5582 0 : break;
5583 0 : case E_V8DFmode:
5584 0 : gen = gen_avx512f_permvarv8df;
5585 0 : break;
5586 : default:
5587 : break;
5588 : }
5589 0 : if (gen != NULL)
5590 : {
5591 0 : emit_insn (gen (target, op0, mask));
5592 8 : return;
5593 : }
5594 : }
5595 :
5596 10 : if (ix86_expand_vec_perm_vpermt2 (target, mask, op0, op1, NULL))
5597 : return;
5598 :
5599 2 : if (TARGET_AVX2)
5600 : {
5601 1 : if (mode == V4DImode || mode == V4DFmode || mode == V16HImode)
5602 : {
5603 : /* Unfortunately, the VPERMQ and VPERMPD instructions only support
5604 : an constant shuffle operand. With a tiny bit of effort we can
5605 : use VPERMD instead. A re-interpretation stall for V4DFmode is
5606 : unfortunate but there's no avoiding it.
5607 : Similarly for V16HImode we don't have instructions for variable
5608 : shuffling, while for V32QImode we can use after preparing suitable
5609 : masks vpshufb; vpshufb; vpermq; vpor. */
5610 :
5611 : if (mode == V16HImode)
5612 : {
5613 : maskmode = mode = V32QImode;
5614 : w = 32;
5615 : e = 1;
5616 : }
5617 : else
5618 : {
5619 : maskmode = mode = V8SImode;
5620 : w = 8;
5621 : e = 4;
5622 : }
5623 0 : t1 = gen_reg_rtx (maskmode);
5624 :
5625 : /* Replicate the low bits of the V4DImode mask into V8SImode:
5626 : mask = { A B C D }
5627 : t1 = { A A B B C C D D }. */
5628 0 : for (i = 0; i < w / 2; ++i)
5629 0 : vec[i*2 + 1] = vec[i*2] = GEN_INT (i * 2);
5630 0 : vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5631 0 : vt = force_reg (maskmode, vt);
5632 0 : mask = gen_lowpart (maskmode, mask);
5633 0 : if (maskmode == V8SImode)
5634 0 : emit_insn (gen_avx2_permvarv8si (t1, mask, vt));
5635 : else
5636 0 : emit_insn (gen_avx2_pshufbv32qi3 (t1, mask, vt));
5637 :
5638 : /* Multiply the shuffle indicies by two. */
5639 0 : t1 = expand_simple_binop (maskmode, PLUS, t1, t1, t1, 1,
5640 : OPTAB_DIRECT);
5641 :
5642 : /* Add one to the odd shuffle indicies:
5643 : t1 = { A*2, A*2+1, B*2, B*2+1, ... }. */
5644 0 : for (i = 0; i < w / 2; ++i)
5645 : {
5646 0 : vec[i * 2] = const0_rtx;
5647 0 : vec[i * 2 + 1] = const1_rtx;
5648 : }
5649 0 : vt = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (w, vec));
5650 0 : vt = validize_mem (force_const_mem (maskmode, vt));
5651 0 : t1 = expand_simple_binop (maskmode, PLUS, t1, vt, t1, 1,
5652 : OPTAB_DIRECT);
5653 :
5654 : /* Continue as if V8SImode (resp. V32QImode) was used initially. */
5655 0 : operands[3] = mask = t1;
5656 0 : target = gen_reg_rtx (mode);
5657 0 : op0 = gen_lowpart (mode, op0);
5658 0 : op1 = gen_lowpart (mode, op1);
5659 : }
5660 :
5661 1 : switch (mode)
5662 : {
5663 1 : case E_V8SImode:
5664 : /* The VPERMD and VPERMPS instructions already properly ignore
5665 : the high bits of the shuffle elements. No need for us to
5666 : perform an AND ourselves. */
5667 1 : if (one_operand_shuffle)
5668 : {
5669 0 : emit_insn (gen_avx2_permvarv8si (target, op0, mask));
5670 0 : if (target != operands[0])
5671 0 : emit_move_insn (operands[0],
5672 0 : gen_lowpart (GET_MODE (operands[0]), target));
5673 : }
5674 : else
5675 : {
5676 1 : t1 = gen_reg_rtx (V8SImode);
5677 1 : t2 = gen_reg_rtx (V8SImode);
5678 1 : emit_insn (gen_avx2_permvarv8si (t1, op0, mask));
5679 1 : emit_insn (gen_avx2_permvarv8si (t2, op1, mask));
5680 1 : goto merge_two;
5681 : }
5682 0 : return;
5683 :
5684 0 : case E_V8SFmode:
5685 0 : mask = gen_lowpart (V8SImode, mask);
5686 0 : if (one_operand_shuffle)
5687 0 : emit_insn (gen_avx2_permvarv8sf (target, op0, mask));
5688 : else
5689 : {
5690 0 : t1 = gen_reg_rtx (V8SFmode);
5691 0 : t2 = gen_reg_rtx (V8SFmode);
5692 0 : emit_insn (gen_avx2_permvarv8sf (t1, op0, mask));
5693 0 : emit_insn (gen_avx2_permvarv8sf (t2, op1, mask));
5694 0 : goto merge_two;
5695 : }
5696 0 : return;
5697 :
5698 0 : case E_V4SImode:
5699 : /* By combining the two 128-bit input vectors into one 256-bit
5700 : input vector, we can use VPERMD and VPERMPS for the full
5701 : two-operand shuffle. */
5702 0 : t1 = gen_reg_rtx (V8SImode);
5703 0 : t2 = gen_reg_rtx (V8SImode);
5704 0 : emit_insn (gen_avx_vec_concatv8si (t1, op0, op1));
5705 0 : emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5706 0 : emit_insn (gen_avx2_permvarv8si (t1, t1, t2));
5707 0 : emit_insn (gen_avx_vextractf128v8si (target, t1, const0_rtx));
5708 0 : return;
5709 :
5710 0 : case E_V4SFmode:
5711 0 : t1 = gen_reg_rtx (V8SFmode);
5712 0 : t2 = gen_reg_rtx (V8SImode);
5713 0 : mask = gen_lowpart (V4SImode, mask);
5714 0 : emit_insn (gen_avx_vec_concatv8sf (t1, op0, op1));
5715 0 : emit_insn (gen_avx_vec_concatv8si (t2, mask, mask));
5716 0 : emit_insn (gen_avx2_permvarv8sf (t1, t1, t2));
5717 0 : emit_insn (gen_avx_vextractf128v8sf (target, t1, const0_rtx));
5718 0 : return;
5719 :
5720 0 : case E_V32QImode:
5721 0 : t1 = gen_reg_rtx (V32QImode);
5722 0 : t2 = gen_reg_rtx (V32QImode);
5723 0 : t3 = gen_reg_rtx (V32QImode);
5724 0 : vt2 = GEN_INT (-128);
5725 0 : vt = gen_const_vec_duplicate (V32QImode, vt2);
5726 0 : vt = force_reg (V32QImode, vt);
5727 0 : for (i = 0; i < 32; i++)
5728 0 : vec[i] = i < 16 ? vt2 : const0_rtx;
5729 0 : vt2 = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, vec));
5730 0 : vt2 = force_reg (V32QImode, vt2);
5731 : /* From mask create two adjusted masks, which contain the same
5732 : bits as mask in the low 7 bits of each vector element.
5733 : The first mask will have the most significant bit clear
5734 : if it requests element from the same 128-bit lane
5735 : and MSB set if it requests element from the other 128-bit lane.
5736 : The second mask will have the opposite values of the MSB,
5737 : and additionally will have its 128-bit lanes swapped.
5738 : E.g. { 07 12 1e 09 ... | 17 19 05 1f ... } mask vector will have
5739 : t1 { 07 92 9e 09 ... | 17 19 85 1f ... } and
5740 : t3 { 97 99 05 9f ... | 87 12 1e 89 ... } where each ...
5741 : stands for other 12 bytes. */
5742 : /* The bit whether element is from the same lane or the other
5743 : lane is bit 4, so shift it up by 3 to the MSB position. */
5744 0 : t5 = gen_reg_rtx (V4DImode);
5745 0 : emit_insn (gen_ashlv4di3 (t5, gen_lowpart (V4DImode, mask),
5746 : GEN_INT (3)));
5747 : /* Clear MSB bits from the mask just in case it had them set. */
5748 0 : emit_insn (gen_avx2_andnotv32qi3 (t2, vt, mask));
5749 : /* After this t1 will have MSB set for elements from other lane. */
5750 0 : emit_insn (gen_xorv32qi3 (t1, gen_lowpart (V32QImode, t5), vt2));
5751 : /* Clear bits other than MSB. */
5752 0 : emit_insn (gen_andv32qi3 (t1, t1, vt));
5753 : /* Or in the lower bits from mask into t3. */
5754 0 : emit_insn (gen_iorv32qi3 (t3, t1, t2));
5755 : /* And invert MSB bits in t1, so MSB is set for elements from the same
5756 : lane. */
5757 0 : emit_insn (gen_xorv32qi3 (t1, t1, vt));
5758 : /* Swap 128-bit lanes in t3. */
5759 0 : t6 = gen_reg_rtx (V4DImode);
5760 0 : emit_insn (gen_avx2_permv4di_1 (t6, gen_lowpart (V4DImode, t3),
5761 : const2_rtx, GEN_INT (3),
5762 : const0_rtx, const1_rtx));
5763 : /* And or in the lower bits from mask into t1. */
5764 0 : emit_insn (gen_iorv32qi3 (t1, t1, t2));
5765 0 : if (one_operand_shuffle)
5766 : {
5767 : /* Each of these shuffles will put 0s in places where
5768 : element from the other 128-bit lane is needed, otherwise
5769 : will shuffle in the requested value. */
5770 0 : emit_insn (gen_avx2_pshufbv32qi3 (t3, op0,
5771 0 : gen_lowpart (V32QImode, t6)));
5772 0 : emit_insn (gen_avx2_pshufbv32qi3 (t1, op0, t1));
5773 : /* For t3 the 128-bit lanes are swapped again. */
5774 0 : t7 = gen_reg_rtx (V4DImode);
5775 0 : emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t3),
5776 : const2_rtx, GEN_INT (3),
5777 : const0_rtx, const1_rtx));
5778 : /* And oring both together leads to the result. */
5779 0 : emit_insn (gen_iorv32qi3 (target, t1,
5780 0 : gen_lowpart (V32QImode, t7)));
5781 0 : if (target != operands[0])
5782 0 : emit_move_insn (operands[0],
5783 0 : gen_lowpart (GET_MODE (operands[0]), target));
5784 0 : return;
5785 : }
5786 :
5787 0 : t4 = gen_reg_rtx (V32QImode);
5788 : /* Similarly to the above one_operand_shuffle code,
5789 : just for repeated twice for each operand. merge_two:
5790 : code will merge the two results together. */
5791 0 : emit_insn (gen_avx2_pshufbv32qi3 (t4, op0,
5792 0 : gen_lowpart (V32QImode, t6)));
5793 0 : emit_insn (gen_avx2_pshufbv32qi3 (t3, op1,
5794 0 : gen_lowpart (V32QImode, t6)));
5795 0 : emit_insn (gen_avx2_pshufbv32qi3 (t2, op0, t1));
5796 0 : emit_insn (gen_avx2_pshufbv32qi3 (t1, op1, t1));
5797 0 : t7 = gen_reg_rtx (V4DImode);
5798 0 : emit_insn (gen_avx2_permv4di_1 (t7, gen_lowpart (V4DImode, t4),
5799 : const2_rtx, GEN_INT (3),
5800 : const0_rtx, const1_rtx));
5801 0 : t8 = gen_reg_rtx (V4DImode);
5802 0 : emit_insn (gen_avx2_permv4di_1 (t8, gen_lowpart (V4DImode, t3),
5803 : const2_rtx, GEN_INT (3),
5804 : const0_rtx, const1_rtx));
5805 0 : emit_insn (gen_iorv32qi3 (t4, t2, gen_lowpart (V32QImode, t7)));
5806 0 : emit_insn (gen_iorv32qi3 (t3, t1, gen_lowpart (V32QImode, t8)));
5807 0 : t1 = t4;
5808 0 : t2 = t3;
5809 0 : goto merge_two;
5810 :
5811 0 : default:
5812 0 : gcc_assert (GET_MODE_SIZE (mode) <= 16);
5813 : break;
5814 : }
5815 : }
5816 :
5817 1 : if (TARGET_XOP)
5818 : {
5819 : /* The XOP VPPERM insn supports three inputs. By ignoring the
5820 : one_operand_shuffle special case, we avoid creating another
5821 : set of constant vectors in memory. */
5822 0 : one_operand_shuffle = false;
5823 :
5824 : /* mask = mask & {2*w-1, ...} */
5825 0 : vt = GEN_INT (2*w - 1);
5826 : }
5827 : else
5828 : {
5829 : /* mask = mask & {w-1, ...} */
5830 1 : vt = GEN_INT (w - 1);
5831 : }
5832 :
5833 1 : vt = gen_const_vec_duplicate (maskmode, vt);
5834 1 : mask = expand_simple_binop (maskmode, AND, mask, vt,
5835 : NULL_RTX, 0, OPTAB_DIRECT);
5836 :
5837 : /* For non-QImode operations, convert the word permutation control
5838 : into a byte permutation control. */
5839 1 : if (mode != V16QImode)
5840 : {
5841 1 : mask = expand_simple_binop (maskmode, ASHIFT, mask,
5842 2 : GEN_INT (exact_log2 (e)),
5843 : NULL_RTX, 0, OPTAB_DIRECT);
5844 :
5845 : /* Convert mask to vector of chars. */
5846 1 : mask = force_reg (V16QImode, gen_lowpart (V16QImode, mask));
5847 :
5848 : /* Replicate each of the input bytes into byte positions:
5849 : (v2di) --> {0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8}
5850 : (v4si) --> {0,0,0,0, 4,4,4,4, 8,8,8,8, 12,12,12,12}
5851 : (v8hi) --> {0,0, 2,2, 4,4, 6,6, ...}. */
5852 18 : for (i = 0; i < 16; ++i)
5853 16 : vec[i] = GEN_INT (i/e * e);
5854 1 : vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5855 1 : vt = validize_mem (force_const_mem (V16QImode, vt));
5856 1 : if (TARGET_XOP)
5857 0 : emit_insn (gen_xop_pperm (mask, mask, mask, vt));
5858 : else
5859 1 : emit_insn (gen_ssse3_pshufbv16qi3 (mask, mask, vt));
5860 :
5861 : /* Convert it into the byte positions by doing
5862 : mask = mask + {0,1,..,16/w, 0,1,..,16/w, ...} */
5863 17 : for (i = 0; i < 16; ++i)
5864 16 : vec[i] = GEN_INT (i % e);
5865 1 : vt = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, vec));
5866 1 : vt = validize_mem (force_const_mem (V16QImode, vt));
5867 1 : emit_insn (gen_addv16qi3 (mask, mask, vt));
5868 : }
5869 :
5870 : /* The actual shuffle operations all operate on V16QImode. */
5871 1 : op0 = gen_lowpart (V16QImode, op0);
5872 1 : op1 = gen_lowpart (V16QImode, op1);
5873 :
5874 1 : if (TARGET_XOP)
5875 : {
5876 0 : if (GET_MODE (target) != V16QImode)
5877 0 : target = gen_reg_rtx (V16QImode);
5878 0 : emit_insn (gen_xop_pperm (target, op0, op1, mask));
5879 0 : if (target != operands[0])
5880 0 : emit_move_insn (operands[0],
5881 0 : gen_lowpart (GET_MODE (operands[0]), target));
5882 : }
5883 1 : else if (one_operand_shuffle)
5884 : {
5885 1 : if (GET_MODE (target) != V16QImode)
5886 1 : target = gen_reg_rtx (V16QImode);
5887 1 : emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, mask));
5888 1 : if (target != operands[0])
5889 1 : emit_move_insn (operands[0],
5890 1 : gen_lowpart (GET_MODE (operands[0]), target));
5891 : }
5892 : else
5893 : {
5894 0 : rtx xops[6];
5895 0 : bool ok;
5896 :
5897 : /* Shuffle the two input vectors independently. */
5898 0 : t1 = gen_reg_rtx (V16QImode);
5899 0 : t2 = gen_reg_rtx (V16QImode);
5900 0 : emit_insn (gen_ssse3_pshufbv16qi3 (t1, op0, mask));
5901 0 : emit_insn (gen_ssse3_pshufbv16qi3 (t2, op1, mask));
5902 :
5903 1 : merge_two:
5904 : /* Then merge them together. The key is whether any given control
5905 : element contained a bit set that indicates the second word. */
5906 1 : mask = operands[3];
5907 1 : vt = GEN_INT (w);
5908 1 : if (maskmode == V2DImode && !TARGET_SSE4_1)
5909 : {
5910 : /* Without SSE4.1, we don't have V2DImode EQ. Perform one
5911 : more shuffle to convert the V2DI input mask into a V4SI
5912 : input mask. At which point the masking that expand_int_vcond
5913 : will work as desired. */
5914 0 : rtx t3 = gen_reg_rtx (V4SImode);
5915 0 : emit_insn (gen_sse2_pshufd_1 (t3, gen_lowpart (V4SImode, mask),
5916 : const0_rtx, const0_rtx,
5917 : const2_rtx, const2_rtx));
5918 0 : mask = t3;
5919 0 : maskmode = V4SImode;
5920 0 : e = w = 4;
5921 : }
5922 :
5923 1 : vt = gen_const_vec_duplicate (maskmode, vt);
5924 1 : vt = force_reg (maskmode, vt);
5925 1 : mask = expand_simple_binop (maskmode, AND, mask, vt,
5926 : NULL_RTX, 0, OPTAB_DIRECT);
5927 :
5928 1 : if (GET_MODE (target) != mode)
5929 0 : target = gen_reg_rtx (mode);
5930 1 : xops[0] = target;
5931 1 : xops[1] = gen_lowpart (mode, t2);
5932 1 : xops[2] = gen_lowpart (mode, t1);
5933 1 : xops[3] = gen_rtx_EQ (maskmode, mask, vt);
5934 1 : xops[4] = mask;
5935 1 : xops[5] = vt;
5936 1 : ok = ix86_expand_int_vcond (xops);
5937 1 : gcc_assert (ok);
5938 1 : if (target != operands[0])
5939 0 : emit_move_insn (operands[0],
5940 0 : gen_lowpart (GET_MODE (operands[0]), target));
5941 : }
5942 : }
5943 :
5944 : /* Extend SRC into next wider integer vector type. UNSIGNED_P is
5945 : true if we should do zero extension, else sign extension. */
5946 :
5947 : void
5948 290 : ix86_expand_sse_extend (rtx dest, rtx src, bool unsigned_p)
5949 : {
5950 290 : machine_mode imode = GET_MODE (src);
5951 290 : rtx ops[3];
5952 :
5953 290 : switch (imode)
5954 : {
5955 290 : case E_V8QImode:
5956 290 : case E_V4QImode:
5957 290 : case E_V2QImode:
5958 290 : case E_V4HImode:
5959 290 : case E_V2HImode:
5960 290 : case E_V2SImode:
5961 290 : break;
5962 0 : default:
5963 0 : gcc_unreachable ();
5964 : }
5965 :
5966 290 : ops[0] = dest;
5967 :
5968 290 : ops[1] = force_reg (imode, src);
5969 :
5970 290 : if (unsigned_p)
5971 92 : ops[2] = force_reg (imode, CONST0_RTX (imode));
5972 : else
5973 198 : ops[2] = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
5974 : ops[1], pc_rtx, pc_rtx);
5975 :
5976 290 : ix86_split_mmx_punpck (ops, false);
5977 290 : }
5978 :
/* Unpack SRC into the next wider integer vector type.  UNSIGNED_P is
   true if we should do zero extension, else sign extension.  HIGH_P is
   true if we want the N/2 high elements, else the low elements.
   DEST determines the output mode.  */

void
ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
{
  machine_mode imode = GET_MODE (src);
  rtx tmp;

  if (TARGET_SSE4_1)
    {
      /* With SSE4.1 (and the wider AVX2/AVX512 variants) we have
	 direct sign/zero extension insns which operate on the low
	 half of their input, so for HIGH_P we first have to bring the
	 high elements into the low positions.  */
      rtx (*unpack)(rtx, rtx);
      rtx (*extract)(rtx, rtx) = NULL;
      machine_mode halfmode = BLKmode;

      switch (imode)
	{
	case E_V64QImode:
	  if (unsigned_p)
	    unpack = gen_avx512bw_zero_extendv32qiv32hi2;
	  else
	    unpack = gen_avx512bw_sign_extendv32qiv32hi2;
	  halfmode = V32QImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v64qi : gen_vec_extract_lo_v64qi;
	  break;
	case E_V32QImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv16qiv16hi2;
	  else
	    unpack = gen_avx2_sign_extendv16qiv16hi2;
	  halfmode = V16QImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v32qi : gen_vec_extract_lo_v32qi;
	  break;
	case E_V32HImode:
	  if (unsigned_p)
	    unpack = gen_avx512f_zero_extendv16hiv16si2;
	  else
	    unpack = gen_avx512f_sign_extendv16hiv16si2;
	  halfmode = V16HImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v32hi : gen_vec_extract_lo_v32hi;
	  break;
	case E_V16HImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv8hiv8si2;
	  else
	    unpack = gen_avx2_sign_extendv8hiv8si2;
	  halfmode = V8HImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v16hi : gen_vec_extract_lo_v16hi;
	  break;
	case E_V16SImode:
	  if (unsigned_p)
	    unpack = gen_avx512f_zero_extendv8siv8di2;
	  else
	    unpack = gen_avx512f_sign_extendv8siv8di2;
	  halfmode = V8SImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v16si : gen_vec_extract_lo_v16si;
	  break;
	case E_V8SImode:
	  if (unsigned_p)
	    unpack = gen_avx2_zero_extendv4siv4di2;
	  else
	    unpack = gen_avx2_sign_extendv4siv4di2;
	  halfmode = V4SImode;
	  extract
	    = high_p ? gen_vec_extract_hi_v8si : gen_vec_extract_lo_v8si;
	  break;
	/* For 16-byte and smaller inputs no extract is needed; HIGH_P
	   is handled below by shifting the high half down.  */
	case E_V16QImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv8qiv8hi2;
	  else
	    unpack = gen_sse4_1_sign_extendv8qiv8hi2;
	  break;
	case E_V8HImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv4hiv4si2;
	  else
	    unpack = gen_sse4_1_sign_extendv4hiv4si2;
	  break;
	case E_V4SImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv2siv2di2;
	  else
	    unpack = gen_sse4_1_sign_extendv2siv2di2;
	  break;
	case E_V8QImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv4qiv4hi2;
	  else
	    unpack = gen_sse4_1_sign_extendv4qiv4hi2;
	  break;
	case E_V4HImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv2hiv2si2;
	  else
	    unpack = gen_sse4_1_sign_extendv2hiv2si2;
	  break;
	case E_V4QImode:
	  if (unsigned_p)
	    unpack = gen_sse4_1_zero_extendv2qiv2hi2;
	  else
	    unpack = gen_sse4_1_sign_extendv2qiv2hi2;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (GET_MODE_SIZE (imode) >= 32)
	{
	  /* 256-bit and 512-bit inputs: extract the requested half
	     into a half-width register first.  */
	  tmp = gen_reg_rtx (halfmode);
	  emit_insn (extract (tmp, src));
	}
      else if (high_p)
	{
	  /* Move the high half of the input into the low element
	     positions with a full-vector logical right shift.  */
	  switch (GET_MODE_SIZE (imode))
	    {
	    case 16:
	      /* Shift higher 8 bytes to lower 8 bytes.  */
	      tmp = gen_reg_rtx (V1TImode);
	      emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
					     GEN_INT (64)));
	      break;
	    case 8:
	      /* Shift higher 4 bytes to lower 4 bytes.  */
	      tmp = gen_reg_rtx (V1DImode);
	      emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
					    GEN_INT (32)));
	      break;
	    case 4:
	      /* Shift higher 2 bytes to lower 2 bytes.  */
	      tmp = gen_reg_rtx (V1SImode);
	      emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
					    GEN_INT (16)));
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  tmp = gen_lowpart (imode, tmp);
	}
      else
	tmp = src;

      emit_insn (unpack (dest, tmp));
    }
  else
    {
      /* Without SSE4.1, interleave the source with either zeros (zero
	 extension) or a mask of its sign bits computed as 0 > SRC
	 (sign extension); the interleave yields the widened elements
	 directly.  */
      rtx (*unpack)(rtx, rtx, rtx);

      switch (imode)
	{
	case E_V16QImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv16qi;
	  else
	    unpack = gen_vec_interleave_lowv16qi;
	  break;
	case E_V8HImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv8hi;
	  else
	    unpack = gen_vec_interleave_lowv8hi;
	  break;
	case E_V4SImode:
	  if (high_p)
	    unpack = gen_vec_interleave_highv4si;
	  else
	    unpack = gen_vec_interleave_lowv4si;
	  break;
	case E_V8QImode:
	  if (high_p)
	    unpack = gen_mmx_punpckhbw;
	  else
	    unpack = gen_mmx_punpcklbw;
	  break;
	case E_V4HImode:
	  if (high_p)
	    unpack = gen_mmx_punpckhwd;
	  else
	    unpack = gen_mmx_punpcklwd;
	  break;
	case E_V4QImode:
	  if (high_p)
	    unpack = gen_mmx_punpckhbw_low;
	  else
	    unpack = gen_mmx_punpcklbw_low;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (unsigned_p)
	tmp = force_reg (imode, CONST0_RTX (imode));
      else
	tmp = ix86_expand_sse_cmp (gen_reg_rtx (imode), GT, CONST0_RTX (imode),
				   src, pc_rtx, pc_rtx);

      rtx tmp2 = gen_reg_rtx (imode);
      emit_insn (unpack (tmp2, src, tmp));
      emit_move_insn (dest, gen_lowpart (GET_MODE (dest), tmp2));
    }
}
6186 :
6187 : /* Return true if mem is pool constant which contains a const_vector
6188 : perm index, assign the index to PERM. */
6189 : bool
6190 35 : ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
6191 : {
6192 35 : machine_mode mode = GET_MODE (mem);
6193 35 : int nelt = GET_MODE_NUNITS (mode);
6194 :
6195 35 : if (!INTEGRAL_MODE_P (mode))
6196 : return false;
6197 :
6198 : /* Needs to be constant pool. */
6199 35 : if (!(MEM_P (mem))
6200 35 : || !SYMBOL_REF_P (XEXP (mem, 0))
6201 70 : || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
6202 : return false;
6203 :
6204 35 : rtx constant = get_pool_constant (XEXP (mem, 0));
6205 :
6206 35 : if (!CONST_VECTOR_P (constant))
6207 : return false;
6208 :
6209 : /* There could be some rtx like
6210 : (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
6211 : but with "*.LC1" refer to V2DI constant vector. */
6212 35 : if (GET_MODE (constant) != mode)
6213 : {
6214 0 : constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
6215 :
6216 0 : if (constant == nullptr || !CONST_VECTOR_P (constant))
6217 : return false;
6218 : }
6219 :
6220 771 : for (int i = 0; i != nelt; i++)
6221 736 : perm[i] = UINTVAL (XVECEXP (constant, 0, i));
6222 :
6223 : return true;
6224 : }
6225 :
/* Split OPERAND into word-sized parts, storing them into PARTS and
   returning the number of parts (2 to 4).  Similar to
   split_double_mode, but works for floating point parameters and
   nonoffsetable memories.  For pushes, it returns just stack offsets;
   the values will be saved in the right order.  */

static int
ix86_split_to_parts (rtx operand, rtx *parts, machine_mode mode)
{
  int size;

  /* Number of word-sized pieces: words are 4 bytes on ia32 (XFmode
     occupies 3 of them) and 8 bytes on x86-64 (XFmode rounds up).  */
  if (!TARGET_64BIT)
    size = mode==XFmode ? 3 : GET_MODE_SIZE (mode) / 4;
  else
    size = (GET_MODE_SIZE (mode) + 4) / 8;

  gcc_assert (!REG_P (operand) || !MMX_REGNO_P (REGNO (operand)));
  gcc_assert (size >= 2 && size <= 4);

  /* Optimize constant pool reference to immediates.  This is used by fp
     moves, that force all constants to memory to allow combining.  */
  if (MEM_P (operand) && MEM_READONLY_P (operand))
    operand = avoid_constant_pool_reference (operand);

  if (MEM_P (operand) && !offsettable_memref_p (operand))
    {
      /* The only non-offsetable memories we handle are pushes.  */
      int ok = push_operand (operand, VOIDmode);

      gcc_assert (ok);

      /* Every part is the same word_mode push; the stack pointer
	 adjustment of each emitted push provides the offsets.  */
      operand = copy_rtx (operand);
      PUT_MODE (operand, word_mode);
      parts[0] = parts[1] = parts[2] = parts[3] = operand;
      return size;
    }

  if (CONST_VECTOR_P (operand))
    {
      scalar_int_mode imode = int_mode_for_mode (mode).require ();
      /* Caution: if we looked through a constant pool memory above,
	 the operand may actually have a different mode now.  That's
	 ok, since we want to pun this all the way back to an integer.  */
      operand = simplify_subreg (imode, operand, GET_MODE (operand), 0);
      gcc_assert (operand != NULL);
      mode = imode;
    }

  if (!TARGET_64BIT)
    {
      if (mode == DImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      else
	{
	  int i;

	  if (REG_P (operand))
	    {
	      /* Splitting a multi-register value by register number is
		 only valid once register allocation is final.  */
	      gcc_assert (reload_completed);
	      for (i = 0; i < size; i++)
		parts[i] = gen_rtx_REG (SImode, REGNO (operand) + i);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      /* Address the memory in SImode slices at 4-byte offsets.  */
	      operand = adjust_address (operand, SImode, 0);
	      parts[0] = operand;
	      for (i = 1; i < size; i++)
		parts[i] = adjust_address (operand, SImode, 4 * i);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      /* Decompose the FP constant into 32-bit immediates.  */
	      const REAL_VALUE_TYPE *r;
	      long l[4];

	      r = CONST_DOUBLE_REAL_VALUE (operand);
	      switch (mode)
		{
		case E_TFmode:
		  real_to_target (l, r, mode);
		  parts[3] = gen_int_mode (l[3], SImode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_XFmode:
		  /* We can't use REAL_VALUE_TO_TARGET_LONG_DOUBLE since
		     long double may not be 80-bit.  */
		  real_to_target (l, r, mode);
		  parts[2] = gen_int_mode (l[2], SImode);
		  break;
		case E_DFmode:
		  REAL_VALUE_TO_TARGET_DOUBLE (*r, l);
		  break;
		default:
		  gcc_unreachable ();
		}
	      /* The low two 32-bit pieces are common to all modes.  */
	      parts[1] = gen_int_mode (l[1], SImode);
	      parts[0] = gen_int_mode (l[0], SImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }
  else
    {
      if (mode == TImode)
	split_double_mode (mode, &operand, 1, &parts[0], &parts[1]);
      if (mode == XFmode || mode == TFmode)
	{
	  /* The second part is SImode for the 80-bit XFmode (only 4
	     bytes remain above the low DImode word), DImode for
	     TFmode.  */
	  machine_mode upper_mode = mode==XFmode ? SImode : DImode;
	  if (REG_P (operand))
	    {
	      gcc_assert (reload_completed);
	      parts[0] = gen_rtx_REG (DImode, REGNO (operand) + 0);
	      parts[1] = gen_rtx_REG (upper_mode, REGNO (operand) + 1);
	    }
	  else if (offsettable_memref_p (operand))
	    {
	      operand = adjust_address (operand, DImode, 0);
	      parts[0] = operand;
	      parts[1] = adjust_address (operand, upper_mode, 8);
	    }
	  else if (CONST_DOUBLE_P (operand))
	    {
	      long l[4];

	      real_to_target (l, CONST_DOUBLE_REAL_VALUE (operand), mode);

	      /* real_to_target puts 32-bit pieces in each long.  */
	      parts[0] = gen_int_mode ((l[0] & HOST_WIDE_INT_C (0xffffffff))
				       | ((l[1] & HOST_WIDE_INT_C (0xffffffff))
					  << 32), DImode);

	      if (upper_mode == SImode)
		parts[1] = gen_int_mode (l[2], SImode);
	      else
		parts[1]
		  = gen_int_mode ((l[2] & HOST_WIDE_INT_C (0xffffffff))
				  | ((l[3] & HOST_WIDE_INT_C (0xffffffff))
				     << 32), DImode);
	    }
	  else
	    gcc_unreachable ();
	}
    }

  return size;
}
6371 :
/* Emit insns to perform a move or push of DI, DF, XF, and TF values.
   The value is split into word-sized parts which are moved (or
   pushed) individually, reordered where necessary so that no source
   part is clobbered before it has been copied.  Operands 2-5 and 6-9
   are used as scratch slots for the destination and source parts
   respectively.  */

void
ix86_split_long_move (rtx operands[])
{
  rtx part[2][4];
  int nparts, i, j;
  int push = 0;
  int collisions = 0;
  machine_mode mode = GET_MODE (operands[0]);
  bool collisionparts[4];

  /* The DFmode expanders may ask us to move double.
     For 64bit target this is single move.  By hiding the fact
     here we simplify i386.md splitters.  */
  if (TARGET_64BIT && GET_MODE_SIZE (GET_MODE (operands[0])) == 8)
    {
      /* Optimize constant pool reference to immediates.  This is used by
	 fp moves, that force all constants to memory to allow combining.  */

      if (MEM_P (operands[1])
	  && SYMBOL_REF_P (XEXP (operands[1], 0))
	  && CONSTANT_POOL_ADDRESS_P (XEXP (operands[1], 0)))
	operands[1] = get_pool_constant (XEXP (operands[1], 0));
      if (push_operand (operands[0], VOIDmode))
	{
	  operands[0] = copy_rtx (operands[0]);
	  PUT_MODE (operands[0], word_mode);
	}
      else
	operands[0] = gen_lowpart (DImode, operands[0]);
      operands[1] = gen_lowpart (DImode, operands[1]);
      emit_move_insn (operands[0], operands[1]);
      return;
    }

  /* The only non-offsettable memory we handle is push.  */
  if (push_operand (operands[0], VOIDmode))
    push = 1;
  else
    gcc_assert (!MEM_P (operands[0])
		|| offsettable_memref_p (operands[0]));

  nparts = ix86_split_to_parts (operands[1], part[1], GET_MODE (operands[0]));
  ix86_split_to_parts (operands[0], part[0], GET_MODE (operands[0]));

  /* When emitting push, take care for source operands on the stack.  */
  if (push && MEM_P (operands[1])
      && reg_overlap_mentioned_p (stack_pointer_rtx, operands[1]))
    {
      rtx src_base = XEXP (part[1][nparts - 1], 0);

      /* Compensate for the stack decrement by 4.  */
      if (!TARGET_64BIT && nparts == 3
	  && mode == XFmode && TARGET_128BIT_LONG_DOUBLE)
	src_base = plus_constant (Pmode, src_base, 4);

      /* src_base refers to the stack pointer and is
	 automatically decreased by emitted push.  */
      for (i = 0; i < nparts; i++)
	part[1][i] = change_address (part[1][i],
				     GET_MODE (part[1][i]), src_base);
    }

  /* We need to do copy in the right order in case an address register
     of the source overlaps the destination.  */
  if (REG_P (part[0][0]) && MEM_P (part[1][0]))
    {
      rtx tmp;

      /* Record which destination parts overlap the source address.  */
      for (i = 0; i < nparts; i++)
	{
	  collisionparts[i]
	    = reg_overlap_mentioned_p (part[0][i], XEXP (part[1][0], 0));
	  if (collisionparts[i])
	    collisions++;
	}

      /* Collision in the middle part can be handled by reordering.  */
      if (collisions == 1 && nparts == 3 && collisionparts [1])
	{
	  std::swap (part[0][1], part[0][2]);
	  std::swap (part[1][1], part[1][2]);
	}
      else if (collisions == 1
	       && nparts == 4
	       && (collisionparts [1] || collisionparts [2]))
	{
	  if (collisionparts [1])
	    {
	      std::swap (part[0][1], part[0][2]);
	      std::swap (part[1][1], part[1][2]);
	    }
	  else
	    {
	      std::swap (part[0][2], part[0][3]);
	      std::swap (part[1][2], part[1][3]);
	    }
	}

      /* If there are more collisions, we can't handle it by reordering.
	 Do an lea to the last part and use only one colliding move.  */
      else if (collisions > 1)
	{
	  rtx base, addr;

	  collisions = 1;

	  base = part[0][nparts - 1];

	  /* Handle the case when the last part isn't valid for lea.
	     Happens in 64-bit mode storing the 12-byte XFmode.  */
	  if (GET_MODE (base) != Pmode)
	    base = gen_rtx_REG (Pmode, REGNO (base));

	  addr = XEXP (part[1][0], 0);
	  if (TARGET_TLS_DIRECT_SEG_REFS)
	    {
	      struct ix86_address parts;
	      int ok = ix86_decompose_address (addr, &parts);
	      gcc_assert (ok);
	      /* It is not valid to use %gs: or %fs: in lea.  */
	      gcc_assert (parts.seg == ADDR_SPACE_GENERIC);
	    }
	  emit_insn (gen_rtx_SET (base, addr));
	  part[1][0] = replace_equiv_address (part[1][0], base);
	  for (i = 1; i < nparts; i++)
	    {
	      tmp = plus_constant (Pmode, base, UNITS_PER_WORD * i);
	      part[1][i] = replace_equiv_address (part[1][i], tmp);
	    }
	}
    }

  if (push)
    {
      /* Pushes are emitted highest part first so the value ends up in
	 memory in the right order; parts 1 and 0 are pushed at the
	 bottom of the function.  */
      if (!TARGET_64BIT)
	{
	  if (nparts == 3)
	    {
	      if (TARGET_128BIT_LONG_DOUBLE && mode == XFmode)
		emit_insn (gen_add2_insn (stack_pointer_rtx, GEN_INT (-4)));
	      emit_move_insn (part[0][2], part[1][2]);
	    }
	  else if (nparts == 4)
	    {
	      emit_move_insn (part[0][3], part[1][3]);
	      emit_move_insn (part[0][2], part[1][2]);
	    }
	}
      else
	{
	  /* In 64bit mode we don't have 32bit push available.  In case this is
	     register, it is OK - we will just use larger counterpart.  We also
	     retype memory - these comes from attempt to avoid REX prefix on
	     moving of second half of TFmode value.  */
	  if (GET_MODE (part[1][1]) == SImode)
	    {
	      switch (GET_CODE (part[1][1]))
		{
		case MEM:
		  part[1][1] = adjust_address (part[1][1], DImode, 0);
		  break;

		case REG:
		  part[1][1] = gen_rtx_REG (DImode, REGNO (part[1][1]));
		  break;

		default:
		  gcc_unreachable ();
		}

	      if (GET_MODE (part[1][0]) == SImode)
		part[1][0] = part[1][1];
	    }
	}
      emit_move_insn (part[0][1], part[1][1]);
      emit_move_insn (part[0][0], part[1][0]);
      return;
    }

  /* Choose correct order to not overwrite the source before it is copied.  */
  if ((REG_P (part[0][0])
       && REG_P (part[1][1])
       && (REGNO (part[0][0]) == REGNO (part[1][1])
	   || (nparts == 3
	       && REGNO (part[0][0]) == REGNO (part[1][2]))
	   || (nparts == 4
	       && REGNO (part[0][0]) == REGNO (part[1][3]))))
      || (collisions > 0
	  && reg_overlap_mentioned_p (part[0][0], XEXP (part[1][0], 0))))
    {
      /* Copy highest part first.  */
      for (i = 0, j = nparts - 1; i < nparts; i++, j--)
	{
	  operands[2 + i] = part[0][j];
	  operands[6 + i] = part[1][j];
	}
    }
  else
    {
      /* Copy lowest part first.  */
      for (i = 0; i < nparts; i++)
	{
	  operands[2 + i] = part[0][i];
	  operands[6 + i] = part[1][i];
	}
    }

  /* Attempt to locally unCSE nonzero constants.  */
  for (j = 0; j < nparts - 1; j++)
    if (CONST_INT_P (operands[6 + j])
	&& operands[6 + j] != const0_rtx
	&& REG_P (operands[2 + j]))
      for (i = j; i < nparts - 1; i++)
	if (CONST_INT_P (operands[7 + i])
	    && INTVAL (operands[7 + i]) == INTVAL (operands[6 + j]))
	  operands[7 + i] = operands[2 + j];

  for (i = 0; i < nparts; i++)
    emit_move_insn (operands[2 + i], operands[6 + i]);

  return;
}
6597 :
6598 : /* Helper function of ix86_split_ashl used to generate an SImode/DImode
6599 : left shift by a constant, either using a single shift or
6600 : a sequence of add instructions. */
6601 :
6602 : static void
6603 4343 : ix86_expand_ashl_const (rtx operand, int count, machine_mode mode)
6604 : {
6605 4343 : if (count == 1
6606 4343 : || (count * ix86_cost->add <= ix86_cost->shift_const
6607 0 : && !optimize_insn_for_size_p ()))
6608 : {
6609 16 : while (count-- > 0)
6610 8 : emit_insn (gen_add2_insn (operand, operand));
6611 : }
6612 : else
6613 : {
6614 4335 : rtx (*insn)(rtx, rtx, rtx);
6615 :
6616 4335 : insn = mode == DImode ? gen_ashlsi3 : gen_ashldi3;
6617 4335 : emit_insn (insn (operand, operand, GEN_INT (count)));
6618 : }
6619 4343 : }
6620 :
/* Split a double-word left shift (DImode on ia32, TImode on x86-64)
   into operations on the word-sized halves.  OPERANDS[0] and
   OPERANDS[1] are the destination and source, OPERANDS[2] the shift
   count; SCRATCH, if non-NULL, is a scratch register used for the
   variable-count adjustment on TARGET_CMOVE.  */
void
ix86_split_ashl (rtx *operands, rtx scratch, machine_mode mode)
{
  rtx (*gen_ashl3)(rtx, rtx, rtx);
  rtx (*gen_shld)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;
  machine_mode half_mode;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      /* Constant shift count: mask it to the operand width, as the
	 hardware shift would.  */
      split_double_mode (mode, operands, 2, low, high);
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
	{
	  /* The whole low word moves into the high word; the low word
	     becomes zero.  */
	  emit_move_insn (high[0], low[1]);
	  ix86_expand_clear (low[0]);

	  if (count > half_width)
	    ix86_expand_ashl_const (high[0], count - half_width, mode);
	}
      else if (count == 1)
	{
	  /* Shift by one via add/adc: add the low halves (setting the
	     carry) and add-with-carry the high halves.  */
	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);
	  rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
	  rtx x4 = gen_rtx_LTU (mode, x3, const0_rtx);
	  half_mode = mode == DImode ? SImode : DImode;
	  emit_insn (gen_add3_cc_overflow_1 (half_mode, low[0],
					     low[0], low[0]));
	  emit_insn (gen_add3_carry (half_mode, high[0], high[0], high[0],
				     x3, x4));
	}
      else
	{
	  /* General constant count below half width: shld the bits
	     leaving the low word into the high word, then shift the
	     low word.  */
	  gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shld (high[0], low[0], GEN_INT (count)));
	  ix86_expand_ashl_const (low[0], count, mode);
	}
      return;
    }

  /* Variable shift count.  */
  split_double_mode (mode, operands, 1, low, high);
  half_mode = mode == DImode ? SImode : DImode;

  gen_ashl3 = mode == DImode ? gen_ashlsi3 : gen_ashldi3;

  if (operands[1] == const1_rtx)
    {
      /* Assuming we've chosen a QImode capable registers, then 1 << N
	 can be done with two 32/64-bit shifts, no branches, no cmoves.  */
      if (ANY_QI_REG_P (low[0]) && ANY_QI_REG_P (high[0]))
	{
	  rtx s, d, flags = gen_rtx_REG (CCZmode, FLAGS_REG);

	  /* Test the half-width bit of the count and set the QImode
	     low byte of low/high to its complement/value via setcc.  */
	  ix86_expand_clear (low[0]);
	  ix86_expand_clear (high[0]);
	  emit_insn (gen_testqi_ccz_1 (operands[2], GEN_INT (half_width)));

	  d = gen_lowpart (QImode, low[0]);
	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
	  s = gen_rtx_EQ (QImode, flags, const0_rtx);
	  emit_insn (gen_rtx_SET (d, s));

	  d = gen_lowpart (QImode, high[0]);
	  d = gen_rtx_STRICT_LOW_PART (VOIDmode, d);
	  s = gen_rtx_NE (QImode, flags, const0_rtx);
	  emit_insn (gen_rtx_SET (d, s));
	}

      /* Otherwise, we can get the same results by manually performing
	 a bit extract operation on bit 5/6, and then performing the two
	 shifts.  The two methods of getting 0/1 into low/high are exactly
	 the same size.  Avoiding the shift in the bit extract case helps
	 pentium4 a bit; no one else seems to care much either way.  */
      else
	{
	  rtx (*gen_lshr3)(rtx, rtx, rtx);
	  rtx (*gen_and3)(rtx, rtx, rtx);
	  rtx (*gen_xor3)(rtx, rtx, rtx);
	  HOST_WIDE_INT bits;
	  rtx x;

	  if (mode == DImode)
	    {
	      gen_lshr3 = gen_lshrsi3;
	      gen_and3 = gen_andsi3;
	      gen_xor3 = gen_xorsi3;
	      bits = 5;
	    }
	  else
	    {
	      gen_lshr3 = gen_lshrdi3;
	      gen_and3 = gen_anddi3;
	      gen_xor3 = gen_xordi3;
	      bits = 6;
	    }

	  if (TARGET_PARTIAL_REG_STALL && !optimize_insn_for_size_p ())
	    x = gen_rtx_ZERO_EXTEND (half_mode, operands[2]);
	  else
	    x = gen_lowpart (half_mode, operands[2]);
	  emit_insn (gen_rtx_SET (high[0], x));

	  /* high = (count >> bits) & 1; low = high ^ 1.  */
	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (bits)));
	  emit_insn (gen_and3 (high[0], high[0], const1_rtx));
	  emit_move_insn (low[0], high[0]);
	  emit_insn (gen_xor3 (low[0], low[0], const1_rtx));
	}

      /* Shift both halves by the full (modulo half-width) count.  */
      emit_insn (gen_ashl3 (low[0], low[0], operands[2]));
      emit_insn (gen_ashl3 (high[0], high[0], operands[2]));
      return;
    }

  if (operands[1] == constm1_rtx)
    {
      /* For -1 << N, we can avoid the shld instruction, because we
	 know that we're shifting 0...31/63 ones into a -1.  */
      emit_move_insn (low[0], constm1_rtx);
      if (optimize_insn_for_size_p ())
	emit_move_insn (high[0], low[0]);
      else
	emit_move_insn (high[0], constm1_rtx);
    }
  else
    {
      gen_shld = mode == DImode ? gen_x86_shld : gen_x86_64_shld;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      emit_insn (gen_shld (high[0], low[0], operands[2]));
    }

  emit_insn (gen_ashl3 (low[0], low[0], operands[2]));

  /* Fix up the result for counts >= half_width: with cmov, use the
     scratch register; otherwise emit the branchy adjustment.  */
  if (TARGET_CMOVE && scratch)
    {
      ix86_expand_clear (scratch);
      emit_insn (gen_x86_shift_adj_1
		 (half_mode, high[0], low[0], operands[2], scratch));
    }
  else
    emit_insn (gen_x86_shift_adj_2 (half_mode, high[0], low[0], operands[2]));
}
6775 :
/* Split a double-word arithmetic right shift into half-word operations.
   MODE is the double-word mode being split: DImode (SImode halves) or
   TImode (DImode halves).  operands[0] is the destination, operands[1]
   the source and operands[2] the shift count.  SCRATCH, if non-NULL,
   is a spare half-word register used for the variable-count fix-up.  */
void
ix86_split_ashr (rtx *operands, rtx scratch, machine_mode mode)
{
  /* Arithmetic shift generator for a single half-word.  */
  rtx (*gen_ashr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_ashrsi3 : gen_ashrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      /* Only the low log2(bitsize) bits of the count are significant.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count == GET_MODE_BITSIZE (mode) - 1)
	{
	  /* Shift by bitsize-1: both halves end up as the sign bit
	     replicated, so one half-word sar suffices.  */
	  emit_move_insn (high[0], high[1]);
	  emit_insn (gen_ashr3 (high[0], high[0],
				GEN_INT (half_width - 1)));
	  emit_move_insn (low[0], high[0]);

	}
      else if (count >= half_width)
	{
	  /* Count of at least a half-word: the low half comes from the
	     source's high half; the high half is the sign extension.  */
	  emit_move_insn (low[0], high[1]);
	  emit_move_insn (high[0], low[0]);
	  emit_insn (gen_ashr3 (high[0], high[0],
				GEN_INT (half_width - 1)));

	  if (count > half_width)
	    emit_insn (gen_ashr3 (low[0], low[0],
				  GEN_INT (count - half_width)));
	}
      else if (count == 1
	       && (TARGET_USE_RCR || optimize_size > 1))
	{
	  /* Single-bit shift via sar-through-carry plus rcr, when RCR
	     is preferred or size optimization is aggressive.  */
	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);
	  if (mode == DImode)
	    {
	      /* DImode splits into SImode halves, hence the SI
		 carry patterns.  */
	      emit_insn (gen_ashrsi3_carry (high[0], high[0]));
	      emit_insn (gen_rcrsi2 (low[0], low[0]));
	    }
	  else
	    {
	      emit_insn (gen_ashrdi3_carry (high[0], high[0]));
	      emit_insn (gen_rcrdi2 (low[0], low[0]));
	    }
	}
      else
	{
	  /* Generic constant count below half-width: SHRD shifts the
	     high half's low bits into the low half, then the high half
	     is shifted arithmetically on its own.  */
	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
	  emit_insn (gen_ashr3 (high[0], high[0], GEN_INT (count)));
	}
    }
  else
    {
      /* Variable count: emit shrd + sar unconditionally, then fix up
	 the result for counts >= half_width.  */
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_ashr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
	{
	  /* Fill SCRATCH with the sign word, then use the cmove-based
	     adjustment pattern (see x86_shift_adj_1) for the fix-up.  */
	  emit_move_insn (scratch, high[0]);
	  emit_insn (gen_ashr3 (scratch, scratch,
				GEN_INT (half_width - 1)));
	  emit_insn (gen_x86_shift_adj_1
		     (half_mode, low[0], high[0], operands[2], scratch));
	}
      else
	/* No scratch/cmove: use the branch-based adjustment pattern.  */
	emit_insn (gen_x86_shift_adj_3
		   (half_mode, low[0], high[0], operands[2]));
    }
}
6866 :
/* Split a double-word logical right shift into half-word operations.
   MODE is the double-word mode being split: DImode (SImode halves) or
   TImode (DImode halves).  operands[0] is the destination, operands[1]
   the source and operands[2] the shift count.  SCRATCH, if non-NULL,
   is a spare half-word register used for the variable-count fix-up.  */
void
ix86_split_lshr (rtx *operands, rtx scratch, machine_mode mode)
{
  /* Logical shift generator for a single half-word.  */
  rtx (*gen_lshr3)(rtx, rtx, rtx)
    = mode == DImode ? gen_lshrsi3 : gen_lshrdi3;
  rtx (*gen_shrd)(rtx, rtx, rtx);
  int half_width = GET_MODE_BITSIZE (mode) >> 1;

  rtx low[2], high[2];
  int count;

  if (CONST_INT_P (operands[2]))
    {
      split_double_mode (mode, operands, 2, low, high);
      /* Only the low log2(bitsize) bits of the count are significant.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (mode) - 1);

      if (count >= half_width)
	{
	  /* Count of at least a half-word: the low half comes from the
	     source's high half; the high half becomes zero.  */
	  emit_move_insn (low[0], high[1]);
	  ix86_expand_clear (high[0]);

	  if (count > half_width)
	    emit_insn (gen_lshr3 (low[0], low[0],
				  GEN_INT (count - half_width)));
	}
      else if (count == 1
	       && (TARGET_USE_RCR || optimize_size > 1))
	{
	  /* Single-bit shift via shr-through-carry plus rcr, when RCR
	     is preferred or size optimization is aggressive.  */
	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);
	  if (mode == DImode)
	    {
	      /* DImode splits into SImode halves, hence the SI
		 carry patterns.  */
	      emit_insn (gen_lshrsi3_carry (high[0], high[0]));
	      emit_insn (gen_rcrsi2 (low[0], low[0]));
	    }
	  else
	    {
	      emit_insn (gen_lshrdi3_carry (high[0], high[0]));
	      emit_insn (gen_rcrdi2 (low[0], low[0]));
	    }
	}
      else
	{
	  /* Generic constant count below half-width: SHRD shifts the
	     high half's low bits into the low half, then the high half
	     is shifted logically on its own.  */
	  gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

	  if (!rtx_equal_p (operands[0], operands[1]))
	    emit_move_insn (operands[0], operands[1]);

	  emit_insn (gen_shrd (low[0], high[0], GEN_INT (count)));
	  emit_insn (gen_lshr3 (high[0], high[0], GEN_INT (count)));
	}
    }
  else
    {
      /* Variable count: emit shrd + shr unconditionally, then fix up
	 the result for counts >= half_width.  */
      machine_mode half_mode;

      gen_shrd = mode == DImode ? gen_x86_shrd : gen_x86_64_shrd;

      if (!rtx_equal_p (operands[0], operands[1]))
	emit_move_insn (operands[0], operands[1]);

      split_double_mode (mode, operands, 1, low, high);
      half_mode = mode == DImode ? SImode : DImode;

      emit_insn (gen_shrd (low[0], high[0], operands[2]));
      emit_insn (gen_lshr3 (high[0], high[0], operands[2]));

      if (TARGET_CMOVE && scratch)
	{
	  /* Clear SCRATCH and use the cmove-based adjustment pattern
	     (see x86_shift_adj_1) for the fix-up.  */
	  ix86_expand_clear (scratch);
	  emit_insn (gen_x86_shift_adj_1
		     (half_mode, low[0], high[0], operands[2], scratch));
	}
      else
	/* No scratch/cmove: use the branch-based adjustment pattern.  */
	emit_insn (gen_x86_shift_adj_2
		   (half_mode, low[0], high[0], operands[2]));
    }
}
6945 :
/* Helper function to split TImode ashl under NDD (APX new-data-
   destination forms).  operands[0] is the destination, operands[1]
   the source and operands[2] the shift count; SCRATCH, if non-NULL,
   is a spare DImode register for the variable-count fix-up.
   Requires TARGET_APX_NDD.  */
void
ix86_split_ashl_ndd (rtx *operands, rtx scratch)
{
  gcc_assert (TARGET_APX_NDD);
  int half_width = GET_MODE_BITSIZE (TImode) >> 1;

  rtx low[2], high[2];
  int count;

  split_double_mode (TImode, operands, 2, low, high);
  if (CONST_INT_P (operands[2]))
    {
      /* Only the low 7 bits of the count are significant.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);

      if (count >= half_width)
	{
	  /* Count of at least 64: high half is the source's low half
	     shifted by the remainder; low half becomes zero.  */
	  count = count - half_width;
	  if (count == 0)
	    {
	      if (!rtx_equal_p (high[0], low[1]))
		emit_move_insn (high[0], low[1]);
	    }
	  else if (count == 1)
	    /* Shift-by-one as an add.  */
	    emit_insn (gen_adddi3 (high[0], low[1], low[1]));
	  else
	    emit_insn (gen_ashldi3 (high[0], low[1], GEN_INT (count)));

	  ix86_expand_clear (low[0]);
	}
      else if (count == 1)
	{
	  /* Double the 128-bit value with an add/adc pair: the carry
	     out of the low-half add feeds the high-half add.  */
	  rtx x3 = gen_rtx_REG (CCCmode, FLAGS_REG);
	  rtx x4 = gen_rtx_LTU (TImode, x3, const0_rtx);
	  emit_insn (gen_add3_cc_overflow_1 (DImode, low[0],
					     low[1], low[1]));
	  emit_insn (gen_add3_carry (DImode, high[0], high[1], high[1],
				     x3, x4));
	}
      else
	{
	  /* Generic constant count below 64: SHLD carries the low
	     half's top bits into the high half.  */
	  emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
					  GEN_INT (count)));
	  emit_insn (gen_ashldi3 (low[0], low[1], GEN_INT (count)));
	}
    }
  else
    {
      /* Variable count: emit shld + shl unconditionally, then fix up
	 the result for counts >= 64.  */
      emit_insn (gen_x86_64_shld_ndd (high[0], high[1], low[1],
				      operands[2]));
      emit_insn (gen_ashldi3 (low[0], low[1], operands[2]));
      if (TARGET_CMOVE && scratch)
	{
	  /* Clear SCRATCH and use the cmove-based adjustment pattern
	     (see x86_shift_adj_1).  */
	  ix86_expand_clear (scratch);
	  emit_insn (gen_x86_shift_adj_1
		     (DImode, high[0], low[0], operands[2], scratch));
	}
      else
	/* Branch-based adjustment when no scratch/cmove.  */
	emit_insn (gen_x86_shift_adj_2 (DImode, high[0], low[0], operands[2]));
    }
}
7007 :
/* Helper function to split TImode l/ashr under NDD (APX new-data-
   destination forms).  CODE is ASHIFTRT or LSHIFTRT.  operands[0] is
   the destination, operands[1] the source and operands[2] the shift
   count; SCRATCH, if non-NULL, is a spare DImode register for the
   variable-count fix-up.  Requires TARGET_APX_NDD.  */
void
ix86_split_rshift_ndd (enum rtx_code code, rtx *operands, rtx scratch)
{
  gcc_assert (TARGET_APX_NDD);
  int half_width = GET_MODE_BITSIZE (TImode) >> 1;
  bool ashr_p = code == ASHIFTRT;
  /* Half-word (DImode) shift generator matching CODE.  */
  rtx (*gen_shr)(rtx, rtx, rtx) = ashr_p ? gen_ashrdi3
					 : gen_lshrdi3;

  rtx low[2], high[2];
  int count;

  split_double_mode (TImode, operands, 2, low, high);
  if (CONST_INT_P (operands[2]))
    {
      /* Only the low 7 bits of the count are significant.  */
      count = INTVAL (operands[2]) & (GET_MODE_BITSIZE (TImode) - 1);

      if (ashr_p && (count == GET_MODE_BITSIZE (TImode) - 1))
	{
	  /* Arithmetic shift by 127: both halves become the
	     replicated sign bit.  */
	  emit_insn (gen_shr (high[0], high[1],
			      GEN_INT (half_width - 1)));
	  emit_move_insn (low[0], high[0]);
	}
      else if (count >= half_width)
	{
	  /* Count of at least 64: high half is the sign extension
	     (ashr) or zero (lshr); low half comes from the source's
	     high half.  */
	  if (ashr_p)
	    emit_insn (gen_shr (high[0], high[1],
				GEN_INT (half_width - 1)));
	  else
	    ix86_expand_clear (high[0]);

	  if (count > half_width)
	    emit_insn (gen_shr (low[0], high[1],
				GEN_INT (count - half_width)));
	  else
	    emit_move_insn (low[0], high[1]);
	}
      else
	{
	  /* Generic constant count below 64: SHRD carries the high
	     half's low bits into the low half.  */
	  emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
					  GEN_INT (count)));
	  emit_insn (gen_shr (high[0], high[1], GEN_INT (count)));
	}
    }
  else
    {
      /* Variable count: emit shrd + shift unconditionally, then fix up
	 the result for counts >= 64.  */
      emit_insn (gen_x86_64_shrd_ndd (low[0], low[1], high[1],
				      operands[2]));
      emit_insn (gen_shr (high[0], high[1], operands[2]));

      if (TARGET_CMOVE && scratch)
	{
	  /* SCRATCH holds the value the high half takes for large
	     counts: the sign word for ashr, zero for lshr.  The
	     cmove-based x86_shift_adj_1 pattern does the fix-up.  */
	  if (ashr_p)
	    {
	      emit_move_insn (scratch, high[0]);
	      emit_insn (gen_shr (scratch, scratch,
				  GEN_INT (half_width - 1)));
	    }
	  else
	    ix86_expand_clear (scratch);

	  emit_insn (gen_x86_shift_adj_1
		     (DImode, low[0], high[0], operands[2], scratch));
	}
      else if (ashr_p)
	/* Branch-based fix-up, arithmetic variant.  */
	emit_insn (gen_x86_shift_adj_3
		   (DImode, low[0], high[0], operands[2]));
      else
	/* Branch-based fix-up, logical variant.  */
	emit_insn (gen_x86_shift_adj_2
		   (DImode, low[0], high[0], operands[2]));
    }
}
7081 :
7082 : /* Expand move of V1TI mode register X to a new TI mode register. */
7083 : static rtx
7084 17 : ix86_expand_v1ti_to_ti (rtx x)
7085 : {
7086 17 : rtx result = gen_reg_rtx (TImode);
7087 17 : if (TARGET_SSE2)
7088 : {
7089 17 : rtx temp = force_reg (V2DImode, gen_lowpart (V2DImode, x));
7090 17 : rtx lo = gen_lowpart (DImode, result);
7091 17 : emit_insn (gen_vec_extractv2didi (lo, temp, const0_rtx));
7092 17 : rtx hi = gen_highpart (DImode, result);
7093 17 : emit_insn (gen_vec_extractv2didi (hi, temp, const1_rtx));
7094 : }
7095 : else
7096 0 : emit_move_insn (result, gen_lowpart (TImode, x));
7097 17 : return result;
7098 : }
7099 :
7100 : /* Expand move of TI mode register X to a new V1TI mode register. */
7101 : static rtx
7102 17 : ix86_expand_ti_to_v1ti (rtx x)
7103 : {
7104 17 : if (TARGET_SSE2)
7105 : {
7106 17 : rtx lo = gen_lowpart (DImode, x);
7107 17 : rtx hi = gen_highpart (DImode, x);
7108 17 : rtx tmp = gen_reg_rtx (V2DImode);
7109 17 : emit_insn (gen_vec_concatv2di (tmp, lo, hi));
7110 17 : return force_reg (V1TImode, gen_lowpart (V1TImode, tmp));
7111 : }
7112 :
7113 0 : return force_reg (V1TImode, gen_lowpart (V1TImode, x));
7114 : }
7115 :
7116 : /* Expand V1TI mode shift (of rtx_code CODE) by constant. */
7117 : void
7118 42 : ix86_expand_v1ti_shift (enum rtx_code code, rtx operands[])
7119 : {
7120 42 : rtx op1 = force_reg (V1TImode, operands[1]);
7121 :
7122 42 : if (!CONST_INT_P (operands[2]))
7123 : {
7124 6 : rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
7125 6 : rtx tmp2 = gen_reg_rtx (TImode);
7126 3 : rtx (*shift) (rtx, rtx, rtx)
7127 6 : = (code == ASHIFT) ? gen_ashlti3 : gen_lshrti3;
7128 6 : emit_insn (shift (tmp2, tmp1, operands[2]));
7129 6 : rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
7130 6 : emit_move_insn (operands[0], tmp3);
7131 6 : return;
7132 : }
7133 :
7134 36 : HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;
7135 :
7136 36 : if (bits == 0)
7137 : {
7138 0 : emit_move_insn (operands[0], op1);
7139 0 : return;
7140 : }
7141 :
7142 36 : if ((bits & 7) == 0)
7143 : {
7144 0 : rtx tmp = gen_reg_rtx (V1TImode);
7145 0 : if (code == ASHIFT)
7146 0 : emit_insn (gen_sse2_ashlv1ti3 (tmp, op1, GEN_INT (bits)));
7147 : else
7148 0 : emit_insn (gen_sse2_lshrv1ti3 (tmp, op1, GEN_INT (bits)));
7149 0 : emit_move_insn (operands[0], tmp);
7150 0 : return;
7151 : }
7152 :
7153 36 : rtx tmp1 = gen_reg_rtx (V1TImode);
7154 36 : if (code == ASHIFT)
7155 18 : emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (64)));
7156 : else
7157 18 : emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));
7158 :
7159 : /* tmp2 is operands[1] shifted by 64, in V2DImode. */
7160 36 : rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
7161 :
7162 : /* tmp3 will be the V2DImode result. */
7163 36 : rtx tmp3 = gen_reg_rtx (V2DImode);
7164 :
7165 36 : if (bits > 64)
7166 : {
7167 18 : if (code == ASHIFT)
7168 9 : emit_insn (gen_ashlv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
7169 : else
7170 9 : emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (bits - 64)));
7171 : }
7172 : else
7173 : {
7174 : /* tmp4 is operands[1], in V2DImode. */
7175 18 : rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
7176 :
7177 18 : rtx tmp5 = gen_reg_rtx (V2DImode);
7178 18 : if (code == ASHIFT)
7179 9 : emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (bits)));
7180 : else
7181 9 : emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));
7182 :
7183 18 : rtx tmp6 = gen_reg_rtx (V2DImode);
7184 18 : if (code == ASHIFT)
7185 9 : emit_insn (gen_lshrv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
7186 : else
7187 9 : emit_insn (gen_ashlv2di3 (tmp6, tmp2, GEN_INT (64 - bits)));
7188 :
7189 18 : emit_insn (gen_iorv2di3 (tmp3, tmp5, tmp6));
7190 : }
7191 :
7192 : /* Convert the result back to V1TImode and store in operands[0]. */
7193 36 : rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
7194 36 : emit_move_insn (operands[0], tmp7);
7195 : }
7196 :
/* Expand V1TI mode rotate (of rtx_code CODE) by constant.  CODE is
   ROTATE or ROTATERT; operands[0] is the destination, operands[1]
   the source and operands[2] the count.  */
void
ix86_expand_v1ti_rotate (enum rtx_code code, rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  /* Variable counts go through a TImode register-pair rotate.  */
  if (!CONST_INT_P (operands[2]))
    {
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      rtx (*rotate) (rtx, rtx, rtx)
	= (code == ROTATE) ? gen_rotlti3 : gen_rotrti3;
      emit_insn (rotate (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  /* A zero count is a plain copy.  */
  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  /* Canonicalize a right rotate to the equivalent left rotate.  */
  if (code == ROTATERT)
    bits = 128 - bits;

  if ((bits & 31) == 0)
    {
      /* Whole-dword rotates are a single pshufd permutation.  */
      rtx tmp2 = gen_reg_rtx (V4SImode);
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      if (bits == 32)
	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x93)));
      else if (bits == 64)
	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x4e)));
      else
	emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0x39)));
      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp2));
      return;
    }

  if ((bits & 7) == 0)
    {
      /* Whole-byte rotates: OR together a left and a right full-width
	 byte shift.  */
      rtx tmp1 = gen_reg_rtx (V1TImode);
      rtx tmp2 = gen_reg_rtx (V1TImode);
      rtx tmp3 = gen_reg_rtx (V1TImode);

      emit_insn (gen_sse2_ashlv1ti3 (tmp1, op1, GEN_INT (bits)));
      emit_insn (gen_sse2_lshrv1ti3 (tmp2, op1, GEN_INT (128 - bits)));
      emit_insn (gen_iorv1ti3 (tmp3, tmp1, tmp2));
      emit_move_insn (operands[0], tmp3);
      return;
    }

  /* General case: view the source as four dwords, pick permutations
     of it whose per-dword shifts, ORed together, form the rotate.  */
  rtx op1_v4si = force_reg (V4SImode, gen_lowpart (V4SImode, op1));

  rtx lobits;
  rtx hibits;

  /* Select LOBITS/HIBITS permutations according to which dword
     quadrant the count falls into (0x93/0x4e/0x39 rotate the dwords
     by one, two and three positions respectively).  */
  switch (bits >> 5)
    {
    case 0:
      lobits = op1_v4si;
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x93)));
      break;

    case 1:
      lobits = gen_reg_rtx (V4SImode);
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x93)));
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x4e)));
      break;

    case 2:
      lobits = gen_reg_rtx (V4SImode);
      hibits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x4e)));
      emit_insn (gen_sse2_pshufd (hibits, op1_v4si, GEN_INT (0x39)));
      break;

    default:
      lobits = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (lobits, op1_v4si, GEN_INT (0x39)));
      hibits = op1_v4si;
      break;
    }

  rtx tmp1 = gen_reg_rtx (V4SImode);
  rtx tmp2 = gen_reg_rtx (V4SImode);
  rtx tmp3 = gen_reg_rtx (V4SImode);

  /* Per-dword shifts of the two permutations, then combine.  */
  emit_insn (gen_ashlv4si3 (tmp1, lobits, GEN_INT (bits & 31)));
  emit_insn (gen_lshrv4si3 (tmp2, hibits, GEN_INT (32 - (bits & 31))));
  emit_insn (gen_iorv4si3 (tmp3, tmp1, tmp2));

  emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
}
7297 :
/* Expand V1TI mode ashiftrt by constant.  operands[0] is the V1TImode
   destination, operands[1] the source and operands[2] the shift count.
   Non-constant counts are routed through a TImode shift; constant
   counts are open-coded as short SSE/AVX sequences, tried from the
   cheapest special cases (the "N operations" comments count the
   vector instructions emitted) down to the generic fallbacks.  */
void
ix86_expand_v1ti_ashiftrt (rtx operands[])
{
  rtx op1 = force_reg (V1TImode, operands[1]);

  /* Variable counts go through a TImode register-pair shift.  */
  if (!CONST_INT_P (operands[2]))
    {
      rtx tmp1 = ix86_expand_v1ti_to_ti (op1);
      rtx tmp2 = gen_reg_rtx (TImode);
      emit_insn (gen_ashrti3 (tmp2, tmp1, operands[2]));
      rtx tmp3 = ix86_expand_ti_to_v1ti (tmp2);
      emit_move_insn (operands[0], tmp3);
      return;
    }

  HOST_WIDE_INT bits = INTVAL (operands[2]) & 127;

  /* A zero count is a plain copy.  */
  if (bits == 0)
    {
      emit_move_insn (operands[0], op1);
      return;
    }

  /* Shift by 127: broadcast the top dword, then replicate its sign
     bit through every element.  */
  if (bits == 127)
    {
      /* Two operations.  */
      rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp3));
      return;
    }

  /* Shift by 64: interleave the source's high qword with the sign
     mask.  */
  if (bits == 64)
    {
      /* Three operations.  */
      rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
      return;
    }

  /* Shift by 96: interleave with the sign mask, then shuffle the
     wanted dwords into place.  */
  if (bits == 96)
    {
      /* Three operations.  */
      rtx tmp1 = force_reg(V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));

      rtx tmp3 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_interleave_highv2di (tmp5, tmp3, tmp4));

      rtx tmp6 = force_reg(V4SImode, gen_lowpart (V4SImode, tmp5));
      rtx tmp7 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp7, tmp6, GEN_INT (0xfd)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
      return;
    }

  /* Counts of 111..126: only bits of the top dword survive; shift it
     arithmetically and spread the result with word/dword shuffles.  */
  if (bits >= 111)
    {
      /* Three operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));

      rtx tmp3 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
      rtx tmp4 = gen_reg_rtx (V8HImode);
      emit_insn (gen_sse2_pshufhw (tmp4, tmp3, GEN_INT (0xfe)));

      rtx tmp5 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp4));
      rtx tmp6 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp6, tmp5, GEN_INT (0xfe)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
      return;
    }

  if (TARGET_AVX2 || TARGET_SSE4_1)
    {
      /* Blend-capable targets: merge a logical full shift with a
	 per-dword arithmetic shift that supplies the sign bits.  */
      /* Three operations.  */
      if (bits == 32)
	{
	  rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
	  rtx tmp2 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (31)));

	  rtx tmp3 = gen_reg_rtx (V1TImode);
	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (32)));

	  if (TARGET_AVX2)
	    {
	      rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
	      rtx tmp5 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
					       GEN_INT (7)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
	    }
	  else
	    {
	      rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
	      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
	      rtx tmp6 = gen_reg_rtx (V8HImode);
	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
					     GEN_INT (0x3f)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
	    }
	  return;
	}

      /* Three operations.  */
      if (bits == 8 || bits == 16 || bits == 24)
	{
	  rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
	  rtx tmp2 = gen_reg_rtx (V4SImode);
	  emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));

	  rtx tmp3 = gen_reg_rtx (V1TImode);
	  emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (bits)));

	  if (TARGET_AVX2)
	    {
	      rtx tmp4 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp3));
	      rtx tmp5 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_avx2_pblenddv4si (tmp5, tmp2, tmp4,
					       GEN_INT (7)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp5));
	    }
	  else
	    {
	      rtx tmp4 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
	      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
	      rtx tmp6 = gen_reg_rtx (V8HImode);
	      emit_insn (gen_sse4_1_pblendw (tmp6, tmp4, tmp5,
					     GEN_INT (0x3f)));

	      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp6));
	    }
	  return;
	}
    }

  /* Remaining counts 97..110 without a cheaper sequence above.  */
  if (bits > 96)
    {
      /* Four operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits - 96)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp1, GEN_INT (31)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp2));
      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_interleave_highv2di (tmp6, tmp4, tmp5));

      rtx tmp7 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp6));
      rtx tmp8 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp8, tmp7, GEN_INT (0xfd)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp8));
      return;
    }

  /* Word-aligned counts where pblendw can merge sign mask and
     shifted value.  */
  if (TARGET_SSE4_1 && (bits == 48 || bits == 80))
    {
      /* Four operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));

      rtx tmp5 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp3));
      rtx tmp6 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp4));
      rtx tmp7 = gen_reg_rtx (V8HImode);
      emit_insn (gen_sse4_1_pblendw (tmp7, tmp5, tmp6,
				     GEN_INT (bits == 48 ? 0x1f : 0x07)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp7));
      return;
    }

  /* Whole-byte counts: logical shift right, then OR in the sign mask
     shifted left into the vacated bytes.  */
  if ((bits & 7) == 0)
    {
      /* Five operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (bits)));

      rtx tmp5 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
      rtx tmp6 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_ashlv1ti3 (tmp6, tmp5, GEN_INT (128 - bits)));

      rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
      rtx tmp8 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp6));
      rtx tmp9 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp9, tmp7, tmp8));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp9));
      return;
    }

  /* Small counts with AVX2: build the logically shifted value as in
     ix86_expand_v1ti_shift, then blend in per-dword sign bits.  */
  if (TARGET_AVX2 && bits < 32)
    {
      /* Six operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));

      rtx tmp3 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp7 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));

      rtx tmp8 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));

      rtx tmp9 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp8));
      rtx tmp10 = gen_reg_rtx (V4SImode);
      emit_insn (gen_avx2_pblenddv4si (tmp10, tmp2, tmp9, GEN_INT (7)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp10));
      return;
    }

  /* Same idea with SSE4.1 pblendw for counts below 15.  */
  if (TARGET_SSE4_1 && bits < 15)
    {
      /* Six operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp2, tmp1, GEN_INT (bits)));

      rtx tmp3 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp3, op1, GEN_INT (64)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp5, tmp4, GEN_INT (bits)));

      rtx tmp6 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp7 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp7, tmp6, GEN_INT (64 - bits)));

      rtx tmp8 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp8, tmp5, tmp7));

      rtx tmp9 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp2));
      rtx tmp10 = force_reg (V8HImode, gen_lowpart (V8HImode, tmp8));
      rtx tmp11 = gen_reg_rtx (V8HImode);
      emit_insn (gen_sse4_1_pblendw (tmp11, tmp9, tmp10, GEN_INT (0x3f)));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp11));
      return;
    }

  /* SSE2-only shift by one.  */
  if (bits == 1)
    {
      /* Eight operations.  */
      rtx tmp1 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp1, op1, GEN_INT (64)));

      rtx tmp2 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp3 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp3, tmp2, GEN_INT (1)));

      rtx tmp4 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp1));
      rtx tmp5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp5, tmp4, GEN_INT (63)));

      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp6, tmp3, tmp5));

      rtx tmp7 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp7, tmp2, GEN_INT (63)));

      rtx tmp8 = force_reg (V4SImode, gen_lowpart (V4SImode, tmp7));
      rtx tmp9 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp9, tmp8, GEN_INT (0xbf)));

      rtx tmp10 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp9));
      rtx tmp11 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp11, tmp10, GEN_INT (31)));

      rtx tmp12 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp12, tmp6, tmp11));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp12));
      return;
    }

  /* Generic SSE2 fallbacks.  */
  if (bits > 64)
    {
      /* Eight operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));

      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits - 64)));

      rtx tmp7 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
      rtx tmp8 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_ashlv1ti3 (tmp8, tmp7, GEN_INT (64)));

      rtx tmp9 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp3));
      rtx tmp10 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp10, tmp9, GEN_INT (128 - bits)));

      rtx tmp11 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp8));
      rtx tmp12 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp12, tmp10, tmp11));

      rtx tmp13 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp13, tmp6, tmp12));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp13));
    }
  else
    {
      /* Nine operations.  */
      rtx tmp1 = force_reg (V4SImode, gen_lowpart (V4SImode, op1));
      rtx tmp2 = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_pshufd (tmp2, tmp1, GEN_INT (0xff)));

      rtx tmp3 = gen_reg_rtx (V4SImode);
      emit_insn (gen_ashrv4si3 (tmp3, tmp2, GEN_INT (31)));

      rtx tmp4 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_lshrv1ti3 (tmp4, op1, GEN_INT (64)));

      rtx tmp5 = force_reg (V2DImode, gen_lowpart (V2DImode, op1));
      rtx tmp6 = gen_reg_rtx (V2DImode);
      emit_insn (gen_lshrv2di3 (tmp6, tmp5, GEN_INT (bits)));

      rtx tmp7 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp4));
      rtx tmp8 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp8, tmp7, GEN_INT (64 - bits)));

      rtx tmp9 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp9, tmp6, tmp8));

      rtx tmp10 = force_reg (V1TImode, gen_lowpart (V1TImode, tmp3));
      rtx tmp11 = gen_reg_rtx (V1TImode);
      emit_insn (gen_sse2_ashlv1ti3 (tmp11, tmp10, GEN_INT (64)));

      rtx tmp12 = force_reg (V2DImode, gen_lowpart (V2DImode, tmp11));
      rtx tmp13 = gen_reg_rtx (V2DImode);
      emit_insn (gen_ashlv2di3 (tmp13, tmp12, GEN_INT (64 - bits)));

      rtx tmp14 = gen_reg_rtx (V2DImode);
      emit_insn (gen_iorv2di3 (tmp14, tmp9, tmp13));

      emit_move_insn (operands[0], gen_lowpart (V1TImode, tmp14));
    }
}
7699 :
7700 : /* Expand V2DI mode ashiftrt. */
7701 : void
7702 371 : ix86_expand_v2di_ashiftrt (rtx operands[])
7703 : {
     :   /* A shift count of zero degenerates to a plain copy.  */
7704 371 : if (operands[2] == const0_rtx)
7705 : {
7706 0 : emit_move_insn (operands[0], operands[1]);
7707 0 : return;
7708 : }
7709 :
     :   /* An arithmetic shift by 63 leaves only the replicated sign bit in
     :      each lane (0 or -1), which is exactly what PCMPGTQ of zero against
     :      the value computes, so with SSE4.2 a single insn suffices.  */
7710 371 : if (TARGET_SSE4_2
7711 133 : && CONST_INT_P (operands[2])
7712 133 : && UINTVAL (operands[2]) >= 63
7713 379 : && !optimize_insn_for_size_p ())
7714 : {
7715 8 : rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
7716 8 : emit_insn (gen_sse4_2_gtv2di3 (operands[0], zero, operands[1]));
7717 8 : return;
7718 : }
7719 :
     :   /* Constant shift counts (when XOP cannot handle them directly) are
     :      synthesized in V4SImode: arithmetic-shift the 32-bit halves and
     :      stitch the correct halves back together with a permutation.  */
7720 363 : if (CONST_INT_P (operands[2])
7721 349 : && (!TARGET_XOP || UINTVAL (operands[2]) >= 63))
7722 : {
7723 253 : vec_perm_builder sel (4, 4, 1);
7724 253 : sel.quick_grow (4);
7725 253 : rtx arg0, arg1;
7726 253 : rtx op1 = lowpart_subreg (V4SImode,
7727 : force_reg (V2DImode, operands[1]),
7728 : V2DImode);
7729 253 : rtx target = gen_reg_rtx (V4SImode);
     :       /* Count >= 63: both halves of each lane become the sign mask;
     :          duplicate the high (sign-carrying) dwords.  */
7730 253 : if (UINTVAL (operands[2]) >= 63)
7731 : {
7732 87 : arg0 = arg1 = gen_reg_rtx (V4SImode);
7733 87 : emit_insn (gen_ashrv4si3 (arg0, op1, GEN_INT (31)));
7734 87 : sel[0] = 1;
7735 87 : sel[1] = 1;
7736 87 : sel[2] = 3;
7737 87 : sel[3] = 3;
7738 : }
     :       /* 32 < count < 63: low dword comes from the high dword shifted
     :          by count-32; high dword is the sign mask.  */
7739 166 : else if (INTVAL (operands[2]) > 32)
7740 : {
7741 15 : arg0 = gen_reg_rtx (V4SImode);
7742 15 : arg1 = gen_reg_rtx (V4SImode);
7743 15 : emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
7744 15 : emit_insn (gen_ashrv4si3 (arg0, op1,
7745 15 : GEN_INT (INTVAL (operands[2]) - 32)));
7746 15 : sel[0] = 1;
7747 15 : sel[1] = 5;
7748 15 : sel[2] = 3;
7749 15 : sel[3] = 7;
7750 : }
     :       /* Count == 32: the shift itself is free - just take the
     :          unshifted high dword as the low half.  */
7751 151 : else if (INTVAL (operands[2]) == 32)
7752 : {
7753 3 : arg0 = op1;
7754 3 : arg1 = gen_reg_rtx (V4SImode);
7755 3 : emit_insn (gen_ashrv4si3 (arg1, op1, GEN_INT (31)));
7756 3 : sel[0] = 1;
7757 3 : sel[1] = 5;
7758 3 : sel[2] = 3;
7759 3 : sel[3] = 7;
7760 : }
     :       /* Count < 32: logical V2DI shift supplies the low dwords (the
     :          bits shifted in across the 32-bit boundary are correct);
     :          the V4SI arithmetic shift supplies the high dwords.  */
7761 : else
7762 : {
7763 148 : arg0 = gen_reg_rtx (V2DImode);
7764 148 : arg1 = gen_reg_rtx (V4SImode);
7765 148 : emit_insn (gen_lshrv2di3 (arg0, operands[1], operands[2]));
7766 148 : emit_insn (gen_ashrv4si3 (arg1, op1, operands[2]));
7767 148 : arg0 = lowpart_subreg (V4SImode, arg0, V2DImode);
7768 148 : sel[0] = 0;
7769 148 : sel[1] = 5;
7770 148 : sel[2] = 2;
7771 148 : sel[3] = 7;
7772 : }
7773 340 : vec_perm_indices indices (sel, arg0 != arg1 ? 2 : 1, 4);
7774 253 : rtx op0 = operands[0];
7775 253 : bool ok = targetm.vectorize.vec_perm_const (V4SImode, V4SImode,
7776 : target, arg0, arg1,
7777 : indices);
7778 253 : gcc_assert (ok);
7779 253 : emit_move_insn (op0, lowpart_subreg (V2DImode, target, V4SImode));
7780 253 : return;
7781 253 : }
     :   /* No XOP: emulate the (possibly variable) arithmetic shift as
     :      (x >>u n) | (sign_mask << (64 - n)).  */
7782 110 : if (!TARGET_XOP)
7783 : {
7784 14 : rtx zero = force_reg (V2DImode, CONST0_RTX (V2DImode));
7785 14 : rtx zero_or_all_ones;
7786 14 : if (TARGET_SSE4_2)
7787 : {
7788 0 : zero_or_all_ones = gen_reg_rtx (V2DImode);
7789 0 : emit_insn (gen_sse4_2_gtv2di3 (zero_or_all_ones, zero,
7790 : operands[1]));
7791 : }
7792 : else
7793 : {
     :           /* Without PCMPGTQ, build the per-lane sign mask by shifting
     :              the dwords right by 31 and broadcasting the high dword of
     :              each lane with PSHUFD.  */
7794 14 : rtx temp = gen_reg_rtx (V4SImode);
7795 14 : emit_insn (gen_ashrv4si3 (temp,
7796 : lowpart_subreg (V4SImode,
7797 : force_reg (V2DImode,
7798 : operands[1]),
7799 : V2DImode),
7800 : GEN_INT (31)));
7801 14 : zero_or_all_ones = gen_reg_rtx (V4SImode);
7802 14 : emit_insn (gen_sse2_pshufd_1 (zero_or_all_ones, temp,
7803 : const1_rtx, const1_rtx,
7804 : GEN_INT (3), GEN_INT (3)));
7805 14 : zero_or_all_ones = lowpart_subreg (V2DImode, zero_or_all_ones,
7806 : V4SImode);
7807 : }
7808 14 : rtx lshr_res = gen_reg_rtx (V2DImode);
7809 14 : emit_insn (gen_lshrv2di3 (lshr_res, operands[1], operands[2]));
7810 14 : rtx ashl_res = gen_reg_rtx (V2DImode);
7811 14 : rtx amount;
     :       /* Compute 64 - n as the left-shift amount for the sign mask.  */
7812 14 : if (TARGET_64BIT)
7813 : {
7814 14 : amount = gen_reg_rtx (DImode);
7815 14 : emit_insn (gen_subdi3 (amount, force_reg (DImode, GEN_INT (64)),
7816 : operands[2]));
7817 : }
7818 : else
7819 : {
7820 0 : rtx temp = gen_reg_rtx (SImode);
7821 0 : emit_insn (gen_subsi3 (temp, force_reg (SImode, GEN_INT (64)),
7822 : lowpart_subreg (SImode, operands[2],
7823 : DImode)));
7824 0 : amount = gen_reg_rtx (V4SImode);
7825 0 : emit_insn (gen_vec_setv4si_0 (amount, CONST0_RTX (V4SImode),
7826 : temp));
7827 : }
7828 14 : amount = lowpart_subreg (DImode, amount, GET_MODE (amount));
7829 14 : emit_insn (gen_ashlv2di3 (ashl_res, zero_or_all_ones, amount));
7830 14 : emit_insn (gen_iorv2di3 (operands[0], lshr_res, ashl_res));
7831 14 : return;
7832 : }
7833 :
     :   /* XOP path: VPSHAQ shifts arithmetically right when given a negated
     :      count, so negate the count (at expand time for constants, at run
     :      time otherwise) and emit a single insn.  */
7834 96 : rtx reg = gen_reg_rtx (V2DImode);
7835 96 : rtx par;
7836 96 : bool negate = false;
7837 96 : int i;
7838 :
7839 96 : if (CONST_INT_P (operands[2]))
7840 96 : operands[2] = GEN_INT (-INTVAL (operands[2]));
7841 : else
7842 : negate = true;
7843 :
7844 96 : par = gen_rtx_PARALLEL (V2DImode, rtvec_alloc (2));
7845 288 : for (i = 0; i < 2; i++)
7846 192 : XVECEXP (par, 0, i) = operands[2];
7847 :
7848 96 : emit_insn (gen_vec_initv2didi (reg, par));
7849 :
7850 96 : if (negate)
7851 0 : emit_insn (gen_negv2di2 (reg, reg));
7852 :
7853 96 : emit_insn (gen_xop_shav2di3 (operands[0], operands[1], reg));
7854 : }
7855 :
7856 : /* Replace all occurrences of REG FROM with REG TO in X, including
7857 : occurrences with different modes. */
7858 :
7859 : rtx
7860 38659 : ix86_replace_reg_with_reg (rtx x, rtx from, rtx to)
7861 : {
7862 38659 : gcc_checking_assert (REG_P (from)
7863 : && REG_P (to)
7864 : && GET_MODE (from) == GET_MODE (to));
7865 38659 : if (!reg_overlap_mentioned_p (from, x))
7866 : return x;
7867 100 : rtx ret = copy_rtx (x);
7868 100 : subrtx_ptr_iterator::array_type array;
7869 488 : FOR_EACH_SUBRTX_PTR (iter, array, &ret, NONCONST)
7870 : {
7871 388 : rtx *loc = *iter;
7872 388 : x = *loc;
7873 388 : if (REG_P (x) && REGNO (x) == REGNO (from))
7874 : {
7875 100 : if (x == from)
7876 100 : *loc = to;
7877 : else
7878 : {
7879 0 : gcc_checking_assert (REG_NREGS (x) == 1);
7880 0 : *loc = gen_rtx_REG (GET_MODE (x), REGNO (to));
7881 : }
7882 : }
7883 : }
7884 100 : return ret;
7885 100 : }
7886 :
7887 : /* Return mode for the memcpy/memset loop counter. Prefer SImode over
7888 : DImode for constant loop counts. */
7889 :
7890 : static machine_mode
7891 33570 : counter_mode (rtx count_exp)
7892 : {
7893 7376 : if (GET_MODE (count_exp) != VOIDmode)
7894 26767 : return GET_MODE (count_exp);
7895 6803 : if (!CONST_INT_P (count_exp))
7896 0 : return Pmode;
7897 : if (TARGET_64BIT && (INTVAL (count_exp) & ~0xffffffff))
7898 : return DImode;
7899 : return SImode;
7900 : }
7901 :
7902 : /* When ISSETMEM is FALSE, output simple loop to move memory pointer to SRCPTR
7903 : to DESTPTR via chunks of MODE unrolled UNROLL times, overall size is COUNT
7904 : specified in bytes. When ISSETMEM is TRUE, output the equivalent loop to set
7905 : memory by VALUE (supposed to be in MODE).
7906 :
7907 : The size is rounded down to whole number of chunk size moved at once.
7908 : SRCMEM and DESTMEM provide MEMrtx to feed proper aliasing info. */
7909 :
7910 :
7911 : static void
7912 18537 : expand_set_or_cpymem_via_loop (rtx destmem, rtx srcmem,
7913 : rtx destptr, rtx srcptr, rtx value,
7914 : rtx count, machine_mode mode, int unroll,
7915 : int expected_size, bool issetmem)
7916 : {
7917 18537 : rtx_code_label *out_label = nullptr;
7918 18537 : rtx_code_label *top_label = nullptr;
7919 18537 : rtx iter, tmp;
7920 18537 : machine_mode iter_mode = counter_mode (count);
     :   /* Bytes processed per loop iteration.  */
7921 18537 : int piece_size_n = GET_MODE_SIZE (mode) * unroll;
7922 18537 : rtx piece_size = GEN_INT (piece_size_n);
7923 37074 : rtx piece_size_mask = GEN_INT (~((GET_MODE_SIZE (mode) * unroll) - 1));
7924 18537 : rtx size;
7925 18537 : int i;
7926 18537 : int loop_count;
7927 :
     :   /* For a known constant COUNT, precompute how many times the loop
     :      body would execute; -1 means "unknown".  */
7928 18537 : if (expected_size != -1 && CONST_INT_P (count))
7929 6719 : loop_count = INTVAL (count) / GET_MODE_SIZE (mode) / unroll;
7930 : else
7931 : loop_count = -1;
7932 :
7933 : /* Don't generate the loop if the loop count is 1. */
7934 6719 : if (loop_count != 1)
7935 : {
7936 18465 : top_label = gen_label_rtx ();
7937 18465 : out_label = gen_label_rtx ();
7938 : }
     :   /* ITER is the byte offset, advanced by PIECE_SIZE each iteration.  */
7939 18537 : iter = gen_reg_rtx (iter_mode);
7940 :
     :   /* SIZE = COUNT rounded down to a whole number of pieces.  */
7941 18537 : size = expand_simple_binop (iter_mode, AND, count, piece_size_mask,
7942 : NULL, 1, OPTAB_DIRECT)
7943 : /* Those two should combine. */
7944 18537 : if (piece_size == const1_rtx)
7945 : {
     :       /* Single-byte pieces: SIZE may be zero, so jump around the whole
     :          loop in that case.  */
7946 4439 : emit_cmp_and_jump_insns (size, const0_rtx, EQ, NULL_RTX, iter_mode,
7947 : true, out_label);
7948 4439 : predict_jump (REG_BR_PROB_BASE * 10 / 100);
7949 : }
7950 18537 : emit_move_insn (iter, const0_rtx);
7951 :
7952 18537 : if (loop_count != 1)
7953 18465 : emit_label (top_label);
7954 :
7955 21298 : tmp = convert_modes (Pmode, iter_mode, iter, true);
7956 :
7957 : /* This assert could be relaxed - in this case we'll need to compute
7958 : smallest power of two, containing in PIECE_SIZE_N and pass it to
7959 : offset_address. */
7960 18537 : gcc_assert ((piece_size_n & (piece_size_n - 1)) == 0);
7961 18537 : destmem = offset_address (destmem, tmp, piece_size_n);
7962 18537 : destmem = adjust_address (destmem, mode, 0);
7963 :
7964 18537 : if (!issetmem)
7965 : {
7966 12112 : srcmem = offset_address (srcmem, copy_rtx (tmp), piece_size_n);
7967 12112 : srcmem = adjust_address (srcmem, mode, 0);
7968 :
7969 : /* When unrolling for chips that reorder memory reads and writes,
7970 : we can save registers by using single temporary.
7971 : Also using 4 temporaries is overkill in 32bit mode. */
7972 12112 : if (!TARGET_64BIT && 0)
7973 : {
7974 : for (i = 0; i < unroll; i++)
7975 : {
7976 : if (i)
7977 : {
7978 : destmem = adjust_address (copy_rtx (destmem), mode,
7979 : GET_MODE_SIZE (mode));
7980 : srcmem = adjust_address (copy_rtx (srcmem), mode,
7981 : GET_MODE_SIZE (mode));
7982 : }
7983 : emit_move_insn (destmem, srcmem);
7984 : }
7985 : }
7986 : else
7987 : {
     :           /* Load all UNROLL pieces into temporaries first, then store
     :              them, so loads and stores can be scheduled apart.  */
7988 12112 : rtx tmpreg[4];
7989 12112 : gcc_assert (unroll <= 4);
7990 49549 : for (i = 0; i < unroll; i++)
7991 : {
7992 37437 : tmpreg[i] = gen_reg_rtx (mode);
7993 37437 : if (i)
7994 50650 : srcmem = adjust_address (copy_rtx (srcmem), mode,
7995 : GET_MODE_SIZE (mode));
7996 37437 : emit_move_insn (tmpreg[i], srcmem);
7997 : }
7998 49549 : for (i = 0; i < unroll; i++)
7999 : {
8000 37437 : if (i)
8001 50650 : destmem = adjust_address (copy_rtx (destmem), mode,
8002 : GET_MODE_SIZE (mode));
8003 37437 : emit_move_insn (destmem, tmpreg[i]);
8004 : }
8005 : }
8006 : }
8007 : else
     :     /* memset variant: store VALUE into each of the UNROLL slots.  */
8008 29668 : for (i = 0; i < unroll; i++)
8009 : {
8010 23243 : if (i)
8011 33636 : destmem = adjust_address (copy_rtx (destmem), mode,
8012 : GET_MODE_SIZE (mode));
8013 23243 : emit_move_insn (destmem, value);
8014 : }
8015 :
     :   /* ITER += PIECE_SIZE; loop back while ITER < SIZE.  */
8016 18537 : tmp = expand_simple_binop (iter_mode, PLUS, iter, piece_size, iter,
8017 : true, OPTAB_LIB_WIDEN);
8018 18537 : if (tmp != iter)
8019 0 : emit_move_insn (iter, tmp);
8020 :
8021 18537 : if (loop_count != 1)
8022 : {
8023 18465 : emit_cmp_and_jump_insns (iter, size, LT, NULL_RTX, iter_mode,
8024 : true, top_label);
     :       /* Attach a branch probability derived from the expected trip
     :          count so later passes lay the loop out sensibly.  */
8025 18465 : if (expected_size != -1)
8026 : {
8027 9024 : expected_size /= GET_MODE_SIZE (mode) * unroll;
8028 9024 : if (expected_size == 0)
8029 1 : predict_jump (0);
8030 9023 : else if (expected_size > REG_BR_PROB_BASE)
8031 2 : predict_jump (REG_BR_PROB_BASE - 1);
8032 : else
8033 9021 : predict_jump (REG_BR_PROB_BASE
8034 9021 : - (REG_BR_PROB_BASE + expected_size / 2)
8035 9021 : / expected_size);
8036 : }
8037 : else
8038 9441 : predict_jump (REG_BR_PROB_BASE * 80 / 100);
8039 : }
     :   /* Finally advance DESTPTR (and SRCPTR) past the copied region so the
     :      callers can emit the epilogue from the updated pointers.  */
8040 18537 : iter = ix86_zero_extend_to_Pmode (iter);
8041 21298 : tmp = expand_simple_binop (Pmode, PLUS, destptr, iter, destptr,
8042 : true, OPTAB_LIB_WIDEN);
8043 18537 : if (tmp != destptr)
8044 0 : emit_move_insn (destptr, tmp);
8045 18537 : if (!issetmem)
8046 : {
8047 13449 : tmp = expand_simple_binop (Pmode, PLUS, srcptr, iter, srcptr,
8048 : true, OPTAB_LIB_WIDEN);
8049 12112 : if (tmp != srcptr)
8050 0 : emit_move_insn (srcptr, tmp);
8051 : }
8052 18537 : if (loop_count != 1)
8053 18465 : emit_label (out_label);
8054 18537 : }
8055 :
8056 : /* Divide COUNTREG by SCALE. */
8057 : static rtx
8058 14595 : scale_counter (rtx countreg, int scale)
8059 : {
8060 14595 : rtx sc;
8061 :
8062 14595 : if (scale == 1)
8063 : return countreg;
8064 9389 : if (CONST_INT_P (countreg))
8065 9373 : return GEN_INT (INTVAL (countreg) / scale);
8066 16 : gcc_assert (REG_P (countreg));
8067 :
8068 48 : sc = expand_simple_binop (GET_MODE (countreg), LSHIFTRT, countreg,
8069 32 : GEN_INT (exact_log2 (scale)),
8070 : NULL, 1, OPTAB_DIRECT);
8071 16 : return sc;
8072 : }
8073 :
8074 : /* Output "rep; mov" or "rep; stos" instruction depending on ISSETMEM argument.
8075 : When ISSETMEM is true, arguments SRCMEM and SRCPTR are ignored.
8076 : When ISSETMEM is false, arguments VALUE and ORIG_VALUE are ignored.
8077 : For setmem case, VALUE is a promoted to a wider size ORIG_VALUE.
8078 : ORIG_VALUE is the original value passed to memset to fill the memory with.
8079 : Other arguments have same meaning as for previous function. */
8080 :
8081 : static void
8082 14595 : expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
8083 : rtx destptr, rtx srcptr, rtx value, rtx orig_value,
8084 : rtx count,
8085 : machine_mode mode, bool issetmem)
8086 : {
8087 14595 : rtx destexp;
8088 14595 : rtx srcexp;
8089 14595 : rtx countreg;
8090 14595 : HOST_WIDE_INT rounded_count;
8091 :
8092 : /* If possible, it is shorter to use rep movs.
8093 : TODO: Maybe it is better to move this logic to decide_alg. */
8094 14595 : if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
8095 242 : && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
8096 238 : && (!issetmem || orig_value == const0_rtx))
8097 14595 : mode = SImode;
8098 :
     :   /* rep insns want a BLKmode MEM based directly on DESTPTR.  */
8099 14595 : if (destptr != XEXP (destmem, 0) || GET_MODE (destmem) != BLKmode)
8100 14335 : destmem = adjust_automodify_address_nv (destmem, BLKmode, destptr, 0);
8101 :
     :   /* COUNTREG = number of MODE-sized chunks, zero-extended to Pmode.  */
8102 29190 : countreg = ix86_zero_extend_to_Pmode (scale_counter (count,
8103 14595 : GET_MODE_SIZE (mode)));
     :   /* DESTEXP expresses the final destination pointer value
     :      (DESTPTR + COUNTREG * chunk size) for the rep insn pattern.  */
8104 14595 : if (mode != QImode)
8105 : {
8106 28427 : destexp = gen_rtx_ASHIFT (Pmode, countreg,
8107 : GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
8108 9649 : destexp = gen_rtx_PLUS (Pmode, destexp, destptr);
8109 : }
8110 : else
8111 5228 : destexp = gen_rtx_PLUS (Pmode, destptr, countreg);
     :   /* For a known count, shrink the recorded MEM size to the bytes the
     :      rep insn actually touches (a whole number of chunks); otherwise
     :      any previously recorded size is no longer trustworthy.  */
8112 14595 : if ((!issetmem || orig_value == const0_rtx) && CONST_INT_P (count))
8113 : {
8114 10064 : rounded_count
8115 10064 : = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
8116 10064 : destmem = shallow_copy_rtx (destmem);
8117 10064 : set_mem_size (destmem, rounded_count);
8118 : }
8119 4538 : else if (MEM_SIZE_KNOWN_P (destmem))
8120 333 : clear_mem_size (destmem);
8121 :
8122 14595 : if (issetmem)
8123 : {
     :       /* rep stos: the stored value lives in AX-sized MODE.  */
8124 5348 : value = force_reg (mode, gen_lowpart (mode, value));
8125 5348 : emit_insn (gen_rep_stos (destptr, countreg, destmem, value, destexp));
8126 : }
8127 : else
8128 : {
     :       /* rep mov: mirror the destination setup for the source side.  */
8129 9247 : if (srcptr != XEXP (srcmem, 0) || GET_MODE (srcmem) != BLKmode)
8130 9045 : srcmem = adjust_automodify_address_nv (srcmem, BLKmode, srcptr, 0);
8131 9247 : if (mode != QImode)
8132 : {
8133 16252 : srcexp = gen_rtx_ASHIFT (Pmode, countreg,
8134 : GEN_INT (exact_log2 (GET_MODE_SIZE (mode))));
8135 5540 : srcexp = gen_rtx_PLUS (Pmode, srcexp, srcptr);
8136 : }
8137 : else
8138 3909 : srcexp = gen_rtx_PLUS (Pmode, srcptr, countreg);
8139 9247 : if (CONST_INT_P (count))
8140 : {
8141 5855 : rounded_count
8142 5855 : = ROUND_DOWN (INTVAL (count), (HOST_WIDE_INT) GET_MODE_SIZE (mode));
8143 5855 : srcmem = shallow_copy_rtx (srcmem);
8144 5855 : set_mem_size (srcmem, rounded_count);
8145 : }
8146 : else
8147 : {
8148 3406 : if (MEM_SIZE_KNOWN_P (srcmem))
8149 0 : clear_mem_size (srcmem);
8150 : }
8151 9247 : emit_insn (gen_rep_mov (destptr, destmem, srcptr, srcmem, countreg,
8152 : destexp, srcexp));
8153 : }
8154 14595 : }
8155 :
8156 : /* This function emits moves to copy SIZE_TO_MOVE bytes from SRCMEM to
8157 : DESTMEM.
8158 : SRC is passed by pointer to be updated on return.
8159 : Return value is updated DST. */
8160 : static rtx
8161 13 : emit_memmov (rtx destmem, rtx *srcmem, rtx destptr, rtx srcptr,
8162 : HOST_WIDE_INT size_to_move)
8163 : {
8164 13 : rtx dst = destmem, src = *srcmem, tempreg;
8165 13 : enum insn_code code;
8166 13 : machine_mode move_mode;
8167 13 : int piece_size, i;
8168 :
8169 : /* Find the widest mode in which we could perform moves.
8170 : Start with the biggest power of 2 less than SIZE_TO_MOVE and half
8171 : it until move of such size is supported. */
8172 13 : piece_size = 1 << floor_log2 (size_to_move);
8173 26 : while (!int_mode_for_size (piece_size * BITS_PER_UNIT, 0).exists (&move_mode)
8174 26 : || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
8175 : {
8176 0 : gcc_assert (piece_size > 1);
8177 0 : piece_size >>= 1;
8178 : }
8179 :
8180 : /* Find the corresponding vector mode with the same size as MOVE_MODE.
8181 : MOVE_MODE is an integer mode at the moment (SI, DI, TI, etc.). */
8182 39 : if (GET_MODE_SIZE (move_mode) > GET_MODE_SIZE (word_mode))
8183 : {
8184 0 : int nunits = GET_MODE_SIZE (move_mode) / GET_MODE_SIZE (word_mode);
8185 0 : if (!mode_for_vector (word_mode, nunits).exists (&move_mode)
8186 0 : || (code = optab_handler (mov_optab, move_mode)) == CODE_FOR_nothing)
8187 : {
     :           /* No suitable vector mode: fall back to word-sized moves.  */
8188 0 : move_mode = word_mode;
8189 0 : piece_size = GET_MODE_SIZE (move_mode);
8190 0 : code = optab_handler (mov_optab, move_mode);
8191 : }
8192 : }
8193 13 : gcc_assert (code != CODE_FOR_nothing);
8194 :
8195 13 : dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
8196 13 : src = adjust_automodify_address_nv (src, move_mode, srcptr, 0);
8197 :
8198 : /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
8199 13 : gcc_assert (size_to_move % piece_size == 0);
8200 :
8201 26 : for (i = 0; i < size_to_move; i += piece_size)
8202 : {
8203 : /* We move from memory to memory, so we'll need to do it via
8204 : a temporary register. */
8205 13 : tempreg = gen_reg_rtx (move_mode);
8206 13 : emit_insn (GEN_FCN (code) (tempreg, src));
8207 13 : emit_insn (GEN_FCN (code) (dst, tempreg));
8208 :
     :       /* Bump both pointers past the piece just copied.  */
8209 26 : emit_move_insn (destptr,
8210 13 : plus_constant (Pmode, copy_rtx (destptr), piece_size));
8211 26 : emit_move_insn (srcptr,
8212 13 : plus_constant (Pmode, copy_rtx (srcptr), piece_size));
8213 :
8214 13 : dst = adjust_automodify_address_nv (dst, move_mode, destptr,
8215 : piece_size);
8216 13 : src = adjust_automodify_address_nv (src, move_mode, srcptr,
8217 : piece_size);
8218 : }
8219 :
8220 : /* Update DST and SRC rtx. */
8221 13 : *srcmem = src;
8222 13 : return dst;
8223 : }
8224 :
8225 : /* Helper function for the string operations below. Test VARIABLE whether
8226 : it is aligned to VALUE bytes. If true, jump to the label. */
8227 :
8228 : static rtx_code_label *
8229 35255 : ix86_expand_aligntest (rtx variable, int value, bool epilogue)
8230 : {
8231 35255 : rtx_code_label *label = gen_label_rtx ();
8232 35255 : rtx tmpcount = gen_reg_rtx (GET_MODE (variable));
8233 35255 : if (GET_MODE (variable) == DImode)
8234 897 : emit_insn (gen_anddi3 (tmpcount, variable, GEN_INT (value)));
8235 : else
8236 34358 : emit_insn (gen_andsi3 (tmpcount, variable, GEN_INT (value)));
8237 35255 : emit_cmp_and_jump_insns (tmpcount, const0_rtx, EQ, 0, GET_MODE (variable),
8238 : 1, label);
8239 35255 : if (epilogue)
8240 3 : predict_jump (REG_BR_PROB_BASE * 50 / 100);
8241 : else
8242 35252 : predict_jump (REG_BR_PROB_BASE * 90 / 100);
8243 35255 : return label;
8244 : }
8245 :
8246 :
8247 : /* Output code to copy at most count & (max_size - 1) bytes from SRC to DEST. */
8248 :
8249 : static void
8250 7818 : expand_cpymem_epilogue (rtx destmem, rtx srcmem,
8251 : rtx destptr, rtx srcptr, rtx count, int max_size)
8252 : {
8253 7818 : rtx src, dest;
     :   /* Known count: the epilogue size is COUNT mod MAX_SIZE; copy it with
     :      straight-line code via move_by_pieces.  */
8254 7818 : if (CONST_INT_P (count))
8255 : {
8256 5776 : unsigned HOST_WIDE_INT countval = UINTVAL (count);
8257 5776 : unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
8258 5776 : unsigned int destalign = MEM_ALIGN (destmem);
8259 5776 : cfun->machine->by_pieces_in_use = true;
8260 5776 : move_by_pieces (destmem, srcmem, epilogue_size, destalign,
8261 : RETURN_BEGIN);
8262 5776 : cfun->machine->by_pieces_in_use = false;
8263 5776 : return;
8264 : }
     :   /* Unknown count larger than 8: mask off the already-copied part and
     :      fall back to a byte loop.  */
8265 2042 : if (max_size > 8)
8266 : {
8267 2042 : count = expand_simple_binop (GET_MODE (count), AND, count, GEN_INT (max_size - 1),
8268 : count, 1, OPTAB_DIRECT);
8269 2042 : expand_set_or_cpymem_via_loop (destmem, srcmem, destptr, srcptr, NULL,
8270 : count, QImode, 1, 4, false);
8271 2042 : return;
8272 : }
8273 :
8274 : /* When there are stringops, we can cheaply increase dest and src pointers.
8275 : Otherwise we save code size by maintaining offset (zero is readily
8276 : available from preceding rep operation) and using x86 addressing modes.
8277 : */
8278 0 : if (TARGET_SINGLE_STRINGOP)
8279 : {
     :       /* Binary decomposition: test each remaining-size bit (4, 2, 1)
     :          and copy that many bytes with a movs-style insn when set.  */
8280 0 : if (max_size > 4)
8281 : {
8282 0 : rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
8283 0 : src = change_address (srcmem, SImode, srcptr);
8284 0 : dest = change_address (destmem, SImode, destptr);
8285 0 : emit_insn (gen_strmov (destptr, dest, srcptr, src));
8286 0 : emit_label (label);
8287 0 : LABEL_NUSES (label) = 1;
8288 : }
8289 0 : if (max_size > 2)
8290 : {
8291 0 : rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
8292 0 : src = change_address (srcmem, HImode, srcptr);
8293 0 : dest = change_address (destmem, HImode, destptr);
8294 0 : emit_insn (gen_strmov (destptr, dest, srcptr, src));
8295 0 : emit_label (label);
8296 0 : LABEL_NUSES (label) = 1;
8297 : }
8298 0 : if (max_size > 1)
8299 : {
8300 0 : rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
8301 0 : src = change_address (srcmem, QImode, srcptr);
8302 0 : dest = change_address (destmem, QImode, destptr);
8303 0 : emit_insn (gen_strmov (destptr, dest, srcptr, src));
8304 0 : emit_label (label);
8305 0 : LABEL_NUSES (label) = 1;
8306 : }
8307 : }
8308 : else
8309 : {
     :       /* Same binary decomposition, but keep a running OFFSET register
     :          instead of advancing the pointers themselves.  */
8310 0 : rtx offset = force_reg (Pmode, const0_rtx);
8311 0 : rtx tmp;
8312 :
8313 0 : if (max_size > 4)
8314 : {
8315 0 : rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
8316 0 : src = change_address (srcmem, SImode, srcptr);
8317 0 : dest = change_address (destmem, SImode, destptr);
8318 0 : emit_move_insn (dest, src);
8319 0 : tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (4), NULL,
8320 : true, OPTAB_LIB_WIDEN);
8321 0 : if (tmp != offset)
8322 0 : emit_move_insn (offset, tmp);
8323 0 : emit_label (label);
8324 0 : LABEL_NUSES (label) = 1;
8325 : }
8326 0 : if (max_size > 2)
8327 : {
8328 0 : rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
8329 0 : tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
8330 0 : src = change_address (srcmem, HImode, tmp);
8331 0 : tmp = gen_rtx_PLUS (Pmode, destptr, offset);
8332 0 : dest = change_address (destmem, HImode, tmp);
8333 0 : emit_move_insn (dest, src);
8334 0 : tmp = expand_simple_binop (Pmode, PLUS, offset, GEN_INT (2), tmp,
8335 : true, OPTAB_LIB_WIDEN);
8336 0 : if (tmp != offset)
8337 0 : emit_move_insn (offset, tmp);
8338 0 : emit_label (label);
8339 0 : LABEL_NUSES (label) = 1;
8340 : }
8341 0 : if (max_size > 1)
8342 : {
8343 0 : rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
8344 0 : tmp = gen_rtx_PLUS (Pmode, srcptr, offset);
8345 0 : src = change_address (srcmem, QImode, tmp);
8346 0 : tmp = gen_rtx_PLUS (Pmode, destptr, offset);
8347 0 : dest = change_address (destmem, QImode, tmp);
8348 0 : emit_move_insn (dest, src);
8349 0 : emit_label (label);
8350 0 : LABEL_NUSES (label) = 1;
8351 : }
8352 : }
8353 : }
8354 :
8355 : /* This function emits moves to fill SIZE_TO_MOVE bytes starting from DESTMEM
8356 : with value PROMOTED_VAL.
8357 : DESTPTR is advanced past the bytes stored.
8358 : Return value is updated DST. */
8359 : static rtx
8360 6 : emit_memset (rtx destmem, rtx destptr, rtx promoted_val,
8361 : HOST_WIDE_INT size_to_move)
8362 : {
8363 6 : rtx dst = destmem;
8364 6 : enum insn_code code;
8365 6 : machine_mode move_mode;
8366 6 : int piece_size, i;
8367 :
8368 : /* Find the widest mode in which we could perform moves.
8369 : Start with the biggest power of 2 less than SIZE_TO_MOVE and half
8370 : it until move of such size is supported. */
8371 6 : move_mode = GET_MODE (promoted_val);
8372 6 : if (move_mode == VOIDmode)
8373 0 : move_mode = QImode;
     :   /* If the promoted value is wider than what we need to store, narrow
     :      it to an integer mode of exactly SIZE_TO_MOVE bytes.  */
8374 12 : if (size_to_move < GET_MODE_SIZE (move_mode))
8375 : {
8376 5 : unsigned int move_bits = size_to_move * BITS_PER_UNIT;
8377 5 : move_mode = int_mode_for_size (move_bits, 0).require ();
8378 5 : promoted_val = gen_lowpart (move_mode, promoted_val);
8379 : }
8380 6 : piece_size = GET_MODE_SIZE (move_mode);
8381 6 : code = optab_handler (mov_optab, move_mode);
8382 6 : gcc_assert (code != CODE_FOR_nothing && promoted_val != NULL_RTX);
8383 :
8384 6 : dst = adjust_automodify_address_nv (dst, move_mode, destptr, 0);
8385 :
8386 : /* Emit moves. We'll need SIZE_TO_MOVE/PIECE_SIZES moves. */
8387 6 : gcc_assert (size_to_move % piece_size == 0);
8388 :
8389 12 : for (i = 0; i < size_to_move; i += piece_size)
8390 : {
     :       /* Word-sized (or smaller) pieces: the strset pattern advances
     :          DESTPTR itself as a side effect.  */
8391 12 : if (piece_size <= GET_MODE_SIZE (word_mode))
8392 : {
8393 4 : emit_insn (gen_strset (destptr, dst, promoted_val));
8394 4 : dst = adjust_automodify_address_nv (dst, move_mode, destptr,
8395 : piece_size);
8396 4 : continue;
8397 : }
8398 :
     :       /* Wider (vector) pieces: plain store, then bump DESTPTR by hand.  */
8399 2 : emit_insn (GEN_FCN (code) (dst, promoted_val));
8400 :
8401 4 : emit_move_insn (destptr,
8402 2 : plus_constant (Pmode, copy_rtx (destptr), piece_size));
8403 :
8404 2 : dst = adjust_automodify_address_nv (dst, move_mode, destptr,
8405 : piece_size);
8406 : }
8407 :
8408 : /* Update DST rtx. */
8409 6 : return dst;
8410 6 : }
8411 : /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
8412 : static void
8413 325 : expand_setmem_epilogue_via_loop (rtx destmem, rtx destptr, rtx value,
8414 : rtx count, int max_size)
8415 : {
8416 650 : count = expand_simple_binop (counter_mode (count), AND, count,
8417 325 : GEN_INT (max_size - 1), count, 1, OPTAB_DIRECT);
8418 325 : expand_set_or_cpymem_via_loop (destmem, NULL, destptr, NULL,
8419 325 : gen_lowpart (QImode, value), count, QImode,
8420 : 1, max_size / 2, true);
8421 325 : }
8422 :
8423 : /* Callback routine for store_by_pieces. Return the RTL of a register
8424 : containing GET_MODE_SIZE (MODE) bytes in the RTL register op_p which
8425 : is an integer or a word vector register. If PREV_P isn't nullptr,
8426 : it has the RTL info from the previous iteration. */
8427 :
8428 : static rtx
8429 4993 : setmem_epilogue_gen_val (void *op_p, void *prev_p, HOST_WIDE_INT,
8430 : fixed_size_mode mode)
8431 : {
8432 4993 : rtx target;
8433 4993 : by_pieces_prev *prev = (by_pieces_prev *) prev_p;
     :   /* Reuse the value produced by the previous iteration when its mode
     :      already matches (or is a compatible vector of the same element).  */
8434 4993 : if (prev)
8435 : {
8436 4993 : rtx prev_op = prev->data;
8437 4993 : if (prev_op)
8438 : {
8439 2890 : machine_mode prev_mode = GET_MODE (prev_op);
8440 2890 : if (prev_mode == mode)
8441 : return prev_op;
8442 54 : if (VECTOR_MODE_P (prev_mode)
8443 1097 : && VECTOR_MODE_P (mode)
8444 1151 : && GET_MODE_INNER (prev_mode) == GET_MODE_INNER (mode))
8445 : {
8446 0 : target = gen_rtx_SUBREG (mode, prev_op, 0);
8447 0 : return target;
8448 : }
8449 : }
8450 : }
8451 :
8452 3254 : rtx op = (rtx) op_p;
8453 3254 : machine_mode op_mode = GET_MODE (op);
8454 :
     :   /* A vector MODE is requested: rebuild OP as a QImode vector of the
     :      right width.  */
8455 3254 : if (VECTOR_MODE_P (mode))
8456 : {
8457 3678 : gcc_assert (GET_MODE_INNER (mode) == QImode);
8458 :
8459 1839 : unsigned int op_size = GET_MODE_SIZE (op_mode);
8460 1839 : unsigned int size = GET_MODE_SIZE (mode);
8461 1839 : unsigned int nunits;
8462 1839 : machine_mode vec_mode;
8463 1839 : if (op_size < size)
8464 : {
8465 : /* If OP size is smaller than MODE size, duplicate it. */
8466 1 : nunits = size / GET_MODE_SIZE (QImode);
8467 1 : vec_mode = mode_for_vector (QImode, nunits).require ();
8468 1 : nunits = size / op_size;
8469 1 : gcc_assert (SCALAR_INT_MODE_P (op_mode));
8470 1 : machine_mode dup_mode
8471 1 : = mode_for_vector (as_a <scalar_mode> (op_mode),
8472 2 : nunits).require ();
8473 1 : target = gen_reg_rtx (vec_mode);
8474 1 : op = gen_vec_duplicate (dup_mode, op);
8475 1 : rtx dup_op = gen_reg_rtx (dup_mode);
8476 1 : emit_move_insn (dup_op, op);
8477 1 : op = gen_rtx_SUBREG (vec_mode, dup_op, 0);
8478 1 : emit_move_insn (target, op);
8479 1 : return target;
8480 : }
     :       /* OP is at least as wide as MODE: view it as a QImode vector of
     :          OP's width, then narrow to MODE if needed.  */
8481 1838 : nunits = op_size / GET_MODE_SIZE (QImode);
8482 1838 : vec_mode = mode_for_vector (QImode, nunits).require ();
8483 1838 : target = gen_reg_rtx (vec_mode);
8484 1838 : op = gen_rtx_SUBREG (vec_mode, op, 0);
8485 1838 : emit_move_insn (target, op);
8486 1838 : if (op_size == size)
8487 : return target;
8488 :
8489 0 : rtx tmp = gen_reg_rtx (mode);
8490 0 : target = gen_rtx_SUBREG (mode, target, 0);
8491 0 : emit_move_insn (tmp, target);
8492 0 : return tmp;
8493 : }
8494 :
     :   /* An integer MODE is requested but OP is a word vector: extract the
     :      low word first.  */
8495 1415 : if (VECTOR_MODE_P (op_mode))
8496 : {
8497 2820 : gcc_assert (GET_MODE_INNER (op_mode) == word_mode);
8498 1410 : target = gen_reg_rtx (word_mode);
8499 1410 : op = gen_rtx_SUBREG (word_mode, op, 0);
8500 1410 : emit_move_insn (target, op);
8501 : }
8502 : else
8503 : target = op;
8504 :
8505 1415 : if (mode == GET_MODE (target))
8506 : return target;
8507 :
     :   /* Narrow the integer value to the requested MODE.  */
8508 241 : rtx tmp = gen_reg_rtx (mode);
8509 241 : target = gen_rtx_SUBREG (mode, target, 0);
8510 241 : emit_move_insn (tmp, target);
8511 241 : return tmp;
8512 : }
8513 :
8514 : /* Output code to set at most count & (max_size - 1) bytes starting by DEST. */
8515 : static void
8516 7344 : expand_setmem_epilogue (rtx destmem, rtx destptr, rtx value, rtx vec_value,
8517 : rtx count, int max_size)
8518 : {
8519 7344 : rtx dest;
8520 :
     :   /* Known count: set COUNT mod MAX_SIZE bytes with straight-line code,
     :      preferring the wide vector value when one is available.  */
8521 7344 : if (CONST_INT_P (count))
8522 : {
8523 7018 : unsigned HOST_WIDE_INT countval = UINTVAL (count);
8524 7018 : unsigned HOST_WIDE_INT epilogue_size = countval % max_size;
8525 7018 : unsigned int destalign = MEM_ALIGN (destmem);
8526 7018 : cfun->machine->by_pieces_in_use = true;
8527 11004 : store_by_pieces (destmem, epilogue_size, setmem_epilogue_gen_val,
8528 : vec_value ? vec_value : value, destalign, true,
8529 : RETURN_BEGIN);
8530 7018 : cfun->machine->by_pieces_in_use = false;
8531 7018 : return;
8532 : }
     :   /* Large unknown remainders go through a byte loop.  */
8533 326 : if (max_size > 32)
8534 : {
8535 325 : expand_setmem_epilogue_via_loop (destmem, destptr, value, count, max_size);
8536 325 : return;
8537 : }
     :   /* Otherwise, binary decomposition: test each remaining-size bit
     :      (16, 8, 4, 2, 1) of COUNT and store that many bytes when set.  */
8538 1 : if (max_size > 16)
8539 : {
8540 0 : rtx_code_label *label = ix86_expand_aligntest (count, 16, true);
8541 0 : if (TARGET_64BIT)
8542 : {
8543 0 : dest = change_address (destmem, DImode, destptr);
8544 0 : emit_insn (gen_strset (destptr, dest, value));
8545 0 : dest = adjust_automodify_address_nv (dest, DImode, destptr, 8);
8546 0 : emit_insn (gen_strset (destptr, dest, value));
8547 : }
8548 : else
8549 : {
8550 0 : dest = change_address (destmem, SImode, destptr);
8551 0 : emit_insn (gen_strset (destptr, dest, value));
8552 0 : dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
8553 0 : emit_insn (gen_strset (destptr, dest, value));
8554 0 : dest = adjust_automodify_address_nv (dest, SImode, destptr, 8);
8555 0 : emit_insn (gen_strset (destptr, dest, value));
8556 0 : dest = adjust_automodify_address_nv (dest, SImode, destptr, 12);
8557 0 : emit_insn (gen_strset (destptr, dest, value));
8558 : }
8559 0 : emit_label (label);
8560 0 : LABEL_NUSES (label) = 1;
8561 : }
8562 1 : if (max_size > 8)
8563 : {
8564 0 : rtx_code_label *label = ix86_expand_aligntest (count, 8, true);
8565 0 : if (TARGET_64BIT)
8566 : {
8567 0 : dest = change_address (destmem, DImode, destptr);
8568 0 : emit_insn (gen_strset (destptr, dest, value));
8569 : }
8570 : else
8571 : {
8572 0 : dest = change_address (destmem, SImode, destptr);
8573 0 : emit_insn (gen_strset (destptr, dest, value));
8574 0 : dest = adjust_automodify_address_nv (dest, SImode, destptr, 4);
8575 0 : emit_insn (gen_strset (destptr, dest, value));
8576 : }
8577 0 : emit_label (label);
8578 0 : LABEL_NUSES (label) = 1;
8579 : }
8580 1 : if (max_size > 4)
8581 : {
8582 1 : rtx_code_label *label = ix86_expand_aligntest (count, 4, true);
8583 1 : dest = change_address (destmem, SImode, destptr);
8584 1 : emit_insn (gen_strset (destptr, dest, gen_lowpart (SImode, value)));
8585 1 : emit_label (label);
8586 1 : LABEL_NUSES (label) = 1;
8587 : }
8588 1 : if (max_size > 2)
8589 : {
8590 1 : rtx_code_label *label = ix86_expand_aligntest (count, 2, true);
8591 1 : dest = change_address (destmem, HImode, destptr);
8592 1 : emit_insn (gen_strset (destptr, dest, gen_lowpart (HImode, value)));
8593 1 : emit_label (label);
8594 1 : LABEL_NUSES (label) = 1;
8595 : }
8596 1 : if (max_size > 1)
8597 : {
8598 1 : rtx_code_label *label = ix86_expand_aligntest (count, 1, true);
8599 1 : dest = change_address (destmem, QImode, destptr);
8600 1 : emit_insn (gen_strset (destptr, dest, gen_lowpart (QImode, value)));
8601 1 : emit_label (label);
8602 1 : LABEL_NUSES (label) = 1;
8603 : }
8604 : }
8605 :
8606 : /* Adjust COUNTER by the VALUE. */
8607 : static void
8608 19 : ix86_adjust_counter (rtx countreg, HOST_WIDE_INT value)
8609 : {
8610 19 : emit_insn (gen_add2_insn (countreg, GEN_INT (-value)));
8611 19 : }
8612 :
8613 : /* Depending on ISSETMEM, copy enough from SRCMEM to DESTMEM or set enough to
8614 : DESTMEM to align it to DESIRED_ALIGNMENT. Original alignment is ALIGN.
8615 : Depending on ISSETMEM, either arguments SRCMEM/SRCPTR or VALUE/VEC_VALUE are
8616 : ignored.
8617 : Return value is updated DESTMEM. */
8618 :
8619 : static rtx
8620 7 : expand_set_or_cpymem_prologue (rtx destmem, rtx srcmem,
8621 : rtx destptr, rtx srcptr, rtx value,
8622 : rtx vec_value, rtx count, int align,
8623 : int desired_alignment, bool issetmem)
8624 : {
8625 7 : int i;
8626 35 : for (i = 1; i < desired_alignment; i <<= 1)
8627 : {
8628 28 : if (align <= i)
8629 : {
8630 19 : rtx_code_label *label = ix86_expand_aligntest (destptr, i, false);
8631 19 : if (issetmem)
8632 : {
8633 12 : if (vec_value && i > GET_MODE_SIZE (GET_MODE (value)))
8634 2 : destmem = emit_memset (destmem, destptr, vec_value, i);
8635 : else
8636 4 : destmem = emit_memset (destmem, destptr, value, i);
8637 : }
8638 : else
8639 13 : destmem = emit_memmov (destmem, &srcmem, destptr, srcptr, i);
8640 19 : ix86_adjust_counter (count, i);
8641 19 : emit_label (label);
8642 19 : LABEL_NUSES (label) = 1;
8643 19 : set_mem_align (destmem, i * 2 * BITS_PER_UNIT);
8644 : }
8645 : }
8646 7 : return destmem;
8647 : }
8648 :
/* Test if COUNT&SIZE is nonzero and if so, expand cpymem
   or setmem sequence that is valid for SIZE..2*SIZE-1 bytes
   and jump to DONE_LABEL.

   The sequence writes the first SIZE bytes and the last SIZE bytes of
   the block; for a COUNT in SIZE..2*SIZE-1 the two (possibly
   overlapping) stores cover the whole block.  For memset (ISSETMEM)
   SRCMEM/SRCPTR are ignored; otherwise VALUE/VEC_VALUE are ignored.  */
static void
expand_small_cpymem_or_setmem (rtx destmem, rtx srcmem,
			       rtx destptr, rtx srcptr,
			       rtx value, rtx vec_value,
			       rtx count, int size,
			       rtx done_label, bool issetmem)
{
  /* Skip the whole sequence unless COUNT has the SIZE bit set.  */
  rtx_code_label *label = ix86_expand_aligntest (count, size, false);
  machine_mode mode = int_mode_for_size (size * BITS_PER_UNIT, 1).else_blk ();
  rtx modesize;
  rtx scalar_value = value;
  int n;

  /* If we do not have vector value to copy, we must reduce size.  */
  if (issetmem)
    {
      if (!vec_value)
	{
	  if (GET_MODE (value) == VOIDmode && size > 8)
	    mode = Pmode;
	  else if (GET_MODE_SIZE (mode) > GET_MODE_SIZE (GET_MODE (value)))
	    mode = GET_MODE (value);
	}
      else
	mode = GET_MODE (vec_value), value = vec_value;
    }
  else
    {
      /* Choose appropriate vector mode.  */
      if (size >= 32)
	switch (MOVE_MAX)
	  {
	  case 64:
	    if (size >= 64)
	      {
		mode = V64QImode;
		break;
	      }
	    /* FALLTHRU */
	  case 32:
	    mode = V32QImode;
	    break;
	  case 16:
	    mode = V16QImode;
	    break;
	  case 8:
	    mode = DImode;
	    break;
	  default:
	    gcc_unreachable ();
	  }
      else if (size >= 16)
	mode = TARGET_SSE ? V16QImode : DImode;
      srcmem = change_address (srcmem, mode, srcptr);
    }
  if (issetmem && vec_value && GET_MODE_SIZE (mode) > size)
    {
      /* For memset with vector and the size is smaller than the vector
	 size, first try the narrower vector, otherwise, use the
	 original value.  */
      machine_mode inner_mode = GET_MODE_INNER (mode);
      unsigned int nunits = size / GET_MODE_SIZE (inner_mode);
      if (nunits > 1)
	{
	  mode = mode_for_vector (GET_MODE_INNER (mode),
				  nunits).require ();
	  value = gen_rtx_SUBREG (mode, value, 0);
	}
      else
	{
	  /* SIZE is smaller than one vector element; fall back to the
	     scalar integer mode of exactly SIZE bytes.  */
	  scalar_int_mode smode
	    = smallest_int_mode_for_size (size * BITS_PER_UNIT).require ();
	  gcc_assert (GET_MODE_SIZE (GET_MODE (scalar_value))
		      >= GET_MODE_SIZE (smode));
	  mode = smode;
	  if (GET_MODE (scalar_value) == mode)
	    value = scalar_value;
	  else
	    value = gen_rtx_SUBREG (mode, scalar_value, 0);
	}
    }
  destmem = change_address (destmem, mode, destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  gcc_assert (GET_MODE_SIZE (mode) <= size);
  /* First pass: copy/fill the first SIZE bytes, one MODE-sized piece
     at a time.  */
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }

  /* Advance the addresses by COUNT - 2 * SIZE so the second pass
     writes the last SIZE bytes of the block.  */
  destmem = offset_address (destmem, count, 1);
  destmem = offset_address (destmem, GEN_INT (-2 * size),
			    GET_MODE_SIZE (mode));
  if (!issetmem)
    {
      srcmem = offset_address (srcmem, count, 1);
      srcmem = offset_address (srcmem, GEN_INT (-2 * size),
			       GET_MODE_SIZE (mode));
    }
  /* Second pass: copy/fill the last SIZE bytes; this overlaps the
     first pass when COUNT < 2 * SIZE.  */
  for (n = 0; n * GET_MODE_SIZE (mode) < size; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (mode, value));
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
    }
  emit_jump_insn (gen_jump (done_label));
  emit_barrier ();

  emit_label (label);
  LABEL_NUSES (label) = 1;
}
8774 :
/* Handle small memcpy (up to SIZE, which is supposed to be a small power of 2)
   and get ready for the main memcpy loop by copying initial DESIRED_ALIGN-ALIGN
   bytes and last SIZE bytes, adjusting DESTPTR/SRCPTR/COUNT in a way we can
   proceed with a loop copying SIZE bytes at once.  Do moves in MODE.
8779 : DONE_LABEL is a label after the whole copying sequence. The label is created
8780 : on demand if *DONE_LABEL is NULL.
8781 : MIN_SIZE is minimal size of block copied. This value gets adjusted for new
8782 : bounds after the initial copies.
8783 :
8784 : DESTMEM/SRCMEM are memory expressions pointing to the copies block,
8785 : DESTPTR/SRCPTR are pointers to the block. DYNAMIC_CHECK indicate whether
8786 : we will dispatch to a library call for large blocks.
8787 :
8788 : In pseudocode we do:
8789 :
8790 : if (COUNT < SIZE)
8791 : {
8792 : Assume that SIZE is 4. Bigger sizes are handled analogously
8793 : if (COUNT & 4)
8794 : {
8795 : copy 4 bytes from SRCPTR to DESTPTR
8796 : copy 4 bytes from SRCPTR + COUNT - 4 to DESTPTR + COUNT - 4
8797 : goto done_label
8798 : }
8799 : if (!COUNT)
8800 : goto done_label;
8801 : copy 1 byte from SRCPTR to DESTPTR
8802 : if (COUNT & 2)
8803 : {
8804 : copy 2 bytes from SRCPTR to DESTPTR
8805 : copy 2 bytes from SRCPTR + COUNT - 2 to DESTPTR + COUNT - 2
8806 : }
8807 : }
8808 : else
8809 : {
8810 : copy at least DESIRED_ALIGN-ALIGN bytes from SRCPTR to DESTPTR
8811 : copy SIZE bytes from SRCPTR + COUNT - SIZE to DESTPTR + COUNT -SIZE
8812 :
       OLD_DESTPTR = DESTPTR;
8814 : Align DESTPTR up to DESIRED_ALIGN
8815 : SRCPTR += DESTPTR - OLD_DESTPTR
8816 : COUNT -= DEST_PTR - OLD_DESTPTR
8817 : if (DYNAMIC_CHECK)
8818 : Round COUNT down to multiple of SIZE
8819 : << optional caller supplied zero size guard is here >>
8820 : << optional caller supplied dynamic check is here >>
8821 : << caller supplied main copy loop is here >>
8822 : }
8823 : done_label:
8824 : */
static void
expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves (rtx destmem, rtx srcmem,
							    rtx *destptr, rtx *srcptr,
							    machine_mode mode,
							    rtx value, rtx vec_value,
							    rtx *count,
							    rtx_code_label **done_label,
							    int size,
							    int desired_align,
							    int align,
							    unsigned HOST_WIDE_INT *min_size,
							    bool dynamic_check,
							    bool issetmem)
{
  rtx_code_label *loop_label = NULL, *label;
  int n;
  rtx modesize;
  int prolog_size = 0;
  rtx mode_value;

  /* Choose proper value to copy: the vector broadcast for a vector
     MODE, otherwise the promoted scalar.  */
  if (issetmem && VECTOR_MODE_P (mode))
    mode_value = vec_value;
  else
    mode_value = value;
  gcc_assert (GET_MODE_SIZE (mode) <= size);

  /* See if block is big or small, handle small blocks.  */
  if (!CONST_INT_P (*count) && *min_size < (unsigned HOST_WIDE_INT)size)
    {
      int size2 = size;
      loop_label = gen_label_rtx ();

      if (!*done_label)
	*done_label = gen_label_rtx ();

      /* Blocks of at least SIZE bytes go to the main copy path below.  */
      emit_cmp_and_jump_insns (*count, GEN_INT (size2), GE, 0, GET_MODE (*count),
			       1, loop_label);
      size2 >>= 1;

      /* Handle sizes > 3: each call covers SIZE2..2*SIZE2-1 bytes and
	 jumps to *DONE_LABEL when it applies.  */
      for (;size2 > 2; size2 >>= 1)
	expand_small_cpymem_or_setmem (destmem, srcmem,
				       *destptr, *srcptr,
				       value, vec_value,
				       *count,
				       size2, *done_label, issetmem);
      /* Nothing to copy?  Jump to DONE_LABEL if so */
      emit_cmp_and_jump_insns (*count, const0_rtx, EQ, 0, GET_MODE (*count),
			       1, *done_label);

      /* Do a byte copy.  */
      destmem = change_address (destmem, QImode, *destptr);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (QImode, value));
      else
	{
	  srcmem = change_address (srcmem, QImode, *srcptr);
	  emit_move_insn (destmem, srcmem);
	}

      /* Handle sizes 2 and 3: a HImode store of the last two bytes,
	 emitted only when bit 1 of COUNT is set.  */
      label = ix86_expand_aligntest (*count, 2, false);
      destmem = change_address (destmem, HImode, *destptr);
      destmem = offset_address (destmem, *count, 1);
      destmem = offset_address (destmem, GEN_INT (-2), 2);
      if (issetmem)
	emit_move_insn (destmem, gen_lowpart (HImode, value));
      else
	{
	  srcmem = change_address (srcmem, HImode, *srcptr);
	  srcmem = offset_address (srcmem, *count, 1);
	  srcmem = offset_address (srcmem, GEN_INT (-2), 2);
	  emit_move_insn (destmem, srcmem);
	}

      emit_label (label);
      LABEL_NUSES (label) = 1;
      emit_jump_insn (gen_jump (*done_label));
      emit_barrier ();
    }
  else
    gcc_assert (*min_size >= (unsigned HOST_WIDE_INT)size
		|| UINTVAL (*count) >= (unsigned HOST_WIDE_INT)size);

  /* Start memcpy for COUNT >= SIZE.  */
  if (loop_label)
    {
       emit_label (loop_label);
       LABEL_NUSES (loop_label) = 1;
    }

  /* Copy first desired_align bytes.  */
  if (!issetmem)
    srcmem = change_address (srcmem, mode, *srcptr);
  destmem = change_address (destmem, mode, *destptr);
  modesize = GEN_INT (GET_MODE_SIZE (mode));
  for (n = 0; prolog_size < desired_align - align; n++)
    {
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  emit_move_insn (destmem, srcmem);
	  srcmem = offset_address (srcmem, modesize, GET_MODE_SIZE (mode));
	}
      destmem = offset_address (destmem, modesize, GET_MODE_SIZE (mode));
      prolog_size += GET_MODE_SIZE (mode);
    }


  /* Copy last SIZE bytes.  */
  destmem = offset_address (destmem, *count, 1);
  destmem = offset_address (destmem,
			    GEN_INT (-size - prolog_size),
			    1);
  if (issetmem)
    emit_move_insn (destmem, mode_value);
  else
    {
      srcmem = offset_address (srcmem, *count, 1);
      srcmem = offset_address (srcmem,
			       GEN_INT (-size - prolog_size),
			       1);
      emit_move_insn (destmem, srcmem);
    }
  for (n = 1; n * GET_MODE_SIZE (mode) < size; n++)
    {
      destmem = offset_address (destmem, modesize, 1);
      if (issetmem)
	emit_move_insn (destmem, mode_value);
      else
	{
	  srcmem = offset_address (srcmem, modesize, 1);
	  emit_move_insn (destmem, srcmem);
	}
    }

  /* Align destination.  */
  if (desired_align > 1 && desired_align > align)
    {
      rtx saveddest = *destptr;

      gcc_assert (desired_align <= size);
      /* Align destptr up, place it to new register.  */
      *destptr = expand_simple_binop (GET_MODE (*destptr), PLUS, *destptr,
				      GEN_INT (prolog_size),
				      NULL_RTX, 1, OPTAB_DIRECT);
      if (REG_P (*destptr) && REG_P (saveddest) && REG_POINTER (saveddest))
	REG_POINTER (*destptr) = 1;
      *destptr = expand_simple_binop (GET_MODE (*destptr), AND, *destptr,
				      GEN_INT (-desired_align),
				      *destptr, 1, OPTAB_DIRECT);
      /* See how many bytes we skipped.  SAVEDDEST becomes
	 old - new destptr, i.e. minus the number of bytes skipped.  */
      saveddest = expand_simple_binop (GET_MODE (*destptr), MINUS, saveddest,
				       *destptr,
				       NULL_RTX, 1, OPTAB_DIRECT);
      /* Adjust srcptr and count.  */
      if (!issetmem)
	*srcptr = expand_simple_binop (GET_MODE (*srcptr), MINUS, *srcptr,
				       saveddest, *srcptr, 1, OPTAB_DIRECT);
      *count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				    saveddest, *count, 1, OPTAB_DIRECT);
      /* We copied at most size + prolog_size.  */
      if (*min_size > (unsigned HOST_WIDE_INT)(size + prolog_size))
	*min_size
	  = ROUND_DOWN (*min_size - size, (unsigned HOST_WIDE_INT)size);
      else
	*min_size = 0;

      /* Our loops always round down the block size, but for dispatch to
         library we need precise value.  */
      if (dynamic_check)
	*count = expand_simple_binop (GET_MODE (*count), AND, *count,
				      GEN_INT (-size), *count, 1, OPTAB_DIRECT);
    }
  else
    {
      gcc_assert (prolog_size == 0);
      /* Decrease count, so we won't end up copying last word twice.  */
      if (!CONST_INT_P (*count))
	*count = expand_simple_binop (GET_MODE (*count), PLUS, *count,
				      constm1_rtx, *count, 1, OPTAB_DIRECT);
      else
	*count = GEN_INT (ROUND_DOWN (UINTVAL (*count) - 1,
				      (unsigned HOST_WIDE_INT)size));
      if (*min_size)
	*min_size = ROUND_DOWN (*min_size - 1, (unsigned HOST_WIDE_INT)size);
    }
}
9015 :
9016 :
/* This function is like the previous one, except here we know how many bytes
   need to be copied.  That allows us to update alignment not only of DST, which
   is returned, but also of SRC, which is passed as a pointer for that
   reason.  */
static rtx
expand_set_or_cpymem_constant_prologue (rtx dst, rtx *srcp, rtx destreg,
					rtx srcreg, rtx value, rtx vec_value,
					int desired_align, int align_bytes,
					bool issetmem)
{
  rtx src = NULL;
  rtx orig_dst = dst;
  rtx orig_src = NULL;
  int piece_size = 1;
  int copied_bytes = 0;

  if (!issetmem)
    {
      gcc_assert (srcp != NULL);
      src = *srcp;
      orig_src = src;
    }

  /* Emit one copy/set per bit set in ALIGN_BYTES, smallest
     power-of-two piece first, until ALIGN_BYTES bytes are covered.  */
  for (piece_size = 1;
       piece_size <= desired_align && copied_bytes < align_bytes;
       piece_size <<= 1)
    {
      if (align_bytes & piece_size)
	{
	  if (issetmem)
	    {
	      /* Use the vector fill value when the piece is wider than
		 the scalar VALUE.  */
	      if (vec_value && piece_size > GET_MODE_SIZE (GET_MODE (value)))
		dst = emit_memset (dst, destreg, vec_value, piece_size);
	      else
		dst = emit_memset (dst, destreg, value, piece_size);
	    }
	  else
	    dst = emit_memmov (dst, &src, destreg, srcreg, piece_size);
	  copied_bytes += piece_size;
	}
    }
  /* The destination is now aligned; record the new alignment and the
     reduced known size.  */
  if (MEM_ALIGN (dst) < (unsigned int) desired_align * BITS_PER_UNIT)
    set_mem_align (dst, desired_align * BITS_PER_UNIT);
  if (MEM_SIZE_KNOWN_P (orig_dst))
    set_mem_size (dst, MEM_SIZE (orig_dst) - align_bytes);

  if (!issetmem)
    {
      /* Work out what alignment the source gains after ALIGN_BYTES
	 bytes, given its known misalignment offset (negative means
	 unknown).  */
      int src_align_bytes = get_mem_align_offset (src, desired_align
						       * BITS_PER_UNIT);
      if (src_align_bytes >= 0)
	src_align_bytes = desired_align - src_align_bytes;
      if (src_align_bytes >= 0)
	{
	  unsigned int src_align;
	  /* Find the largest power of two modulo which SRC's offset
	     agrees with ALIGN_BYTES.  */
	  for (src_align = desired_align; src_align >= 2; src_align >>= 1)
	    {
	      if ((src_align_bytes & (src_align - 1))
		   == (align_bytes & (src_align - 1)))
		break;
	    }
	  if (src_align > (unsigned int) desired_align)
	    src_align = desired_align;
	  if (MEM_ALIGN (src) < src_align * BITS_PER_UNIT)
	    set_mem_align (src, src_align * BITS_PER_UNIT);
	}
      if (MEM_SIZE_KNOWN_P (orig_src))
	set_mem_size (src, MEM_SIZE (orig_src) - align_bytes);
      *srcp = src;
    }

  return dst;
}
9090 :
9091 : /* Return true if ALG can be used in current context.
9092 : Assume we expand memset if MEMSET is true. */
9093 : static bool
9094 829558 : alg_usable_p (enum stringop_alg alg, bool memset,
9095 : addr_space_t dst_as, addr_space_t src_as)
9096 : {
9097 829558 : if (alg == no_stringop)
9098 : return false;
9099 : /* It is not possible to use a library call if we have non-default
9100 : address space. We can do better than the generic byte-at-a-time
9101 : loop, used as a fallback. */
9102 829558 : if (alg == libcall &&
9103 465386 : !(ADDR_SPACE_GENERIC_P (dst_as) && ADDR_SPACE_GENERIC_P (src_as)))
9104 : return false;
9105 829551 : if (alg == vector_loop)
9106 368319 : return TARGET_SSE || TARGET_AVX;
9107 : /* Algorithms using the rep prefix want at least edi and ecx;
9108 : additionally, memset wants eax and memcpy wants esi. Don't
9109 : consider such algorithms if the user has appropriated those
9110 : registers for their own purposes, or if we have the destination
9111 : in the non-default address space, since string insns cannot
9112 : override the destination segment. */
9113 645360 : if (alg == rep_prefix_1_byte
9114 : || alg == rep_prefix_4_byte
9115 645360 : || alg == rep_prefix_8_byte)
9116 : {
9117 31012 : if (fixed_regs[CX_REG]
9118 31008 : || fixed_regs[DI_REG]
9119 31004 : || (memset ? fixed_regs[AX_REG] : fixed_regs[SI_REG])
9120 31000 : || !ADDR_SPACE_GENERIC_P (dst_as)
9121 62012 : || !(ADDR_SPACE_GENERIC_P (src_as) || Pmode == word_mode))
9122 12 : return false;
9123 : }
9124 : return true;
9125 : }
9126 :
/* Given COUNT and EXPECTED_SIZE, decide on codegen of string operation.
   EXPECTED_SIZE is -1 when the typical size is unknown; MIN_SIZE and
   MAX_SIZE bound the possible block sizes.  MEMSET selects the memset
   cost tables (vs. memcpy); ZERO_MEMSET says the fill value is known
   zero.  DST_AS/SRC_AS are the operand address spaces.  On return
   *DYNAMIC_CHECK is -1 or a size bound used for a runtime dispatch to a
   library call, and *NOALIGN says the chosen algorithm does not require
   aligning the destination.  RECUR is true in the self-recursive call
   made by the inline-all heuristic below.  */
static enum stringop_alg
decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
	    unsigned HOST_WIDE_INT min_size, unsigned HOST_WIDE_INT max_size,
	    bool memset, bool zero_memset, addr_space_t dst_as,
	    addr_space_t src_as, int *dynamic_check, bool *noalign, bool recur)
{
  const struct stringop_algs *algs;
  bool optimize_for_speed;
  int max = 0;
  const struct processor_costs *cost;
  int i;
  bool any_alg_usable_p = false;

  *noalign = false;
  *dynamic_check = -1;

  /* Even if the string operation call is cold, we still might spend a lot
     of time processing large blocks.  */
  if (optimize_function_for_size_p (cfun)
      || (optimize_insn_for_size_p ()
 	  && (max_size < 256
              || (expected_size != -1 && expected_size < 256))))
    optimize_for_speed = false;
  else
    optimize_for_speed = true;

  cost = optimize_for_speed ? ix86_cost : &ix86_size_cost;
  if (memset)
    algs = &cost->memset[TARGET_64BIT != 0];
  else
    algs = &cost->memcpy[TARGET_64BIT != 0];

  /* See maximal size for user defined algorithm.  */
  for (i = 0; i < MAX_STRINGOP_ALGS; i++)
    {
      enum stringop_alg candidate = algs->size[i].alg;
      bool usable = alg_usable_p (candidate, memset, dst_as, src_as);
      any_alg_usable_p |= usable;

      if (candidate != libcall && candidate && usable)
	max = algs->size[i].max;
    }

  /* If expected size is not known but max size is small enough
     so inline version is a win, set expected size into
     the range.  */
  if (((max > 1 && (unsigned HOST_WIDE_INT) max >= max_size) || max == -1)
      && expected_size == -1)
    expected_size = min_size / 2 + max_size / 2;

  /* If user specified the algorithm, honor it if possible.  */
  if (ix86_stringop_alg != no_stringop
      && alg_usable_p (ix86_stringop_alg, memset, dst_as, src_as))
    return ix86_stringop_alg;
  /* rep; movq or rep; movl is the smallest variant.  */
  else if (!optimize_for_speed)
    {
      *noalign = true;
      /* Byte variant when the count is unknown, not a multiple of 4, or
	 a memset of a non-constant-zero value.  */
      if (!count || (count & 3) || (memset && !zero_memset))
	return alg_usable_p (rep_prefix_1_byte, memset, dst_as, src_as)
	       ? rep_prefix_1_byte : loop_1_byte;
      else
	return alg_usable_p (rep_prefix_4_byte, memset, dst_as, src_as)
	       ? rep_prefix_4_byte : loop;
    }
  /* Very tiny blocks are best handled via the loop, REP is expensive to
     setup.  */
  else if (expected_size != -1 && expected_size < 4)
    return loop_1_byte;
  else if (expected_size != -1)
    {
      enum stringop_alg alg = libcall;
      bool alg_noalign = false;
      /* Scan the cost table for the first entry covering EXPECTED_SIZE.  */
      for (i = 0; i < MAX_STRINGOP_ALGS; i++)
	{
	  /* We get here if the algorithms that were not libcall-based
	     were rep-prefix based and we are unable to use rep prefixes
	     based on global register usage.  Break out of the loop and
	     use the heuristic below.  */
	  if (algs->size[i].max == 0)
	    break;
	  if (algs->size[i].max >= expected_size || algs->size[i].max == -1)
	    {
	      enum stringop_alg candidate = algs->size[i].alg;

	      if (candidate != libcall
		  && alg_usable_p (candidate, memset, dst_as, src_as))
		{
		  alg = candidate;
		  alg_noalign = algs->size[i].noalign;
		}
	      /* Honor TARGET_INLINE_ALL_STRINGOPS by picking
		 last non-libcall inline algorithm.  */
	      if (TARGET_INLINE_ALL_STRINGOPS)
		{
		  /* When the current size is best to be copied by a libcall,
		     but we are still forced to inline, run the heuristic below
		     that will pick code for medium sized blocks.  */
		  if (alg != libcall)
		    {
		      *noalign = alg_noalign;
		      return alg;
		    }
		  else if (!any_alg_usable_p)
		    break;
		}
	      else if (alg_usable_p (candidate, memset, dst_as, src_as)
		       && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
			    && candidate == rep_prefix_1_byte
			    /* NB: If min_size != max_size, size is
			       unknown.  */
			    && min_size != max_size))
		{
		  *noalign = algs->size[i].noalign;
		  return candidate;
		}
	    }
	}
    }
  /* When asked to inline the call anyway, try to pick meaningful choice.
     We look for maximal size of block that is faster to copy by hand and
     take blocks of at most of that size guessing that average size will
     be roughly half of the block.

     If this turns out to be bad, we might simply specify the preferred
     choice in ix86_costs.  */
  if ((TARGET_INLINE_ALL_STRINGOPS || TARGET_INLINE_STRINGOPS_DYNAMICALLY)
      && (algs->unknown_size == libcall
	  || !alg_usable_p (algs->unknown_size, memset, dst_as, src_as)))
    {
      enum stringop_alg alg;
      HOST_WIDE_INT new_expected_size = (max > 0 ? max : 4096) / 2;

      /* If there aren't any usable algorithms or if recursing already,
	 then recursing on smaller sizes or same size isn't going to
	 find anything.  Just return the simple byte-at-a-time copy loop.  */
      if (!any_alg_usable_p || recur)
	{
	  /* Pick something reasonable.  */
	  if (TARGET_INLINE_STRINGOPS_DYNAMICALLY && !recur)
	    *dynamic_check = 128;
	  return loop_1_byte;
	}
      /* Retry with the guessed average size; RECUR set stops a second
	 level of recursion.  */
      alg = decide_alg (count, new_expected_size, min_size, max_size,
			memset, zero_memset, dst_as, src_as,
			dynamic_check, noalign, true);
      gcc_assert (*dynamic_check == -1);
      if (TARGET_INLINE_STRINGOPS_DYNAMICALLY)
	*dynamic_check = max;
      else
	gcc_assert (alg != libcall);
      return alg;
    }

  /* Try to use some reasonable fallback algorithm.  Note that for
     non-default address spaces we default to a loop instead of
     a libcall.  */

  bool have_as = !(ADDR_SPACE_GENERIC_P (dst_as)
		   && ADDR_SPACE_GENERIC_P (src_as));

  return (alg_usable_p (algs->unknown_size, memset, dst_as, src_as)
	  ? algs->unknown_size : have_as ? loop : libcall);
}
9292 :
9293 : /* Decide on alignment. We know that the operand is already aligned to ALIGN
9294 : (ALIGN can be based on profile feedback and thus it is not 100% guaranteed). */
9295 : static int
9296 32885 : decide_alignment (int align,
9297 : enum stringop_alg alg,
9298 : int expected_size,
9299 : machine_mode move_mode)
9300 : {
9301 32885 : int desired_align = 0;
9302 :
9303 32885 : gcc_assert (alg != no_stringop);
9304 :
9305 32885 : if (alg == libcall)
9306 : return 0;
9307 32885 : if (move_mode == VOIDmode)
9308 : return 0;
9309 :
9310 32885 : desired_align = GET_MODE_SIZE (move_mode);
9311 : /* PentiumPro has special logic triggering for 8 byte aligned blocks.
9312 : copying whole cacheline at once. */
9313 32885 : if (TARGET_CPU_P (PENTIUMPRO)
9314 0 : && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
9315 32885 : desired_align = 8;
9316 :
9317 32885 : if (optimize_size)
9318 9317 : desired_align = 1;
9319 32885 : if (desired_align < align)
9320 : desired_align = align;
9321 32885 : if (expected_size != -1 && expected_size < 4)
9322 0 : desired_align = align;
9323 :
9324 : return desired_align;
9325 : }
9326 :
9327 :
/* Helper for string operation expansion (memset value broadcast).
   For QImode value 0xXY produce 0xXYXYXYXY of width specified by MODE.
   This is essentially a * 0x01010101, but we can do slightly better than
   synth_mult by unwinding the sequence by hand on CPUs with
   slow multiply.  */
static rtx
promote_duplicated_reg (machine_mode mode, rtx val)
{
  /* Zero broadcasts to zero in any mode.  */
  if (val == const0_rtx)
    return copy_to_mode_reg (mode, CONST0_RTX (mode));

  machine_mode valmode = GET_MODE (val);
  if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT)
    {
      /* Duplicate the scalar value for integer vector.  */
      gcc_assert ((val == const0_rtx || val == constm1_rtx)
		  || GET_MODE_INNER (mode) == valmode);
      rtx dup = gen_reg_rtx (mode);
      bool ok = ix86_expand_vector_init_duplicate (false, mode, dup,
						   val);
      gcc_assert (ok);
      return dup;
    }

  rtx tmp;
  /* Number of shift/or steps a hand-unrolled broadcast would take.  */
  int nops = mode == DImode ? 3 : 2;

  gcc_assert (mode == SImode || mode == DImode);
  if (CONST_INT_P (val))
    {
      /* Constant byte: compute the replicated pattern at compile time.  */
      HOST_WIDE_INT v = INTVAL (val) & 255;

      v |= v << 8;
      v |= v << 16;
      if (mode == DImode)
	v |= (v << 16) << 16;
      return copy_to_mode_reg (mode, gen_int_mode (v, mode));
    }

  if (valmode == VOIDmode)
    valmode = QImode;
  if (valmode != QImode)
    val = gen_lowpart (QImode, val);
  if (mode == QImode)
    return val;
  if (!TARGET_PARTIAL_REG_STALL)
    nops--;
  /* Compare the cost of a multiply by the 0x01..01 constant against the
     shift/or sequence and pick the cheaper expansion.  */
  if (ix86_cost->mult_init[mode == DImode ? 3 : 2]
      + ix86_cost->mult_bit * (mode == DImode ? 8 : 4)
      <= (ix86_cost->shift_const + ix86_cost->add) * nops
         + (COSTS_N_INSNS (TARGET_PARTIAL_REG_STALL == 0)))
    {
      rtx reg = convert_modes (mode, QImode, val, true);
      /* Recursive call yields the 0x01..01 multiplier constant.  */
      tmp = promote_duplicated_reg (mode, const1_rtx);
      return expand_simple_binop (mode, MULT, reg, tmp, NULL, 1,
				  OPTAB_DIRECT);
    }
  else
    {
      rtx reg = convert_modes (mode, QImode, val, true);

      /* Duplicate the low byte into bits 8..15, then double the
	 populated width with each shift/or step.  */
      if (!TARGET_PARTIAL_REG_STALL)
	emit_insn (gen_insv_1 (mode, reg, reg));
      else
	{
	  tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (8),
				     NULL, 1, OPTAB_DIRECT);
	  reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1,
				     OPTAB_DIRECT);
	}
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (16),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      if (mode == SImode)
	return reg;
      tmp = expand_simple_binop (mode, ASHIFT, reg, GEN_INT (32),
				 NULL, 1, OPTAB_DIRECT);
      reg = expand_simple_binop (mode, IOR, reg, tmp, reg, 1, OPTAB_DIRECT);
      return reg;
    }
}
9409 :
9410 : /* Duplicate value VAL using promote_duplicated_reg into maximal size that will
9411 : be needed by main loop copying SIZE_NEEDED chunks and prologue getting
9412 : alignment from ALIGN to DESIRED_ALIGN. */
9413 : static rtx
9414 11742 : promote_duplicated_reg_to_size (rtx val, int size_needed, int desired_align,
9415 : int align)
9416 : {
9417 11742 : rtx promoted_val;
9418 :
9419 11742 : if (TARGET_64BIT
9420 10239 : && (size_needed > 4 || (desired_align > align && desired_align > 4)))
9421 4498 : promoted_val = promote_duplicated_reg (DImode, val);
9422 7244 : else if (size_needed > 2 || (desired_align > align && desired_align > 2))
9423 5447 : promoted_val = promote_duplicated_reg (SImode, val);
9424 1797 : else if (size_needed > 1 || (desired_align > align && desired_align > 1))
9425 0 : promoted_val = promote_duplicated_reg (HImode, val);
9426 : else
9427 : promoted_val = val;
9428 :
9429 11742 : return promoted_val;
9430 : }
9431 :
9432 : /* Copy the address to a Pmode register. This is used for x32 to
9433 : truncate DImode TLS address to a SImode register. */
9434 :
9435 : static rtx
9436 66236 : ix86_copy_addr_to_reg (rtx addr)
9437 : {
9438 66236 : rtx reg;
9439 70817 : if (GET_MODE (addr) == Pmode || GET_MODE (addr) == VOIDmode)
9440 : {
9441 66236 : reg = copy_addr_to_reg (addr);
9442 66236 : REG_POINTER (reg) = 1;
9443 66236 : return reg;
9444 : }
9445 : else
9446 : {
9447 0 : gcc_assert (GET_MODE (addr) == DImode && Pmode == SImode);
9448 0 : reg = copy_to_mode_reg (DImode, addr);
9449 0 : REG_POINTER (reg) = 1;
9450 0 : return gen_rtx_SUBREG (SImode, reg, 0);
9451 : }
9452 : }
9453 :
/* Expand string move (memcpy) or store (memset) operation.  Use i386 string
9455 : operations when profitable. The code depends upon architecture, block size
9456 : and alignment, but always has one of the following overall structures:
9457 :
9458 : Aligned move sequence:
9459 :
9460 : 1) Prologue guard: Conditional that jumps up to epilogues for small
9461 : blocks that can be handled by epilogue alone. This is faster
9462 : but also needed for correctness, since prologue assume the block
9463 : is larger than the desired alignment.
9464 :
9465 : Optional dynamic check for size and libcall for large
9466 : blocks is emitted here too, with -minline-stringops-dynamically.
9467 :
9468 : 2) Prologue: copy first few bytes in order to get destination
9469 : aligned to DESIRED_ALIGN. It is emitted only when ALIGN is less
9470 : than DESIRED_ALIGN and up to DESIRED_ALIGN - ALIGN bytes can be
9471 : copied. We emit either a jump tree on power of two sized
9472 : blocks, or a byte loop.
9473 :
9474 : 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
9475 : with specified algorithm.
9476 :
9477 : 4) Epilogue: code copying tail of the block that is too small to be
9478 : handled by main body (or up to size guarded by prologue guard).
9479 :
9480 : Misaligned move sequence
9481 :
9482 : 1) missaligned move prologue/epilogue containing:
9483 : a) Prologue handling small memory blocks and jumping to done_label
9484 : (skipped if blocks are known to be large enough)
9485 : b) Signle move copying first DESIRED_ALIGN-ALIGN bytes if alignment is
9486 : needed by single possibly misaligned move
9487 : (skipped if alignment is not needed)
9488 : c) Copy of last SIZE_NEEDED bytes by possibly misaligned moves
9489 :
9490 : 2) Zero size guard dispatching to done_label, if needed
9491 :
9492 : 3) dispatch to library call, if needed,
9493 :
9494 : 3) Main body: the copying loop itself, copying in SIZE_NEEDED chunks
9495 : with specified algorithm. */
9496 : bool
9497 145522 : ix86_expand_set_or_cpymem (rtx dst, rtx src, rtx count_exp, rtx val_exp,
9498 : rtx align_exp, rtx expected_align_exp,
9499 : rtx expected_size_exp, rtx min_size_exp,
9500 : rtx max_size_exp, rtx probable_max_size_exp,
9501 : bool issetmem)
9502 : {
9503 145522 : rtx destreg;
9504 145522 : rtx srcreg = NULL;
9505 145522 : rtx_code_label *label = NULL;
9506 145522 : rtx tmp;
9507 145522 : rtx_code_label *jump_around_label = NULL;
9508 145522 : HOST_WIDE_INT align = 1;
9509 145522 : unsigned HOST_WIDE_INT count = 0;
9510 145522 : HOST_WIDE_INT expected_size = -1;
9511 145522 : int size_needed = 0, epilogue_size_needed;
9512 145522 : int desired_align = 0, align_bytes = 0;
9513 145522 : enum stringop_alg alg;
9514 145522 : rtx promoted_val = NULL;
9515 145522 : rtx vec_promoted_val = NULL;
9516 145522 : bool force_loopy_epilogue = false;
9517 145522 : int dynamic_check;
9518 145522 : bool need_zero_guard = false;
9519 145522 : bool noalign;
9520 145522 : machine_mode move_mode = VOIDmode;
9521 145522 : int unroll_factor = 1;
9522 : /* TODO: Once value ranges are available, fill in proper data. */
9523 145522 : unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
9524 145522 : unsigned HOST_WIDE_INT max_size = HOST_WIDE_INT_M1U;
9525 145522 : unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;
9526 145522 : bool misaligned_prologue_used = false;
9527 145522 : addr_space_t dst_as, src_as = ADDR_SPACE_GENERIC;
9528 :
9529 145522 : if (CONST_INT_P (align_exp))
9530 145522 : align = INTVAL (align_exp);
9531 : /* i386 can do misaligned access on reasonably increased cost. */
9532 145522 : if (CONST_INT_P (expected_align_exp)
9533 145522 : && INTVAL (expected_align_exp) > align)
9534 : align = INTVAL (expected_align_exp);
9535 : /* ALIGN is the minimum of destination and source alignment, but we care here
9536 : just about destination alignment. */
9537 138930 : else if (!issetmem
9538 233547 : && MEM_ALIGN (dst) > (unsigned HOST_WIDE_INT) align * BITS_PER_UNIT)
9539 2980 : align = MEM_ALIGN (dst) / BITS_PER_UNIT;
9540 :
9541 145522 : if (CONST_INT_P (count_exp))
9542 : {
9543 65810 : min_size = max_size = probable_max_size = count = expected_size
9544 65810 : = INTVAL (count_exp);
9545 : /* When COUNT is 0, there is nothing to do. */
9546 65810 : if (!count)
9547 : return true;
9548 : }
9549 : else
9550 : {
9551 79712 : if (min_size_exp)
9552 79712 : min_size = INTVAL (min_size_exp);
9553 79712 : if (max_size_exp)
9554 66508 : max_size = INTVAL (max_size_exp);
9555 79712 : if (probable_max_size_exp)
9556 68413 : probable_max_size = INTVAL (probable_max_size_exp);
9557 79712 : if (CONST_INT_P (expected_size_exp))
9558 79712 : expected_size = INTVAL (expected_size_exp);
9559 : }
9560 :
9561 : /* Make sure we don't need to care about overflow later on. */
9562 145520 : if (count > (HOST_WIDE_INT_1U << 30))
9563 : return false;
9564 :
9565 145345 : dst_as = MEM_ADDR_SPACE (dst);
9566 145345 : if (!issetmem)
9567 101098 : src_as = MEM_ADDR_SPACE (src);
9568 :
9569 : /* Step 0: Decide on preferred algorithm, desired alignment and
9570 : size of chunks to be copied by main loop. */
9571 145345 : alg = decide_alg (count, expected_size, min_size, probable_max_size,
9572 44247 : issetmem, issetmem && val_exp == const0_rtx,
9573 : dst_as, src_as, &dynamic_check, &noalign, false);
9574 :
9575 145345 : if (dump_file)
9576 7 : fprintf (dump_file, "Selected stringop expansion strategy: %s\n",
9577 7 : stringop_alg_names[alg]);
9578 :
9579 145345 : if (alg == libcall)
9580 : return false;
9581 32885 : gcc_assert (alg != no_stringop);
9582 :
9583 32885 : if (!count)
9584 15726 : count_exp = copy_to_mode_reg (GET_MODE (count_exp), count_exp);
9585 32885 : destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
9586 32885 : if (!issetmem)
9587 21143 : srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));
9588 :
9589 32885 : bool aligned_dstmem = false;
9590 32885 : unsigned int nunits = issetmem ? STORE_MAX_PIECES : MOVE_MAX;
9591 32885 : bool single_insn_p = count && count <= nunits;
9592 32885 : if (single_insn_p)
9593 : {
9594 : /* If it can be done with a single instruction, use vector
9595 : instruction and don't align destination. */
9596 6 : alg = vector_loop;
9597 6 : noalign = true;
9598 6 : dynamic_check = -1;
9599 : }
9600 :
9601 32885 : unroll_factor = 1;
9602 32885 : move_mode = word_mode;
9603 32885 : switch (alg)
9604 : {
9605 0 : case libcall:
9606 0 : case no_stringop:
9607 0 : case last_alg:
9608 0 : gcc_unreachable ();
9609 2072 : case loop_1_byte:
9610 2072 : need_zero_guard = true;
9611 2072 : move_mode = QImode;
9612 2072 : break;
9613 45 : case loop:
9614 45 : need_zero_guard = true;
9615 45 : break;
9616 20 : case unrolled_loop:
9617 20 : need_zero_guard = true;
9618 20 : unroll_factor = (TARGET_64BIT ? 4 : 2);
9619 : break;
9620 16153 : case vector_loop:
9621 16153 : need_zero_guard = true;
9622 16153 : unroll_factor = 4;
9623 : /* Get the vector mode to move STORE_MAX_PIECES/MOVE_MAX bytes. */
9624 16153 : nunits /= GET_MODE_SIZE (word_mode);
9625 16153 : if (nunits > 1)
9626 : {
9627 16149 : move_mode = mode_for_vector (word_mode, nunits).require ();
9628 16149 : gcc_assert (optab_handler (mov_optab, move_mode)
9629 : != CODE_FOR_nothing);
9630 : }
9631 : break;
9632 24 : case rep_prefix_8_byte:
9633 24 : move_mode = DImode;
9634 24 : break;
9635 9364 : case rep_prefix_4_byte:
9636 9364 : move_mode = SImode;
9637 9364 : break;
9638 5207 : case rep_prefix_1_byte:
9639 5207 : move_mode = QImode;
9640 5207 : break;
9641 : }
9642 32885 : size_needed = GET_MODE_SIZE (move_mode) * unroll_factor;
9643 32885 : epilogue_size_needed = size_needed;
9644 :
9645 : /* If we are going to call any library calls conditionally, make sure any
9646 : pending stack adjustment happen before the first conditional branch,
9647 : otherwise they will be emitted before the library call only and won't
9648 : happen from the other branches. */
9649 32885 : if (dynamic_check != -1)
9650 7 : do_pending_stack_adjust ();
9651 :
9652 32885 : desired_align = decide_alignment (align, alg, expected_size, move_mode);
9653 32885 : if (!TARGET_ALIGN_STRINGOPS || noalign)
9654 30729 : align = desired_align;
9655 :
9656 : /* Step 1: Prologue guard. */
9657 :
9658 : /* Alignment code needs count to be in register. */
9659 32885 : if (CONST_INT_P (count_exp) && desired_align > align)
9660 : {
9661 20 : if (INTVAL (count_exp) > desired_align
9662 20 : && INTVAL (count_exp) > size_needed)
9663 : {
9664 20 : align_bytes
9665 20 : = get_mem_align_offset (dst, desired_align * BITS_PER_UNIT);
9666 20 : if (align_bytes <= 0)
9667 : align_bytes = 0;
9668 : else
9669 0 : align_bytes = desired_align - align_bytes;
9670 : }
9671 0 : if (align_bytes == 0)
9672 40 : count_exp = force_reg (counter_mode (count_exp), count_exp);
9673 : }
9674 32885 : gcc_assert (desired_align >= 1 && align >= 1);
9675 :
9676 32885 : if (!single_insn_p)
9677 : {
9678 : /* Misaligned move sequences handle both prologue and epilogue
9679 : at once. Default code generation results in a smaller code
9680 : for large alignments and also avoids redundant job when sizes
9681 : are known precisely. */
9682 32879 : misaligned_prologue_used
9683 65758 : = (TARGET_MISALIGNED_MOVE_STRING_PRO_EPILOGUES
9684 32873 : && MAX (desired_align, epilogue_size_needed) <= 32
9685 16472 : && desired_align <= epilogue_size_needed
9686 38941 : && ((desired_align > align && !align_bytes)
9687 6041 : || (!count && epilogue_size_needed > 1)));
9688 :
9689 : /* Destination is aligned after the misaligned prologue. */
9690 32879 : aligned_dstmem = misaligned_prologue_used;
9691 :
9692 32879 : if (noalign && !misaligned_prologue_used)
9693 : {
9694 : /* Also use misaligned prologue if alignment isn't needed and
9695 : destination isn't aligned. Since alignment isn't needed,
9696 : the destination after prologue won't be aligned. */
9697 30723 : aligned_dstmem = (GET_MODE_ALIGNMENT (move_mode)
9698 30723 : <= MEM_ALIGN (dst));
9699 30723 : if (!aligned_dstmem)
9700 10423 : misaligned_prologue_used = true;
9701 : }
9702 : }
9703 :
9704 : /* Do the cheap promotion to allow better CSE across the
9705 : main loop and epilogue (ie one load of the big constant in the
9706 : front of all code.
9707 : For now the misaligned move sequences do not have fast path
9708 : without broadcasting. */
9709 32885 : if (issetmem
9710 11742 : && (alg == vector_loop
9711 5849 : || CONST_INT_P (val_exp)
9712 47 : || misaligned_prologue_used))
9713 : {
9714 5802 : if (alg == vector_loop)
9715 : {
9716 5893 : promoted_val = promote_duplicated_reg_to_size (val_exp,
9717 11786 : GET_MODE_SIZE (word_mode),
9718 : desired_align, align);
9719 : /* Duplicate the promoted scalar value if not 0 nor -1. */
9720 5893 : vec_promoted_val
9721 5893 : = promote_duplicated_reg (move_mode,
9722 5893 : (val_exp == const0_rtx
9723 759 : || val_exp == constm1_rtx)
9724 : ? val_exp : promoted_val);
9725 : }
9726 : else
9727 : {
9728 5802 : promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
9729 : desired_align, align);
9730 : }
9731 : }
9732 : /* Misaligned move sequences handles both prologues and epilogues at once.
9733 : Default code generation results in smaller code for large alignments and
9734 : also avoids redundant job when sizes are known precisely. */
9735 32838 : if (misaligned_prologue_used)
9736 : {
9737 : /* Misaligned move prologue handled small blocks by itself. */
9738 10444 : expand_set_or_cpymem_prologue_epilogue_by_misaligned_moves
9739 10444 : (dst, src, &destreg, &srcreg,
9740 : move_mode, promoted_val, vec_promoted_val,
9741 : &count_exp,
9742 : &jump_around_label,
9743 10444 : desired_align < align
9744 0 : ? MAX (desired_align, epilogue_size_needed) : epilogue_size_needed,
9745 : desired_align, align, &min_size, dynamic_check, issetmem);
9746 10444 : if (!issetmem)
9747 7843 : src = change_address (src, BLKmode, srcreg);
9748 10444 : dst = change_address (dst, BLKmode, destreg);
9749 10444 : if (aligned_dstmem)
9750 21 : set_mem_align (dst, desired_align * BITS_PER_UNIT);
9751 10444 : epilogue_size_needed = 0;
9752 10444 : if (need_zero_guard
9753 10184 : && min_size < (unsigned HOST_WIDE_INT) size_needed)
9754 : {
9755 : /* It is possible that we copied enough so the main loop will not
9756 : execute. */
9757 7118 : gcc_assert (size_needed > 1);
9758 7118 : if (jump_around_label == NULL_RTX)
9759 68 : jump_around_label = gen_label_rtx ();
9760 14236 : emit_cmp_and_jump_insns (count_exp,
9761 : GEN_INT (size_needed),
9762 : LTU, 0, counter_mode (count_exp), 1, jump_around_label);
9763 7118 : if (expected_size == -1
9764 53 : || expected_size < (desired_align - align) / 2 + size_needed)
9765 7066 : predict_jump (REG_BR_PROB_BASE * 20 / 100);
9766 : else
9767 52 : predict_jump (REG_BR_PROB_BASE * 60 / 100);
9768 : }
9769 : }
9770 : /* Ensure that alignment prologue won't copy past end of block. */
9771 22441 : else if (size_needed > 1 || (desired_align > 1 && desired_align > align))
9772 : {
9773 15162 : epilogue_size_needed = MAX (size_needed - 1, desired_align - align);
9774 : /* Epilogue always copies COUNT_EXP & EPILOGUE_SIZE_NEEDED bytes.
9775 : Make sure it is power of 2. */
9776 15162 : epilogue_size_needed = 1 << (floor_log2 (epilogue_size_needed) + 1);
9777 :
9778 : /* To improve performance of small blocks, we jump around the VAL
9779 : promoting mode. This mean that if the promoted VAL is not constant,
9780 : we might not use it in the epilogue and have to use byte
9781 : loop variant. */
9782 15162 : if (issetmem && epilogue_size_needed > 2 && !promoted_val)
9783 15162 : force_loopy_epilogue = true;
9784 15162 : if ((count && count < (unsigned HOST_WIDE_INT) epilogue_size_needed)
9785 15154 : || max_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
9786 : {
9787 : /* If main algorithm works on QImode, no epilogue is needed.
9788 : For small sizes just don't align anything. */
9789 2119 : if (size_needed == 1)
9790 0 : desired_align = align;
9791 : else
9792 2119 : goto epilogue;
9793 : }
9794 13043 : else if (!count
9795 251 : && min_size < (unsigned HOST_WIDE_INT) epilogue_size_needed)
9796 : {
9797 251 : label = gen_label_rtx ();
9798 502 : emit_cmp_and_jump_insns (count_exp,
9799 : GEN_INT (epilogue_size_needed),
9800 : LTU, 0, counter_mode (count_exp), 1, label);
9801 251 : if (expected_size == -1 || expected_size < epilogue_size_needed)
9802 251 : predict_jump (REG_BR_PROB_BASE * 60 / 100);
9803 : else
9804 0 : predict_jump (REG_BR_PROB_BASE * 20 / 100);
9805 : }
9806 : }
9807 :
9808 : /* Emit code to decide on runtime whether library call or inline should be
9809 : used. */
9810 30766 : if (dynamic_check != -1)
9811 : {
9812 7 : if (!issetmem && CONST_INT_P (count_exp))
9813 : {
9814 1 : if (UINTVAL (count_exp) >= (unsigned HOST_WIDE_INT)dynamic_check)
9815 : {
9816 1 : emit_block_copy_via_libcall (dst, src, count_exp);
9817 1 : count_exp = const0_rtx;
9818 1 : goto epilogue;
9819 : }
9820 : }
9821 : else
9822 : {
9823 6 : rtx_code_label *hot_label = gen_label_rtx ();
9824 6 : if (jump_around_label == NULL_RTX)
9825 1 : jump_around_label = gen_label_rtx ();
9826 12 : emit_cmp_and_jump_insns (count_exp, GEN_INT (dynamic_check - 1),
9827 : LEU, 0, counter_mode (count_exp),
9828 : 1, hot_label);
9829 6 : predict_jump (REG_BR_PROB_BASE * 90 / 100);
9830 6 : if (issetmem)
9831 4 : set_storage_via_libcall (dst, count_exp, val_exp);
9832 : else
9833 2 : emit_block_copy_via_libcall (dst, src, count_exp);
9834 6 : emit_jump (jump_around_label);
9835 6 : emit_label (hot_label);
9836 : }
9837 : }
9838 :
9839 : /* Step 2: Alignment prologue. */
9840 : /* Do the expensive promotion once we branched off the small blocks. */
9841 30765 : if (issetmem && !promoted_val)
9842 47 : promoted_val = promote_duplicated_reg_to_size (val_exp, size_needed,
9843 : desired_align, align);
9844 :
9845 30765 : if (desired_align > align && !misaligned_prologue_used)
9846 : {
9847 7 : if (align_bytes == 0)
9848 : {
9849 : /* Except for the first move in prologue, we no longer know
9850 : constant offset in aliasing info. It don't seems to worth
9851 : the pain to maintain it for the first move, so throw away
9852 : the info early. */
9853 7 : dst = change_address (dst, BLKmode, destreg);
9854 7 : if (!issetmem)
9855 5 : src = change_address (src, BLKmode, srcreg);
9856 7 : dst = expand_set_or_cpymem_prologue (dst, src, destreg, srcreg,
9857 : promoted_val, vec_promoted_val,
9858 : count_exp, align, desired_align,
9859 : issetmem);
9860 : /* At most desired_align - align bytes are copied. */
9861 7 : if (min_size < (unsigned)(desired_align - align))
9862 0 : min_size = 0;
9863 : else
9864 7 : min_size -= desired_align - align;
9865 : }
9866 : else
9867 : {
9868 : /* If we know how many bytes need to be stored before dst is
9869 : sufficiently aligned, maintain aliasing info accurately. */
9870 0 : dst = expand_set_or_cpymem_constant_prologue (dst, &src, destreg,
9871 : srcreg,
9872 : promoted_val,
9873 : vec_promoted_val,
9874 : desired_align,
9875 : align_bytes,
9876 : issetmem);
9877 :
9878 0 : count_exp = plus_constant (counter_mode (count_exp),
9879 0 : count_exp, -align_bytes);
9880 0 : count -= align_bytes;
9881 0 : min_size -= align_bytes;
9882 0 : max_size -= align_bytes;
9883 : }
9884 7 : if (need_zero_guard
9885 7 : && min_size < (unsigned HOST_WIDE_INT) size_needed
9886 1 : && (count < (unsigned HOST_WIDE_INT) size_needed
9887 0 : || (align_bytes == 0
9888 0 : && count < ((unsigned HOST_WIDE_INT) size_needed
9889 0 : + desired_align - align))))
9890 : {
9891 : /* It is possible that we copied enough so the main loop will not
9892 : execute. */
9893 1 : gcc_assert (size_needed > 1);
9894 1 : if (label == NULL_RTX)
9895 0 : label = gen_label_rtx ();
9896 2 : emit_cmp_and_jump_insns (count_exp,
9897 : GEN_INT (size_needed),
9898 : LTU, 0, counter_mode (count_exp), 1, label);
9899 1 : if (expected_size == -1
9900 0 : || expected_size < (desired_align - align) / 2 + size_needed)
9901 1 : predict_jump (REG_BR_PROB_BASE * 20 / 100);
9902 : else
9903 0 : predict_jump (REG_BR_PROB_BASE * 60 / 100);
9904 : }
9905 : }
9906 30765 : if (label && size_needed == 1)
9907 : {
9908 0 : emit_label (label);
9909 0 : LABEL_NUSES (label) = 1;
9910 0 : label = NULL;
9911 0 : epilogue_size_needed = 1;
9912 0 : if (issetmem)
9913 0 : promoted_val = val_exp;
9914 : }
9915 30765 : else if (label == NULL_RTX && !misaligned_prologue_used)
9916 20071 : epilogue_size_needed = size_needed;
9917 :
9918 : /* Step 3: Main loop. */
9919 :
9920 30765 : switch (alg)
9921 : {
9922 0 : case libcall:
9923 0 : case no_stringop:
9924 0 : case last_alg:
9925 0 : gcc_unreachable ();
9926 2137 : case loop_1_byte:
9927 2137 : case loop:
9928 2137 : case unrolled_loop:
9929 2137 : expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg, promoted_val,
9930 : count_exp, move_mode, unroll_factor,
9931 : expected_size, issetmem);
9932 2137 : break;
9933 14033 : case vector_loop:
9934 14033 : expand_set_or_cpymem_via_loop (dst, src, destreg, srcreg,
9935 : vec_promoted_val, count_exp, move_mode,
9936 : unroll_factor, expected_size, issetmem);
9937 14033 : break;
9938 14595 : case rep_prefix_8_byte:
9939 14595 : case rep_prefix_4_byte:
9940 14595 : case rep_prefix_1_byte:
9941 14595 : expand_set_or_cpymem_via_rep (dst, src, destreg, srcreg, promoted_val,
9942 : val_exp, count_exp, move_mode, issetmem);
9943 14595 : break;
9944 : }
9945 : /* Adjust properly the offset of src and dest memory for aliasing. */
9946 30765 : if (CONST_INT_P (count_exp))
9947 : {
9948 17130 : if (!issetmem)
9949 7922 : src = adjust_automodify_address_nv (src, BLKmode, srcreg,
9950 : (count / size_needed) * size_needed);
9951 17130 : dst = adjust_automodify_address_nv (dst, BLKmode, destreg,
9952 : (count / size_needed) * size_needed);
9953 : }
9954 : else
9955 : {
9956 13635 : if (!issetmem)
9957 11395 : src = change_address (src, BLKmode, srcreg);
9958 13635 : dst = change_address (dst, BLKmode, destreg);
9959 : }
9960 :
9961 : /* Step 4: Epilogue to copy the remaining bytes. */
9962 32885 : epilogue:
9963 32885 : if (label)
9964 : {
9965 : /* When the main loop is done, COUNT_EXP might hold original count,
9966 : while we want to copy only COUNT_EXP & SIZE_NEEDED bytes.
9967 : Epilogue code will actually copy COUNT_EXP & EPILOGUE_SIZE_NEEDED
9968 : bytes. Compensate if needed. */
9969 :
9970 251 : if (size_needed < epilogue_size_needed)
9971 : {
9972 0 : tmp = expand_simple_binop (counter_mode (count_exp), AND, count_exp,
9973 0 : GEN_INT (size_needed - 1), count_exp, 1,
9974 : OPTAB_DIRECT);
9975 0 : if (tmp != count_exp)
9976 0 : emit_move_insn (count_exp, tmp);
9977 : }
9978 251 : emit_label (label);
9979 251 : LABEL_NUSES (label) = 1;
9980 : }
9981 :
9982 32885 : if (count_exp != const0_rtx && epilogue_size_needed > 1)
9983 : {
9984 15162 : if (force_loopy_epilogue)
9985 0 : expand_setmem_epilogue_via_loop (dst, destreg, val_exp, count_exp,
9986 : epilogue_size_needed);
9987 : else
9988 : {
9989 15162 : if (issetmem)
9990 7344 : expand_setmem_epilogue (dst, destreg, promoted_val,
9991 : vec_promoted_val, count_exp,
9992 : epilogue_size_needed);
9993 : else
9994 7818 : expand_cpymem_epilogue (dst, src, destreg, srcreg, count_exp,
9995 : epilogue_size_needed);
9996 : }
9997 : }
9998 32885 : if (jump_around_label)
9999 7120 : emit_label (jump_around_label);
10000 : return true;
10001 : }
10002 :
/* Fully unroll memmove of known size with up to 8 registers.

   DST/SRC are the destination/source memory, DESTREG/SRCREG registers
   holding their addresses, COUNT the constant byte count and MODE the
   chunk mode to move in.  Returns false (no code emitted) when more
   than 8 MODE-sized moves would be needed; true on success.  */

static bool
ix86_expand_unroll_movmem (rtx dst, rtx src, rtx destreg, rtx srcreg,
			   unsigned HOST_WIDE_INT count,
			   machine_mode mode)
{
  /* If 8 registers can cover all memory, load them into
     registers and store them together to avoid possible address
     overlap between source and destination.  */
  unsigned HOST_WIDE_INT moves = count / GET_MODE_SIZE (mode);
  if (moves == 0)
    {
      /* COUNT is smaller than one MODE chunk: shrink MODE to the
	 smallest integer mode that covers COUNT.  */
      mode = smallest_int_mode_for_size
	(count * BITS_PER_UNIT).require ();
      if (count == GET_MODE_SIZE (mode))
	moves = 1;
      else
	{
	  /* Reduce the smallest move size by half so that MOVES == 1.  */
	  mode = smallest_int_mode_for_size
	    (GET_MODE_BITSIZE (mode) / 2).require ();
	  moves = count / GET_MODE_SIZE (mode);
	  gcc_assert (moves == 1);
	}
    }
  else if (moves > 8)
    return false;

  unsigned int i;
  /* tmp[0..7] hold the full MODE chunks; tmp[8] is reserved for the
     possibly overlapping tail move emitted below.  */
  rtx tmp[9];

  for (i = 0; i < moves; i++)
    tmp[i] = gen_reg_rtx (mode);

  /* All loads are emitted before all stores so that overlapping
     SRC/DST regions behave like memmove.  */
  rtx srcmem = change_address (src, mode, srcreg);
  for (i = 0; i < moves; i++)
    {
      emit_move_insn (tmp[i], srcmem);
      srcmem = offset_address (srcmem,
			       GEN_INT (GET_MODE_SIZE (mode)),
			       GET_MODE_SIZE (mode));
    }

  unsigned int epilogue_size = count & (GET_MODE_SIZE (mode) - 1);
  machine_mode epilogue_mode = VOIDmode;
  if (epilogue_size)
    {
      /* Handle the remaining bytes with overlapping move: load the
	 last EPILOGUE_MODE-sized chunk ending exactly at SRC+COUNT,
	 which may overlap the last full chunk loaded above.  */
      epilogue_mode = smallest_int_mode_for_size
	(epilogue_size * BITS_PER_UNIT).require ();
      tmp[8] = gen_reg_rtx (epilogue_mode);
      srcmem = adjust_address (srcmem, epilogue_mode, 0);
      srcmem = offset_address (srcmem, GEN_INT (epilogue_size), 1);
      srcmem = offset_address (srcmem,
			       GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
			       GET_MODE_SIZE (epilogue_mode));
      emit_move_insn (tmp[8], srcmem);
    }

  rtx destmem = change_address (dst, mode, destreg);
  for (i = 0; i < moves; i++)
    {
      emit_move_insn (destmem, tmp[i]);
      destmem = offset_address (destmem,
				GEN_INT (GET_MODE_SIZE (mode)),
				GET_MODE_SIZE (mode));
    }

  if (epilogue_size)
    {
      /* Use overlapping move.  */
      destmem = adjust_address (destmem, epilogue_mode, 0);
      destmem = offset_address (destmem, GEN_INT (epilogue_size), 1);
      destmem = offset_address (destmem,
				GEN_INT (-GET_MODE_SIZE (epilogue_mode)),
				GET_MODE_SIZE (epilogue_mode));
      emit_move_insn (destmem, tmp[8]);
    }

  return true;
}
10085 :
/* Expand memmove of size with MOVES * mode size and MOVES <= 4.  If
   FORWARD is true, copy forward.  Otherwise copy backward.

   DESTMEM/SRCMEM are MODE-mode memory references positioned at the
   first (forward) or last (backward) chunk.  All loads are emitted
   before all stores so overlapping regions are handled correctly.  */

static void
ix86_expand_n_move_movmem (rtx destmem, rtx srcmem, machine_mode mode,
			   unsigned int moves, bool forward)
{
  gcc_assert (moves <= 4);

  unsigned int i;
  rtx tmp[8];

  for (i = 0; i < moves; i++)
    tmp[i] = gen_reg_rtx (mode);

  /* STEP advances through memory by one MODE-sized chunk, in the
     requested direction.  */
  rtx step;
  if (forward)
    step = GEN_INT (GET_MODE_SIZE (mode));
  else
    step = GEN_INT (-GET_MODE_SIZE (mode));

  /* Load MOVES.  */
  for (i = 0; i < moves - 1; i++)
    {
      emit_move_insn (tmp[i], srcmem);
      srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
    }
  emit_move_insn (tmp[i], srcmem);

  /* Store MOVES.  */
  for (i = 0; i < moves - 1; i++)
    {
      emit_move_insn (destmem, tmp[i]);
      destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
    }
  emit_move_insn (destmem, tmp[i]);
}
10123 :
/* Load MOVES of mode size into REGS.  If LAST is true, load the
   last MOVES.  Otherwise, load the first MOVES.

   SRC is the source memory, SRCREG a register holding its address,
   COUNT_EXP the (runtime) byte count used to locate the tail when
   LAST is true.  Fresh MODE-mode registers are allocated into
   REGS[0..MOVES-1].  */

static void
ix86_expand_load_movmem (rtx src, rtx srcreg, rtx count_exp,
			 machine_mode mode, unsigned int moves,
			 rtx regs[], bool last)
{
  unsigned int i;

  for (i = 0; i < moves; i++)
    regs[i] = gen_reg_rtx (mode);

  rtx srcmem = change_address (src, mode, srcreg);
  rtx step;
  if (last)
    {
      /* Start at SRC + COUNT - MODE size and walk backwards.  */
      srcmem = offset_address (srcmem, count_exp, 1);
      step = GEN_INT (-GET_MODE_SIZE (mode));
      srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
    }
  else
    step = GEN_INT (GET_MODE_SIZE (mode));

  for (i = 0; i < moves - 1; i++)
    {
      emit_move_insn (regs[i], srcmem);
      srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
    }
  emit_move_insn (regs[i], srcmem);
}
10155 :
/* Store MOVES of mode size into REGS.  If LAST is true, store the
   last MOVES.  Otherwise, store the first MOVES.

   Mirror of ix86_expand_load_movmem: DST is the destination memory,
   DESTREG a register holding its address, COUNT_EXP the (runtime)
   byte count used to locate the tail when LAST is true, and
   REGS[0..MOVES-1] the previously loaded values to store.  */

static void
ix86_expand_store_movmem (rtx dst, rtx destreg, rtx count_exp,
			  machine_mode mode, unsigned int moves,
			  rtx regs[], bool last)
{
  unsigned int i;

  rtx destmem = change_address (dst, mode, destreg);
  rtx step;
  if (last)
    {
      /* Start at DST + COUNT - MODE size and walk backwards.  */
      destmem = offset_address (destmem, count_exp, 1);
      step = GEN_INT (-GET_MODE_SIZE (mode));
      destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
    }
  else
    step = GEN_INT (GET_MODE_SIZE (mode));

  for (i = 0; i < moves - 1; i++)
    {
      emit_move_insn (destmem, regs[i]);
      destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));
    }
  emit_move_insn (destmem, regs[i]);
}
10184 :
/* Expand memmove of size between (MOVES / 2) * mode size and
   MOVES * mode size with overlapping load and store.  MOVES is even.
   MOVES >= 2 and MOVES <= 8.

   The first MOVES/2 chunks are copied forward from the start of the
   block and the other MOVES/2 chunks backward from its end (located
   via the runtime COUNT_EXP), so the two halves may overlap in the
   middle.  All loads are emitted before all stores, giving memmove
   semantics for overlapping SRC/DST.  */

static void
ix86_expand_n_overlapping_move_movmem (rtx dst, rtx src, rtx destreg,
				       rtx srcreg, rtx count_exp,
				       machine_mode mode,
				       unsigned int moves)
{
  gcc_assert (moves >= 2 && moves <= 8 && (moves & 1) == 0);

  unsigned int half_moves = moves / 2;
  unsigned int i, j;
  rtx tmp[8];

  for (i = 0; i < moves; i++)
    tmp[i] = gen_reg_rtx (mode);

  rtx base_srcmem = change_address (src, mode, srcreg);

  /* Load the first half.  */
  rtx srcmem = base_srcmem;
  for (i = 0; i < half_moves - 1; i++)
    {
      emit_move_insn (tmp[i], srcmem);
      srcmem = offset_address (srcmem,
			       GEN_INT (GET_MODE_SIZE (mode)),
			       GET_MODE_SIZE (mode));
    }
  emit_move_insn (tmp[i], srcmem);

  /* Load the second half.  Start from SRC + COUNT - MODE size and walk
     backwards.  */
  srcmem = offset_address (base_srcmem, count_exp, 1);
  srcmem = offset_address (srcmem,
			   GEN_INT (-GET_MODE_SIZE (mode)),
			   GET_MODE_SIZE (mode));
  for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
    {
      emit_move_insn (tmp[j], srcmem);
      srcmem = offset_address (srcmem,
			       GEN_INT (-GET_MODE_SIZE (mode)),
			       GET_MODE_SIZE (mode));
    }
  emit_move_insn (tmp[j], srcmem);

  rtx base_destmem = change_address (dst, mode, destreg);

  /* Store the first half.  */
  rtx destmem = base_destmem;
  for (i = 0; i < half_moves - 1; i++)
    {
      emit_move_insn (destmem, tmp[i]);
      destmem = offset_address (destmem,
				GEN_INT (GET_MODE_SIZE (mode)),
				GET_MODE_SIZE (mode));
    }
  emit_move_insn (destmem, tmp[i]);

  /* Store the second half, again backwards from DST + COUNT.  */
  destmem = offset_address (base_destmem, count_exp, 1);
  destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
			    GET_MODE_SIZE (mode));
  for (j = half_moves, i = 0; i < half_moves - 1; i++, j++)
    {
      emit_move_insn (destmem, tmp[j]);
      destmem = offset_address (destmem, GEN_INT (-GET_MODE_SIZE (mode)),
				GET_MODE_SIZE (mode));
    }
  emit_move_insn (destmem, tmp[j]);
}
10256 :
/* Expand memmove of size < mode size which is <= 64.

   Emit a chain of range checks on COUNT_EXP that dispatches to one of
   several copy sequences; each sequence copies its whole range with two
   overlapping moves of a fixed mode (via
   ix86_expand_n_overlapping_move_movmem), so no per-size loop is needed.

     DST, SRC		destination and source memory references.
     DESTREG, SRCREG	registers holding the destination and source
			addresses.
     COUNT_EXP		runtime byte count.
     MIN_SIZE		compile-time lower bound on the size (0 if
			unknown); when a range check must succeed, the
			conditional jump is replaced by an unconditional
			one and all smaller ranges are skipped.
     MODE		the widest integer move mode considered; ranges
			larger than MODE's size are never reached here.
     DONE_LABEL		label to jump to once the copy is complete.  */

static void
ix86_expand_less_move_movmem (rtx dst, rtx src, rtx destreg,
			      rtx srcreg, rtx count_exp,
			      unsigned HOST_WIDE_INT min_size,
			      machine_mode mode,
			      rtx_code_label *done_label)
{
  /* Set once MIN_SIZE proves a range check always taken; every smaller
     range (and the 0/1-byte fallthrough) is then dead and not emitted.  */
  bool skip = false;
  machine_mode count_mode = counter_mode (count_exp);

  rtx_code_label *between_32_63_label
    = GET_MODE_SIZE (mode) > 32 ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_32_63_LABEL if size >= 32 and size <= 63.  */
  if (between_32_63_label)
    {
      if (min_size && min_size >= 32)
	{
	  /* The check always succeeds: jump unconditionally.  */
	  emit_jump_insn (gen_jump (between_32_63_label));
	  emit_barrier ();
	  skip = true;
	}
      else
	emit_cmp_and_jump_insns (count_exp, GEN_INT (32), GEU,
				 nullptr, count_mode, 1,
				 between_32_63_label);
    }

  rtx_code_label *between_16_31_label
    = (!skip && GET_MODE_SIZE (mode) > 16) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_16_31_LABEL if size >= 16 and size <= 31.  */
  if (between_16_31_label)
    {
      if (min_size && min_size >= 16)
	{
	  emit_jump_insn (gen_jump (between_16_31_label));
	  emit_barrier ();
	  skip = true;
	}
      else
	emit_cmp_and_jump_insns (count_exp, GEN_INT (16), GEU,
				 nullptr, count_mode, 1,
				 between_16_31_label);
    }

  rtx_code_label *between_8_15_label
    = (!skip && GET_MODE_SIZE (mode) > 8) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_8_15_LABEL if size >= 8 and size <= 15.  */
  if (between_8_15_label)
    {
      if (min_size && min_size >= 8)
	{
	  emit_jump_insn (gen_jump (between_8_15_label));
	  emit_barrier ();
	  skip = true;
	}
      else
	emit_cmp_and_jump_insns (count_exp, GEN_INT (8), GEU,
				 nullptr, count_mode, 1,
				 between_8_15_label);
    }

  rtx_code_label *between_4_7_label
    = (!skip && GET_MODE_SIZE (mode) > 4) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_4_7_LABEL if size >= 4 and size <= 7.  */
  if (between_4_7_label)
    {
      if (min_size && min_size >= 4)
	{
	  emit_jump_insn (gen_jump (between_4_7_label));
	  emit_barrier ();
	  skip = true;
	}
      else
	emit_cmp_and_jump_insns (count_exp, GEN_INT (4), GEU,
				 nullptr, count_mode, 1,
				 between_4_7_label);
    }

  rtx_code_label *between_2_3_label
    = (!skip && GET_MODE_SIZE (mode) > 2) ? gen_label_rtx () : nullptr;
  /* Jump to BETWEEN_2_3_LABEL if size >= 2 and size <= 3 (larger sizes
     have already branched away above, so "size > 1" suffices).  */
  if (between_2_3_label)
    {
      if (min_size && min_size >= 2)
	{
	  emit_jump_insn (gen_jump (between_2_3_label));
	  emit_barrier ();
	  skip = true;
	}
      else
	emit_cmp_and_jump_insns (count_exp, GEN_INT (1), GT,
				 nullptr, count_mode, 1,
				 between_2_3_label);
    }

  if (!skip)
    {
      /* Fallthrough: size is 0 or 1.  */
      rtx_code_label *zero_label
	= min_size == 0 ? gen_label_rtx () : nullptr;
      /* Skip if size == 0.  */
      if (zero_label)
	emit_cmp_and_jump_insns (count_exp, GEN_INT (1), LT,
				 nullptr, count_mode, 1,
				 zero_label,
				 profile_probability::unlikely ());

      /* Move 1 byte.  Load before store so DST == SRC also works.  */
      rtx tmp0 = gen_reg_rtx (QImode);
      rtx srcmem = change_address (src, QImode, srcreg);
      emit_move_insn (tmp0, srcmem);
      rtx destmem = change_address (dst, QImode, destreg);
      emit_move_insn (destmem, tmp0);

      if (zero_label)
	emit_label (zero_label);

      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  /* Emit the per-range copy bodies.  Each copies its full range with 2
     overlapping moves of half-range-sized mode and jumps to DONE_LABEL.  */
  if (between_32_63_label)
    {
      emit_label (between_32_63_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
					     count_exp, OImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_16_31_label)
    {
      emit_label (between_16_31_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
					     count_exp, TImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_8_15_label)
    {
      emit_label (between_8_15_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
					     count_exp, DImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_4_7_label)
    {
      emit_label (between_4_7_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
					     count_exp, SImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (between_2_3_label)
    {
      emit_label (between_2_3_label);
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
					     count_exp, HImode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }
}
10424 :
/* Expand movmem with overlapping unaligned loads and stores:
   1. Load all sources into registers and store them together to avoid
      possible address overlap between source and destination.
   2. For known size, first try to fully unroll with 8 registers.
   3. For size <= 2 * MOVE_MAX, load all sources into 2 registers first
      and then store them together.
   4. For size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX, load all sources
      into 4 registers first and then store them together.
   5. For size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX, load all sources
      into 8 registers first and then store them together.
   6. For size > 8 * MOVE_MAX,
      a. If address of destination > address of source, copy backward
	 with a 4 * MOVE_MAX loop with unaligned loads and stores.  Load
	 the first 4 * MOVE_MAX into 4 registers before the loop and
	 store them after the loop to support overlapping addresses.
      b. Otherwise, copy forward with a 4 * MOVE_MAX loop with unaligned
	 loads and stores.  Load the last 4 * MOVE_MAX into 4 registers
	 before the loop and store them after the loop to support
	 overlapping addresses.

   OPERANDS is the movmem pattern's operand vector: operand 0 is the
   destination MEM, 1 the source MEM, 2 the byte count, 5 the expected
   size, 6 the minimum size and 8 the probable maximum size.  Returns
   true if the move was expanded inline, false to emit a library call
   instead.  */

bool
ix86_expand_movmem (rtx operands[])
{
  /* Since there are much less registers available in 32-bit mode, don't
     inline movmem in 32-bit mode.  */
  if (!TARGET_64BIT)
    return false;

  rtx dst = operands[0];
  rtx src = operands[1];
  rtx count_exp = operands[2];
  rtx expected_size_exp = operands[5];
  rtx min_size_exp = operands[6];
  rtx probable_max_size_exp = operands[8];
  unsigned HOST_WIDE_INT count = HOST_WIDE_INT_0U;
  HOST_WIDE_INT expected_size = HOST_WIDE_INT_M1U;
  unsigned HOST_WIDE_INT min_size = HOST_WIDE_INT_0U;
  unsigned HOST_WIDE_INT probable_max_size = HOST_WIDE_INT_M1U;

  if (CONST_INT_P (count_exp))
    {
      /* Known size: all four size parameters collapse to COUNT.  */
      min_size = probable_max_size = count = expected_size
	= INTVAL (count_exp);
      /* When COUNT is 0, there is nothing to do.  */
      if (!count)
	return true;
    }
  else
    {
      if (min_size_exp)
	min_size = INTVAL (min_size_exp);
      if (probable_max_size_exp)
	probable_max_size = INTVAL (probable_max_size_exp);
      if (CONST_INT_P (expected_size_exp))
	expected_size = INTVAL (expected_size_exp);
    }

  /* Make sure we don't need to care about overflow later on.  */
  if (count > (HOST_WIDE_INT_1U << 30))
    return false;

  addr_space_t dst_as = MEM_ADDR_SPACE (dst);
  addr_space_t src_as = MEM_ADDR_SPACE (src);
  int dynamic_check;
  bool noalign;
  enum stringop_alg alg = decide_alg (count, expected_size, min_size,
				      probable_max_size, false, false,
				      dst_as, src_as, &dynamic_check,
				      &noalign, false);
  /* Honour the cost model's decision to call the library instead.  */
  if (alg == libcall)
    return false;

  rtx destreg = ix86_copy_addr_to_reg (XEXP (dst, 0));
  rtx srcreg = ix86_copy_addr_to_reg (XEXP (src, 0));

  unsigned int move_max = MOVE_MAX;
  machine_mode mode = smallest_int_mode_for_size
    (move_max * BITS_PER_UNIT).require ();
  if (probable_max_size && probable_max_size < move_max)
    {
      /* Get a usable MOVE_MAX.  */
      mode = smallest_int_mode_for_size
	(probable_max_size * BITS_PER_UNIT).require ();
      /* Reduce MOVE_MAX by half so that MOVE_MAX can be used.  */
      if (GET_MODE_SIZE (mode) > probable_max_size)
	mode = smallest_int_mode_for_size
	  (GET_MODE_BITSIZE (mode) / 2).require ();
      move_max = GET_MODE_SIZE (mode);
    }

  /* Try to fully unroll memmove of known size first.  */
  if (count
      && ix86_expand_unroll_movmem (dst, src, destreg, srcreg, count,
				    mode))
    return true;

  rtx_code_label *done_label = gen_label_rtx ();

  /* LESS_VEC_LABEL handles size < MOVE_MAX; only needed when MIN_SIZE
     doesn't already rule that range out.  */
  rtx_code_label *less_vec_label = nullptr;
  if (min_size == 0 || min_size < move_max)
    less_vec_label = gen_label_rtx ();

  machine_mode count_mode = counter_mode (count_exp);

  /* Jump to LESS_VEC_LABEL if size < MOVE_MAX.  */
  if (less_vec_label)
    emit_cmp_and_jump_insns (count_exp, GEN_INT (move_max), LTU,
			     nullptr, count_mode, 1,
			     less_vec_label);

  rtx_code_label *more_2x_vec_label = nullptr;
  if (probable_max_size == 0 || probable_max_size > 2 * move_max)
    more_2x_vec_label = gen_label_rtx ();

  /* Jump to MORE_2X_VEC_LABEL if size > 2 * MOVE_MAX.  */
  if (more_2x_vec_label)
    emit_cmp_and_jump_insns (count_exp, GEN_INT (2 * move_max), GTU,
			     nullptr, count_mode, 1,
			     more_2x_vec_label);

  if (min_size == 0 || min_size <= 2 * move_max)
    {
      /* Size >= MOVE_MAX and size <= 2 * MOVE_MAX.  */
      ix86_expand_n_overlapping_move_movmem (dst, src, destreg, srcreg,
					     count_exp, mode, 2);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (less_vec_label)
    {
      /* Size < MOVE_MAX.  */
      emit_label (less_vec_label);
      ix86_expand_less_move_movmem (dst, src, destreg, srcreg,
				    count_exp, min_size, mode,
				    done_label);
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
    }

  if (more_2x_vec_label)
    {
      /* Size > 2 * MOVE_MAX and destination may overlap with source.  */
      emit_label (more_2x_vec_label);

      rtx_code_label *more_8x_vec_label = nullptr;
      if (probable_max_size == 0 || probable_max_size > 8 * move_max)
	more_8x_vec_label = gen_label_rtx ();

      /* Jump to MORE_8X_VEC_LABEL if size > 8 * MOVE_MAX.  */
      if (more_8x_vec_label)
	emit_cmp_and_jump_insns (count_exp, GEN_INT (8 * move_max), GTU,
				 nullptr, count_mode, 1,
				 more_8x_vec_label);

      rtx_code_label *last_4x_vec_label = nullptr;
      if (min_size == 0 || min_size < 4 * move_max)
	last_4x_vec_label = gen_label_rtx ();

      /* Jump to LAST_4X_VEC_LABEL if size < 4 * MOVE_MAX.  */
      if (last_4x_vec_label)
	emit_cmp_and_jump_insns (count_exp, GEN_INT (4 * move_max), LTU,
				 nullptr, count_mode, 1,
				 last_4x_vec_label);

      if (probable_max_size == 0 || probable_max_size > 4 * move_max)
	{
	  /* Size > 4 * MOVE_MAX and size <= 8 * MOVE_MAX.  */
	  ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
						 srcreg, count_exp,
						 mode, 8);
	  emit_jump_insn (gen_jump (done_label));
	  emit_barrier ();
	}

      if (last_4x_vec_label)
	{
	  /* Size > 2 * MOVE_MAX and size <= 4 * MOVE_MAX.  */
	  emit_label (last_4x_vec_label);
	  ix86_expand_n_overlapping_move_movmem (dst, src, destreg,
						 srcreg, count_exp,
						 mode, 4);
	  emit_jump_insn (gen_jump (done_label));
	  emit_barrier ();
	}

      if (more_8x_vec_label)
	{
	  /* Size > 8 * MOVE_MAX.  */
	  emit_label (more_8x_vec_label);

	  rtx loop_count = gen_reg_rtx (count_mode);
	  emit_move_insn (loop_count, count_exp);

	  /* Jump to MORE_8X_VEC_BACKWARD_LABEL if source address is
	     lower than destination address.  */
	  rtx_code_label *more_8x_vec_backward_label = gen_label_rtx ();
	  emit_cmp_and_jump_insns (srcreg, destreg, LTU, nullptr,
				   GET_MODE (destreg), 1,
				   more_8x_vec_backward_label);

	  /* Skip if source == destination which is less common.  */
	  emit_cmp_and_jump_insns (srcreg, destreg, EQ, nullptr,
				   GET_MODE (destreg), 1, done_label,
				   profile_probability::unlikely ());

	  rtx base_destreg = gen_reg_rtx (GET_MODE (destreg));
	  emit_move_insn (base_destreg, destreg);

	  /* Load the last 4 * MOVE_MAX.  Kept in registers across the
	     loop so the tail is correct even if the loop's stores have
	     already overwritten those source bytes.  */
	  rtx regs[4];
	  ix86_expand_load_movmem (src, srcreg, count_exp, mode,
				   ARRAY_SIZE (regs), regs, true);

	  rtx srcmem = change_address (src, mode, srcreg);
	  rtx destmem = change_address (dst, mode, destreg);

	  /* Copy forward with a 4 * MOVE_MAX loop.  */
	  rtx_code_label *loop_4x_vec_forward_label = gen_label_rtx ();
	  emit_label (loop_4x_vec_forward_label);

	  ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, true);

	  rtx tmp;
	  /* NOTE(review): DELTA is 4 * MOVE_MAX (the target macro), not
	     4 * move_max (the possibly reduced local).  This path is
	     only reachable when probable_max_size did not reduce
	     move_max, so the two agree here — confirm if that invariant
	     changes.  */
	  rtx delta = GEN_INT (4 * MOVE_MAX);

	  /* Decrement LOOP_COUNT by 4 * MOVE_MAX.  */
	  tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
				     loop_count, delta, nullptr, 1,
				     OPTAB_DIRECT);
	  if (tmp != loop_count)
	    emit_move_insn (loop_count, tmp);

	  /* Increment DESTREG and SRCREG by 4 * MOVE_MAX.  */
	  tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
				     destreg, delta, nullptr, 1,
				     OPTAB_DIRECT);
	  if (tmp != destreg)
	    emit_move_insn (destreg, tmp);
	  tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
				     delta, nullptr, 1, OPTAB_DIRECT);
	  if (tmp != srcreg)
	    emit_move_insn (srcreg, tmp);

	  /* Stop if LOOP_COUNT <= 4 * MOVE_MAX.  */
	  emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
				   GET_MODE (loop_count), 1,
				   loop_4x_vec_forward_label);

	  /* Store the last 4 * MOVE_MAX.  */
	  ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
				    ARRAY_SIZE (regs), regs, true);

	  emit_jump_insn (gen_jump (done_label));
	  emit_barrier ();

	  /* Copy backward with a 4 * MOVE_MAX loop.  */
	  emit_label (more_8x_vec_backward_label);

	  base_destreg = gen_reg_rtx (GET_MODE (destreg));
	  emit_move_insn (base_destreg, destreg);

	  /* Load the first 4 * MOVE_MAX, stored after the loop for the
	     same overlap reason as the forward case.  */
	  ix86_expand_load_movmem (src, srcreg, count_exp, mode,
				   ARRAY_SIZE (regs), regs, false);

	  /* Increment DESTREG and SRCREG by COUNT_EXP.  */
	  tmp = expand_simple_binop (GET_MODE (destreg), PLUS,
				     destreg, count_exp, nullptr, 1,
				     OPTAB_DIRECT);
	  if (tmp != destreg)
	    emit_move_insn (destreg, tmp);
	  tmp = expand_simple_binop (GET_MODE (srcreg), PLUS, srcreg,
				     count_exp, nullptr, 1, OPTAB_DIRECT);
	  if (tmp != srcreg)
	    emit_move_insn (srcreg, tmp);

	  /* The registers now point one past the end; bias the MEMs
	     back by one mode size so the loop reads/writes the tail.  */
	  srcmem = change_address (src, mode, srcreg);
	  destmem = change_address (dst, mode, destreg);
	  rtx step = GEN_INT (-GET_MODE_SIZE (mode));
	  srcmem = offset_address (srcmem, step, GET_MODE_SIZE (mode));
	  destmem = offset_address (destmem, step, GET_MODE_SIZE (mode));

	  rtx_code_label *loop_4x_vec_backward_label = gen_label_rtx ();
	  emit_label (loop_4x_vec_backward_label);

	  ix86_expand_n_move_movmem (destmem, srcmem, mode, 4, false);

	  /* Decrement LOOP_COUNT by 4 * MOVE_MAX.  */
	  tmp = expand_simple_binop (GET_MODE (loop_count), MINUS,
				     loop_count, delta, nullptr, 1,
				     OPTAB_DIRECT);
	  if (tmp != loop_count)
	    emit_move_insn (loop_count, tmp);

	  /* Decrement DESTREG and SRCREG by 4 * MOVE_MAX.  */
	  tmp = expand_simple_binop (GET_MODE (destreg), MINUS,
				     destreg, delta, nullptr, 1,
				     OPTAB_DIRECT);
	  if (tmp != destreg)
	    emit_move_insn (destreg, tmp);
	  tmp = expand_simple_binop (GET_MODE (srcreg), MINUS, srcreg,
				     delta, nullptr, 1, OPTAB_DIRECT);
	  if (tmp != srcreg)
	    emit_move_insn (srcreg, tmp);

	  /* Stop if LOOP_COUNT <= 4 * MOVE_MAX.  */
	  emit_cmp_and_jump_insns (loop_count, delta, GTU, nullptr,
				   GET_MODE (loop_count), 1,
				   loop_4x_vec_backward_label);

	  /* Store the first 4 * MOVE_MAX.  */
	  ix86_expand_store_movmem (dst, base_destreg, count_exp, mode,
				    ARRAY_SIZE (regs), regs, false);

	  emit_jump_insn (gen_jump (done_label));
	  emit_barrier ();
	}
    }

  emit_label (done_label);

  return true;
}
10750 :
/* Expand cmpstrn or memcmp using "repz cmpsb".

   RESULT receives the SImode comparison result (<0 / 0 / >0), SRC1 and
   SRC2 are the MEMs being compared, LENGTH is the byte count, ALIGN is
   the known alignment, and IS_CMPSTRN distinguishes strncmp (true) from
   memcmp (false).  Returns true if expanded inline, false to fall back
   to a library call.  */

bool
ix86_expand_cmpstrn_or_cmpmem (rtx result, rtx src1, rtx src2,
			       rtx length, rtx align, bool is_cmpstrn)
{
  /* Expand strncmp and memcmp only with -minline-all-stringops since
     "repz cmpsb" can be much slower than strncmp and memcmp functions
     implemented with vector instructions, see

     https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43052
   */
  if (!TARGET_INLINE_ALL_STRINGOPS)
    return false;

  /* Can't use this if the user has appropriated ecx, esi or edi.  */
  if (fixed_regs[CX_REG] || fixed_regs[SI_REG] || fixed_regs[DI_REG])
    return false;

  if (is_cmpstrn)
    {
      /* For strncmp, length is the maximum length, which can be larger
	 than actual string lengths.  We can expand the cmpstrn pattern
	 to "repz cmpsb" only if one of the strings is a constant so
	 that expand_builtin_strncmp() can write the length argument to
	 be the minimum of the const string length and the actual length
	 argument.  Otherwise, "repz cmpsb" may pass the 0 byte.  */
      tree t1 = MEM_EXPR (src1);
      tree t2 = MEM_EXPR (src2);
      if (!((t1 && TREE_CODE (t1) == MEM_REF
	     && TREE_CODE (TREE_OPERAND (t1, 0)) == ADDR_EXPR
	     && (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t1, 0), 0))
		 == STRING_CST))
	    || (t2 && TREE_CODE (t2) == MEM_REF
		&& TREE_CODE (TREE_OPERAND (t2, 0)) == ADDR_EXPR
		&& (TREE_CODE (TREE_OPERAND (TREE_OPERAND (t2, 0), 0))
		    == STRING_CST))))
	return false;
    }

  /* Force both addresses into registers (rep cmpsb needs them in
     %rsi/%rdi) and rewrite the MEMs to use the register addresses.  */
  rtx addr1 = copy_addr_to_reg (XEXP (src1, 0));
  rtx addr2 = copy_addr_to_reg (XEXP (src2, 0));
  if (addr1 != XEXP (src1, 0))
    src1 = replace_equiv_address_nv (src1, addr1);
  if (addr2 != XEXP (src2, 0))
    src2 = replace_equiv_address_nv (src2, addr2);

  /* NB: Make a copy of the data length to avoid changing the original
     data length by cmpstrnqi patterns.  */
  length = ix86_zero_extend_to_Pmode (length);
  rtx lengthreg = gen_reg_rtx (Pmode);
  emit_move_insn (lengthreg, length);

  /* If we are testing strict equality, we can use known alignment to
     good advantage.  This may be possible with combine, particularly
     once cc0 is dead.  */
  if (CONST_INT_P (length))
    {
      if (length == const0_rtx)
	{
	  /* Zero-length comparison: trivially equal.  */
	  emit_move_insn (result, const0_rtx);
	  return true;
	}
      emit_insn (gen_cmpstrnqi_nz_1 (addr1, addr2, lengthreg, align,
				     src1, src2));
    }
  else
    {
      /* Variable length may be zero; the _1 pattern tests the count
	 first, so set the flags from LENGTHREG before it.  */
      emit_insn (gen_cmp_1 (Pmode, lengthreg, lengthreg));
      emit_insn (gen_cmpstrnqi_1 (addr1, addr2, lengthreg, align,
				  src1, src2));
    }

  /* Convert the flag result of the cmps into -1/0/1 in RESULT.  */
  rtx out = gen_lowpart (QImode, result);
  emit_insn (gen_cmpintqi (out));
  emit_move_insn (result, gen_rtx_SIGN_EXTEND (SImode, out));

  return true;
}
10830 :
/* Expand the appropriate insns for doing strlen if not just doing
   repnz; scasb

   out = result, initialized with the start address
   align_rtx = alignment of the address.
   scratch = scratch register, initialized with the startaddress when
	not aligned, otherwise undefined

   On return OUT points at the terminating zero byte (like memchr);
   the caller computes the length by subtracting the start address.

   This is just the body.  It needs the initializations mentioned above
   and some address computing at the end.  These things are done in
   i386.md.  */

static void
ix86_expand_strlensi_unroll_1 (rtx out, rtx src, rtx align_rtx)
{
  int align;
  rtx tmp;
  rtx_code_label *align_2_label = NULL;
  rtx_code_label *align_3_label = NULL;
  rtx_code_label *align_4_label = gen_label_rtx ();
  rtx_code_label *end_0_label = gen_label_rtx ();
  rtx mem;
  rtx tmpreg = gen_reg_rtx (SImode);
  rtx scratch = gen_reg_rtx (SImode);
  rtx cmp;

  align = 0;
  if (CONST_INT_P (align_rtx))
    align = INTVAL (align_rtx);

  /* Loop to check 1..3 bytes for null to get an aligned pointer.  */

  /* Is there a known alignment and is it less than 4?  */
  if (align < 4)
    {
      rtx scratch1 = gen_reg_rtx (Pmode);
      emit_move_insn (scratch1, out);
      /* Is there a known alignment and is it not 2? */
      if (align != 2)
	{
	  align_3_label = gen_label_rtx (); /* Label when aligned to 3-byte */
	  align_2_label = gen_label_rtx (); /* Label when aligned to 2-byte */

	  /* Leave just the 3 lower bits.  */
	  align_rtx = expand_binop (Pmode, and_optab, scratch1, GEN_INT (3),
				    NULL_RTX, 0, OPTAB_WIDEN);

	  /* Dispatch on addr & 3: 0 -> already aligned, 2 -> two bytes
	     to check, 3 -> one byte, 1 -> falls through to three.  */
	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, EQ, NULL,
				   Pmode, 1, align_2_label);
	  emit_cmp_and_jump_insns (align_rtx, const2_rtx, GTU, NULL,
				   Pmode, 1, align_3_label);
	}
      else
	{
	  /* Since the alignment is 2, we have to check 2 or 0 bytes;
	     check if is aligned to 4 - byte.  */

	  align_rtx = expand_binop (Pmode, and_optab, scratch1, const2_rtx,
				    NULL_RTX, 0, OPTAB_WIDEN);

	  emit_cmp_and_jump_insns (align_rtx, const0_rtx, EQ, NULL,
				   Pmode, 1, align_4_label);
	}

      /* MEM is re-read through OUT after each increment below, so one
	 QImode MEM serves all three byte checks.  */
      mem = change_address (src, QImode, out);

      /* Now compare the bytes.  */

      /* Compare the first n unaligned byte on a byte per byte basis.  */
      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL,
			       QImode, 1, end_0_label);

      /* Increment the address.  */
      emit_insn (gen_add2_insn (out, const1_rtx));

      /* Not needed with an alignment of 2 */
      if (align != 2)
	{
	  emit_label (align_2_label);

	  emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
				   end_0_label);

	  emit_insn (gen_add2_insn (out, const1_rtx));

	  emit_label (align_3_label);
	}

      emit_cmp_and_jump_insns (mem, const0_rtx, EQ, NULL, QImode, 1,
			       end_0_label);

      emit_insn (gen_add2_insn (out, const1_rtx));
    }

  /* Generate loop to check 4 bytes at a time.  It is not a good idea to
     align this loop.  It gives only huge programs, but does not help to
     speed up.  */
  emit_label (align_4_label);

  mem = change_address (src, SImode, out);
  emit_move_insn (scratch, mem);
  emit_insn (gen_add2_insn (out, GEN_INT (4)));

  /* This formula yields a nonzero result iff one of the bytes is zero.
     This saves three branches inside loop and many cycles.
     Classic trick: (x - 0x01010101) & ~x & 0x80808080 sets the high
     bit of each byte of X that is zero.  */

  emit_insn (gen_addsi3 (tmpreg, scratch, GEN_INT (-0x01010101)));
  emit_insn (gen_one_cmplsi2 (scratch, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg, scratch));
  emit_insn (gen_andsi3 (tmpreg, tmpreg,
			 gen_int_mode (0x80808080, SImode)));
  emit_cmp_and_jump_insns (tmpreg, const0_rtx, EQ, 0, SImode, 1,
			   align_4_label);

  /* A zero byte was found somewhere in the 4 bytes held in TMPREG;
     narrow it down to the exact byte without branches if possible.  */
  if (TARGET_CMOVE)
    {
      rtx reg = gen_reg_rtx (SImode);
      rtx reg2 = gen_reg_rtx (Pmode);
      emit_move_insn (reg, tmpreg);
      emit_insn (gen_lshrsi3 (reg, reg, GEN_INT (16)));

      /* If zero is not in the first two bytes, move two bytes forward.  */
      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (tmpreg,
			      gen_rtx_IF_THEN_ELSE (SImode, tmp,
						    reg,
						    tmpreg)));
      /* Emit lea manually to avoid clobbering of flags.  */
      emit_insn (gen_rtx_SET (reg2, plus_constant (Pmode, out, 2)));

      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_EQ (VOIDmode, tmp, const0_rtx);
      emit_insn (gen_rtx_SET (out,
			      gen_rtx_IF_THEN_ELSE (Pmode, tmp,
						    reg2,
						    out)));
    }
  else
    {
      rtx_code_label *end_2_label = gen_label_rtx ();
      /* Is zero in the first two bytes? */

      emit_insn (gen_testsi_ccno_1 (tmpreg, GEN_INT (0x8080)));
      tmp = gen_rtx_REG (CCNOmode, FLAGS_REG);
      tmp = gen_rtx_NE (VOIDmode, tmp, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, end_2_label),
				  pc_rtx);
      tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
      JUMP_LABEL (tmp) = end_2_label;

      /* Not in the first two.  Move two bytes forward.  */
      emit_insn (gen_lshrsi3 (tmpreg, tmpreg, GEN_INT (16)));
      emit_insn (gen_add2_insn (out, const2_rtx));

      emit_label (end_2_label);

    }

  /* Avoid branch in fixing the byte.  OUT overshot by 4 in the loop;
     the add sets carry from the low byte's zero-marker bit and the
     sbb-style subtract then removes either 3 or 4.  */
  tmpreg = gen_lowpart (QImode, tmpreg);
  emit_insn (gen_addqi3_cconly_overflow (tmpreg, tmpreg));
  tmp = gen_rtx_REG (CCmode, FLAGS_REG);
  cmp = gen_rtx_LTU (VOIDmode, tmp, const0_rtx);
  emit_insn (gen_sub3_carry (Pmode, out, out, GEN_INT (3), tmp, cmp));

  emit_label (end_0_label);
}
11002 :
11003 : /* Expand strlen. */
11004 :
11005 : bool
11006 13626 : ix86_expand_strlen (rtx out, rtx src, rtx eoschar, rtx align)
11007 : {
11008 13626 : if (TARGET_UNROLL_STRLEN
11009 13626 : && TARGET_INLINE_ALL_STRINGOPS
11010 11 : && eoschar == const0_rtx
11011 11 : && optimize > 1)
11012 : {
11013 : /* The generic case of strlen expander is long. Avoid it's
11014 : expanding unless TARGET_INLINE_ALL_STRINGOPS. */
11015 15 : rtx addr = force_reg (Pmode, XEXP (src, 0));
11016 : /* Well it seems that some optimizer does not combine a call like
11017 : foo(strlen(bar), strlen(bar));
11018 : when the move and the subtraction is done here. It does calculate
11019 : the length just once when these instructions are done inside of
11020 : output_strlen_unroll(). But I think since &bar[strlen(bar)] is
11021 : often used and I use one fewer register for the lifetime of
11022 : output_strlen_unroll() this is better. */
11023 :
11024 11 : emit_move_insn (out, addr);
11025 :
11026 11 : ix86_expand_strlensi_unroll_1 (out, src, align);
11027 :
11028 : /* strlensi_unroll_1 returns the address of the zero at the end of
11029 : the string, like memchr(), so compute the length by subtracting
11030 : the start address. */
11031 11 : emit_insn (gen_sub2_insn (out, addr));
11032 11 : return true;
11033 : }
11034 : else
11035 : return false;
11036 : }
11037 :
11038 : /* For given symbol (function) construct code to compute address of it's PLT
11039 : entry in large x86-64 PIC model. */
11040 :
11041 : static rtx
11042 31 : construct_plt_address (rtx symbol)
11043 : {
11044 31 : rtx tmp, unspec;
11045 :
11046 31 : gcc_assert (SYMBOL_REF_P (symbol));
11047 31 : gcc_assert (ix86_cmodel == CM_LARGE_PIC && !TARGET_PECOFF);
11048 31 : gcc_assert (Pmode == DImode);
11049 :
11050 31 : tmp = gen_reg_rtx (Pmode);
11051 31 : unspec = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, symbol), UNSPEC_PLTOFF);
11052 :
11053 31 : emit_move_insn (tmp, gen_rtx_CONST (Pmode, unspec));
11054 31 : emit_insn (gen_add2_insn (tmp, pic_offset_table_rtx));
11055 31 : return tmp;
11056 : }
11057 :
/* Additional registers that are clobbered by SYSV calls.
   NOTE(review): judging by the name, these are registers the MS ABI
   treats as call-saved but the SYSV ABI clobbers — confirm against the
   ABI handling in the callers of this table.  */

static int const x86_64_ms_sysv_extra_clobbered_registers
  [NUM_X86_64_MS_CLOBBERED_REGS] =
{
  SI_REG, DI_REG,
  XMM6_REG, XMM7_REG,
  XMM8_REG, XMM9_REG, XMM10_REG, XMM11_REG,
  XMM12_REG, XMM13_REG, XMM14_REG, XMM15_REG
};
11068 :
11069 : rtx_insn *
11070 6195436 : ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
11071 : rtx callarg2,
11072 : rtx pop, bool sibcall)
11073 : {
11074 6195436 : rtx vec[3];
11075 6195436 : rtx use = NULL, call;
11076 6195436 : unsigned int vec_len = 0;
11077 6195436 : tree fndecl;
11078 6195436 : bool call_no_callee_saved_registers = false;
11079 :
11080 6195436 : if (SYMBOL_REF_P (XEXP (fnaddr, 0)))
11081 : {
11082 6007221 : fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
11083 6007221 : if (fndecl)
11084 : {
11085 5747305 : if (lookup_attribute ("interrupt",
11086 5747305 : TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))
11087 1 : error ("interrupt service routine cannot be called directly");
11088 5747304 : else if (ix86_type_no_callee_saved_registers_p (TREE_TYPE (fndecl)))
11089 5747305 : call_no_callee_saved_registers = true;
11090 5747305 : if (fndecl == current_function_decl
11091 5747305 : && decl_binds_to_current_def_p (fndecl))
11092 11092 : cfun->machine->recursive_function = true;
11093 : }
11094 : }
11095 : else
11096 : {
11097 188215 : if (MEM_P (fnaddr))
11098 : {
11099 188215 : tree mem_expr = MEM_EXPR (fnaddr);
11100 188215 : if (mem_expr != nullptr
11101 188170 : && TREE_CODE (mem_expr) == MEM_REF
11102 376385 : && ix86_type_no_callee_saved_registers_p (TREE_TYPE (mem_expr)))
11103 : call_no_callee_saved_registers = true;
11104 : }
11105 :
11106 : fndecl = NULL_TREE;
11107 : }
11108 :
11109 6195436 : if (pop == const0_rtx)
11110 0 : pop = NULL;
11111 6195436 : gcc_assert (!TARGET_64BIT || !pop);
11112 :
11113 6195436 : rtx addr = XEXP (fnaddr, 0);
11114 6195436 : if (TARGET_MACHO && !TARGET_64BIT)
11115 : {
11116 : #if TARGET_MACHO
11117 : if (flag_pic && SYMBOL_REF_P (XEXP (fnaddr, 0)))
11118 : fnaddr = machopic_indirect_call_target (fnaddr);
11119 : #endif
11120 : }
11121 : else
11122 : {
11123 : /* Static functions and indirect calls don't need the pic register. Also,
11124 : check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
11125 : it an indirect call. */
11126 6195436 : if (flag_pic
11127 522294 : && SYMBOL_REF_P (addr)
11128 6691305 : && ix86_call_use_plt_p (addr))
11129 : {
11130 396494 : if (flag_plt
11131 396494 : && (SYMBOL_REF_DECL (addr) == NULL_TREE
11132 396460 : || !lookup_attribute ("noplt",
11133 396460 : DECL_ATTRIBUTES (SYMBOL_REF_DECL (addr)))))
11134 : {
11135 396459 : if (!TARGET_64BIT
11136 219199 : || (ix86_cmodel == CM_LARGE_PIC
11137 : && DEFAULT_ABI != MS_ABI))
11138 : {
11139 531811 : use_reg (&use, gen_rtx_REG (Pmode,
11140 : REAL_PIC_OFFSET_TABLE_REGNUM));
11141 177291 : if (ix86_use_pseudo_pic_reg ())
11142 354551 : emit_move_insn (gen_rtx_REG (Pmode,
11143 177291 : REAL_PIC_OFFSET_TABLE_REGNUM),
11144 : pic_offset_table_rtx);
11145 : }
11146 : }
11147 35 : else if (!TARGET_PECOFF && !TARGET_MACHO)
11148 : {
11149 35 : if (TARGET_64BIT
11150 35 : && ix86_cmodel == CM_LARGE_PIC
11151 : && DEFAULT_ABI != MS_ABI)
11152 : {
11153 1 : fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
11154 : UNSPEC_GOT);
11155 1 : fnaddr = gen_rtx_CONST (Pmode, fnaddr);
11156 1 : fnaddr = force_reg (Pmode, fnaddr);
11157 1 : fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx, fnaddr);
11158 : }
11159 34 : else if (TARGET_64BIT)
11160 : {
11161 38 : fnaddr = gen_rtx_UNSPEC (Pmode,
11162 : gen_rtvec (1, addr),
11163 : UNSPEC_GOTPCREL);
11164 38 : fnaddr = gen_rtx_CONST (Pmode, fnaddr);
11165 : }
11166 : else
11167 : {
11168 0 : fnaddr = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, addr),
11169 : UNSPEC_GOT);
11170 0 : fnaddr = gen_rtx_CONST (Pmode, fnaddr);
11171 0 : fnaddr = gen_rtx_PLUS (Pmode, pic_offset_table_rtx,
11172 : fnaddr);
11173 : }
11174 39 : fnaddr = gen_const_mem (Pmode, fnaddr);
11175 : /* Pmode may not be the same as word_mode for x32, which
11176 : doesn't support indirect branch via 32-bit memory slot.
11177 : Since x32 GOT slot is 64 bit with zero upper 32 bits,
11178 : indirect branch via x32 GOT slot is OK. */
11179 35 : if (GET_MODE (fnaddr) != word_mode)
11180 4 : fnaddr = gen_rtx_ZERO_EXTEND (word_mode, fnaddr);
11181 35 : fnaddr = gen_rtx_MEM (QImode, fnaddr);
11182 : }
11183 : }
11184 : }
11185 :
11186 : /* Skip setting up RAX register for -mskip-rax-setup when there are no
11187 : parameters passed in vector registers. */
11188 6195436 : if (TARGET_64BIT
11189 5357500 : && (INTVAL (callarg2) > 0
11190 5296340 : || (INTVAL (callarg2) == 0
11191 316879 : && (TARGET_SSE || !flag_skip_rax_setup))))
11192 : {
11193 378037 : rtx al = gen_rtx_REG (QImode, AX_REG);
11194 378037 : emit_move_insn (al, callarg2);
11195 378037 : use_reg (&use, al);
11196 : }
11197 :
11198 6195436 : if (ix86_cmodel == CM_LARGE_PIC
11199 : && !TARGET_PECOFF
11200 41 : && MEM_P (fnaddr)
11201 41 : && SYMBOL_REF_P (XEXP (fnaddr, 0))
11202 6195469 : && !local_symbolic_operand (XEXP (fnaddr, 0), VOIDmode))
11203 31 : fnaddr = gen_rtx_MEM (QImode, construct_plt_address (XEXP (fnaddr, 0)));
11204 : /* Since x32 GOT slot is 64 bit with zero upper 32 bits, indirect
11205 : branch via x32 GOT slot is OK. */
11206 6195405 : else if (TARGET_X32
11207 74 : && MEM_P (fnaddr)
11208 74 : && GET_CODE (XEXP (fnaddr, 0)) == ZERO_EXTEND
11209 8 : && GOT_memory_operand (XEXP (XEXP (fnaddr, 0), 0), Pmode)
11210 6195409 : && !TARGET_INDIRECT_BRANCH_REGISTER)
11211 : ;
11212 6195405 : else if (sibcall
11213 6195405 : ? !sibcall_insn_operand (XEXP (fnaddr, 0), word_mode)
11214 6066056 : : !call_insn_operand (XEXP (fnaddr, 0), word_mode))
11215 : {
11216 531 : fnaddr = convert_to_mode (word_mode, XEXP (fnaddr, 0), 1);
11217 531 : fnaddr = gen_rtx_MEM (QImode, copy_to_mode_reg (word_mode, fnaddr));
11218 : }
11219 :
11220 : /* PR100665: Hwasan may tag code pointer which is not supported by LAM,
11221 : mask off code pointers here.
11222 : TODO: also need to handle indirect jump. */
11223 6196474 : if (ix86_memtag_can_tag_addresses () && !fndecl
11224 6195460 : && sanitize_flags_p (SANITIZE_HWADDRESS))
11225 : {
11226 24 : rtx untagged_addr = ix86_memtag_untagged_pointer (XEXP (fnaddr, 0),
11227 : NULL_RTX);
11228 24 : fnaddr = gen_rtx_MEM (QImode, untagged_addr);
11229 : }
11230 :
11231 6195436 : call = gen_rtx_CALL (VOIDmode, fnaddr, callarg1);
11232 :
11233 6195436 : if (retval)
11234 2451605 : call = gen_rtx_SET (retval, call);
11235 6195436 : vec[vec_len++] = call;
11236 :
11237 6195436 : if (pop)
11238 : {
11239 449838 : pop = gen_rtx_PLUS (Pmode, stack_pointer_rtx, pop);
11240 224919 : pop = gen_rtx_SET (stack_pointer_rtx, pop);
11241 224919 : vec[vec_len++] = pop;
11242 : }
11243 :
11244 6195436 : static const char ix86_call_used_regs[] = CALL_USED_REGISTERS;
11245 :
11246 6195436 : if ((cfun->machine->call_saved_registers
11247 6195436 : == TYPE_NO_CALLER_SAVED_REGISTERS)
11248 6195436 : && (!fndecl
11249 468 : || (!TREE_THIS_VOLATILE (fndecl)
11250 186 : && !lookup_attribute ("no_caller_saved_registers",
11251 186 : TYPE_ATTRIBUTES (TREE_TYPE (fndecl))))))
11252 : {
11253 182 : bool is_64bit_ms_abi = (TARGET_64BIT
11254 182 : && ix86_function_abi (fndecl) == MS_ABI);
11255 182 : char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
11256 :
11257 : /* If there are no caller-saved registers, add all registers
11258 : that are clobbered by the call which returns. */
11259 16926 : for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
11260 16744 : if (!fixed_regs[i]
11261 3242 : && (ix86_call_used_regs[i] == 1
11262 1506 : || (ix86_call_used_regs[i] & c_mask))
11263 2150 : && !STACK_REGNO_P (i)
11264 2150 : && !MMX_REGNO_P (i))
11265 2150 : clobber_reg (&use,
11266 2150 : gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
11267 : }
11268 5357318 : else if (TARGET_64BIT_MS_ABI
11269 6268657 : && (!callarg2 || INTVAL (callarg2) != -2))
11270 : {
11271 : unsigned i;
11272 :
11273 861718 : for (i = 0; i < NUM_X86_64_MS_CLOBBERED_REGS; i++)
11274 : {
11275 795432 : int regno = x86_64_ms_sysv_extra_clobbered_registers[i];
11276 795432 : machine_mode mode = SSE_REGNO_P (regno) ? TImode : DImode;
11277 :
11278 795432 : clobber_reg (&use, gen_rtx_REG (mode, regno));
11279 : }
11280 :
11281 : /* Set here, but it may get cleared later. */
11282 66286 : if (TARGET_CALL_MS2SYSV_XLOGUES)
11283 : {
11284 7046 : if (!TARGET_SSE)
11285 : ;
11286 :
11287 : /* Don't break hot-patched functions. */
11288 7046 : else if (ix86_function_ms_hook_prologue (current_function_decl))
11289 : ;
11290 :
11291 : /* TODO: Cases not yet examined. */
11292 7046 : else if (flag_split_stack)
11293 0 : warn_once_call_ms2sysv_xlogues ("-fsplit-stack");
11294 :
11295 : else
11296 : {
11297 7046 : gcc_assert (!reload_completed);
11298 7046 : cfun->machine->call_ms2sysv = true;
11299 : }
11300 : }
11301 : }
11302 :
11303 6195436 : if (TARGET_MACHO && TARGET_64BIT && !sibcall
11304 : && ((SYMBOL_REF_P (addr) && !SYMBOL_REF_LOCAL_P (addr))
11305 : || !fndecl || TREE_PUBLIC (fndecl)))
11306 : {
11307 : /* We allow public functions defined in a TU to bind locally for PIC
11308 : code (the default) on 64bit Mach-O.
11309 : If such functions are not inlined, we cannot tell at compile-time if
11310 : they will be called via the lazy symbol resolver (this can depend on
11311 : options given at link-time). Therefore, we must assume that the lazy
11312 : resolver could be used which clobbers R11 and R10. */
11313 : clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
11314 : clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
11315 : }
11316 :
11317 6195436 : if (call_no_callee_saved_registers)
11318 : {
11319 : /* After calling a no_callee_saved_registers function, all
11320 : registers may be clobbered. Clobber all registers that are
11321 : not used by the callee. */
11322 59 : bool is_64bit_ms_abi = (TARGET_64BIT
11323 59 : && ix86_function_abi (fndecl) == MS_ABI);
11324 59 : char c_mask = CALL_USED_REGISTERS_MASK (is_64bit_ms_abi);
11325 5487 : for (int i = 0; i < FIRST_PSEUDO_REGISTER; i++)
11326 5428 : if (!fixed_regs[i]
11327 2597 : && i != HARD_FRAME_POINTER_REGNUM
11328 2538 : && !(ix86_call_used_regs[i] == 1
11329 973 : || (ix86_call_used_regs[i] & c_mask))
11330 295 : && !STACK_REGNO_P (i)
11331 295 : && !MMX_REGNO_P (i))
11332 295 : clobber_reg (&use,
11333 295 : gen_rtx_REG (GET_MODE (regno_reg_rtx[i]), i));
11334 : }
11335 :
11336 6195436 : if (vec_len > 1)
11337 224919 : call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
11338 6195436 : rtx_insn *call_insn = emit_call_insn (call);
11339 6195436 : if (use)
11340 595756 : CALL_INSN_FUNCTION_USAGE (call_insn) = use;
11341 :
11342 6195436 : return call_insn;
11343 : }
11344 :
/* Split a simple return that pops POPC bytes from the stack into an
   indirect branch plus an explicit stack adjustment.  */
11347 :
void
ix86_split_simple_return_pop_internal (rtx popc)
{
  struct machine_function *m = cfun->machine;
  rtx ecx = gen_rtx_REG (SImode, CX_REG);
  rtx_insn *insn;

  /* There is no "pascal" calling convention in any 64bit ABI.  */
  gcc_assert (!TARGET_64BIT);

  /* Pop the return address off the stack into %ecx and keep the
     frame-state bookkeeping in sync.  */
  insn = emit_insn (gen_pop (ecx));
  m->fs.cfa_offset -= UNITS_PER_WORD;
  m->fs.sp_offset -= UNITS_PER_WORD;

  /* Attach CFI notes: the CFA moved by one word and the return address
     now lives in %ecx, so the unwinder can still find it.  */
  rtx x = plus_constant (Pmode, stack_pointer_rtx, UNITS_PER_WORD);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  add_reg_note (insn, REG_CFA_REGISTER, gen_rtx_SET (ecx, pc_rtx));
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Release the POPC bytes of on-stack arguments, again with a CFA
     adjustment note.  */
  x = gen_rtx_PLUS (Pmode, stack_pointer_rtx, popc);
  x = gen_rtx_SET (stack_pointer_rtx, x);
  insn = emit_insn (x);
  add_reg_note (insn, REG_CFA_ADJUST_CFA, x);
  RTX_FRAME_RELATED_P (insn) = 1;

  /* Now return address is in ECX.  */
  emit_jump_insn (gen_simple_return_indirect_internal (ecx));
}
11377 :
11378 : /* Errors in the source file can cause expand_expr to return const0_rtx
11379 : where we expect a vector. To avoid crashing, use one of the vector
11380 : clear instructions. */
11381 :
11382 : static rtx
11383 196132 : safe_vector_operand (rtx x, machine_mode mode)
11384 : {
11385 0 : if (x == const0_rtx)
11386 0 : x = CONST0_RTX (mode);
11387 24 : return x;
11388 : }
11389 :
11390 : /* Subroutine of ix86_expand_builtin to take care of binop insns. */
11391 :
static rtx
ix86_expand_binop_builtin (enum insn_code icode, tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  /* Result and input modes the selected insn pattern expects.  */
  machine_mode tmode = insn_data[icode].operand[0].mode;
  machine_mode mode0 = insn_data[icode].operand[1].mode;
  machine_mode mode1 = insn_data[icode].operand[2].mode;

  /* Guard against error-recovery const0_rtx standing in for a vector.  */
  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Reuse TARGET only when it already has the right mode and satisfies
     the destination predicate; otherwise grab a fresh pseudo.  */
  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  /* An SImode value where the insn wants TImode: load it into the low
     element of a V4SI register and reinterpret that as TImode.  */
  if (GET_MODE (op1) == SImode && mode1 == TImode)
    {
      rtx x = gen_reg_rtx (V4SImode);
      emit_insn (gen_sse2_loadd (x, op1));
      op1 = gen_lowpart (TImode, x);
    }

  /* Force operands into registers when they fail the insn's operand
     predicates.  */
  if (!insn_data[icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if (!insn_data[icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (icode) (target, op0, op1);
  if (! pat)
    return 0;

  emit_insn (pat);

  return target;
}
11434 :
11435 : /* Subroutine of ix86_expand_builtin to take care of 2-4 argument insns. */
11436 :
static rtx
ix86_expand_multi_arg_builtin (enum insn_code icode, tree exp, rtx target,
			       enum ix86_builtin_func_type m_type,
			       enum rtx_code sub_code)
{
  rtx pat;
  unsigned int i, nargs;
  bool comparison_p = false;
  bool tf_p = false;
  bool last_arg_constant = false;
  int num_memory = 0;
  rtx xops[4];

  machine_mode tmode = insn_data[icode].operand[0].mode;

  /* Decode the builtin's function type: how many source operands there
     are, whether the last one must be an immediate, whether the insn is
     a comparison (COMPARISON_P), and whether SUB_CODE is passed through
     as a trailing immediate (TF_P).  */
  switch (m_type)
    {
    case MULTI_ARG_4_DF2_DI_I:
    case MULTI_ARG_4_DF2_DI_I1:
    case MULTI_ARG_4_SF2_SI_I:
    case MULTI_ARG_4_SF2_SI_I1:
      nargs = 4;
      last_arg_constant = true;
      break;

    case MULTI_ARG_3_SF:
    case MULTI_ARG_3_DF:
    case MULTI_ARG_3_SF2:
    case MULTI_ARG_3_DF2:
    case MULTI_ARG_3_DI:
    case MULTI_ARG_3_SI:
    case MULTI_ARG_3_SI_DI:
    case MULTI_ARG_3_HI:
    case MULTI_ARG_3_HI_SI:
    case MULTI_ARG_3_QI:
    case MULTI_ARG_3_DI2:
    case MULTI_ARG_3_SI2:
    case MULTI_ARG_3_HI2:
    case MULTI_ARG_3_QI2:
      nargs = 3;
      break;

    case MULTI_ARG_2_SF:
    case MULTI_ARG_2_DF:
    case MULTI_ARG_2_DI:
    case MULTI_ARG_2_SI:
    case MULTI_ARG_2_HI:
    case MULTI_ARG_2_QI:
      nargs = 2;
      break;

    case MULTI_ARG_2_DI_IMM:
    case MULTI_ARG_2_SI_IMM:
    case MULTI_ARG_2_HI_IMM:
    case MULTI_ARG_2_QI_IMM:
      nargs = 2;
      last_arg_constant = true;
      break;

    case MULTI_ARG_1_SF:
    case MULTI_ARG_1_DF:
    case MULTI_ARG_1_SF2:
    case MULTI_ARG_1_DF2:
    case MULTI_ARG_1_DI:
    case MULTI_ARG_1_SI:
    case MULTI_ARG_1_HI:
    case MULTI_ARG_1_QI:
    case MULTI_ARG_1_SI_DI:
    case MULTI_ARG_1_HI_DI:
    case MULTI_ARG_1_HI_SI:
    case MULTI_ARG_1_QI_DI:
    case MULTI_ARG_1_QI_SI:
    case MULTI_ARG_1_QI_HI:
      nargs = 1;
      break;

    case MULTI_ARG_2_DI_CMP:
    case MULTI_ARG_2_SI_CMP:
    case MULTI_ARG_2_HI_CMP:
    case MULTI_ARG_2_QI_CMP:
      nargs = 2;
      comparison_p = true;
      break;

    case MULTI_ARG_2_SF_TF:
    case MULTI_ARG_2_DF_TF:
    case MULTI_ARG_2_DI_TF:
    case MULTI_ARG_2_SI_TF:
    case MULTI_ARG_2_HI_TF:
    case MULTI_ARG_2_QI_TF:
      nargs = 2;
      tf_p = true;
      break;

    default:
      gcc_unreachable ();
    }

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);
  else if (memory_operand (target, tmode))
    num_memory++;

  gcc_assert (nargs <= ARRAY_SIZE (xops));

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = expand_normal (arg);
      /* For comparison insns, pattern operand 1 is the comparison rtx
	 itself, so the real inputs start one slot later.  */
      int adjust = (comparison_p) ? 1 : 0;
      machine_mode mode = insn_data[icode].operand[i+adjust+1].mode;

      if (last_arg_constant && i == nargs - 1)
	{
	  if (!insn_data[icode].operand[i + 1].predicate (op, mode))
	    {
	      enum insn_code new_icode = icode;
	      switch (icode)
		{
		case CODE_FOR_xop_vpermil2v2df3:
		case CODE_FOR_xop_vpermil2v4sf3:
		case CODE_FOR_xop_vpermil2v4df3:
		case CODE_FOR_xop_vpermil2v8sf3:
		  error ("the last argument must be a 2-bit immediate");
		  return gen_reg_rtx (tmode);
		case CODE_FOR_xop_rotlv2di3:
		  new_icode = CODE_FOR_rotlv2di3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv4si3:
		  new_icode = CODE_FOR_rotlv4si3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv8hi3:
		  new_icode = CODE_FOR_rotlv8hi3;
		  goto xop_rotl;
		case CODE_FOR_xop_rotlv16qi3:
		  new_icode = CODE_FOR_rotlv16qi3;
		xop_rotl:
		  if (CONST_INT_P (op))
		    {
		      /* Reduce an out-of-range constant rotate count
			 modulo the element width.  */
		      int mask = GET_MODE_UNIT_BITSIZE (tmode) - 1;
		      op = GEN_INT (INTVAL (op) & mask);
		      gcc_checking_assert
			(insn_data[icode].operand[i + 1].predicate (op, mode));
		    }
		  else
		    {
		      /* Variable rotate count: fall back to the generic
			 rotate pattern, which must be operand-layout
			 compatible with the XOP one.  */
		      gcc_checking_assert
			(nargs == 2
			 && insn_data[new_icode].operand[0].mode == tmode
			 && insn_data[new_icode].operand[1].mode == tmode
			 && insn_data[new_icode].operand[2].mode == mode
			 && insn_data[new_icode].operand[0].predicate
			    == insn_data[icode].operand[0].predicate
			 && insn_data[new_icode].operand[1].predicate
			    == insn_data[icode].operand[1].predicate);
		      icode = new_icode;
		      goto non_constant;
		    }
		  break;
		default:
		  gcc_unreachable ();
		}
	    }
	}
      else
	{
	non_constant:
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  /* If we aren't optimizing, only allow one memory operand to be
	     generated.  */
	  if (memory_operand (op, mode))
	    num_memory++;

	  gcc_assert (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode);

	  if (optimize
	      || !insn_data[icode].operand[i+adjust+1].predicate (op, mode)
	      || num_memory > 1)
	    op = force_reg (mode, op);
	}

      xops[i] = op;
    }

  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;

    case 2:
      if (tf_p)
	pat = GEN_FCN (icode) (target, xops[0], xops[1],
			       GEN_INT ((int)sub_code));
      else if (! comparison_p)
	pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      else
	{
	  rtx cmp_op = gen_rtx_fmt_ee (sub_code, GET_MODE (target),
				       xops[0], xops[1]);

	  pat = GEN_FCN (icode) (target, cmp_op, xops[0], xops[1]);
	}
      break;

    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;

    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
      break;

    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  return target;
}
11664 :
11665 : /* Subroutine of ix86_expand_args_builtin to take care of scalar unop
11666 : insns with vec_merge. */
11667 :
11668 : static rtx
11669 52 : ix86_expand_unop_vec_merge_builtin (enum insn_code icode, tree exp,
11670 : rtx target)
11671 : {
11672 52 : rtx pat;
11673 52 : tree arg0 = CALL_EXPR_ARG (exp, 0);
11674 52 : rtx op1, op0 = expand_normal (arg0);
11675 52 : machine_mode tmode = insn_data[icode].operand[0].mode;
11676 52 : machine_mode mode0 = insn_data[icode].operand[1].mode;
11677 :
11678 16 : if (optimize || !target
11679 16 : || GET_MODE (target) != tmode
11680 68 : || !insn_data[icode].operand[0].predicate (target, tmode))
11681 36 : target = gen_reg_rtx (tmode);
11682 :
11683 52 : if (VECTOR_MODE_P (mode0))
11684 52 : op0 = safe_vector_operand (op0, mode0);
11685 :
11686 36 : if ((optimize && !register_operand (op0, mode0))
11687 88 : || !insn_data[icode].operand[1].predicate (op0, mode0))
11688 0 : op0 = copy_to_mode_reg (mode0, op0);
11689 :
11690 52 : op1 = op0;
11691 52 : if (!insn_data[icode].operand[2].predicate (op1, mode0))
11692 16 : op1 = copy_to_mode_reg (mode0, op1);
11693 :
11694 52 : pat = GEN_FCN (icode) (target, op0, op1);
11695 52 : if (! pat)
11696 : return 0;
11697 52 : emit_insn (pat);
11698 52 : return target;
11699 : }
11700 :
11701 : /* Subroutine of ix86_expand_builtin to take care of comparison insns. */
11702 :
static rtx
ix86_expand_sse_compare (const struct builtin_description *d,
			 tree exp, rtx target, bool swap)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2;
  machine_mode tmode = insn_data[d->icode].operand[0].mode;
  machine_mode mode0 = insn_data[d->icode].operand[1].mode;
  machine_mode mode1 = insn_data[d->icode].operand[2].mode;
  /* The comparison code for this builtin is recorded in D.  */
  enum rtx_code comparison = d->comparison;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Swap operands if we have a comparison that isn't available in
     hardware.  */
  if (swap)
    std::swap (op0, op1);

  if (optimize || !target
      || GET_MODE (target) != tmode
      || !insn_data[d->icode].operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  /* Make each input operand acceptable to the insn's predicates;
     when optimizing, also force non-register operands into registers.  */
  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[1].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[2].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  /* Operand 3 of the pattern is the comparison rtx itself.  */
  op2 = gen_rtx_fmt_ee (comparison, mode0, op0, op1);
  pat = GEN_FCN (d->icode) (target, op0, op1, op2);
  if (! pat)
    return 0;
  emit_insn (pat);
  return target;
}
11747 :
11748 : /* Subroutine of ix86_sse_comi and ix86_sse_comi_round to take care of
11749 : * ordered EQ or unordered NE, generate PF jump. */
11750 :
static rtx
ix86_ssecom_setcc (const enum rtx_code comparison,
		   bool check_unordered, machine_mode mode,
		   rtx set_dst, rtx target)
{

  rtx_code_label *label = NULL;

  /* NB: For ordered EQ or unordered NE, check ZF alone isn't sufficient
     with NAN operands.
     Under TARGET_AVX10_2, VCOMX/VUCOMX are generated instead of
     COMI/UCOMI.  VCOMX/VUCOMX will not set ZF for NAN operands.  */
  if (check_unordered)
    {
      gcc_assert (comparison == EQ || comparison == NE);

      /* Emit a conditional jump over the setcc when the compare was
	 unordered (PF set), leaving TARGET at its preloaded value.  */
      rtx flag = gen_rtx_REG (CCFPmode, FLAGS_REG);
      label = gen_label_rtx ();
      rtx tmp = gen_rtx_fmt_ee (UNORDERED, VOIDmode, flag, const0_rtx);
      tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
				  gen_rtx_LABEL_REF (VOIDmode, label),
				  pc_rtx);
      emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
    }

  /* NB: Set CCFPmode and check a different CCmode which is in subset
     of CCFPmode.  */
  if (GET_MODE (set_dst) != mode)
    {
      gcc_assert (mode == CCAmode || mode == CCCmode
		  || mode == CCOmode || mode == CCPmode
		  || mode == CCSmode || mode == CCZmode);
      set_dst = gen_rtx_REG (mode, FLAGS_REG);
    }

  /* Store the flag comparison result into the low byte of TARGET.  */
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  set_dst,
					  const0_rtx)));

  if (label)
    emit_label (label);

  /* TARGET is a QImode subreg; hand back the underlying SImode reg.  */
  return SUBREG_REG (target);
}
11796 :
11797 : /* Subroutine of ix86_expand_builtin to take care of comi insns. */
11798 :
static rtx
ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
		      rtx target, bool comx_ok)
{
  rtx pat, set_dst;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = d->comparison;
  rtx const_val = const0_rtx;

  /* Normalize the comparison: LT/LE become GT/GE with swapped operands;
     EQ/NE use CCZmode and, without AVX10.2 COMX insns, need an extra
     unordered (PF) check in ix86_ssecom_setcc.  */
  bool check_unordered = false;
  machine_mode mode = CCFPmode;
  switch (comparison)
    {
    case LE:	/* -> GE  */
    case LT:	/* -> GT  */
      std::swap (op0, op1);
      comparison = swap_condition (comparison);
      /* FALLTHRU */
    case GT:
    case GE:
      break;
    case EQ:
      if (!TARGET_AVX10_2 || !comx_ok)
	check_unordered = true;
      mode = CCZmode;
      break;
    case NE:
      if (!TARGET_AVX10_2 || !comx_ok)
	check_unordered = true;
      mode = CCZmode;
      /* NE preloads 1 so the unordered path yields "not equal".  */
      const_val = const1_rtx;
      break;
    default:
      gcc_unreachable ();
    }

  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  /* With AVX10.2, use the VCOMX/VUCOMX forms for EQ/NE; they handle
     NAN operands without the extra PF check.  */
  if ((comparison == EQ || comparison == NE)
      && TARGET_AVX10_2 && comx_ok)
    {
      switch (icode)
	{
	case CODE_FOR_sse_comi:
	  icode = CODE_FOR_avx10_2_comxsf;
	  break;
	case CODE_FOR_sse_ucomi:
	  icode = CODE_FOR_avx10_2_ucomxsf;
	  break;
	case CODE_FOR_sse2_comi:
	  icode = CODE_FOR_avx10_2_comxdf;
	  break;
	case CODE_FOR_sse2_ucomi:
	  icode = CODE_FOR_avx10_2_ucomxdf;
	  break;

	default:
	  gcc_unreachable ();
	}
    }
  pat = GEN_FCN (icode) (op0, op1);
  if (! pat)
    return 0;

  set_dst = SET_DEST (pat);
  emit_insn (pat);
  return ix86_ssecom_setcc (comparison, check_unordered, mode,
			    set_dst, target);
}
11890 :
11891 : /* Subroutines of ix86_expand_args_builtin to take care of round insns. */
11892 :
11893 : static rtx
11894 0 : ix86_expand_sse_round (const struct builtin_description *d, tree exp,
11895 : rtx target)
11896 : {
11897 0 : rtx pat;
11898 0 : tree arg0 = CALL_EXPR_ARG (exp, 0);
11899 0 : rtx op1, op0 = expand_normal (arg0);
11900 0 : machine_mode tmode = insn_data[d->icode].operand[0].mode;
11901 0 : machine_mode mode0 = insn_data[d->icode].operand[1].mode;
11902 :
11903 0 : if (optimize || target == 0
11904 0 : || GET_MODE (target) != tmode
11905 0 : || !insn_data[d->icode].operand[0].predicate (target, tmode))
11906 0 : target = gen_reg_rtx (tmode);
11907 :
11908 0 : if (VECTOR_MODE_P (mode0))
11909 0 : op0 = safe_vector_operand (op0, mode0);
11910 :
11911 0 : if ((optimize && !register_operand (op0, mode0))
11912 0 : || !insn_data[d->icode].operand[0].predicate (op0, mode0))
11913 0 : op0 = copy_to_mode_reg (mode0, op0);
11914 :
11915 0 : op1 = GEN_INT (d->comparison);
11916 :
11917 0 : pat = GEN_FCN (d->icode) (target, op0, op1);
11918 0 : if (! pat)
11919 : return 0;
11920 0 : emit_insn (pat);
11921 0 : return target;
11922 : }
11923 :
11924 : static rtx
11925 12 : ix86_expand_sse_round_vec_pack_sfix (const struct builtin_description *d,
11926 : tree exp, rtx target)
11927 : {
11928 12 : rtx pat;
11929 12 : tree arg0 = CALL_EXPR_ARG (exp, 0);
11930 12 : tree arg1 = CALL_EXPR_ARG (exp, 1);
11931 12 : rtx op0 = expand_normal (arg0);
11932 12 : rtx op1 = expand_normal (arg1);
11933 12 : rtx op2;
11934 12 : machine_mode tmode = insn_data[d->icode].operand[0].mode;
11935 12 : machine_mode mode0 = insn_data[d->icode].operand[1].mode;
11936 12 : machine_mode mode1 = insn_data[d->icode].operand[2].mode;
11937 :
11938 0 : if (optimize || target == 0
11939 0 : || GET_MODE (target) != tmode
11940 12 : || !insn_data[d->icode].operand[0].predicate (target, tmode))
11941 12 : target = gen_reg_rtx (tmode);
11942 :
11943 12 : op0 = safe_vector_operand (op0, mode0);
11944 12 : op1 = safe_vector_operand (op1, mode1);
11945 :
11946 12 : if ((optimize && !register_operand (op0, mode0))
11947 12 : || !insn_data[d->icode].operand[0].predicate (op0, mode0))
11948 12 : op0 = copy_to_mode_reg (mode0, op0);
11949 12 : if ((optimize && !register_operand (op1, mode1))
11950 12 : || !insn_data[d->icode].operand[1].predicate (op1, mode1))
11951 12 : op1 = copy_to_mode_reg (mode1, op1);
11952 :
11953 12 : op2 = GEN_INT (d->comparison);
11954 :
11955 12 : pat = GEN_FCN (d->icode) (target, op0, op1, op2);
11956 12 : if (! pat)
11957 : return 0;
11958 12 : emit_insn (pat);
11959 12 : return target;
11960 : }
11961 :
11962 : /* Subroutine of ix86_expand_builtin to take care of ptest insns. */
11963 :
static rtx
ix86_expand_sse_ptest (const struct builtin_description *d, tree exp,
		       rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  /* NB: ptest patterns have no destination register; operands 0 and 1
     are both inputs and the result goes to the flags, hence the
     predicate/mode indices below start at 0.  */
  machine_mode mode0 = insn_data[d->icode].operand[0].mode;
  machine_mode mode1 = insn_data[d->icode].operand[1].mode;
  enum rtx_code comparison = d->comparison;

  /* ptest reg, reg sets the carry flag.  */
  if (comparison == LTU
      && (d->code == IX86_BUILTIN_PTESTC
	  || d->code == IX86_BUILTIN_PTESTC256)
      && rtx_equal_p (op0, op1))
    {
      /* Identical operands: the CF result is known to be 1, so fold the
	 whole builtin to a constant without emitting a ptest.  */
      if (!target)
	target = gen_reg_rtx (SImode);
      emit_move_insn (target, const1_rtx);
      return target;
    }

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  /* Materialize the result: zero an SImode pseudo, then setcc its low
     byte from the flags after the ptest.  */
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const0_rtx);
  target = gen_rtx_SUBREG (QImode, target, 0);

  if ((optimize && !register_operand (op0, mode0))
      || !insn_data[d->icode].operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_data[d->icode].operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  pat = GEN_FCN (d->icode) (op0, op1);
  if (! pat)
    return 0;
  emit_insn (pat);
  emit_insn (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
			  gen_rtx_fmt_ee (comparison, QImode,
					  SET_DEST (pat),
					  const0_rtx)));

  return SUBREG_REG (target);
}
12016 :
12017 : /* Subroutine of ix86_expand_builtin to take care of pcmpestr[im] insns. */
12018 :
static rtx
ix86_expand_sse_pcmpestr (const struct builtin_description *d,
			  tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  tree arg4 = CALL_EXPR_ARG (exp, 4);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  rtx op4 = expand_normal (arg4);
  machine_mode tmode0, tmode1, modev2, modei3, modev4, modei5, modeimm;

  /* Operand layout of the pcmpestr insn patterns:
     0 = index result, 1 = mask result, 2/3 = first vector and its
     length, 4/5 = second vector and its length, 6 = control imm8.  */
  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modei3 = insn_data[d->icode].operand[3].mode;
  modev4 = insn_data[d->icode].operand[4].mode;
  modei5 = insn_data[d->icode].operand[5].mode;
  modeimm = insn_data[d->icode].operand[6].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev4))
    op2 = safe_vector_operand (op2, modev4);

  /* Copy operands into registers wherever the pattern's predicates
     (or, when optimizing, the preference for register operands)
     reject the expanded form.  */
  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if (!insn_data[d->icode].operand[3].predicate (op1, modei3))
    op1 = copy_to_mode_reg (modei3, op1);
  if ((optimize && !register_operand (op2, modev4))
      || !insn_data[d->icode].operand[4].predicate (op2, modev4))
    op2 = copy_to_mode_reg (modev4, op2);
  if (!insn_data[d->icode].operand[5].predicate (op3, modei5))
    op3 = copy_to_mode_reg (modei5, op3);

  /* The control operand must be an 8-bit immediate; diagnose and
     bail out rather than forcing it into a register.  */
  if (!insn_data[d->icode].operand[6].predicate (op4, modeimm))
    {
      error ("the fifth argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPESTRI128)
    {
      /* Only the index output is live; the mask output goes into a
	 fresh scratch register.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2, op3, op4);
    }
  else if (d->code == IX86_BUILTIN_PCMPESTRM128)
    {
      /* Only the mask output is live; the index output goes into a
	 fresh scratch register.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2, op3, op4);
    }
  else
    {
      /* Flag-testing variant: both string outputs are dead, the
	 builtin's value is derived from FLAGS_REG below.  d->flag
	 encodes the CC mode to read the flags in.  */
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2, op3, op4);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      /* Zero an SImode pseudo, then set only its low QImode part from
	 the flags comparison, and hand back the full SImode register
	 so the result is already zero-extended.  */
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
						   FLAGS_REG),
				      const0_rtx)));
      return SUBREG_REG (target);
    }
  else
    return target;
}
12120 :
12121 :
12122 : /* Subroutine of ix86_expand_builtin to take care of pcmpistr[im] insns. */
12123 :
static rtx
ix86_expand_sse_pcmpistr (const struct builtin_description *d,
			  tree exp, rtx target)
{
  rtx pat;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  rtx scratch0, scratch1;
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  machine_mode tmode0, tmode1, modev2, modev3, modeimm;

  /* Operand layout of the pcmpistr insn patterns: 0 = index result,
     1 = mask result, 2/3 = the two source vectors (implicit-length
     strings), 4 = control imm8.  */
  tmode0 = insn_data[d->icode].operand[0].mode;
  tmode1 = insn_data[d->icode].operand[1].mode;
  modev2 = insn_data[d->icode].operand[2].mode;
  modev3 = insn_data[d->icode].operand[3].mode;
  modeimm = insn_data[d->icode].operand[4].mode;

  if (VECTOR_MODE_P (modev2))
    op0 = safe_vector_operand (op0, modev2);
  if (VECTOR_MODE_P (modev3))
    op1 = safe_vector_operand (op1, modev3);

  /* Copy operands into registers wherever the pattern's predicates
     (or, when optimizing, the preference for register operands)
     reject the expanded form.  */
  if (!insn_data[d->icode].operand[2].predicate (op0, modev2))
    op0 = copy_to_mode_reg (modev2, op0);
  if ((optimize && !register_operand (op1, modev3))
      || !insn_data[d->icode].operand[3].predicate (op1, modev3))
    op1 = copy_to_mode_reg (modev3, op1);

  /* The control operand must be an 8-bit immediate; diagnose and
     bail out rather than forcing it into a register.  */
  if (!insn_data[d->icode].operand[4].predicate (op2, modeimm))
    {
      error ("the third argument must be an 8-bit immediate");
      return const0_rtx;
    }

  if (d->code == IX86_BUILTIN_PCMPISTRI128)
    {
      /* Only the index output is live; the mask output goes into a
	 fresh scratch register.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode0
	  || !insn_data[d->icode].operand[0].predicate (target, tmode0))
	target = gen_reg_rtx (tmode0);

      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (target, scratch1, op0, op1, op2);
    }
  else if (d->code == IX86_BUILTIN_PCMPISTRM128)
    {
      /* Only the mask output is live; the index output goes into a
	 fresh scratch register.  */
      if (optimize || !target
	  || GET_MODE (target) != tmode1
	  || !insn_data[d->icode].operand[1].predicate (target, tmode1))
	target = gen_reg_rtx (tmode1);

      scratch0 = gen_reg_rtx (tmode0);

      pat = GEN_FCN (d->icode) (scratch0, target, op0, op1, op2);
    }
  else
    {
      /* Flag-testing variant: both string outputs are dead, the
	 builtin's value is derived from FLAGS_REG below.  d->flag
	 encodes the CC mode to read the flags in.  */
      gcc_assert (d->flag);

      scratch0 = gen_reg_rtx (tmode0);
      scratch1 = gen_reg_rtx (tmode1);

      pat = GEN_FCN (d->icode) (scratch0, scratch1, op0, op1, op2);
    }

  if (! pat)
    return 0;

  emit_insn (pat);

  if (d->flag)
    {
      /* Zero an SImode pseudo, then set only its low QImode part from
	 the flags comparison, and hand back the full SImode register
	 so the result is already zero-extended.  */
      target = gen_reg_rtx (SImode);
      emit_move_insn (target, const0_rtx);
      target = gen_rtx_SUBREG (QImode, target, 0);

      emit_insn
	(gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
		      gen_rtx_fmt_ee (EQ, QImode,
				      gen_rtx_REG ((machine_mode) d->flag,
						   FLAGS_REG),
				      const0_rtx)));
      return SUBREG_REG (target);
    }
  else
    return target;
}
12215 :
12216 : /* Fixup modeless constants to fit required mode. */
12217 :
12218 : static rtx
12219 258822 : fixup_modeless_constant (rtx x, machine_mode mode)
12220 : {
12221 258822 : if (GET_MODE (x) == VOIDmode)
12222 41227 : x = convert_to_mode (mode, x, 1);
12223 258822 : return x;
12224 : }
12225 :
12226 : /* Expand the outgoing argument ARG to extract unsigned char and short
12227 : integer constants suitable for the predicates and the instruction
12228 : templates which expect the unsigned expanded value. */
12229 :
12230 : static rtx
12231 280000 : ix86_expand_unsigned_small_int_cst_argument (tree arg)
12232 : {
12233 : /* When passing 0xff as an unsigned char function argument with the
12234 : C frontend promotion, expand_normal gets
12235 :
12236 : <integer_cst 0x7fffe6aa23a8 type <integer_type 0x7fffe98225e8 int> constant 255>
12237 :
12238 : and returns the rtx value using the sign-extended representation:
12239 :
12240 : (const_int 255 [0xff])
12241 :
12242 : Without the C frontend promotion, expand_normal gets
12243 :
12244 : <integer_cst 0x7fffe9824018 type <integer_type 0x7fffe9822348 unsigned char > constant 255>
12245 :
12246 : and returns
12247 :
12248 : (const_int -1 [0xffffffffffffffff])
12249 :
12250 : which doesn't work with the predicates nor the instruction templates
12251 : which expect the unsigned expanded value. Extract the unsigned char
12252 : and short integer constants to return
12253 :
12254 : (const_int 255 [0xff])
12255 :
12256 : so that the expanded value is always unsigned, without the C frontend
12257 : promotion. */
12258 :
12259 280000 : if (TREE_CODE (arg) == INTEGER_CST)
12260 : {
12261 60052 : tree type = TREE_TYPE (arg);
12262 60052 : if (INTEGRAL_TYPE_P (type)
12263 60052 : && TYPE_UNSIGNED (type)
12264 81834 : && TYPE_PRECISION (type) < TYPE_PRECISION (integer_type_node))
12265 : {
12266 18298 : HOST_WIDE_INT cst = TREE_INT_CST_LOW (arg);
12267 18298 : return GEN_INT (cst);
12268 : }
12269 : }
12270 :
12271 261702 : return expand_normal (arg);
12272 : }
12273 :
12274 : /* Subroutine of ix86_expand_builtin to take care of insns with
12275 : variable number of operands. */
12276 :
12277 : static rtx
12278 69393 : ix86_expand_args_builtin (const struct builtin_description *d,
12279 : tree exp, rtx target)
12280 : {
12281 69393 : rtx pat, real_target;
12282 69393 : unsigned int i, nargs;
12283 69393 : unsigned int nargs_constant = 0;
12284 69393 : unsigned int mask_pos = 0;
12285 69393 : int num_memory = 0;
12286 69393 : rtx xops[6];
12287 69393 : bool second_arg_count = false;
12288 69393 : enum insn_code icode = d->icode;
12289 69393 : const struct insn_data_d *insn_p = &insn_data[icode];
12290 69393 : machine_mode tmode = insn_p->operand[0].mode;
12291 69393 : machine_mode rmode = VOIDmode;
12292 69393 : bool swap = false;
12293 69393 : enum rtx_code comparison = d->comparison;
12294 :
12295 69393 : switch ((enum ix86_builtin_func_type) d->flag)
12296 : {
12297 0 : case V2DF_FTYPE_V2DF_ROUND:
12298 0 : case V4DF_FTYPE_V4DF_ROUND:
12299 0 : case V8DF_FTYPE_V8DF_ROUND:
12300 0 : case V4SF_FTYPE_V4SF_ROUND:
12301 0 : case V8SF_FTYPE_V8SF_ROUND:
12302 0 : case V16SF_FTYPE_V16SF_ROUND:
12303 0 : case V8HF_FTYPE_V8HF_ROUND:
12304 0 : case V16HF_FTYPE_V16HF_ROUND:
12305 0 : case V32HF_FTYPE_V32HF_ROUND:
12306 0 : case V4SI_FTYPE_V4SF_ROUND:
12307 0 : case V8SI_FTYPE_V8SF_ROUND:
12308 0 : case V16SI_FTYPE_V16SF_ROUND:
12309 0 : return ix86_expand_sse_round (d, exp, target);
12310 12 : case V4SI_FTYPE_V2DF_V2DF_ROUND:
12311 12 : case V8SI_FTYPE_V4DF_V4DF_ROUND:
12312 12 : case V16SI_FTYPE_V8DF_V8DF_ROUND:
12313 12 : return ix86_expand_sse_round_vec_pack_sfix (d, exp, target);
12314 235 : case INT_FTYPE_V8SF_V8SF_PTEST:
12315 235 : case INT_FTYPE_V4DI_V4DI_PTEST:
12316 235 : case INT_FTYPE_V4DF_V4DF_PTEST:
12317 235 : case INT_FTYPE_V4SF_V4SF_PTEST:
12318 235 : case INT_FTYPE_V2DI_V2DI_PTEST:
12319 235 : case INT_FTYPE_V2DF_V2DF_PTEST:
12320 235 : return ix86_expand_sse_ptest (d, exp, target);
12321 : case FLOAT128_FTYPE_FLOAT128:
12322 : case FLOAT_FTYPE_FLOAT:
12323 : case FLOAT_FTYPE_BFLOAT16:
12324 : case INT_FTYPE_INT:
12325 : case UINT_FTYPE_UINT:
12326 : case UINT16_FTYPE_UINT16:
12327 : case UINT64_FTYPE_INT:
12328 : case UINT64_FTYPE_UINT64:
12329 : case INT64_FTYPE_INT64:
12330 : case INT64_FTYPE_V4SF:
12331 : case INT64_FTYPE_V2DF:
12332 : case INT_FTYPE_V16QI:
12333 : case INT_FTYPE_V8QI:
12334 : case INT_FTYPE_V8SF:
12335 : case INT_FTYPE_V4DF:
12336 : case INT_FTYPE_V4SF:
12337 : case INT_FTYPE_V2DF:
12338 : case INT_FTYPE_V32QI:
12339 : case V16QI_FTYPE_V16QI:
12340 : case V8SI_FTYPE_V8SF:
12341 : case V8SI_FTYPE_V4SI:
12342 : case V8HI_FTYPE_V8HI:
12343 : case V8HI_FTYPE_V16QI:
12344 : case V8QI_FTYPE_V8QI:
12345 : case V8SF_FTYPE_V8SF:
12346 : case V8SF_FTYPE_V8SI:
12347 : case V8SF_FTYPE_V4SF:
12348 : case V8SF_FTYPE_V8HI:
12349 : case V4SI_FTYPE_V4SI:
12350 : case V4SI_FTYPE_V16QI:
12351 : case V4SI_FTYPE_V4SF:
12352 : case V4SI_FTYPE_V8SI:
12353 : case V4SI_FTYPE_V8HI:
12354 : case V4SI_FTYPE_V4DF:
12355 : case V4SI_FTYPE_V2DF:
12356 : case V4HI_FTYPE_V4HI:
12357 : case V4DF_FTYPE_V4DF:
12358 : case V4DF_FTYPE_V4SI:
12359 : case V4DF_FTYPE_V4SF:
12360 : case V4DF_FTYPE_V2DF:
12361 : case V4SF_FTYPE_V4SF:
12362 : case V4SF_FTYPE_V4SI:
12363 : case V4SF_FTYPE_V8SF:
12364 : case V4SF_FTYPE_V4DF:
12365 : case V4SF_FTYPE_V8HI:
12366 : case V4SF_FTYPE_V2DF:
12367 : case V2DI_FTYPE_V2DI:
12368 : case V2DI_FTYPE_V16QI:
12369 : case V2DI_FTYPE_V8HI:
12370 : case V2DI_FTYPE_V4SI:
12371 : case V2DF_FTYPE_V2DF:
12372 : case V2DF_FTYPE_V4SI:
12373 : case V2DF_FTYPE_V4DF:
12374 : case V2DF_FTYPE_V4SF:
12375 : case V2DF_FTYPE_V2SI:
12376 : case V2SI_FTYPE_V2SI:
12377 : case V2SI_FTYPE_V4SF:
12378 : case V2SI_FTYPE_V2SF:
12379 : case V2SI_FTYPE_V2DF:
12380 : case V2SF_FTYPE_V2SF:
12381 : case V2SF_FTYPE_V2SI:
12382 : case V32QI_FTYPE_V32QI:
12383 : case V32QI_FTYPE_V16QI:
12384 : case V16HI_FTYPE_V16HI:
12385 : case V16HI_FTYPE_V8HI:
12386 : case V8SI_FTYPE_V8SI:
12387 : case V16HI_FTYPE_V16QI:
12388 : case V8SI_FTYPE_V16QI:
12389 : case V4DI_FTYPE_V16QI:
12390 : case V8SI_FTYPE_V8HI:
12391 : case V4DI_FTYPE_V8HI:
12392 : case V4DI_FTYPE_V4SI:
12393 : case V4DI_FTYPE_V2DI:
12394 : case UQI_FTYPE_UQI:
12395 : case UHI_FTYPE_UHI:
12396 : case USI_FTYPE_USI:
12397 : case USI_FTYPE_UQI:
12398 : case USI_FTYPE_UHI:
12399 : case UDI_FTYPE_UDI:
12400 : case UHI_FTYPE_V16QI:
12401 : case USI_FTYPE_V32QI:
12402 : case UDI_FTYPE_V64QI:
12403 : case V16QI_FTYPE_UHI:
12404 : case V32QI_FTYPE_USI:
12405 : case V64QI_FTYPE_UDI:
12406 : case V8HI_FTYPE_UQI:
12407 : case V16HI_FTYPE_UHI:
12408 : case V32HI_FTYPE_USI:
12409 : case V4SI_FTYPE_UQI:
12410 : case V8SI_FTYPE_UQI:
12411 : case V4SI_FTYPE_UHI:
12412 : case V8SI_FTYPE_UHI:
12413 : case UQI_FTYPE_V8HI:
12414 : case UHI_FTYPE_V16HI:
12415 : case USI_FTYPE_V32HI:
12416 : case UQI_FTYPE_V4SI:
12417 : case UQI_FTYPE_V8SI:
12418 : case UHI_FTYPE_V16SI:
12419 : case UQI_FTYPE_V2DI:
12420 : case UQI_FTYPE_V4DI:
12421 : case UQI_FTYPE_V8DI:
12422 : case V16SI_FTYPE_UHI:
12423 : case V2DI_FTYPE_UQI:
12424 : case V4DI_FTYPE_UQI:
12425 : case V16SI_FTYPE_INT:
12426 : case V16SF_FTYPE_V8SF:
12427 : case V16SI_FTYPE_V8SI:
12428 : case V16SF_FTYPE_V4SF:
12429 : case V16SI_FTYPE_V4SI:
12430 : case V16SI_FTYPE_V16SF:
12431 : case V16SI_FTYPE_V16SI:
12432 : case V64QI_FTYPE_V64QI:
12433 : case V32HI_FTYPE_V32HI:
12434 : case V16SF_FTYPE_V16SF:
12435 : case V8DI_FTYPE_UQI:
12436 : case V8DI_FTYPE_V8DI:
12437 : case V8DF_FTYPE_V4DF:
12438 : case V8DF_FTYPE_V2DF:
12439 : case V8DF_FTYPE_V8DF:
12440 : case V4DI_FTYPE_V4DI:
12441 : case V16BF_FTYPE_V16SF:
12442 : case V8BF_FTYPE_V8SF:
12443 : case V8BF_FTYPE_V4SF:
12444 : nargs = 1;
12445 : break;
12446 52 : case V4SF_FTYPE_V4SF_VEC_MERGE:
12447 52 : case V2DF_FTYPE_V2DF_VEC_MERGE:
12448 52 : return ix86_expand_unop_vec_merge_builtin (icode, exp, target);
12449 9504 : case FLOAT128_FTYPE_FLOAT128_FLOAT128:
12450 9504 : case V16QI_FTYPE_V16QI_V16QI:
12451 9504 : case V16QI_FTYPE_V8HI_V8HI:
12452 9504 : case V16HF_FTYPE_V16HF_V16HF:
12453 9504 : case V16SF_FTYPE_V16SF_V16SF:
12454 9504 : case V16SI_FTYPE_V16SI_V16SI:
12455 9504 : case V8QI_FTYPE_V8QI_V8QI:
12456 9504 : case V8QI_FTYPE_V4HI_V4HI:
12457 9504 : case V8HI_FTYPE_V8HI_V8HI:
12458 9504 : case V8HI_FTYPE_V16QI_V16QI:
12459 9504 : case V8HI_FTYPE_V4SI_V4SI:
12460 9504 : case V8HF_FTYPE_V8HF_V8HF:
12461 9504 : case V8SF_FTYPE_V8SF_V8SF:
12462 9504 : case V8SF_FTYPE_V8SF_V8SI:
12463 9504 : case V8DF_FTYPE_V8DF_V8DF:
12464 9504 : case V4SI_FTYPE_V4SI_V4SI:
12465 9504 : case V4SI_FTYPE_V8HI_V8HI:
12466 9504 : case V4SI_FTYPE_V2DF_V2DF:
12467 9504 : case V4HI_FTYPE_V4HI_V4HI:
12468 9504 : case V4HI_FTYPE_V8QI_V8QI:
12469 9504 : case V4HI_FTYPE_V2SI_V2SI:
12470 9504 : case V4DF_FTYPE_V4DF_V4DF:
12471 9504 : case V4DF_FTYPE_V4DF_V4DI:
12472 9504 : case V4SF_FTYPE_V4SF_V4SF:
12473 9504 : case V4SF_FTYPE_V4SF_V4SI:
12474 9504 : case V4SF_FTYPE_V4SF_V2SI:
12475 9504 : case V4SF_FTYPE_V4SF_V2DF:
12476 9504 : case V4SF_FTYPE_V4SF_UINT:
12477 9504 : case V4SF_FTYPE_V4SF_DI:
12478 9504 : case V4SF_FTYPE_V4SF_SI:
12479 9504 : case V4DI_FTYPE_V4DI_V2DI:
12480 9504 : case V2DI_FTYPE_V2DI_V2DI:
12481 9504 : case V2DI_FTYPE_V16QI_V16QI:
12482 9504 : case V2DI_FTYPE_V4SI_V4SI:
12483 9504 : case V2DI_FTYPE_V2DI_V16QI:
12484 9504 : case V2SI_FTYPE_V2SI_V2SI:
12485 9504 : case V2SI_FTYPE_V4HI_V4HI:
12486 9504 : case V2SI_FTYPE_V2SF_V2SF:
12487 9504 : case V2DF_FTYPE_V2DF_V2DF:
12488 9504 : case V2DF_FTYPE_V2DF_V4SF:
12489 9504 : case V2DF_FTYPE_V2DF_V2DI:
12490 9504 : case V2DF_FTYPE_V2DF_DI:
12491 9504 : case V2DF_FTYPE_V2DF_SI:
12492 9504 : case V2DF_FTYPE_V2DF_UINT:
12493 9504 : case V2SF_FTYPE_V2SF_V2SF:
12494 9504 : case V1DI_FTYPE_V1DI_V1DI:
12495 9504 : case V1DI_FTYPE_V8QI_V8QI:
12496 9504 : case V1DI_FTYPE_V2SI_V2SI:
12497 9504 : case V32QI_FTYPE_V16HI_V16HI:
12498 9504 : case V16HI_FTYPE_V8SI_V8SI:
12499 9504 : case V64QI_FTYPE_V64QI_V64QI:
12500 9504 : case V32QI_FTYPE_V32QI_V32QI:
12501 9504 : case V32BF_FTYPE_V32BF_V32BF:
12502 9504 : case V16BF_FTYPE_V16BF_V16BF:
12503 9504 : case V8BF_FTYPE_V8BF_V8BF:
12504 9504 : case V16HI_FTYPE_V32QI_V32QI:
12505 9504 : case V16HI_FTYPE_V16HI_V16HI:
12506 9504 : case V8SI_FTYPE_V4DF_V4DF:
12507 9504 : case V8SI_FTYPE_V8SI_V8SI:
12508 9504 : case V8SI_FTYPE_V16HI_V16HI:
12509 9504 : case V4DI_FTYPE_V4DI_V4DI:
12510 9504 : case V4DI_FTYPE_V8SI_V8SI:
12511 9504 : case V4DI_FTYPE_V32QI_V32QI:
12512 9504 : case V8DI_FTYPE_V64QI_V64QI:
12513 9504 : if (comparison == UNKNOWN)
12514 8970 : return ix86_expand_binop_builtin (icode, exp, target);
12515 : nargs = 2;
12516 : break;
12517 80 : case V4SF_FTYPE_V4SF_V4SF_SWAP:
12518 80 : case V2DF_FTYPE_V2DF_V2DF_SWAP:
12519 80 : gcc_assert (comparison != UNKNOWN);
12520 : nargs = 2;
12521 : swap = true;
12522 : break;
12523 1481 : case V16HI_FTYPE_V16HI_V8HI_COUNT:
12524 1481 : case V16HI_FTYPE_V16HI_SI_COUNT:
12525 1481 : case V8SI_FTYPE_V8SI_V4SI_COUNT:
12526 1481 : case V8SI_FTYPE_V8SI_SI_COUNT:
12527 1481 : case V4DI_FTYPE_V4DI_V2DI_COUNT:
12528 1481 : case V4DI_FTYPE_V4DI_INT_COUNT:
12529 1481 : case V8HI_FTYPE_V8HI_V8HI_COUNT:
12530 1481 : case V8HI_FTYPE_V8HI_SI_COUNT:
12531 1481 : case V4SI_FTYPE_V4SI_V4SI_COUNT:
12532 1481 : case V4SI_FTYPE_V4SI_SI_COUNT:
12533 1481 : case V4HI_FTYPE_V4HI_V4HI_COUNT:
12534 1481 : case V4HI_FTYPE_V4HI_SI_COUNT:
12535 1481 : case V2DI_FTYPE_V2DI_V2DI_COUNT:
12536 1481 : case V2DI_FTYPE_V2DI_SI_COUNT:
12537 1481 : case V2SI_FTYPE_V2SI_V2SI_COUNT:
12538 1481 : case V2SI_FTYPE_V2SI_SI_COUNT:
12539 1481 : case V1DI_FTYPE_V1DI_V1DI_COUNT:
12540 1481 : case V1DI_FTYPE_V1DI_SI_COUNT:
12541 1481 : nargs = 2;
12542 1481 : second_arg_count = true;
12543 1481 : break;
12544 1408 : case V16HI_FTYPE_V16HI_INT_V16HI_UHI_COUNT:
12545 1408 : case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI_COUNT:
12546 1408 : case V16SI_FTYPE_V16SI_INT_V16SI_UHI_COUNT:
12547 1408 : case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI_COUNT:
12548 1408 : case V2DI_FTYPE_V2DI_INT_V2DI_UQI_COUNT:
12549 1408 : case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI_COUNT:
12550 1408 : case V32HI_FTYPE_V32HI_INT_V32HI_USI_COUNT:
12551 1408 : case V32HI_FTYPE_V32HI_V8HI_V32HI_USI_COUNT:
12552 1408 : case V4DI_FTYPE_V4DI_INT_V4DI_UQI_COUNT:
12553 1408 : case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI_COUNT:
12554 1408 : case V4SI_FTYPE_V4SI_INT_V4SI_UQI_COUNT:
12555 1408 : case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI_COUNT:
12556 1408 : case V8DI_FTYPE_V8DI_INT_V8DI_UQI_COUNT:
12557 1408 : case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI_COUNT:
12558 1408 : case V8HI_FTYPE_V8HI_INT_V8HI_UQI_COUNT:
12559 1408 : case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI_COUNT:
12560 1408 : case V8SI_FTYPE_V8SI_INT_V8SI_UQI_COUNT:
12561 1408 : case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI_COUNT:
12562 1408 : nargs = 4;
12563 1408 : second_arg_count = true;
12564 1408 : break;
12565 966 : case UINT64_FTYPE_UINT64_UINT64:
12566 966 : case UINT_FTYPE_UINT_UINT:
12567 966 : case UINT_FTYPE_UINT_USHORT:
12568 966 : case UINT_FTYPE_UINT_UCHAR:
12569 966 : case UINT16_FTYPE_UINT16_INT:
12570 966 : case UINT8_FTYPE_UINT8_INT:
12571 966 : case UQI_FTYPE_UQI_UQI:
12572 966 : case UHI_FTYPE_UHI_UHI:
12573 966 : case USI_FTYPE_USI_USI:
12574 966 : case UDI_FTYPE_UDI_UDI:
12575 966 : case V16SI_FTYPE_V8DF_V8DF:
12576 966 : case V32BF_FTYPE_V16SF_V16SF:
12577 966 : case V16BF_FTYPE_V8SF_V8SF:
12578 966 : case V8BF_FTYPE_V4SF_V4SF:
12579 966 : case V16BF_FTYPE_V16SF_UHI:
12580 966 : case V8BF_FTYPE_V8SF_UQI:
12581 966 : case V8BF_FTYPE_V4SF_UQI:
12582 966 : case V16QI_FTYPE_V16QI_V8HF:
12583 966 : nargs = 2;
12584 966 : break;
12585 649 : case V2DI_FTYPE_V2DI_INT_CONVERT:
12586 649 : nargs = 2;
12587 649 : rmode = V1TImode;
12588 649 : nargs_constant = 1;
12589 649 : break;
12590 42 : case V4DI_FTYPE_V4DI_INT_CONVERT:
12591 42 : nargs = 2;
12592 42 : rmode = V2TImode;
12593 42 : nargs_constant = 1;
12594 42 : break;
12595 16 : case V8DI_FTYPE_V8DI_INT_CONVERT:
12596 16 : nargs = 2;
12597 16 : rmode = V4TImode;
12598 16 : nargs_constant = 1;
12599 16 : break;
12600 2380 : case V8HI_FTYPE_V8HI_INT:
12601 2380 : case V8HI_FTYPE_V8SF_INT:
12602 2380 : case V16HI_FTYPE_V16SF_INT:
12603 2380 : case V8HI_FTYPE_V4SF_INT:
12604 2380 : case V8SF_FTYPE_V8SF_INT:
12605 2380 : case V4SF_FTYPE_V16SF_INT:
12606 2380 : case V16SF_FTYPE_V16SF_INT:
12607 2380 : case V4SI_FTYPE_V4SI_INT:
12608 2380 : case V4SI_FTYPE_V8SI_INT:
12609 2380 : case V4HI_FTYPE_V4HI_INT:
12610 2380 : case V4DF_FTYPE_V4DF_INT:
12611 2380 : case V4DF_FTYPE_V8DF_INT:
12612 2380 : case V4SF_FTYPE_V4SF_INT:
12613 2380 : case V4SF_FTYPE_V8SF_INT:
12614 2380 : case V2DI_FTYPE_V2DI_INT:
12615 2380 : case V2DF_FTYPE_V2DF_INT:
12616 2380 : case V2DF_FTYPE_V4DF_INT:
12617 2380 : case V16HI_FTYPE_V16HI_INT:
12618 2380 : case V8SI_FTYPE_V8SI_INT:
12619 2380 : case V16SI_FTYPE_V16SI_INT:
12620 2380 : case V4SI_FTYPE_V16SI_INT:
12621 2380 : case V4DI_FTYPE_V4DI_INT:
12622 2380 : case V2DI_FTYPE_V4DI_INT:
12623 2380 : case V4DI_FTYPE_V8DI_INT:
12624 2380 : case UQI_FTYPE_UQI_UQI_CONST:
12625 2380 : case UHI_FTYPE_UHI_UQI:
12626 2380 : case USI_FTYPE_USI_UQI:
12627 2380 : case UDI_FTYPE_UDI_UQI:
12628 2380 : nargs = 2;
12629 2380 : nargs_constant = 1;
12630 2380 : break;
12631 18709 : case V16QI_FTYPE_V16QI_V16QI_V16QI:
12632 18709 : case V8SF_FTYPE_V8SF_V8SF_V8SF:
12633 18709 : case V4DF_FTYPE_V4DF_V4DF_V4DF:
12634 18709 : case V4SF_FTYPE_V4SF_V4SF_V4SF:
12635 18709 : case V2DF_FTYPE_V2DF_V2DF_V2DF:
12636 18709 : case V32QI_FTYPE_V32QI_V32QI_V32QI:
12637 18709 : case UHI_FTYPE_V16SI_V16SI_UHI:
12638 18709 : case UQI_FTYPE_V8DI_V8DI_UQI:
12639 18709 : case V16HI_FTYPE_V16SI_V16HI_UHI:
12640 18709 : case V16QI_FTYPE_V16SI_V16QI_UHI:
12641 18709 : case V16QI_FTYPE_V8DI_V16QI_UQI:
12642 18709 : case V32HF_FTYPE_V32HF_V32HF_USI:
12643 18709 : case V16SF_FTYPE_V16SF_V16SF_UHI:
12644 18709 : case V16SF_FTYPE_V4SF_V16SF_UHI:
12645 18709 : case V16SI_FTYPE_SI_V16SI_UHI:
12646 18709 : case V16SI_FTYPE_V16HI_V16SI_UHI:
12647 18709 : case V16SI_FTYPE_V16QI_V16SI_UHI:
12648 18709 : case V8SF_FTYPE_V4SF_V8SF_UQI:
12649 18709 : case V4DF_FTYPE_V2DF_V4DF_UQI:
12650 18709 : case V8SI_FTYPE_V4SI_V8SI_UQI:
12651 18709 : case V8SI_FTYPE_SI_V8SI_UQI:
12652 18709 : case V4SI_FTYPE_V4SI_V4SI_UQI:
12653 18709 : case V4SI_FTYPE_SI_V4SI_UQI:
12654 18709 : case V4DI_FTYPE_V2DI_V4DI_UQI:
12655 18709 : case V4DI_FTYPE_DI_V4DI_UQI:
12656 18709 : case V2DI_FTYPE_V2DI_V2DI_UQI:
12657 18709 : case V2DI_FTYPE_DI_V2DI_UQI:
12658 18709 : case V64QI_FTYPE_V64QI_V64QI_UDI:
12659 18709 : case V64QI_FTYPE_V16QI_V64QI_UDI:
12660 18709 : case V64QI_FTYPE_QI_V64QI_UDI:
12661 18709 : case V32QI_FTYPE_V32QI_V32QI_USI:
12662 18709 : case V32QI_FTYPE_V16QI_V32QI_USI:
12663 18709 : case V32QI_FTYPE_QI_V32QI_USI:
12664 18709 : case V16QI_FTYPE_V16QI_V16QI_UHI:
12665 18709 : case V16QI_FTYPE_QI_V16QI_UHI:
12666 18709 : case V32HI_FTYPE_V8HI_V32HI_USI:
12667 18709 : case V32HI_FTYPE_V32BF_V32HI_USI:
12668 18709 : case V32HI_FTYPE_HI_V32HI_USI:
12669 18709 : case V16HI_FTYPE_V8HI_V16HI_UHI:
12670 18709 : case V16HI_FTYPE_V16BF_V16HI_UHI:
12671 18709 : case V16HI_FTYPE_HI_V16HI_UHI:
12672 18709 : case V8HI_FTYPE_V8HI_V8HI_UQI:
12673 18709 : case V8HI_FTYPE_V8BF_V8HI_UQI:
12674 18709 : case V8BF_FTYPE_V8BF_V8BF_UQI:
12675 18709 : case V8HI_FTYPE_HI_V8HI_UQI:
12676 18709 : case V16HF_FTYPE_V16HF_V16HF_UHI:
12677 18709 : case V8SF_FTYPE_V8HI_V8SF_UQI:
12678 18709 : case V4SF_FTYPE_V8HI_V4SF_UQI:
12679 18709 : case V8SI_FTYPE_V8HF_V8SI_UQI:
12680 18709 : case V8SF_FTYPE_V8HF_V8SF_UQI:
12681 18709 : case V8SI_FTYPE_V8SF_V8SI_UQI:
12682 18709 : case V4SI_FTYPE_V4SF_V4SI_UQI:
12683 18709 : case V4SI_FTYPE_V8HF_V4SI_UQI:
12684 18709 : case V4SF_FTYPE_V8HF_V4SF_UQI:
12685 18709 : case V4DI_FTYPE_V8HF_V4DI_UQI:
12686 18709 : case V4DI_FTYPE_V4SF_V4DI_UQI:
12687 18709 : case V2DI_FTYPE_V8HF_V2DI_UQI:
12688 18709 : case V2DI_FTYPE_V4SF_V2DI_UQI:
12689 18709 : case V8HF_FTYPE_V8HF_V8HF_UQI:
12690 18709 : case V8HF_FTYPE_V8HF_V8HF_V8HF:
12691 18709 : case V8HF_FTYPE_V8HI_V8HF_UQI:
12692 18709 : case V8HF_FTYPE_V8SI_V8HF_UQI:
12693 18709 : case V8HF_FTYPE_V8SF_V8HF_UQI:
12694 18709 : case V8HF_FTYPE_V4SI_V8HF_UQI:
12695 18709 : case V8HF_FTYPE_V4SF_V8HF_UQI:
12696 18709 : case V8HF_FTYPE_V4DI_V8HF_UQI:
12697 18709 : case V8HF_FTYPE_V4DF_V8HF_UQI:
12698 18709 : case V8HF_FTYPE_V2DI_V8HF_UQI:
12699 18709 : case V8HF_FTYPE_V2DF_V8HF_UQI:
12700 18709 : case V4SF_FTYPE_V4DI_V4SF_UQI:
12701 18709 : case V4SF_FTYPE_V2DI_V4SF_UQI:
12702 18709 : case V4DF_FTYPE_V4DI_V4DF_UQI:
12703 18709 : case V4DF_FTYPE_V8HF_V4DF_UQI:
12704 18709 : case V2DF_FTYPE_V8HF_V2DF_UQI:
12705 18709 : case V2DF_FTYPE_V2DI_V2DF_UQI:
12706 18709 : case V16QI_FTYPE_V8HI_V16QI_UQI:
12707 18709 : case V16QI_FTYPE_V16HI_V16QI_UHI:
12708 18709 : case V16QI_FTYPE_V4SI_V16QI_UQI:
12709 18709 : case V16QI_FTYPE_V8SI_V16QI_UQI:
12710 18709 : case V8HI_FTYPE_V8HF_V8HI_UQI:
12711 18709 : case V8HI_FTYPE_V4SI_V8HI_UQI:
12712 18709 : case V8HI_FTYPE_V8SI_V8HI_UQI:
12713 18709 : case V16QI_FTYPE_V2DI_V16QI_UQI:
12714 18709 : case V16QI_FTYPE_V4DI_V16QI_UQI:
12715 18709 : case V8HI_FTYPE_V2DI_V8HI_UQI:
12716 18709 : case V8HI_FTYPE_V4DI_V8HI_UQI:
12717 18709 : case V4SI_FTYPE_V2DI_V4SI_UQI:
12718 18709 : case V4SI_FTYPE_V4DI_V4SI_UQI:
12719 18709 : case V32QI_FTYPE_V32HI_V32QI_USI:
12720 18709 : case UHI_FTYPE_V16QI_V16QI_UHI:
12721 18709 : case USI_FTYPE_V32QI_V32QI_USI:
12722 18709 : case UDI_FTYPE_V64QI_V64QI_UDI:
12723 18709 : case UQI_FTYPE_V8HI_V8HI_UQI:
12724 18709 : case UHI_FTYPE_V16HI_V16HI_UHI:
12725 18709 : case USI_FTYPE_V32HI_V32HI_USI:
12726 18709 : case UQI_FTYPE_V4SI_V4SI_UQI:
12727 18709 : case UQI_FTYPE_V8SI_V8SI_UQI:
12728 18709 : case UQI_FTYPE_V2DI_V2DI_UQI:
12729 18709 : case UQI_FTYPE_V4DI_V4DI_UQI:
12730 18709 : case V4SF_FTYPE_V2DF_V4SF_UQI:
12731 18709 : case V4SF_FTYPE_V4DF_V4SF_UQI:
12732 18709 : case V16SI_FTYPE_V16SI_V16SI_UHI:
12733 18709 : case V16SI_FTYPE_V4SI_V16SI_UHI:
12734 18709 : case V2DI_FTYPE_V4SI_V2DI_UQI:
12735 18709 : case V2DI_FTYPE_V8HI_V2DI_UQI:
12736 18709 : case V2DI_FTYPE_V16QI_V2DI_UQI:
12737 18709 : case V4DI_FTYPE_V4DI_V4DI_UQI:
12738 18709 : case V4DI_FTYPE_V4SI_V4DI_UQI:
12739 18709 : case V4DI_FTYPE_V8HI_V4DI_UQI:
12740 18709 : case V4DI_FTYPE_V16QI_V4DI_UQI:
12741 18709 : case V4DI_FTYPE_V4DF_V4DI_UQI:
12742 18709 : case V2DI_FTYPE_V2DF_V2DI_UQI:
12743 18709 : case V4SI_FTYPE_V4DF_V4SI_UQI:
12744 18709 : case V4SI_FTYPE_V2DF_V4SI_UQI:
12745 18709 : case V4SI_FTYPE_V8HI_V4SI_UQI:
12746 18709 : case V4SI_FTYPE_V16QI_V4SI_UQI:
12747 18709 : case V4DI_FTYPE_V4DI_V4DI_V4DI:
12748 18709 : case V8DF_FTYPE_V2DF_V8DF_UQI:
12749 18709 : case V8DF_FTYPE_V4DF_V8DF_UQI:
12750 18709 : case V8DF_FTYPE_V8DF_V8DF_UQI:
12751 18709 : case V8SF_FTYPE_V8SF_V8SF_UQI:
12752 18709 : case V8SF_FTYPE_V8SI_V8SF_UQI:
12753 18709 : case V4DF_FTYPE_V4DF_V4DF_UQI:
12754 18709 : case V4SF_FTYPE_V4SF_V4SF_UQI:
12755 18709 : case V2DF_FTYPE_V2DF_V2DF_UQI:
12756 18709 : case V2DF_FTYPE_V4SF_V2DF_UQI:
12757 18709 : case V2DF_FTYPE_V4SI_V2DF_UQI:
12758 18709 : case V4SF_FTYPE_V4SI_V4SF_UQI:
12759 18709 : case V4DF_FTYPE_V4SF_V4DF_UQI:
12760 18709 : case V4DF_FTYPE_V4SI_V4DF_UQI:
12761 18709 : case V8SI_FTYPE_V8SI_V8SI_UQI:
12762 18709 : case V8SI_FTYPE_V8HI_V8SI_UQI:
12763 18709 : case V8SI_FTYPE_V16QI_V8SI_UQI:
12764 18709 : case V8DF_FTYPE_V8SI_V8DF_UQI:
12765 18709 : case V8DI_FTYPE_DI_V8DI_UQI:
12766 18709 : case V16SF_FTYPE_V8SF_V16SF_UHI:
12767 18709 : case V16SI_FTYPE_V8SI_V16SI_UHI:
12768 18709 : case V16HF_FTYPE_V16HI_V16HF_UHI:
12769 18709 : case V16HF_FTYPE_V16HF_V16HF_V16HF:
12770 18709 : case V16HI_FTYPE_V16HF_V16HI_UHI:
12771 18709 : case V16HI_FTYPE_V16HI_V16HI_UHI:
12772 18709 : case V16BF_FTYPE_V16BF_V16BF_UHI:
12773 18709 : case V8HI_FTYPE_V16QI_V8HI_UQI:
12774 18709 : case V16HI_FTYPE_V16QI_V16HI_UHI:
12775 18709 : case V32HI_FTYPE_V32HI_V32HI_USI:
12776 18709 : case V32BF_FTYPE_V32BF_V32BF_USI:
12777 18709 : case V32HI_FTYPE_V32QI_V32HI_USI:
12778 18709 : case V8DI_FTYPE_V16QI_V8DI_UQI:
12779 18709 : case V8DI_FTYPE_V2DI_V8DI_UQI:
12780 18709 : case V8DI_FTYPE_V4DI_V8DI_UQI:
12781 18709 : case V8DI_FTYPE_V8DI_V8DI_UQI:
12782 18709 : case V8DI_FTYPE_V8HI_V8DI_UQI:
12783 18709 : case V8DI_FTYPE_V8SI_V8DI_UQI:
12784 18709 : case V8HI_FTYPE_V8DI_V8HI_UQI:
12785 18709 : case V8SI_FTYPE_V8DI_V8SI_UQI:
12786 18709 : case V4SI_FTYPE_V4SI_V4SI_V4SI:
12787 18709 : case V4DI_FTYPE_V4DI_V4DI_V2DI:
12788 18709 : case V16SI_FTYPE_V16SI_V16SI_V16SI:
12789 18709 : case V8DI_FTYPE_V8DI_V8DI_V8DI:
12790 18709 : case V32HI_FTYPE_V32HI_V32HI_V32HI:
12791 18709 : case V2DI_FTYPE_V2DI_V2DI_V2DI:
12792 18709 : case V16HI_FTYPE_V16HI_V16HI_V16HI:
12793 18709 : case V8SI_FTYPE_V8SI_V8SI_V8SI:
12794 18709 : case V8HI_FTYPE_V8HI_V8HI_V8HI:
12795 18709 : case V32BF_FTYPE_V16SF_V16SF_USI:
12796 18709 : case V16BF_FTYPE_V8SF_V8SF_UHI:
12797 18709 : case V8BF_FTYPE_V4SF_V4SF_UQI:
12798 18709 : case V16BF_FTYPE_V16SF_V16BF_UHI:
12799 18709 : case V8BF_FTYPE_V8SF_V8BF_UQI:
12800 18709 : case V8BF_FTYPE_V4SF_V8BF_UQI:
12801 18709 : case V16SF_FTYPE_V16SF_V32BF_V32BF:
12802 18709 : case V8SF_FTYPE_V8SF_V16BF_V16BF:
12803 18709 : case V4SF_FTYPE_V4SF_V8BF_V8BF:
12804 18709 : case V16QI_FTYPE_V16QI_V8HF_V8HF:
12805 18709 : case V32QI_FTYPE_V32QI_V16HF_V16HF:
12806 18709 : case V64QI_FTYPE_V64QI_V32HF_V32HF:
12807 18709 : case V16QI_FTYPE_V8HF_V16QI_UQI:
12808 18709 : case V16QI_FTYPE_V16HF_V16QI_UHI:
12809 18709 : case V32QI_FTYPE_V32HF_V32QI_USI:
12810 18709 : case V8HF_FTYPE_V16QI_V8HF_UQI:
12811 18709 : case V16HF_FTYPE_V16QI_V16HF_UHI:
12812 18709 : case V32HF_FTYPE_V32QI_V32HF_USI:
12813 18709 : case V16SI_FTYPE_V16SF_V16SI_UHI:
12814 18709 : case V32HI_FTYPE_V32HF_V32HI_USI:
12815 18709 : case V8DI_FTYPE_V8SF_V8DI_UQI:
12816 18709 : case V8DI_FTYPE_V8DF_V8DI_UQI:
12817 18709 : case V8SI_FTYPE_V8DF_V8SI_UQI:
12818 18709 : nargs = 3;
12819 18709 : break;
12820 1479 : case V32QI_FTYPE_V32QI_V32QI_INT:
12821 1479 : case V16HI_FTYPE_V16HI_V16HI_INT:
12822 1479 : case V16QI_FTYPE_V16QI_V16QI_INT:
12823 1479 : case V4DI_FTYPE_V4DI_V4DI_INT:
12824 1479 : case V8HI_FTYPE_V8HI_V8HI_INT:
12825 1479 : case V8SI_FTYPE_V8SI_V8SI_INT:
12826 1479 : case V8SI_FTYPE_V8SI_V4SI_INT:
12827 1479 : case V8SF_FTYPE_V8SF_V8SF_INT:
12828 1479 : case V8SF_FTYPE_V8SF_V4SF_INT:
12829 1479 : case V4SI_FTYPE_V4SI_V4SI_INT:
12830 1479 : case V4DF_FTYPE_V4DF_V4DF_INT:
12831 1479 : case V16SF_FTYPE_V16SF_V16SF_INT:
12832 1479 : case V16SF_FTYPE_V16SF_V4SF_INT:
12833 1479 : case V16SI_FTYPE_V16SI_V4SI_INT:
12834 1479 : case V4DF_FTYPE_V4DF_V2DF_INT:
12835 1479 : case V4SF_FTYPE_V4SF_V4SF_INT:
12836 1479 : case V2DI_FTYPE_V2DI_V2DI_INT:
12837 1479 : case V4DI_FTYPE_V4DI_V2DI_INT:
12838 1479 : case V2DF_FTYPE_V2DF_V2DF_INT:
12839 1479 : case UQI_FTYPE_V8DI_V8UDI_INT:
12840 1479 : case UQI_FTYPE_V8DF_V8DF_INT:
12841 1479 : case UQI_FTYPE_V2DF_V2DF_INT:
12842 1479 : case UQI_FTYPE_V4SF_V4SF_INT:
12843 1479 : case UHI_FTYPE_V16SI_V16SI_INT:
12844 1479 : case UHI_FTYPE_V16SF_V16SF_INT:
12845 1479 : case V64QI_FTYPE_V64QI_V64QI_INT:
12846 1479 : case V32HI_FTYPE_V32HI_V32HI_INT:
12847 1479 : case V16SI_FTYPE_V16SI_V16SI_INT:
12848 1479 : case V8DI_FTYPE_V8DI_V8DI_INT:
12849 1479 : nargs = 3;
12850 1479 : nargs_constant = 1;
12851 1479 : break;
12852 47 : case V4DI_FTYPE_V4DI_V4DI_INT_CONVERT:
12853 47 : nargs = 3;
12854 47 : rmode = V4DImode;
12855 47 : nargs_constant = 1;
12856 47 : break;
12857 80 : case V2DI_FTYPE_V2DI_V2DI_INT_CONVERT:
12858 80 : nargs = 3;
12859 80 : rmode = V2DImode;
12860 80 : nargs_constant = 1;
12861 80 : break;
12862 48 : case V1DI_FTYPE_V1DI_V1DI_INT_CONVERT:
12863 48 : nargs = 3;
12864 48 : rmode = DImode;
12865 48 : nargs_constant = 1;
12866 48 : break;
12867 20 : case V2DI_FTYPE_V2DI_UINT_UINT:
12868 20 : nargs = 3;
12869 20 : nargs_constant = 2;
12870 20 : break;
12871 8 : case V8DI_FTYPE_V8DI_V8DI_INT_CONVERT:
12872 8 : nargs = 3;
12873 8 : rmode = V8DImode;
12874 8 : nargs_constant = 1;
12875 8 : break;
12876 16 : case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UDI_CONVERT:
12877 16 : nargs = 5;
12878 16 : rmode = V8DImode;
12879 16 : mask_pos = 2;
12880 16 : nargs_constant = 1;
12881 16 : break;
12882 320 : case QI_FTYPE_V8DF_INT_UQI:
12883 320 : case QI_FTYPE_V4DF_INT_UQI:
12884 320 : case QI_FTYPE_V2DF_INT_UQI:
12885 320 : case HI_FTYPE_V16SF_INT_UHI:
12886 320 : case QI_FTYPE_V8SF_INT_UQI:
12887 320 : case QI_FTYPE_V4SF_INT_UQI:
12888 320 : case QI_FTYPE_V8HF_INT_UQI:
12889 320 : case HI_FTYPE_V16HF_INT_UHI:
12890 320 : case SI_FTYPE_V32HF_INT_USI:
12891 320 : case QI_FTYPE_V8BF_INT_UQI:
12892 320 : case HI_FTYPE_V16BF_INT_UHI:
12893 320 : case SI_FTYPE_V32BF_INT_USI:
12894 320 : case V4SI_FTYPE_V4SI_V4SI_UHI:
12895 320 : case V8SI_FTYPE_V8SI_V8SI_UHI:
12896 320 : nargs = 3;
12897 320 : mask_pos = 1;
12898 320 : nargs_constant = 1;
12899 320 : break;
12900 17 : case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_USI_CONVERT:
12901 17 : nargs = 5;
12902 17 : rmode = V4DImode;
12903 17 : mask_pos = 2;
12904 17 : nargs_constant = 1;
12905 17 : break;
12906 17 : case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UHI_CONVERT:
12907 17 : nargs = 5;
12908 17 : rmode = V2DImode;
12909 17 : mask_pos = 2;
12910 17 : nargs_constant = 1;
12911 17 : break;
12912 17242 : case V32QI_FTYPE_V32QI_V32QI_V32QI_USI:
12913 17242 : case V32HI_FTYPE_V32HI_V32HI_V32HI_USI:
12914 17242 : case V32BF_FTYPE_V32BF_V32BF_V32BF_USI:
12915 17242 : case V32HI_FTYPE_V64QI_V64QI_V32HI_USI:
12916 17242 : case V16SI_FTYPE_V32HI_V32HI_V16SI_UHI:
12917 17242 : case V64QI_FTYPE_V64QI_V64QI_V64QI_UDI:
12918 17242 : case V32HI_FTYPE_V32HI_V8HI_V32HI_USI:
12919 17242 : case V16HI_FTYPE_V16HI_V8HI_V16HI_UHI:
12920 17242 : case V8SI_FTYPE_V8SI_V4SI_V8SI_UQI:
12921 17242 : case V4DI_FTYPE_V4DI_V2DI_V4DI_UQI:
12922 17242 : case V64QI_FTYPE_V32HI_V32HI_V64QI_UDI:
12923 17242 : case V32QI_FTYPE_V16HI_V16HI_V32QI_USI:
12924 17242 : case V16QI_FTYPE_V8HI_V8HI_V16QI_UHI:
12925 17242 : case V32HI_FTYPE_V16SI_V16SI_V32HI_USI:
12926 17242 : case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
12927 17242 : case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
12928 17242 : case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
12929 17242 : case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
12930 17242 : case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
12931 17242 : case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
12932 17242 : case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
12933 17242 : case V2DI_FTYPE_V4SI_V4SI_V2DI_UQI:
12934 17242 : case V4DI_FTYPE_V8SI_V8SI_V4DI_UQI:
12935 17242 : case V4DF_FTYPE_V4DI_V4DF_V4DF_UQI:
12936 17242 : case V8SF_FTYPE_V8SI_V8SF_V8SF_UQI:
12937 17242 : case V2DF_FTYPE_V2DI_V2DF_V2DF_UQI:
12938 17242 : case V4SF_FTYPE_V4SI_V4SF_V4SF_UQI:
12939 17242 : case V16SF_FTYPE_V16SF_V16SF_V16SF_UHI:
12940 17242 : case V16SF_FTYPE_V16SF_V16SI_V16SF_UHI:
12941 17242 : case V16SF_FTYPE_V16SI_V16SF_V16SF_UHI:
12942 17242 : case V16SI_FTYPE_V16SI_V16SI_V16SI_UHI:
12943 17242 : case V16SI_FTYPE_V16SI_V4SI_V16SI_UHI:
12944 17242 : case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
12945 17242 : case V8BF_FTYPE_V8BF_V8BF_V8BF_UQI:
12946 17242 : case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
12947 17242 : case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
12948 17242 : case V16HF_FTYPE_V16HF_V16HF_V16HF_UQI:
12949 17242 : case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
12950 17242 : case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
12951 17242 : case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
12952 17242 : case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
12953 17242 : case V16BF_FTYPE_V16BF_V16BF_V16BF_UHI:
12954 17242 : case V2DI_FTYPE_V2DI_V2DI_V2DI_UQI:
12955 17242 : case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
12956 17242 : case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
12957 17242 : case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
12958 17242 : case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
12959 17242 : case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
12960 17242 : case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
12961 17242 : case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
12962 17242 : case V8DF_FTYPE_V8DI_V8DF_V8DF_UQI:
12963 17242 : case V8DI_FTYPE_V16SI_V16SI_V8DI_UQI:
12964 17242 : case V8DI_FTYPE_V8DI_V2DI_V8DI_UQI:
12965 17242 : case V8DI_FTYPE_V8DI_V8DI_V8DI_UQI:
12966 17242 : case V8HI_FTYPE_V16QI_V16QI_V8HI_UQI:
12967 17242 : case V16HI_FTYPE_V32QI_V32QI_V16HI_UHI:
12968 17242 : case V8SI_FTYPE_V16HI_V16HI_V8SI_UQI:
12969 17242 : case V4SI_FTYPE_V8HI_V8HI_V4SI_UQI:
12970 17242 : case V32BF_FTYPE_V16SF_V16SF_V32BF_USI:
12971 17242 : case V16BF_FTYPE_V8SF_V8SF_V16BF_UHI:
12972 17242 : case V8BF_FTYPE_V4SF_V4SF_V8BF_UQI:
12973 17242 : case V32HF_FTYPE_V16SF_V16SF_V32HF_USI:
12974 17242 : case V16HF_FTYPE_V8SF_V8SF_V16HF_UHI:
12975 17242 : case V8HF_FTYPE_V4SF_V4SF_V8HF_UQI:
12976 17242 : case V16QI_FTYPE_V8HF_V8HF_V16QI_UHI:
12977 17242 : case V32QI_FTYPE_V16HF_V16HF_V32QI_USI:
12978 17242 : case V64QI_FTYPE_V32HF_V32HF_V64QI_UDI:
12979 17242 : case V16QI_FTYPE_V16QI_V8HF_V16QI_UHI:
12980 17242 : case V16QI_FTYPE_V32QI_V16HF_V16QI_UHI:
12981 17242 : case V32QI_FTYPE_V64QI_V32HF_V32QI_USI:
12982 17242 : nargs = 4;
12983 17242 : break;
12984 11 : case V2DF_FTYPE_V2DF_V2DF_V2DI_INT:
12985 11 : case V4DF_FTYPE_V4DF_V4DF_V4DI_INT:
12986 11 : case V4SF_FTYPE_V4SF_V4SF_V4SI_INT:
12987 11 : case V8SF_FTYPE_V8SF_V8SF_V8SI_INT:
12988 11 : case V16SF_FTYPE_V16SF_V16SF_V16SI_INT:
12989 11 : case V4SI_FTYPE_V4SI_V4SI_V4SI_INT:
12990 11 : nargs = 4;
12991 11 : nargs_constant = 1;
12992 11 : break;
12993 3718 : case UQI_FTYPE_V4DI_V4DI_INT_UQI:
12994 3718 : case UQI_FTYPE_V8SI_V8SI_INT_UQI:
12995 3718 : case QI_FTYPE_V4DF_V4DF_INT_UQI:
12996 3718 : case QI_FTYPE_V8SF_V8SF_INT_UQI:
12997 3718 : case UHI_FTYPE_V16HF_V16HF_INT_UHI:
12998 3718 : case UQI_FTYPE_V2DI_V2DI_INT_UQI:
12999 3718 : case UQI_FTYPE_V4SI_V4SI_INT_UQI:
13000 3718 : case UQI_FTYPE_V2DF_V2DF_INT_UQI:
13001 3718 : case UQI_FTYPE_V4SF_V4SF_INT_UQI:
13002 3718 : case UQI_FTYPE_V8HF_V8HF_INT_UQI:
13003 3718 : case UDI_FTYPE_V64QI_V64QI_INT_UDI:
13004 3718 : case USI_FTYPE_V32QI_V32QI_INT_USI:
13005 3718 : case UHI_FTYPE_V16QI_V16QI_INT_UHI:
13006 3718 : case USI_FTYPE_V32HI_V32HI_INT_USI:
13007 3718 : case USI_FTYPE_V32BF_V32BF_INT_USI:
13008 3718 : case USI_FTYPE_V32HF_V32HF_INT_USI:
13009 3718 : case UHI_FTYPE_V16HI_V16HI_INT_UHI:
13010 3718 : case UHI_FTYPE_V16BF_V16BF_INT_UHI:
13011 3718 : case UQI_FTYPE_V8HI_V8HI_INT_UQI:
13012 3718 : case UQI_FTYPE_V8BF_V8BF_INT_UQI:
13013 3718 : nargs = 4;
13014 3718 : mask_pos = 1;
13015 3718 : nargs_constant = 1;
13016 3718 : break;
13017 23 : case V2DI_FTYPE_V2DI_V2DI_UINT_UINT:
13018 23 : nargs = 4;
13019 23 : nargs_constant = 2;
13020 23 : break;
13021 67 : case UCHAR_FTYPE_UCHAR_UINT_UINT_PUNSIGNED:
13022 67 : case UCHAR_FTYPE_UCHAR_ULONGLONG_ULONGLONG_PULONGLONG:
13023 67 : case V16SF_FTYPE_V16SF_V32BF_V32BF_UHI:
13024 67 : case V8SF_FTYPE_V8SF_V16BF_V16BF_UQI:
13025 67 : case V4SF_FTYPE_V4SF_V8BF_V8BF_UQI:
13026 67 : nargs = 4;
13027 67 : break;
13028 679 : case UQI_FTYPE_V8DI_V8DI_INT_UQI:
13029 679 : case UHI_FTYPE_V16SI_V16SI_INT_UHI:
13030 679 : mask_pos = 1;
13031 679 : nargs = 4;
13032 679 : nargs_constant = 1;
13033 679 : break;
13034 3948 : case V8SF_FTYPE_V8SF_INT_V8SF_UQI:
13035 3948 : case V4SF_FTYPE_V4SF_INT_V4SF_UQI:
13036 3948 : case V2DF_FTYPE_V4DF_INT_V2DF_UQI:
13037 3948 : case V2DI_FTYPE_V4DI_INT_V2DI_UQI:
13038 3948 : case V8SF_FTYPE_V16SF_INT_V8SF_UQI:
13039 3948 : case V8SI_FTYPE_V16SI_INT_V8SI_UQI:
13040 3948 : case V2DF_FTYPE_V8DF_INT_V2DF_UQI:
13041 3948 : case V2DI_FTYPE_V8DI_INT_V2DI_UQI:
13042 3948 : case V4SF_FTYPE_V8SF_INT_V4SF_UQI:
13043 3948 : case V4SI_FTYPE_V8SI_INT_V4SI_UQI:
13044 3948 : case V8HI_FTYPE_V8SF_INT_V8HI_UQI:
13045 3948 : case V8HI_FTYPE_V4SF_INT_V8HI_UQI:
13046 3948 : case V32HI_FTYPE_V32HI_INT_V32HI_USI:
13047 3948 : case V16HI_FTYPE_V16HI_INT_V16HI_UHI:
13048 3948 : case V8HI_FTYPE_V8HI_INT_V8HI_UQI:
13049 3948 : case V32BF_FTYPE_V32BF_INT_V32BF_USI:
13050 3948 : case V16BF_FTYPE_V16BF_INT_V16BF_UHI:
13051 3948 : case V8BF_FTYPE_V8BF_INT_V8BF_UQI:
13052 3948 : case V4DI_FTYPE_V4DI_INT_V4DI_UQI:
13053 3948 : case V2DI_FTYPE_V2DI_INT_V2DI_UQI:
13054 3948 : case V8SI_FTYPE_V8SI_INT_V8SI_UQI:
13055 3948 : case V4SI_FTYPE_V4SI_INT_V4SI_UQI:
13056 3948 : case V4DF_FTYPE_V4DF_INT_V4DF_UQI:
13057 3948 : case V2DF_FTYPE_V2DF_INT_V2DF_UQI:
13058 3948 : case V8DF_FTYPE_V8DF_INT_V8DF_UQI:
13059 3948 : case V16SF_FTYPE_V16SF_INT_V16SF_UHI:
13060 3948 : case V16HI_FTYPE_V16SF_INT_V16HI_UHI:
13061 3948 : case V16SI_FTYPE_V16SI_INT_V16SI_UHI:
13062 3948 : case V16HF_FTYPE_V16HF_INT_V16HF_UHI:
13063 3948 : case V8HF_FTYPE_V8HF_INT_V8HF_UQI:
13064 3948 : case V4SI_FTYPE_V16SI_INT_V4SI_UQI:
13065 3948 : case V4DI_FTYPE_V8DI_INT_V4DI_UQI:
13066 3948 : case V4DF_FTYPE_V8DF_INT_V4DF_UQI:
13067 3948 : case V4SF_FTYPE_V16SF_INT_V4SF_UQI:
13068 3948 : case V8DI_FTYPE_V8DI_INT_V8DI_UQI:
13069 3948 : nargs = 4;
13070 3948 : mask_pos = 2;
13071 3948 : nargs_constant = 1;
13072 3948 : break;
13073 1726 : case V16SF_FTYPE_V16SF_V4SF_INT_V16SF_UHI:
13074 1726 : case V16SI_FTYPE_V16SI_V4SI_INT_V16SI_UHI:
13075 1726 : case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI:
13076 1726 : case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_UQI:
13077 1726 : case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI:
13078 1726 : case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_UHI:
13079 1726 : case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI:
13080 1726 : case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI:
13081 1726 : case V8DF_FTYPE_V8DF_V4DF_INT_V8DF_UQI:
13082 1726 : case V8DI_FTYPE_V8DI_V4DI_INT_V8DI_UQI:
13083 1726 : case V4DF_FTYPE_V4DF_V4DF_INT_V4DF_UQI:
13084 1726 : case V8SF_FTYPE_V8SF_V8SF_INT_V8SF_UQI:
13085 1726 : case V8DF_FTYPE_V8DF_V2DF_INT_V8DF_UQI:
13086 1726 : case V8DI_FTYPE_V8DI_V2DI_INT_V8DI_UQI:
13087 1726 : case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_UQI:
13088 1726 : case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_UQI:
13089 1726 : case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_UQI:
13090 1726 : case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_UQI:
13091 1726 : case V32HI_FTYPE_V64QI_V64QI_INT_V32HI_USI:
13092 1726 : case V16HI_FTYPE_V32QI_V32QI_INT_V16HI_UHI:
13093 1726 : case V8HI_FTYPE_V16QI_V16QI_INT_V8HI_UQI:
13094 1726 : case V16SF_FTYPE_V16SF_V8SF_INT_V16SF_UHI:
13095 1726 : case V16SI_FTYPE_V16SI_V8SI_INT_V16SI_UHI:
13096 1726 : case V8SF_FTYPE_V8SF_V4SF_INT_V8SF_UQI:
13097 1726 : case V8SI_FTYPE_V8SI_V4SI_INT_V8SI_UQI:
13098 1726 : case V4DI_FTYPE_V4DI_V2DI_INT_V4DI_UQI:
13099 1726 : case V4DF_FTYPE_V4DF_V2DF_INT_V4DF_UQI:
13100 1726 : nargs = 5;
13101 1726 : mask_pos = 2;
13102 1726 : nargs_constant = 1;
13103 1726 : break;
13104 268 : case V8DI_FTYPE_V8DI_V8DI_V8DI_INT_UQI:
13105 268 : case V16SI_FTYPE_V16SI_V16SI_V16SI_INT_UHI:
13106 268 : case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_UQI:
13107 268 : case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_UQI:
13108 268 : case V8SF_FTYPE_V8SF_V8SF_V8SI_INT_UQI:
13109 268 : case V8SI_FTYPE_V8SI_V8SI_V8SI_INT_UQI:
13110 268 : case V4DF_FTYPE_V4DF_V4DF_V4DI_INT_UQI:
13111 268 : case V4DI_FTYPE_V4DI_V4DI_V4DI_INT_UQI:
13112 268 : case V4SI_FTYPE_V4SI_V4SI_V4SI_INT_UQI:
13113 268 : case V2DI_FTYPE_V2DI_V2DI_V2DI_INT_UQI:
13114 268 : nargs = 5;
13115 268 : mask_pos = 1;
13116 268 : nargs_constant = 1;
13117 268 : break;
13118 732 : case V64QI_FTYPE_V64QI_V64QI_INT_V64QI_UDI:
13119 732 : case V32QI_FTYPE_V32QI_V32QI_INT_V32QI_USI:
13120 732 : case V16QI_FTYPE_V16QI_V16QI_INT_V16QI_UHI:
13121 732 : case V32HI_FTYPE_V32HI_V32HI_INT_V32HI_INT:
13122 732 : case V16SI_FTYPE_V16SI_V16SI_INT_V16SI_INT:
13123 732 : case V8DI_FTYPE_V8DI_V8DI_INT_V8DI_INT:
13124 732 : case V16HI_FTYPE_V16HI_V16HI_INT_V16HI_INT:
13125 732 : case V8SI_FTYPE_V8SI_V8SI_INT_V8SI_INT:
13126 732 : case V4DI_FTYPE_V4DI_V4DI_INT_V4DI_INT:
13127 732 : case V8HI_FTYPE_V8HI_V8HI_INT_V8HI_INT:
13128 732 : case V4SI_FTYPE_V4SI_V4SI_INT_V4SI_INT:
13129 732 : case V2DI_FTYPE_V2DI_V2DI_INT_V2DI_INT:
13130 732 : case V8BF_FTYPE_V8BF_V8BF_INT_V8BF_UQI:
13131 732 : case V16BF_FTYPE_V16BF_V16BF_INT_V16BF_UHI:
13132 732 : case V32BF_FTYPE_V32BF_V32BF_INT_V32BF_USI:
13133 732 : case V16HF_FTYPE_V16HF_V16HF_INT_V16HF_UHI:
13134 732 : case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI:
13135 732 : nargs = 5;
13136 732 : mask_pos = 1;
13137 732 : nargs_constant = 2;
13138 732 : break;
13139 :
13140 0 : default:
13141 0 : gcc_unreachable ();
13142 : }
13143 :
13144 56117 : gcc_assert (nargs <= ARRAY_SIZE (xops));
13145 :
13146 60124 : if (comparison != UNKNOWN)
13147 : {
13148 614 : gcc_assert (nargs == 2);
13149 614 : return ix86_expand_sse_compare (d, exp, target, swap);
13150 : }
13151 :
13152 59510 : if (rmode == VOIDmode || rmode == tmode)
13153 : {
13154 59325 : if (optimize
13155 17715 : || target == 0
13156 17715 : || GET_MODE (target) != tmode
13157 76838 : || !insn_p->operand[0].predicate (target, tmode))
13158 41900 : target = gen_reg_rtx (tmode);
13159 17425 : else if (memory_operand (target, tmode))
13160 578 : num_memory++;
13161 : real_target = target;
13162 : }
13163 : else
13164 : {
13165 185 : real_target = gen_reg_rtx (tmode);
13166 185 : target = lowpart_subreg (rmode, real_target, tmode);
13167 : }
13168 :
13169 257967 : for (i = 0; i < nargs; i++)
13170 : {
13171 198690 : tree arg = CALL_EXPR_ARG (exp, i);
13172 198690 : rtx op = ix86_expand_unsigned_small_int_cst_argument (arg);
13173 198690 : machine_mode mode = insn_p->operand[i + 1].mode;
13174 : /* Need to fixup modeless constant before testing predicate. */
13175 198690 : op = fixup_modeless_constant (op, mode);
13176 198690 : bool match = insn_p->operand[i + 1].predicate (op, mode);
13177 :
13178 198690 : if (second_arg_count && i == 1)
13179 : {
13180 : /* SIMD shift insns take either an 8-bit immediate or
13181 : register as count. But builtin functions take int as
13182 : count. If count doesn't match, we put it in register.
13183 : The instructions are using 64-bit count, if op is just
13184 : 32-bit, zero-extend it, as negative shift counts
13185 : are undefined behavior and zero-extension is more
13186 : efficient. */
13187 2889 : if (!match)
13188 : {
13189 1750 : if (SCALAR_INT_MODE_P (GET_MODE (op)))
13190 489 : op = convert_modes (mode, GET_MODE (op), op, 1);
13191 : else
13192 1261 : op = lowpart_subreg (mode, op, GET_MODE (op));
13193 1750 : if (!insn_p->operand[i + 1].predicate (op, mode))
13194 190 : op = copy_to_reg (op);
13195 : }
13196 : }
13197 195801 : else if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
13198 147745 : (!mask_pos && (nargs - i) <= nargs_constant))
13199 : {
13200 16283 : if (!match)
13201 233 : switch (icode)
13202 : {
13203 2 : case CODE_FOR_avx_vinsertf128v4di:
13204 2 : case CODE_FOR_avx_vextractf128v4di:
13205 2 : error ("the last argument must be an 1-bit immediate");
13206 2 : return const0_rtx;
13207 :
13208 8 : case CODE_FOR_avx512f_cmpv8di3_mask:
13209 8 : case CODE_FOR_avx512f_cmpv16si3_mask:
13210 8 : case CODE_FOR_avx512f_ucmpv8di3_mask:
13211 8 : case CODE_FOR_avx512f_ucmpv16si3_mask:
13212 8 : case CODE_FOR_avx512vl_cmpv4di3_mask:
13213 8 : case CODE_FOR_avx512vl_cmpv8si3_mask:
13214 8 : case CODE_FOR_avx512vl_ucmpv4di3_mask:
13215 8 : case CODE_FOR_avx512vl_ucmpv8si3_mask:
13216 8 : case CODE_FOR_avx512vl_cmpv2di3_mask:
13217 8 : case CODE_FOR_avx512vl_cmpv4si3_mask:
13218 8 : case CODE_FOR_avx512vl_ucmpv2di3_mask:
13219 8 : case CODE_FOR_avx512vl_ucmpv4si3_mask:
13220 8 : error ("the last argument must be a 3-bit immediate");
13221 8 : return const0_rtx;
13222 :
13223 24 : case CODE_FOR_sse4_1_roundsd:
13224 24 : case CODE_FOR_sse4_1_roundss:
13225 :
13226 24 : case CODE_FOR_sse4_1_roundpd:
13227 24 : case CODE_FOR_sse4_1_roundps:
13228 24 : case CODE_FOR_avx_roundpd256:
13229 24 : case CODE_FOR_avx_roundps256:
13230 :
13231 24 : case CODE_FOR_sse4_1_roundpd_vec_pack_sfix:
13232 24 : case CODE_FOR_sse4_1_roundps_sfix:
13233 24 : case CODE_FOR_avx_roundpd_vec_pack_sfix256:
13234 24 : case CODE_FOR_avx_roundps_sfix256:
13235 :
13236 24 : case CODE_FOR_sse4_1_blendps:
13237 24 : case CODE_FOR_avx_blendpd256:
13238 24 : case CODE_FOR_avx_vpermilv4df:
13239 24 : case CODE_FOR_avx_vpermilv4df_mask:
13240 24 : case CODE_FOR_avx512f_getmantv8df_mask:
13241 24 : case CODE_FOR_avx512f_getmantv16sf_mask:
13242 24 : case CODE_FOR_avx512vl_getmantv16hf_mask:
13243 24 : case CODE_FOR_avx512vl_getmantv8sf_mask:
13244 24 : case CODE_FOR_avx512vl_getmantv4df_mask:
13245 24 : case CODE_FOR_avx512fp16_getmantv8hf_mask:
13246 24 : case CODE_FOR_avx512vl_getmantv4sf_mask:
13247 24 : case CODE_FOR_avx512vl_getmantv2df_mask:
13248 24 : case CODE_FOR_avx512dq_rangepv8df_mask_round:
13249 24 : case CODE_FOR_avx512dq_rangepv16sf_mask_round:
13250 24 : case CODE_FOR_avx512dq_rangepv4df_mask:
13251 24 : case CODE_FOR_avx512dq_rangepv8sf_mask:
13252 24 : case CODE_FOR_avx512dq_rangepv2df_mask:
13253 24 : case CODE_FOR_avx512dq_rangepv4sf_mask:
13254 24 : case CODE_FOR_avx_shufpd256_mask:
13255 24 : error ("the last argument must be a 4-bit immediate");
13256 24 : return const0_rtx;
13257 :
13258 15 : case CODE_FOR_sha1rnds4:
13259 15 : case CODE_FOR_sse4_1_blendpd:
13260 15 : case CODE_FOR_avx_vpermilv2df:
13261 15 : case CODE_FOR_avx_vpermilv2df_mask:
13262 15 : case CODE_FOR_xop_vpermil2v2df3:
13263 15 : case CODE_FOR_xop_vpermil2v4sf3:
13264 15 : case CODE_FOR_xop_vpermil2v4df3:
13265 15 : case CODE_FOR_xop_vpermil2v8sf3:
13266 15 : case CODE_FOR_avx512f_vinsertf32x4_mask:
13267 15 : case CODE_FOR_avx512f_vinserti32x4_mask:
13268 15 : case CODE_FOR_avx512f_vextractf32x4_mask:
13269 15 : case CODE_FOR_avx512f_vextracti32x4_mask:
13270 15 : case CODE_FOR_sse2_shufpd:
13271 15 : case CODE_FOR_sse2_shufpd_mask:
13272 15 : case CODE_FOR_avx512dq_shuf_f64x2_mask:
13273 15 : case CODE_FOR_avx512dq_shuf_i64x2_mask:
13274 15 : case CODE_FOR_avx512vl_shuf_i32x4_mask:
13275 15 : case CODE_FOR_avx512vl_shuf_f32x4_mask:
13276 15 : error ("the last argument must be a 2-bit immediate");
13277 15 : return const0_rtx;
13278 :
13279 30 : case CODE_FOR_avx_vextractf128v4df:
13280 30 : case CODE_FOR_avx_vextractf128v8sf:
13281 30 : case CODE_FOR_avx_vextractf128v8si:
13282 30 : case CODE_FOR_avx_vinsertf128v4df:
13283 30 : case CODE_FOR_avx_vinsertf128v8sf:
13284 30 : case CODE_FOR_avx_vinsertf128v8si:
13285 30 : case CODE_FOR_avx512f_vinsertf64x4_mask:
13286 30 : case CODE_FOR_avx512f_vinserti64x4_mask:
13287 30 : case CODE_FOR_avx512f_vextractf64x4_mask:
13288 30 : case CODE_FOR_avx512f_vextracti64x4_mask:
13289 30 : case CODE_FOR_avx512dq_vinsertf32x8_mask:
13290 30 : case CODE_FOR_avx512dq_vinserti32x8_mask:
13291 30 : case CODE_FOR_avx512vl_vinsertv4df:
13292 30 : case CODE_FOR_avx512vl_vinsertv4di:
13293 30 : case CODE_FOR_avx512vl_vinsertv8sf:
13294 30 : case CODE_FOR_avx512vl_vinsertv8si:
13295 30 : error ("the last argument must be a 1-bit immediate");
13296 30 : return const0_rtx;
13297 :
13298 16 : case CODE_FOR_avx_vmcmpv2df3:
13299 16 : case CODE_FOR_avx_vmcmpv4sf3:
13300 16 : case CODE_FOR_avx_cmpv2df3:
13301 16 : case CODE_FOR_avx_cmpv4sf3:
13302 16 : if (CONST_INT_P (op) && IN_RANGE (INTVAL (op), 8, 31))
13303 : {
13304 4 : error ("'%s' needs isa option %s", d->name, "-mavx");
13305 4 : return const0_rtx;
13306 : }
13307 : /* FALLTHRU */
13308 18 : case CODE_FOR_avx_cmpv4df3:
13309 18 : case CODE_FOR_avx_cmpv8sf3:
13310 18 : case CODE_FOR_avx512f_cmpv8df3_mask:
13311 18 : case CODE_FOR_avx512f_cmpv16sf3_mask:
13312 18 : case CODE_FOR_avx512f_vmcmpv2df3_mask:
13313 18 : case CODE_FOR_avx512f_vmcmpv4sf3_mask:
13314 18 : case CODE_FOR_avx512bw_cmpv32hf3_mask:
13315 18 : case CODE_FOR_avx512vl_cmpv16hf3_mask:
13316 18 : case CODE_FOR_avx512fp16_cmpv8hf3_mask:
13317 18 : error ("the last argument must be a 5-bit immediate");
13318 18 : return const0_rtx;
13319 :
13320 132 : default:
13321 132 : switch (nargs_constant)
13322 : {
13323 8 : case 2:
13324 8 : if ((mask_pos && (nargs - i - mask_pos) == nargs_constant) ||
13325 8 : (!mask_pos && (nargs - i) == nargs_constant))
13326 : {
13327 4 : error ("the next to last argument must be an 8-bit immediate");
13328 4 : break;
13329 : }
13330 : /* FALLTHRU */
13331 128 : case 1:
13332 128 : error ("the last argument must be an 8-bit immediate");
13333 128 : break;
13334 0 : default:
13335 0 : gcc_unreachable ();
13336 : }
13337 132 : return const0_rtx;
13338 : }
13339 : }
13340 : else
13341 : {
13342 179518 : if (VECTOR_MODE_P (mode))
13343 128929 : op = safe_vector_operand (op, mode);
13344 :
13345 : /* If we aren't optimizing, only allow one memory operand to
13346 : be generated. */
13347 179518 : if (memory_operand (op, mode))
13348 : {
13349 29824 : num_memory++;
13350 29824 : if (!optimize && num_memory > 1)
13351 13586 : op = copy_to_mode_reg (mode, op);
13352 : }
13353 :
13354 179518 : if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
13355 : {
13356 177365 : if (!match)
13357 42567 : op = copy_to_mode_reg (mode, op);
13358 : }
13359 : else
13360 : {
13361 2153 : op = copy_to_reg (op);
13362 2153 : op = lowpart_subreg (mode, op, GET_MODE (op));
13363 : }
13364 : }
13365 :
13366 198457 : xops[i] = op;
13367 : }
13368 :
13369 59277 : switch (nargs)
13370 : {
13371 3393 : case 1:
13372 3393 : pat = GEN_FCN (icode) (real_target, xops[0]);
13373 3393 : break;
13374 5481 : case 2:
13375 5481 : pat = GEN_FCN (icode) (real_target, xops[0], xops[1]);
13376 5481 : break;
13377 20621 : case 3:
13378 20621 : pat = GEN_FCN (icode) (real_target, xops[0], xops[1], xops[2]);
13379 20621 : break;
13380 27042 : case 4:
13381 27042 : pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
13382 27042 : xops[2], xops[3]);
13383 27042 : break;
13384 2740 : case 5:
13385 2740 : pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
13386 2740 : xops[2], xops[3], xops[4]);
13387 2740 : break;
13388 : case 6:
13389 : pat = GEN_FCN (icode) (real_target, xops[0], xops[1],
13390 : xops[2], xops[3], xops[4], xops[5]);
13391 : break;
13392 : default:
13393 : gcc_unreachable ();
13394 : }
13395 :
13396 59277 : if (! pat)
13397 : return 0;
13398 :
13399 59277 : emit_insn (pat);
13400 59277 : return target;
13401 : }
13402 :
13403 : /* Transform pattern of following layout:
13404 : (set A
13405 : (unspec [B C] UNSPEC_EMBEDDED_ROUNDING))
13406 : )
13407 : into:
13408 : (set (A B)) */
13409 :
13410 : static rtx
13411 4931 : ix86_erase_embedded_rounding (rtx pat)
13412 : {
13413 4931 : if (NONJUMP_INSN_P (pat))
13414 685 : pat = PATTERN (pat);
13415 :
13416 4931 : gcc_assert (GET_CODE (pat) == SET);
13417 4931 : rtx src = SET_SRC (pat);
13418 4931 : gcc_assert (XVECLEN (src, 0) == 2);
13419 4931 : rtx p0 = XVECEXP (src, 0, 0);
13420 4931 : gcc_assert (GET_CODE (src) == UNSPEC
13421 : && XINT (src, 1) == UNSPEC_EMBEDDED_ROUNDING);
13422 4931 : rtx res = gen_rtx_SET (SET_DEST (pat), p0);
13423 4931 : return res;
13424 : }
13425 :
/* Subroutine of ix86_expand_round_builtin to take care of comi insns
   with rounding.  D describes the builtin, EXP is the CALL_EXPR with
   four arguments: the two scalar-vector operands, a comparison
   predicate immediate (0..31, the _CMP_* values from avxintrin.h)
   and a rounding immediate.  TARGET is a suggested result register
   and COMX_OK says whether the AVX10.2 VCOMX/VUCOMX variants may be
   substituted.  Emits the comparison and returns the result rtx, or
   const0_rtx after diagnosing an invalid immediate.  */
static rtx
ix86_expand_sse_comi_round (const struct builtin_description *d,
			    tree exp, rtx target, bool comx_ok)
{
  rtx pat, set_dst;
  tree arg0 = CALL_EXPR_ARG (exp, 0);
  tree arg1 = CALL_EXPR_ARG (exp, 1);
  tree arg2 = CALL_EXPR_ARG (exp, 2);
  tree arg3 = CALL_EXPR_ARG (exp, 3);
  /* op0/op1: vector operands; op2: comparison predicate immediate;
     op3: rounding immediate.  */
  rtx op0 = expand_normal (arg0);
  rtx op1 = expand_normal (arg1);
  rtx op2 = expand_normal (arg2);
  rtx op3 = expand_normal (arg3);
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode mode0 = insn_p->operand[0].mode;
  machine_mode mode1 = insn_p->operand[1].mode;

  /* See avxintrin.h for values.  The three tables below decode the
     5-bit _CMP_* predicate: the rtx comparison code, whether the
     predicate is an "ordered" one, and whether it is the quiet
     (non-signaling) variant.  */
  static const enum rtx_code comparisons[32] =
    {
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED,
      EQ, LT, LE, UNORDERED, NE, UNGE, UNGT, ORDERED,
      UNEQ, UNLT, UNLE, UNORDERED, LTGT, GE, GT, ORDERED
    };
  static const bool ordereds[32] =
    {
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false,
      true, true, true, false, false, false, false, true,
      false, false, false, true, true, true, true, false
    };
  static const bool non_signalings[32] =
    {
      true, false, false, true, true, false, false, true,
      true, false, false, true, true, false, false, true,
      false, true, true, false, false, true, true, false,
      false, true, true, false, false, true, true, false
    };

  /* Validate the two immediates before doing any codegen.  */
  if (!CONST_INT_P (op2))
    {
      error ("the third argument must be comparison constant");
      return const0_rtx;
    }
  if (INTVAL (op2) < 0 || INTVAL (op2) >= 32)
    {
      error ("incorrect comparison mode");
      return const0_rtx;
    }

  if (!insn_p->operand[2].predicate (op3, SImode))
    {
      error ("incorrect rounding operand");
      return const0_rtx;
    }

  if (VECTOR_MODE_P (mode0))
    op0 = safe_vector_operand (op0, mode0);
  if (VECTOR_MODE_P (mode1))
    op1 = safe_vector_operand (op1, mode1);

  enum rtx_code comparison = comparisons[INTVAL (op2)];
  /* Keep the undecoded comparison around: the comx substitution below
     must distinguish real EQ/NE from the ORDERED/UNORDERED cases that
     are rewritten to NE/EQ in this switch.  */
  enum rtx_code orig_comp = comparison;
  bool ordered = ordereds[INTVAL (op2)];
  bool non_signaling = non_signalings[INTVAL (op2)];
  rtx const_val = const0_rtx;

  bool check_unordered = false;
  machine_mode mode = CCFPmode;
  /* Map the predicate onto a flags mode / rtx code the comi patterns
     can implement, adjusting ORDERED (comi vs ucomi choice) and
     possibly swapping the operands.  */
  switch (comparison)
    {
    case ORDERED:
      if (!ordered)
	{
	  if (TARGET_AVX10_2 && comx_ok)
	    {
	      /* Unlike VCOMI{SH,SS,SD}, VCOMX{SH,SS,SD} will set SF
		 differently. So directly return true here.  */
	      target = gen_reg_rtx (SImode);
	      emit_move_insn (target, const1_rtx);
	      return target;
	    }
	  else
	    {
	      /* NB: Use CCSmode/NE for _CMP_TRUE_UQ/_CMP_TRUE_US.  */
	      if (!non_signaling)
		ordered = true;
	      mode = CCSmode;
	    }
	}
      else
	{
	  /* NB: Use CCPmode/NE for _CMP_ORD_Q/_CMP_ORD_S.  */
	  if (non_signaling)
	    ordered = false;
	  mode = CCPmode;
	}
      comparison = NE;
      break;
    case UNORDERED:
      if (ordered)
	{
	  if (TARGET_AVX10_2 && comx_ok)
	    {
	      /* Unlike VCOMI{SH,SS,SD}, VCOMX{SH,SS,SD} will set SF
		 differently. So directly return false here.  */
	      target = gen_reg_rtx (SImode);
	      emit_move_insn (target, const0_rtx);
	      return target;
	    }
	  else
	    {
	      /* NB: Use CCSmode/EQ for _CMP_FALSE_OQ/_CMP_FALSE_OS.  */
	      if (non_signaling)
		ordered = false;
	      mode = CCSmode;
	    }
	}
      else
	{
	  /* NB: Use CCPmode/NE for _CMP_UNORD_Q/_CMP_UNORD_S.  */
	  if (!non_signaling)
	    ordered = true;
	  mode = CCPmode;
	}
      comparison = EQ;
      break;

    case LE:	/* -> GE  */
    case LT:	/* -> GT  */
    case UNGE:	/* -> UNLE  */
    case UNGT:	/* -> UNLT  */
      std::swap (op0, op1);
      comparison = swap_condition (comparison);
      /* FALLTHRU */
    case GT:
    case GE:
    case UNEQ:
    case UNLT:
    case UNLE:
    case LTGT:
      /* These are supported by CCFPmode.  NB: Use ordered/signaling
	 COMI or unordered/non-signaling UCOMI.  Both set ZF, PF, CF
	 with NAN operands.  */
      if (ordered == non_signaling)
	ordered = !ordered;
      break;
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
	 _CMP_EQ_OQ/_CMP_EQ_OS.
	 Under TARGET_AVX10_2, VCOMX/VUCOMX are always generated instead
	 of COMI/UCOMI, VCOMX/VUCOMX will not set ZF with NAN.  */
    case EQ:
      if (!TARGET_AVX10_2 || !comx_ok)
	check_unordered = true;
      mode = CCZmode;
      break;
    case NE:
      /* NB: COMI/UCOMI will set ZF with NAN operands.  Use CCZmode for
	 _CMP_NEQ_UQ/_CMP_NEQ_US.
	 Under TARGET_AVX10_2, VCOMX/VUCOMX are always generated instead
	 of COMI/UCOMI, VCOMX/VUCOMX will not set ZF with NAN.  */
      gcc_assert (!ordered);
      if (!TARGET_AVX10_2 || !comx_ok)
	check_unordered = true;
      mode = CCZmode;
      const_val = const1_rtx;
      break;
    default:
      gcc_unreachable ();
    }

  /* Preload an SImode register with the default value (CONST_VAL) and
     refer to its QImode low part below; the setcc helper at the end
     operates on that QImode subreg.  */
  target = gen_reg_rtx (SImode);
  emit_move_insn (target, const_val);
  target = gen_rtx_SUBREG (QImode, target, 0);

  /* Force the operands into registers when the insn predicates (or
     -O's preference for register operands) require it.  */
  if ((optimize && !register_operand (op0, mode0))
      || !insn_p->operand[0].predicate (op0, mode0))
    op0 = copy_to_mode_reg (mode0, op0);
  if ((optimize && !register_operand (op1, mode1))
      || !insn_p->operand[1].predicate (op1, mode1))
    op1 = copy_to_mode_reg (mode1, op1);

  /* Generate comx instead of comi when EQ/NE to avoid NAN checks.
     Use orig_comp to exclude ORDERED/UNORDERED cases.  */
  if ((orig_comp == EQ || orig_comp == NE)
      && TARGET_AVX10_2 && comx_ok)
    {
      switch (icode)
	{
	case CODE_FOR_avx512fp16_comi_round:
	  icode = CODE_FOR_avx10_2_comxhf_round;
	  break;
	case CODE_FOR_sse_comi_round:
	  icode = CODE_FOR_avx10_2_comxsf_round;
	  break;
	case CODE_FOR_sse2_comi_round:
	  icode = CODE_FOR_avx10_2_comxdf_round;
	  break;

	default:
	  break;
	}
    }

  /* Generate comi instead of comx when UNEQ/LTGT to avoid NAN checks.  */
  if ((comparison == UNEQ || comparison == LTGT)
      && TARGET_AVX10_2 && comx_ok)
    {
      switch (icode)
	{
	case CODE_FOR_avx10_2_comxhf_round:
	  icode = CODE_FOR_avx512fp16_comi_round;
	  break;
	case CODE_FOR_avx10_2_comxsf_round:
	  icode = CODE_FOR_sse_comi_round;
	  break;
	case CODE_FOR_avx10_2_comxdf_round:
	  icode = CODE_FOR_sse2_comi_round;
	  break;

	default:
	  break;
	}
    }

  /*
     1. COMI/VCOMX: ordered and signaling.
     2. UCOMI/VUCOMX: unordered and non-signaling.
   */
  if (non_signaling)
    switch (icode)
      {
      case CODE_FOR_sse_comi_round:
	icode = CODE_FOR_sse_ucomi_round;
	break;
      case CODE_FOR_sse2_comi_round:
	icode = CODE_FOR_sse2_ucomi_round;
	break;
      case CODE_FOR_avx512fp16_comi_round:
	icode = CODE_FOR_avx512fp16_ucomi_round;
	break;
      case CODE_FOR_avx10_2_comxsf_round:
	icode = CODE_FOR_avx10_2_ucomxsf_round;
	break;
      case CODE_FOR_avx10_2_comxhf_round:
	icode = CODE_FOR_avx10_2_ucomxhf_round;
	break;
      case CODE_FOR_avx10_2_comxdf_round:
	icode = CODE_FOR_avx10_2_ucomxdf_round;
	break;
      default:
	gcc_unreachable ();
      }

  pat = GEN_FCN (icode) (op0, op1, op3);
  if (! pat)
    return 0;

  /* Rounding operand can be either NO_ROUND or ROUND_SAE at this point.  */
  if (INTVAL (op3) == NO_ROUND)
    {
      /* Drop the redundant embedded-rounding unspec wrapper.  */
      pat = ix86_erase_embedded_rounding (pat);
      if (! pat)
	return 0;

      set_dst = SET_DEST (pat);
    }
  else
    {
      gcc_assert (GET_CODE (pat) == SET);
      set_dst = SET_DEST (pat);
    }

  emit_insn (pat);

  /* Emit the setcc sequence reading the flags register SET_DST just
     set above, including the extra unordered check when needed.  */
  return ix86_ssecom_setcc (comparison, check_unordered, mode,
			    set_dst, target);
}
/* Subroutine of ix86_expand_builtin to expand builtins with an embedded
   rounding / SAE immediate as their last argument.  D describes the
   builtin, EXP is the CALL_EXPR and TARGET a suggested result rtx (may
   be NULL or unusable).  Returns the result rtx, or const0_rtx after
   diagnosing an invalid immediate/rounding operand, or 0 when the insn
   pattern could not be generated.  */
static rtx
ix86_expand_round_builtin (const struct builtin_description *d,
			   tree exp, rtx target)
{
  rtx pat;
  unsigned int i, nargs;
  rtx xops[6];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  /* Position of the immediate operand, counted from the end of the
     argument list; 0 means there is no immediate (besides the trailing
     rounding operand).  */
  unsigned int nargs_constant = 0;
  /* Set when the rounding operand is NO_ROUND and the embedded-rounding
     wrapper must be stripped from the generated pattern.  */
  unsigned int redundant_embed_rnd = 0;

  /* Derive the argument count (and immediate position) from the
     builtin's function prototype.  */
  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case UINT64_FTYPE_V2DF_INT:
    case UINT64_FTYPE_V4SF_INT:
    case UINT64_FTYPE_V8HF_INT:
    case UINT_FTYPE_V2DF_INT:
    case UINT_FTYPE_V4SF_INT:
    case UINT_FTYPE_V8HF_INT:
    case INT64_FTYPE_V2DF_INT:
    case INT64_FTYPE_V4SF_INT:
    case INT64_FTYPE_V8HF_INT:
    case INT_FTYPE_V2DF_INT:
    case INT_FTYPE_V4SF_INT:
    case INT_FTYPE_V8HF_INT:
      nargs = 2;
      break;
    case V32HF_FTYPE_V32HF_V32HF_INT:
    case V8HF_FTYPE_V8HF_V8HF_INT:
    case V8HF_FTYPE_V8HF_INT_INT:
    case V8HF_FTYPE_V8HF_UINT_INT:
    case V8HF_FTYPE_V8HF_INT64_INT:
    case V8HF_FTYPE_V8HF_UINT64_INT:
    case V4SF_FTYPE_V4SF_UINT_INT:
    case V4SF_FTYPE_V4SF_UINT64_INT:
    case V2DF_FTYPE_V2DF_UINT64_INT:
    case V4SF_FTYPE_V4SF_INT_INT:
    case V4SF_FTYPE_V4SF_INT64_INT:
    case V2DF_FTYPE_V2DF_INT64_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V2DF_INT:
    case V2DF_FTYPE_V2DF_V4SF_INT:
      nargs = 3;
      break;
    case V8SF_FTYPE_V8DF_V8SF_QI_INT:
    case V8DF_FTYPE_V8DF_V8DF_QI_INT:
    case V32HI_FTYPE_V32HF_V32HI_USI_INT:
    case V32HI_FTYPE_V32BF_V32HI_USI_INT:
    case V8SI_FTYPE_V8DF_V8SI_QI_INT:
    case V8DI_FTYPE_V8HF_V8DI_UQI_INT:
    case V8DI_FTYPE_V8DF_V8DI_QI_INT:
    case V8SF_FTYPE_V8DI_V8SF_QI_INT:
    case V8DF_FTYPE_V8DI_V8DF_QI_INT:
    case V8DF_FTYPE_V8HF_V8DF_UQI_INT:
    case V16SF_FTYPE_V16HF_V16SF_UHI_INT:
    case V32HF_FTYPE_V32HI_V32HF_USI_INT:
    case V32HF_FTYPE_V32HF_V32HF_USI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_INT:
    case V16SF_FTYPE_V16SF_V16SF_HI_INT:
    case V8DI_FTYPE_V8SF_V8DI_QI_INT:
    case V16SF_FTYPE_V16SI_V16SF_HI_INT:
    case V16SI_FTYPE_V16SF_V16SI_HI_INT:
    case V16SI_FTYPE_V16SF_V16SI_UHI_INT:
    case V16SI_FTYPE_V16HF_V16SI_UHI_INT:
    case V16HF_FTYPE_V16SI_V16HF_UHI_INT:
    case V8DF_FTYPE_V8SF_V8DF_QI_INT:
    case V16SF_FTYPE_V16HI_V16SF_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_INT:
    case V8HF_FTYPE_V8DI_V8HF_UQI_INT:
    case V8HF_FTYPE_V8DF_V8HF_UQI_INT:
    case V16HF_FTYPE_V16SF_V16HF_UHI_INT:
    case V16HI_FTYPE_V16BF_V16HI_UHI_INT:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_INT:
      nargs = 4;
      break;
    case V4SF_FTYPE_V4SF_V4SF_INT_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_INT:
      nargs_constant = 2;
      nargs = 4;
      break;
    case INT_FTYPE_V4SF_V4SF_INT_INT:
    case INT_FTYPE_V2DF_V2DF_INT_INT:
      /* COMI-style comparisons are expanded separately.  */
      return ix86_expand_sse_comi_round (d, exp, target, true);
    case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
    case V4SF_FTYPE_V8HF_V4SF_V4SF_UQI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_UHI_INT:
    case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
    case V2DF_FTYPE_V8HF_V2DF_V2DF_UQI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
    case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
    case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
    case V8HF_FTYPE_V2DF_V8HF_V8HF_UQI_INT:
    case V8HF_FTYPE_V4SF_V8HF_V8HF_UQI_INT:
    case V32HF_FTYPE_V16SF_V16SF_V32HF_USI_INT:
      nargs = 5;
      break;
    case V32HF_FTYPE_V32HF_INT_V32HF_USI_INT:
    case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_QI_INT:
    case V8DF_FTYPE_V8DF_INT_V8DF_UQI_INT:
    case V16SF_FTYPE_V16SF_INT_V16SF_UHI_INT:
      nargs_constant = 4;
      nargs = 5;
      break;
    case UQI_FTYPE_V8DF_V8DF_INT_UQI_INT:
    case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
    case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
    case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
    case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
    case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
      nargs_constant = 3;
      nargs = 5;
      break;
    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_HI_INT:
    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_QI_INT:
    case V2DF_FTYPE_V2DF_V2DF_INT_V2DF_UQI_INT:
    case V4SF_FTYPE_V4SF_V4SF_INT_V4SF_UQI_INT:
    case V8HF_FTYPE_V8HF_V8HF_INT_V8HF_UQI_INT:
    case V8DF_FTYPE_V8DF_V8DF_INT_V8DF_UQI_INT:
    case V32HF_FTYPE_V32HF_V32HF_INT_V32HF_USI_INT:
    case V16SF_FTYPE_V16SF_V16SF_INT_V16SF_UHI_INT:
      nargs = 6;
      nargs_constant = 4;
      break;
    case V8DF_FTYPE_V8DF_V8DF_V8DI_INT_QI_INT:
    case V16SF_FTYPE_V16SF_V16SF_V16SI_INT_HI_INT:
    case V2DF_FTYPE_V2DF_V2DF_V2DI_INT_QI_INT:
    case V4SF_FTYPE_V4SF_V4SF_V4SI_INT_QI_INT:
      nargs = 6;
      nargs_constant = 3;
      break;
    default:
      gcc_unreachable ();
    }
  gcc_assert (nargs <= ARRAY_SIZE (xops));

  /* Use a fresh pseudo when TARGET is unusable for the insn.  */
  if (optimize
      || target == 0
      || GET_MODE (target) != tmode
      || !insn_p->operand[0].predicate (target, tmode))
    target = gen_reg_rtx (tmode);

  for (i = 0; i < nargs; i++)
    {
      tree arg = CALL_EXPR_ARG (exp, i);
      rtx op = ix86_expand_unsigned_small_int_cst_argument (arg);
      machine_mode mode = insn_p->operand[i + 1].mode;
      bool match = insn_p->operand[i + 1].predicate (op, mode);

      if (i == nargs - nargs_constant)
	{
	  /* The immediate operand: diagnose out-of-range values with a
	     message matched to the insn's immediate width.  */
	  if (!match)
	    {
	      switch (icode)
		{
		case CODE_FOR_avx512f_getmantv8df_mask_round:
		case CODE_FOR_avx512f_getmantv16sf_mask_round:
		case CODE_FOR_avx512bw_getmantv32hf_mask_round:
		case CODE_FOR_avx512f_vgetmantv2df_round:
		case CODE_FOR_avx512f_vgetmantv2df_mask_round:
		case CODE_FOR_avx512f_vgetmantv4sf_round:
		case CODE_FOR_avx512f_vgetmantv4sf_mask_round:
		case CODE_FOR_avx512f_vgetmantv8hf_mask_round:
		  error ("the immediate argument must be a 4-bit immediate");
		  return const0_rtx;
		case CODE_FOR_avx512f_cmpv8df3_mask_round:
		case CODE_FOR_avx512f_cmpv16sf3_mask_round:
		case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
		case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
		case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
		case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
		  error ("the immediate argument must be a 5-bit immediate");
		  return const0_rtx;
		default:
		  error ("the immediate argument must be an 8-bit immediate");
		  return const0_rtx;
		}
	    }
	}
      else if (i == nargs-1)
	{
	  /* The last operand is the rounding-control immediate.  */
	  if (!insn_p->operand[nargs].predicate (op, SImode))
	    {
	      error ("incorrect rounding operand");
	      return const0_rtx;
	    }

	  /* If there is no rounding use normal version of the pattern.  */
	  if (INTVAL (op) == NO_ROUND)
	    {
	      /* Skip erasing embedded rounding for below expanders who
		 generates multiple insns.  In ix86_erase_embedded_rounding
		 the pattern will be transformed to a single set, and emit_insn
		 appends the set instead of insert it to chain.  So the insns
		 emitted inside define_expander would be ignored.  */
	      switch (icode)
		{
		case CODE_FOR_avx512bw_fmaddc_v32hf_mask1_round:
		case CODE_FOR_avx512bw_fcmaddc_v32hf_mask1_round:
		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask1_round:
		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask1_round:
		case CODE_FOR_avx512fp16_fmaddcsh_v8hf_mask3_round:
		case CODE_FOR_avx512fp16_fcmaddcsh_v8hf_mask3_round:
		  redundant_embed_rnd = 0;
		  break;
		default:
		  redundant_embed_rnd = 1;
		  break;
		}
	    }
	}
      else
	{
	  /* Ordinary register operand: legitimize and load into a
	     register of the expected mode.  */
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  op = fixup_modeless_constant (op, mode);

	  if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    {
	      if (optimize || !match)
		op = copy_to_mode_reg (mode, op);
	    }
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      xops[i] = op;
    }

  /* Generate the insn with the operand count we computed above.  */
  switch (nargs)
    {
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3]);
      break;
    case 5:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3], xops[4]);
      break;
    case 6:
      pat = GEN_FCN (icode) (target, xops[0], xops[1],
			     xops[2], xops[3], xops[4], xops[5]);
      break;
    default:
      gcc_unreachable ();
    }

  if (!pat)
    return 0;

  /* Strip the (redundant) embedded-rounding wrapper when NO_ROUND was
     requested, so the normal form of the insn is emitted.  */
  if (redundant_embed_rnd)
    pat = ix86_erase_embedded_rounding (pat);

  emit_insn (pat);
  return target;
}
13991 :
13992 : /* Subroutine of ix86_expand_builtin to take care of special insns
13993 : with variable number of operands. */
13994 :
static rtx
ix86_expand_special_args_builtin (const struct builtin_description *d,
				  tree exp, rtx target)
{
  tree arg;
  rtx pat, op;
  unsigned int i, nargs, arg_adjust, memory;
  /* Index of a constant (immediate) operand; 100 means "none".  */
  unsigned int constant = 100;
  bool aligned_mem = false;
  rtx xops[4];
  enum insn_code icode = d->icode;
  const struct insn_data_d *insn_p = &insn_data[icode];
  machine_mode tmode = insn_p->operand[0].mode;
  /* Whether the builtin reads from (load) or writes to (store) memory
     or a fixed resource; stores return 0 instead of TARGET.  */
  enum { load, store } klass;

  /* Classify the builtin by its prototype: argument count, load/store
     direction, index of the memory operand, and whether the insn
     requires strict alignment of that memory.  */
  switch ((enum ix86_builtin_func_type) d->flag)
    {
    case VOID_FTYPE_VOID:
      emit_insn (GEN_FCN (icode) (target));
      return 0;
    case VOID_FTYPE_UINT64:
    case VOID_FTYPE_UNSIGNED:
      nargs = 0;
      klass = store;
      memory = 0;
      break;

    case INT_FTYPE_VOID:
    case USHORT_FTYPE_VOID:
    case UINT64_FTYPE_VOID:
    case UINT_FTYPE_VOID:
    case UINT8_FTYPE_VOID:
    case UNSIGNED_FTYPE_VOID:
      nargs = 0;
      klass = load;
      memory = 0;
      break;
    case CHAR_FTYPE_PCCHAR:
    case SHORT_FTYPE_PCSHORT:
    case INT_FTYPE_PCINT:
    case INT64_FTYPE_PCINT64:
    case UINT64_FTYPE_PUNSIGNED:
    case V2DI_FTYPE_PV2DI:
    case V4DI_FTYPE_PV4DI:
    case V32QI_FTYPE_PCCHAR:
    case V16QI_FTYPE_PCCHAR:
    case V8SF_FTYPE_PCV4SF:
    case V8SF_FTYPE_PCFLOAT:
    case V4SF_FTYPE_PCFLOAT:
    case V4SF_FTYPE_PCFLOAT16:
    case V4SF_FTYPE_PCBFLOAT16:
    case V4SF_FTYPE_PCV8BF:
    case V4SF_FTYPE_PCV8HF:
    case V8SF_FTYPE_PCFLOAT16:
    case V8SF_FTYPE_PCBFLOAT16:
    case V8SF_FTYPE_PCV16HF:
    case V8SF_FTYPE_PCV16BF:
    case V4DF_FTYPE_PCV2DF:
    case V4DF_FTYPE_PCDOUBLE:
    case V2DF_FTYPE_PCDOUBLE:
    case VOID_FTYPE_PVOID:
    case V8DI_FTYPE_PV8DI:
      nargs = 1;
      klass = load;
      memory = 0;
      switch (icode)
	{
	/* Non-temporal loads require aligned memory.  */
	case CODE_FOR_sse4_1_movntdqa:
	case CODE_FOR_avx2_movntdqa:
	case CODE_FOR_avx512f_movntdqa:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      break;
    case VOID_FTYPE_PV2SF_V4SF:
    case VOID_FTYPE_PV8DI_V8DI:
    case VOID_FTYPE_PV4DI_V4DI:
    case VOID_FTYPE_PV2DI_V2DI:
    case VOID_FTYPE_PCHAR_V32QI:
    case VOID_FTYPE_PCHAR_V16QI:
    case VOID_FTYPE_PFLOAT_V16SF:
    case VOID_FTYPE_PFLOAT_V8SF:
    case VOID_FTYPE_PFLOAT_V4SF:
    case VOID_FTYPE_PDOUBLE_V8DF:
    case VOID_FTYPE_PDOUBLE_V4DF:
    case VOID_FTYPE_PDOUBLE_V2DF:
    case VOID_FTYPE_PLONGLONG_LONGLONG:
    case VOID_FTYPE_PULONGLONG_ULONGLONG:
    case VOID_FTYPE_PUNSIGNED_UNSIGNED:
    case VOID_FTYPE_PINT_INT:
      nargs = 1;
      klass = store;
      /* Reserve memory operand for target.  */
      memory = ARRAY_SIZE (xops);
      switch (icode)
	{
	/* These builtins and instructions require the memory
	   to be properly aligned.  */
	case CODE_FOR_avx_movntv4di:
	case CODE_FOR_sse2_movntv2di:
	case CODE_FOR_avx_movntv8sf:
	case CODE_FOR_sse_movntv4sf:
	case CODE_FOR_sse4a_vmmovntv4sf:
	case CODE_FOR_avx_movntv4df:
	case CODE_FOR_sse2_movntv2df:
	case CODE_FOR_sse4a_vmmovntv2df:
	case CODE_FOR_sse2_movntidi:
	case CODE_FOR_sse_movntq:
	case CODE_FOR_sse2_movntisi:
	case CODE_FOR_avx512f_movntv16sf:
	case CODE_FOR_avx512f_movntv8df:
	case CODE_FOR_avx512f_movntv8di:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      break;
    case VOID_FTYPE_PVOID_PCVOID:
      nargs = 1;
      klass = store;
      memory = 0;

      break;
    case V4SF_FTYPE_V4SF_PCV2SF:
    case V2DF_FTYPE_V2DF_PCDOUBLE:
      nargs = 2;
      klass = load;
      memory = 1;
      break;
    case V8SF_FTYPE_PCV8SF_V8SI:
    case V4DF_FTYPE_PCV4DF_V4DI:
    case V4SF_FTYPE_PCV4SF_V4SI:
    case V2DF_FTYPE_PCV2DF_V2DI:
    case V8SI_FTYPE_PCV8SI_V8SI:
    case V4DI_FTYPE_PCV4DI_V4DI:
    case V4SI_FTYPE_PCV4SI_V4SI:
    case V2DI_FTYPE_PCV2DI_V2DI:
    case VOID_FTYPE_INT_INT64:
      nargs = 2;
      klass = load;
      memory = 0;
      break;
    case VOID_FTYPE_PV8DF_V8DF_UQI:
    case VOID_FTYPE_PV4DF_V4DF_UQI:
    case VOID_FTYPE_PV2DF_V2DF_UQI:
    case VOID_FTYPE_PV16SF_V16SF_UHI:
    case VOID_FTYPE_PV8SF_V8SF_UQI:
    case VOID_FTYPE_PV4SF_V4SF_UQI:
    case VOID_FTYPE_PV8DI_V8DI_UQI:
    case VOID_FTYPE_PV4DI_V4DI_UQI:
    case VOID_FTYPE_PV2DI_V2DI_UQI:
    case VOID_FTYPE_PV16SI_V16SI_UHI:
    case VOID_FTYPE_PV8SI_V8SI_UQI:
    case VOID_FTYPE_PV4SI_V4SI_UQI:
    case VOID_FTYPE_PV64QI_V64QI_UDI:
    case VOID_FTYPE_PV32HI_V32HI_USI:
    case VOID_FTYPE_PV32QI_V32QI_USI:
    case VOID_FTYPE_PV16QI_V16QI_UHI:
    case VOID_FTYPE_PV16HI_V16HI_UHI:
    case VOID_FTYPE_PV8HI_V8HI_UQI:
      switch (icode)
	{
	/* These builtins and instructions require the memory
	   to be properly aligned.  */
	case CODE_FOR_avx512f_storev16sf_mask:
	case CODE_FOR_avx512f_storev16si_mask:
	case CODE_FOR_avx512f_storev8df_mask:
	case CODE_FOR_avx512f_storev8di_mask:
	case CODE_FOR_avx512vl_storev8sf_mask:
	case CODE_FOR_avx512vl_storev8si_mask:
	case CODE_FOR_avx512vl_storev4df_mask:
	case CODE_FOR_avx512vl_storev4di_mask:
	case CODE_FOR_avx512vl_storev4sf_mask:
	case CODE_FOR_avx512vl_storev4si_mask:
	case CODE_FOR_avx512vl_storev2df_mask:
	case CODE_FOR_avx512vl_storev2di_mask:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      /* FALLTHRU */
    case VOID_FTYPE_PV8SF_V8SI_V8SF:
    case VOID_FTYPE_PV4DF_V4DI_V4DF:
    case VOID_FTYPE_PV4SF_V4SI_V4SF:
    case VOID_FTYPE_PV2DF_V2DI_V2DF:
    case VOID_FTYPE_PV8SI_V8SI_V8SI:
    case VOID_FTYPE_PV4DI_V4DI_V4DI:
    case VOID_FTYPE_PV4SI_V4SI_V4SI:
    case VOID_FTYPE_PV2DI_V2DI_V2DI:
    case VOID_FTYPE_PV8SI_V8DI_UQI:
    case VOID_FTYPE_PV8HI_V8DI_UQI:
    case VOID_FTYPE_PV16HI_V16SI_UHI:
    case VOID_FTYPE_PUDI_V8DI_UQI:
    case VOID_FTYPE_PV16QI_V16SI_UHI:
    case VOID_FTYPE_PV4SI_V4DI_UQI:
    case VOID_FTYPE_PUDI_V2DI_UQI:
    case VOID_FTYPE_PUDI_V4DI_UQI:
    case VOID_FTYPE_PUSI_V2DI_UQI:
    case VOID_FTYPE_PV8HI_V8SI_UQI:
    case VOID_FTYPE_PUDI_V4SI_UQI:
    case VOID_FTYPE_PUSI_V4DI_UQI:
    case VOID_FTYPE_PUHI_V2DI_UQI:
    case VOID_FTYPE_PUDI_V8SI_UQI:
    case VOID_FTYPE_PUSI_V4SI_UQI:
    case VOID_FTYPE_PCHAR_V64QI_UDI:
    case VOID_FTYPE_PCHAR_V32QI_USI:
    case VOID_FTYPE_PCHAR_V16QI_UHI:
    case VOID_FTYPE_PSHORT_V32HI_USI:
    case VOID_FTYPE_PSHORT_V16HI_UHI:
    case VOID_FTYPE_PSHORT_V8HI_UQI:
    case VOID_FTYPE_PINT_V16SI_UHI:
    case VOID_FTYPE_PINT_V8SI_UQI:
    case VOID_FTYPE_PINT_V4SI_UQI:
    case VOID_FTYPE_PINT64_V8DI_UQI:
    case VOID_FTYPE_PINT64_V4DI_UQI:
    case VOID_FTYPE_PINT64_V2DI_UQI:
    case VOID_FTYPE_PDOUBLE_V8DF_UQI:
    case VOID_FTYPE_PDOUBLE_V4DF_UQI:
    case VOID_FTYPE_PDOUBLE_V2DF_UQI:
    case VOID_FTYPE_PFLOAT_V16SF_UHI:
    case VOID_FTYPE_PFLOAT_V8SF_UQI:
    case VOID_FTYPE_PFLOAT_V4SF_UQI:
    case VOID_FTYPE_PCFLOAT16_V8HF_UQI:
    case VOID_FTYPE_PV32QI_V32HI_USI:
    case VOID_FTYPE_PV16QI_V16HI_UHI:
    case VOID_FTYPE_PUDI_V8HI_UQI:
      nargs = 2;
      klass = store;
      /* Reserve memory operand for target.  */
      memory = ARRAY_SIZE (xops);
      break;
    case V4SF_FTYPE_PCV4SF_V4SF_UQI:
    case V8SF_FTYPE_PCV8SF_V8SF_UQI:
    case V16SF_FTYPE_PCV16SF_V16SF_UHI:
    case V4SI_FTYPE_PCV4SI_V4SI_UQI:
    case V8SI_FTYPE_PCV8SI_V8SI_UQI:
    case V16SI_FTYPE_PCV16SI_V16SI_UHI:
    case V2DF_FTYPE_PCV2DF_V2DF_UQI:
    case V4DF_FTYPE_PCV4DF_V4DF_UQI:
    case V8DF_FTYPE_PCV8DF_V8DF_UQI:
    case V2DI_FTYPE_PCV2DI_V2DI_UQI:
    case V4DI_FTYPE_PCV4DI_V4DI_UQI:
    case V8DI_FTYPE_PCV8DI_V8DI_UQI:
    case V64QI_FTYPE_PCV64QI_V64QI_UDI:
    case V32HI_FTYPE_PCV32HI_V32HI_USI:
    case V32QI_FTYPE_PCV32QI_V32QI_USI:
    case V16QI_FTYPE_PCV16QI_V16QI_UHI:
    case V16HI_FTYPE_PCV16HI_V16HI_UHI:
    case V8HI_FTYPE_PCV8HI_V8HI_UQI:
      switch (icode)
	{
	/* These builtins and instructions require the memory
	   to be properly aligned.  */
	case CODE_FOR_avx512f_loadv16sf_mask:
	case CODE_FOR_avx512f_loadv16si_mask:
	case CODE_FOR_avx512f_loadv8df_mask:
	case CODE_FOR_avx512f_loadv8di_mask:
	case CODE_FOR_avx512vl_loadv8sf_mask:
	case CODE_FOR_avx512vl_loadv8si_mask:
	case CODE_FOR_avx512vl_loadv4df_mask:
	case CODE_FOR_avx512vl_loadv4di_mask:
	case CODE_FOR_avx512vl_loadv4sf_mask:
	case CODE_FOR_avx512vl_loadv4si_mask:
	case CODE_FOR_avx512vl_loadv2df_mask:
	case CODE_FOR_avx512vl_loadv2di_mask:
	case CODE_FOR_avx512bw_loadv64qi_mask:
	case CODE_FOR_avx512vl_loadv32qi_mask:
	case CODE_FOR_avx512vl_loadv16qi_mask:
	case CODE_FOR_avx512bw_loadv32hi_mask:
	case CODE_FOR_avx512vl_loadv16hi_mask:
	case CODE_FOR_avx512vl_loadv8hi_mask:
	  aligned_mem = true;
	  break;
	default:
	  break;
	}
      /* FALLTHRU */
    case V64QI_FTYPE_PCCHAR_V64QI_UDI:
    case V32QI_FTYPE_PCCHAR_V32QI_USI:
    case V16QI_FTYPE_PCCHAR_V16QI_UHI:
    case V32HI_FTYPE_PCSHORT_V32HI_USI:
    case V16HI_FTYPE_PCSHORT_V16HI_UHI:
    case V8HI_FTYPE_PCSHORT_V8HI_UQI:
    case V16SI_FTYPE_PCINT_V16SI_UHI:
    case V8SI_FTYPE_PCINT_V8SI_UQI:
    case V4SI_FTYPE_PCINT_V4SI_UQI:
    case V8DI_FTYPE_PCINT64_V8DI_UQI:
    case V4DI_FTYPE_PCINT64_V4DI_UQI:
    case V2DI_FTYPE_PCINT64_V2DI_UQI:
    case V8DF_FTYPE_PCDOUBLE_V8DF_UQI:
    case V4DF_FTYPE_PCDOUBLE_V4DF_UQI:
    case V2DF_FTYPE_PCDOUBLE_V2DF_UQI:
    case V16SF_FTYPE_PCFLOAT_V16SF_UHI:
    case V8SF_FTYPE_PCFLOAT_V8SF_UQI:
    case V4SF_FTYPE_PCFLOAT_V4SF_UQI:
    case V8HF_FTYPE_PCFLOAT16_V8HF_UQI:
      nargs = 3;
      klass = load;
      memory = 0;
      break;
    case INT_FTYPE_PINT_INT_INT_INT:
    case LONGLONG_FTYPE_PLONGLONG_LONGLONG_LONGLONG_INT:
      nargs = 4;
      klass = load;
      memory = 0;
      constant = 3;
      break;
    default:
      gcc_unreachable ();
    }

  gcc_assert (nargs <= ARRAY_SIZE (xops));

  if (klass == store)
    {
      /* For stores the first call argument is the destination, which
	 becomes the insn's "target" operand (a MEM or a register).  */
      arg = CALL_EXPR_ARG (exp, 0);
      op = expand_normal (arg);
      gcc_assert (target == 0);
      if (memory)
	{
	  op = ix86_zero_extend_to_Pmode (op);
	  target = gen_rtx_MEM (tmode, op);
	  /* target at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
	  unsigned int align = get_pointer_alignment (arg);
	  if (aligned_mem && align < GET_MODE_ALIGNMENT (tmode))
	    align = GET_MODE_ALIGNMENT (tmode);
	  if (MEM_ALIGN (target) < align)
	    set_mem_align (target, align);
	}
      else
	target = force_reg (tmode, op);
      arg_adjust = 1;
    }
  else
    {
      arg_adjust = 0;
      if (optimize
	  || target == 0
	  || !register_operand (target, tmode)
	  || GET_MODE (target) != tmode)
	target = gen_reg_rtx (tmode);
    }

  for (i = 0; i < nargs; i++)
    {
      machine_mode mode = insn_p->operand[i + 1].mode;

      arg = CALL_EXPR_ARG (exp, i + arg_adjust);
      op = ix86_expand_unsigned_small_int_cst_argument (arg);

      if (i == memory)
	{
	  /* This must be the memory operand.  */
	  op = ix86_zero_extend_to_Pmode (op);
	  op = gen_rtx_MEM (mode, op);
	  /* op at this point has just BITS_PER_UNIT MEM_ALIGN
	     on it.  Try to improve it using get_pointer_alignment,
	     and if the special builtin is one that requires strict
	     mode alignment, also from its GET_MODE_ALIGNMENT.
	     Failure to do so could lead to ix86_legitimate_combined_insn
	     rejecting all changes to such insns.  */
	  unsigned int align = get_pointer_alignment (arg);
	  if (aligned_mem && align < GET_MODE_ALIGNMENT (mode))
	    align = GET_MODE_ALIGNMENT (mode);
	  if (MEM_ALIGN (op) < align)
	    set_mem_align (op, align);
	}
      else if (i == constant)
	{
	  /* This must be the constant.  */
	  if (!insn_p->operand[nargs].predicate(op, SImode))
	    {
	      error ("the fourth argument must be one of enum %qs", "_CMPCCX_ENUM");
	      return const0_rtx;
	    }
	}
      else
	{
	  /* This must be register.  */
	  if (VECTOR_MODE_P (mode))
	    op = safe_vector_operand (op, mode);

	  op = fixup_modeless_constant (op, mode);

	  /* NB: 3-operands load implied it's a mask load or v{p}expand*,
	     and that mask operand should be at the end.
	     Keep all-ones mask which would be simplified by the expander.  */
	  if (nargs == 3 && i == 2 && klass == load
	      && constm1_operand (op, mode)
	      && insn_p->operand[i].predicate (op, mode))
	    ;
	  else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
	    op = copy_to_mode_reg (mode, op);
	  else
	    {
	      op = copy_to_reg (op);
	      op = lowpart_subreg (mode, op, GET_MODE (op));
	    }
	}

      xops[i]= op;
    }

  /* Generate the insn with the operand count we computed above.  */
  switch (nargs)
    {
    case 0:
      pat = GEN_FCN (icode) (target);
      break;
    case 1:
      pat = GEN_FCN (icode) (target, xops[0]);
      break;
    case 2:
      pat = GEN_FCN (icode) (target, xops[0], xops[1]);
      break;
    case 3:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2]);
      break;
    case 4:
      pat = GEN_FCN (icode) (target, xops[0], xops[1], xops[2], xops[3]);
      break;
    default:
      gcc_unreachable ();
    }

  if (! pat)
    return 0;

  emit_insn (pat);
  /* Stores have no value; loads return the (possibly fresh) TARGET.  */
  return klass == store ? 0 : target;
}
14434 :
14435 : /* Return the integer constant in ARG. Constrain it to be in the range
14436 : of the subparts of VEC_TYPE; issue an error if not. */
14437 :
14438 : static int
14439 603 : get_element_number (tree vec_type, tree arg)
14440 : {
14441 603 : unsigned HOST_WIDE_INT elt, max = TYPE_VECTOR_SUBPARTS (vec_type) - 1;
14442 :
14443 603 : if (!tree_fits_uhwi_p (arg)
14444 603 : || (elt = tree_to_uhwi (arg), elt > max))
14445 : {
14446 0 : error ("selector must be an integer constant in the range "
14447 : "[0, %wi]", max);
14448 0 : return 0;
14449 : }
14450 :
14451 603 : return elt;
14452 : }
14453 :
14454 : /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
14455 : ix86_expand_vector_init. We DO have language-level syntax for this, in
14456 : the form of (type){ init-list }. Except that since we can't place emms
14457 : instructions from inside the compiler, we can't allow the use of MMX
14458 : registers unless the user explicitly asks for it. So we do *not* define
14459 : vec_set/vec_extract/vec_init patterns for MMX modes in mmx.md. Instead
14460 : we have builtins invoked by mmintrin.h that gives us license to emit
14461 : these sorts of instructions. */
14462 :
14463 : static rtx
14464 229 : ix86_expand_vec_init_builtin (tree type, tree exp, rtx target)
14465 : {
14466 229 : machine_mode tmode = TYPE_MODE (type);
14467 229 : machine_mode inner_mode = GET_MODE_INNER (tmode);
14468 229 : int i, n_elt = GET_MODE_NUNITS (tmode);
14469 229 : rtvec v = rtvec_alloc (n_elt);
14470 :
14471 229 : gcc_assert (VECTOR_MODE_P (tmode));
14472 229 : gcc_assert (call_expr_nargs (exp) == n_elt);
14473 :
14474 1203 : for (i = 0; i < n_elt; ++i)
14475 : {
14476 974 : rtx x = expand_normal (CALL_EXPR_ARG (exp, i));
14477 974 : RTVEC_ELT (v, i) = gen_lowpart (inner_mode, x);
14478 : }
14479 :
14480 229 : if (!target || !register_operand (target, tmode))
14481 0 : target = gen_reg_rtx (tmode);
14482 :
14483 229 : ix86_expand_vector_init (true, target, gen_rtx_PARALLEL (tmode, v));
14484 229 : return target;
14485 : }
14486 :
14487 : /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
14488 : ix86_expand_vector_extract. They would be redundant (for non-MMX) if we
14489 : had a language-level syntax for referencing vector elements. */
14490 :
14491 : static rtx
14492 399 : ix86_expand_vec_ext_builtin (tree exp, rtx target)
14493 : {
14494 399 : machine_mode tmode, mode0;
14495 399 : tree arg0, arg1;
14496 399 : int elt;
14497 399 : rtx op0;
14498 :
14499 399 : arg0 = CALL_EXPR_ARG (exp, 0);
14500 399 : arg1 = CALL_EXPR_ARG (exp, 1);
14501 :
14502 399 : op0 = expand_normal (arg0);
14503 399 : elt = get_element_number (TREE_TYPE (arg0), arg1);
14504 :
14505 399 : tmode = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
14506 399 : mode0 = TYPE_MODE (TREE_TYPE (arg0));
14507 399 : gcc_assert (VECTOR_MODE_P (mode0));
14508 :
14509 399 : op0 = force_reg (mode0, op0);
14510 :
14511 399 : if (optimize || !target || !register_operand (target, tmode))
14512 320 : target = gen_reg_rtx (tmode);
14513 :
14514 399 : ix86_expand_vector_extract (true, target, op0, elt);
14515 :
14516 399 : return target;
14517 : }
14518 :
14519 : /* A subroutine of ix86_expand_builtin. These builtins are a wrapper around
14520 : ix86_expand_vector_set. They would be redundant (for non-MMX) if we had
14521 : a language-level syntax for referencing vector elements. */
14522 :
14523 : static rtx
14524 204 : ix86_expand_vec_set_builtin (tree exp)
14525 : {
14526 204 : machine_mode tmode, mode1;
14527 204 : tree arg0, arg1, arg2;
14528 204 : int elt;
14529 204 : rtx op0, op1, target;
14530 :
14531 204 : arg0 = CALL_EXPR_ARG (exp, 0);
14532 204 : arg1 = CALL_EXPR_ARG (exp, 1);
14533 204 : arg2 = CALL_EXPR_ARG (exp, 2);
14534 :
14535 204 : tmode = TYPE_MODE (TREE_TYPE (arg0));
14536 204 : mode1 = TYPE_MODE (TREE_TYPE (TREE_TYPE (arg0)));
14537 204 : gcc_assert (VECTOR_MODE_P (tmode));
14538 :
14539 204 : op0 = expand_expr (arg0, NULL_RTX, tmode, EXPAND_NORMAL);
14540 204 : op1 = expand_expr (arg1, NULL_RTX, mode1, EXPAND_NORMAL);
14541 204 : elt = get_element_number (TREE_TYPE (arg0), arg2);
14542 :
14543 204 : if (GET_MODE (op1) != mode1)
14544 82 : op1 = convert_modes (mode1, GET_MODE (op1), op1, true);
14545 :
14546 204 : op0 = force_reg (tmode, op0);
14547 204 : op1 = force_reg (mode1, op1);
14548 :
14549 : /* OP0 is the source of these builtin functions and shouldn't be
14550 : modified. Create a copy, use it and return it as target. */
14551 204 : target = gen_reg_rtx (tmode);
14552 204 : emit_move_insn (target, op0);
14553 204 : ix86_expand_vector_set (true, target, op1, elt);
14554 :
14555 204 : return target;
14556 : }
14557 :
/* Return true if the necessary isa options for this builtin exist,
   else false.
   fcode = DECL_MD_FUNCTION_CODE (fndecl);
   If PBISA/PBISA2 are non-NULL, also store through them the (possibly
   adjusted) ISA masks this builtin requires.  */
bool
ix86_check_builtin_isa_match (unsigned int fcode,
			      HOST_WIDE_INT* pbisa,
			      HOST_WIDE_INT* pbisa2)
{
  /* ISAs currently enabled for this compilation.  */
  HOST_WIDE_INT isa = ix86_isa_flags;
  HOST_WIDE_INT isa2 = ix86_isa_flags2;
  /* ISAs this builtin was registered as requiring.  */
  HOST_WIDE_INT bisa = ix86_builtins_isa[fcode].isa;
  HOST_WIDE_INT bisa2 = ix86_builtins_isa[fcode].isa2;
  HOST_WIDE_INT tmp_isa = isa, tmp_isa2 = isa2;
  /* The general case is we require all the ISAs specified in bisa{,2}
     to be enabled.
     The exceptions are:
     OPTION_MASK_ISA_SSE | OPTION_MASK_ISA_3DNOW_A
     OPTION_MASK_ISA_SSE4_2 | OPTION_MASK_ISA_CRC32
     OPTION_MASK_ISA_FMA | OPTION_MASK_ISA_FMA4
     (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXVNNI
     (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL) or
       OPTION_MASK_ISA2_AVXIFMA
     (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_AVX512BF16) or
       OPTION_MASK_ISA2_AVXNECONVERT
     OPTION_MASK_ISA_AES or (OPTION_MASK_ISA_AVX512VL | OPTION_MASK_ISA2_VAES)
     OPTION_MASK_ISA2_AVX10_2 or OPTION_MASK_ISA2_AVXVNNIINT8
     OPTION_MASK_ISA2_AVX10_2 or OPTION_MASK_ISA2_AVXVNNIINT16
     where for each such pair it is sufficient if either of the ISAs is
     enabled, plus if it is ored with other options also those others.
     OPTION_MASK_ISA_MMX in bisa is satisfied also if TARGET_MMX_WITH_SSE.  */

/* If this builtin requires exactly the pair of alternative ISA sets
   (A1,A2) and (B1,B2), and at least one of the two alternatives is fully
   enabled, pretend both alternatives are enabled by accumulating them
   into TMP_ISA{,2} so the final subset check below succeeds.  */
#define SHARE_BUILTIN(A1, A2, B1, B2) \
  if ((((bisa & (A1)) == (A1) && (bisa2 & (A2)) == (A2))	\
       && ((bisa & (B1)) == (B1) && (bisa2 & (B2)) == (B2)))	\
      && (((isa & (A1)) == (A1) && (isa2 & (A2)) == (A2))	\
	  || ((isa & (B1)) == (B1) && (isa2 & (B2)) == (B2))))	\
    {								\
      tmp_isa |= (A1) | (B1);					\
      tmp_isa2 |= (A2) | (B2);					\
    }

  SHARE_BUILTIN (OPTION_MASK_ISA_SSE, 0, OPTION_MASK_ISA_3DNOW_A, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_SSE4_2, 0, OPTION_MASK_ISA_CRC32, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_FMA, 0, OPTION_MASK_ISA_FMA4, 0);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VNNI | OPTION_MASK_ISA_AVX512VL, 0, 0,
		 OPTION_MASK_ISA2_AVXVNNI);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512IFMA | OPTION_MASK_ISA_AVX512VL, 0, 0,
		 OPTION_MASK_ISA2_AVXIFMA);
  SHARE_BUILTIN (OPTION_MASK_ISA_AVX512VL, OPTION_MASK_ISA2_AVX512BF16, 0,
		 OPTION_MASK_ISA2_AVXNECONVERT);
  SHARE_BUILTIN (OPTION_MASK_ISA_AES, 0, OPTION_MASK_ISA_AVX512VL,
		 OPTION_MASK_ISA2_VAES);
  SHARE_BUILTIN (0, OPTION_MASK_ISA2_AVXVNNIINT8, 0,
		 OPTION_MASK_ISA2_AVX10_2);
  SHARE_BUILTIN (0, OPTION_MASK_ISA2_AVXVNNIINT16, 0,
		 OPTION_MASK_ISA2_AVX10_2);
  isa = tmp_isa;
  isa2 = tmp_isa2;

  /* An MMX requirement (except for maskmovq, which needs real MMX
     registers) can also be satisfied by SSE2 when TARGET_MMX_WITH_SSE.  */
  if ((bisa & OPTION_MASK_ISA_MMX) && !TARGET_MMX && TARGET_MMX_WITH_SSE
      /* __builtin_ia32_maskmovq requires MMX registers.  */
      && fcode != IX86_BUILTIN_MASKMOVQ)
    {
      bisa &= ~OPTION_MASK_ISA_MMX;
      bisa |= OPTION_MASK_ISA_SSE2;
    }

  /* Report the adjusted requirements to the caller if requested.  */
  if (pbisa)
    *pbisa = bisa;
  if (pbisa2)
    *pbisa2 = bisa2;

  /* Satisfied iff every required ISA bit is among the enabled ones.  */
  return (bisa & isa) == bisa && (bisa2 & isa2) == bisa2;
}
14633 :
14634 : /* Emit instructions to set the carry flag from ARG. */
14635 :
14636 : void
14637 13074 : ix86_expand_carry (rtx arg)
14638 : {
14639 13074 : if (!CONST_INT_P (arg) || arg == const0_rtx)
14640 : {
14641 13068 : arg = convert_to_mode (QImode, arg, 1);
14642 13068 : arg = copy_to_mode_reg (QImode, arg);
14643 13068 : emit_insn (gen_addqi3_cconly_overflow (arg, constm1_rtx));
14644 : }
14645 : else
14646 6 : emit_insn (gen_x86_stc ());
14647 13074 : }
14648 :
14649 : /* Expand an expression EXP that calls a built-in function,
14650 : with result going to TARGET if that's convenient
14651 : (and in mode MODE if that's convenient).
14652 : SUBTARGET may be used as the target for computing one of EXP's operands.
14653 : IGNORE is nonzero if the value is to be ignored. */
14654 :
14655 : rtx
14656 172368 : ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
14657 : machine_mode mode, int ignore)
14658 : {
14659 172368 : size_t i;
14660 172368 : enum insn_code icode, icode2;
14661 172368 : tree fndecl = TREE_OPERAND (CALL_EXPR_FN (exp), 0);
14662 172368 : tree arg0, arg1, arg2, arg3, arg4;
14663 172368 : rtx op0, op1, op2, op3, op4, pat, pat2, insn;
14664 172368 : machine_mode mode0, mode1, mode2, mode3, mode4;
14665 172368 : unsigned int fcode = DECL_MD_FUNCTION_CODE (fndecl);
14666 172368 : HOST_WIDE_INT bisa, bisa2;
14667 :
14668 : /* For CPU builtins that can be folded, fold first and expand the fold. */
14669 172368 : switch (fcode)
14670 : {
14671 195 : case IX86_BUILTIN_CPU_INIT:
14672 195 : {
14673 : /* Make it call __cpu_indicator_init in libgcc. */
14674 195 : tree call_expr, fndecl, type;
14675 195 : type = build_function_type_list (integer_type_node, NULL_TREE);
14676 195 : fndecl = build_fn_decl ("__cpu_indicator_init", type);
14677 195 : call_expr = build_call_expr (fndecl, 0);
14678 195 : return expand_expr (call_expr, target, mode, EXPAND_NORMAL);
14679 : }
14680 584 : case IX86_BUILTIN_CPU_IS:
14681 584 : case IX86_BUILTIN_CPU_SUPPORTS:
14682 584 : {
14683 584 : tree arg0 = CALL_EXPR_ARG (exp, 0);
14684 584 : tree fold_expr = fold_builtin_cpu (fndecl, &arg0);
14685 584 : gcc_assert (fold_expr != NULL_TREE);
14686 584 : return expand_expr (fold_expr, target, mode, EXPAND_NORMAL);
14687 : }
14688 : }
14689 :
14690 171589 : if (!ix86_check_builtin_isa_match (fcode, &bisa, &bisa2))
14691 : {
14692 23 : bool add_abi_p = bisa & OPTION_MASK_ISA_64BIT;
14693 23 : if (TARGET_ABI_X32)
14694 0 : bisa |= OPTION_MASK_ABI_X32;
14695 : else
14696 23 : bisa |= OPTION_MASK_ABI_64;
14697 23 : char *opts = ix86_target_string (bisa, bisa2, 0, 0, NULL, NULL,
14698 : (enum fpmath_unit) 0,
14699 : (enum prefer_vector_width) 0,
14700 : PVW_NONE, false, add_abi_p);
14701 23 : if (!opts)
14702 0 : error ("%qE needs unknown isa option", fndecl);
14703 : else
14704 : {
14705 23 : gcc_assert (opts != NULL);
14706 23 : error ("%qE needs isa option %s", fndecl, opts);
14707 23 : free (opts);
14708 : }
14709 23 : return expand_call (exp, target, ignore);
14710 : }
14711 :
14712 171566 : switch (fcode)
14713 : {
14714 35 : case IX86_BUILTIN_MASKMOVQ:
14715 35 : case IX86_BUILTIN_MASKMOVDQU:
14716 34 : icode = (fcode == IX86_BUILTIN_MASKMOVQ
14717 35 : ? CODE_FOR_mmx_maskmovq
14718 : : CODE_FOR_sse2_maskmovdqu);
14719 : /* Note the arg order is different from the operand order. */
14720 35 : arg1 = CALL_EXPR_ARG (exp, 0);
14721 35 : arg2 = CALL_EXPR_ARG (exp, 1);
14722 35 : arg0 = CALL_EXPR_ARG (exp, 2);
14723 35 : op0 = expand_normal (arg0);
14724 35 : op1 = expand_normal (arg1);
14725 35 : op2 = expand_normal (arg2);
14726 35 : mode0 = insn_data[icode].operand[0].mode;
14727 35 : mode1 = insn_data[icode].operand[1].mode;
14728 35 : mode2 = insn_data[icode].operand[2].mode;
14729 :
14730 35 : op0 = ix86_zero_extend_to_Pmode (op0);
14731 35 : op0 = gen_rtx_MEM (mode1, op0);
14732 :
14733 35 : if (!insn_data[icode].operand[0].predicate (op0, mode0))
14734 0 : op0 = copy_to_mode_reg (mode0, op0);
14735 35 : if (!insn_data[icode].operand[1].predicate (op1, mode1))
14736 2 : op1 = copy_to_mode_reg (mode1, op1);
14737 35 : if (!insn_data[icode].operand[2].predicate (op2, mode2))
14738 2 : op2 = copy_to_mode_reg (mode2, op2);
14739 35 : pat = GEN_FCN (icode) (op0, op1, op2);
14740 35 : if (! pat)
14741 56617 : return 0;
14742 35 : emit_insn (pat);
14743 35 : return 0;
14744 :
14745 22008 : case IX86_BUILTIN_LDMXCSR:
14746 22008 : op0 = expand_normal (CALL_EXPR_ARG (exp, 0));
14747 22008 : target = assign_stack_temp (SImode, GET_MODE_SIZE (SImode));
14748 22008 : emit_move_insn (target, op0);
14749 22008 : emit_insn (gen_sse_ldmxcsr (target));
14750 22008 : return 0;
14751 :
14752 14785 : case IX86_BUILTIN_STMXCSR:
14753 14785 : target = assign_stack_temp (SImode, GET_MODE_SIZE (SImode));
14754 14785 : emit_insn (gen_sse_stmxcsr (target));
14755 14785 : return copy_to_mode_reg (SImode, target);
14756 :
14757 11 : case IX86_BUILTIN_CLFLUSH:
14758 11 : arg0 = CALL_EXPR_ARG (exp, 0);
14759 11 : op0 = expand_normal (arg0);
14760 11 : icode = CODE_FOR_sse2_clflush;
14761 11 : if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14762 5 : op0 = ix86_zero_extend_to_Pmode (op0);
14763 :
14764 11 : emit_insn (gen_sse2_clflush (op0));
14765 11 : return 0;
14766 :
14767 19 : case IX86_BUILTIN_CLWB:
14768 19 : arg0 = CALL_EXPR_ARG (exp, 0);
14769 19 : op0 = expand_normal (arg0);
14770 19 : icode = CODE_FOR_clwb;
14771 19 : if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14772 9 : op0 = ix86_zero_extend_to_Pmode (op0);
14773 :
14774 19 : emit_insn (gen_clwb (op0));
14775 19 : return 0;
14776 :
14777 19 : case IX86_BUILTIN_CLFLUSHOPT:
14778 19 : arg0 = CALL_EXPR_ARG (exp, 0);
14779 19 : op0 = expand_normal (arg0);
14780 19 : icode = CODE_FOR_clflushopt;
14781 19 : if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14782 9 : op0 = ix86_zero_extend_to_Pmode (op0);
14783 :
14784 19 : emit_insn (gen_clflushopt (op0));
14785 19 : return 0;
14786 :
14787 47 : case IX86_BUILTIN_MONITOR:
14788 47 : case IX86_BUILTIN_MONITORX:
14789 47 : arg0 = CALL_EXPR_ARG (exp, 0);
14790 47 : arg1 = CALL_EXPR_ARG (exp, 1);
14791 47 : arg2 = CALL_EXPR_ARG (exp, 2);
14792 47 : op0 = expand_normal (arg0);
14793 47 : op1 = expand_normal (arg1);
14794 47 : op2 = expand_normal (arg2);
14795 47 : if (!REG_P (op0))
14796 19 : op0 = ix86_zero_extend_to_Pmode (op0);
14797 47 : if (!REG_P (op1))
14798 22 : op1 = copy_to_mode_reg (SImode, op1);
14799 47 : if (!REG_P (op2))
14800 25 : op2 = copy_to_mode_reg (SImode, op2);
14801 :
14802 47 : emit_insn (fcode == IX86_BUILTIN_MONITOR
14803 26 : ? gen_sse3_monitor (Pmode, op0, op1, op2)
14804 21 : : gen_monitorx (Pmode, op0, op1, op2));
14805 47 : return 0;
14806 :
14807 25 : case IX86_BUILTIN_MWAIT:
14808 25 : arg0 = CALL_EXPR_ARG (exp, 0);
14809 25 : arg1 = CALL_EXPR_ARG (exp, 1);
14810 25 : op0 = expand_normal (arg0);
14811 25 : op1 = expand_normal (arg1);
14812 25 : if (!REG_P (op0))
14813 13 : op0 = copy_to_mode_reg (SImode, op0);
14814 25 : if (!REG_P (op1))
14815 11 : op1 = copy_to_mode_reg (SImode, op1);
14816 25 : emit_insn (gen_sse3_mwait (op0, op1));
14817 25 : return 0;
14818 :
14819 21 : case IX86_BUILTIN_MWAITX:
14820 21 : arg0 = CALL_EXPR_ARG (exp, 0);
14821 21 : arg1 = CALL_EXPR_ARG (exp, 1);
14822 21 : arg2 = CALL_EXPR_ARG (exp, 2);
14823 21 : op0 = expand_normal (arg0);
14824 21 : op1 = expand_normal (arg1);
14825 21 : op2 = expand_normal (arg2);
14826 21 : if (!REG_P (op0))
14827 11 : op0 = copy_to_mode_reg (SImode, op0);
14828 21 : if (!REG_P (op1))
14829 10 : op1 = copy_to_mode_reg (SImode, op1);
14830 21 : if (!REG_P (op2))
14831 11 : op2 = copy_to_mode_reg (SImode, op2);
14832 21 : emit_insn (gen_mwaitx (op0, op1, op2));
14833 21 : return 0;
14834 :
14835 21 : case IX86_BUILTIN_UMONITOR:
14836 21 : arg0 = CALL_EXPR_ARG (exp, 0);
14837 21 : op0 = expand_normal (arg0);
14838 :
14839 21 : op0 = ix86_zero_extend_to_Pmode (op0);
14840 21 : emit_insn (gen_umonitor (Pmode, op0));
14841 21 : return 0;
14842 :
14843 42 : case IX86_BUILTIN_UMWAIT:
14844 42 : case IX86_BUILTIN_TPAUSE:
14845 42 : arg0 = CALL_EXPR_ARG (exp, 0);
14846 42 : arg1 = CALL_EXPR_ARG (exp, 1);
14847 42 : op0 = expand_normal (arg0);
14848 42 : op1 = expand_normal (arg1);
14849 :
14850 42 : if (!REG_P (op0))
14851 20 : op0 = copy_to_mode_reg (SImode, op0);
14852 :
14853 42 : op1 = force_reg (DImode, op1);
14854 :
14855 42 : if (TARGET_64BIT)
14856 : {
14857 42 : op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
14858 : NULL, 1, OPTAB_DIRECT);
14859 42 : switch (fcode)
14860 : {
14861 : case IX86_BUILTIN_UMWAIT:
14862 : icode = CODE_FOR_umwait_rex64;
14863 : break;
14864 21 : case IX86_BUILTIN_TPAUSE:
14865 21 : icode = CODE_FOR_tpause_rex64;
14866 21 : break;
14867 0 : default:
14868 0 : gcc_unreachable ();
14869 : }
14870 :
14871 42 : op2 = gen_lowpart (SImode, op2);
14872 42 : op1 = gen_lowpart (SImode, op1);
14873 42 : pat = GEN_FCN (icode) (op0, op1, op2);
14874 : }
14875 : else
14876 : {
14877 0 : switch (fcode)
14878 : {
14879 : case IX86_BUILTIN_UMWAIT:
14880 : icode = CODE_FOR_umwait;
14881 : break;
14882 0 : case IX86_BUILTIN_TPAUSE:
14883 0 : icode = CODE_FOR_tpause;
14884 0 : break;
14885 0 : default:
14886 0 : gcc_unreachable ();
14887 : }
14888 0 : pat = GEN_FCN (icode) (op0, op1);
14889 : }
14890 :
14891 42 : if (!pat)
14892 : return 0;
14893 :
14894 42 : emit_insn (pat);
14895 :
14896 42 : if (target == 0
14897 42 : || !register_operand (target, QImode))
14898 0 : target = gen_reg_rtx (QImode);
14899 :
14900 42 : pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
14901 : const0_rtx);
14902 42 : emit_insn (gen_rtx_SET (target, pat));
14903 :
14904 42 : return target;
14905 :
14906 20 : case IX86_BUILTIN_TESTUI:
14907 20 : emit_insn (gen_testui ());
14908 :
14909 20 : if (target == 0
14910 20 : || !register_operand (target, QImode))
14911 0 : target = gen_reg_rtx (QImode);
14912 :
14913 20 : pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
14914 : const0_rtx);
14915 20 : emit_insn (gen_rtx_SET (target, pat));
14916 :
14917 20 : return target;
14918 :
14919 19 : case IX86_BUILTIN_CLZERO:
14920 19 : arg0 = CALL_EXPR_ARG (exp, 0);
14921 19 : op0 = expand_normal (arg0);
14922 19 : if (!REG_P (op0))
14923 9 : op0 = ix86_zero_extend_to_Pmode (op0);
14924 19 : emit_insn (gen_clzero (Pmode, op0));
14925 19 : return 0;
14926 :
14927 19 : case IX86_BUILTIN_CLDEMOTE:
14928 19 : arg0 = CALL_EXPR_ARG (exp, 0);
14929 19 : op0 = expand_normal (arg0);
14930 19 : icode = CODE_FOR_cldemote;
14931 19 : if (!insn_data[icode].operand[0].predicate (op0, Pmode))
14932 9 : op0 = ix86_zero_extend_to_Pmode (op0);
14933 :
14934 19 : emit_insn (gen_cldemote (op0));
14935 19 : return 0;
14936 :
14937 11 : case IX86_BUILTIN_LOADIWKEY:
14938 11 : {
14939 11 : arg0 = CALL_EXPR_ARG (exp, 0);
14940 11 : arg1 = CALL_EXPR_ARG (exp, 1);
14941 11 : arg2 = CALL_EXPR_ARG (exp, 2);
14942 11 : arg3 = CALL_EXPR_ARG (exp, 3);
14943 :
14944 11 : op0 = expand_normal (arg0);
14945 11 : op1 = expand_normal (arg1);
14946 11 : op2 = expand_normal (arg2);
14947 11 : op3 = expand_normal (arg3);
14948 :
14949 11 : if (!REG_P (op0))
14950 5 : op0 = copy_to_mode_reg (V2DImode, op0);
14951 11 : if (!REG_P (op1))
14952 5 : op1 = copy_to_mode_reg (V2DImode, op1);
14953 11 : if (!REG_P (op2))
14954 5 : op2 = copy_to_mode_reg (V2DImode, op2);
14955 11 : if (!REG_P (op3))
14956 5 : op3 = copy_to_mode_reg (SImode, op3);
14957 :
14958 11 : emit_insn (gen_loadiwkey (op0, op1, op2, op3));
14959 :
14960 11 : return 0;
14961 : }
14962 :
14963 12 : case IX86_BUILTIN_AESDEC128KLU8:
14964 12 : icode = CODE_FOR_aesdec128klu8;
14965 12 : goto aesdecenc_expand;
14966 :
14967 12 : case IX86_BUILTIN_AESDEC256KLU8:
14968 12 : icode = CODE_FOR_aesdec256klu8;
14969 12 : goto aesdecenc_expand;
14970 :
14971 12 : case IX86_BUILTIN_AESENC128KLU8:
14972 12 : icode = CODE_FOR_aesenc128klu8;
14973 12 : goto aesdecenc_expand;
14974 :
14975 : case IX86_BUILTIN_AESENC256KLU8:
14976 : icode = CODE_FOR_aesenc256klu8;
14977 :
14978 48 : aesdecenc_expand:
14979 :
14980 48 : arg0 = CALL_EXPR_ARG (exp, 0); // __m128i *odata
14981 48 : arg1 = CALL_EXPR_ARG (exp, 1); // __m128i idata
14982 48 : arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
14983 :
14984 48 : op0 = expand_normal (arg0);
14985 48 : op1 = expand_normal (arg1);
14986 48 : op2 = expand_normal (arg2);
14987 :
14988 48 : if (!address_operand (op0, V2DImode))
14989 : {
14990 16 : op0 = convert_memory_address (Pmode, op0);
14991 16 : op0 = copy_addr_to_reg (op0);
14992 : }
14993 48 : op0 = gen_rtx_MEM (V2DImode, op0);
14994 :
14995 48 : if (!REG_P (op1))
14996 20 : op1 = copy_to_mode_reg (V2DImode, op1);
14997 :
14998 48 : if (!address_operand (op2, VOIDmode))
14999 : {
15000 16 : op2 = convert_memory_address (Pmode, op2);
15001 16 : op2 = copy_addr_to_reg (op2);
15002 : }
15003 48 : op2 = gen_rtx_MEM (BLKmode, op2);
15004 :
15005 48 : emit_insn (GEN_FCN (icode) (op1, op1, op2));
15006 :
15007 48 : if (target == 0)
15008 4 : target = gen_reg_rtx (QImode);
15009 :
15010 : /* NB: For aesenc/aesdec keylocker insn, ZF will be set when runtime
15011 : error occurs. Then the output should be cleared for safety. */
15012 48 : rtx_code_label *ok_label;
15013 48 : rtx tmp;
15014 :
15015 48 : tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
15016 48 : pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
15017 48 : ok_label = gen_label_rtx ();
15018 48 : emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
15019 : true, ok_label);
15020 : /* Usually the runtime error seldom occur, so predict OK path as
15021 : hotspot to optimize it as fallthrough block. */
15022 48 : predict_jump (REG_BR_PROB_BASE * 90 / 100);
15023 :
15024 48 : emit_insn (gen_rtx_SET (op1, const0_rtx));
15025 :
15026 48 : emit_label (ok_label);
15027 48 : emit_insn (gen_rtx_SET (target, pat));
15028 48 : emit_insn (gen_rtx_SET (op0, op1));
15029 :
15030 48 : return target;
15031 :
15032 11 : case IX86_BUILTIN_AESDECWIDE128KLU8:
15033 11 : icode = CODE_FOR_aesdecwide128klu8;
15034 11 : goto wideaesdecenc_expand;
15035 :
15036 11 : case IX86_BUILTIN_AESDECWIDE256KLU8:
15037 11 : icode = CODE_FOR_aesdecwide256klu8;
15038 11 : goto wideaesdecenc_expand;
15039 :
15040 11 : case IX86_BUILTIN_AESENCWIDE128KLU8:
15041 11 : icode = CODE_FOR_aesencwide128klu8;
15042 11 : goto wideaesdecenc_expand;
15043 :
15044 : case IX86_BUILTIN_AESENCWIDE256KLU8:
15045 : icode = CODE_FOR_aesencwide256klu8;
15046 :
15047 44 : wideaesdecenc_expand:
15048 :
15049 44 : rtx xmm_regs[8];
15050 44 : rtx op;
15051 :
15052 44 : arg0 = CALL_EXPR_ARG (exp, 0); // __m128i * odata
15053 44 : arg1 = CALL_EXPR_ARG (exp, 1); // const __m128i * idata
15054 44 : arg2 = CALL_EXPR_ARG (exp, 2); // const void *p
15055 :
15056 44 : op0 = expand_normal (arg0);
15057 44 : op1 = expand_normal (arg1);
15058 44 : op2 = expand_normal (arg2);
15059 :
15060 44 : if (GET_MODE (op1) != Pmode)
15061 0 : op1 = convert_to_mode (Pmode, op1, 1);
15062 :
15063 44 : if (!address_operand (op2, VOIDmode))
15064 : {
15065 16 : op2 = convert_memory_address (Pmode, op2);
15066 16 : op2 = copy_addr_to_reg (op2);
15067 : }
15068 44 : op2 = gen_rtx_MEM (BLKmode, op2);
15069 :
15070 440 : for (i = 0; i < 8; i++)
15071 : {
15072 352 : xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
15073 :
15074 352 : op = gen_rtx_MEM (V2DImode,
15075 352 : plus_constant (Pmode, op1, (i * 16)));
15076 :
15077 352 : emit_move_insn (xmm_regs[i], op);
15078 : }
15079 :
15080 44 : emit_insn (GEN_FCN (icode) (op2));
15081 :
15082 44 : if (target == 0)
15083 0 : target = gen_reg_rtx (QImode);
15084 :
15085 44 : tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
15086 44 : pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
15087 44 : ok_label = gen_label_rtx ();
15088 44 : emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
15089 : true, ok_label);
15090 44 : predict_jump (REG_BR_PROB_BASE * 90 / 100);
15091 :
15092 440 : for (i = 0; i < 8; i++)
15093 352 : emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
15094 :
15095 44 : emit_label (ok_label);
15096 44 : emit_insn (gen_rtx_SET (target, pat));
15097 :
15098 44 : if (GET_MODE (op0) != Pmode)
15099 0 : op0 = convert_to_mode (Pmode, op0, 1);
15100 :
15101 396 : for (i = 0; i < 8; i++)
15102 : {
15103 352 : op = gen_rtx_MEM (V2DImode,
15104 352 : plus_constant (Pmode, op0, (i * 16)));
15105 352 : emit_move_insn (op, xmm_regs[i]);
15106 : }
15107 :
15108 : return target;
15109 :
15110 13 : case IX86_BUILTIN_ENCODEKEY128U32:
15111 13 : {
15112 13 : rtx op, xmm_regs[7];
15113 :
15114 13 : arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
15115 13 : arg1 = CALL_EXPR_ARG (exp, 1); // __m128i key
15116 13 : arg2 = CALL_EXPR_ARG (exp, 2); // void *h
15117 :
15118 13 : op0 = expand_normal (arg0);
15119 13 : op1 = expand_normal (arg1);
15120 13 : op2 = expand_normal (arg2);
15121 :
15122 13 : if (!REG_P (op0))
15123 7 : op0 = copy_to_mode_reg (SImode, op0);
15124 :
15125 13 : if (GET_MODE (op2) != Pmode)
15126 1 : op2 = convert_to_mode (Pmode, op2, 1);
15127 :
15128 13 : op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
15129 13 : emit_move_insn (op, op1);
15130 :
15131 65 : for (i = 0; i < 3; i++)
15132 39 : xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
15133 :
15134 13 : if (target == 0 || !register_operand (target, SImode))
15135 2 : target = gen_reg_rtx (SImode);
15136 :
15137 13 : emit_insn (gen_encodekey128u32 (target, op0));
15138 :
15139 65 : for (i = 0; i < 3; i++)
15140 : {
15141 39 : op = gen_rtx_MEM (V2DImode,
15142 39 : plus_constant (Pmode, op2, (i * 16)));
15143 39 : emit_move_insn (op, xmm_regs[i]);
15144 : }
15145 :
15146 13 : return target;
15147 : }
15148 13 : case IX86_BUILTIN_ENCODEKEY256U32:
15149 13 : {
15150 13 : rtx op, xmm_regs[7];
15151 :
15152 13 : arg0 = CALL_EXPR_ARG (exp, 0); // unsigned int htype
15153 13 : arg1 = CALL_EXPR_ARG (exp, 1); // __m128i keylow
15154 13 : arg2 = CALL_EXPR_ARG (exp, 2); // __m128i keyhi
15155 13 : arg3 = CALL_EXPR_ARG (exp, 3); // void *h
15156 :
15157 13 : op0 = expand_normal (arg0);
15158 13 : op1 = expand_normal (arg1);
15159 13 : op2 = expand_normal (arg2);
15160 13 : op3 = expand_normal (arg3);
15161 :
15162 13 : if (!REG_P (op0))
15163 7 : op0 = copy_to_mode_reg (SImode, op0);
15164 :
15165 13 : if (GET_MODE (op3) != Pmode)
15166 1 : op3 = convert_to_mode (Pmode, op3, 1);
15167 :
15168 : /* Force to use xmm0, xmm1 for keylow, keyhi*/
15169 13 : op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (0));
15170 13 : emit_move_insn (op, op1);
15171 13 : op = gen_rtx_REG (V2DImode, GET_SSE_REGNO (1));
15172 13 : emit_move_insn (op, op2);
15173 :
15174 78 : for (i = 0; i < 4; i++)
15175 52 : xmm_regs[i] = gen_rtx_REG (V2DImode, GET_SSE_REGNO (i));
15176 :
15177 13 : if (target == 0 || !register_operand (target, SImode))
15178 2 : target = gen_reg_rtx (SImode);
15179 :
15180 13 : emit_insn (gen_encodekey256u32 (target, op0));
15181 :
15182 78 : for (i = 0; i < 4; i++)
15183 : {
15184 52 : op = gen_rtx_MEM (V2DImode,
15185 52 : plus_constant (Pmode, op3, (i * 16)));
15186 52 : emit_move_insn (op, xmm_regs[i]);
15187 : }
15188 :
15189 13 : return target;
15190 : }
15191 :
15192 48 : case IX86_BUILTIN_PREFETCH:
15193 48 : {
15194 48 : arg0 = CALL_EXPR_ARG (exp, 0); // const void *
15195 48 : arg1 = CALL_EXPR_ARG (exp, 1); // const int
15196 48 : arg2 = CALL_EXPR_ARG (exp, 2); // const int
15197 48 : arg3 = CALL_EXPR_ARG (exp, 3); // const int
15198 :
15199 48 : op0 = expand_normal (arg0);
15200 48 : op1 = expand_normal (arg1);
15201 48 : op2 = expand_normal (arg2);
15202 48 : op3 = expand_normal (arg3);
15203 :
15204 48 : if (!CONST_INT_P (op1) || !CONST_INT_P (op2) || !CONST_INT_P (op3))
15205 : {
15206 0 : error ("second, third and fourth argument must be a const");
15207 0 : return const0_rtx;
15208 : }
15209 :
15210 48 : if (!IN_RANGE (INTVAL (op1), 0, 2))
15211 : {
15212 1 : warning (0, "invalid second argument to"
15213 : " %<__builtin_ia32_prefetch%>; using zero");
15214 1 : op1 = const0_rtx;
15215 : }
15216 :
15217 48 : if (INTVAL (op3) == 1)
15218 : {
15219 4 : if (!IN_RANGE (INTVAL (op2), 2, 3))
15220 : {
15221 1 : error ("invalid third argument");
15222 1 : return const0_rtx;
15223 : }
15224 :
15225 3 : if (TARGET_64BIT && TARGET_PREFETCHI
15226 6 : && local_func_symbolic_operand (op0, GET_MODE (op0)))
15227 2 : emit_insn (gen_prefetchi (op0, op2));
15228 : else
15229 : {
15230 1 : warning (0, "instruction prefetch applies when in 64-bit mode"
15231 : " with RIP-relative addressing and"
15232 : " option %<-mprefetchi%>;"
15233 : " they stay NOPs otherwise");
15234 1 : emit_insn (gen_nop ());
15235 : }
15236 : }
15237 : else
15238 : {
15239 44 : if (INTVAL (op3) != 0)
15240 1 : warning (0, "invalid forth argument to"
15241 : " %<__builtin_ia32_prefetch%>; using zero");
15242 :
15243 44 : if (!address_operand (op0, VOIDmode))
15244 : {
15245 10 : op0 = convert_memory_address (Pmode, op0);
15246 10 : op0 = copy_addr_to_reg (op0);
15247 : }
15248 :
15249 44 : if (!IN_RANGE (INTVAL (op2), 0, 3))
15250 : {
15251 1 : warning (0, "invalid third argument to %<__builtin_ia32_prefetch%>; using zero");
15252 1 : op2 = const0_rtx;
15253 : }
15254 :
15255 44 : if (TARGET_3DNOW
15256 26 : || TARGET_PREFETCH_SSE
15257 0 : || TARGET_PRFCHW
15258 0 : || TARGET_MOVRS)
15259 44 : emit_insn (gen_prefetch (op0, op1, op2));
15260 0 : else if (!MEM_P (op0) && side_effects_p (op0))
15261 : /* Don't do anything with direct references to volatile memory,
15262 : but generate code to handle other side effects. */
15263 0 : emit_insn (op0);
15264 : }
15265 :
15266 : return 0;
15267 : }
15268 :
15269 21 : case IX86_BUILTIN_PREFETCHI:
15270 21 : {
15271 21 : arg0 = CALL_EXPR_ARG (exp, 0); // const void *
15272 21 : arg1 = CALL_EXPR_ARG (exp, 1); // const int
15273 :
15274 21 : op0 = expand_normal (arg0);
15275 21 : op1 = expand_normal (arg1);
15276 :
15277 21 : if (!CONST_INT_P (op1))
15278 : {
15279 0 : error ("second argument must be a const");
15280 0 : return const0_rtx;
15281 : }
15282 :
15283 : /* GOT/PLT_PIC should not be available for instruction prefetch.
15284 : It must be real instruction address. */
15285 21 : if (TARGET_64BIT
15286 21 : && local_func_symbolic_operand (op0, GET_MODE (op0)))
15287 4 : emit_insn (gen_prefetchi (op0, op1));
15288 : else
15289 : {
15290 : /* Ignore the hint. */
15291 17 : warning (0, "instruction prefetch applies when in 64-bit mode"
15292 : " with RIP-relative addressing and"
15293 : " option %<-mprefetchi%>;"
15294 : " they stay NOPs otherwise");
15295 17 : emit_insn (gen_nop ());
15296 : }
15297 :
15298 : return 0;
15299 : }
15300 :
15301 53 : case IX86_BUILTIN_URDMSR:
15302 53 : case IX86_BUILTIN_UWRMSR:
15303 53 : {
15304 53 : arg0 = CALL_EXPR_ARG (exp, 0);
15305 53 : op0 = expand_normal (arg0);
15306 :
15307 53 : if (CONST_INT_P (op0))
15308 : {
15309 12 : unsigned HOST_WIDE_INT val = UINTVAL (op0);
15310 12 : if (val > 0xffffffff)
15311 2 : op0 = force_reg (DImode, op0);
15312 : }
15313 : else
15314 41 : op0 = force_reg (DImode, op0);
15315 :
15316 53 : if (fcode == IX86_BUILTIN_UWRMSR)
15317 : {
15318 26 : arg1 = CALL_EXPR_ARG (exp, 1);
15319 26 : op1 = expand_normal (arg1);
15320 26 : op1 = force_reg (DImode, op1);
15321 26 : icode = CODE_FOR_uwrmsr;
15322 26 : target = 0;
15323 : }
15324 : else
15325 : {
15326 27 : if (target == 0 || !register_operand (target, DImode))
15327 1 : target = gen_reg_rtx (DImode);
15328 : icode = CODE_FOR_urdmsr;
15329 : op1 = op0;
15330 : op0 = target;
15331 : }
15332 53 : emit_insn (GEN_FCN (icode) (op0, op1));
15333 53 : return target;
15334 : }
15335 :
15336 229 : case IX86_BUILTIN_VEC_INIT_V2SI:
15337 229 : case IX86_BUILTIN_VEC_INIT_V4HI:
15338 229 : case IX86_BUILTIN_VEC_INIT_V8QI:
15339 229 : return ix86_expand_vec_init_builtin (TREE_TYPE (exp), exp, target);
15340 :
15341 399 : case IX86_BUILTIN_VEC_EXT_V2DF:
15342 399 : case IX86_BUILTIN_VEC_EXT_V2DI:
15343 399 : case IX86_BUILTIN_VEC_EXT_V4SF:
15344 399 : case IX86_BUILTIN_VEC_EXT_V4SI:
15345 399 : case IX86_BUILTIN_VEC_EXT_V8HI:
15346 399 : case IX86_BUILTIN_VEC_EXT_V2SI:
15347 399 : case IX86_BUILTIN_VEC_EXT_V4HI:
15348 399 : case IX86_BUILTIN_VEC_EXT_V16QI:
15349 399 : return ix86_expand_vec_ext_builtin (exp, target);
15350 :
15351 204 : case IX86_BUILTIN_VEC_SET_V2DI:
15352 204 : case IX86_BUILTIN_VEC_SET_V4SF:
15353 204 : case IX86_BUILTIN_VEC_SET_V4SI:
15354 204 : case IX86_BUILTIN_VEC_SET_V8HI:
15355 204 : case IX86_BUILTIN_VEC_SET_V4HI:
15356 204 : case IX86_BUILTIN_VEC_SET_V16QI:
15357 204 : return ix86_expand_vec_set_builtin (exp);
15358 :
15359 0 : case IX86_BUILTIN_NANQ:
15360 0 : case IX86_BUILTIN_NANSQ:
15361 0 : return expand_call (exp, target, ignore);
15362 :
15363 18 : case IX86_BUILTIN_RDPID:
15364 :
15365 18 : op0 = gen_reg_rtx (word_mode);
15366 :
15367 18 : if (TARGET_64BIT)
15368 : {
15369 18 : insn = gen_rdpid_rex64 (op0);
15370 18 : op0 = convert_to_mode (SImode, op0, 1);
15371 : }
15372 : else
15373 0 : insn = gen_rdpid (op0);
15374 :
15375 18 : emit_insn (insn);
15376 :
15377 18 : if (target == 0
15378 18 : || !register_operand (target, SImode))
15379 0 : target = gen_reg_rtx (SImode);
15380 :
15381 18 : emit_move_insn (target, op0);
15382 18 : return target;
15383 :
15384 75 : case IX86_BUILTIN_2INTERSECTD512:
15385 75 : case IX86_BUILTIN_2INTERSECTQ512:
15386 75 : case IX86_BUILTIN_2INTERSECTD256:
15387 75 : case IX86_BUILTIN_2INTERSECTQ256:
15388 75 : case IX86_BUILTIN_2INTERSECTD128:
15389 75 : case IX86_BUILTIN_2INTERSECTQ128:
15390 75 : arg0 = CALL_EXPR_ARG (exp, 0);
15391 75 : arg1 = CALL_EXPR_ARG (exp, 1);
15392 75 : arg2 = CALL_EXPR_ARG (exp, 2);
15393 75 : arg3 = CALL_EXPR_ARG (exp, 3);
15394 75 : op0 = expand_normal (arg0);
15395 75 : op1 = expand_normal (arg1);
15396 75 : op2 = expand_normal (arg2);
15397 75 : op3 = expand_normal (arg3);
15398 :
15399 75 : if (!address_operand (op0, VOIDmode))
15400 : {
15401 25 : op0 = convert_memory_address (Pmode, op0);
15402 25 : op0 = copy_addr_to_reg (op0);
15403 : }
15404 75 : if (!address_operand (op1, VOIDmode))
15405 : {
15406 25 : op1 = convert_memory_address (Pmode, op1);
15407 25 : op1 = copy_addr_to_reg (op1);
15408 : }
15409 :
15410 75 : switch (fcode)
15411 : {
15412 : case IX86_BUILTIN_2INTERSECTD512:
15413 : mode4 = P2HImode;
15414 : icode = CODE_FOR_avx512vp2intersect_2intersectv16si;
15415 : break;
15416 : case IX86_BUILTIN_2INTERSECTQ512:
15417 : mode4 = P2QImode;
15418 : icode = CODE_FOR_avx512vp2intersect_2intersectv8di;
15419 : break;
15420 : case IX86_BUILTIN_2INTERSECTD256:
15421 : mode4 = P2QImode;
15422 : icode = CODE_FOR_avx512vp2intersect_2intersectv8si;
15423 : break;
15424 : case IX86_BUILTIN_2INTERSECTQ256:
15425 : mode4 = P2QImode;
15426 : icode = CODE_FOR_avx512vp2intersect_2intersectv4di;
15427 : break;
15428 : case IX86_BUILTIN_2INTERSECTD128:
15429 : mode4 = P2QImode;
15430 : icode = CODE_FOR_avx512vp2intersect_2intersectv4si;
15431 : break;
15432 : case IX86_BUILTIN_2INTERSECTQ128:
15433 : mode4 = P2QImode;
15434 : icode = CODE_FOR_avx512vp2intersect_2intersectv2di;
15435 : break;
15436 0 : default:
15437 0 : gcc_unreachable ();
15438 : }
15439 :
15440 75 : mode2 = insn_data[icode].operand[1].mode;
15441 75 : mode3 = insn_data[icode].operand[2].mode;
15442 75 : if (!insn_data[icode].operand[1].predicate (op2, mode2))
15443 25 : op2 = copy_to_mode_reg (mode2, op2);
15444 75 : if (!insn_data[icode].operand[2].predicate (op3, mode3))
15445 6 : op3 = copy_to_mode_reg (mode3, op3);
15446 :
15447 75 : op4 = gen_reg_rtx (mode4);
15448 75 : emit_insn (GEN_FCN (icode) (op4, op2, op3));
15449 75 : mode0 = mode4 == P2HImode ? HImode : QImode;
15450 75 : emit_move_insn (gen_rtx_MEM (mode0, op0),
15451 75 : gen_lowpart (mode0, op4));
15452 75 : emit_move_insn (gen_rtx_MEM (mode0, op1),
15453 : gen_highpart (mode0, op4));
15454 :
15455 75 : return 0;
15456 :
15457 102 : case IX86_BUILTIN_RDPMC:
15458 102 : case IX86_BUILTIN_RDTSC:
15459 102 : case IX86_BUILTIN_RDTSCP:
15460 102 : case IX86_BUILTIN_XGETBV:
15461 :
15462 102 : op0 = gen_reg_rtx (DImode);
15463 102 : op1 = gen_reg_rtx (DImode);
15464 :
15465 102 : if (fcode == IX86_BUILTIN_RDPMC)
15466 : {
15467 22 : arg0 = CALL_EXPR_ARG (exp, 0);
15468 22 : op2 = expand_normal (arg0);
15469 22 : if (!register_operand (op2, SImode))
15470 11 : op2 = copy_to_mode_reg (SImode, op2);
15471 :
15472 22 : insn = (TARGET_64BIT
15473 22 : ? gen_rdpmc_rex64 (op0, op1, op2)
15474 0 : : gen_rdpmc (op0, op2));
15475 22 : emit_insn (insn);
15476 : }
15477 80 : else if (fcode == IX86_BUILTIN_XGETBV)
15478 : {
15479 22 : arg0 = CALL_EXPR_ARG (exp, 0);
15480 22 : op2 = expand_normal (arg0);
15481 22 : if (!register_operand (op2, SImode))
15482 1 : op2 = copy_to_mode_reg (SImode, op2);
15483 :
15484 22 : insn = (TARGET_64BIT
15485 22 : ? gen_xgetbv_rex64 (op0, op1, op2)
15486 0 : : gen_xgetbv (op0, op2));
15487 22 : emit_insn (insn);
15488 : }
15489 58 : else if (fcode == IX86_BUILTIN_RDTSC)
15490 : {
15491 36 : insn = (TARGET_64BIT
15492 36 : ? gen_rdtsc_rex64 (op0, op1)
15493 2 : : gen_rdtsc (op0));
15494 36 : emit_insn (insn);
15495 : }
15496 : else
15497 : {
15498 22 : op2 = gen_reg_rtx (SImode);
15499 :
15500 22 : insn = (TARGET_64BIT
15501 22 : ? gen_rdtscp_rex64 (op0, op1, op2)
15502 0 : : gen_rdtscp (op0, op2));
15503 22 : emit_insn (insn);
15504 :
15505 22 : arg0 = CALL_EXPR_ARG (exp, 0);
15506 22 : op4 = expand_normal (arg0);
15507 22 : if (!address_operand (op4, VOIDmode))
15508 : {
15509 10 : op4 = convert_memory_address (Pmode, op4);
15510 10 : op4 = copy_addr_to_reg (op4);
15511 : }
15512 22 : emit_move_insn (gen_rtx_MEM (SImode, op4), op2);
15513 : }
15514 :
15515 102 : if (target == 0
15516 102 : || !register_operand (target, DImode))
15517 10 : target = gen_reg_rtx (DImode);
15518 :
15519 102 : if (TARGET_64BIT)
15520 : {
15521 100 : op1 = expand_simple_binop (DImode, ASHIFT, op1, GEN_INT (32),
15522 : op1, 1, OPTAB_DIRECT);
15523 100 : op0 = expand_simple_binop (DImode, IOR, op0, op1,
15524 : op0, 1, OPTAB_DIRECT);
15525 : }
15526 :
15527 102 : emit_move_insn (target, op0);
15528 102 : return target;
15529 :
15530 61 : case IX86_BUILTIN_ENQCMD:
15531 61 : case IX86_BUILTIN_ENQCMDS:
15532 61 : case IX86_BUILTIN_MOVDIR64B:
15533 :
15534 61 : arg0 = CALL_EXPR_ARG (exp, 0);
15535 61 : arg1 = CALL_EXPR_ARG (exp, 1);
15536 61 : op0 = expand_normal (arg0);
15537 61 : op1 = expand_normal (arg1);
15538 :
15539 61 : op0 = ix86_zero_extend_to_Pmode (op0);
15540 61 : if (!address_operand (op1, VOIDmode))
15541 : {
15542 28 : op1 = convert_memory_address (Pmode, op1);
15543 28 : op1 = copy_addr_to_reg (op1);
15544 : }
15545 61 : op1 = gen_rtx_MEM (XImode, op1);
15546 :
15547 61 : if (fcode == IX86_BUILTIN_MOVDIR64B)
15548 : {
15549 24 : emit_insn (gen_movdir64b (Pmode, op0, op1));
15550 23 : return 0;
15551 : }
15552 : else
15553 : {
15554 38 : if (target == 0
15555 38 : || !register_operand (target, SImode))
15556 0 : target = gen_reg_rtx (SImode);
15557 :
15558 38 : emit_move_insn (target, const0_rtx);
15559 38 : target = gen_rtx_SUBREG (QImode, target, 0);
15560 :
15561 19 : int unspecv = (fcode == IX86_BUILTIN_ENQCMD
15562 38 : ? UNSPECV_ENQCMD
15563 : : UNSPECV_ENQCMDS);
15564 38 : icode = code_for_enqcmd (unspecv, Pmode);
15565 38 : emit_insn (GEN_FCN (icode) (op0, op1));
15566 :
15567 38 : emit_insn
15568 38 : (gen_rtx_SET (gen_rtx_STRICT_LOW_PART (VOIDmode, target),
15569 : gen_rtx_fmt_ee (EQ, QImode,
15570 : gen_rtx_REG (CCZmode, FLAGS_REG),
15571 : const0_rtx)));
15572 38 : return SUBREG_REG (target);
15573 : }
15574 :
15575 14775 : case IX86_BUILTIN_FXSAVE:
15576 14775 : case IX86_BUILTIN_FXRSTOR:
15577 14775 : case IX86_BUILTIN_FXSAVE64:
15578 14775 : case IX86_BUILTIN_FXRSTOR64:
15579 14775 : case IX86_BUILTIN_FNSTENV:
15580 14775 : case IX86_BUILTIN_FLDENV:
15581 14775 : mode0 = BLKmode;
15582 14775 : switch (fcode)
15583 : {
15584 : case IX86_BUILTIN_FXSAVE:
15585 : icode = CODE_FOR_fxsave;
15586 : break;
15587 19 : case IX86_BUILTIN_FXRSTOR:
15588 19 : icode = CODE_FOR_fxrstor;
15589 19 : break;
15590 23 : case IX86_BUILTIN_FXSAVE64:
15591 23 : icode = CODE_FOR_fxsave64;
15592 23 : break;
15593 21 : case IX86_BUILTIN_FXRSTOR64:
15594 21 : icode = CODE_FOR_fxrstor64;
15595 21 : break;
15596 7257 : case IX86_BUILTIN_FNSTENV:
15597 7257 : icode = CODE_FOR_fnstenv;
15598 7257 : break;
15599 7435 : case IX86_BUILTIN_FLDENV:
15600 7435 : icode = CODE_FOR_fldenv;
15601 7435 : break;
15602 0 : default:
15603 0 : gcc_unreachable ();
15604 : }
15605 :
15606 14775 : arg0 = CALL_EXPR_ARG (exp, 0);
15607 14775 : op0 = expand_normal (arg0);
15608 :
15609 14775 : if (!address_operand (op0, VOIDmode))
15610 : {
15611 36 : op0 = convert_memory_address (Pmode, op0);
15612 36 : op0 = copy_addr_to_reg (op0);
15613 : }
15614 14775 : op0 = gen_rtx_MEM (mode0, op0);
15615 :
15616 14775 : pat = GEN_FCN (icode) (op0);
15617 14775 : if (pat)
15618 14775 : emit_insn (pat);
15619 : return 0;
15620 :
15621 21 : case IX86_BUILTIN_XSETBV:
15622 21 : arg0 = CALL_EXPR_ARG (exp, 0);
15623 21 : arg1 = CALL_EXPR_ARG (exp, 1);
15624 21 : op0 = expand_normal (arg0);
15625 21 : op1 = expand_normal (arg1);
15626 :
15627 21 : if (!REG_P (op0))
15628 1 : op0 = copy_to_mode_reg (SImode, op0);
15629 :
15630 21 : op1 = force_reg (DImode, op1);
15631 :
15632 21 : if (TARGET_64BIT)
15633 : {
15634 21 : op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
15635 : NULL, 1, OPTAB_DIRECT);
15636 :
15637 21 : icode = CODE_FOR_xsetbv_rex64;
15638 :
15639 21 : op2 = gen_lowpart (SImode, op2);
15640 21 : op1 = gen_lowpart (SImode, op1);
15641 21 : pat = GEN_FCN (icode) (op0, op1, op2);
15642 : }
15643 : else
15644 : {
15645 0 : icode = CODE_FOR_xsetbv;
15646 :
15647 0 : pat = GEN_FCN (icode) (op0, op1);
15648 : }
15649 21 : if (pat)
15650 21 : emit_insn (pat);
15651 : return 0;
15652 :
15653 232 : case IX86_BUILTIN_XSAVE:
15654 232 : case IX86_BUILTIN_XRSTOR:
15655 232 : case IX86_BUILTIN_XSAVE64:
15656 232 : case IX86_BUILTIN_XRSTOR64:
15657 232 : case IX86_BUILTIN_XSAVEOPT:
15658 232 : case IX86_BUILTIN_XSAVEOPT64:
15659 232 : case IX86_BUILTIN_XSAVES:
15660 232 : case IX86_BUILTIN_XRSTORS:
15661 232 : case IX86_BUILTIN_XSAVES64:
15662 232 : case IX86_BUILTIN_XRSTORS64:
15663 232 : case IX86_BUILTIN_XSAVEC:
15664 232 : case IX86_BUILTIN_XSAVEC64:
15665 232 : arg0 = CALL_EXPR_ARG (exp, 0);
15666 232 : arg1 = CALL_EXPR_ARG (exp, 1);
15667 232 : op0 = expand_normal (arg0);
15668 232 : op1 = expand_normal (arg1);
15669 :
15670 232 : if (!address_operand (op0, VOIDmode))
15671 : {
15672 108 : op0 = convert_memory_address (Pmode, op0);
15673 108 : op0 = copy_addr_to_reg (op0);
15674 : }
15675 232 : op0 = gen_rtx_MEM (BLKmode, op0);
15676 :
15677 232 : op1 = force_reg (DImode, op1);
15678 :
15679 232 : if (TARGET_64BIT)
15680 : {
15681 232 : op2 = expand_simple_binop (DImode, LSHIFTRT, op1, GEN_INT (32),
15682 : NULL, 1, OPTAB_DIRECT);
15683 232 : switch (fcode)
15684 : {
15685 : case IX86_BUILTIN_XSAVE:
15686 : icode = CODE_FOR_xsave_rex64;
15687 : break;
15688 19 : case IX86_BUILTIN_XRSTOR:
15689 19 : icode = CODE_FOR_xrstor_rex64;
15690 19 : break;
15691 21 : case IX86_BUILTIN_XSAVE64:
15692 21 : icode = CODE_FOR_xsave64;
15693 21 : break;
15694 21 : case IX86_BUILTIN_XRSTOR64:
15695 21 : icode = CODE_FOR_xrstor64;
15696 21 : break;
15697 19 : case IX86_BUILTIN_XSAVEOPT:
15698 19 : icode = CODE_FOR_xsaveopt_rex64;
15699 19 : break;
15700 19 : case IX86_BUILTIN_XSAVEOPT64:
15701 19 : icode = CODE_FOR_xsaveopt64;
15702 19 : break;
15703 19 : case IX86_BUILTIN_XSAVES:
15704 19 : icode = CODE_FOR_xsaves_rex64;
15705 19 : break;
15706 19 : case IX86_BUILTIN_XRSTORS:
15707 19 : icode = CODE_FOR_xrstors_rex64;
15708 19 : break;
15709 19 : case IX86_BUILTIN_XSAVES64:
15710 19 : icode = CODE_FOR_xsaves64;
15711 19 : break;
15712 19 : case IX86_BUILTIN_XRSTORS64:
15713 19 : icode = CODE_FOR_xrstors64;
15714 19 : break;
15715 19 : case IX86_BUILTIN_XSAVEC:
15716 19 : icode = CODE_FOR_xsavec_rex64;
15717 19 : break;
15718 19 : case IX86_BUILTIN_XSAVEC64:
15719 19 : icode = CODE_FOR_xsavec64;
15720 19 : break;
15721 0 : default:
15722 0 : gcc_unreachable ();
15723 : }
15724 :
15725 232 : op2 = gen_lowpart (SImode, op2);
15726 232 : op1 = gen_lowpart (SImode, op1);
15727 232 : pat = GEN_FCN (icode) (op0, op1, op2);
15728 : }
15729 : else
15730 : {
15731 0 : switch (fcode)
15732 : {
15733 : case IX86_BUILTIN_XSAVE:
15734 : icode = CODE_FOR_xsave;
15735 : break;
15736 : case IX86_BUILTIN_XRSTOR:
15737 : icode = CODE_FOR_xrstor;
15738 : break;
15739 : case IX86_BUILTIN_XSAVEOPT:
15740 : icode = CODE_FOR_xsaveopt;
15741 : break;
15742 : case IX86_BUILTIN_XSAVES:
15743 : icode = CODE_FOR_xsaves;
15744 : break;
15745 : case IX86_BUILTIN_XRSTORS:
15746 : icode = CODE_FOR_xrstors;
15747 : break;
15748 : case IX86_BUILTIN_XSAVEC:
15749 : icode = CODE_FOR_xsavec;
15750 : break;
15751 0 : default:
15752 0 : gcc_unreachable ();
15753 : }
15754 0 : pat = GEN_FCN (icode) (op0, op1);
15755 : }
15756 :
15757 232 : if (pat)
15758 232 : emit_insn (pat);
15759 : return 0;
15760 :
15761 144 : case IX86_BUILTIN_LDTILECFG:
15762 144 : case IX86_BUILTIN_STTILECFG:
15763 144 : arg0 = CALL_EXPR_ARG (exp, 0);
15764 144 : op0 = expand_normal (arg0);
15765 :
15766 144 : if (!address_operand (op0, VOIDmode))
15767 : {
15768 8 : op0 = convert_memory_address (Pmode, op0);
15769 8 : op0 = copy_addr_to_reg (op0);
15770 : }
15771 144 : op0 = gen_rtx_MEM (BLKmode, op0);
15772 144 : if (fcode == IX86_BUILTIN_LDTILECFG)
15773 : icode = CODE_FOR_ldtilecfg;
15774 : else
15775 93 : icode = CODE_FOR_sttilecfg;
15776 144 : pat = GEN_FCN (icode) (op0);
15777 144 : emit_insn (pat);
15778 144 : return 0;
15779 :
15780 18 : case IX86_BUILTIN_LLWPCB:
15781 18 : arg0 = CALL_EXPR_ARG (exp, 0);
15782 18 : op0 = expand_normal (arg0);
15783 :
15784 18 : if (!register_operand (op0, Pmode))
15785 9 : op0 = ix86_zero_extend_to_Pmode (op0);
15786 18 : emit_insn (gen_lwp_llwpcb (Pmode, op0));
15787 18 : return 0;
15788 :
15789 18 : case IX86_BUILTIN_SLWPCB:
15790 18 : if (!target
15791 18 : || !register_operand (target, Pmode))
15792 0 : target = gen_reg_rtx (Pmode);
15793 18 : emit_insn (gen_lwp_slwpcb (Pmode, target));
15794 18 : return target;
15795 :
15796 51 : case IX86_BUILTIN_LWPVAL32:
15797 51 : case IX86_BUILTIN_LWPVAL64:
15798 51 : case IX86_BUILTIN_LWPINS32:
15799 51 : case IX86_BUILTIN_LWPINS64:
15800 51 : mode = ((fcode == IX86_BUILTIN_LWPVAL32
15801 51 : || fcode == IX86_BUILTIN_LWPINS32)
15802 51 : ? SImode : DImode);
15803 :
15804 51 : if (fcode == IX86_BUILTIN_LWPVAL32
15805 51 : || fcode == IX86_BUILTIN_LWPVAL64)
15806 26 : icode = code_for_lwp_lwpval (mode);
15807 : else
15808 25 : icode = code_for_lwp_lwpins (mode);
15809 :
15810 51 : arg0 = CALL_EXPR_ARG (exp, 0);
15811 51 : arg1 = CALL_EXPR_ARG (exp, 1);
15812 51 : arg2 = CALL_EXPR_ARG (exp, 2);
15813 51 : op0 = expand_normal (arg0);
15814 51 : op1 = expand_normal (arg1);
15815 51 : op2 = expand_normal (arg2);
15816 51 : mode0 = insn_data[icode].operand[0].mode;
15817 :
15818 51 : if (!insn_data[icode].operand[0].predicate (op0, mode0))
15819 13 : op0 = copy_to_mode_reg (mode0, op0);
15820 51 : if (!insn_data[icode].operand[1].predicate (op1, SImode))
15821 0 : op1 = copy_to_mode_reg (SImode, op1);
15822 :
15823 51 : if (!CONST_INT_P (op2))
15824 : {
15825 0 : error ("the last argument must be a 32-bit immediate");
15826 0 : return const0_rtx;
15827 : }
15828 :
15829 51 : emit_insn (GEN_FCN (icode) (op0, op1, op2));
15830 :
15831 51 : if (fcode == IX86_BUILTIN_LWPINS32
15832 51 : || fcode == IX86_BUILTIN_LWPINS64)
15833 : {
15834 25 : if (target == 0
15835 25 : || !nonimmediate_operand (target, QImode))
15836 0 : target = gen_reg_rtx (QImode);
15837 :
15838 25 : pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
15839 : const0_rtx);
15840 25 : emit_insn (gen_rtx_SET (target, pat));
15841 :
15842 25 : return target;
15843 : }
15844 : else
15845 : return 0;
15846 :
15847 18 : case IX86_BUILTIN_BEXTRI32:
15848 18 : case IX86_BUILTIN_BEXTRI64:
15849 18 : mode = (fcode == IX86_BUILTIN_BEXTRI32 ? SImode : DImode);
15850 :
15851 18 : arg0 = CALL_EXPR_ARG (exp, 0);
15852 18 : arg1 = CALL_EXPR_ARG (exp, 1);
15853 18 : op0 = expand_normal (arg0);
15854 18 : op1 = expand_normal (arg1);
15855 :
15856 18 : if (!CONST_INT_P (op1))
15857 : {
15858 0 : error ("last argument must be an immediate");
15859 0 : return const0_rtx;
15860 : }
15861 : else
15862 : {
15863 18 : unsigned char lsb_index = UINTVAL (op1);
15864 18 : unsigned char length = UINTVAL (op1) >> 8;
15865 :
15866 18 : unsigned char bitsize = GET_MODE_BITSIZE (mode);
15867 :
15868 18 : icode = code_for_tbm_bextri (mode);
15869 :
15870 18 : mode1 = insn_data[icode].operand[1].mode;
15871 18 : if (!insn_data[icode].operand[1].predicate (op0, mode1))
15872 12 : op0 = copy_to_mode_reg (mode1, op0);
15873 :
15874 18 : mode0 = insn_data[icode].operand[0].mode;
15875 18 : if (target == 0
15876 18 : || !register_operand (target, mode0))
15877 0 : target = gen_reg_rtx (mode0);
15878 :
15879 18 : if (length == 0 || lsb_index >= bitsize)
15880 : {
15881 8 : emit_move_insn (target, const0_rtx);
15882 8 : return target;
15883 : }
15884 :
15885 10 : if (length + lsb_index > bitsize)
15886 5 : length = bitsize - lsb_index;
15887 :
15888 10 : op1 = GEN_INT (length);
15889 10 : op2 = GEN_INT (lsb_index);
15890 :
15891 10 : emit_insn (GEN_FCN (icode) (target, op0, op1, op2));
15892 10 : return target;
15893 : }
15894 :
15895 21 : case IX86_BUILTIN_RDRAND16_STEP:
15896 21 : mode = HImode;
15897 21 : goto rdrand_step;
15898 :
15899 42 : case IX86_BUILTIN_RDRAND32_STEP:
15900 42 : mode = SImode;
15901 42 : goto rdrand_step;
15902 :
15903 : case IX86_BUILTIN_RDRAND64_STEP:
15904 : mode = DImode;
15905 :
15906 83 : rdrand_step:
15907 83 : arg0 = CALL_EXPR_ARG (exp, 0);
15908 83 : op1 = expand_normal (arg0);
15909 83 : if (!address_operand (op1, VOIDmode))
15910 : {
15911 29 : op1 = convert_memory_address (Pmode, op1);
15912 29 : op1 = copy_addr_to_reg (op1);
15913 : }
15914 :
15915 83 : op0 = gen_reg_rtx (mode);
15916 83 : emit_insn (gen_rdrand (mode, op0));
15917 :
15918 83 : emit_move_insn (gen_rtx_MEM (mode, op1), op0);
15919 :
15920 83 : op1 = force_reg (SImode, const1_rtx);
15921 :
15922 : /* Emit SImode conditional move. */
15923 83 : if (mode == HImode)
15924 : {
15925 21 : if (TARGET_ZERO_EXTEND_WITH_AND
15926 21 : && optimize_function_for_speed_p (cfun))
15927 : {
15928 0 : op2 = force_reg (SImode, const0_rtx);
15929 :
15930 0 : emit_insn (gen_movstricthi
15931 0 : (gen_lowpart (HImode, op2), op0));
15932 : }
15933 : else
15934 : {
15935 21 : op2 = gen_reg_rtx (SImode);
15936 :
15937 21 : emit_insn (gen_zero_extendhisi2 (op2, op0));
15938 : }
15939 : }
15940 62 : else if (mode == SImode)
15941 : op2 = op0;
15942 : else
15943 20 : op2 = gen_rtx_SUBREG (SImode, op0, 0);
15944 :
15945 83 : if (target == 0
15946 83 : || !register_operand (target, SImode))
15947 7 : target = gen_reg_rtx (SImode);
15948 :
15949 83 : pat = gen_rtx_GEU (VOIDmode, gen_rtx_REG (CCCmode, FLAGS_REG),
15950 : const0_rtx);
15951 83 : emit_insn (gen_rtx_SET (target,
15952 : gen_rtx_IF_THEN_ELSE (SImode, pat, op2, op1)));
15953 83 : return target;
15954 :
15955 19 : case IX86_BUILTIN_RDSEED16_STEP:
15956 19 : mode = HImode;
15957 19 : goto rdseed_step;
15958 :
15959 28 : case IX86_BUILTIN_RDSEED32_STEP:
15960 28 : mode = SImode;
15961 28 : goto rdseed_step;
15962 :
15963 : case IX86_BUILTIN_RDSEED64_STEP:
15964 : mode = DImode;
15965 :
15966 66 : rdseed_step:
15967 66 : arg0 = CALL_EXPR_ARG (exp, 0);
15968 66 : op1 = expand_normal (arg0);
15969 66 : if (!address_operand (op1, VOIDmode))
15970 : {
15971 28 : op1 = convert_memory_address (Pmode, op1);
15972 28 : op1 = copy_addr_to_reg (op1);
15973 : }
15974 :
15975 66 : op0 = gen_reg_rtx (mode);
15976 66 : emit_insn (gen_rdseed (mode, op0));
15977 :
15978 66 : emit_move_insn (gen_rtx_MEM (mode, op1), op0);
15979 :
15980 66 : op2 = gen_reg_rtx (QImode);
15981 :
15982 66 : pat = gen_rtx_LTU (QImode, gen_rtx_REG (CCCmode, FLAGS_REG),
15983 : const0_rtx);
15984 66 : emit_insn (gen_rtx_SET (op2, pat));
15985 :
15986 66 : if (target == 0
15987 66 : || !register_operand (target, SImode))
15988 1 : target = gen_reg_rtx (SImode);
15989 :
15990 66 : emit_insn (gen_zero_extendqisi2 (target, op2));
15991 66 : return target;
15992 :
15993 38 : case IX86_BUILTIN_SBB32:
15994 38 : icode = CODE_FOR_subborrowsi;
15995 38 : icode2 = CODE_FOR_subborrowsi_0;
15996 38 : mode0 = SImode;
15997 38 : mode1 = DImode;
15998 38 : mode2 = CCmode;
15999 38 : goto handlecarry;
16000 :
16001 44 : case IX86_BUILTIN_SBB64:
16002 44 : icode = CODE_FOR_subborrowdi;
16003 44 : icode2 = CODE_FOR_subborrowdi_0;
16004 44 : mode0 = DImode;
16005 44 : mode1 = TImode;
16006 44 : mode2 = CCmode;
16007 44 : goto handlecarry;
16008 :
16009 68 : case IX86_BUILTIN_ADDCARRYX32:
16010 68 : icode = CODE_FOR_addcarrysi;
16011 68 : icode2 = CODE_FOR_addcarrysi_0;
16012 68 : mode0 = SImode;
16013 68 : mode1 = DImode;
16014 68 : mode2 = CCCmode;
16015 68 : goto handlecarry;
16016 :
16017 : case IX86_BUILTIN_ADDCARRYX64:
16018 : icode = CODE_FOR_addcarrydi;
16019 : icode2 = CODE_FOR_addcarrydi_0;
16020 : mode0 = DImode;
16021 : mode1 = TImode;
16022 : mode2 = CCCmode;
16023 :
16024 212 : handlecarry:
16025 212 : arg0 = CALL_EXPR_ARG (exp, 0); /* unsigned char c_in. */
16026 212 : arg1 = CALL_EXPR_ARG (exp, 1); /* unsigned int src1. */
16027 212 : arg2 = CALL_EXPR_ARG (exp, 2); /* unsigned int src2. */
16028 212 : arg3 = CALL_EXPR_ARG (exp, 3); /* unsigned int *sum_out. */
16029 :
16030 212 : op1 = expand_normal (arg0);
16031 :
16032 212 : op2 = expand_normal (arg1);
16033 212 : if (!register_operand (op2, mode0))
16034 117 : op2 = copy_to_mode_reg (mode0, op2);
16035 :
16036 212 : op3 = expand_normal (arg2);
16037 212 : if (!register_operand (op3, mode0))
16038 120 : op3 = copy_to_mode_reg (mode0, op3);
16039 :
16040 212 : op4 = expand_normal (arg3);
16041 212 : if (!address_operand (op4, VOIDmode))
16042 : {
16043 67 : op4 = convert_memory_address (Pmode, op4);
16044 67 : op4 = copy_addr_to_reg (op4);
16045 : }
16046 :
16047 212 : op0 = gen_reg_rtx (mode0);
16048 212 : if (op1 == const0_rtx)
16049 : {
16050 : /* If arg0 is 0, optimize right away into add or sub
16051 : instruction that sets CCCmode flags. */
16052 21 : op1 = gen_rtx_REG (mode2, FLAGS_REG);
16053 21 : emit_insn (GEN_FCN (icode2) (op0, op2, op3));
16054 : }
16055 : else
16056 : {
16057 : /* Generate CF from input operand. */
16058 191 : ix86_expand_carry (op1);
16059 :
16060 : /* Generate instruction that consumes CF. */
16061 191 : op1 = gen_rtx_REG (CCCmode, FLAGS_REG);
16062 191 : pat = gen_rtx_LTU (mode1, op1, const0_rtx);
16063 191 : pat2 = gen_rtx_LTU (mode0, op1, const0_rtx);
16064 191 : emit_insn (GEN_FCN (icode) (op0, op2, op3, op1, pat, pat2));
16065 : }
16066 :
16067 : /* Return current CF value. */
16068 212 : if (target == 0)
16069 14 : target = gen_reg_rtx (QImode);
16070 :
16071 212 : pat = gen_rtx_LTU (QImode, op1, const0_rtx);
16072 212 : emit_insn (gen_rtx_SET (target, pat));
16073 :
16074 : /* Store the result. */
16075 212 : emit_move_insn (gen_rtx_MEM (mode0, op4), op0);
16076 :
16077 212 : return target;
16078 :
16079 24 : case IX86_BUILTIN_READ_FLAGS:
16080 24 : if (ignore)
16081 1 : return const0_rtx;
16082 :
16083 23 : emit_insn (gen_pushfl ());
16084 :
16085 23 : if (optimize
16086 11 : || target == NULL_RTX
16087 11 : || !nonimmediate_operand (target, word_mode)
16088 34 : || GET_MODE (target) != word_mode)
16089 12 : target = gen_reg_rtx (word_mode);
16090 :
16091 23 : emit_insn (gen_pop (target));
16092 23 : return target;
16093 :
16094 21 : case IX86_BUILTIN_WRITE_FLAGS:
16095 :
16096 21 : arg0 = CALL_EXPR_ARG (exp, 0);
16097 21 : op0 = expand_normal (arg0);
16098 21 : if (!general_no_elim_operand (op0, word_mode))
16099 0 : op0 = copy_to_mode_reg (word_mode, op0);
16100 :
16101 21 : emit_insn (gen_push (op0));
16102 21 : emit_insn (gen_popfl ());
16103 21 : return 0;
16104 :
16105 22 : case IX86_BUILTIN_KTESTC8:
16106 22 : icode = CODE_FOR_ktestqi;
16107 22 : mode3 = CCCmode;
16108 22 : goto kortest;
16109 :
16110 22 : case IX86_BUILTIN_KTESTZ8:
16111 22 : icode = CODE_FOR_ktestqi;
16112 22 : mode3 = CCZmode;
16113 22 : goto kortest;
16114 :
16115 22 : case IX86_BUILTIN_KTESTC16:
16116 22 : icode = CODE_FOR_ktesthi;
16117 22 : mode3 = CCCmode;
16118 22 : goto kortest;
16119 :
16120 22 : case IX86_BUILTIN_KTESTZ16:
16121 22 : icode = CODE_FOR_ktesthi;
16122 22 : mode3 = CCZmode;
16123 22 : goto kortest;
16124 :
16125 22 : case IX86_BUILTIN_KTESTC32:
16126 22 : icode = CODE_FOR_ktestsi;
16127 22 : mode3 = CCCmode;
16128 22 : goto kortest;
16129 :
16130 22 : case IX86_BUILTIN_KTESTZ32:
16131 22 : icode = CODE_FOR_ktestsi;
16132 22 : mode3 = CCZmode;
16133 22 : goto kortest;
16134 :
16135 22 : case IX86_BUILTIN_KTESTC64:
16136 22 : icode = CODE_FOR_ktestdi;
16137 22 : mode3 = CCCmode;
16138 22 : goto kortest;
16139 :
16140 22 : case IX86_BUILTIN_KTESTZ64:
16141 22 : icode = CODE_FOR_ktestdi;
16142 22 : mode3 = CCZmode;
16143 22 : goto kortest;
16144 :
16145 22 : case IX86_BUILTIN_KORTESTC8:
16146 22 : icode = CODE_FOR_kortestqi;
16147 22 : mode3 = CCCmode;
16148 22 : goto kortest;
16149 :
16150 76 : case IX86_BUILTIN_KORTESTZ8:
16151 76 : icode = CODE_FOR_kortestqi;
16152 76 : mode3 = CCZmode;
16153 76 : goto kortest;
16154 :
16155 38 : case IX86_BUILTIN_KORTESTC16:
16156 38 : icode = CODE_FOR_kortesthi;
16157 38 : mode3 = CCCmode;
16158 38 : goto kortest;
16159 :
16160 91 : case IX86_BUILTIN_KORTESTZ16:
16161 91 : icode = CODE_FOR_kortesthi;
16162 91 : mode3 = CCZmode;
16163 91 : goto kortest;
16164 :
16165 22 : case IX86_BUILTIN_KORTESTC32:
16166 22 : icode = CODE_FOR_kortestsi;
16167 22 : mode3 = CCCmode;
16168 22 : goto kortest;
16169 :
16170 79 : case IX86_BUILTIN_KORTESTZ32:
16171 79 : icode = CODE_FOR_kortestsi;
16172 79 : mode3 = CCZmode;
16173 79 : goto kortest;
16174 :
16175 22 : case IX86_BUILTIN_KORTESTC64:
16176 22 : icode = CODE_FOR_kortestdi;
16177 22 : mode3 = CCCmode;
16178 22 : goto kortest;
16179 :
16180 : case IX86_BUILTIN_KORTESTZ64:
16181 : icode = CODE_FOR_kortestdi;
16182 : mode3 = CCZmode;
16183 :
16184 610 : kortest:
16185 610 : arg0 = CALL_EXPR_ARG (exp, 0); /* Mask reg src1. */
16186 610 : arg1 = CALL_EXPR_ARG (exp, 1); /* Mask reg src2. */
16187 610 : op0 = expand_normal (arg0);
16188 610 : op1 = expand_normal (arg1);
16189 :
16190 610 : mode0 = insn_data[icode].operand[0].mode;
16191 610 : mode1 = insn_data[icode].operand[1].mode;
16192 :
16193 610 : if (GET_MODE (op0) != VOIDmode)
16194 610 : op0 = force_reg (GET_MODE (op0), op0);
16195 :
16196 610 : op0 = gen_lowpart (mode0, op0);
16197 :
16198 610 : if (!insn_data[icode].operand[0].predicate (op0, mode0))
16199 0 : op0 = copy_to_mode_reg (mode0, op0);
16200 :
16201 610 : if (GET_MODE (op1) != VOIDmode)
16202 609 : op1 = force_reg (GET_MODE (op1), op1);
16203 :
16204 610 : op1 = gen_lowpart (mode1, op1);
16205 :
16206 610 : if (!insn_data[icode].operand[1].predicate (op1, mode1))
16207 1 : op1 = copy_to_mode_reg (mode1, op1);
16208 :
16209 610 : target = gen_reg_rtx (QImode);
16210 :
16211 : /* Emit kortest. */
16212 610 : emit_insn (GEN_FCN (icode) (op0, op1));
16213 : /* And use setcc to return result from flags. */
16214 610 : ix86_expand_setcc (target, EQ,
16215 : gen_rtx_REG (mode3, FLAGS_REG), const0_rtx);
16216 610 : return target;
16217 :
16218 24 : case IX86_BUILTIN_GATHERSIV2DF:
16219 24 : icode = CODE_FOR_avx2_gathersiv2df;
16220 24 : goto gather_gen;
16221 18 : case IX86_BUILTIN_GATHERSIV4DF:
16222 18 : icode = CODE_FOR_avx2_gathersiv4df;
16223 18 : goto gather_gen;
16224 21 : case IX86_BUILTIN_GATHERDIV2DF:
16225 21 : icode = CODE_FOR_avx2_gatherdiv2df;
16226 21 : goto gather_gen;
16227 32 : case IX86_BUILTIN_GATHERDIV4DF:
16228 32 : icode = CODE_FOR_avx2_gatherdiv4df;
16229 32 : goto gather_gen;
16230 30 : case IX86_BUILTIN_GATHERSIV4SF:
16231 30 : icode = CODE_FOR_avx2_gathersiv4sf;
16232 30 : goto gather_gen;
16233 37 : case IX86_BUILTIN_GATHERSIV8SF:
16234 37 : icode = CODE_FOR_avx2_gathersiv8sf;
16235 37 : goto gather_gen;
16236 24 : case IX86_BUILTIN_GATHERDIV4SF:
16237 24 : icode = CODE_FOR_avx2_gatherdiv4sf;
16238 24 : goto gather_gen;
16239 18 : case IX86_BUILTIN_GATHERDIV8SF:
16240 18 : icode = CODE_FOR_avx2_gatherdiv8sf;
16241 18 : goto gather_gen;
16242 18 : case IX86_BUILTIN_GATHERSIV2DI:
16243 18 : icode = CODE_FOR_avx2_gathersiv2di;
16244 18 : goto gather_gen;
16245 18 : case IX86_BUILTIN_GATHERSIV4DI:
16246 18 : icode = CODE_FOR_avx2_gathersiv4di;
16247 18 : goto gather_gen;
16248 27 : case IX86_BUILTIN_GATHERDIV2DI:
16249 27 : icode = CODE_FOR_avx2_gatherdiv2di;
16250 27 : goto gather_gen;
16251 29 : case IX86_BUILTIN_GATHERDIV4DI:
16252 29 : icode = CODE_FOR_avx2_gatherdiv4di;
16253 29 : goto gather_gen;
16254 20 : case IX86_BUILTIN_GATHERSIV4SI:
16255 20 : icode = CODE_FOR_avx2_gathersiv4si;
16256 20 : goto gather_gen;
16257 22 : case IX86_BUILTIN_GATHERSIV8SI:
16258 22 : icode = CODE_FOR_avx2_gathersiv8si;
16259 22 : goto gather_gen;
16260 28 : case IX86_BUILTIN_GATHERDIV4SI:
16261 28 : icode = CODE_FOR_avx2_gatherdiv4si;
16262 28 : goto gather_gen;
16263 18 : case IX86_BUILTIN_GATHERDIV8SI:
16264 18 : icode = CODE_FOR_avx2_gatherdiv8si;
16265 18 : goto gather_gen;
16266 20 : case IX86_BUILTIN_GATHERALTSIV4DF:
16267 20 : icode = CODE_FOR_avx2_gathersiv4df;
16268 20 : goto gather_gen;
16269 16 : case IX86_BUILTIN_GATHERALTDIV8SF:
16270 16 : icode = CODE_FOR_avx2_gatherdiv8sf;
16271 16 : goto gather_gen;
16272 4 : case IX86_BUILTIN_GATHERALTSIV4DI:
16273 4 : icode = CODE_FOR_avx2_gathersiv4di;
16274 4 : goto gather_gen;
16275 12 : case IX86_BUILTIN_GATHERALTDIV8SI:
16276 12 : icode = CODE_FOR_avx2_gatherdiv8si;
16277 12 : goto gather_gen;
16278 36 : case IX86_BUILTIN_GATHER3SIV16SF:
16279 36 : icode = CODE_FOR_avx512f_gathersiv16sf;
16280 36 : goto gather_gen;
16281 24 : case IX86_BUILTIN_GATHER3SIV8DF:
16282 24 : icode = CODE_FOR_avx512f_gathersiv8df;
16283 24 : goto gather_gen;
16284 24 : case IX86_BUILTIN_GATHER3DIV16SF:
16285 24 : icode = CODE_FOR_avx512f_gatherdiv16sf;
16286 24 : goto gather_gen;
16287 37 : case IX86_BUILTIN_GATHER3DIV8DF:
16288 37 : icode = CODE_FOR_avx512f_gatherdiv8df;
16289 37 : goto gather_gen;
16290 30 : case IX86_BUILTIN_GATHER3SIV16SI:
16291 30 : icode = CODE_FOR_avx512f_gathersiv16si;
16292 30 : goto gather_gen;
16293 24 : case IX86_BUILTIN_GATHER3SIV8DI:
16294 24 : icode = CODE_FOR_avx512f_gathersiv8di;
16295 24 : goto gather_gen;
16296 24 : case IX86_BUILTIN_GATHER3DIV16SI:
16297 24 : icode = CODE_FOR_avx512f_gatherdiv16si;
16298 24 : goto gather_gen;
16299 37 : case IX86_BUILTIN_GATHER3DIV8DI:
16300 37 : icode = CODE_FOR_avx512f_gatherdiv8di;
16301 37 : goto gather_gen;
16302 16 : case IX86_BUILTIN_GATHER3ALTSIV8DF:
16303 16 : icode = CODE_FOR_avx512f_gathersiv8df;
16304 16 : goto gather_gen;
16305 22 : case IX86_BUILTIN_GATHER3ALTDIV16SF:
16306 22 : icode = CODE_FOR_avx512f_gatherdiv16sf;
16307 22 : goto gather_gen;
16308 14 : case IX86_BUILTIN_GATHER3ALTSIV8DI:
16309 14 : icode = CODE_FOR_avx512f_gathersiv8di;
16310 14 : goto gather_gen;
16311 18 : case IX86_BUILTIN_GATHER3ALTDIV16SI:
16312 18 : icode = CODE_FOR_avx512f_gatherdiv16si;
16313 18 : goto gather_gen;
16314 18 : case IX86_BUILTIN_GATHER3SIV2DF:
16315 18 : icode = CODE_FOR_avx512vl_gathersiv2df;
16316 18 : goto gather_gen;
16317 10 : case IX86_BUILTIN_GATHER3SIV4DF:
16318 10 : icode = CODE_FOR_avx512vl_gathersiv4df;
16319 10 : goto gather_gen;
16320 15 : case IX86_BUILTIN_GATHER3DIV2DF:
16321 15 : icode = CODE_FOR_avx512vl_gatherdiv2df;
16322 15 : goto gather_gen;
16323 16 : case IX86_BUILTIN_GATHER3DIV4DF:
16324 16 : icode = CODE_FOR_avx512vl_gatherdiv4df;
16325 16 : goto gather_gen;
16326 14 : case IX86_BUILTIN_GATHER3SIV4SF:
16327 14 : icode = CODE_FOR_avx512vl_gathersiv4sf;
16328 14 : goto gather_gen;
16329 12 : case IX86_BUILTIN_GATHER3SIV8SF:
16330 12 : icode = CODE_FOR_avx512vl_gathersiv8sf;
16331 12 : goto gather_gen;
16332 22 : case IX86_BUILTIN_GATHER3DIV4SF:
16333 22 : icode = CODE_FOR_avx512vl_gatherdiv4sf;
16334 22 : goto gather_gen;
16335 10 : case IX86_BUILTIN_GATHER3DIV8SF:
16336 10 : icode = CODE_FOR_avx512vl_gatherdiv8sf;
16337 10 : goto gather_gen;
16338 20 : case IX86_BUILTIN_GATHER3SIV2DI:
16339 20 : icode = CODE_FOR_avx512vl_gathersiv2di;
16340 20 : goto gather_gen;
16341 10 : case IX86_BUILTIN_GATHER3SIV4DI:
16342 10 : icode = CODE_FOR_avx512vl_gathersiv4di;
16343 10 : goto gather_gen;
16344 14 : case IX86_BUILTIN_GATHER3DIV2DI:
16345 14 : icode = CODE_FOR_avx512vl_gatherdiv2di;
16346 14 : goto gather_gen;
16347 13 : case IX86_BUILTIN_GATHER3DIV4DI:
16348 13 : icode = CODE_FOR_avx512vl_gatherdiv4di;
16349 13 : goto gather_gen;
16350 14 : case IX86_BUILTIN_GATHER3SIV4SI:
16351 14 : icode = CODE_FOR_avx512vl_gathersiv4si;
16352 14 : goto gather_gen;
16353 12 : case IX86_BUILTIN_GATHER3SIV8SI:
16354 12 : icode = CODE_FOR_avx512vl_gathersiv8si;
16355 12 : goto gather_gen;
16356 24 : case IX86_BUILTIN_GATHER3DIV4SI:
16357 24 : icode = CODE_FOR_avx512vl_gatherdiv4si;
16358 24 : goto gather_gen;
16359 10 : case IX86_BUILTIN_GATHER3DIV8SI:
16360 10 : icode = CODE_FOR_avx512vl_gatherdiv8si;
16361 10 : goto gather_gen;
16362 4 : case IX86_BUILTIN_GATHER3ALTSIV4DF:
16363 4 : icode = CODE_FOR_avx512vl_gathersiv4df;
16364 4 : goto gather_gen;
16365 8 : case IX86_BUILTIN_GATHER3ALTDIV8SF:
16366 8 : icode = CODE_FOR_avx512vl_gatherdiv8sf;
16367 8 : goto gather_gen;
16368 6 : case IX86_BUILTIN_GATHER3ALTSIV4DI:
16369 6 : icode = CODE_FOR_avx512vl_gathersiv4di;
16370 6 : goto gather_gen;
16371 10 : case IX86_BUILTIN_GATHER3ALTDIV8SI:
16372 10 : icode = CODE_FOR_avx512vl_gatherdiv8si;
16373 10 : goto gather_gen;
16374 40 : case IX86_BUILTIN_SCATTERSIV16SF:
16375 40 : icode = CODE_FOR_avx512f_scattersiv16sf;
16376 40 : goto scatter_gen;
16377 27 : case IX86_BUILTIN_SCATTERSIV8DF:
16378 27 : icode = CODE_FOR_avx512f_scattersiv8df;
16379 27 : goto scatter_gen;
16380 24 : case IX86_BUILTIN_SCATTERDIV16SF:
16381 24 : icode = CODE_FOR_avx512f_scatterdiv16sf;
16382 24 : goto scatter_gen;
16383 33 : case IX86_BUILTIN_SCATTERDIV8DF:
16384 33 : icode = CODE_FOR_avx512f_scatterdiv8df;
16385 33 : goto scatter_gen;
16386 30 : case IX86_BUILTIN_SCATTERSIV16SI:
16387 30 : icode = CODE_FOR_avx512f_scattersiv16si;
16388 30 : goto scatter_gen;
16389 24 : case IX86_BUILTIN_SCATTERSIV8DI:
16390 24 : icode = CODE_FOR_avx512f_scattersiv8di;
16391 24 : goto scatter_gen;
16392 24 : case IX86_BUILTIN_SCATTERDIV16SI:
16393 24 : icode = CODE_FOR_avx512f_scatterdiv16si;
16394 24 : goto scatter_gen;
16395 29 : case IX86_BUILTIN_SCATTERDIV8DI:
16396 29 : icode = CODE_FOR_avx512f_scatterdiv8di;
16397 29 : goto scatter_gen;
16398 18 : case IX86_BUILTIN_SCATTERSIV8SF:
16399 18 : icode = CODE_FOR_avx512vl_scattersiv8sf;
16400 18 : goto scatter_gen;
16401 20 : case IX86_BUILTIN_SCATTERSIV4SF:
16402 20 : icode = CODE_FOR_avx512vl_scattersiv4sf;
16403 20 : goto scatter_gen;
16404 16 : case IX86_BUILTIN_SCATTERSIV4DF:
16405 16 : icode = CODE_FOR_avx512vl_scattersiv4df;
16406 16 : goto scatter_gen;
16407 16 : case IX86_BUILTIN_SCATTERSIV2DF:
16408 16 : icode = CODE_FOR_avx512vl_scattersiv2df;
16409 16 : goto scatter_gen;
16410 16 : case IX86_BUILTIN_SCATTERDIV8SF:
16411 16 : icode = CODE_FOR_avx512vl_scatterdiv8sf;
16412 16 : goto scatter_gen;
16413 16 : case IX86_BUILTIN_SCATTERDIV4SF:
16414 16 : icode = CODE_FOR_avx512vl_scatterdiv4sf;
16415 16 : goto scatter_gen;
16416 18 : case IX86_BUILTIN_SCATTERDIV4DF:
16417 18 : icode = CODE_FOR_avx512vl_scatterdiv4df;
16418 18 : goto scatter_gen;
16419 18 : case IX86_BUILTIN_SCATTERDIV2DF:
16420 18 : icode = CODE_FOR_avx512vl_scatterdiv2df;
16421 18 : goto scatter_gen;
16422 22 : case IX86_BUILTIN_SCATTERSIV8SI:
16423 22 : icode = CODE_FOR_avx512vl_scattersiv8si;
16424 22 : goto scatter_gen;
16425 24 : case IX86_BUILTIN_SCATTERSIV4SI:
16426 24 : icode = CODE_FOR_avx512vl_scattersiv4si;
16427 24 : goto scatter_gen;
16428 16 : case IX86_BUILTIN_SCATTERSIV4DI:
16429 16 : icode = CODE_FOR_avx512vl_scattersiv4di;
16430 16 : goto scatter_gen;
16431 16 : case IX86_BUILTIN_SCATTERSIV2DI:
16432 16 : icode = CODE_FOR_avx512vl_scattersiv2di;
16433 16 : goto scatter_gen;
16434 16 : case IX86_BUILTIN_SCATTERDIV8SI:
16435 16 : icode = CODE_FOR_avx512vl_scatterdiv8si;
16436 16 : goto scatter_gen;
16437 16 : case IX86_BUILTIN_SCATTERDIV4SI:
16438 16 : icode = CODE_FOR_avx512vl_scatterdiv4si;
16439 16 : goto scatter_gen;
16440 18 : case IX86_BUILTIN_SCATTERDIV4DI:
16441 18 : icode = CODE_FOR_avx512vl_scatterdiv4di;
16442 18 : goto scatter_gen;
16443 18 : case IX86_BUILTIN_SCATTERDIV2DI:
16444 18 : icode = CODE_FOR_avx512vl_scatterdiv2di;
16445 18 : goto scatter_gen;
16446 16 : case IX86_BUILTIN_SCATTERALTSIV8DF:
16447 16 : icode = CODE_FOR_avx512f_scattersiv8df;
16448 16 : goto scatter_gen;
16449 12 : case IX86_BUILTIN_SCATTERALTDIV16SF:
16450 12 : icode = CODE_FOR_avx512f_scatterdiv16sf;
16451 12 : goto scatter_gen;
16452 8 : case IX86_BUILTIN_SCATTERALTSIV8DI:
16453 8 : icode = CODE_FOR_avx512f_scattersiv8di;
16454 8 : goto scatter_gen;
16455 24 : case IX86_BUILTIN_SCATTERALTDIV16SI:
16456 24 : icode = CODE_FOR_avx512f_scatterdiv16si;
16457 24 : goto scatter_gen;
16458 4 : case IX86_BUILTIN_SCATTERALTSIV4DF:
16459 4 : icode = CODE_FOR_avx512vl_scattersiv4df;
16460 4 : goto scatter_gen;
16461 4 : case IX86_BUILTIN_SCATTERALTDIV8SF:
16462 4 : icode = CODE_FOR_avx512vl_scatterdiv8sf;
16463 4 : goto scatter_gen;
16464 4 : case IX86_BUILTIN_SCATTERALTSIV4DI:
16465 4 : icode = CODE_FOR_avx512vl_scattersiv4di;
16466 4 : goto scatter_gen;
16467 4 : case IX86_BUILTIN_SCATTERALTDIV8SI:
16468 4 : icode = CODE_FOR_avx512vl_scatterdiv8si;
16469 4 : goto scatter_gen;
16470 8 : case IX86_BUILTIN_SCATTERALTSIV2DF:
16471 8 : icode = CODE_FOR_avx512vl_scattersiv2df;
16472 8 : goto scatter_gen;
16473 8 : case IX86_BUILTIN_SCATTERALTDIV4SF:
16474 8 : icode = CODE_FOR_avx512vl_scatterdiv4sf;
16475 8 : goto scatter_gen;
16476 8 : case IX86_BUILTIN_SCATTERALTSIV2DI:
16477 8 : icode = CODE_FOR_avx512vl_scattersiv2di;
16478 8 : goto scatter_gen;
16479 8 : case IX86_BUILTIN_SCATTERALTDIV4SI:
16480 8 : icode = CODE_FOR_avx512vl_scatterdiv4si;
16481 8 : goto scatter_gen;
16482 :
16483 1004 : gather_gen:
16484 1004 : rtx half;
16485 1004 : rtx (*gen) (rtx, rtx);
16486 :
16487 1004 : arg0 = CALL_EXPR_ARG (exp, 0);
16488 1004 : arg1 = CALL_EXPR_ARG (exp, 1);
16489 1004 : arg2 = CALL_EXPR_ARG (exp, 2);
16490 1004 : arg3 = CALL_EXPR_ARG (exp, 3);
16491 1004 : arg4 = CALL_EXPR_ARG (exp, 4);
16492 1004 : op0 = expand_normal (arg0);
16493 1004 : op1 = expand_normal (arg1);
16494 1004 : op2 = expand_normal (arg2);
16495 1004 : op3 = ix86_expand_unsigned_small_int_cst_argument (arg3);
16496 1004 : op4 = expand_normal (arg4);
16497 : /* Note the arg order is different from the operand order. */
16498 1004 : mode0 = insn_data[icode].operand[1].mode;
16499 1004 : mode2 = insn_data[icode].operand[3].mode;
16500 1004 : mode3 = insn_data[icode].operand[4].mode;
16501 1004 : mode4 = insn_data[icode].operand[5].mode;
16502 :
16503 1004 : if (target == NULL_RTX
16504 1004 : || GET_MODE (target) != insn_data[icode].operand[0].mode
16505 1904 : || !insn_data[icode].operand[0].predicate (target,
16506 : GET_MODE (target)))
16507 105 : subtarget = gen_reg_rtx (insn_data[icode].operand[0].mode);
16508 : else
16509 : subtarget = target;
16510 :
16511 1004 : switch (fcode)
16512 : {
16513 30 : case IX86_BUILTIN_GATHER3ALTSIV8DF:
16514 30 : case IX86_BUILTIN_GATHER3ALTSIV8DI:
16515 30 : half = gen_reg_rtx (V8SImode);
16516 30 : if (!nonimmediate_operand (op2, V16SImode))
16517 0 : op2 = copy_to_mode_reg (V16SImode, op2);
16518 30 : emit_insn (gen_vec_extract_lo_v16si (half, op2));
16519 30 : op2 = half;
16520 30 : break;
16521 34 : case IX86_BUILTIN_GATHER3ALTSIV4DF:
16522 34 : case IX86_BUILTIN_GATHER3ALTSIV4DI:
16523 34 : case IX86_BUILTIN_GATHERALTSIV4DF:
16524 34 : case IX86_BUILTIN_GATHERALTSIV4DI:
16525 34 : half = gen_reg_rtx (V4SImode);
16526 34 : if (!nonimmediate_operand (op2, V8SImode))
16527 0 : op2 = copy_to_mode_reg (V8SImode, op2);
16528 34 : emit_insn (gen_vec_extract_lo_v8si (half, op2));
16529 34 : op2 = half;
16530 34 : break;
16531 40 : case IX86_BUILTIN_GATHER3ALTDIV16SF:
16532 40 : case IX86_BUILTIN_GATHER3ALTDIV16SI:
16533 40 : half = gen_reg_rtx (mode0);
16534 40 : if (mode0 == V8SFmode)
16535 : gen = gen_vec_extract_lo_v16sf;
16536 : else
16537 18 : gen = gen_vec_extract_lo_v16si;
16538 40 : if (!nonimmediate_operand (op0, GET_MODE (op0)))
16539 40 : op0 = copy_to_mode_reg (GET_MODE (op0), op0);
16540 40 : emit_insn (gen (half, op0));
16541 40 : op0 = half;
16542 40 : op3 = lowpart_subreg (QImode, op3, HImode);
16543 40 : break;
16544 46 : case IX86_BUILTIN_GATHER3ALTDIV8SF:
16545 46 : case IX86_BUILTIN_GATHER3ALTDIV8SI:
16546 46 : case IX86_BUILTIN_GATHERALTDIV8SF:
16547 46 : case IX86_BUILTIN_GATHERALTDIV8SI:
16548 46 : half = gen_reg_rtx (mode0);
16549 46 : if (mode0 == V4SFmode)
16550 : gen = gen_vec_extract_lo_v8sf;
16551 : else
16552 22 : gen = gen_vec_extract_lo_v8si;
16553 46 : if (!nonimmediate_operand (op0, GET_MODE (op0)))
16554 46 : op0 = copy_to_mode_reg (GET_MODE (op0), op0);
16555 46 : emit_insn (gen (half, op0));
16556 46 : op0 = half;
16557 46 : if (VECTOR_MODE_P (GET_MODE (op3)))
16558 : {
16559 28 : half = gen_reg_rtx (mode0);
16560 28 : if (!nonimmediate_operand (op3, GET_MODE (op3)))
16561 12 : op3 = copy_to_mode_reg (GET_MODE (op3), op3);
16562 28 : emit_insn (gen (half, op3));
16563 28 : op3 = half;
16564 : }
16565 : break;
16566 : default:
16567 : break;
16568 : }
16569 :
16570 : /* Force memory operand only with base register here. But we
16571 : don't want to do it on memory operand for other builtin
16572 : functions. */
16573 1004 : op1 = ix86_zero_extend_to_Pmode (op1);
16574 :
16575 1004 : if (!insn_data[icode].operand[1].predicate (op0, mode0))
16576 403 : op0 = copy_to_mode_reg (mode0, op0);
16577 1009 : if (!insn_data[icode].operand[2].predicate (op1, Pmode))
16578 0 : op1 = copy_to_mode_reg (Pmode, op1);
16579 1004 : if (!insn_data[icode].operand[3].predicate (op2, mode2))
16580 221 : op2 = copy_to_mode_reg (mode2, op2);
16581 :
16582 1004 : op3 = fixup_modeless_constant (op3, mode3);
16583 :
16584 1004 : if (GET_MODE (op3) == mode3 || GET_MODE (op3) == VOIDmode)
16585 : {
16586 1004 : if (!insn_data[icode].operand[4].predicate (op3, mode3))
16587 356 : op3 = copy_to_mode_reg (mode3, op3);
16588 : }
16589 : else
16590 : {
16591 0 : op3 = copy_to_reg (op3);
16592 0 : op3 = lowpart_subreg (mode3, op3, GET_MODE (op3));
16593 : }
16594 1004 : if (!insn_data[icode].operand[5].predicate (op4, mode4))
16595 : {
16596 0 : error ("the last argument must be scale 1, 2, 4, 8");
16597 0 : return const0_rtx;
16598 : }
16599 :
16600 : /* Optimize. If mask is known to have all high bits set,
16601 : replace op0 with pc_rtx to signal that the instruction
16602 : overwrites the whole destination and doesn't use its
16603 : previous contents. */
16604 1004 : if (optimize)
16605 : {
16606 914 : if (TREE_CODE (arg3) == INTEGER_CST)
16607 : {
16608 209 : if (integer_all_onesp (arg3))
16609 201 : op0 = pc_rtx;
16610 : }
16611 705 : else if (TREE_CODE (arg3) == VECTOR_CST)
16612 : {
16613 : unsigned int negative = 0;
16614 755 : for (i = 0; i < VECTOR_CST_NELTS (arg3); ++i)
16615 : {
16616 620 : tree cst = VECTOR_CST_ELT (arg3, i);
16617 620 : if (TREE_CODE (cst) == INTEGER_CST
16618 620 : && tree_int_cst_sign_bit (cst))
16619 286 : negative++;
16620 334 : else if (TREE_CODE (cst) == REAL_CST
16621 334 : && REAL_VALUE_NEGATIVE (TREE_REAL_CST (cst)))
16622 306 : negative++;
16623 : }
16624 135 : if (negative == TYPE_VECTOR_SUBPARTS (TREE_TYPE (arg3)))
16625 121 : op0 = pc_rtx;
16626 : }
16627 570 : else if (TREE_CODE (arg3) == SSA_NAME
16628 570 : && VECTOR_TYPE_P (TREE_TYPE (arg3)))
16629 : {
16630 : /* Recognize also when mask is like:
16631 : __v2df src = _mm_setzero_pd ();
16632 : __v2df mask = _mm_cmpeq_pd (src, src);
16633 : or
16634 : __v8sf src = _mm256_setzero_ps ();
16635 : __v8sf mask = _mm256_cmp_ps (src, src, _CMP_EQ_OQ);
16636 : as that is a cheaper way to load all ones into
16637 : a register than having to load a constant from
16638 : memory. */
16639 259 : gimple *def_stmt = SSA_NAME_DEF_STMT (arg3);
16640 259 : if (is_gimple_call (def_stmt))
16641 : {
16642 76 : tree fndecl = gimple_call_fndecl (def_stmt);
16643 76 : if (fndecl
16644 76 : && fndecl_built_in_p (fndecl, BUILT_IN_MD))
16645 67 : switch (DECL_MD_FUNCTION_CODE (fndecl))
16646 : {
16647 24 : case IX86_BUILTIN_CMPPD:
16648 24 : case IX86_BUILTIN_CMPPS:
16649 24 : case IX86_BUILTIN_CMPPD256:
16650 24 : case IX86_BUILTIN_CMPPS256:
16651 24 : if (!integer_zerop (gimple_call_arg (def_stmt, 2)))
16652 : break;
16653 : /* FALLTHRU */
16654 49 : case IX86_BUILTIN_CMPEQPD:
16655 49 : case IX86_BUILTIN_CMPEQPS:
16656 49 : if (initializer_zerop (gimple_call_arg (def_stmt, 0))
16657 49 : && initializer_zerop (gimple_call_arg (def_stmt,
16658 : 1)))
16659 49 : op0 = pc_rtx;
16660 : break;
16661 : default:
16662 : break;
16663 : }
16664 : }
16665 : }
16666 : }
16667 :
16668 1004 : pat = GEN_FCN (icode) (subtarget, op0, op1, op2, op3, op4);
16669 1004 : if (! pat)
16670 0 : return const0_rtx;
16671 1004 : emit_insn (pat);
16672 :
16673 1004 : switch (fcode)
16674 : {
16675 24 : case IX86_BUILTIN_GATHER3DIV16SF:
16676 24 : if (target == NULL_RTX)
16677 0 : target = gen_reg_rtx (V8SFmode);
16678 24 : emit_insn (gen_vec_extract_lo_v16sf (target, subtarget));
16679 24 : break;
16680 24 : case IX86_BUILTIN_GATHER3DIV16SI:
16681 24 : if (target == NULL_RTX)
16682 0 : target = gen_reg_rtx (V8SImode);
16683 24 : emit_insn (gen_vec_extract_lo_v16si (target, subtarget));
16684 24 : break;
16685 28 : case IX86_BUILTIN_GATHER3DIV8SF:
16686 28 : case IX86_BUILTIN_GATHERDIV8SF:
16687 28 : if (target == NULL_RTX)
16688 0 : target = gen_reg_rtx (V4SFmode);
16689 28 : emit_insn (gen_vec_extract_lo_v8sf (target, subtarget));
16690 28 : break;
16691 28 : case IX86_BUILTIN_GATHER3DIV8SI:
16692 28 : case IX86_BUILTIN_GATHERDIV8SI:
16693 28 : if (target == NULL_RTX)
16694 0 : target = gen_reg_rtx (V4SImode);
16695 28 : emit_insn (gen_vec_extract_lo_v8si (target, subtarget));
16696 28 : break;
16697 : default:
16698 : target = subtarget;
16699 : break;
16700 : }
16701 : return target;
16702 :
16703 623 : scatter_gen:
16704 623 : arg0 = CALL_EXPR_ARG (exp, 0);
16705 623 : arg1 = CALL_EXPR_ARG (exp, 1);
16706 623 : arg2 = CALL_EXPR_ARG (exp, 2);
16707 623 : arg3 = CALL_EXPR_ARG (exp, 3);
16708 623 : arg4 = CALL_EXPR_ARG (exp, 4);
16709 623 : op0 = expand_normal (arg0);
16710 623 : op1 = ix86_expand_unsigned_small_int_cst_argument (arg1);
16711 623 : op2 = expand_normal (arg2);
16712 623 : op3 = expand_normal (arg3);
16713 623 : op4 = expand_normal (arg4);
16714 623 : mode1 = insn_data[icode].operand[1].mode;
16715 623 : mode2 = insn_data[icode].operand[2].mode;
16716 623 : mode3 = insn_data[icode].operand[3].mode;
16717 623 : mode4 = insn_data[icode].operand[4].mode;
16718 :
16719 : /* Scatter instruction stores operand op3 to memory with
16720 : indices from op2 and scale from op4 under writemask op1.
16721 : If index operand op2 has more elements then source operand
16722 : op3 one need to use only its low half. And vice versa. */
16723 623 : switch (fcode)
16724 : {
16725 24 : case IX86_BUILTIN_SCATTERALTSIV8DF:
16726 24 : case IX86_BUILTIN_SCATTERALTSIV8DI:
16727 24 : half = gen_reg_rtx (V8SImode);
16728 24 : if (!nonimmediate_operand (op2, V16SImode))
16729 0 : op2 = copy_to_mode_reg (V16SImode, op2);
16730 24 : emit_insn (gen_vec_extract_lo_v16si (half, op2));
16731 24 : op2 = half;
16732 24 : break;
16733 36 : case IX86_BUILTIN_SCATTERALTDIV16SF:
16734 36 : case IX86_BUILTIN_SCATTERALTDIV16SI:
16735 36 : half = gen_reg_rtx (mode3);
16736 36 : if (mode3 == V8SFmode)
16737 : gen = gen_vec_extract_lo_v16sf;
16738 : else
16739 24 : gen = gen_vec_extract_lo_v16si;
16740 36 : if (!nonimmediate_operand (op3, GET_MODE (op3)))
16741 0 : op3 = copy_to_mode_reg (GET_MODE (op3), op3);
16742 36 : emit_insn (gen (half, op3));
16743 36 : op3 = half;
16744 36 : break;
16745 8 : case IX86_BUILTIN_SCATTERALTSIV4DF:
16746 8 : case IX86_BUILTIN_SCATTERALTSIV4DI:
16747 8 : half = gen_reg_rtx (V4SImode);
16748 8 : if (!nonimmediate_operand (op2, V8SImode))
16749 0 : op2 = copy_to_mode_reg (V8SImode, op2);
16750 8 : emit_insn (gen_vec_extract_lo_v8si (half, op2));
16751 8 : op2 = half;
16752 8 : break;
16753 8 : case IX86_BUILTIN_SCATTERALTDIV8SF:
16754 8 : case IX86_BUILTIN_SCATTERALTDIV8SI:
16755 8 : half = gen_reg_rtx (mode3);
16756 8 : if (mode3 == V4SFmode)
16757 : gen = gen_vec_extract_lo_v8sf;
16758 : else
16759 4 : gen = gen_vec_extract_lo_v8si;
16760 8 : if (!nonimmediate_operand (op3, GET_MODE (op3)))
16761 0 : op3 = copy_to_mode_reg (GET_MODE (op3), op3);
16762 8 : emit_insn (gen (half, op3));
16763 8 : op3 = half;
16764 8 : break;
16765 16 : case IX86_BUILTIN_SCATTERALTSIV2DF:
16766 16 : case IX86_BUILTIN_SCATTERALTSIV2DI:
16767 16 : if (!nonimmediate_operand (op2, V4SImode))
16768 0 : op2 = copy_to_mode_reg (V4SImode, op2);
16769 : break;
16770 16 : case IX86_BUILTIN_SCATTERALTDIV4SF:
16771 16 : case IX86_BUILTIN_SCATTERALTDIV4SI:
16772 16 : if (!nonimmediate_operand (op3, GET_MODE (op3)))
16773 0 : op3 = copy_to_mode_reg (GET_MODE (op3), op3);
16774 : break;
16775 : default:
16776 : break;
16777 : }
16778 :
16779 : /* Force memory operand only with base register here. But we
16780 : don't want to do it on memory operand for other builtin
16781 : functions. */
16782 633 : op0 = force_reg (Pmode, convert_to_mode (Pmode, op0, 1));
16783 :
16784 628 : if (!insn_data[icode].operand[0].predicate (op0, Pmode))
16785 0 : op0 = copy_to_mode_reg (Pmode, op0);
16786 :
16787 623 : op1 = fixup_modeless_constant (op1, mode1);
16788 :
16789 623 : if (GET_MODE (op1) == mode1 || GET_MODE (op1) == VOIDmode)
16790 : {
16791 607 : if (!insn_data[icode].operand[1].predicate (op1, mode1))
16792 273 : op1 = copy_to_mode_reg (mode1, op1);
16793 : }
16794 : else
16795 : {
16796 16 : op1 = copy_to_reg (op1);
16797 16 : op1 = lowpart_subreg (mode1, op1, GET_MODE (op1));
16798 : }
16799 :
16800 623 : if (!insn_data[icode].operand[2].predicate (op2, mode2))
16801 57 : op2 = copy_to_mode_reg (mode2, op2);
16802 :
16803 623 : if (!insn_data[icode].operand[3].predicate (op3, mode3))
16804 82 : op3 = copy_to_mode_reg (mode3, op3);
16805 :
16806 623 : if (!insn_data[icode].operand[4].predicate (op4, mode4))
16807 : {
16808 0 : error ("the last argument must be scale 1, 2, 4, 8");
16809 0 : return const0_rtx;
16810 : }
16811 :
16812 623 : pat = GEN_FCN (icode) (op0, op1, op2, op3, op4);
16813 623 : if (! pat)
16814 0 : return const0_rtx;
16815 :
16816 623 : emit_insn (pat);
16817 623 : return 0;
16818 :
16819 23 : case IX86_BUILTIN_XABORT:
16820 23 : icode = CODE_FOR_xabort;
16821 23 : arg0 = CALL_EXPR_ARG (exp, 0);
16822 23 : op0 = expand_normal (arg0);
16823 23 : mode0 = insn_data[icode].operand[0].mode;
16824 23 : if (!insn_data[icode].operand[0].predicate (op0, mode0))
16825 : {
16826 0 : error ("the argument to %<xabort%> intrinsic must "
16827 : "be an 8-bit immediate");
16828 0 : return const0_rtx;
16829 : }
16830 23 : emit_insn (gen_xabort (op0));
16831 23 : return 0;
16832 :
16833 55 : case IX86_BUILTIN_RDSSPD:
16834 55 : case IX86_BUILTIN_RDSSPQ:
16835 55 : mode = (fcode == IX86_BUILTIN_RDSSPD ? SImode : DImode);
16836 :
16837 55 : if (target == 0
16838 55 : || !register_operand (target, mode))
16839 0 : target = gen_reg_rtx (mode);
16840 :
16841 55 : op0 = force_reg (mode, const0_rtx);
16842 :
16843 55 : emit_insn (gen_rdssp (mode, target, op0));
16844 55 : return target;
16845 :
16846 55 : case IX86_BUILTIN_INCSSPD:
16847 55 : case IX86_BUILTIN_INCSSPQ:
16848 55 : mode = (fcode == IX86_BUILTIN_INCSSPD ? SImode : DImode);
16849 :
16850 55 : arg0 = CALL_EXPR_ARG (exp, 0);
16851 55 : op0 = expand_normal (arg0);
16852 :
16853 55 : op0 = force_reg (mode, op0);
16854 :
16855 55 : emit_insn (gen_incssp (mode, op0));
16856 55 : return 0;
16857 :
16858 20 : case IX86_BUILTIN_HRESET:
16859 20 : icode = CODE_FOR_hreset;
16860 20 : arg0 = CALL_EXPR_ARG (exp, 0);
16861 20 : op0 = expand_normal (arg0);
16862 20 : op0 = force_reg (SImode, op0);
16863 20 : emit_insn (gen_hreset (op0));
16864 20 : return 0;
16865 :
16866 38 : case IX86_BUILTIN_RSTORSSP:
16867 38 : case IX86_BUILTIN_CLRSSBSY:
16868 38 : arg0 = CALL_EXPR_ARG (exp, 0);
16869 38 : op0 = expand_normal (arg0);
16870 19 : icode = (fcode == IX86_BUILTIN_RSTORSSP
16871 38 : ? CODE_FOR_rstorssp
16872 : : CODE_FOR_clrssbsy);
16873 :
16874 38 : if (!address_operand (op0, VOIDmode))
16875 : {
16876 18 : op0 = convert_memory_address (Pmode, op0);
16877 18 : op0 = copy_addr_to_reg (op0);
16878 : }
16879 38 : emit_insn (GEN_FCN (icode) (gen_rtx_MEM (DImode, op0)));
16880 38 : return 0;
16881 :
16882 76 : case IX86_BUILTIN_WRSSD:
16883 76 : case IX86_BUILTIN_WRSSQ:
16884 76 : case IX86_BUILTIN_WRUSSD:
16885 76 : case IX86_BUILTIN_WRUSSQ:
16886 76 : mode = ((fcode == IX86_BUILTIN_WRSSD
16887 76 : || fcode == IX86_BUILTIN_WRUSSD)
16888 76 : ? SImode : DImode);
16889 :
16890 76 : arg0 = CALL_EXPR_ARG (exp, 0);
16891 76 : op0 = expand_normal (arg0);
16892 76 : arg1 = CALL_EXPR_ARG (exp, 1);
16893 76 : op1 = expand_normal (arg1);
16894 :
16895 76 : op0 = force_reg (mode, op0);
16896 :
16897 76 : if (!address_operand (op1, VOIDmode))
16898 : {
16899 36 : op1 = convert_memory_address (Pmode, op1);
16900 36 : op1 = copy_addr_to_reg (op1);
16901 : }
16902 76 : op1 = gen_rtx_MEM (mode, op1);
16903 :
16904 76 : icode = ((fcode == IX86_BUILTIN_WRSSD
16905 76 : || fcode == IX86_BUILTIN_WRSSQ)
16906 76 : ? code_for_wrss (mode)
16907 38 : : code_for_wruss (mode));
16908 76 : emit_insn (GEN_FCN (icode) (op0, op1));
16909 :
16910 76 : return 0;
16911 :
16912 114949 : default:
16913 114949 : break;
16914 : }
16915 :
16916 114949 : if (fcode >= IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST
16917 114949 : && fcode <= IX86_BUILTIN__BDESC_SPECIAL_ARGS_LAST)
16918 : {
16919 27043 : i = fcode - IX86_BUILTIN__BDESC_SPECIAL_ARGS_FIRST;
16920 27043 : return ix86_expand_special_args_builtin (bdesc_special_args + i, exp,
16921 27043 : target);
16922 : }
16923 :
16924 87906 : if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
16925 87906 : && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
16926 : {
16927 93 : i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
16928 93 : return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
16929 93 : target);
16930 : }
16931 :
16932 87813 : if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
16933 87813 : && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
16934 : {
16935 69465 : i = fcode - IX86_BUILTIN__BDESC_ARGS_FIRST;
16936 :
16937 69465 : switch (fcode)
16938 : {
16939 0 : case IX86_BUILTIN_RDPID:
16940 0 : return ix86_expand_special_args_builtin (bdesc_args + i, exp,
16941 0 : target);
16942 72 : case IX86_BUILTIN_VCOMISBF16EQ:
16943 72 : case IX86_BUILTIN_VCOMISBF16NE:
16944 72 : case IX86_BUILTIN_VCOMISBF16GT:
16945 72 : case IX86_BUILTIN_VCOMISBF16GE:
16946 72 : case IX86_BUILTIN_VCOMISBF16LT:
16947 72 : case IX86_BUILTIN_VCOMISBF16LE:
16948 72 : return ix86_expand_sse_comi (bdesc_args + i, exp, target, false);
16949 15 : case IX86_BUILTIN_FABSQ:
16950 15 : case IX86_BUILTIN_COPYSIGNQ:
16951 15 : if (!TARGET_SSE)
16952 : /* Emit a normal call if SSE isn't available. */
16953 0 : return expand_call (exp, target, ignore);
16954 : /* FALLTHRU */
16955 69393 : default:
16956 69393 : return ix86_expand_args_builtin (bdesc_args + i, exp, target);
16957 : }
16958 : }
16959 :
16960 18348 : if (fcode >= IX86_BUILTIN__BDESC_COMI_FIRST
16961 18348 : && fcode <= IX86_BUILTIN__BDESC_COMI_LAST)
16962 : {
16963 473 : i = fcode - IX86_BUILTIN__BDESC_COMI_FIRST;
16964 473 : return ix86_expand_sse_comi (bdesc_comi + i, exp, target, true);
16965 : }
16966 :
16967 17875 : if (fcode >= IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST
16968 17875 : && fcode <= IX86_BUILTIN__BDESC_ROUND_ARGS_LAST)
16969 : {
16970 15554 : i = fcode - IX86_BUILTIN__BDESC_ROUND_ARGS_FIRST;
16971 15554 : return ix86_expand_round_builtin (bdesc_round_args + i, exp, target);
16972 : }
16973 :
16974 2321 : if (fcode >= IX86_BUILTIN__BDESC_PCMPESTR_FIRST
16975 2321 : && fcode <= IX86_BUILTIN__BDESC_PCMPESTR_LAST)
16976 : {
16977 216 : i = fcode - IX86_BUILTIN__BDESC_PCMPESTR_FIRST;
16978 216 : return ix86_expand_sse_pcmpestr (bdesc_pcmpestr + i, exp, target);
16979 : }
16980 :
16981 2105 : if (fcode >= IX86_BUILTIN__BDESC_PCMPISTR_FIRST
16982 2105 : && fcode <= IX86_BUILTIN__BDESC_PCMPISTR_LAST)
16983 : {
16984 275 : i = fcode - IX86_BUILTIN__BDESC_PCMPISTR_FIRST;
16985 275 : return ix86_expand_sse_pcmpistr (bdesc_pcmpistr + i, exp, target);
16986 : }
16987 :
16988 1830 : if (fcode >= IX86_BUILTIN__BDESC_MULTI_ARG_FIRST
16989 1830 : && fcode <= IX86_BUILTIN__BDESC_MULTI_ARG_LAST)
16990 : {
16991 1792 : i = fcode - IX86_BUILTIN__BDESC_MULTI_ARG_FIRST;
16992 1792 : const struct builtin_description *d = bdesc_multi_arg + i;
16993 1792 : return ix86_expand_multi_arg_builtin (d->icode, exp, target,
16994 : (enum ix86_builtin_func_type)
16995 1792 : d->flag, d->comparison);
16996 : }
16997 :
16998 38 : if (fcode >= IX86_BUILTIN__BDESC_CET_FIRST
16999 38 : && fcode <= IX86_BUILTIN__BDESC_CET_LAST)
17000 : {
17001 38 : i = fcode - IX86_BUILTIN__BDESC_CET_FIRST;
17002 38 : return ix86_expand_special_args_builtin (bdesc_cet + i, exp,
17003 38 : target);
17004 : }
17005 :
17006 0 : gcc_unreachable ();
17007 : }
17008 :
/* Strategies for materializing a broadcast SImode constant in a vector
   register without loading it from memory.  Each starts from a cheap
   all-zeros or all-ones register and derives the constant from it; see
   ix86_vector_duplicate_simode_const below for how each is expanded.  */
enum ix86_vec_bcast_alg
{
  VEC_BCAST_PXOR,	/* 0x00000000: move the zero vector.  */
  VEC_BCAST_PCMPEQ,	/* 0xffffffff: move the all-ones vector.  */
  VEC_BCAST_PABSB,	/* 0x01010101: byte-wise abs of all-ones.  */
  VEC_BCAST_PADDB,	/* 0xfefefefe: byte-wise all-ones + all-ones.  */
  VEC_BCAST_PSRLW,	/* Word-wise logical right shift of all-ones.  */
  VEC_BCAST_PSRLD,	/* Dword-wise logical right shift of all-ones.  */
  VEC_BCAST_PSLLW,	/* Word-wise left shift of all-ones.  */
  VEC_BCAST_PSLLD	/* Dword-wise left shift of all-ones.  */
};
17021 :
/* One entry of the SImode broadcast-constant lookup table.  */
struct ix86_vec_bcast_map_simode_t
{
  unsigned int key;		/* The SImode constant to materialize.  */
  enum ix86_vec_bcast_alg alg;	/* Strategy used to synthesize it.  */
  unsigned int arg;		/* Shift count for the PSRL*/PSLL* strategies;
				   0 (unused) for the others.  */
};
17028 :
/* Map from an SImode constant to the cheapest instruction sequence that
   broadcasts it.  This table must be kept sorted by ascending unsigned
   KEY, as values are looked-up using bsearch.  */
static const ix86_vec_bcast_map_simode_t ix86_vec_bcast_map_simode[] = {
  { 0x00000000, VEC_BCAST_PXOR,  0 },
  { 0x00000001, VEC_BCAST_PSRLD, 31 },
  { 0x00000003, VEC_BCAST_PSRLD, 30 },
  { 0x00000007, VEC_BCAST_PSRLD, 29 },
  { 0x0000000f, VEC_BCAST_PSRLD, 28 },
  { 0x0000001f, VEC_BCAST_PSRLD, 27 },
  { 0x0000003f, VEC_BCAST_PSRLD, 26 },
  { 0x0000007f, VEC_BCAST_PSRLD, 25 },
  { 0x000000ff, VEC_BCAST_PSRLD, 24 },
  { 0x000001ff, VEC_BCAST_PSRLD, 23 },
  { 0x000003ff, VEC_BCAST_PSRLD, 22 },
  { 0x000007ff, VEC_BCAST_PSRLD, 21 },
  { 0x00000fff, VEC_BCAST_PSRLD, 20 },
  { 0x00001fff, VEC_BCAST_PSRLD, 19 },
  { 0x00003fff, VEC_BCAST_PSRLD, 18 },
  { 0x00007fff, VEC_BCAST_PSRLD, 17 },
  { 0x0000ffff, VEC_BCAST_PSRLD, 16 },
  { 0x00010001, VEC_BCAST_PSRLW, 15 },
  { 0x0001ffff, VEC_BCAST_PSRLD, 15 },
  { 0x00030003, VEC_BCAST_PSRLW, 14 },
  { 0x0003ffff, VEC_BCAST_PSRLD, 14 },
  { 0x00070007, VEC_BCAST_PSRLW, 13 },
  { 0x0007ffff, VEC_BCAST_PSRLD, 13 },
  { 0x000f000f, VEC_BCAST_PSRLW, 12 },
  { 0x000fffff, VEC_BCAST_PSRLD, 12 },
  { 0x001f001f, VEC_BCAST_PSRLW, 11 },
  { 0x001fffff, VEC_BCAST_PSRLD, 11 },
  { 0x003f003f, VEC_BCAST_PSRLW, 10 },
  { 0x003fffff, VEC_BCAST_PSRLD, 10 },
  { 0x007f007f, VEC_BCAST_PSRLW,  9 },
  { 0x007fffff, VEC_BCAST_PSRLD,  9 },
  { 0x00ff00ff, VEC_BCAST_PSRLW,  8 },
  { 0x00ffffff, VEC_BCAST_PSRLD,  8 },
  { 0x01010101, VEC_BCAST_PABSB,  0 },
  { 0x01ff01ff, VEC_BCAST_PSRLW,  7 },
  { 0x01ffffff, VEC_BCAST_PSRLD,  7 },
  { 0x03ff03ff, VEC_BCAST_PSRLW,  6 },
  { 0x03ffffff, VEC_BCAST_PSRLD,  6 },
  { 0x07ff07ff, VEC_BCAST_PSRLW,  5 },
  { 0x07ffffff, VEC_BCAST_PSRLD,  5 },
  { 0x0fff0fff, VEC_BCAST_PSRLW,  4 },
  { 0x0fffffff, VEC_BCAST_PSRLD,  4 },
  { 0x1fff1fff, VEC_BCAST_PSRLW,  3 },
  { 0x1fffffff, VEC_BCAST_PSRLD,  3 },
  { 0x3fff3fff, VEC_BCAST_PSRLW,  2 },
  { 0x3fffffff, VEC_BCAST_PSRLD,  2 },
  { 0x7fff7fff, VEC_BCAST_PSRLW,  1 },
  { 0x7fffffff, VEC_BCAST_PSRLD,  1 },
  { 0x80000000, VEC_BCAST_PSLLD, 31 },
  { 0x80008000, VEC_BCAST_PSLLW, 15 },
  { 0xc0000000, VEC_BCAST_PSLLD, 30 },
  { 0xc000c000, VEC_BCAST_PSLLW, 14 },
  { 0xe0000000, VEC_BCAST_PSLLD, 29 },
  { 0xe000e000, VEC_BCAST_PSLLW, 13 },
  { 0xf0000000, VEC_BCAST_PSLLD, 28 },
  { 0xf000f000, VEC_BCAST_PSLLW, 12 },
  { 0xf8000000, VEC_BCAST_PSLLD, 27 },
  { 0xf800f800, VEC_BCAST_PSLLW, 11 },
  { 0xfc000000, VEC_BCAST_PSLLD, 26 },
  { 0xfc00fc00, VEC_BCAST_PSLLW, 10 },
  { 0xfe000000, VEC_BCAST_PSLLD, 25 },
  { 0xfe00fe00, VEC_BCAST_PSLLW,  9 },
  { 0xfefefefe, VEC_BCAST_PADDB,  0 },
  { 0xff000000, VEC_BCAST_PSLLD, 24 },
  { 0xff00ff00, VEC_BCAST_PSLLW,  8 },
  { 0xff800000, VEC_BCAST_PSLLD, 23 },
  { 0xff80ff80, VEC_BCAST_PSLLW,  7 },
  { 0xffc00000, VEC_BCAST_PSLLD, 22 },
  { 0xffc0ffc0, VEC_BCAST_PSLLW,  6 },
  { 0xffe00000, VEC_BCAST_PSLLD, 21 },
  { 0xffe0ffe0, VEC_BCAST_PSLLW,  5 },
  { 0xfff00000, VEC_BCAST_PSLLD, 20 },
  { 0xfff0fff0, VEC_BCAST_PSLLW,  4 },
  { 0xfff80000, VEC_BCAST_PSLLD, 19 },
  { 0xfff8fff8, VEC_BCAST_PSLLW,  3 },
  { 0xfffc0000, VEC_BCAST_PSLLD, 18 },
  { 0xfffcfffc, VEC_BCAST_PSLLW,  2 },
  { 0xfffe0000, VEC_BCAST_PSLLD, 17 },
  { 0xfffefffe, VEC_BCAST_PSLLW,  1 },
  { 0xffff0000, VEC_BCAST_PSLLD, 16 },
  { 0xffff8000, VEC_BCAST_PSLLD, 15 },
  { 0xffffc000, VEC_BCAST_PSLLD, 14 },
  { 0xffffe000, VEC_BCAST_PSLLD, 13 },
  { 0xfffff000, VEC_BCAST_PSLLD, 12 },
  { 0xfffff800, VEC_BCAST_PSLLD, 11 },
  { 0xfffffc00, VEC_BCAST_PSLLD, 10 },
  { 0xfffffe00, VEC_BCAST_PSLLD,  9 },
  { 0xffffff00, VEC_BCAST_PSLLD,  8 },
  { 0xffffff80, VEC_BCAST_PSLLD,  7 },
  { 0xffffffc0, VEC_BCAST_PSLLD,  6 },
  { 0xffffffe0, VEC_BCAST_PSLLD,  5 },
  { 0xfffffff0, VEC_BCAST_PSLLD,  4 },
  { 0xfffffff8, VEC_BCAST_PSLLD,  3 },
  { 0xfffffffc, VEC_BCAST_PSLLD,  2 },
  { 0xfffffffe, VEC_BCAST_PSLLD,  1 },
  { 0xffffffff, VEC_BCAST_PCMPEQ, 0 }
};
17128 :
17129 : /* Comparator for bsearch on ix86_vec_bcast_map. */
17130 : static int
17131 288296 : ix86_vec_bcast_map_simode_cmp (const void *key, const void *entry)
17132 : {
17133 288296 : return (*(const unsigned int*)key)
17134 288296 : - ((const ix86_vec_bcast_map_simode_t*)entry)->key;
17135 : }
17136 :
/* A subroutine of ix86_vector_duplicate_value.  Tries to efficiently
   materialize V4SImode, V8SImode and V16SImode vectors from SImode
   integer constants, deriving the broadcast value from a cheap
   all-ones register (via shift/abs/add) instead of loading it from
   memory.  MODE is the vector mode, TARGET the destination register,
   VAL the SImode element value.  Returns true and emits the sequence
   into TARGET on success; returns false when VAL has no table entry
   or the required ISA extension for MODE isn't enabled, in which
   case the caller must materialize the constant some other way.  */
static bool
ix86_vector_duplicate_simode_const (machine_mode mode, rtx target,
				    unsigned int val)
{
  const ix86_vec_bcast_map_simode_t *entry;
  rtx tmp1, tmp2;

  /* Look VAL up in the sorted strategy table.  */
  entry = (const ix86_vec_bcast_map_simode_t*)
	  bsearch(&val, ix86_vec_bcast_map_simode,
		  ARRAY_SIZE (ix86_vec_bcast_map_simode),
		  sizeof (ix86_vec_bcast_map_simode_t),
		  ix86_vec_bcast_map_simode_cmp);
  if (!entry)
    return false;

  switch (entry->alg)
    {
    case VEC_BCAST_PXOR:
      /* VAL == 0: emit the zero vector directly.  */
      if ((mode == V8SImode && !TARGET_AVX2)
	  || (mode == V16SImode && !TARGET_AVX512F))
	return false;
      emit_move_insn (target, CONST0_RTX (mode));
      return true;

    case VEC_BCAST_PCMPEQ:
      /* VAL == 0xffffffff: emit the all-ones vector directly.  */
      if ((mode == V4SImode && !TARGET_SSE2)
	  || (mode == V8SImode && !TARGET_AVX2)
	  || (mode == V16SImode && !TARGET_AVX512F))
	return false;
      emit_move_insn (target, CONSTM1_RTX (mode));
      return true;

    case VEC_BCAST_PABSB:
      /* VAL == 0x01010101: byte-wise abs of all-ones (each byte -1)
	 yields a 1 in every byte; the QImode result is retargeted to
	 MODE via the gen_lowpart at the bottom.  */
      if (mode == V4SImode && TARGET_SSE2)
	{
	  tmp1 = gen_reg_rtx (V16QImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V16QImode));
	  tmp2 = gen_reg_rtx (V16QImode);
	  emit_insn (gen_absv16qi2 (tmp2, tmp1));
	}
      else if (mode == V8SImode && TARGET_AVX2)
	{
	  tmp1 = gen_reg_rtx (V32QImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V32QImode));
	  tmp2 = gen_reg_rtx (V32QImode);
	  emit_insn (gen_absv32qi2 (tmp2, tmp1));
	}
      else if (mode == V16SImode && TARGET_AVX512BW)
	{
	  /* Note: 512-bit QImode ops need AVX512BW, not just AVX512F.  */
	  tmp1 = gen_reg_rtx (V64QImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V64QImode));
	  tmp2 = gen_reg_rtx (V64QImode);
	  emit_insn (gen_absv64qi2 (tmp2, tmp1));
	}
      else
	return false;
      break;

    case VEC_BCAST_PADDB:
      /* VAL == 0xfefefefe: byte-wise -1 + -1 gives 0xfe per byte.  */
      if (mode == V4SImode && TARGET_SSE2)
	{
	  tmp1 = gen_reg_rtx (V16QImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V16QImode));
	  tmp2 = gen_reg_rtx (V16QImode);
	  emit_insn (gen_addv16qi3 (tmp2, tmp1, tmp1));
	}
      else if (mode == V8SImode && TARGET_AVX2)
	{
	  tmp1 = gen_reg_rtx (V32QImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V32QImode));
	  tmp2 = gen_reg_rtx (V32QImode);
	  emit_insn (gen_addv32qi3 (tmp2, tmp1, tmp1));
	}
      else if (mode == V16SImode && TARGET_AVX512BW)
	{
	  tmp1 = gen_reg_rtx (V64QImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V64QImode));
	  tmp2 = gen_reg_rtx (V64QImode);
	  emit_insn (gen_addv64qi3 (tmp2, tmp1, tmp1));
	}
      else
	return false;
      break;

    case VEC_BCAST_PSRLW:
      /* Logical right shift of all-ones HImode elements by entry->arg
	 produces a low-bits mask replicated in each 16-bit half.  */
      if (mode == V4SImode && TARGET_SSE2)
	{
	  tmp1 = gen_reg_rtx (V8HImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V8HImode));
	  tmp2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_lshrv8hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
	}
      else if (mode == V8SImode && TARGET_AVX2)
	{
	  tmp1 = gen_reg_rtx (V16HImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V16HImode));
	  tmp2 = gen_reg_rtx (V16HImode);
	  emit_insn (gen_lshrv16hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
	}
      else if (mode == V16SImode && TARGET_AVX512BW)
	{
	  tmp1 = gen_reg_rtx (V32HImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V32HImode));
	  tmp2 = gen_reg_rtx (V32HImode);
	  emit_insn (gen_lshrv32hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
	}
      else
	return false;
      break;

    case VEC_BCAST_PSRLD:
      /* Same idea with SImode elements; the shift already lands in
	 MODE, so write TARGET directly and return without the final
	 lowpart move.  */
      if (mode == V4SImode && TARGET_SSE2)
	{
	  tmp1 = gen_reg_rtx (V4SImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V4SImode));
	  emit_insn (gen_lshrv4si3 (target, tmp1, GEN_INT (entry->arg)));
	  return true;
	}
      else if (mode == V8SImode && TARGET_AVX2)
	{
	  tmp1 = gen_reg_rtx (V8SImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V8SImode));
	  emit_insn (gen_lshrv8si3 (target, tmp1, GEN_INT (entry->arg)));
	  return true;
	}
      else if (mode == V16SImode && TARGET_AVX512F)
	{
	  tmp1 = gen_reg_rtx (V16SImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V16SImode));
	  emit_insn (gen_lshrv16si3 (target, tmp1, GEN_INT (entry->arg)));
	  return true;
	}
      else
	return false;
      break;

    case VEC_BCAST_PSLLW:
      /* Left shift of all-ones HImode elements: high-bits mask per
	 16-bit half.  */
      if (mode == V4SImode && TARGET_SSE2)
	{
	  tmp1 = gen_reg_rtx (V8HImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V8HImode));
	  tmp2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_ashlv8hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
	}
      else if (mode == V8SImode && TARGET_AVX2)
	{
	  tmp1 = gen_reg_rtx (V16HImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V16HImode));
	  tmp2 = gen_reg_rtx (V16HImode);
	  emit_insn (gen_ashlv16hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
	}
      else if (mode == V16SImode && TARGET_AVX512BW)
	{
	  tmp1 = gen_reg_rtx (V32HImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V32HImode));
	  tmp2 = gen_reg_rtx (V32HImode);
	  emit_insn (gen_ashlv32hi3 (tmp2, tmp1, GEN_INT (entry->arg)));
	}
      else
	return false;
      break;

    case VEC_BCAST_PSLLD:
      /* Left shift of all-ones SImode elements, written to TARGET
	 directly; every branch returns, so control never reaches
	 the lowpart move below.  */
      if (mode == V4SImode && TARGET_SSE2)
	{
	  tmp1 = gen_reg_rtx (V4SImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V4SImode));
	  emit_insn (gen_ashlv4si3 (target, tmp1, GEN_INT (entry->arg)));
	  return true;
	}
      else if (mode == V8SImode && TARGET_AVX2)
	{
	  tmp1 = gen_reg_rtx (V8SImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V8SImode));
	  emit_insn (gen_ashlv8si3 (target, tmp1, GEN_INT (entry->arg)));
	  return true;
	}
      else if (mode == V16SImode && TARGET_AVX512F)
	{
	  tmp1 = gen_reg_rtx (V16SImode);
	  emit_move_insn (tmp1, CONSTM1_RTX (V16SImode));
	  emit_insn (gen_ashlv16si3 (target, tmp1, GEN_INT (entry->arg)));
	  return true;
	}
      else
	return false;

    default:
      return false;
    }

  /* The QImode/HImode cases above fall through to here: retarget the
     computed TMP2 to the requested vector MODE.  */
  emit_move_insn (target, gen_lowpart (mode, tmp2));
  return true;
}
17334 :
17335 : /* A subroutine of ix86_expand_vector_init_duplicate. Tries to
17336 : fill target with val via vec_duplicate. */
17337 :
static bool
ix86_vector_duplicate_value (machine_mode mode, rtx target, rtx val)
{
  bool ok;
  rtx_insn *insn;
  rtx dup;

  /* SImode-element integer broadcasts may have a cheaper expansion
     (presumably via shifted all-ones vectors, as in the VEC_BCAST
     cases above); try that helper first.  */
  if ((mode == V4SImode || mode == V8SImode || mode == V16SImode)
      && CONST_INT_P (val)
      && ix86_vector_duplicate_simode_const (mode, target, INTVAL (val)))
    return true;

  /* Save/restore recog_data in case this is called from splitters
     or other routines where recog_data needs to stay valid across
     force_reg.  See PR106577.  */
  recog_data_d recog_data_save = recog_data;

  /* First attempt to recognize VAL as-is.  */
  dup = gen_vec_duplicate (mode, val);
  insn = emit_insn (gen_rtx_SET (target, dup));
  if (recog_memoized (insn) < 0)
    {
      rtx_insn *seq;
      machine_mode innermode = GET_MODE_INNER (mode);
      rtx reg;

      /* If that fails, force VAL into a register or mem.  */

      start_sequence ();

      /* For wide vectors, a constant element may be better broadcast
	 from a memory-resident constant than from an integer register.  */
      if (!TARGET_PREFER_BCST_FROM_INTEGER && CONST_INT_P (val)
	  && GET_MODE_BITSIZE (innermode) <= HOST_BITS_PER_WIDE_INT
	  && GET_MODE_BITSIZE(mode) >= 128)
	reg = validize_mem (force_const_mem (innermode, val));
      else
	{
	  reg = force_reg (innermode, val);
	  /* force_reg can return a wider register; narrow it back to
	     the element mode the duplicate pattern expects.  */
	  if (GET_MODE (reg) != innermode)
	    reg = gen_lowpart (innermode, reg);
	}

      /* Patch the already-emitted insn in place rather than emitting a
	 fresh one, so INSN's position in the stream is preserved; the
	 setup sequence is inserted just before it.  */
      SET_SRC (PATTERN (insn)) = gen_vec_duplicate (mode, reg);
      seq = end_sequence ();
      if (seq)
	emit_insn_before (seq, insn);

      /* After forcing the operand into a register, the duplicate
	 pattern must be recognizable.  */
      ok = recog_memoized (insn) >= 0;
      gcc_assert (ok);
    }
  recog_data = recog_data_save;
  return true;
}
17390 :
17391 : /* Get a vector mode of the same size as the original but with elements
17392 : twice as wide. This is only guaranteed to apply to integral vectors. */
17393 :
17394 : static machine_mode
17395 18217 : get_mode_wider_vector (machine_mode o)
17396 : {
17397 : /* ??? Rely on the ordering that genmodes.cc gives to vectors. */
17398 18217 : machine_mode n = GET_MODE_NEXT_MODE (o).require ();
17399 54651 : gcc_assert (GET_MODE_NUNITS (o) == GET_MODE_NUNITS (n) * 2);
17400 54651 : gcc_assert (GET_MODE_SIZE (o) == GET_MODE_SIZE (n));
17401 18217 : return n;
17402 : }
17403 :
17404 : static bool expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d);
17405 : static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
17406 :
17407 : /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
17408 : with all elements equal to VAR. Return true if successful. */
17409 :
bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
				   rtx target, rtx val)
{
  bool ok;

  switch (mode)
    {
    case E_V2DImode:
      /* If the 64-bit constant's high half replicates its low half,
	 broadcast the low 32 bits as V4SImode instead and reinterpret
	 the result -- a 32-bit broadcast is available more widely.  */
      if (CONST_INT_P (val))
	{
	  int tmp = (int)INTVAL (val);
	  if (tmp == (int)(INTVAL (val) >> 32))
	    {
	      rtx reg = gen_reg_rtx (V4SImode);
	      ok = ix86_vector_duplicate_value (V4SImode, reg,
						GEN_INT (tmp));
	      if (ok)
		{
		  emit_move_insn (target, gen_lowpart (V2DImode, reg));
		  return true;
		}
	    }
	}
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V4DImode:
      /* Same 32-bit-replication trick as V2DImode, at 256 bits.  */
      if (CONST_INT_P (val))
	{
	  int tmp = (int)INTVAL (val);
	  if (tmp == (int)(INTVAL (val) >> 32))
	    {
	      rtx reg = gen_reg_rtx (V8SImode);
	      ok = ix86_vector_duplicate_value (V8SImode, reg,
						GEN_INT (tmp));
	      if (ok)
		{
		  emit_move_insn (target, gen_lowpart (V4DImode, reg));
		  return true;
		}
	    }
	}
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V8DImode:
      /* Same 32-bit-replication trick as V2DImode, at 512 bits.  */
      if (CONST_INT_P (val))
	{
	  int tmp = (int)INTVAL (val);
	  if (tmp == (int)(INTVAL (val) >> 32))
	    {
	      rtx reg = gen_reg_rtx (V16SImode);
	      ok = ix86_vector_duplicate_value (V16SImode, reg,
						GEN_INT (tmp));
	      if (ok)
		{
		  emit_move_insn (target, gen_lowpart (V8DImode, reg));
		  return true;
		}
	    }
	}
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V2SImode:
    case E_V2SFmode:
      /* MMX-sized vectors are only handled when the caller allows it.  */
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V4DFmode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V2DFmode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DFmode:
      /* These modes have a direct vec_duplicate expansion.  */
      return ix86_vector_duplicate_value (mode, target, val);

    case E_V4HImode:
      if (!mmx_ok)
	return false;
      if (TARGET_SSE || TARGET_3DNOW_A)
	{
	  rtx x;

	  /* A constant here would need a different strategy; let the
	     caller fall back.  */
	  val = gen_lowpart (SImode, val);
	  if (CONST_INT_P (val))
	    return false;
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      goto widen;

    case E_V4HFmode:
    case E_V4BFmode:
      if (TARGET_MMX_WITH_SSE)
	{
	  val = force_reg (GET_MODE_INNER (mode), val);
	  rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      return false;

    case E_V2HImode:
      if (TARGET_SSE2)
	{
	  rtx x;

	  val = gen_lowpart (SImode, val);
	  if (CONST_INT_P (val))
	    return false;
	  x = gen_rtx_TRUNCATE (HImode, val);
	  x = gen_rtx_VEC_DUPLICATE (mode, x);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      return false;

    case E_V2HFmode:
    case E_V2BFmode:
      if (TARGET_SSE2)
	{
	  val = force_reg (GET_MODE_INNER (mode), val);
	  rtx x = gen_rtx_VEC_DUPLICATE (mode, val);
	  emit_insn (gen_rtx_SET (target, x));
	  return true;
	}
      return false;

    case E_V8QImode:
    case E_V4QImode:
      if (!mmx_ok)
	return false;
      goto widen;

    case E_V8HImode:
      /* Constant HImode elements are pairwise-merged into SImode
	 elements by the widen path below.  */
      if (CONST_INT_P (val))
	goto widen;
      /* FALLTHRU */

    case E_V8HFmode:
    case E_V8BFmode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	{
	  struct expand_vec_perm_d dperm;
	  rtx tmp1, tmp2;

	  /* Broadcast via a one-operand permutation of a vector whose
	     low element holds VAL.  Also reached via goto from the
	     V16QImode case below.  */
	permute:
	  memset (&dperm, 0, sizeof (dperm));
	  dperm.target = target;
	  dperm.vmode = mode;
	  dperm.nelt = GET_MODE_NUNITS (mode);
	  dperm.op0 = dperm.op1 = gen_reg_rtx (mode);
	  dperm.one_operand_p = true;

	  if (mode == V8HFmode || mode == V8BFmode)
	    {
	      tmp1 = force_reg (GET_MODE_INNER (mode), val);
	      tmp2 = gen_reg_rtx (mode);
	      emit_insn (gen_vec_set_0 (mode, tmp2, CONST0_RTX (mode), tmp1));
	      tmp1 = gen_lowpart (mode, tmp2);
	    }
	  else
	    {
	      /* Extend to SImode using a paradoxical SUBREG.  */
	      tmp1 = gen_reg_rtx (SImode);
	      emit_move_insn (tmp1, gen_lowpart (SImode, val));

	      /* Insert the SImode value as
		 low element of a V4SImode vector.  */
	      tmp2 = gen_reg_rtx (V4SImode);
	      emit_insn (gen_vec_setv4si_0 (tmp2, CONST0_RTX (V4SImode), tmp1));
	      tmp1 = gen_lowpart (mode, tmp2);
	    }

	  emit_move_insn (dperm.op0, tmp1);
	  /* One of the two permutation strategies must succeed here.  */
	  ok = (expand_vec_perm_1 (&dperm)
		|| expand_vec_perm_broadcast_1 (&dperm));
	  gcc_assert (ok);
	  return ok;
	}
      goto widen;

    case E_V16QImode:
      if (CONST_INT_P (val))
	goto widen;
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);

      if (TARGET_SSE2)
	goto permute;
      goto widen;

    widen:
      /* Replicate the value once into the next wider mode and recurse.  */
      {
	machine_mode smode, wsmode, wvmode;
	rtx x;

	smode = GET_MODE_INNER (mode);
	wvmode = get_mode_wider_vector (mode);
	wsmode = GET_MODE_INNER (wvmode);

	val = convert_modes (wsmode, smode, val, true);

	/* Build a double-width element holding two copies of VAL:
	   (val << bits) | val, folded at compile time for constants.  */
	if (CONST_INT_P (val))
	  {
	    x = simplify_binary_operation (ASHIFT, wsmode, val,
					   GEN_INT (GET_MODE_BITSIZE (smode)));
	    val = simplify_binary_operation (IOR, wsmode, val, x);
	  }
	else if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
	  /* Duplicate the low byte into the high byte with an insert.  */
	  emit_insn (gen_insv_1 (wsmode, val, val));
	else
	  {
	    x = expand_simple_binop (wsmode, ASHIFT, val,
				     GEN_INT (GET_MODE_BITSIZE (smode)),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	    val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
				       OPTAB_LIB_WIDEN);
	  }

	x = gen_reg_rtx (wvmode);
	ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
	if (!ok)
	  return false;
	emit_move_insn (target, gen_lowpart (GET_MODE (target), x));
	return true;
      }

    case E_V16HImode:
    case E_V32QImode:
      if (CONST_INT_P (val))
	goto widen;
      /* FALLTHRU */

    case E_V16HFmode:
    case E_V16BFmode:
      if (TARGET_AVX2)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  /* Without AVX2, broadcast into the 128-bit half mode and
	     concatenate the half with itself.  */
	  machine_mode hvmode;
	  switch (mode)
	    {
	    case V16HImode:
	      hvmode = V8HImode;
	      break;
	    case V16HFmode:
	      hvmode = V8HFmode;
	      break;
	    case V16BFmode:
	      hvmode = V8BFmode;
	      break;
	    case V32QImode:
	      hvmode = V16QImode;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  if (!ok)
	    return false;

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    case E_V32HImode:
    case E_V64QImode:
      if (CONST_INT_P (val))
	goto widen;
      /* FALLTHRU */

    case E_V32HFmode:
    case E_V32BFmode:
      if (TARGET_AVX512BW)
	return ix86_vector_duplicate_value (mode, target, val);
      else
	{
	  /* Without AVX512BW, broadcast into the 256-bit half mode and
	     concatenate the half with itself.  */
	  machine_mode hvmode;
	  switch (mode)
	    {
	    case V32HImode:
	      hvmode = V16HImode;
	      break;
	    case V32HFmode:
	      hvmode = V16HFmode;
	      break;
	    case V32BFmode:
	      hvmode = V16BFmode;
	      break;
	    case V64QImode:
	      hvmode = V32QImode;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  rtx x = gen_reg_rtx (hvmode);

	  ok = ix86_expand_vector_init_duplicate (false, hvmode, x, val);
	  if (!ok)
	    return false;

	  x = gen_rtx_VEC_CONCAT (mode, x, x);
	  emit_insn (gen_rtx_SET (target, x));
	}
      return true;

    default:
      return false;
    }
}
17733 :
17734 : /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
17735 : whose ONE_VAR element is VAR, and other elements are zero. Return true
17736 : if successful. */
17737 :
bool
ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
				     rtx target, rtx var, int one_var)
{
  machine_mode vsimode;
  rtx new_target;
  rtx x, tmp;
  bool use_vector_set = false;
  rtx (*gen_vec_set_0) (rtx, rtx, rtx) = NULL;

  /* First decide, per mode and ISA level, whether a vec_set-based
     expansion is available, and whether a dedicated insert-into-
     element-0 pattern can be used when ONE_VAR is 0.  */
  switch (mode)
    {
    case E_V2DImode:
      /* For SSE4.1, we normally use vector set.  But if the second
	 element is zero and inter-unit moves are OK, we use movq
	 instead.  */
      use_vector_set = (TARGET_64BIT && TARGET_SSE4_1
			&& !(TARGET_INTER_UNIT_MOVES_TO_VEC
			     && one_var == 0));
      break;
    case E_V16QImode:
    case E_V4SImode:
    case E_V4SFmode:
      use_vector_set = TARGET_SSE4_1;
      break;
    case E_V8HImode:
      use_vector_set = TARGET_SSE2;
      gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
		      ? gen_vec_setv8hi_0 : NULL;
      break;
    case E_V8QImode:
      use_vector_set = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
      use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
      break;
    case E_V4QImode:
      use_vector_set = TARGET_SSE4_1;
      break;
    case E_V32QImode:
      use_vector_set = TARGET_AVX;
      break;
    case E_V16HImode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = TARGET_AVX512FP16 && one_var == 0
		      ? gen_vec_setv16hi_0 : NULL;
      break;
    case E_V8SImode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8si_0;
      break;
    case E_V8SFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv8sf_0;
      break;
    case E_V4DFmode:
      use_vector_set = TARGET_AVX;
      gen_vec_set_0 = gen_vec_setv4df_0;
      break;
    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX && TARGET_64BIT;
      gen_vec_set_0 = gen_vec_setv4di_0;
      break;
    case E_V16SImode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16si_0;
      break;
    case E_V16SFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16sf_0;
      break;
    case E_V8DFmode:
      use_vector_set = TARGET_AVX512F && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8df_0;
      break;
    case E_V8DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8di_0;
      break;
    case E_V8HFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8hf_0;
      break;
    case E_V16HFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16hf_0;
      break;
    case E_V32HFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv32hf_0;
      break;
    case E_V8BFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv8bf_0;
      break;
    case E_V16BFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv16bf_0;
      break;
    case E_V32BFmode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv32bf_0;
      break;
    case E_V32HImode:
      use_vector_set = TARGET_AVX512FP16 && one_var == 0;
      gen_vec_set_0 = gen_vec_setv32hi_0;
      /* FALLTHRU (into default, which only breaks).  */
    default:
      break;
    }

  if (use_vector_set)
    {
      /* Fast path: a dedicated insert-into-element-0 pattern.  */
      if (gen_vec_set_0 && one_var == 0)
	{
	  var = force_reg (GET_MODE_INNER (mode), var);
	  emit_insn (gen_vec_set_0 (target, CONST0_RTX (mode), var));
	  return true;
	}
      /* General path: zero the vector, then insert VAR at ONE_VAR.  */
      emit_insn (gen_rtx_SET (target, CONST0_RTX (mode)));
      var = force_reg (GET_MODE_INNER (mode), var);
      ix86_expand_vector_set (mmx_ok, target, var, one_var);
      return true;
    }

  /* No vec_set available; fall back to mode-specific expansions.  */
  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok)
	return false;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
      /* Two-element vectors: only the (VAR, 0) layout is handled,
	 via a simple concat.  */
      if (one_var != 0)
	return false;
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_CONCAT (mode, var, CONST0_RTX (GET_MODE_INNER (mode)));
      emit_insn (gen_rtx_SET (target, x));
      return true;

    case E_V4SFmode:
    case E_V4SImode:
      /* Work in a pseudo if TARGET is a hard register.  */
      if (!REG_P (target) || REGNO (target) < FIRST_PSEUDO_REGISTER)
	new_target = gen_reg_rtx (mode);
      else
	new_target = target;
      /* Place VAR in element 0 with the rest zeroed, then shuffle it
	 into position if ONE_VAR is not 0.  */
      var = force_reg (GET_MODE_INNER (mode), var);
      x = gen_rtx_VEC_DUPLICATE (mode, var);
      x = gen_rtx_VEC_MERGE (mode, x, CONST0_RTX (mode), const1_rtx);
      emit_insn (gen_rtx_SET (new_target, x));
      if (one_var != 0)
	{
	  /* We need to shuffle the value to the correct position, so
	     create a new pseudo to store the intermediate result.  */

	  /* With SSE2, we can use the integer shuffle insns.  */
	  if (mode != V4SFmode && TARGET_SSE2)
	    {
	      emit_insn (gen_sse2_pshufd_1 (new_target, new_target,
					    const1_rtx,
					    GEN_INT (one_var == 1 ? 0 : 1),
					    GEN_INT (one_var == 2 ? 0 : 1),
					    GEN_INT (one_var == 3 ? 0 : 1)));
	      if (target != new_target)
		emit_move_insn (target, new_target);
	      return true;
	    }

	  /* Otherwise convert the intermediate result to V4SFmode and
	     use the SSE1 shuffle instructions.  */
	  if (mode != V4SFmode)
	    {
	      tmp = gen_reg_rtx (V4SFmode);
	      emit_move_insn (tmp, gen_lowpart (V4SFmode, new_target));
	    }
	  else
	    tmp = new_target;

	  emit_insn (gen_sse_shufps_v4sf (tmp, tmp, tmp,
					  const1_rtx,
					  GEN_INT (one_var == 1 ? 0 : 1),
					  GEN_INT (one_var == 2 ? 0+4 : 1+4),
					  GEN_INT (one_var == 3 ? 0+4 : 1+4)));

	  if (mode != V4SFmode)
	    emit_move_insn (target, gen_lowpart (V4SImode, tmp));
	  else if (tmp != target)
	    emit_move_insn (target, tmp);
	}
      else if (target != new_target)
	emit_move_insn (target, new_target);
      return true;

    case E_V8HImode:
    case E_V16QImode:
      vsimode = V4SImode;
      goto widen;
    case E_V4HImode:
    case E_V8QImode:
      if (!mmx_ok)
	return false;
      vsimode = V2SImode;
      goto widen;
    widen:
      /* The recursion below only handles the element-0 case.  */
      if (one_var != 0)
	return false;

      /* Zero extend the variable element to SImode and recurse.  */
      var = convert_modes (SImode, GET_MODE_INNER (mode), var, true);

      x = gen_reg_rtx (vsimode);
      /* The SImode vector cases above always succeed for one_var == 0.  */
      if (!ix86_expand_vector_init_one_nonzero (mmx_ok, vsimode, x,
						var, one_var))
	gcc_unreachable ();

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }
}
17965 :
17966 : /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
17967 : consisting of the values in VALS. It is known that all elements
17968 : except ONE_VAR are constants. Return true if successful. */
17969 :
static bool
ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals, int one_var)
{
  rtx var = XVECEXP (vals, 0, one_var);
  machine_mode wmode;
  rtx const_vec, x;

  /* Build CONST_VEC: the requested vector with the variable element
     replaced by zero, so it can be loaded as a constant and then have
     VAR inserted.  */
  const_vec = copy_rtx (vals);
  XVECEXP (const_vec, 0, one_var) = CONST0_RTX (GET_MODE_INNER (mode));
  const_vec = gen_rtx_CONST_VECTOR (mode, XVEC (const_vec, 0));

  switch (mode)
    {
    case E_V2DFmode:
    case E_V2DImode:
    case E_V2SFmode:
    case E_V2SImode:
      /* For the two element vectors, it's just as easy to use
	 the general case.  */
      return false;

    case E_V4DImode:
      /* Use ix86_expand_vector_set in 64bit mode only.  */
      if (!TARGET_64BIT)
	return false;
      /* FALLTHRU */
    case E_V8HFmode:
    case E_V16HFmode:
    case E_V8BFmode:
    case E_V16BFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V16HImode:
    case E_V32QImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V8HImode:
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
      /* These modes take the load-constant-then-insert path below.  */
      break;

    case E_V16QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V8HImode;
      goto widen;
    case E_V8QImode:
      if (TARGET_MMX_WITH_SSE && TARGET_SSE4_1)
	break;
      wmode = V4HImode;
      goto widen;
    case E_V4QImode:
      if (TARGET_SSE4_1)
	break;
      wmode = V2HImode;
    widen:
      /* There's no way to set one QImode entry easily.  Combine
	 the variable value with its adjacent constant value, and
	 promote to an HImode set.  */
      x = XVECEXP (vals, 0, one_var ^ 1);
      if (one_var & 1)
	{
	  /* VAR is the high byte of the HImode pair; shift it up and
	     keep the neighbor's low byte.  */
	  var = convert_modes (HImode, QImode, var, true);
	  var = expand_simple_binop (HImode, ASHIFT, var, GEN_INT (8),
				     NULL_RTX, 1, OPTAB_LIB_WIDEN);
	  x = GEN_INT (INTVAL (x) & 0xff);
	}
      else
	{
	  /* VAR is the low byte; shift the constant neighbor up.  */
	  var = convert_modes (HImode, QImode, var, true);
	  x = gen_int_mode (UINTVAL (x) << 8, HImode);
	}
      /* OR in the constant neighbor unless it is zero.  */
      if (x != const0_rtx)
	var = expand_simple_binop (HImode, IOR, var, x, var,
				   1, OPTAB_LIB_WIDEN);

      x = gen_reg_rtx (wmode);
      emit_move_insn (x, gen_lowpart (wmode, const_vec));
      ix86_expand_vector_set (mmx_ok, x, var, one_var >> 1);

      emit_move_insn (target, gen_lowpart (mode, x));
      return true;

    default:
      return false;
    }

  /* Load the all-constant vector, then overwrite element ONE_VAR.  */
  emit_move_insn (target, const_vec);
  ix86_expand_vector_set (mmx_ok, target, var, one_var);
  return true;
}
18064 :
18065 : /* A subroutine of ix86_expand_vector_init_general. Use vector
18066 : concatenate to handle the most general case: all values variable,
18067 : and none identical. */
18068 :
static void
ix86_expand_vector_init_concat (machine_mode mode,
				rtx target, rtx *ops, int n)
{
  machine_mode half_mode = VOIDmode;
  rtx half[2];
  rtvec v;
  int i, j;

  switch (n)
    {
    case 2:
      /* Base case: concatenate the two half-width operands directly.  */
      switch (mode)
	{
	case E_V32HFmode:
	  half_mode = V16HFmode;
	  break;
	case E_V32BFmode:
	  half_mode = V16BFmode;
	  break;
	case E_V16SImode:
	  half_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  half_mode = V8SFmode;
	  break;
	case E_V8DImode:
	  half_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  half_mode = V4DFmode;
	  break;
	case E_V16HFmode:
	  half_mode = V8HFmode;
	  break;
	case E_V16BFmode:
	  half_mode = V8BFmode;
	  break;
	case E_V8SImode:
	  half_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  half_mode = V4SFmode;
	  break;
	case E_V4DImode:
	  half_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  half_mode = V2DFmode;
	  break;
	case E_V4SImode:
	  half_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  half_mode = V2SFmode;
	  break;
	case E_V2DImode:
	  half_mode = DImode;
	  break;
	case E_V2SImode:
	  half_mode = SImode;
	  break;
	case E_V2DFmode:
	  half_mode = DFmode;
	  break;
	case E_V2SFmode:
	  half_mode = SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}

      if (!register_operand (ops[1], half_mode))
	ops[1] = force_reg (half_mode, ops[1]);
      if (!register_operand (ops[0], half_mode))
	ops[0] = force_reg (half_mode, ops[0]);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, ops[0],
							  ops[1])));
      break;

    case 4:
      switch (mode)
	{
	case E_V4DImode:
	  half_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  half_mode = V2DFmode;
	  break;
	case E_V4SImode:
	  half_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  half_mode = V2SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 8:
      switch (mode)
	{
	case E_V8DImode:
	  half_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  half_mode = V4DFmode;
	  break;
	case E_V8SImode:
	  half_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  half_mode = V4SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    case 16:
      switch (mode)
	{
	case E_V16SImode:
	  half_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  half_mode = V8SFmode;
	  break;
	default:
	  gcc_unreachable ();
	}
      goto half;

    half:
      /* Recursive case: build each half-width vector from N/2 operands,
	 then concatenate the two halves via the n == 2 path.  */
      /* FIXME: We process inputs backward to help RA.  PR 36222.  */
      i = n - 1;
      for (j = 1; j != -1; j--)
	{
	  half[j] = gen_reg_rtx (half_mode);
	  switch (n >> 1)
	    {
	    case 2:
	      v = gen_rtvec (2, ops[i-1], ops[i]);
	      i -= 2;
	      break;
	    case 4:
	      v = gen_rtvec (4, ops[i-3], ops[i-2], ops[i-1], ops[i]);
	      i -= 4;
	      break;
	    case 8:
	      v = gen_rtvec (8, ops[i-7], ops[i-6], ops[i-5], ops[i-4],
			     ops[i-3], ops[i-2], ops[i-1], ops[i]);
	      i -= 8;
	      break;
	    default:
	      gcc_unreachable ();
	    }
	  ix86_expand_vector_init (false, half[j],
				   gen_rtx_PARALLEL (half_mode, v));
	}

      ix86_expand_vector_init_concat (mode, target, half, 2);
      break;

    default:
      gcc_unreachable ();
    }
}
18238 :
18239 : /* A subroutine of ix86_expand_vector_init_general. Use vector
18240 : interleave to handle the most general case: all values variable,
18241 : and none identical. */
18242 :
18243 : static void
18244 3826 : ix86_expand_vector_init_interleave (machine_mode mode,
18245 : rtx target, rtx *ops, int n)
18246 : {
18247 3826 : machine_mode first_imode, second_imode, third_imode, inner_mode;
18248 3826 : int i, j;
18249 3826 : rtx op, op0, op1;
18250 3826 : rtx (*gen_load_even) (rtx, rtx, rtx);
18251 3826 : rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
18252 3826 : rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
18253 :
18254 3826 : switch (mode)
18255 : {
18256 : case E_V8HFmode:
18257 : gen_load_even = gen_vec_interleave_lowv8hf;
18258 : gen_interleave_first_low = gen_vec_interleave_lowv4si;
18259 : gen_interleave_second_low = gen_vec_interleave_lowv2di;
18260 : inner_mode = HFmode;
18261 : first_imode = V4SImode;
18262 : second_imode = V2DImode;
18263 : third_imode = VOIDmode;
18264 : break;
18265 487 : case E_V8BFmode:
18266 487 : gen_load_even = gen_vec_interleave_lowv8bf;
18267 487 : gen_interleave_first_low = gen_vec_interleave_lowv4si;
18268 487 : gen_interleave_second_low = gen_vec_interleave_lowv2di;
18269 487 : inner_mode = BFmode;
18270 487 : first_imode = V4SImode;
18271 487 : second_imode = V2DImode;
18272 487 : third_imode = VOIDmode;
18273 487 : break;
18274 739 : case E_V8HImode:
18275 739 : gen_load_even = gen_vec_setv8hi;
18276 739 : gen_interleave_first_low = gen_vec_interleave_lowv4si;
18277 739 : gen_interleave_second_low = gen_vec_interleave_lowv2di;
18278 739 : inner_mode = HImode;
18279 739 : first_imode = V4SImode;
18280 739 : second_imode = V2DImode;
18281 739 : third_imode = VOIDmode;
18282 739 : break;
18283 374 : case E_V16QImode:
18284 374 : gen_load_even = gen_vec_setv16qi;
18285 374 : gen_interleave_first_low = gen_vec_interleave_lowv8hi;
18286 374 : gen_interleave_second_low = gen_vec_interleave_lowv4si;
18287 374 : inner_mode = QImode;
18288 374 : first_imode = V8HImode;
18289 374 : second_imode = V4SImode;
18290 374 : third_imode = V2DImode;
18291 374 : break;
18292 0 : default:
18293 0 : gcc_unreachable ();
18294 : }
18295 :
18296 20626 : for (i = 0; i < n; i++)
18297 : {
18298 16800 : op = ops [i + i];
18299 16800 : if (inner_mode == HFmode || inner_mode == BFmode)
18300 : {
18301 10852 : rtx even, odd;
18302 : /* Use vpuncklwd to pack 2 HFmode or BFmode. */
18303 1948 : machine_mode vec_mode =
18304 10852 : (inner_mode == HFmode) ? V8HFmode : V8BFmode;
18305 10852 : op0 = gen_reg_rtx (vec_mode);
18306 10852 : even = lowpart_subreg (vec_mode,
18307 : force_reg (inner_mode, op), inner_mode);
18308 10852 : odd = lowpart_subreg (vec_mode,
18309 10852 : force_reg (inner_mode, ops[i + i + 1]),
18310 : inner_mode);
18311 10852 : emit_insn (gen_load_even (op0, even, odd));
18312 : }
18313 : else
18314 : {
18315 : /* Extend the odd elment to SImode using a paradoxical SUBREG. */
18316 5948 : op0 = gen_reg_rtx (SImode);
18317 5948 : emit_move_insn (op0, gen_lowpart (SImode, op));
18318 :
18319 : /* Insert the SImode value as low element of V4SImode vector. */
18320 5948 : op1 = gen_reg_rtx (V4SImode);
18321 5948 : op0 = gen_rtx_VEC_MERGE (V4SImode,
18322 : gen_rtx_VEC_DUPLICATE (V4SImode,
18323 : op0),
18324 : CONST0_RTX (V4SImode),
18325 : const1_rtx);
18326 5948 : emit_insn (gen_rtx_SET (op1, op0));
18327 :
18328 : /* Cast the V4SImode vector back to a vector in orignal mode. */
18329 5948 : op0 = gen_reg_rtx (mode);
18330 5948 : emit_move_insn (op0, gen_lowpart (mode, op1));
18331 :
18332 : /* Load even elements into the second position. */
18333 5948 : emit_insn (gen_load_even (op0,
18334 : force_reg (inner_mode,
18335 5948 : ops[i + i + 1]),
18336 : const1_rtx));
18337 : }
18338 :
18339 : /* Cast vector to FIRST_IMODE vector. */
18340 16800 : ops[i] = gen_reg_rtx (first_imode);
18341 16800 : emit_move_insn (ops[i], gen_lowpart (first_imode, op0));
18342 : }
18343 :
18344 : /* Interleave low FIRST_IMODE vectors. */
18345 12226 : for (i = j = 0; i < n; i += 2, j++)
18346 : {
18347 8400 : op0 = gen_reg_rtx (first_imode);
18348 8400 : emit_insn (gen_interleave_first_low (op0, ops[i], ops[i + 1]));
18349 :
18350 : /* Cast FIRST_IMODE vector to SECOND_IMODE vector. */
18351 8400 : ops[j] = gen_reg_rtx (second_imode);
18352 8400 : emit_move_insn (ops[j], gen_lowpart (second_imode, op0));
18353 : }
18354 :
18355 : /* Interleave low SECOND_IMODE vectors. */
18356 3826 : switch (second_imode)
18357 : {
18358 : case E_V4SImode:
18359 1122 : for (i = j = 0; i < n / 2; i += 2, j++)
18360 : {
18361 748 : op0 = gen_reg_rtx (second_imode);
18362 748 : emit_insn (gen_interleave_second_low (op0, ops[i],
18363 748 : ops[i + 1]));
18364 :
18365 : /* Cast the SECOND_IMODE vector to the THIRD_IMODE
18366 : vector. */
18367 748 : ops[j] = gen_reg_rtx (third_imode);
18368 748 : emit_move_insn (ops[j], gen_lowpart (third_imode, op0));
18369 : }
18370 : second_imode = V2DImode;
18371 : gen_interleave_second_low = gen_vec_interleave_lowv2di;
18372 : /* FALLTHRU */
18373 :
18374 3826 : case E_V2DImode:
18375 3826 : op0 = gen_reg_rtx (second_imode);
18376 3826 : emit_insn (gen_interleave_second_low (op0, ops[0],
18377 : ops[1]));
18378 :
18379 : /* Cast the SECOND_IMODE vector back to a vector on original
18380 : mode. */
18381 3826 : emit_insn (gen_rtx_SET (target, gen_lowpart (mode, op0)));
18382 3826 : break;
18383 :
18384 : default:
18385 : gcc_unreachable ();
18386 : }
18387 3826 : }
18388 :
/* A subroutine of ix86_expand_vector_init.  Handle the most general case:
   all values variable, and none identical.

   MODE is the vector mode of TARGET; VALS is a PARALLEL holding one
   scalar rtx per element.  Strategy by element width:
     - elements of >= 32 bits: recursive VEC_CONCAT of halves;
     - 256/512-bit vectors of sub-SImode elements: build 128-bit pieces
       with ix86_expand_vector_init_interleave, then concatenate;
     - otherwise: pack elements into integer words via shift/ior and
       recurse on the resulting word vector.  */

static void
ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
				 rtx target, rtx vals)
{
  rtx ops[64], op0, op1, op2, op3, op4, op5;
  machine_mode half_mode = VOIDmode;
  machine_mode quarter_mode = VOIDmode;
  /* When set (to HImode), small FP elements (HF/BF) are reinterpreted
     as integers so the word-packing fallback below can shift/or them.  */
  machine_mode int_inner_mode = VOIDmode;
  int n, i;

  switch (mode)
    {
    case E_V2SFmode:
    case E_V2SImode:
      if (!mmx_ok && !TARGET_SSE)
	break;
      /* FALLTHRU */

    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DFmode:
    case E_V8DImode:
    case E_V8SFmode:
    case E_V8SImode:
    case E_V4DFmode:
    case E_V4DImode:
    case E_V4SFmode:
    case E_V4SImode:
    case E_V2DFmode:
    case E_V2DImode:
      /* Elements of at least 32 bits: build the vector by recursively
	 concatenating halves.  */
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_concat (mode, target, ops, n);
      return;

    case E_V2TImode:
      /* View the two TImode elements as V2DImode values and concatenate
	 them into a V4DImode register, then move the bits over.  */
      for (i = 0; i < 2; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      op0 = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, op0, ops, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V4TImode:
      /* Same idea as V2TImode, two levels deep: pair the four V2DImode
	 images into two V4DImode regs, then into one V8DImode reg.  */
      for (i = 0; i < 4; i++)
	ops[i] = gen_lowpart (V2DImode, XVECEXP (vals, 0, i));
      ops[4] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[4], ops, 2);
      ops[5] = gen_reg_rtx (V4DImode);
      ix86_expand_vector_init_concat (V4DImode, ops[5], ops + 2, 2);
      op0 = gen_reg_rtx (V8DImode);
      ix86_expand_vector_init_concat (V8DImode, op0, ops + 4, 2);
      emit_move_insn (target, gen_lowpart (GET_MODE (target), op0));
      return;

    case E_V32QImode:
      half_mode = V16QImode;
      goto half;

    case E_V16HImode:
      half_mode = V8HImode;
      goto half;

    case E_V16HFmode:
      half_mode = V8HFmode;
      goto half;

    case E_V16BFmode:
      half_mode = V8BFmode;
      goto half;

    half:
      /* 256-bit vector of sub-SImode elements: build each 128-bit half
	 by interleaving (n >> 2 pairs per half), then concatenate.  */
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (half_mode);
      op1 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (half_mode, op0, ops,
					  n >> 2);
      ix86_expand_vector_init_interleave (half_mode, op1,
					  &ops [n >> 1], n >> 2);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op0, op1)));
      return;

    case E_V64QImode:
      quarter_mode = V16QImode;
      half_mode = V32QImode;
      goto quarter;

    case E_V32HImode:
      quarter_mode = V8HImode;
      half_mode = V16HImode;
      goto quarter;

    case E_V32HFmode:
      quarter_mode = V8HFmode;
      half_mode = V16HFmode;
      goto quarter;

    case E_V32BFmode:
      quarter_mode = V8BFmode;
      half_mode = V16BFmode;
      goto quarter;

    quarter:
      /* 512-bit vector: build four 128-bit quarters by interleaving,
	 pair them into two 256-bit halves, then the full vector.  */
      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      op0 = gen_reg_rtx (quarter_mode);
      op1 = gen_reg_rtx (quarter_mode);
      op2 = gen_reg_rtx (quarter_mode);
      op3 = gen_reg_rtx (quarter_mode);
      op4 = gen_reg_rtx (half_mode);
      op5 = gen_reg_rtx (half_mode);
      ix86_expand_vector_init_interleave (quarter_mode, op0, ops,
					  n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op1,
					  &ops [n >> 2], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op2,
					  &ops [n >> 1], n >> 3);
      ix86_expand_vector_init_interleave (quarter_mode, op3,
					  &ops [(n >> 1) | (n >> 2)], n >> 3);
      emit_insn (gen_rtx_SET (op4, gen_rtx_VEC_CONCAT (half_mode, op0, op1)));
      emit_insn (gen_rtx_SET (op5, gen_rtx_VEC_CONCAT (half_mode, op2, op3)));
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, op4, op5)));
      return;

    case E_V16QImode:
      if (!TARGET_SSE4_1)
	break;
      /* FALLTHRU */

    case E_V8HImode:
      if (!TARGET_SSE2)
	break;

      /* Don't use ix86_expand_vector_init_interleave if we can't
	 move from GPR to SSE register directly.  */
      if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
	break;
      /* FALLTHRU */

    case E_V8HFmode:
    case E_V8BFmode:

      n = GET_MODE_NUNITS (mode);
      for (i = 0; i < n; i++)
	ops[i] = XVECEXP (vals, 0, i);
      ix86_expand_vector_init_interleave (mode, target, ops, n >> 1);
      return;

    case E_V4HFmode:
    case E_V4BFmode:
    case E_V2HFmode:
    case E_V2BFmode:
      /* Small HF/BF vectors: fall through to the word-packing code with
	 the elements viewed as HImode bit patterns.  */
      int_inner_mode = HImode;
      break;

    case E_V4HImode:
    case E_V8QImode:

    case E_V2HImode:
    case E_V4QImode:
      break;

    default:
      gcc_unreachable ();
    }

  {
    /* Generic fallback: pack the scalar elements into integer words
       with shift/ior, then build the vector out of those words.  */
    int i, j, n_elts, n_words, n_elt_per_word;
    machine_mode tmp_mode, inner_mode;
    rtx words[4], shift;

    tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;

    inner_mode = GET_MODE_INNER (mode);
    n_elts = GET_MODE_NUNITS (mode);
    n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
    n_elt_per_word = n_elts / n_words;
    shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));

    for (i = 0; i < n_words; ++i)
      {
	rtx word = NULL_RTX;

	for (j = 0; j < n_elt_per_word; ++j)
	  {
	    /* Walk the word's elements from most to least significant,
	       so each left shift makes room for the next element.  */
	    rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
	    if (int_inner_mode != E_VOIDmode)
	      {
		gcc_assert (TARGET_SSE2 && int_inner_mode == HImode);
		rtx tmp = gen_reg_rtx (int_inner_mode);
		/* Reinterpret the HF/BF element as its HImode bits.  */
		elt = lowpart_subreg (int_inner_mode,
				      force_reg (inner_mode, elt),
				      inner_mode);
		emit_move_insn (tmp, elt);
		elt = tmp;
	      }
	    elt = convert_modes (tmp_mode, inner_mode, elt, true);

	    if (j == 0)
	      word = elt;
	    else
	      {
		word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
					    NULL_RTX, 1, OPTAB_LIB_WIDEN);
		word = expand_simple_binop (tmp_mode, IOR, word, elt,
					    NULL_RTX, 1, OPTAB_LIB_WIDEN);
	      }
	  }

	words[i] = word;
      }

    if (n_words == 1)
      emit_move_insn (target, gen_lowpart (mode, words[0]));
    else if (n_words == 2)
      {
	gcc_assert (tmp_mode == DImode || tmp_mode == SImode);
	machine_mode concat_mode = tmp_mode == DImode ? V2DImode : V2SImode;
	rtx tmp = gen_reg_rtx (concat_mode);
	vals = gen_rtx_PARALLEL (concat_mode, gen_rtvec_v (2, words));
	/* Recurse: a two-word vector of word-sized elements is handled
	   by the concat path at the top of this function.  */
	ix86_expand_vector_init_general (mmx_ok, concat_mode, tmp, vals);
	emit_move_insn (target, gen_lowpart (mode, tmp));
      }
    else if (n_words == 4)
      {
	rtx tmp = gen_reg_rtx (V4SImode);
	gcc_assert (tmp_mode == SImode);
	vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
	ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
	emit_move_insn (target, gen_lowpart (mode, tmp));
      }
    else
      gcc_unreachable ();
  }
}
18631 :
18632 : /* Initialize vector TARGET via VALS. Suppress the use of MMX
18633 : instructions unless MMX_OK is true. */
18634 :
18635 : void
18636 130214 : ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
18637 : {
18638 130214 : machine_mode mode = GET_MODE (target);
18639 130214 : machine_mode inner_mode = GET_MODE_INNER (mode);
18640 130214 : int n_elts = GET_MODE_NUNITS (mode);
18641 130214 : int n_var = 0, one_var = -1;
18642 130214 : bool all_same = true, all_const_zero = true;
18643 130214 : int i;
18644 130214 : rtx x;
18645 :
18646 : /* Handle first initialization from vector elts. */
18647 130214 : if (n_elts != XVECLEN (vals, 0))
18648 : {
18649 1291 : rtx subtarget = target;
18650 1291 : x = XVECEXP (vals, 0, 0);
18651 2582 : gcc_assert (GET_MODE_INNER (GET_MODE (x)) == inner_mode);
18652 2582 : if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
18653 : {
18654 1291 : rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
18655 1291 : if (inner_mode == QImode
18656 1291 : || inner_mode == HImode
18657 1291 : || inner_mode == TImode
18658 : || inner_mode == HFmode
18659 : || inner_mode == BFmode)
18660 : {
18661 134 : unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
18662 134 : scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
18663 134 : n_bits /= GET_MODE_SIZE (elt_mode);
18664 134 : mode = mode_for_vector (elt_mode, n_bits).require ();
18665 134 : inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
18666 134 : ops[0] = gen_lowpart (inner_mode, ops[0]);
18667 134 : ops[1] = gen_lowpart (inner_mode, ops[1]);
18668 134 : subtarget = gen_reg_rtx (mode);
18669 : }
18670 1291 : ix86_expand_vector_init_concat (mode, subtarget, ops, 2);
18671 1291 : if (subtarget != target)
18672 134 : emit_move_insn (target, gen_lowpart (GET_MODE (target), subtarget));
18673 1291 : return;
18674 : }
18675 0 : gcc_unreachable ();
18676 : }
18677 :
18678 473193 : for (i = 0; i < n_elts; ++i)
18679 : {
18680 344270 : x = XVECEXP (vals, 0, i);
18681 668025 : if (!(CONST_SCALAR_INT_P (x)
18682 327728 : || CONST_DOUBLE_P (x)
18683 : || CONST_FIXED_P (x)))
18684 323755 : n_var++, one_var = i;
18685 20515 : else if (x != CONST0_RTX (inner_mode))
18686 3248 : all_const_zero = false;
18687 344270 : if (i > 0 && !rtx_equal_p (x, XVECEXP (vals, 0, 0)))
18688 : all_same = false;
18689 : }
18690 :
18691 : /* Handle the zero vector as special case. */
18692 128923 : if (n_var == 0 && all_const_zero)
18693 : {
18694 302 : emit_move_insn (target, CONST0_RTX (mode));
18695 302 : return;
18696 : }
18697 :
18698 : /* If all values are identical, broadcast the value. */
18699 128621 : if (all_same
18700 135790 : && ix86_expand_vector_init_duplicate (mmx_ok, mode, target,
18701 7169 : XVECEXP (vals, 0, 0)))
18702 : return;
18703 :
18704 : /* Constants are best loaded from the constant pool. */
18705 122623 : if (n_var == 0)
18706 : {
18707 41 : emit_move_insn (target, gen_rtx_CONST_VECTOR (mode, XVEC (vals, 0)));
18708 41 : return;
18709 : }
18710 :
18711 : /* Values where only one field is non-constant are best loaded from
18712 : the pool and overwritten via move later. */
18713 122582 : if (n_var == 1)
18714 : {
18715 11776 : if (all_const_zero
18716 22401 : && ix86_expand_vector_init_one_nonzero (mmx_ok, mode, target,
18717 10625 : XVECEXP (vals, 0, one_var),
18718 : one_var))
18719 : return;
18720 :
18721 7960 : if (ix86_expand_vector_init_one_var (mmx_ok, mode, target, vals, one_var))
18722 : return;
18723 : }
18724 :
18725 118528 : ix86_expand_vector_init_general (mmx_ok, mode, target, vals);
18726 : }
18727 :
/* Set the element of vector TARGET selected by the run-time index IDX
   to VAL.  Implemented as
   V setg (V v, int idx, T val)
   {
     V idxv = (V){idx, idx, idx, idx, idx, idx, idx, idx};
     V valv = (V){val, val, val, val, val, val, val, val};
     V mask = ((V){0, 1, 2, 3, 4, 5, 6, 7} == idxv);
     v = (v & ~mask) | (valv & mask);
     return v;
   }.  */
void
ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
{
  rtx vec[64];
  machine_mode mode = GET_MODE (target);
  /* Mode in which the {0,1,...} == idxv comparison is done; for FP
     element modes an integer vector mode of the same layout is used.  */
  machine_mode cmp_mode = mode;
  int n_elts = GET_MODE_NUNITS (mode);
  rtx valv,idxv,constv,idx_tmp;
  bool ok = false;

  /* 512-bits vector byte/word broadcast and comparison only available
     under TARGET_AVX512BW, break 512-bits vector into two 256-bits vector
     when without TARGET_AVX512BW.  */
  if ((mode == V32HImode || mode == V32HFmode || mode == V32BFmode
       || mode == V64QImode)
      && !TARGET_AVX512BW)
    {
      gcc_assert (TARGET_AVX512F);
      rtx vhi, vlo, idx_hi;
      machine_mode half_mode;
      rtx (*extract_hi)(rtx, rtx);
      rtx (*extract_lo)(rtx, rtx);

      if (mode == V32HImode)
	{
	  half_mode = V16HImode;
	  extract_hi = gen_vec_extract_hi_v32hi;
	  extract_lo = gen_vec_extract_lo_v32hi;
	}
      else if (mode == V32HFmode)
	{
	  half_mode = V16HFmode;
	  extract_hi = gen_vec_extract_hi_v32hf;
	  extract_lo = gen_vec_extract_lo_v32hf;
	}
      else if (mode == V32BFmode)
	{
	  half_mode = V16BFmode;
	  extract_hi = gen_vec_extract_hi_v32bf;
	  extract_lo = gen_vec_extract_lo_v32bf;
	}
      else
	{
	  half_mode = V32QImode;
	  extract_hi = gen_vec_extract_hi_v64qi;
	  extract_lo = gen_vec_extract_lo_v64qi;
	}

      vhi = gen_reg_rtx (half_mode);
      vlo = gen_reg_rtx (half_mode);
      idx_hi = gen_reg_rtx (GET_MODE (idx));
      emit_insn (extract_hi (vhi, target));
      emit_insn (extract_lo (vlo, target));
      /* idx_hi = idx - n_elts/2, the index rebased into the high half.  */
      vec[0] = idx_hi;
      vec[1] = idx;
      vec[2] = GEN_INT (n_elts/2);
      ix86_expand_binary_operator (MINUS, GET_MODE (idx), vec);
      /* Recurse into both halves unconditionally: the EQ mask against
	 {0..n/2-1} below only matches in the half where the (rebased)
	 index is in range, so the other half is left unchanged.  */
      ix86_expand_vector_set_var (vhi, val, idx_hi);
      ix86_expand_vector_set_var (vlo, val, idx);
      emit_insn (gen_rtx_SET (target, gen_rtx_VEC_CONCAT (mode, vlo, vhi)));
      return;
    }

  /* For FP element modes, compare the indices in the integer vector
     mode with the same element width and count.  */
  if (FLOAT_MODE_P (GET_MODE_INNER (mode)))
    {
      switch (mode)
	{
	case E_V2DFmode:
	  cmp_mode = V2DImode;
	  break;
	case E_V4DFmode:
	  cmp_mode = V4DImode;
	  break;
	case E_V8DFmode:
	  cmp_mode = V8DImode;
	  break;
	case E_V2SFmode:
	  cmp_mode = V2SImode;
	  break;
	case E_V4SFmode:
	  cmp_mode = V4SImode;
	  break;
	case E_V8SFmode:
	  cmp_mode = V8SImode;
	  break;
	case E_V16SFmode:
	  cmp_mode = V16SImode;
	  break;
	case E_V2HFmode:
	case E_V2BFmode:
	  cmp_mode = V2HImode;
	  break;
	case E_V4HFmode:
	case E_V4BFmode:
	  cmp_mode = V4HImode;
	  break;
	case E_V8HFmode:
	  cmp_mode = V8HImode;
	  break;
	case E_V16HFmode:
	  cmp_mode = V16HImode;
	  break;
	case E_V32HFmode:
	  cmp_mode = V32HImode;
	  break;
	case E_V8BFmode:
	  cmp_mode = V8HImode;
	  break;
	case E_V16BFmode:
	  cmp_mode = V16HImode;
	  break;
	case E_V32BFmode:
	  cmp_mode = V32HImode;
	  break;
	default:
	  gcc_unreachable ();
	}
    }

  /* constv = {0, 1, ..., n_elts - 1} in CMP_MODE.  */
  for (int i = 0; i != n_elts; i++)
    vec[i] = GEN_INT (i);
  constv = gen_rtx_CONST_VECTOR (cmp_mode, gen_rtvec_v (n_elts, vec));
  valv = gen_reg_rtx (mode);
  idxv = gen_reg_rtx (cmp_mode);
  /* Zero-extend the index to the element width of CMP_MODE.  */
  idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);

  /* Broadcast VAL and the index, then blend on (idxv == constv).  */
  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
					  mode, valv, val);
  gcc_assert (ok);
  ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
					  cmp_mode, idxv, idx_tmp);
  gcc_assert (ok);
  /* vcond operands: dest, then-value, else-value, comparison, cmp ops.  */
  vec[0] = target;
  vec[1] = valv;
  vec[2] = target;
  vec[3] = gen_rtx_EQ (mode, idxv, constv);
  vec[4] = idxv;
  vec[5] = constv;
  ok = ix86_expand_int_vcond (vec);
  gcc_assert (ok);
}
18878 :
/* Store VAL into element ELT (a compile-time constant) of vector
   TARGET.  MMX_OK says whether plain MMX instructions may be used.
   Depending on mode and ISA level this uses a vec_merge insert, a
   VEC_CONCAT rebuild, shufps/pshufd shuffles, extract-half /
   insert-half recursion, AVX512 blendm, or a stack-temp fallback.  */
void
ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
{
  machine_mode mode = GET_MODE (target);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  machine_mode half_mode;
  bool use_vec_merge = false;
  bool blendm_const = false;
  rtx tmp;
  /* Tables of lo/hi 128-bit extract and insert patterns for 256-bit
     modes; the row index J is chosen in the 256-bit cases below.  */
  static rtx (*gen_extract[8][2]) (rtx, rtx)
    = {
	{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
	{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
	{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
	{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
	{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
	{ gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
	{ gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf },
	{ gen_vec_extract_lo_v16bf, gen_vec_extract_hi_v16bf }
      };
  static rtx (*gen_insert[8][2]) (rtx, rtx, rtx)
    = {
	{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
	{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
	{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
	{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
	{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
	{ gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
	{ gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
	{ gen_vec_set_lo_v16bf, gen_vec_set_hi_v16bf },
      };
  int i, j, n;
  /* When MMODE is set, the AVX512 blendm path at the bottom is used;
     MMODE is the mode of the blend mask immediate/register.  */
  machine_mode mmode = VOIDmode;
  rtx (*gen_blendm) (rtx, rtx, rtx, rtx);

  switch (mode)
    {
    case E_V2SImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_merge)
	break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (mmx_ok)
	{
	  /* Rebuild the 2-element vector: extract the untouched element
	     and VEC_CONCAT it with VAL in the right order.  */
	  tmp = gen_reg_rtx (GET_MODE_INNER (mode));
	  ix86_expand_vector_extract (true, tmp, target, 1 - elt);
	  if (elt == 0)
	    tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
	  else
	    tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
	  emit_insn (gen_rtx_SET (target, tmp));
	  return;
	}
      break;

    case E_V2DImode:
      use_vec_merge = TARGET_SSE4_1 && TARGET_64BIT;
      if (use_vec_merge)
	break;

      /* Same extract-and-concat rebuild as the V2SF case above.  */
      tmp = gen_reg_rtx (GET_MODE_INNER (mode));
      ix86_expand_vector_extract (false, tmp, target, 1 - elt);
      if (elt == 0)
	tmp = gen_rtx_VEC_CONCAT (mode, val, tmp);
      else
	tmp = gen_rtx_VEC_CONCAT (mode, tmp, val);
      emit_insn (gen_rtx_SET (target, tmp));
      return;

    case E_V2DFmode:
      /* NB: For ELT == 0, use standard scalar operation patterns which
	 preserve the rest of the vector for combiner:

	 (vec_merge:V2DF
	   (vec_duplicate:V2DF (reg:DF))
	   (reg:V2DF)
	   (const_int 1))
       */
      if (elt == 0)
	goto do_vec_merge;

      {
	rtx op0, op1;

	/* For the two element vectors, we implement a VEC_CONCAT with
	   the extraction of the other element.  */

	tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (1 - elt)));
	tmp = gen_rtx_VEC_SELECT (inner_mode, target, tmp);

	if (elt == 0)
	  op0 = val, op1 = tmp;
	else
	  op0 = tmp, op1 = val;

	tmp = gen_rtx_VEC_CONCAT (mode, op0, op1);
	emit_insn (gen_rtx_SET (target, tmp));
      }
      return;

    case E_V4SFmode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      /* Pre-SSE4.1: insert into element 0 (movss-style vec_merge) and
	 use shufps to move the new element into place.  */
      switch (elt)
	{
	case 0:
	  use_vec_merge = true;
	  break;

	case 1:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* target = A A B B */
	  emit_insn (gen_vec_interleave_lowv4sf (target, target, target));
	  /* target = X A B B */
	  ix86_expand_vector_set (false, target, val, 0);
	  /* target = A X C D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const1_rtx, const0_rtx,
					  GEN_INT (2+4), GEN_INT (3+4)));
	  return;

	case 2:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B X D */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (0+4), GEN_INT (3+4)));
	  return;

	case 3:
	  /* tmp = target = A B C D */
	  tmp = copy_to_reg (target);
	  /* tmp = X B C D */
	  ix86_expand_vector_set (false, tmp, val, 0);
	  /* target = A B C X */
	  emit_insn (gen_sse_shufps_v4sf (target, target, tmp,
					  const0_rtx, const1_rtx,
					  GEN_INT (2+4), GEN_INT (0+4)));
	  return;

	default:
	  gcc_unreachable ();
	}
      break;

    case E_V4SImode:
      use_vec_merge = TARGET_SSE4_1;
      if (use_vec_merge)
	break;

      /* Element 0 handled by vec_merge below.  */
      if (elt == 0)
	{
	  use_vec_merge = true;
	  break;
	}

      if (TARGET_SSE2)
	{
	  /* With SSE2, use integer shuffles to swap element 0 and ELT,
	     store into element 0, then shuffle them back.  */

	  rtx order[4];

	  order[0] = GEN_INT (elt);
	  order[1] = const1_rtx;
	  order[2] = const2_rtx;
	  order[3] = GEN_INT (3);
	  /* The swap permutation is its own inverse, so the same
	     pshufd is emitted before and after the insert.  */
	  order[elt] = const0_rtx;

	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));

	  ix86_expand_vector_set (false, target, val, 0);

	  emit_insn (gen_sse2_pshufd_1 (target, target, order[0],
					order[1], order[2], order[3]));
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  rtx t = gen_reg_rtx (V4SFmode);
	  emit_move_insn (t, gen_lowpart (V4SFmode, target));
	  ix86_expand_vector_set (false, t, gen_lowpart (SFmode, val), elt);
	  emit_move_insn (target, gen_lowpart (mode, t));
	}
      return;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V2HImode:
    case E_V2HFmode:
    case E_V2BFmode:
      use_vec_merge = TARGET_SSE2;
      break;
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
      use_vec_merge = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
    case E_V4QImode:
      use_vec_merge = TARGET_SSE4_1;
      break;

    case E_V8QImode:
      use_vec_merge = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      break;

    case E_V32QImode:
      half_mode = V16QImode;
      j = 0;
      n = 16;
      goto half;

    case E_V16HFmode:
    case E_V16BFmode:
      /* For ELT == 0, vec_setv8hf_0 can save 1 vpbroadcastw.  */
      if (TARGET_AVX2 && elt != 0)
	{
	  mmode = SImode;
	  gen_blendm = ((mode == E_V16HFmode) ? gen_avx2_pblendph_1
					      : gen_avx2_pblendbf_1);
	  /* vpblendw takes its mask as an immediate, not a register.  */
	  blendm_const = true;
	  break;
	}
      else
	{
	  half_mode = ((mode == E_V16HFmode) ? V8HFmode : V8BFmode);
	  j = ((mode == E_V16HFmode) ? 6 : 7);
	  n = 8;
	  goto half;
	}

    case E_V16HImode:
      half_mode = V8HImode;
      j = 1;
      n = 8;
      goto half;

    case E_V8SImode:
      half_mode = V4SImode;
      j = 2;
      n = 4;
      goto half;

    case E_V4DImode:
      half_mode = V2DImode;
      j = 3;
      n = 2;
      goto half;

    case E_V8SFmode:
      half_mode = V4SFmode;
      j = 4;
      n = 4;
      goto half;

    case E_V4DFmode:
      half_mode = V2DFmode;
      j = 5;
      n = 2;
      goto half;

    half:
      /* Compute offset: I selects the lo/hi half, ELT is rebased into
	 that half (N elements per half).  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 1);

      /* Extract the half.  */
      tmp = gen_reg_rtx (half_mode);
      emit_insn (gen_extract[j][i] (tmp, target));

      /* Put val in tmp at elt.  */
      ix86_expand_vector_set (false, tmp, val, elt);

      /* Put it back.  */
      emit_insn (gen_insert[j][i] (target, target, tmp));
      return;

    case E_V8DFmode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8df;
	}
      break;

    case E_V8DImode:
      if (TARGET_AVX512F)
	{
	  mmode = QImode;
	  gen_blendm = gen_avx512f_blendmv8di;
	}
      break;

    case E_V16SFmode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16sf;
	}
      break;

    case E_V16SImode:
      if (TARGET_AVX512F)
	{
	  mmode = HImode;
	  gen_blendm = gen_avx512f_blendmv16si;
	}
      break;

    case E_V32HFmode:
      if (TARGET_AVX512BW)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx512bw_blendmv32hf;
	}
      break;
    case E_V32BFmode:
      if (TARGET_AVX512BW)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx512bw_blendmv32bf;
	}
      break;
    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  mmode = SImode;
	  gen_blendm = gen_avx512bw_blendmv32hi;
	}
      else if (TARGET_AVX512F)
	{
	  /* No 512-bit word blend without AVX512BW: recurse on a
	     128-bit quarter instead.  */
	  half_mode = E_V8HImode;
	  n = 8;
	  goto quarter;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  mmode = DImode;
	  gen_blendm = gen_avx512bw_blendmv64qi;
	}
      else if (TARGET_AVX512F)
	{
	  half_mode = E_V16QImode;
	  n = 16;
	  goto quarter;
	}
      break;

    quarter:
      /* Compute offset: I selects one of the four 128-bit quarters,
	 ELT is rebased into that quarter (N elements per quarter).  */
      i = elt / n;
      elt %= n;

      gcc_assert (i <= 3);

      {
	/* Extract the quarter.  */
	tmp = gen_reg_rtx (V4SImode);
	rtx tmp2 = gen_lowpart (V16SImode, target);
	rtx mask = gen_reg_rtx (QImode);

	/* All-ones mask so vextracti32x4/vinserti32x4 write every lane.  */
	emit_move_insn (mask, constm1_rtx);
	emit_insn (gen_avx512f_vextracti32x4_mask (tmp, tmp2, GEN_INT (i),
						   tmp, mask));

	tmp2 = gen_reg_rtx (half_mode);
	emit_move_insn (tmp2, gen_lowpart (half_mode, tmp));
	tmp = tmp2;

	/* Put val in tmp at elt.  */
	ix86_expand_vector_set (false, tmp, val, elt);

	/* Put it back.  */
	tmp2 = gen_reg_rtx (V16SImode);
	rtx tmp3 = gen_lowpart (V16SImode, target);
	mask = gen_reg_rtx (HImode);
	emit_move_insn (mask, constm1_rtx);
	tmp = gen_lowpart (V4SImode, tmp);
	emit_insn (gen_avx512f_vinserti32x4_mask (tmp2, tmp3, tmp, GEN_INT (i),
						  tmp3, mask));
	emit_move_insn (target, gen_lowpart (mode, tmp2));
      }
      return;

    default:
      break;
    }

  if (mmode != VOIDmode)
    {
      /* AVX512 blendm path: broadcast VAL and blend it in under a
	 single-bit mask selecting element ELT.  */
      tmp = gen_reg_rtx (mode);
      emit_insn (gen_rtx_SET (tmp, gen_rtx_VEC_DUPLICATE (mode, val)));
      rtx merge_mask = gen_int_mode (HOST_WIDE_INT_1U << elt, mmode);
      /* The avx512*_blendm<mode> expanders have different operand order
	 from VEC_MERGE.  In VEC_MERGE, the first input operand is used for
	 elements where the mask is set and second input operand otherwise,
	 in {sse,avx}*_*blend* the first input operand is used for elements
	 where the mask is clear and second input operand otherwise.  */
      if (!blendm_const)
	merge_mask = force_reg (mmode, merge_mask);
      emit_insn (gen_blendm (target, target, tmp, merge_mask));
    }
  else if (use_vec_merge)
    {
    do_vec_merge:
      /* Single-insn insert: duplicate VAL and merge under bit ELT.  */
      if (!nonimmediate_operand (val, inner_mode))
	val = force_reg (inner_mode, val);
      tmp = gen_rtx_VEC_DUPLICATE (mode, val);
      tmp = gen_rtx_VEC_MERGE (mode, tmp, target,
			       GEN_INT (HOST_WIDE_INT_1U << elt));
      emit_insn (gen_rtx_SET (target, tmp));
    }
  else
    {
      /* Last resort: bounce the vector through a stack slot and store
	 the element with a scalar move.  */
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, target);

      tmp = adjust_address (mem, inner_mode, elt * GET_MODE_SIZE (inner_mode));
      emit_move_insn (tmp, val);

      emit_move_insn (target, mem);
    }
}
19321 :
/* Extract element ELT of vector VEC into scalar register TARGET.
   MMX_OK permits patterns that operate on 64-bit MMX vector modes
   (presumably only safe when the caller manages MMX state — confirm
   against callers).  Strategy, per mode: use a direct vec_select
   when the ISA has one; otherwise shuffle the wanted element into
   lane 0 first; for 256/512-bit vectors extract the containing
   half and recurse; as a last resort go through a stack temporary.  */
void
ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
{
  machine_mode mode = GET_MODE (vec);
  machine_mode inner_mode = GET_MODE_INNER (mode);
  bool use_vec_extr = false;
  rtx tmp;

  switch (mode)
    {
    case E_V2SImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      if (use_vec_extr)
	break;
      /* FALLTHRU */

    case E_V2SFmode:
      if (!mmx_ok)
	break;
      /* FALLTHRU */

    case E_V2DFmode:
    case E_V2DImode:
    case E_V2TImode:
    case E_V4TImode:
      use_vec_extr = true;
      break;

    case E_V4SFmode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      /* Without SSE4.1 extractps, shuffle the desired element into
	 lane 0 and extract from there (elt is reset to 0 below).  */
      switch (elt)
	{
	case 0:
	  tmp = vec;
	  break;

	case 1:
	case 3:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_sse_shufps_v4sf (tmp, vec, vec,
					  GEN_INT (elt), GEN_INT (elt),
					  GEN_INT (elt+4), GEN_INT (elt+4)));
	  break;

	case 2:
	  tmp = gen_reg_rtx (mode);
	  emit_insn (gen_vec_interleave_highv4sf (tmp, vec, vec));
	  break;

	default:
	  gcc_unreachable ();
	}
      vec = tmp;
      use_vec_extr = true;
      elt = 0;
      break;

    case E_V4SImode:
      use_vec_extr = TARGET_SSE4_1;
      if (use_vec_extr)
	break;

      if (TARGET_SSE2)
	{
	  /* Shuffle the element into lane 0, then extract lane 0.  */
	  switch (elt)
	    {
	    case 0:
	      tmp = vec;
	      break;

	    case 1:
	    case 3:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_sse2_pshufd_1 (tmp, vec,
					    GEN_INT (elt), GEN_INT (elt),
					    GEN_INT (elt), GEN_INT (elt)));
	      break;

	    case 2:
	      tmp = gen_reg_rtx (mode);
	      emit_insn (gen_vec_interleave_highv4si (tmp, vec, vec));
	      break;

	    default:
	      gcc_unreachable ();
	    }
	  vec = tmp;
	  use_vec_extr = true;
	  elt = 0;
	}
      else
	{
	  /* For SSE1, we have to reuse the V4SF code.  */
	  ix86_expand_vector_extract (false, gen_lowpart (SFmode, target),
				      gen_lowpart (V4SFmode, vec), elt);
	  return;
	}
      break;

    case E_V8HImode:
    case E_V8HFmode:
    case E_V8BFmode:
    case E_V2HImode:
    case E_V2HFmode:
    case E_V2BFmode:
      use_vec_extr = TARGET_SSE2;
      break;
    case E_V4HImode:
    case E_V4HFmode:
    case E_V4BFmode:
      use_vec_extr = mmx_ok && (TARGET_SSE || TARGET_3DNOW_A);
      break;

    case E_V16QImode:
      use_vec_extr = TARGET_SSE4_1;
      /* Without SSE4.1 pextrb, element 0 can still be had cheaply by
	 extracting the low SImode lane and taking its low byte, when
	 a vec->int move is acceptable.  */
      if (!use_vec_extr
	  && TARGET_SSE2
	  && elt == 0
	  && (optimize_insn_for_size_p () || TARGET_INTER_UNIT_MOVES_FROM_VEC))
	{
	  tmp = gen_reg_rtx (SImode);
	  ix86_expand_vector_extract (false, tmp, gen_lowpart (V4SImode, vec),
				      0);
	  emit_insn (gen_rtx_SET (target, gen_lowpart (QImode, tmp)));
	  return;
	}
      break;
    case E_V4QImode:
      use_vec_extr = TARGET_SSE4_1;
      break;

    /* 256-bit modes: extract the 128-bit half containing ELT and
       recurse with the within-half index.  */
    case E_V8SFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SFmode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8sf (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8sf (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DFmode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DFmode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4df (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4df (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    case E_V32QImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V16QImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V8HImode);
	  if (elt < 8)
	    emit_insn (gen_vec_extract_lo_v16hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v16hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
	  return;
	}
      break;

    case E_V8SImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V4SImode);
	  if (elt < 4)
	    emit_insn (gen_vec_extract_lo_v8si (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v8si (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 3);
	  return;
	}
      break;

    case E_V4DImode:
      if (TARGET_AVX)
	{
	  tmp = gen_reg_rtx (V2DImode);
	  if (elt < 2)
	    emit_insn (gen_vec_extract_lo_v4di (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v4di (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 1);
	  return;
	}
      break;

    /* 512-bit modes: likewise extract the 256-bit half and recurse.  */
    case E_V32HImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V16HImode);
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo_v32hi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v32hi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V64QImode:
      if (TARGET_AVX512BW)
	{
	  tmp = gen_reg_rtx (V32QImode);
	  if (elt < 32)
	    emit_insn (gen_vec_extract_lo_v64qi (tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi_v64qi (tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 31);
	  return;
	}
      break;

    case E_V16SFmode:
      tmp = gen_reg_rtx (V8SFmode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16sf (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16sf (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DFmode:
      tmp = gen_reg_rtx (V4DFmode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8df (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8df (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V16SImode:
      tmp = gen_reg_rtx (V8SImode);
      if (elt < 8)
	emit_insn (gen_vec_extract_lo_v16si (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v16si (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 7);
      return;

    case E_V8DImode:
      tmp = gen_reg_rtx (V4DImode);
      if (elt < 4)
	emit_insn (gen_vec_extract_lo_v8di (tmp, vec));
      else
	emit_insn (gen_vec_extract_hi_v8di (tmp, vec));
      ix86_expand_vector_extract (false, target, tmp, elt & 3);
      return;

    case E_V32HFmode:
    case E_V32BFmode:
      if (TARGET_AVX512BW)
	{
	  tmp = (mode == E_V32HFmode
		 ? gen_reg_rtx (V16HFmode)
		 : gen_reg_rtx (V16BFmode));
	  if (elt < 16)
	    emit_insn (gen_vec_extract_lo (mode, tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi (mode, tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 15);
	  return;
	}
      break;

    case E_V16HFmode:
    case E_V16BFmode:
      if (TARGET_AVX)
	{
	  tmp = (mode == E_V16HFmode
		 ? gen_reg_rtx (V8HFmode)
		 : gen_reg_rtx (V8BFmode));
	  if (elt < 8)
	    emit_insn (gen_vec_extract_lo (mode, tmp, vec));
	  else
	    emit_insn (gen_vec_extract_hi (mode, tmp, vec));
	  ix86_expand_vector_extract (false, target, tmp, elt & 7);
	  return;
	}
      break;

    case E_V8QImode:
      use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
      /* ??? Could extract the appropriate HImode element and shift.  */
      break;

    default:
      break;
    }

  if (use_vec_extr)
    {
      /* Emit a direct (vec_select VEC (parallel [ELT])).  */
      tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (elt)));
      tmp = gen_rtx_VEC_SELECT (inner_mode, vec, tmp);

      /* Let the rtl optimizers know about the zero extension performed.  */
      if (inner_mode == QImode || inner_mode == HImode)
	{
	  rtx reg = gen_reg_rtx (SImode);
	  tmp = gen_rtx_ZERO_EXTEND (SImode, tmp);
	  emit_move_insn (reg, tmp);
	  tmp = gen_lowpart (inner_mode, reg);
	  SUBREG_PROMOTED_VAR_P (tmp) = 1;
	  SUBREG_PROMOTED_SET (tmp, 1);
	}

      emit_move_insn (target, tmp);
    }
  else
    {
      /* Fallback: spill the vector to the stack and load one element.  */
      rtx mem = assign_stack_temp (mode, GET_MODE_SIZE (mode));

      emit_move_insn (mem, vec);

      tmp = adjust_address (mem, inner_mode, elt*GET_MODE_SIZE (inner_mode));
      emit_move_insn (target, tmp);
    }
}
19665 :
19666 : /* Generate code to copy vector bits i / 2 ... i - 1 from vector SRC
19667 : to bits 0 ... i / 2 - 1 of vector DEST, which has the same mode.
19668 : The upper bits of DEST are undefined, though they shouldn't cause
19669 : exceptions (some bits from src or all zeros are ok). */
19670 :
/* Worker for ix86_expand_reduc: move the upper half (bits I/2 .. I-1)
   of SRC down to the bottom of DEST.  Uses shuffles where a lane
   permutation suffices, otherwise a whole-register logical right
   shift by I/2 bits through a wide integer mode.  When an intermediate
   register D of a different mode is used, the result is copied back
   into DEST at the end.  */
static void
emit_reduc_half (rtx dest, rtx src, int i)
{
  rtx tem, d = dest;
  switch (GET_MODE (src))
    {
    case E_V4SFmode:
      /* movhlps for the 64-bit half, shufps to replicate element 1
	 for the 32-bit step.  */
      if (i == 128)
	tem = gen_sse_movhlps (dest, src, src);
      else
	tem = gen_sse_shufps_v4sf (dest, src, src, const1_rtx, const1_rtx,
				   GEN_INT (1 + 4), GEN_INT (1 + 4));
      break;
    case E_V2DFmode:
      tem = gen_vec_interleave_highv2df (dest, src, src);
      break;
    case E_V4QImode:
      /* 32-bit MMX-ish vector: shift right by I/2 bits in V1SImode.  */
      d = gen_reg_rtx (V1SImode);
      tem = gen_mmx_lshrv1si3 (d, gen_lowpart (V1SImode, src),
			       GEN_INT (i / 2));
      break;
    case E_V8QImode:
    case E_V4HImode:
      /* 64-bit vector: shift right by I/2 bits in V1DImode.  */
      d = gen_reg_rtx (V1DImode);
      tem = gen_mmx_lshrv1di3 (d, gen_lowpart (V1DImode, src),
			       GEN_INT (i / 2));
      break;
    case E_V16QImode:
    case E_V8HImode:
    case E_V8HFmode:
    case E_V4SImode:
    case E_V2DImode:
      /* On some targets a pshufd/pshuflw is preferable to psrldq
	 (tuning flag); it only applies for shifts expressible as a
	 dword/word permute.  */
      if (TARGET_SSE_REDUCTION_PREFER_PSHUF)
	{
	  if (i == 128)
	    {
	      d = gen_reg_rtx (V4SImode);
	      tem = gen_sse2_pshufd_1 (
		d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
		GEN_INT (2), GEN_INT (3), GEN_INT (2), GEN_INT (3));
	      break;
	    }
	  else if (i == 64)
	    {
	      d = gen_reg_rtx (V4SImode);
	      tem = gen_sse2_pshufd_1 (
		d, force_reg (V4SImode, gen_lowpart (V4SImode, src)),
		GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
	      break;
	    }
	  else if (i == 32)
	    {
	      d = gen_reg_rtx (V8HImode);
	      tem = gen_sse2_pshuflw_1 (
		d, force_reg (V8HImode, gen_lowpart (V8HImode, src)),
		GEN_INT (1), GEN_INT (1), GEN_INT (1), GEN_INT (1));
	      break;
	    }
	}
      /* Generic 128-bit case: whole-register shift right by I/2 bits.  */
      d = gen_reg_rtx (V1TImode);
      tem = gen_sse2_lshrv1ti3 (d, gen_lowpart (V1TImode, src),
				GEN_INT (i / 2));
      break;
    case E_V8SFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v8sf3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufps256 (dest, src, src,
				 GEN_INT (i == 128 ? 2 + (3 << 2) : 1));
      break;
    case E_V4DFmode:
      if (i == 256)
	tem = gen_avx_vperm2f128v4df3 (dest, src, src, const1_rtx);
      else
	tem = gen_avx_shufpd256 (dest, src, src, const1_rtx);
      break;
    case E_V32QImode:
    case E_V16HImode:
    case E_V16HFmode:
    case E_V8SImode:
    case E_V4DImode:
      if (i == 256)
	{
	  /* Swap the 128-bit lanes with vperm2i128.  */
	  if (GET_MODE (dest) != V4DImode)
	    d = gen_reg_rtx (V4DImode);
	  tem = gen_avx2_permv2ti (d, gen_lowpart (V4DImode, src),
				   gen_lowpart (V4DImode, src),
				   const1_rtx);
	}
      else
	{
	  /* Per-lane byte shift right by I/2 bits.  */
	  d = gen_reg_rtx (V2TImode);
	  tem = gen_avx2_lshrv2ti3 (d, gen_lowpart (V2TImode, src),
				    GEN_INT (i / 2));
	}
      break;
    case E_V64QImode:
    case E_V32HImode:
    case E_V32HFmode:
      if (i < 64)
	{
	  d = gen_reg_rtx (V4TImode);
	  tem = gen_avx512bw_lshrv4ti3 (d, gen_lowpart (V4TImode, src),
					GEN_INT (i / 2));
	  break;
	}
      /* FALLTHRU */
    case E_V16SImode:
    case E_V16SFmode:
    case E_V8DImode:
    case E_V8DFmode:
      /* 512-bit: shuffle 128-bit blocks down for I > 128, otherwise
	 permute dwords within each 128-bit block.  */
      if (i > 128)
	tem = gen_avx512f_shuf_i32x4_1 (gen_lowpart (V16SImode, dest),
					gen_lowpart (V16SImode, src),
					gen_lowpart (V16SImode, src),
					GEN_INT (0x4 + (i == 512 ? 4 : 0)),
					GEN_INT (0x5 + (i == 512 ? 4 : 0)),
					GEN_INT (0x6 + (i == 512 ? 4 : 0)),
					GEN_INT (0x7 + (i == 512 ? 4 : 0)),
					GEN_INT (0xC), GEN_INT (0xD),
					GEN_INT (0xE), GEN_INT (0xF),
					GEN_INT (0x10), GEN_INT (0x11),
					GEN_INT (0x12), GEN_INT (0x13),
					GEN_INT (0x14), GEN_INT (0x15),
					GEN_INT (0x16), GEN_INT (0x17));
      else
	tem = gen_avx512f_pshufd_1 (gen_lowpart (V16SImode, dest),
				    gen_lowpart (V16SImode, src),
				    GEN_INT (i == 128 ? 0x2 : 0x1),
				    GEN_INT (0x3),
				    GEN_INT (0x3),
				    GEN_INT (0x3),
				    GEN_INT (i == 128 ? 0x6 : 0x5),
				    GEN_INT (0x7),
				    GEN_INT (0x7),
				    GEN_INT (0x7),
				    GEN_INT (i == 128 ? 0xA : 0x9),
				    GEN_INT (0xB),
				    GEN_INT (0xB),
				    GEN_INT (0xB),
				    GEN_INT (i == 128 ? 0xE : 0xD),
				    GEN_INT (0xF),
				    GEN_INT (0xF),
				    GEN_INT (0xF));
      break;
    default:
      gcc_unreachable ();
    }
  emit_insn (tem);
  /* If the pattern produced its result in an intermediate mode,
     copy it back into DEST.  */
  if (d != dest)
    emit_move_insn (dest, gen_lowpart (GET_MODE (dest), d));
}
19823 :
19824 : /* Expand a vector reduction. FN is the binary pattern to reduce;
19825 : DEST is the destination; IN is the input vector. */
19826 :
19827 : void
19828 20886 : ix86_expand_reduc (rtx (*fn) (rtx, rtx, rtx), rtx dest, rtx in)
19829 : {
19830 20886 : rtx half, dst, vec = in;
19831 20886 : machine_mode mode = GET_MODE (in);
19832 20886 : int i;
19833 :
19834 : /* SSE4 has a special instruction for V8HImode UMIN reduction. */
19835 20886 : if (TARGET_SSE4_1
19836 9943 : && mode == V8HImode
19837 780 : && fn == gen_uminv8hi3)
19838 : {
19839 4 : emit_insn (gen_sse4_1_phminposuw (dest, in));
19840 4 : return;
19841 : }
19842 :
19843 41764 : for (i = GET_MODE_BITSIZE (mode);
19844 125480 : i > GET_MODE_UNIT_BITSIZE (mode);
19845 41858 : i >>= 1)
19846 : {
19847 41858 : half = gen_reg_rtx (mode);
19848 41858 : emit_reduc_half (half, vec, i);
19849 83716 : if (i == GET_MODE_UNIT_BITSIZE (mode) * 2)
19850 : dst = dest;
19851 : else
19852 20976 : dst = gen_reg_rtx (mode);
19853 41858 : emit_insn (fn (dst, half, vec));
19854 41858 : vec = dst;
19855 : }
19856 : }
19857 :
19858 : /* Output code to perform a conditional jump to LABEL, if C2 flag in
19859 : FP status register is set. */
19860 :
19861 : void
19862 284 : ix86_emit_fp_unordered_jump (rtx label)
19863 : {
19864 284 : rtx reg = gen_reg_rtx (HImode);
19865 284 : rtx_insn *insn;
19866 284 : rtx temp;
19867 :
19868 284 : emit_insn (gen_x86_fnstsw_1 (reg));
19869 :
19870 284 : if (TARGET_SAHF && (TARGET_USE_SAHF || optimize_insn_for_size_p ()))
19871 : {
19872 37 : emit_insn (gen_x86_sahf_1 (reg));
19873 :
19874 37 : temp = gen_rtx_REG (CCmode, FLAGS_REG);
19875 37 : temp = gen_rtx_UNORDERED (VOIDmode, temp, const0_rtx);
19876 : }
19877 : else
19878 : {
19879 247 : emit_insn (gen_testqi_ext_1_ccno (reg, GEN_INT (0x04)));
19880 :
19881 247 : temp = gen_rtx_REG (CCNOmode, FLAGS_REG);
19882 247 : temp = gen_rtx_NE (VOIDmode, temp, const0_rtx);
19883 : }
19884 :
19885 284 : temp = gen_rtx_IF_THEN_ELSE (VOIDmode, temp,
19886 : gen_rtx_LABEL_REF (VOIDmode, label),
19887 : pc_rtx);
19888 284 : insn = emit_jump_insn (gen_rtx_SET (pc_rtx, temp));
19889 284 : predict_jump (REG_BR_PROB_BASE * 10 / 100);
19890 284 : JUMP_LABEL (insn) = label;
19891 284 : }
19892 :
/* Output code to perform a sinh XFmode calculation:
     sinh (x) = copysign (0.5 * (e + e / (e + 1)), x), e = expm1 (|x|),
   using expm1 near zero for accuracy; the sign of OP1 is read from
   the fxam result and applied at the end.  */

void
ix86_emit_i387_sinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) -- classify op1 before it is consumed.  */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (|op1|); e2 is used as a temporary for |op1| here.  */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 1.0) + e1 */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_divxf3 (e2, e1, e2));
  emit_insn (gen_addxf3 (e2, e2, e1));

  /* flags = signbit (op1), from bit 0x02 of the fxam result.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
19941 :
19942 : /* Output code to perform an cosh XFmode calculation. */
19943 :
19944 : void
19945 3 : ix86_emit_i387_cosh (rtx op0, rtx op1)
19946 : {
19947 3 : rtx e1 = gen_reg_rtx (XFmode);
19948 3 : rtx e2 = gen_reg_rtx (XFmode);
19949 3 : rtx half = const_double_from_real_value (dconsthalf, XFmode);
19950 3 : rtx cst1;
19951 :
19952 : /* e1 = exp (op1) */
19953 3 : emit_insn (gen_expxf2 (e1, op1));
19954 :
19955 : /* e2 = e1 + 1.0 / e1 */
19956 3 : cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
19957 3 : emit_insn (gen_divxf3 (e2, cst1, e1));
19958 3 : emit_insn (gen_addxf3 (e2, e1, e2));
19959 :
19960 : /* op0 = 0.5 * e2 */
19961 3 : half = force_reg (XFmode, half);
19962 3 : emit_insn (gen_mulxf3 (op0, e2, half));
19963 3 : }
19964 :
/* Output code to perform a tanh XFmode calculation:
     tanh (x) = -copysign (e / (e + 2), -x), e = expm1 (-|2x|),
   i.e. e / (e + 2) negated when x is positive.  */

void
ix86_emit_i387_tanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst2, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) -- classify op1 before it is consumed.  */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = expm1 (-|2 * op1|); e2 is used as a temporary.  */
  emit_insn (gen_addxf3 (e2, op1, op1));
  emit_insn (gen_absxf2 (e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_expm1xf2 (e1, e2));

  /* e2 = e1 / (e1 + 2.0) */
  cst2 = force_reg (XFmode, CONST2_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst2));
  emit_insn (gen_divxf3 (e2, e1, e2));

  /* flags = signbit (op1), from bit 0x02 of the fxam result.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
20011 :
/* Output code to perform an asinh XFmode calculation:
     asinh (x) = copysign (log1p (x^2 / (sqrt (x^2 + 1) + 1) + |x|), x),
   which avoids cancellation for small |x| by going through log1p.  */

void
ix86_emit_i387_asinh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* e1 = op1^2; e2 = sqrt (op1^2 + 1.0) + 1.0 */
  emit_insn (gen_mulxf3 (e1, op1, op1));
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e2, e1, cst1));
  emit_insn (gen_sqrtxf2 (e2, e2));
  emit_insn (gen_addxf3 (e2, e2, cst1));

  /* e1 = e1 / e2 */
  emit_insn (gen_divxf3 (e1, e1, e2));

  /* scratch = fxam (op1) -- classify op1 before it is consumed.  */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = e1 + |op1|; e2 reused as a temporary for |op1|.  */
  emit_insn (gen_absxf2 (e2, op1));
  emit_insn (gen_addxf3 (e1, e1, e2));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1), from bit 0x02 of the fxam result.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, e2);
}
20064 :
20065 : /* Output code to perform an acosh XFmode calculation. */
20066 :
20067 : void
20068 0 : ix86_emit_i387_acosh (rtx op0, rtx op1)
20069 : {
20070 0 : rtx e1 = gen_reg_rtx (XFmode);
20071 0 : rtx e2 = gen_reg_rtx (XFmode);
20072 0 : rtx cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
20073 :
20074 : /* e2 = sqrt (op1 + 1.0) */
20075 0 : emit_insn (gen_addxf3 (e2, op1, cst1));
20076 0 : emit_insn (gen_sqrtxf2 (e2, e2));
20077 :
20078 : /* e1 = sqrt (op1 - 1.0) */
20079 0 : emit_insn (gen_subxf3 (e1, op1, cst1));
20080 0 : emit_insn (gen_sqrtxf2 (e1, e1));
20081 :
20082 : /* e1 = e1 * e2 */
20083 0 : emit_insn (gen_mulxf3 (e1, e1, e2));
20084 :
20085 : /* e1 = e1 + op1 */
20086 0 : emit_insn (gen_addxf3 (e1, e1, op1));
20087 :
20088 : /* op0 = log (e1) */
20089 0 : emit_insn (gen_logxf2 (op0, e1));
20090 0 : }
20091 :
/* Output code to perform an atanh XFmode calculation:
     atanh (x) = copysign (0.5 * log1p (2|x| / (1 - |x|)), x),
   computed here as log1p (-2|x| / (|x| + 1)) with the sign applied
   via negation, keeping accuracy for small |x| through log1p.  */

void
ix86_emit_i387_atanh (rtx op0, rtx op1)
{
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx cst1, tmp;
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx_insn *insn;

  /* scratch = fxam (op1) -- classify op1 before it is consumed.  */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e2 = |op1| */
  emit_insn (gen_absxf2 (e2, op1));

  /* e1 = -(e2 + e2) / (e2 + 1.0) */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_addxf3 (e1, e2, cst1));
  emit_insn (gen_addxf3 (e2, e2, e2));
  emit_insn (gen_negxf2 (e2, e2));
  emit_insn (gen_divxf3 (e1, e2, e1));

  /* e2 = log1p (e1) */
  ix86_emit_i387_log1p (e2, e1);

  /* flags = signbit (op1), from bit 0x02 of the fxam result.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (!flags) then e2 = -e2 */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_NE (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (gen_negxf2 (e2, e2));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  /* op0 = 0.5 * e2 */
  half = force_reg (XFmode, half);
  emit_insn (gen_mulxf3 (op0, e2, half));
}
20143 :
/* Output code to perform a log1p XFmode calculation.
   Uses fyl2xp1 when |OP1| is below 1 - sqrt(2)/2 (~0.2928932...), the
   range for which that instruction is specified; otherwise falls back
   to fyl2x (OP1 + 1).  Both are scaled by ln2 to get a natural log.  */

void
ix86_emit_i387_log1p (rtx op0, rtx op1)
{
  rtx_code_label *label1 = gen_label_rtx ();
  rtx_code_label *label2 = gen_label_rtx ();

  rtx tmp = gen_reg_rtx (XFmode);
  rtx res = gen_reg_rtx (XFmode);
  rtx cst, cstln2, cst1;
  rtx_insn *insn;

  /* The emit_jump call emits pending stack adjust, make sure it is emitted
     before the conditional jump, otherwise the stack adjustment will be
     only conditional.  */
  do_pending_stack_adjust ();

  /* 1 - sqrt(2)/2: the fyl2xp1 argument-range threshold.  */
  cst = const_double_from_real_value
    (REAL_VALUE_ATOF ("0.29289321881345247561810596348408353", XFmode), XFmode);
  cstln2 = force_reg (XFmode, standard_80387_constant_rtx (4)); /* fldln2 */

  emit_insn (gen_absxf2 (tmp, op1));

  /* Branch to the fallback when |op1| >= threshold; predicted not
     taken since most log1p arguments are expected to be small.  */
  cst = force_reg (XFmode, cst);
  ix86_expand_branch (GE, tmp, cst, label1);
  predict_jump (REG_BR_PROB_BASE * 10 / 100);
  insn = get_last_insn ();
  JUMP_LABEL (insn) = label1;

  /* res = ln2 * log2 (op1 + 1) via fyl2xp1.  */
  emit_insn (gen_fyl2xp1xf3_i387 (res, op1, cstln2));
  emit_jump (label2);

  emit_label (label1);
  LABEL_NUSES (label1) = 1;

  /* Fallback: res = ln2 * log2 (op1 + 1.0) via fyl2x.  */
  cst1 = force_reg (XFmode, CONST1_RTX (XFmode));
  emit_insn (gen_rtx_SET (tmp, gen_rtx_PLUS (XFmode, op1, cst1)));
  emit_insn (gen_fyl2xxf3_i387 (res, tmp, cstln2));

  emit_label (label2);
  LABEL_NUSES (label2) = 1;

  emit_move_insn (op0, res);
}
20189 :
/* Emit code for round calculation: OP0 = round (OP1), i.e. rounding
   half-way cases away from zero, computed as
     round (a) = sgn (a) * floor (fabs (a) + 0.5).
   OP1 is SFmode, DFmode or XFmode; OP0 may be a float mode or an
   integer mode (HImode/SImode/DImode via lfloor).  */
void
ix86_emit_i387_round (rtx op0, rtx op1)
{
  machine_mode inmode = GET_MODE (op1);
  machine_mode outmode = GET_MODE (op0);
  rtx e1 = gen_reg_rtx (XFmode);
  rtx e2 = gen_reg_rtx (XFmode);
  rtx scratch = gen_reg_rtx (HImode);
  rtx flags = gen_rtx_REG (CCNOmode, FLAGS_REG);
  rtx half = const_double_from_real_value (dconsthalf, XFmode);
  rtx res = gen_reg_rtx (outmode);
  rtx_code_label *jump_label = gen_label_rtx ();
  rtx (*floor_insn) (rtx, rtx);
  rtx (*neg_insn) (rtx, rtx);
  rtx_insn *insn;
  rtx tmp;

  /* Widen the input to XFmode so all arithmetic is done in XFmode.  */
  switch (inmode)
    {
    case E_SFmode:
    case E_DFmode:
      tmp = gen_reg_rtx (XFmode);

      emit_insn (gen_rtx_SET (tmp, gen_rtx_FLOAT_EXTEND (XFmode, op1)));
      op1 = tmp;
      break;
    case E_XFmode:
      break;
    default:
      gcc_unreachable ();
    }

  /* Pick the floor and negate expanders matching the output mode.  */
  switch (outmode)
    {
    case E_SFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negsf2;
      break;
    case E_DFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negdf2;
      break;
    case E_XFmode:
      floor_insn = gen_frndintxf2_floor;
      neg_insn = gen_negxf2;
      break;
    case E_HImode:
      floor_insn = gen_lfloorxfhi2;
      neg_insn = gen_neghi2;
      break;
    case E_SImode:
      floor_insn = gen_lfloorxfsi2;
      neg_insn = gen_negsi2;
      break;
    case E_DImode:
      floor_insn = gen_lfloorxfdi2;
      neg_insn = gen_negdi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round(a) = sgn(a) * floor(fabs(a) + 0.5) */

  /* scratch = fxam(op1) -- classify op1 before it is consumed.  */
  emit_insn (gen_fxamxf2_i387 (scratch, op1));

  /* e1 = fabs(op1) */
  emit_insn (gen_absxf2 (e1, op1));

  /* e2 = e1 + 0.5 */
  half = force_reg (XFmode, half);
  emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (XFmode, e1, half)));

  /* res = floor(e2); for SF/DF outputs round in XFmode first, then
     truncate with a no-op truncation.  */
  switch (outmode)
    {
    case E_SFmode:
    case E_DFmode:
      {
	tmp = gen_reg_rtx (XFmode);

	emit_insn (floor_insn (tmp, e2));
	emit_insn (gen_rtx_SET (res,
				gen_rtx_UNSPEC (outmode, gen_rtvec (1, tmp),
						UNSPEC_TRUNC_NOOP)));
      }
      break;
    default:
      emit_insn (floor_insn (res, e2));
    }

  /* flags = signbit(a), from bit 0x02 of the fxam result.  */
  emit_insn (gen_testqi_ext_1_ccno (scratch, GEN_INT (0x02)));

  /* if (flags) then res = -res */
  tmp = gen_rtx_IF_THEN_ELSE (VOIDmode,
			      gen_rtx_EQ (VOIDmode, flags, const0_rtx),
			      gen_rtx_LABEL_REF (VOIDmode, jump_label),
			      pc_rtx);
  insn = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
  predict_jump (REG_BR_PROB_BASE * 50 / 100);
  JUMP_LABEL (insn) = jump_label;

  emit_insn (neg_insn (res, res));

  emit_label (jump_label);
  LABEL_NUSES (jump_label) = 1;

  emit_move_insn (op0, res);
}
20302 :
/* Output code to perform a Newton-Raphson approximation of a single precision
   floating point divide [http://en.wikipedia.org/wiki/N-th_root_algorithm].
   Emits RTL computing RES = A / B in MODE (a scalar or vector float mode)
   from a hardware reciprocal estimate refined by one N-R iteration.  */

void
ix86_emit_swdivsf (rtx res, rtx a, rtx b, machine_mode mode)
{
  rtx x0, x1, e0, e1;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  x1 = gen_reg_rtx (mode);

  b = force_reg (mode, b);

  /* x0 = rcp(b) estimate */
  if (mode == V16SFmode || mode == V8DFmode)
    {
      /* 512-bit modes have no plain RCP; use the AVX-512 rcp14 variant.  */
      emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						  UNSPEC_RCP14)));
    }
  else
    emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, b),
						UNSPEC_RCP)));

  unsigned vector_size = GET_MODE_SIZE (mode);

  /* (a - (rcp(b) * a * b)) * rcp(b) + rcp(b) * a
     N-R step with 2 fma implementation.  */
  if (TARGET_FMA
      || (TARGET_AVX512F && vector_size == 64)
      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    {
      /* e0 = x0 * a */
      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));
      /* e1 = e0 * b - a */
      emit_insn (gen_rtx_SET (e1, gen_rtx_FMA (mode, e0, b,
					       gen_rtx_NEG (mode, a))));
      /* res = - e1 * x0 + e0 */
      emit_insn (gen_rtx_SET (res, gen_rtx_FMA (mode,
						gen_rtx_NEG (mode, e1),
						x0, e0)));
    }
  else
    /* a / b = a * ((rcp(b) + rcp(b)) - (b * rcp(b) * rcp (b))) */
    {
      /* e0 = x0 * b */
      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, b)));

      /* e1 = x0 + x0 */
      emit_insn (gen_rtx_SET (e1, gen_rtx_PLUS (mode, x0, x0)));

      /* e0 = x0 * e0 */
      emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, e0)));

      /* x1 = e1 - e0 */
      emit_insn (gen_rtx_SET (x1, gen_rtx_MINUS (mode, e1, e0)));

      /* res = a * x1 */
      emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, a, x1)));
    }
}
20365 :
/* Output code to perform a Newton-Raphson approximation of a
   single precision floating point [reciprocal] square root.
   Emits RTL computing RES = sqrt (A) (or rsqrt (A) when RECIP) in MODE
   from a hardware rsqrt estimate refined by one N-R iteration.  */

void
ix86_emit_swsqrtsf (rtx res, rtx a, machine_mode mode, bool recip)
{
  rtx x0, e0, e1, e2, e3, mthree, mhalf;
  REAL_VALUE_TYPE r;
  int unspec;

  x0 = gen_reg_rtx (mode);
  e0 = gen_reg_rtx (mode);
  e1 = gen_reg_rtx (mode);
  e2 = gen_reg_rtx (mode);
  e3 = gen_reg_rtx (mode);

  /* Constants -3.0 and -0.5 used by the N-R formulas below.  */
  real_from_integer (&r, VOIDmode, -3, SIGNED);
  mthree = const_double_from_real_value (r, SFmode);

  real_arithmetic (&r, NEGATE_EXPR, &dconsthalf, NULL);
  mhalf = const_double_from_real_value (r, SFmode);
  unspec = UNSPEC_RSQRT;

  if (VECTOR_MODE_P (mode))
    {
      mthree = ix86_build_const_vector (mode, true, mthree);
      mhalf = ix86_build_const_vector (mode, true, mhalf);
      /* There is no 512-bit rsqrt.  There is however rsqrt14.  */
      if (GET_MODE_SIZE (mode) == 64)
	unspec = UNSPEC_RSQRT14;
    }

  /* sqrt(a)  = -0.5 * a * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0)
     rsqrt(a) = -0.5 * rsqrtss(a) * (a * rsqrtss(a) * rsqrtss(a) - 3.0) */

  a = force_reg (mode, a);

  /* x0 = rsqrt(a) estimate */
  emit_insn (gen_rtx_SET (x0, gen_rtx_UNSPEC (mode, gen_rtvec (1, a),
					      unspec)));

  /* If (a == 0.0) Filter out infinity to prevent NaN for sqrt(0.0).  */
  if (!recip)
    {
      rtx zero = force_reg (mode, CONST0_RTX(mode));
      rtx mask;

      /* Handle masked compare.  */
      if (VECTOR_MODE_P (mode) && GET_MODE_SIZE (mode) == 64)
	{
	  mask = gen_reg_rtx (HImode);
	  /* Imm value 0x4 corresponds to not-equal comparison.  */
	  emit_insn (gen_avx512f_cmpv16sf3 (mask, zero, a, GEN_INT (0x4)));
	  emit_insn (gen_avx512f_blendmv16sf (x0, zero, x0, mask));
	}
      else
	{
	  /* Zero out the estimate where a == 0.0 so the multiply by A
	     below yields 0.0 instead of inf * 0.0 = NaN.  */
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, gen_rtx_NE (mode, zero, a)));
	  emit_insn (gen_rtx_SET (x0, gen_rtx_AND (mode, x0, mask)));
	}
    }

  mthree = force_reg (mode, mthree);

  /* e0 = x0 * a */
  emit_insn (gen_rtx_SET (e0, gen_rtx_MULT (mode, x0, a)));

  unsigned vector_size = GET_MODE_SIZE (mode);
  if (TARGET_FMA
      || (TARGET_AVX512F && vector_size == 64)
      || (TARGET_AVX512VL && (vector_size == 32 || vector_size == 16)))
    /* e2 = e0 * x0 - 3.0, with the -3.0 folded into the FMA addend.  */
    emit_insn (gen_rtx_SET (e2,
			    gen_rtx_FMA (mode, e0, x0, mthree)));
  else
    {
      /* e1 = e0 * x0 */
      emit_insn (gen_rtx_SET (e1, gen_rtx_MULT (mode, e0, x0)));

      /* e2 = e1 - 3.0, implemented as e1 + (-3.0).  */
      emit_insn (gen_rtx_SET (e2, gen_rtx_PLUS (mode, e1, mthree)));
    }

  mhalf = force_reg (mode, mhalf);
  if (recip)
    /* e3 = -.5 * x0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, x0, mhalf)));
  else
    /* e3 = -.5 * e0 */
    emit_insn (gen_rtx_SET (e3, gen_rtx_MULT (mode, e0, mhalf)));
  /* ret = e2 * e3 */
  emit_insn (gen_rtx_SET (res, gen_rtx_MULT (mode, e2, e3)));
}
20459 :
20460 : /* Expand fabs (OP0) and return a new rtx that holds the result. The
20461 : mask for masking out the sign-bit is stored in *SMASK, if that is
20462 : non-null. */
20463 :
20464 : static rtx
20465 1049 : ix86_expand_sse_fabs (rtx op0, rtx *smask)
20466 : {
20467 1049 : machine_mode vmode, mode = GET_MODE (op0);
20468 1049 : rtx xa, mask;
20469 :
20470 1049 : xa = gen_reg_rtx (mode);
20471 1049 : if (mode == SFmode)
20472 : vmode = V4SFmode;
20473 467 : else if (mode == DFmode)
20474 : vmode = V2DFmode;
20475 : else
20476 0 : vmode = mode;
20477 1049 : mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), true);
20478 1049 : if (!VECTOR_MODE_P (mode))
20479 : {
20480 : /* We need to generate a scalar mode mask in this case. */
20481 1049 : rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
20482 1049 : tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
20483 1049 : mask = gen_reg_rtx (mode);
20484 1049 : emit_insn (gen_rtx_SET (mask, tmp));
20485 : }
20486 1049 : emit_insn (gen_rtx_SET (xa, gen_rtx_AND (mode, op0, mask)));
20487 :
20488 1049 : if (smask)
20489 996 : *smask = mask;
20490 :
20491 1049 : return xa;
20492 : }
20493 :
20494 : /* Expands a comparison of OP0 with OP1 using comparison code CODE,
20495 : swapping the operands if SWAP_OPERANDS is true. The expanded
20496 : code is a forward jump to a newly created label in case the
20497 : comparison is true. The generated label rtx is returned. */
20498 : static rtx_code_label *
20499 1064 : ix86_expand_sse_compare_and_jump (enum rtx_code code, rtx op0, rtx op1,
20500 : bool swap_operands)
20501 : {
20502 1064 : bool unordered_compare = ix86_unordered_fp_compare (code);
20503 1064 : rtx_code_label *label;
20504 1064 : rtx tmp, reg;
20505 :
20506 1064 : if (swap_operands)
20507 34 : std::swap (op0, op1);
20508 :
20509 1064 : label = gen_label_rtx ();
20510 1064 : tmp = gen_rtx_COMPARE (CCFPmode, op0, op1);
20511 1064 : if (unordered_compare)
20512 908 : tmp = gen_rtx_UNSPEC (CCFPmode, gen_rtvec (1, tmp), UNSPEC_NOTRAP);
20513 1064 : reg = gen_rtx_REG (CCFPmode, FLAGS_REG);
20514 1064 : emit_insn (gen_rtx_SET (reg, tmp));
20515 1064 : tmp = gen_rtx_fmt_ee (code, VOIDmode, reg, const0_rtx);
20516 1064 : tmp = gen_rtx_IF_THEN_ELSE (VOIDmode, tmp,
20517 : gen_rtx_LABEL_REF (VOIDmode, label), pc_rtx);
20518 1064 : tmp = emit_jump_insn (gen_rtx_SET (pc_rtx, tmp));
20519 1064 : JUMP_LABEL (tmp) = label;
20520 :
20521 1064 : return label;
20522 : }
20523 :
20524 : /* Expand a mask generating SSE comparison instruction comparing OP0 with OP1
20525 : using comparison code CODE. Operands are swapped for the comparison if
20526 : SWAP_OPERANDS is true. Returns a rtx for the generated mask. */
20527 : static rtx
20528 541 : ix86_expand_sse_compare_mask (enum rtx_code code, rtx op0, rtx op1,
20529 : bool swap_operands)
20530 : {
20531 541 : rtx (*insn)(rtx, rtx, rtx, rtx);
20532 541 : machine_mode mode = GET_MODE (op0);
20533 541 : rtx mask = gen_reg_rtx (mode);
20534 :
20535 541 : if (swap_operands)
20536 362 : std::swap (op0, op1);
20537 :
20538 541 : insn = mode == DFmode ? gen_setcc_df_sse : gen_setcc_sf_sse;
20539 :
20540 541 : emit_insn (insn (mask, op0, op1,
20541 : gen_rtx_fmt_ee (code, mode, op0, op1)));
20542 541 : return mask;
20543 : }
20544 :
/* Expand copysign from SIGN to the positive value ABS_VALUE
   storing in RESULT.  If MASK is non-null, it shall be a mask to mask out
   the sign-bit (i.e. a constant with every bit except the sign bit set,
   as produced by ix86_expand_sse_fabs).  */

static void
ix86_sse_copysign_to_positive (rtx result, rtx abs_value, rtx sign, rtx mask)
{
  machine_mode mode = GET_MODE (sign);
  rtx sgn = gen_reg_rtx (mode);
  if (mask == NULL_RTX)
    {
      machine_mode vmode;

      if (mode == SFmode)
	vmode = V4SFmode;
      else if (mode == DFmode)
	vmode = V2DFmode;
      else if (mode == HFmode)
	vmode = V8HFmode;
      else
	vmode = mode;

      /* Build a mask with only the sign bit set.  */
      mask = ix86_build_signbit_mask (vmode, VECTOR_MODE_P (mode), false);
      if (!VECTOR_MODE_P (mode))
	{
	  /* We need to generate a scalar mode mask in this case.  */
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const0_rtx));
	  tmp = gen_rtx_VEC_SELECT (mode, mask, tmp);
	  mask = gen_reg_rtx (mode);
	  emit_insn (gen_rtx_SET (mask, tmp));
	}
    }
  else
    /* The caller's MASK has the sign bit clear; invert it so the AND
       below extracts only the sign bit of SIGN.  */
    mask = gen_rtx_NOT (mode, mask);
  /* sgn = sign bit of SIGN; result = ABS_VALUE | sgn.  */
  emit_insn (gen_rtx_SET (sgn, gen_rtx_AND (mode, mask, sign)));
  emit_insn (gen_rtx_SET (result, gen_rtx_IOR (mode, abs_value, sgn)));
}
20582 :
/* Expand SSE sequence for computing lround from OP1 storing
   into OP0.  */

void
ix86_expand_lround (rtx op0, rtx op1)
{
  /* C code for the stuff we're doing below:
	tmp = op1 + copysign (nextafter (0.5, 0.0), op1)
	return (long)tmp;
   */
  machine_mode mode = GET_MODE (op1);
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx adj;

  /* load nextafter (0.5, 0.0): 0.5 - 2**(-p-1) is the largest
     representable value strictly below 0.5, so halfway cases still
     round away from zero after the fix conversion truncates.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* adj = copysign (0.5, op1) */
  adj = force_reg (mode, const_double_from_real_value (pred_half, mode));
  ix86_sse_copysign_to_positive (adj, adj, force_reg (mode, op1), NULL_RTX);

  /* adj = op1 + adj */
  adj = expand_simple_binop (mode, PLUS, adj, op1, NULL_RTX, 0, OPTAB_DIRECT);

  /* op0 = (imode)adj */
  expand_fix (op0, adj, 0);
}
20613 :
/* Expand SSE2 sequence for computing lfloor or lceil from OP1
   (a scalar float) storing the integer result into OP0.
   DO_FLOOR selects floor, otherwise ceil.  */

void
ix86_expand_lfloorceil (rtx op0, rtx op1, bool do_floor)
{
  /* C code for the stuff we're doing below (for do_floor):
	xi = (long)op1;
	xi -= (double)xi > op1 ? 1 : 0;
	return xi;
   */
  machine_mode fmode = GET_MODE (op1);
  machine_mode imode = GET_MODE (op0);
  rtx ireg, freg, tmp;
  rtx_code_label *label;

  /* reg = (long)op1 -- truncating conversion.  */
  ireg = gen_reg_rtx (imode);
  expand_fix (ireg, op1, 0);

  /* freg = (double)reg */
  freg = gen_reg_rtx (fmode);
  expand_float (freg, ireg, 0);

  /* ireg = (freg > op1) ? ireg - 1 : ireg
     The inverted condition UNLE skips the adjustment; for ceil the
     operands are swapped and the adjustment is +1 instead.  */
  label = ix86_expand_sse_compare_and_jump (UNLE,
					    freg, op1, !do_floor);
  tmp = expand_simple_binop (imode, do_floor ? MINUS : PLUS,
			     ireg, const1_rtx, NULL_RTX, 0, OPTAB_DIRECT);
  emit_move_insn (ireg, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (op0, ireg);
}
20650 :
20651 : /* Generate and return a rtx of mode MODE for 2**n where n is the number
20652 : of bits of the mantissa of MODE, which must be one of DFmode or SFmode. */
20653 :
20654 : static rtx
20655 996 : ix86_gen_TWO52 (machine_mode mode)
20656 : {
20657 996 : const struct real_format *fmt;
20658 996 : REAL_VALUE_TYPE TWO52r;
20659 996 : rtx TWO52;
20660 :
20661 996 : fmt = REAL_MODE_FORMAT (mode);
20662 996 : real_2expN (&TWO52r, fmt->p - 1, mode);
20663 996 : TWO52 = const_double_from_real_value (TWO52r, mode);
20664 996 : TWO52 = force_reg (mode, TWO52);
20665 :
20666 996 : return TWO52;
20667 : }
20668 :
/* Expand rint rounding OPERAND1 and storing the result in OPERAND0.  */

void
ix86_expand_rint (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	xa = fabs (operand1);
	if (!isless (xa, 2**52))
	  return operand1;
	two52 = 2**52;
	if (flag_rounding_math)
	  {
	    two52 = copysign (two52, operand1);
	    xa = operand1;
	  }
	xa = xa + two52 - two52;
	return copysign (xa, operand1);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, xa, TWO52, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Values with magnitude >= 2**52 are already integral (or NaN).  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  if (flag_rounding_math)
    {
      /* Use a signed 2**52 and the original operand so the add/sub
	 below rounds in the current dynamic rounding mode.  */
      ix86_sse_copysign_to_positive (TWO52, TWO52, res, mask);
      xa = res;
    }

  /* xa + 2**52 - 2**52 discards the fractional bits per the FPU's
     rounding mode.  */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
    xa = ix86_expand_sse_fabs (xa, NULL);

  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
20723 :
/* Expand SSE2 sequence for computing floor or ceil
   from OPERAND1 storing into OPERAND0.  DO_FLOOR selects floor,
   otherwise ceil.  */
void
ix86_expand_floorceil (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
	x2 = (double)(long)x;

     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;

	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Values with magnitude >= 2**52 are already integral (or NaN).  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x -- truncation via integer round trip.  */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0).  For ceil the
     comparison operands are swapped and the adjustment is +1.  */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
	tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
20790 :
/* Expand SSE2 sequence for computing floor or ceil from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  DO_FLOOR selects floor,
   otherwise ceil.  */
void
ix86_expand_floorceildf_32 (rtx operand0, rtx operand1, bool do_floor)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
	xa = xa + TWO52 - TWO52;
	x2 = copysign (xa, x);

     Compensate.  Floor:
	if (x2 > x)
	  x2 -= 1;
     Compensate.  Ceil:
	if (x2 < x)
	  x2 += 1;

	if (HONOR_SIGNED_ZEROS (mode))
	  x2 = copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, TWO52, tmp, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Values with magnitude >= 2**52 are already integral (or NaN).  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = xa + TWO52 - TWO52; rounds XA to integral in the current
     rounding mode without needing a 64-bit integer conversion.  */
  xa = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa = expand_simple_binop (mode, MINUS, xa, TWO52, xa, 0, OPTAB_DIRECT);

  /* xa = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (xa, xa, res, mask);

  /* generate 1.0 */
  one = force_reg (mode, const_double_from_real_value (dconst1, mode));

  /* Compensate: xa = xa - (xa > operand1 ? 1 : 0).  For ceil the
     comparison operands are swapped and the adjustment is +1.  */
  tmp = ix86_expand_sse_compare_mask (UNGT, xa, res, !do_floor);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
  tmp = expand_simple_binop (mode, do_floor ? MINUS : PLUS,
			     xa, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  if (HONOR_SIGNED_ZEROS (mode))
    {
      /* Remove the sign with FE_DOWNWARD, where x - x = -0.0.  */
      if (do_floor && flag_rounding_math)
	tmp = ix86_expand_sse_fabs (tmp, NULL);

      ix86_sse_copysign_to_positive (tmp, tmp, res, mask);
    }
  emit_move_insn (res, tmp);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
20861 :
/* Expand SSE sequence for computing trunc
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_trunc (rtx operand0, rtx operand1)
{
  /* C code for SSE variant we expand below.
	double xa = fabs (x), x2;
        if (!isless (xa, TWO52))
          return x;
        x2 = (double)(long)x;
	if (HONOR_SIGNED_ZEROS (mode))
	  return copysign (x2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xi, TWO52, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Values with magnitude >= 2**52 are already integral (or NaN).  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa = (double)(long)x -- truncation via integer round trip.  */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, res, 0);
  expand_float (xa, xi, 0);

  /* Restore the sign so trunc of a negative value keeps its sign,
     including -0.0.  */
  if (HONOR_SIGNED_ZEROS (mode))
    ix86_sse_copysign_to_positive (xa, xa, res, mask);

  emit_move_insn (res, xa);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
20907 :
20908 : /* Expand SSE sequence for computing trunc from OPERAND1 storing
20909 : into OPERAND0 without relying on DImode truncation via cvttsd2siq
20910 : that is only available on 64bit targets. */
20911 : void
20912 0 : ix86_expand_truncdf_32 (rtx operand0, rtx operand1)
20913 : {
20914 0 : machine_mode mode = GET_MODE (operand0);
20915 0 : rtx xa, xa2, TWO52, tmp, one, res, mask;
20916 0 : rtx_code_label *label;
20917 :
20918 : /* C code for SSE variant we expand below.
20919 : double xa = fabs (x), x2;
20920 : if (!isless (xa, TWO52))
20921 : return x;
20922 : xa2 = xa + TWO52 - TWO52;
20923 : Compensate:
20924 : if (xa2 > xa)
20925 : xa2 -= 1.0;
20926 : x2 = copysign (xa2, x);
20927 : return x2;
20928 : */
20929 :
20930 0 : TWO52 = ix86_gen_TWO52 (mode);
20931 :
20932 : /* Temporary for holding the result, initialized to the input
20933 : operand to ease control flow. */
20934 0 : res =copy_to_reg (operand1);
20935 :
20936 : /* xa = abs (operand1) */
20937 0 : xa = ix86_expand_sse_fabs (res, &mask);
20938 :
20939 : /* if (!isless (xa, TWO52)) goto label; */
20940 0 : label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);
20941 :
20942 : /* xa2 = xa + TWO52 - TWO52; */
20943 0 : xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
20944 0 : xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);
20945 :
20946 : /* generate 1.0 */
20947 0 : one = force_reg (mode, const_double_from_real_value (dconst1, mode));
20948 :
20949 : /* Compensate: xa2 = xa2 - (xa2 > xa ? 1 : 0) */
20950 0 : tmp = ix86_expand_sse_compare_mask (UNGT, xa2, xa, false);
20951 0 : emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, one, tmp)));
20952 0 : tmp = expand_simple_binop (mode, MINUS,
20953 : xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
20954 : /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
20955 0 : if (HONOR_SIGNED_ZEROS (mode) && flag_rounding_math)
20956 0 : tmp = ix86_expand_sse_fabs (tmp, NULL);
20957 :
20958 : /* res = copysign (xa2, operand1) */
20959 0 : ix86_sse_copysign_to_positive (res, tmp, res, mask);
20960 :
20961 0 : emit_label (label);
20962 0 : LABEL_NUSES (label) = 1;
20963 :
20964 0 : emit_move_insn (operand0, res);
20965 0 : }
20966 :
/* Expand SSE sequence for computing round
   from OPERAND1 storing into OPERAND0.  */
void
ix86_expand_round (rtx operand0, rtx operand1)
{
  /* C code for the stuff we're doing below:
	double xa = fabs (x);
        if (!isless (xa, TWO52))
          return x;
	xa = (double)(long)(xa + nextafter (0.5, 0.0));
	return copysign (xa, x);
   */
  machine_mode mode = GET_MODE (operand0);
  rtx res, TWO52, xa, xi, half, mask;
  rtx_code_label *label;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  TWO52 = ix86_gen_TWO52 (mode);
  /* xa = abs (operand1); values >= 2**52 are already integral.  */
  xa = ix86_expand_sse_fabs (res, &mask);
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* load nextafter (0.5, 0.0): 0.5 - 2**(-p-1), the largest value
     strictly below 0.5, so the truncating conversion below still
     rounds exact halfway cases away from zero.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);

  /* xa = xa + 0.5 */
  half = force_reg (mode, const_double_from_real_value (pred_half, mode));
  xa = expand_simple_binop (mode, PLUS, xa, half, NULL_RTX, 0, OPTAB_DIRECT);

  /* xa = (double)(int64_t)xa */
  xi = gen_reg_rtx (int_mode_for_mode (mode).require ());
  expand_fix (xi, xa, 0);
  expand_float (xa, xi, 0);

  /* res = copysign (xa, operand1) */
  ix86_sse_copysign_to_positive (res, xa, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
21015 :
/* Expand SSE sequence for computing round from OPERAND1 storing
   into OPERAND0 without relying on DImode truncation via cvttsd2siq
   that is only available on 64bit targets.  */
void
ix86_expand_rounddf_32 (rtx operand0, rtx operand1)
{
  /* C code for the stuff we expand below.
	double xa = fabs (x), xa2, x2;
        if (!isless (xa, TWO52))
          return x;
     Using the absolute value and copying back sign makes
     -0.0 -> -0.0 correct.
	xa2 = xa + TWO52 - TWO52;
     Compensate.
	dxa = xa2 - xa;
	if (dxa <= -0.5)
	  xa2 += 1;
	else if (dxa > 0.5)
	  xa2 -= 1;
	x2 = copysign (xa2, x);
	return x2;
   */
  machine_mode mode = GET_MODE (operand0);
  rtx xa, xa2, dxa, TWO52, tmp, half, mhalf, one, res, mask;
  rtx_code_label *label;

  TWO52 = ix86_gen_TWO52 (mode);

  /* Temporary for holding the result, initialized to the input
     operand to ease control flow.  */
  res = copy_to_reg (operand1);

  /* xa = abs (operand1) */
  xa = ix86_expand_sse_fabs (res, &mask);

  /* if (!isless (xa, TWO52)) goto label;
     Values with magnitude >= 2**52 are already integral (or NaN).  */
  label = ix86_expand_sse_compare_and_jump (UNLE, TWO52, xa, false);

  /* xa2 = xa + TWO52 - TWO52; rounds XA to integral in the current
     rounding mode.  */
  xa2 = expand_simple_binop (mode, PLUS, xa, TWO52, NULL_RTX, 0, OPTAB_DIRECT);
  xa2 = expand_simple_binop (mode, MINUS, xa2, TWO52, xa2, 0, OPTAB_DIRECT);

  /* dxa = xa2 - xa; the rounding error, in [-0.5, 0.5].  */
  dxa = expand_simple_binop (mode, MINUS, xa2, xa, NULL_RTX, 0, OPTAB_DIRECT);

  /* generate 0.5, 1.0 and -0.5 */
  half = force_reg (mode, const_double_from_real_value (dconsthalf, mode));
  one = expand_simple_binop (mode, PLUS, half, half, NULL_RTX, 0, OPTAB_DIRECT);
  mhalf = expand_simple_binop (mode, MINUS, half, one, NULL_RTX,
			       0, OPTAB_DIRECT);

  /* Compensate to get round-half-away-from-zero semantics.  */
  /* xa2 = xa2 - (dxa > 0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGT, dxa, half, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, MINUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);
  /* xa2 = xa2 + (dxa <= -0.5 ? 1 : 0) */
  tmp = ix86_expand_sse_compare_mask (UNGE, mhalf, dxa, false);
  emit_insn (gen_rtx_SET (tmp, gen_rtx_AND (mode, tmp, one)));
  xa2 = expand_simple_binop (mode, PLUS, xa2, tmp, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = copysign (xa2, operand1) */
  ix86_sse_copysign_to_positive (res, xa2, res, mask);

  emit_label (label);
  LABEL_NUSES (label) = 1;

  emit_move_insn (operand0, res);
}
21085 :
/* Expand SSE sequence for computing round
   from OP1 storing into OP0 using sse4 round insn.  */
void
ix86_expand_round_sse4 (rtx op0, rtx op1)
{
  machine_mode mode = GET_MODE (op0);
  rtx e1, e2, res, half;
  const struct real_format *fmt;
  REAL_VALUE_TYPE pred_half, half_minus_pred_half;
  rtx (*gen_copysign) (rtx, rtx, rtx);
  rtx (*gen_round) (rtx, rtx, rtx);

  /* Pick the copysign and round expanders for the scalar mode.  */
  switch (mode)
    {
    case E_HFmode:
      gen_copysign = gen_copysignhf3;
      gen_round = gen_sse4_1_roundhf2;
      break;
    case E_SFmode:
      gen_copysign = gen_copysignsf3;
      gen_round = gen_sse4_1_roundsf2;
      break;
    case E_DFmode:
      gen_copysign = gen_copysigndf3;
      gen_round = gen_sse4_1_rounddf2;
      break;
    default:
      gcc_unreachable ();
    }

  /* round (a) = trunc (a + copysign (0.5, a)) */

  /* load nextafter (0.5, 0.0): 0.5 - 2**(-p-1), the largest value
     strictly below 0.5, so halfway cases still round away from zero
     under the truncation below.  */
  fmt = REAL_MODE_FORMAT (mode);
  real_2expN (&half_minus_pred_half, -(fmt->p) - 1, mode);
  real_arithmetic (&pred_half, MINUS_EXPR, &dconsthalf, &half_minus_pred_half);
  half = const_double_from_real_value (pred_half, mode);

  /* e1 = copysign (0.5, op1) */
  e1 = gen_reg_rtx (mode);
  emit_insn (gen_copysign (e1, half, op1));

  /* e2 = op1 + e1 */
  e2 = expand_simple_binop (mode, PLUS, op1, e1, NULL_RTX, 0, OPTAB_DIRECT);

  /* res = trunc (e2) */
  res = gen_reg_rtx (mode);
  emit_insn (gen_round (res, e2, GEN_INT (ROUND_TRUNC)));

  emit_move_insn (op0, res);
}
21137 :
21138 : /* A cached (set (nil) (vselect (vconcat (nil) (nil)) (parallel [])))
21139 : insn, so that expand_vselect{,_vconcat} doesn't have to create a fresh
21140 : insn every time. */
21141 :
21142 : static GTY(()) rtx_insn *vselect_insn;
21143 :
21144 : /* Initialize vselect_insn. */
21145 :
21146 : static void
21147 7489 : init_vselect_insn (void)
21148 : {
21149 7489 : unsigned i;
21150 7489 : rtx x;
21151 :
21152 7489 : x = gen_rtx_PARALLEL (VOIDmode, rtvec_alloc (MAX_VECT_LEN));
21153 486785 : for (i = 0; i < MAX_VECT_LEN; ++i)
21154 479296 : XVECEXP (x, 0, i) = const0_rtx;
21155 7489 : x = gen_rtx_VEC_SELECT (V2DFmode, gen_rtx_VEC_CONCAT (V4DFmode, const0_rtx,
21156 : const0_rtx), x);
21157 7489 : x = gen_rtx_SET (const0_rtx, x);
21158 7489 : start_sequence ();
21159 7489 : vselect_insn = emit_insn (x);
21160 7489 : end_sequence ();
21161 7489 : }
21162 :
/* Construct (set target (vec_select op0 (parallel perm))) and
   return true if that's a valid instruction in the active ISA.
   NELT is the number of elements in PERM.  When TESTING_P, only
   test recognizability; nothing is emitted.  Reuses the cached
   vselect_insn to avoid allocating a fresh insn per query.  */

static bool
expand_vselect (rtx target, rtx op0, const unsigned char *perm,
		unsigned nelt, bool testing_p)
{
  unsigned int i;
  rtx x, save_vconcat;
  int icode;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  /* Splice the requested permutation and operands into the cached
     insn's pattern.  */
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 1);
  PUT_NUM_ELEM (XVEC (x, 0), nelt);
  for (i = 0; i < nelt; ++i)
    XVECEXP (x, 0, i) = GEN_INT (perm[i]);
  save_vconcat = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = op0;
  PUT_MODE (SET_SRC (PATTERN (vselect_insn)), GET_MODE (target));
  SET_DEST (PATTERN (vselect_insn)) = target;
  icode = recog_memoized (vselect_insn);

  /* Emit a copy, not the cached insn itself, so the cache survives.  */
  if (icode >= 0 && !testing_p)
    emit_insn (copy_rtx (PATTERN (vselect_insn)));

  /* Restore the cached insn to its placeholder state for reuse.  */
  SET_DEST (PATTERN (vselect_insn)) = const0_rtx;
  XEXP (SET_SRC (PATTERN (vselect_insn)), 0) = save_vconcat;
  INSN_CODE (vselect_insn) = -1;

  return icode >= 0;
}
21196 :
/* Similar, but generate a vec_concat from op0 and op1 as well.

   The concatenation reuses the cached insn's VEC_CONCAT placeholder,
   temporarily setting its mode to twice OP0's mode and its operands to
   OP0/OP1; they are reset to const0_rtx afterwards.  */

static bool
expand_vselect_vconcat (rtx target, rtx op0, rtx op1,
			const unsigned char *perm, unsigned nelt,
			bool testing_p)
{
  machine_mode v2mode;
  rtx x;
  bool ok;

  if (vselect_insn == NULL_RTX)
    init_vselect_insn ();

  /* There must be a vector mode with twice as many elements.  */
  if (!GET_MODE_2XWIDER_MODE (GET_MODE (op0)).exists (&v2mode))
    return false;
  x = XEXP (SET_SRC (PATTERN (vselect_insn)), 0);
  PUT_MODE (x, v2mode);
  XEXP (x, 0) = op0;
  XEXP (x, 1) = op1;
  ok = expand_vselect (target, x, perm, nelt, testing_p);
  /* Restore the placeholders in the shared cached insn.  */
  XEXP (x, 0) = const0_rtx;
  XEXP (x, 1) = const0_rtx;
  return ok;
}
21222 :
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   using movss or movsd, i.e. a merge that replaces only element 0 of
   one operand with element 0 of the other.  */

static bool
expand_vec_perm_movs (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  rtx x;

  if (d->one_operand_p)
    return false;

  /* movss needs SSE, movsd needs SSE2; the 64-bit vector variants
     additionally require MMX-with-SSE.  */
  if (!(TARGET_SSE && (vmode == V4SFmode || vmode == V4SImode))
      && !(TARGET_MMX_WITH_SSE && (vmode == V2SFmode || vmode == V2SImode))
      && !(TARGET_SSE2 && (vmode == V2DFmode || vmode == V2DImode)))
    return false;

  /* Only the first element is changed.  */
  if (d->perm[0] != nelt && d->perm[0] != 0)
    return false;
  /* All remaining elements must come unchanged from the operand that
     does NOT supply element 0 (indices >= nelt denote op1).  */
  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != i + nelt - d->perm[0])
      return false;

  if (d->testing_p)
    return true;

  /* VEC_MERGE mask 1 selects element 0 from the second rtx operand.  */
  if (d->perm[0] == nelt)
    x = gen_rtx_VEC_MERGE (vmode, d->op1, d->op0, GEN_INT (1));
  else
    x = gen_rtx_VEC_MERGE (vmode, d->op0, d->op1, GEN_INT (1));

  emit_insn (gen_rtx_SET (d->target, x));

  return true;
}
21259 :
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   using insertps: exactly one destination element differs from an
   identity copy of one of the two operands.  */

static bool
expand_vec_perm_insertps (struct expand_vec_perm_d *d)
{
  machine_mode vmode = d->vmode;
  unsigned i, cnt_s, nelt = d->nelt;
  int cnt_d = -1;		/* Index of the single changed element.  */
  rtx src, dst;

  if (d->one_operand_p)
    return false;

  if (!(TARGET_SSE4_1
	&& (vmode == V4SFmode || vmode == V4SImode
	    || (TARGET_MMX_WITH_SSE
		&& (vmode == V2SFmode || vmode == V2SImode)))))
    return false;

  /* First try: all but one element is an identity copy from op0.
     cnt_d ends as -1 either when everything matched (no insert needed
     here) or when two or more elements differed.  */
  for (i = 0; i < nelt; ++i)
    {
      if (d->perm[i] == i)
	continue;
      if (cnt_d != -1)
	{
	  cnt_d = -1;
	  break;
	}
      cnt_d = i;
    }

  if (cnt_d == -1)
    {
      /* Second try: all but one element is an identity copy from op1
	 (indices >= nelt denote op1's elements).  */
      for (i = 0; i < nelt; ++i)
	{
	  if (d->perm[i] == i + nelt)
	    continue;
	  if (cnt_d != -1)
	    return false;
	  cnt_d = i;
	}

      if (cnt_d == -1)
	return false;
    }

  if (d->testing_p)
    return true;

  gcc_assert (cnt_d != -1);

  /* cnt_s is the source element index; the inserted element comes from
     the operand opposite to the one supplying the unchanged lanes.  */
  cnt_s = d->perm[cnt_d];
  if (cnt_s < nelt)
    {
      src = d->op0;
      dst = d->op1;
    }
  else
    {
      cnt_s -= nelt;
      src = d->op1;
      dst = d->op0;
    }
  gcc_assert (cnt_s < nelt);

  /* insertps immediate: bits 7:6 source element, bits 5:4 destination
     element.  */
  rtx x = gen_sse4_1_insertps (vmode, d->target, dst, src,
			       GEN_INT (cnt_s << 6 | cnt_d << 4));
  emit_insn (x);

  return true;
}
21331 :
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of blendp[sd] / pblendw / pblendvb / vpblendd.

   A blend takes element I from either op0 or op1 but never moves it to
   a different lane, so d->perm[i] must be either I or I + nelt.  MASK
   accumulates the per-element operand-selection bits for the immediate
   forms; the pblendvb paths build a constant selector vector RPERM
   instead.  */

static bool
expand_vec_perm_blend (struct expand_vec_perm_d *d)
{
  machine_mode mmode, vmode = d->vmode;
  unsigned i, nelt = d->nelt;
  unsigned HOST_WIDE_INT mask;
  rtx target, op0, op1, maskop, x;
  rtx rperm[32], vperm;

  if (d->one_operand_p)
    return false;
  /* ISA gate: 512-bit needs AVX512F (and AVX512BW for sub-dword
     elements), 256-bit needs AVX2 (or AVX for the FP modes),
     128-bit and narrower need SSE4.1.  */
  if (TARGET_AVX512F && GET_MODE_SIZE (vmode) == 64
      && (TARGET_AVX512BW
	  || GET_MODE_UNIT_SIZE (vmode) >= 4))
    ;
  else if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
	   && (GET_MODE_SIZE (vmode) == 16
	       || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
	       || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* This is a blend, not a permute.  Elements must stay in their
     respective lanes.  */
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (!(e == i || e == i + nelt))
	return false;
    }

  if (d->testing_p)
    return true;

  /* ??? Without SSE4.1, we could implement this with and/andn/or.  This
     decision should be extracted elsewhere, so that we only try that
     sequence once all budget==3 options have been tried.  */
  target = d->target;
  op0 = d->op0;
  op1 = d->op1;
  mask = 0;

  switch (vmode)
    {
    /* Modes with a native per-element blend: one mask bit per
       element.  */
    case E_V8DFmode:
    case E_V16SFmode:
    case E_V4DFmode:
    case E_V8SFmode:
    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2SFmode:
    case E_V2HImode:
    case E_V4HImode:
    case E_V8HImode:
    case E_V8SImode:
    case E_V32HImode:
    case E_V64QImode:
    case E_V16SImode:
    case E_V8DImode:
      for (i = 0; i < nelt; ++i)
	mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
      break;

    case E_V2DImode:
      /* No integer qword blend; do it as pblendw, 4 mask bits per
	 qword.  */
      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i] >= 2 ? 15 : 0) << (i * 4);
      vmode = V8HImode;
      goto do_subreg;

    case E_V2SImode:
      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
      vmode = V4HImode;
      goto do_subreg;

    case E_V4SImode:
      if (TARGET_AVX2)
	{
	  /* Use vpblendd instead of vpblendw.  */
	  for (i = 0; i < nelt; ++i)
	    mask |= ((unsigned HOST_WIDE_INT) (d->perm[i] >= nelt)) << i;
	  break;
	}
      else
	{
	  /* Fall back to pblendw: 2 mask bits per dword.  */
	  for (i = 0; i < 4; ++i)
	    mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
	  vmode = V8HImode;
	  goto do_subreg;
	}

    case E_V16QImode:
      /* See if bytes move in pairs so we can use pblendw with
	 an immediate argument, rather than pblendvb with a vector
	 argument.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  {
	  use_pblendvb:
	    /* Build the byte selector: 0 picks op0, -1 picks op1.  */
	    for (i = 0; i < nelt; ++i)
	      rperm[i] = (d->perm[i] < nelt ? const0_rtx : constm1_rtx);

	  finish_pblendvb:
	    vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
	    vperm = force_reg (vmode, vperm);

	    if (GET_MODE_SIZE (vmode) == 4)
	      emit_insn (gen_mmx_pblendvb_v4qi (target, op0, op1, vperm));
	    else if (GET_MODE_SIZE (vmode) == 8)
	      emit_insn (gen_mmx_pblendvb_v8qi (target, op0, op1, vperm));
	    else if (GET_MODE_SIZE (vmode) == 16)
	      emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
	    else
	      emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
	    if (target != d->target)
	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	    return true;
	  }

      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8HImode;
      /* FALLTHRU */

    do_subreg:
      /* Redo the blend in VMODE; the lowpart moves are free.  */
      target = gen_reg_rtx (vmode);
      op0 = gen_lowpart (vmode, op0);
      op1 = gen_lowpart (vmode, op1);
      break;

    case E_V8QImode:
      for (i = 0; i < 8; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;

      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i * 2] >= 8) << i;
      vmode = V4HImode;
      goto do_subreg;

    case E_V4QImode:
      for (i = 0; i < 4; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;

      for (i = 0; i < 2; ++i)
	mask |= (d->perm[i * 2] >= 4) << i;
      vmode = V2HImode;
      goto do_subreg;

    case E_V32QImode:
      /* See if bytes move in pairs.  If not, vpblendvb must be used.  */
      for (i = 0; i < 32; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  goto use_pblendvb;
      /* See if bytes move in quadruplets.  If yes, vpblendd
	 with immediate can be used.  */
      for (i = 0; i < 32; i += 4)
	if (d->perm[i] + 2 != d->perm[i + 2])
	  break;
      if (i < 32)
	{
	  /* See if bytes move the same in both lanes.  If yes,
	     vpblendw with immediate can be used.  */
	  for (i = 0; i < 16; i += 2)
	    if (d->perm[i] + 16 != d->perm[i + 16])
	      goto use_pblendvb;

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i * 2] >= 32) << i;
	  vmode = V16HImode;
	  goto do_subreg;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 4] >= 32) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V16HImode:
      /* See if words move in pairs.  If yes, vpblendd can be used.  */
      for (i = 0; i < 16; i += 2)
	if (d->perm[i] + 1 != d->perm[i + 1])
	  break;
      if (i < 16)
	{
	  /* See if words move the same in both lanes.  If not,
	     vpblendvb must be used.  */
	  for (i = 0; i < 8; i++)
	    if (d->perm[i] + 8 != d->perm[i + 8])
	      {
		/* Use vpblendvb.  */
		for (i = 0; i < 32; ++i)
		  rperm[i] = (d->perm[i / 2] < 16 ? const0_rtx : constm1_rtx);

		vmode = V32QImode;
		nelt = 32;
		target = gen_reg_rtx (vmode);
		op0 = gen_lowpart (vmode, op0);
		op1 = gen_lowpart (vmode, op1);
		goto finish_pblendvb;
	      }

	  /* Use vpblendw.  */
	  for (i = 0; i < 16; ++i)
	    mask |= (d->perm[i] >= 16) << i;
	  break;
	}

      /* Use vpblendd.  */
      for (i = 0; i < 8; ++i)
	mask |= (d->perm[i * 2] >= 16) << i;
      vmode = V8SImode;
      goto do_subreg;

    case E_V4DImode:
      /* Use vpblendd.  */
      for (i = 0; i < 4; ++i)
	mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
      vmode = V8SImode;
      goto do_subreg;

    default:
      gcc_unreachable ();
    }

  /* AVX512 blends take the mask in a mask register, whose mode depends
     on the element count; VOIDmode means an immediate mask.  */
  switch (vmode)
    {
    case E_V8DFmode:
    case E_V8DImode:
      mmode = QImode;
      break;
    case E_V16SFmode:
    case E_V16SImode:
      mmode = HImode;
      break;
    case E_V32HImode:
      mmode = SImode;
      break;
    case E_V64QImode:
      mmode = DImode;
      break;
    default:
      mmode = VOIDmode;
    }

  /* Canonicalize vec_merge.  */
  if (swap_commutative_operands_p (op1, op0)
      /* Two operands have same precedence, then
	 first bit of mask select first operand.  */
      || (!swap_commutative_operands_p (op0, op1)
	  && !(mask & 1)))
    {
      unsigned n_elts = GET_MODE_NUNITS (vmode);
      std::swap (op0, op1);
      unsigned HOST_WIDE_INT mask_all = HOST_WIDE_INT_1U;
      if (n_elts == HOST_BITS_PER_WIDE_INT)
	mask_all = -1;
      else
	mask_all = (HOST_WIDE_INT_1U << n_elts) - 1;
      /* Swapping the operands inverts the meaning of every mask bit.  */
      mask = ~mask & mask_all;
    }

  if (mmode != VOIDmode)
    maskop = force_reg (mmode, gen_int_mode (mask, mmode));
  else
    maskop = GEN_INT (mask);

  /* This matches five different patterns with the different modes.  */
  x = gen_rtx_VEC_MERGE (vmode, op1, op0, maskop);
  x = gen_rtx_SET (target, x);
  emit_insn (x);
  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
21619 :
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of the variable form of vpermilps.

   Note that we will have already failed the immediate input vpermilps,
   which requires that the high and low part shuffle be identical; the
   variable form doesn't require that.  */

static bool
expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
{
  rtx rperm[8], vperm;
  unsigned i;

  if (!TARGET_AVX || !d->one_operand_p
      || (d->vmode != V8SImode && d->vmode != V8SFmode))
    return false;

  /* We can only permute within the 128-bit lane.  */
  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];
      if (i < 4 ? e >= 4 : e < 4)
	return false;
    }

  if (d->testing_p)
    return true;

  /* Build the lane-relative selector vector.  */
  for (i = 0; i < 8; ++i)
    {
      unsigned e = d->perm[i];

      /* Within each 128-bit lane, the elements of op0 are numbered
	 from 0 and the elements of op1 are numbered from 4.  */
      if (e >= 8 + 4)
	e -= 8;
      else if (e >= 4)
	e -= 4;

      rperm[i] = GEN_INT (e);
    }

  vperm = gen_rtx_CONST_VECTOR (V8SImode, gen_rtvec_v (8, rperm));
  vperm = force_reg (V8SImode, vperm);
  rtx target = d->target;
  rtx op0 = d->op0;
  /* The pattern is written for V8SF; view integer vectors as float.  */
  if (d->vmode == V8SImode)
    {
      target = lowpart_subreg (V8SFmode, target, V8SImode);
      op0 = lowpart_subreg (V8SFmode, op0, V8SImode);
    }

  emit_insn (gen_avx_vpermilvarv8sf3 (target, op0, vperm));

  return true;
}
21676 :
/* For V*[QHS]Imode permutations, check if the same permutation
   can't be performed in a 2x, 4x or 8x wider inner mode.

   On success ND describes the equivalent permutation in the 2x wider
   element mode (recursively widened further up to DImode elements when
   possible).  ND may alias D for the recursive in-place narrowing; the
   operand/target fields are only filled when ND != D.  */

static bool
canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
			      struct expand_vec_perm_d *nd)
{
  int i;
  machine_mode mode = VOIDmode;

  switch (d->vmode)
    {
    case E_V8QImode: mode = V4HImode; break;
    case E_V16QImode: mode = V8HImode; break;
    case E_V32QImode: mode = V16HImode; break;
    case E_V64QImode: mode = V32HImode; break;
    case E_V4HImode: mode = V2SImode; break;
    case E_V8HImode: mode = V4SImode; break;
    case E_V16HImode: mode = V8SImode; break;
    case E_V32HImode: mode = V16SImode; break;
    case E_V4SImode: mode = V2DImode; break;
    case E_V8SImode: mode = V4DImode; break;
    case E_V16SImode: mode = V8DImode; break;
    default: return false;
    }
  /* Each even/odd element pair must move together, starting from an
     even source index.  */
  for (i = 0; i < d->nelt; i += 2)
    if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
      return false;
  nd->vmode = mode;
  nd->nelt = d->nelt / 2;
  for (i = 0; i < nd->nelt; i++)
    nd->perm[i] = d->perm[2 * i] / 2;
  /* Keep widening in place while the element mode is narrower than
     DImode.  */
  if (GET_MODE_INNER (mode) != DImode)
    canonicalize_vector_int_perm (nd, nd);
  if (nd != d)
    {
      nd->one_operand_p = d->one_operand_p;
      nd->testing_p = d->testing_p;
      if (d->op0 == d->op1)
	nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
      else
	{
	  nd->op0 = gen_lowpart (nd->vmode, d->op0);
	  nd->op1 = gen_lowpart (nd->vmode, d->op1);
	}
      /* When only testing, a raw placeholder register suffices.  */
      if (d->testing_p)
	nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
      else
	nd->target = gen_reg_rtx (nd->vmode);
    }
  return true;
}
21729 :
21730 : /* Return true if permutation D can be performed as VMODE permutation
21731 : instead. */
21732 :
21733 : static bool
21734 7580 : valid_perm_using_mode_p (machine_mode vmode, struct expand_vec_perm_d *d)
21735 : {
21736 7580 : unsigned int i, j, chunk;
21737 :
21738 7580 : if (GET_MODE_CLASS (vmode) != MODE_VECTOR_INT
21739 7580 : || GET_MODE_CLASS (d->vmode) != MODE_VECTOR_INT
21740 18636 : || GET_MODE_SIZE (vmode) != GET_MODE_SIZE (d->vmode))
21741 : return false;
21742 :
21743 11056 : if (GET_MODE_NUNITS (vmode) >= d->nelt)
21744 : return true;
21745 :
21746 5236 : chunk = d->nelt / GET_MODE_NUNITS (vmode);
21747 7186 : for (i = 0; i < d->nelt; i += chunk)
21748 6939 : if (d->perm[i] & (chunk - 1))
21749 : return false;
21750 : else
21751 12694 : for (j = 1; j < chunk; ++j)
21752 10744 : if (d->perm[i] + j != d->perm[i + j])
21753 : return false;
21754 :
21755 : return true;
21756 : }
21757 :
/* A subroutine of ix86_expand_vec_perm_const_1.  Try to implement D
   in terms of pshufb, vpperm, vpermq, vpermd, vpermps or vperm2i128.

   The first pair of switches picks the byte-shuffle mode VMODE per
   vector size and operand count (XOP vpperm for two operands, SSSE3
   pshufb and friends for one), short-circuiting into the specialized
   vpermq/vpermd/vperm2i128 strategies where they apply; the tail then
   materializes the constant selector vector and emits the chosen
   instruction.  */

static bool
expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
{
  unsigned i, nelt, eltsz, mask;
  unsigned char perm[64];
  machine_mode vmode;
  struct expand_vec_perm_d nd;
  rtx rperm[64], vperm, target, op0, op1;

  nelt = d->nelt;

  if (!d->one_operand_p)
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
	if (!TARGET_XOP)
	  return false;
	vmode = V4QImode;
	break;

      case 8:
	if (!TARGET_XOP)
	  return false;
	vmode = V8QImode;
	break;

      case 16:
	if (!TARGET_XOP)
	  return false;
	vmode = V16QImode;
	break;

      case 32:
	if (!TARGET_AVX2)
	  return false;

	/* A two-operand 256-bit shuffle is only doable as a whole-lane
	   (128-bit) permutation.  */
	if (valid_perm_using_mode_p (V2TImode, d))
	  {
	    if (d->testing_p)
	      return true;

	    /* Use vperm2i128 insn.  The pattern uses
	       V4DImode instead of V2TImode.  */
	    target = d->target;
	    if (d->vmode != V4DImode)
	      target = gen_reg_rtx (V4DImode);
	    op0 = gen_lowpart (V4DImode, d->op0);
	    op1 = gen_lowpart (V4DImode, d->op1);
	    rperm[0]
	      = GEN_INT ((d->perm[0] / (nelt / 2))
			 | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
	    emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
	    if (target != d->target)
	      emit_move_insn (d->target, gen_lowpart (d->vmode, target));
	    return true;
	  }
	/* FALLTHRU */

      default:
	return false;
      }
  else
    switch (GET_MODE_SIZE (d->vmode))
      {
      case 4:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V4QImode;
	break;

      case 8:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V8QImode;
	break;

      case 16:
	if (!TARGET_SSSE3)
	  return false;
	vmode = V16QImode;
	break;

      case 32:
	if (!TARGET_AVX2)
	  return false;

	/* V4DImode should be already handled through
	   expand_vselect by vpermq instruction.  */
	gcc_assert (d->vmode != V4DImode);

	vmode = V32QImode;
	if (d->vmode == V8SImode
	    || d->vmode == V16HImode
	    || d->vmode == V32QImode)
	  {
	    /* First see if vpermq can be used for
	       V8SImode/V16HImode/V32QImode.  */
	    if (valid_perm_using_mode_p (V4DImode, d))
	      {
		for (i = 0; i < 4; i++)
		  perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
		if (d->testing_p)
		  return true;
		target = gen_reg_rtx (V4DImode);
		if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
				    perm, 4, false))
		  {
		    emit_move_insn (d->target,
				    gen_lowpart (d->vmode, target));
		    return true;
		  }
		return false;
	      }

	    /* Next see if vpermd can be used.  */
	    if (valid_perm_using_mode_p (V8SImode, d))
	      vmode = V8SImode;
	  }
	/* Or if vpermps can be used.  */
	else if (d->vmode == V8SFmode)
	  vmode = V8SImode;

	if (vmode == V32QImode)
	  {
	    /* vpshufb only works intra lanes, it is not
	       possible to shuffle bytes in between the lanes.  */
	    for (i = 0; i < nelt; ++i)
	      if ((d->perm[i] ^ i) & (nelt / 2))
		return false;
	  }
	break;

      case 64:
	if (!TARGET_AVX512BW)
	  return false;

	/* If vpermq didn't work, vpshufb won't work either.  */
	if (d->vmode == V8DFmode || d->vmode == V8DImode)
	  return false;

	vmode = V64QImode;
	if (d->vmode == V16SImode
	    || d->vmode == V32HImode
	    || d->vmode == V64QImode)
	  {
	    /* First see if vpermq can be used for
	       V16SImode/V32HImode/V64QImode.  */
	    if (valid_perm_using_mode_p (V8DImode, d))
	      {
		for (i = 0; i < 8; i++)
		  perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
		if (d->testing_p)
		  return true;
		target = gen_reg_rtx (V8DImode);
		if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
				    perm, 8, false))
		  {
		    emit_move_insn (d->target,
				    gen_lowpart (d->vmode, target));
		    return true;
		  }
		return false;
	      }

	    /* Next see if vpermd can be used.  */
	    if (valid_perm_using_mode_p (V16SImode, d))
	      vmode = V16SImode;
	  }
	/* Or if vpermps can be used.  */
	else if (d->vmode == V16SFmode)
	  vmode = V16SImode;

	if (vmode == V64QImode)
	  {
	    /* vpshufb only works intra lanes, it is not
	       possible to shuffle bytes in between the lanes.  */
	    for (i = 0; i < nelt; ++i)
	      if ((d->perm[i] ^ i) & (3 * nelt / 4))
		return false;
	  }
	break;

      default:
	return false;
      }

  if (d->testing_p)
    return true;

  /* Try to avoid variable permutation instruction.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Build the selector constants: dword indices for vpermd/vpermps,
     byte indices (replicated eltsz times per element) for pshufb and
     vpperm.  */
  if (vmode == V8SImode)
    for (i = 0; i < 8; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
  else if (vmode == V16SImode)
    for (i = 0; i < 16; ++i)
      rperm[i] = GEN_INT ((d->perm[i * nelt / 16] * 16 / nelt) & 15);
  else
    {
      eltsz = GET_MODE_UNIT_SIZE (d->vmode);
      if (!d->one_operand_p)
	mask = 2 * nelt - 1;
      else if (vmode == V64QImode)
	mask = nelt / 4 - 1;
      else if (vmode == V32QImode)
	mask = nelt / 2 - 1;
      else
	mask = nelt - 1;

      for (i = 0; i < nelt; ++i)
	{
	  unsigned j, e = d->perm[i] & mask;
	  for (j = 0; j < eltsz; ++j)
	    rperm[i * eltsz + j] = GEN_INT (e * eltsz + j);
	}
    }

  machine_mode vpmode = vmode;

  /* From here on NELT counts bytes of the shuffle, not elements of
     d->vmode.  */
  nelt = GET_MODE_SIZE (vmode);

  /* Emulate narrow modes with V16QI instructions.  */
  if (nelt < 16)
    {
      rtx m128 = GEN_INT (-128);

      /* Remap elements from the second operand, as we have to
	 account for inactive top elements from the first operand.  */
      if (!d->one_operand_p)
	{
	  for (i = 0; i < nelt; ++i)
	    {
	      unsigned ival = UINTVAL (rperm[i]);
	      if (ival >= nelt)
		rperm[i] = GEN_INT (ival + 16 - nelt);
	    }
	}

      /* Fill inactive elements in the top positions with zeros.  */
      for (i = nelt; i < 16; ++i)
	rperm[i] = m128;

      vpmode = V16QImode;
    }

  vperm = gen_rtx_CONST_VECTOR (vpmode,
				gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
  vperm = force_reg (vpmode, vperm);

  if (vmode == d->vmode)
    target = d->target;
  else
    target = gen_reg_rtx (vmode);

  op0 = gen_lowpart (vmode, d->op0);

  if (d->one_operand_p)
    {
      rtx (*gen) (rtx, rtx, rtx);

      if (vmode == V4QImode)
	gen = gen_mmx_pshufbv4qi3;
      else if (vmode == V8QImode)
	gen = gen_mmx_pshufbv8qi3;
      else if (vmode == V16QImode)
	gen = gen_ssse3_pshufbv16qi3;
      else if (vmode == V32QImode)
	gen = gen_avx2_pshufbv32qi3;
      else if (vmode == V64QImode)
	gen = gen_avx512bw_pshufbv64qi3;
      else if (vmode == V8SFmode)
	gen = gen_avx2_permvarv8sf;
      else if (vmode == V8SImode)
	gen = gen_avx2_permvarv8si;
      else if (vmode == V16SFmode)
	gen = gen_avx512f_permvarv16sf;
      else if (vmode == V16SImode)
	gen = gen_avx512f_permvarv16si;
      else
	gcc_unreachable ();

      emit_insn (gen (target, op0, vperm));
    }
  else
    {
      /* Two operands: XOP vpperm only.  */
      rtx (*gen) (rtx, rtx, rtx, rtx);

      op1 = gen_lowpart (vmode, d->op1);

      if (vmode == V4QImode)
	gen = gen_mmx_ppermv32;
      else if (vmode == V8QImode)
	gen = gen_mmx_ppermv64;
      else if (vmode == V16QImode)
	gen = gen_xop_pperm;
      else
	gcc_unreachable ();

      emit_insn (gen (target, op0, op1, vperm));
    }

  if (target != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, target));

  return true;
}
22072 :
/* Try to expand one-operand permutation with constant mask.

   Uses the AVX512 vpermd/vpermq/vpermps/vpermpd/vpermw/vpermb family:
   a single variable-permute instruction whose selector is forced into
   a register from a constant vector.  Only applies when both operands
   of D are the same rtx.  */

static bool
ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
{
  machine_mode mode = GET_MODE (d->op0);
  /* The FP modes take an integer selector of matching width.  */
  machine_mode maskmode = mode;
  unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
  rtx (*gen) (rtx, rtx, rtx) = NULL;
  rtx target, op0, mask;
  rtx vec[64];

  if (!rtx_equal_p (d->op0, d->op1))
    return false;

  if (!TARGET_AVX512F)
    return false;

  /* Accept VNxHImode and VNxQImode now.  */
  if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
    return false;

  /* vpermw.  */
  if (!TARGET_AVX512BW && inner_size == 2)
    return false;

  /* vpermb.  */
  if (!TARGET_AVX512VBMI && inner_size == 1)
    return false;

  switch (mode)
    {
    case E_V16SImode:
      gen = gen_avx512f_permvarv16si;
      break;
    case E_V16SFmode:
      gen = gen_avx512f_permvarv16sf;
      maskmode = V16SImode;
      break;
    case E_V8DImode:
      gen = gen_avx512f_permvarv8di;
      break;
    case E_V8DFmode:
      gen = gen_avx512f_permvarv8df;
      maskmode = V8DImode;
      break;
    case E_V32HImode:
      gen = gen_avx512bw_permvarv32hi;
      break;
    case E_V16HImode:
      gen = gen_avx512vl_permvarv16hi;
      break;
    case E_V8HImode:
      gen = gen_avx512vl_permvarv8hi;
      break;
    case E_V64QImode:
      gen = gen_avx512bw_permvarv64qi;
      break;
    case E_V32QImode:
      gen = gen_avx512vl_permvarv32qi;
      break;
    case E_V16QImode:
      gen = gen_avx512vl_permvarv16qi;
      break;

    default:
      return false;
    }

  if (d->testing_p)
    return true;

  target = d->target;
  op0 = d->op0;
  /* Materialize the constant selector and emit the permute.  */
  for (int i = 0; i < d->nelt; ++i)
    vec[i] = GEN_INT (d->perm[i]);
  mask = gen_rtx_CONST_VECTOR (maskmode, gen_rtvec_v (d->nelt, vec));
  emit_insn (gen (target, op0, force_reg (maskmode, mask)));
  return true;
}
22153 :
22154 : static bool expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool);
22155 :
22156 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to instantiate D
22157 : in a single instruction. */
22158 :
static bool
expand_vec_perm_1 (struct expand_vec_perm_d *d)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d nd;

  /* Check plain VEC_SELECT first, because AVX has instructions that could
     match both SEL and SEL+CONCAT, but the plain SEL will allow a memory
     input where SEL+CONCAT may not. */
  if (d->one_operand_p)
    {
      /* With a single operand, indices >= nelt refer to the same vector,
	 so reduce every index modulo NELT before matching.  */
      int mask = nelt - 1;
      bool identity_perm = true;
      bool broadcast_perm = true;

      for (i = 0; i < nelt; i++)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  if (nd.perm[i] != i)
	    identity_perm = false;
	  if (nd.perm[i])
	    broadcast_perm = false;
	}

      /* Identity permutation: just copy the operand.  */
      if (identity_perm)
	{
	  if (!d->testing_p)
	    emit_move_insn (d->target, d->op0);
	  return true;
	}
      /* All indices are 0: broadcast element 0 of the operand.  */
      else if (broadcast_perm && TARGET_AVX2)
	{
	  /* Use vpbroadcast{b,w,d}. */
	  rtx (*gen) (rtx, rtx) = NULL;
	  switch (d->vmode)
	    {
	    case E_V64QImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv64qi_1;
	      break;
	    case E_V32QImode:
	      gen = gen_avx2_pbroadcastv32qi_1;
	      break;
	    case E_V32HImode:
	      if (TARGET_AVX512BW)
		gen = gen_avx512bw_vec_dupv32hi_1;
	      break;
	    case E_V16HImode:
	      gen = gen_avx2_pbroadcastv16hi_1;
	      break;
	    case E_V16SImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16si_1;
	      break;
	    case E_V8SImode:
	      gen = gen_avx2_pbroadcastv8si_1;
	      break;
	    case E_V16QImode:
	      gen = gen_avx2_pbroadcastv16qi;
	      break;
	    case E_V8HImode:
	      gen = gen_avx2_pbroadcastv8hi;
	      break;
	    case E_V16SFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv16sf_1;
	      break;
	    case E_V8SFmode:
	      gen = gen_avx2_vec_dupv8sf_1;
	      break;
	    case E_V8DFmode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8df_1;
	      break;
	    case E_V8DImode:
	      if (TARGET_AVX512F)
		gen = gen_avx512f_vec_dupv8di_1;
	      break;
	    /* For other modes prefer other shuffles this function creates. */
	    default: break;
	    }
	  if (gen != NULL)
	    {
	      if (!d->testing_p)
		emit_insn (gen (d->target, d->op0));
	      return true;
	    }
	}

      /* General single-operand VEC_SELECT.  */
      if (expand_vselect (d->target, d->op0, nd.perm, nelt, d->testing_p))
	return true;

      /* There are plenty of patterns in sse.md that are written for
	 SEL+CONCAT and are not replicated for a single op.  Perhaps
	 that should be changed, to avoid the nastiness here.  */

      /* Recognize interleave style patterns, which means incrementing
	 every other permutation operand.  */
      for (i = 0; i < nelt; i += 2)
	{
	  nd.perm[i] = d->perm[i] & mask;
	  nd.perm[i + 1] = (d->perm[i + 1] & mask) + nelt;
	}
      if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;

      /* Recognize shufps, which means adding {0, 0, nelt, nelt}.  */
      if (nelt >= 4)
	{
	  for (i = 0; i < nelt; i += 4)
	    {
	      nd.perm[i + 0] = d->perm[i + 0] & mask;
	      nd.perm[i + 1] = d->perm[i + 1] & mask;
	      nd.perm[i + 2] = (d->perm[i + 2] & mask) + nelt;
	      nd.perm[i + 3] = (d->perm[i + 3] & mask) + nelt;
	    }

	  if (expand_vselect_vconcat (d->target, d->op0, d->op0, nd.perm, nelt,
				      d->testing_p))
	    return true;
	}
    }

  /* Try the SSE4.1 blend variable merge instructions.  */
  if (expand_vec_perm_blend (d))
    return true;

  /* Try movss/movsd instructions.  */
  if (expand_vec_perm_movs (d))
    return true;

  /* Try the SSE4.1 insertps instruction.  */
  if (expand_vec_perm_insertps (d))
    return true;

  /* Try the fully general two operand permute.  */
  if (expand_vselect_vconcat (d->target, d->op0, d->op1, d->perm, nelt,
			      d->testing_p))
    return true;

  /* Recognize interleave style patterns with reversed operands.  */
  if (!d->one_operand_p)
    {
      /* Flip each index to the other operand's half of the concatenation,
	 then retry SEL+CONCAT with op1 and op0 swapped.  */
      for (i = 0; i < nelt; ++i)
	{
	  unsigned e = d->perm[i];
	  if (e >= nelt)
	    e -= nelt;
	  else
	    e += nelt;
	  nd.perm[i] = e;
	}

      if (expand_vselect_vconcat (d->target, d->op1, d->op0, nd.perm, nelt,
				  d->testing_p))
	return true;
    }

  /* Try one of the AVX vpermil variable permutations.  */
  if (expand_vec_perm_vpermil (d))
    return true;

  /* Try the SSSE3 pshufb or XOP vpperm or AVX2 vperm2i128,
     vpshufb, vpermd, vpermps or vpermq variable permutation.  */
  if (expand_vec_perm_pshufb (d))
    return true;

  /* Try the AVX2 vpalignr instruction.  */
  if (expand_vec_perm_palignr (d, true))
    return true;

  /* Try the AVX512F vperm{w,b,s,d} instructions  */
  if (ix86_expand_vec_one_operand_perm_avx512 (d))
    return true;

  /* Try the AVX512F vpermt2/vpermi2 instructions.  */
  if (ix86_expand_vec_perm_vpermt2 (NULL_RTX, NULL_RTX, NULL_RTX, NULL_RTX, d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode.  */
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      /* ND's target has the canonicalized integer mode; view it back
	 in the caller's mode.  */
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }
  return false;
}
22349 :
22350 : /* Canonicalize vec_perm index to make the first index
22351 : always comes from the first vector. */
22352 : static void
22353 8157 : ix86_vec_perm_index_canon (struct expand_vec_perm_d *d)
22354 : {
22355 8157 : unsigned nelt = d->nelt;
22356 8157 : if (d->perm[0] < nelt)
22357 : return;
22358 :
22359 5 : for (unsigned i = 0; i != nelt; i++)
22360 4 : d->perm[i] = (d->perm[i] + nelt) % (2 * nelt);
22361 :
22362 1 : std::swap (d->op0, d->op1);
22363 1 : return;
22364 : }
22365 :
22366 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
22367 : in terms of a pair of shufps+ shufps/pshufd instructions. */
static bool
expand_vec_perm_shufps_shufps (struct expand_vec_perm_d *d)
{
  unsigned char perm1[4];
  machine_mode vmode = d->vmode;
  bool ok;
  unsigned i, j, k, count = 0;

  /* Only handles two-operand 4-element SI/SF permutations.  */
  if (d->one_operand_p
      || (vmode != V4SImode && vmode != V4SFmode))
    return false;

  if (d->testing_p)
    return true;

  /* Ensure element 0 comes from op0, so COUNT below is the number of
     elements taken from op1 (indices 4..7).  */
  ix86_vec_perm_index_canon (d);
  for (i = 0; i < 4; ++i)
    count += d->perm[i] > 3 ? 1 : 0;

  /* Not one_operand_p, so COUNT must be 1, 2 or 3.  */
  gcc_assert (count & 3);

  rtx tmp = gen_reg_rtx (vmode);
  /* 2 from op0 and 2 from op1.  */
  if (count == 2)
    {
      unsigned char perm2[4];
      /* Gather op0's elements into the low half of TMP and op1's into
	 the high half (PERM1), remembering where each landed (PERM2).  */
      for (i = 0, j = 0, k = 2; i < 4; ++i)
	if (d->perm[i] & 4)
	  {
	    perm1[k++] = d->perm[i];
	    perm2[i] = k - 1;
	  }
	else
	  {
	    perm1[j++] = d->perm[i];
	    perm2[i] = j - 1;
	  }

      /* shufps.  */
      ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
				   perm1, d->nelt, false);
      gcc_assert (ok);
      if (vmode == V4SImode && TARGET_SSE2)
	/* pshufd.  */
	ok = expand_vselect (d->target, tmp,
			     perm2, d->nelt, false);
      else
	{
	  /* shufps.  */
	  perm2[2] += 4;
	  perm2[3] += 4;
	  ok = expand_vselect_vconcat (d->target, tmp, tmp,
				       perm2, d->nelt, false);
	}
      gcc_assert (ok);
    }
  /* 3 from one op and 1 from another.  */
  else
    {
      unsigned pair_idx = 8, lone_idx = 8, shift;

      /* Find the lone index.  */
      for (i = 0; i < 4; ++i)
	if ((d->perm[i] > 3 && count == 1)
	    || (d->perm[i] < 4 && count == 3))
	  lone_idx = i;

      /* When lone_idx is not 0, it must come from the second op
	 (count == 1).  */
      gcc_assert (count == (lone_idx ? 1 : 3));

      /* Find the pair index that sits in the same half as the lone index.  */
      shift = lone_idx & 2;
      pair_idx = 1 - lone_idx + 2 * shift;

      /* First permute lone index and pair index into the same vector as
	 [ lone, lone, pair, pair ].  */
      perm1[1] = perm1[0]
	= (count == 3) ? d->perm[lone_idx] : d->perm[lone_idx] - 4;
      perm1[3] = perm1[2]
	= (count == 3) ? d->perm[pair_idx] : d->perm[pair_idx] + 4;

      /* Always put the vector containing the lone index first.  */
      if (count == 1)
	std::swap (d->op0, d->op1);

      /* shufps.  */
      ok = expand_vselect_vconcat (tmp, d->op0, d->op1,
				   perm1, d->nelt, false);
      gcc_assert (ok);

      /* Refine lone and pair index to original order.  */
      perm1[shift] = lone_idx << 1;
      perm1[shift + 1] = pair_idx << 1;

      /* Select the remaining 2 elements in another vector.  */
      for (i = 2 - shift; i < 4 - shift; ++i)
	perm1[i] = lone_idx == 1 ? d->perm[i] + 4 : d->perm[i];

      /* Adjust to original selector.  */
      if (lone_idx > 1)
	std::swap (tmp, d->op1);

      /* shufps.  */
      ok = expand_vselect_vconcat (d->target, tmp, d->op1,
				   perm1, d->nelt, false);

      gcc_assert (ok);
    }

  return true;
}
22479 :
22480 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement D
22481 : in terms of a pair of pshuflw + pshufhw instructions. */
22482 :
22483 : static bool
22484 102329 : expand_vec_perm_pshuflw_pshufhw (struct expand_vec_perm_d *d)
22485 : {
22486 102329 : unsigned char perm2[MAX_VECT_LEN];
22487 102329 : unsigned i;
22488 102329 : bool ok;
22489 :
22490 102329 : if (d->vmode != V8HImode || !d->one_operand_p)
22491 : return false;
22492 :
22493 : /* The two permutations only operate in 64-bit lanes. */
22494 12835 : for (i = 0; i < 4; ++i)
22495 10358 : if (d->perm[i] >= 4)
22496 : return false;
22497 12329 : for (i = 4; i < 8; ++i)
22498 9866 : if (d->perm[i] < 4)
22499 : return false;
22500 :
22501 2463 : if (d->testing_p)
22502 : return true;
22503 :
22504 : /* Emit the pshuflw. */
22505 134 : memcpy (perm2, d->perm, 4);
22506 670 : for (i = 4; i < 8; ++i)
22507 536 : perm2[i] = i;
22508 134 : ok = expand_vselect (d->target, d->op0, perm2, 8, d->testing_p);
22509 134 : gcc_assert (ok);
22510 :
22511 : /* Emit the pshufhw. */
22512 134 : memcpy (perm2 + 4, d->perm + 4, 4);
22513 670 : for (i = 0; i < 4; ++i)
22514 536 : perm2[i] = i;
22515 134 : ok = expand_vselect (d->target, d->target, perm2, 8, d->testing_p);
22516 134 : gcc_assert (ok);
22517 :
22518 : return true;
22519 : }
22520 :
22521 : /* Try to permute 2 64-bit vectors by punpckldq + 128-bit vector shuffle. */
/* Try to permute 2 64-bit vectors by punpckldq + 128-bit vector shuffle. */
static bool
expand_vec_perm_punpckldq_pshuf (struct expand_vec_perm_d *d)
{
  /* Only two-operand 64-bit vector modes, and only when MMX operations
     are performed in SSE registers.  */
  if (GET_MODE_BITSIZE (d->vmode) != 64
      || !TARGET_MMX_WITH_SSE
      || d->one_operand_p)
    return false;

  /* WIDEN_VMODE is the 128-bit mode holding both 64-bit inputs
     concatenated.  */
  machine_mode widen_vmode;
  switch (d->vmode)
    {
    /* pshufd.  */
    case E_V2SImode:
      widen_vmode = V4SImode;
      break;

    /* pshufd.  */
    case E_V2SFmode:
      widen_vmode = V4SFmode;
      break;

    case E_V4HImode:
      widen_vmode = V8HImode;
      /* pshufb.  */
      if (!TARGET_SSSE3)
	return false;
      break;

    case E_V8QImode:
      /* pshufb.  */
      widen_vmode = V16QImode;
      if (!TARGET_SSSE3)
	return false;
      break;

    default:
      return false;
    }

  if (d->testing_p)
    return true;

  /* Concatenate op0 and op1 into one 128-bit register, then perform a
     single-operand permutation on the wide vector.  */
  struct expand_vec_perm_d dperm;
  dperm.target = gen_reg_rtx (widen_vmode);
  rtx op0 = gen_reg_rtx (widen_vmode);
  emit_move_insn (op0, gen_rtx_VEC_CONCAT (widen_vmode, d->op0, d->op1));
  dperm.op0 = op0;
  dperm.op1 = op0;
  dperm.vmode = widen_vmode;
  unsigned nelt = GET_MODE_NUNITS (widen_vmode);
  dperm.nelt = nelt;
  dperm.one_operand_p = true;
  dperm.testing_p = false;

  /* Duplicate the wanted selection into both halves of the wide perm;
     only the low half of the result is used below.  */
  for (unsigned i = 0; i != nelt / 2; i++)
    {
      dperm.perm[i] = d->perm[i];
      dperm.perm[i + nelt / 2] = d->perm[i];
    }

  gcc_assert (expand_vec_perm_1 (&dperm));
  emit_move_insn (d->target, lowpart_subreg (d->vmode,
					     dperm.target,
					     dperm.vmode));
  return true;
}
22588 :
22589 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
22590 : the permutation using the SSSE3 palignr instruction. This succeeds
22591 : when all of the elements in PERM fit within one vector and we merely
22592 : need to shift them down so that a single vector permutation has a
22593 : chance to succeed. If SINGLE_INSN_ONLY_P, succeed if only
22594 : the vpalignr instruction itself can perform the requested permutation. */
22595 :
static bool
expand_vec_perm_palignr (struct expand_vec_perm_d *d, bool single_insn_only_p)
{
  unsigned i, nelt = d->nelt;
  unsigned min, max, minswap, maxswap;
  bool in_order, ok, swap = false;
  rtx shift, target;
  struct expand_vec_perm_d dcopy;

  /* Even with AVX, palignr only operates on 128-bit vectors,
     in AVX2 palignr operates on both 128-bit lanes.  */
  if ((!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
      && (!TARGET_AVX2 || GET_MODE_SIZE (d->vmode) != 32))
    return false;

  /* Compute the index range both for the permutation as given (min/max)
     and with the two operands conceptually swapped (minswap/maxswap);
     palignr is usable if either range fits in one vector's width.  */
  min = 2 * nelt;
  max = 0;
  minswap = 2 * nelt;
  maxswap = 0;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      unsigned eswap = d->perm[i] ^ nelt;
      if (GET_MODE_SIZE (d->vmode) == 32)
	{
	  /* For 256-bit palignr the shift happens per 128-bit lane;
	     fold the index into a per-lane position.  */
	  e = (e & ((nelt / 2) - 1)) | ((e & nelt) >> 1);
	  eswap = e ^ (nelt / 2);
	}
      if (e < min)
	min = e;
      if (e > max)
	max = e;
      if (eswap < minswap)
	minswap = eswap;
      if (eswap > maxswap)
	maxswap = eswap;
    }
  if (min == 0
      || max - min >= (GET_MODE_SIZE (d->vmode) == 32 ? nelt / 2 : nelt))
    {
      /* The given order doesn't fit; fall back to the swapped-operand
	 form if that one does.  */
      if (d->one_operand_p
	  || minswap == 0
	  || maxswap - minswap >= (GET_MODE_SIZE (d->vmode) == 32
				   ? nelt / 2 : nelt))
	return false;
      swap = true;
      min = minswap;
      max = maxswap;
    }

  /* Given that we have SSSE3, we know we'll be able to implement the
     single operand permutation after the palignr with pshufb for
     128-bit vectors.  If SINGLE_INSN_ONLY_P, in_order has to be computed
     first.  */
  if (d->testing_p && GET_MODE_SIZE (d->vmode) == 16 && !single_insn_only_p)
    return true;

  dcopy = *d;
  if (swap)
    {
      dcopy.op0 = d->op1;
      dcopy.op1 = d->op0;
      for (i = 0; i < nelt; ++i)
	dcopy.perm[i] ^= nelt;
    }

  /* Rebase the indices relative to MIN; IN_ORDER tracks whether the
     rotation alone already yields the requested permutation.  */
  in_order = true;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = dcopy.perm[i];
      if (GET_MODE_SIZE (d->vmode) == 32
	  && e >= nelt
	  && (e & (nelt / 2 - 1)) < min)
	e = e - min - (nelt / 2);
      else
	e = e - min;
      if (e != i)
	in_order = false;
      dcopy.perm[i] = e;
    }
  dcopy.one_operand_p = true;

  if (single_insn_only_p && !in_order)
    return false;

  /* For AVX2, test whether we can permute the result in one instruction.  */
  if (d->testing_p)
    {
      if (in_order)
	return true;
      dcopy.op1 = dcopy.op0;
      return expand_vec_perm_1 (&dcopy);
    }

  /* Emit the byte rotation; MIN elements' worth of bits.  */
  shift = GEN_INT (min * GET_MODE_UNIT_BITSIZE (d->vmode));
  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      target = gen_reg_rtx (V1TImode);
      emit_insn (gen_ssse3_palignrv1ti (target,
					gen_lowpart (V1TImode, dcopy.op1),
					gen_lowpart (V1TImode, dcopy.op0),
					shift));
    }
  else
    {
      target = gen_reg_rtx (V2TImode);
      emit_insn (gen_avx2_palignrv2ti (target,
				       gen_lowpart (V2TImode, dcopy.op1),
				       gen_lowpart (V2TImode, dcopy.op0),
				       shift));
    }

  dcopy.op0 = dcopy.op1 = gen_lowpart (d->vmode, target);

  /* Test for the degenerate case where the alignment by itself
     produces the desired permutation.  */
  if (in_order)
    {
      emit_move_insn (d->target, dcopy.op0);
      return true;
    }

  ok = expand_vec_perm_1 (&dcopy);
  /* For 128-bit vectors the follow-up pshufb must succeed; for 256-bit
     ones the single-insn remap may legitimately fail.  */
  gcc_assert (ok || GET_MODE_SIZE (d->vmode) == 32);

  return ok;
}
22723 :
22724 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
22725 : the permutation using the SSE4_1 pblendv instruction. Potentially
22726 : reduces permutation from 2 pshufb and or to 1 pshufb and pblendv. */
22727 :
static bool
expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
{
  unsigned i, which, nelt = d->nelt;
  struct expand_vec_perm_d dcopy, dcopy1;
  machine_mode vmode = d->vmode;
  bool ok;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
	   && (GET_MODE_SIZE (vmode) == 16
	       || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
	       || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* Figure out where permutation elements stay not in their
     respective lanes.  WHICH bit 0 means a displaced element from op0,
     bit 1 one from op1.  */
  for (i = 0, which = 0; i < nelt; ++i)
    {
      unsigned e = d->perm[i];
      if (e != i)
	which |= (e < nelt ? 1 : 2);
    }
  /* We can pblend the part where elements stay not in their
     respective lanes only when these elements are all in one
     half of a permutation.
     {0 1 8 3 4 5 9 7} is ok as 8, 9 are at not at their respective
     lanes, but both 8 and 9 >= 8
     {0 1 8 3 4 5 2 7} is not ok as 2 and 8 are not at their
     respective lanes and 8 >= 8, but 2 not.  */
  if (which != 1 && which != 2)
    return false;
  if (d->testing_p && GET_MODE_SIZE (vmode) == 16)
    return true;

  /* First we apply one operand permutation to the part where
     elements stay not in their respective lanes.  */
  dcopy = *d;
  if (which == 2)
    dcopy.op0 = dcopy.op1 = d->op1;
  else
    dcopy.op0 = dcopy.op1 = d->op0;
  if (!d->testing_p)
    dcopy.target = gen_reg_rtx (vmode);
  dcopy.one_operand_p = true;

  for (i = 0; i < nelt; ++i)
    dcopy.perm[i] = d->perm[i] & (nelt - 1);

  ok = expand_vec_perm_1 (&dcopy);
  /* For 128-bit vectors the one-operand permutation is known to work
     (see the testing_p shortcut above); elsewhere it may fail.  */
  if (GET_MODE_SIZE (vmode) != 16 && !ok)
    return false;
  else
    gcc_assert (ok);
  if (d->testing_p)
    return true;

  /* Next we put permuted elements into their positions.  */
  dcopy1 = *d;
  if (which == 2)
    dcopy1.op1 = dcopy.target;
  else
    dcopy1.op0 = dcopy.target;

  /* Blend: pick lane I from op1 wherever the original asked for an
     op1 element, otherwise keep op0's lane.  */
  for (i = 0; i < nelt; ++i)
    dcopy1.perm[i] = ((d->perm[i] >= nelt) ? (nelt + i) : i);

  ok = expand_vec_perm_blend (&dcopy1);
  gcc_assert (ok);

  return true;
}
22808 :
22809 : static bool expand_vec_perm_interleave3 (struct expand_vec_perm_d *d);
22810 :
22811 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
22812 : a two vector permutation into a single vector permutation by using
22813 : an interleave operation to merge the vectors. */
22814 :
static bool
expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2;
  unsigned HOST_WIDE_INT contents;
  unsigned char remap[2 * MAX_VECT_LEN];
  rtx_insn *seq;
  bool ok, same_halves = false;

  if (GET_MODE_SIZE (d->vmode) == 4
      || GET_MODE_SIZE (d->vmode) == 8
      || GET_MODE_SIZE (d->vmode) == 16)
    {
      if (d->one_operand_p)
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
	return false;
      /* For 32-byte modes allow even d->one_operand_p.
	 The lack of cross-lane shuffling in some instructions
	 might prevent a single insn shuffle.  */
      dfinal = *d;
      dfinal.testing_p = true;
      /* If expand_vec_perm_interleave3 can expand this into
	 a 3 insn sequence, give up and let it be expanded as
	 3 insn sequence.  While that is one insn longer,
	 it doesn't need a memory operand and in the common
	 case that both interleave low and high permutations
	 with the same operands are adjacent needs 4 insns
	 for both after CSE.  */
      if (expand_vec_perm_interleave3 (&dfinal))
	return false;
    }
  else
    return false;

  /* Examine from whence the elements come.  CONTENTS is a bitmask of
     all indices (0 .. 2*nelt-1) the permutation references.  */
  contents = 0;
  for (i = 0; i < nelt; ++i)
    contents |= HOST_WIDE_INT_1U << d->perm[i];

  /* REMAP maps an original concatenated index to its position after
     the DREMAP shuffle; 0xff marks indices that are never referenced.  */
  memset (remap, 0xff, sizeof (remap));
  dremap = *d;

  if (GET_MODE_SIZE (d->vmode) == 4
      || GET_MODE_SIZE (d->vmode) == 8)
    {
      unsigned HOST_WIDE_INT h1, h2, h3, h4;

      /* Split the two input vectors into 4 halves.  */
      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
      h2 = h1 << nelt2;
      h3 = h2 << nelt2;
      h4 = h3 << nelt2;

      /* If the elements from the low halves use interleave low,
	 and similarly for interleave high.  */
      if ((contents & (h1 | h3)) == contents)
	{
	  /* punpckl* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	    }
	}
      else if ((contents & (h2 | h4)) == contents)
	{
	  /* punpckh* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i * 2;
	      remap[i + nelt + nelt2] = i * 2 + 1;
	      dremap.perm[i * 2] = i + nelt2;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
	    }
	}
      else
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 16)
    {
      unsigned HOST_WIDE_INT h1, h2, h3, h4;

      /* Split the two input vectors into 4 halves.  */
      h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
      h2 = h1 << nelt2;
      h3 = h2 << nelt2;
      h4 = h3 << nelt2;

      /* If the elements from the low halves use interleave low, and similarly
	 for interleave high.  If the elements are from mis-matched halves, we
	 can use shufps for V4SF/V4SI or do a DImode shuffle.  */
      if ((contents & (h1 | h3)) == contents)
	{
	  /* punpckl* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	    }
	  if (!TARGET_SSE2 && d->vmode == V4SImode)
	    dremap.vmode = V4SFmode;
	}
      else if ((contents & (h2 | h4)) == contents)
	{
	  /* punpckh* */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i * 2;
	      remap[i + nelt + nelt2] = i * 2 + 1;
	      dremap.perm[i * 2] = i + nelt2;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt2;
	    }
	  if (!TARGET_SSE2 && d->vmode == V4SImode)
	    dremap.vmode = V4SFmode;
	}
      else if ((contents & (h1 | h4)) == contents)
	{
	  /* shufps */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i] = i;
	      remap[i + nelt + nelt2] = i + nelt2;
	      dremap.perm[i] = i;
	      dremap.perm[i + nelt2] = i + nelt + nelt2;
	    }
	  if (nelt != 4)
	    {
	      /* shufpd */
	      dremap.vmode = V2DImode;
	      dremap.nelt = 2;
	      dremap.perm[0] = 0;
	      dremap.perm[1] = 3;
	    }
	}
      else if ((contents & (h2 | h3)) == contents)
	{
	  /* shufps */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nelt2] = i;
	      remap[i + nelt] = i + nelt2;
	      dremap.perm[i] = i + nelt2;
	      dremap.perm[i + nelt2] = i + nelt;
	    }
	  if (nelt != 4)
	    {
	      /* shufpd */
	      dremap.vmode = V2DImode;
	      dremap.nelt = 2;
	      dremap.perm[0] = 1;
	      dremap.perm[1] = 2;
	    }
	}
      else
	return false;
    }
  else
    {
      /* 32-byte modes: work at the granularity of 128-bit quarters of
	 the concatenated operand pair.  */
      unsigned int nelt4 = nelt / 4, nzcnt = 0;
      unsigned HOST_WIDE_INT q[8];
      unsigned int nonzero_halves[4];

      /* Split the two input vectors into 8 quarters.  */
      q[0] = (HOST_WIDE_INT_1U << nelt4) - 1;
      for (i = 1; i < 8; ++i)
	q[i] = q[0] << (nelt4 * i);
      for (i = 0; i < 4; ++i)
	if (((q[2 * i] | q[2 * i + 1]) & contents) != 0)
	  {
	    nonzero_halves[nzcnt] = i;
	    ++nzcnt;
	  }

      if (nzcnt == 1)
	{
	  gcc_assert (d->one_operand_p);
	  nonzero_halves[1] = nonzero_halves[0];
	  same_halves = true;
	}
      else if (d->one_operand_p)
	{
	  gcc_assert (nonzero_halves[0] == 0);
	  gcc_assert (nonzero_halves[1] == 1);
	}

      if (nzcnt <= 2)
	{
	  if (d->perm[0] / nelt2 == nonzero_halves[1])
	    {
	      /* Attempt to increase the likelihood that dfinal
		 shuffle will be intra-lane.  */
	      std::swap (nonzero_halves[0], nonzero_halves[1]);
	    }

	  /* vperm2f128 or vperm2i128.  */
	  for (i = 0; i < nelt2; ++i)
	    {
	      remap[i + nonzero_halves[1] * nelt2] = i + nelt2;
	      remap[i + nonzero_halves[0] * nelt2] = i;
	      dremap.perm[i + nelt2] = i + nonzero_halves[1] * nelt2;
	      dremap.perm[i] = i + nonzero_halves[0] * nelt2;
	    }

	  if (d->vmode != V8SFmode
	      && d->vmode != V4DFmode
	      && d->vmode != V8SImode)
	    {
	      /* Express the lane swap in V8SImode, for which the
		 vperm2* pattern exists.  */
	      dremap.vmode = V8SImode;
	      dremap.nelt = 8;
	      for (i = 0; i < 4; ++i)
		{
		  dremap.perm[i] = i + nonzero_halves[0] * 4;
		  dremap.perm[i + 4] = i + nonzero_halves[1] * 4;
		}
	    }
	}
      else if (d->one_operand_p)
	return false;
      else if (TARGET_AVX2
	       && (contents & (q[0] | q[2] | q[4] | q[6])) == contents)
	{
	  /* vpunpckl* */
	  for (i = 0; i < nelt4; ++i)
	    {
	      remap[i] = i * 2;
	      remap[i + nelt] = i * 2 + 1;
	      remap[i + nelt2] = i * 2 + nelt2;
	      remap[i + nelt + nelt2] = i * 2 + nelt2 + 1;
	      dremap.perm[i * 2] = i;
	      dremap.perm[i * 2 + 1] = i + nelt;
	      dremap.perm[i * 2 + nelt2] = i + nelt2;
	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2;
	    }
	}
      else if (TARGET_AVX2
	       && (contents & (q[1] | q[3] | q[5] | q[7])) == contents)
	{
	  /* vpunpckh* */
	  for (i = 0; i < nelt4; ++i)
	    {
	      remap[i + nelt4] = i * 2;
	      remap[i + nelt + nelt4] = i * 2 + 1;
	      remap[i + nelt2 + nelt4] = i * 2 + nelt2;
	      remap[i + nelt + nelt2 + nelt4] = i * 2 + nelt2 + 1;
	      dremap.perm[i * 2] = i + nelt4;
	      dremap.perm[i * 2 + 1] = i + nelt + nelt4;
	      dremap.perm[i * 2 + nelt2] = i + nelt2 + nelt4;
	      dremap.perm[i * 2 + nelt2 + 1] = i + nelt + nelt2 + nelt4;
	    }
	}
      else
	return false;
    }

  /* Use the remapping array set up above to move the elements from their
     swizzled locations into their final destinations.  */
  dfinal = *d;
  for (i = 0; i < nelt; ++i)
    {
      unsigned e = remap[d->perm[i]];
      gcc_assert (e < nelt);
      /* If same_halves is true, both halves of the remapped vector are the
	 same.  Avoid cross-lane accesses if possible.  */
      if (same_halves && i >= nelt2)
	{
	  gcc_assert (e < nelt2);
	  dfinal.perm[i] = e + nelt2;
	}
      else
	dfinal.perm[i] = e;
    }
  if (!d->testing_p)
    {
      dremap.target = gen_reg_rtx (dremap.vmode);
      dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
    }
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;

  /* Test if the final remap can be done with a single insn.  For V4SFmode or
     V4SImode this *will* succeed.  For V8HImode or V16QImode it may not.  */
  start_sequence ();
  ok = expand_vec_perm_1 (&dfinal);
  seq = end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  if (dremap.vmode != dfinal.vmode)
    {
      dremap.op0 = gen_lowpart (dremap.vmode, dremap.op0);
      dremap.op1 = gen_lowpart (dremap.vmode, dremap.op1);
    }

  /* Emit the interleave/lane-shuffle first, then the recorded final
     remap sequence on top of its result.  */
  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  emit_insn (seq);
  return true;
}
23127 :
23128 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
23129 : a single vector cross-lane permutation into vpermq followed
23130 : by any of the single insn permutations. */
23131 :
static bool
expand_vec_perm_vpermq_perm_1 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dremap, dfinal;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, nelt4 = nelt / 4;
  unsigned contents[2];
  bool ok;

  /* Only one-operand AVX2 byte/word permutations are handled here;
     wider-element modes have direct cross-lane shuffles.  */
  if (!(TARGET_AVX2
	&& (d->vmode == V32QImode || d->vmode == V16HImode)
	&& d->one_operand_p))
    return false;

  /* CONTENTS[k] is a 4-bit mask of which 64-bit quarters of the input
     feed the k-th 128-bit lane of the result.  */
  contents[0] = 0;
  contents[1] = 0;
  for (i = 0; i < nelt2; ++i)
    {
      contents[0] |= 1u << (d->perm[i] / nelt4);
      contents[1] |= 1u << (d->perm[i + nelt2] / nelt4);
    }

  /* Each result lane can be sourced from at most two quarters, since
     vpermq gathers only two quarters per 128-bit lane.  */
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
	if ((contents[i] & (1u << j)) != 0 && ++cnt > 2)
	  return false;
    }

  if (d->testing_p)
    return true;

  /* vpermq: gather the needed quarters so that each result lane holds
     everything its elements come from.  */
  dremap = *d;
  dremap.vmode = V4DImode;
  dremap.nelt = 4;
  dremap.target = gen_reg_rtx (V4DImode);
  dremap.op0 = gen_lowpart (V4DImode, d->op0);
  dremap.op1 = dremap.op0;
  dremap.one_operand_p = true;
  for (i = 0; i < 2; ++i)
    {
      unsigned int cnt = 0;
      for (j = 0; j < 4; ++j)
	if ((contents[i] & (1u << j)) != 0)
	  dremap.perm[2 * i + cnt++] = j;
      /* Pad unused slots with quarter 0.  */
      for (; cnt < 2; ++cnt)
	dremap.perm[2 * i + cnt] = 0;
    }

  /* Rewrite the original permutation in terms of the gathered vector;
     it is now intra-lane and a single insn can finish the job.  */
  dfinal = *d;
  dfinal.op0 = gen_lowpart (dfinal.vmode, dremap.target);
  dfinal.op1 = dfinal.op0;
  dfinal.one_operand_p = true;
  for (i = 0, j = 0; i < nelt; ++i)
    {
      if (i == nelt2)
	j = 2;
      dfinal.perm[i] = (d->perm[i] & (nelt4 - 1)) | (j ? nelt2 : 0);
      if ((d->perm[i] / nelt4) == dremap.perm[j])
	;
      else if ((d->perm[i] / nelt4) == dremap.perm[j + 1])
	dfinal.perm[i] |= nelt4;
      else
	gcc_unreachable ();
    }

  ok = expand_vec_perm_1 (&dremap);
  gcc_assert (ok);

  ok = expand_vec_perm_1 (&dfinal);
  gcc_assert (ok);

  return true;
}
23206 :
23207 : static bool canonicalize_perm (struct expand_vec_perm_d *d);
23208 :
23209 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to expand
23210 : a vector permutation using two instructions, vperm2f128 resp.
23211 : vperm2i128 followed by any single in-lane permutation. */
23212 :
static bool
expand_vec_perm_vperm2f128 (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, nelt = d->nelt, nelt2 = nelt / 2, perm;
  bool ok;

  /* vperm2f128/vperm2i128 operate on 32-byte vectors; integer modes
     additionally require AVX2 for the following in-lane shuffle.  */
  if (!TARGET_AVX
      || GET_MODE_SIZE (d->vmode) != 32
      || (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2))
    return false;

  dsecond = *d;
  dsecond.one_operand_p = false;
  dsecond.testing_p = true;

  /* ((perm << 2)|perm) & 0x33 is the vperm2[fi]128
     immediate.  For perm < 16 the second permutation uses
     d->op0 as first operand, for perm >= 16 it uses d->op1
     as first operand.  The second operand is the result of
     vperm2[fi]128.  */
  for (perm = 0; perm < 32; perm++)
    {
      /* Ignore permutations which do not move anything cross-lane.  */
      if (perm < 16)
	{
	  /* The second shuffle for e.g. V4DFmode has
	     0123 and ABCD operands.
	     Ignore AB23, as 23 is already in the second lane
	     of the first operand.  */
	  if ((perm & 0xc) == (1 << 2)) continue;
	  /* And 01CD, as 01 is in the first lane of the first
	     operand.  */
	  if ((perm & 3) == 0) continue;
	  /* And 4567, as then the vperm2[fi]128 doesn't change
	     anything on the original 4567 second operand.  */
	  if ((perm & 0xf) == ((3 << 2) | 2)) continue;
	}
      else
	{
	  /* The second shuffle for e.g. V4DFmode has
	     4567 and ABCD operands.
	     Ignore AB67, as 67 is already in the second lane
	     of the first operand.  */
	  if ((perm & 0xc) == (3 << 2)) continue;
	  /* And 45CD, as 45 is in the first lane of the first
	     operand.  */
	  if ((perm & 3) == 2) continue;
	  /* And 0123, as then the vperm2[fi]128 doesn't change
	     anything on the original 0123 first operand.  */
	  if ((perm & 0xf) == (1 << 2)) continue;
	}

      /* Check whether every element can be sourced either from the
	 lane the vperm2[fi]128 result would deliver (first case) or
	 from the untouched original operand (second case); any other
	 element makes this immediate unusable.  */
      for (i = 0; i < nelt; i++)
	{
	  j = d->perm[i] / nelt2;
	  if (j == ((perm >> (2 * (i >= nelt2))) & 3))
	    dsecond.perm[i] = nelt + (i & nelt2) + (d->perm[i] & (nelt2 - 1));
	  else if (j == (unsigned) (i >= nelt2) + 2 * (perm >= 16))
	    dsecond.perm[i] = d->perm[i] & (nelt - 1);
	  else
	    break;
	}

      /* Only probe (testing_p) whether the candidate second shuffle
	 is a single insn; nothing is emitted here.  */
      if (i == nelt)
	{
	  start_sequence ();
	  ok = expand_vec_perm_1 (&dsecond);
	  end_sequence ();
	}
      else
	ok = false;

      if (ok)
	{
	  if (d->testing_p)
	    return true;

	  /* Found a usable second shuffle.  dfirst will be
	     vperm2f128 on d->op0 and d->op1.  */
	  dsecond.testing_p = false;
	  dfirst = *d;
	  dfirst.target = gen_reg_rtx (d->vmode);
	  for (i = 0; i < nelt; i++)
	    dfirst.perm[i] = (i & (nelt2 - 1))
			     + ((perm >> (2 * (i >= nelt2))) & 3) * nelt2;

	  canonicalize_perm (&dfirst);
	  ok = expand_vec_perm_1 (&dfirst);
	  gcc_assert (ok);

	  /* And dsecond is some single insn shuffle, taking
	     d->op0 and result of vperm2f128 (if perm < 16) or
	     d->op1 and result of vperm2f128 (otherwise).  */
	  if (perm >= 16)
	    dsecond.op0 = dsecond.op1;
	  dsecond.op1 = dfirst.target;

	  ok = expand_vec_perm_1 (&dsecond);
	  gcc_assert (ok);

	  return true;
	}

      /* For one operand, the only useful vperm2f128 permutation is 0x01
	 aka lanes swap.  */
      if (d->one_operand_p)
	return false;
    }

  return false;
}
23325 :
23326 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
23327 : a two vector permutation using 2 intra-lane interleave insns
23328 : and cross-lane shuffle for 32-byte vectors. */
23329 :
23330 : static bool
23331 34319 : expand_vec_perm_interleave3 (struct expand_vec_perm_d *d)
23332 : {
23333 34319 : unsigned i, nelt;
23334 34319 : rtx (*gen) (rtx, rtx, rtx);
23335 :
23336 34319 : if (d->one_operand_p)
23337 : return false;
23338 33087 : if (TARGET_AVX2 && GET_MODE_SIZE (d->vmode) == 32)
23339 : ;
23340 24880 : else if (TARGET_AVX && (d->vmode == V8SFmode || d->vmode == V4DFmode))
23341 : ;
23342 : else
23343 : return false;
23344 :
23345 9717 : nelt = d->nelt;
23346 9717 : if (d->perm[0] != 0 && d->perm[0] != nelt / 2)
23347 : return false;
23348 9877 : for (i = 0; i < nelt; i += 2)
23349 9521 : if (d->perm[i] != d->perm[0] + i / 2
23350 8648 : || d->perm[i + 1] != d->perm[0] + i / 2 + nelt)
23351 : return false;
23352 :
23353 356 : if (d->testing_p)
23354 : return true;
23355 :
23356 56 : switch (d->vmode)
23357 : {
23358 32 : case E_V32QImode:
23359 32 : if (d->perm[0])
23360 : gen = gen_vec_interleave_highv32qi;
23361 : else
23362 16 : gen = gen_vec_interleave_lowv32qi;
23363 : break;
23364 18 : case E_V16HImode:
23365 18 : if (d->perm[0])
23366 : gen = gen_vec_interleave_highv16hi;
23367 : else
23368 9 : gen = gen_vec_interleave_lowv16hi;
23369 : break;
23370 0 : case E_V8SImode:
23371 0 : if (d->perm[0])
23372 : gen = gen_vec_interleave_highv8si;
23373 : else
23374 0 : gen = gen_vec_interleave_lowv8si;
23375 : break;
23376 4 : case E_V4DImode:
23377 4 : if (d->perm[0])
23378 : gen = gen_vec_interleave_highv4di;
23379 : else
23380 2 : gen = gen_vec_interleave_lowv4di;
23381 : break;
23382 2 : case E_V8SFmode:
23383 2 : if (d->perm[0])
23384 : gen = gen_vec_interleave_highv8sf;
23385 : else
23386 1 : gen = gen_vec_interleave_lowv8sf;
23387 : break;
23388 0 : case E_V4DFmode:
23389 0 : if (d->perm[0])
23390 : gen = gen_vec_interleave_highv4df;
23391 : else
23392 0 : gen = gen_vec_interleave_lowv4df;
23393 : break;
23394 0 : default:
23395 0 : gcc_unreachable ();
23396 : }
23397 :
23398 56 : emit_insn (gen (d->target, d->op0, d->op1));
23399 56 : return true;
23400 : }
23401 :
23402 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
23403 : a single vector permutation using a single intra-lane vector
23404 : permutation, vperm2f128 swapping the lanes and vblend* insn blending
23405 : the non-swapped and swapped vectors together. */
23406 :
static bool
expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2;
  rtx_insn *seq;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  /* Only useful for plain AVX float/double modes; with AVX2 better
     single-insn alternatives exist, so bail out there.  */
  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || !d->one_operand_p)
    return false;

  /* Route every requested element to slot i if it lives in the lane
     the element is destined for, or to the mirrored slot i^nelt2 if
     it must come from the other lane (handled by the lane swap).
     0xff marks a slot not yet assigned.  */
  dfirst = *d;
  for (i = 0; i < nelt; i++)
    dfirst.perm[i] = 0xff;
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      /* Two destinations demanding different elements from the same
	 slot cannot both be satisfied.  */
      if (dfirst.perm[j] != 0xff && dfirst.perm[j] != d->perm[i])
	return false;
      dfirst.perm[j] = d->perm[i];
      /* msk collects the destinations served by the swapped copy.  */
      if (j != i)
	msk |= (1 << i);
    }
  /* Fill unconstrained slots with the identity.  */
  for (i = 0; i < nelt; i++)
    if (dfirst.perm[i] == 0xff)
      dfirst.perm[i] = i;

  if (!d->testing_p)
    dfirst.target = gen_reg_rtx (dfirst.vmode);

  /* Probe the in-lane shuffle into a pending sequence so nothing is
     emitted if it turns out not to be a single insn.  */
  start_sequence ();
  ok = expand_vec_perm_1 (&dfirst);
  seq = end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq);

  /* dsecond swaps the two 128-bit lanes of the shuffled vector.  */
  dsecond = *d;
  dsecond.op0 = dfirst.target;
  dsecond.op1 = dfirst.target;
  dsecond.one_operand_p = true;
  dsecond.target = gen_reg_rtx (dsecond.vmode);
  for (i = 0; i < nelt; i++)
    dsecond.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dsecond);
  gcc_assert (ok);

  /* Blend the non-swapped and swapped vectors; msk picks the swapped
     element wherever the request crossed lanes.  */
  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dsecond.target, GEN_INT (msk)));
  return true;
}
23468 :
23469 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
23470 : a two vector permutation using two single vector permutations and
23471 : {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
23472 : of dfirst or dsecond is identity permutation. */
23473 :
static bool
expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  bool ident1 = true, ident2 = true;

  if (d->one_operand_p)
    return false;

  /* 16-byte vectors need SSE (SSE2 for non-float element modes);
     32-byte vectors need AVX (AVX2 likewise), and their unpacks work
     per 128-bit lane, hence lane = nelt2 there.  */
  if (GET_MODE_SIZE (d->vmode) == 16)
    {
      if (!TARGET_SSE)
	return false;
      if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
	return false;
    }
  else if (GET_MODE_SIZE (d->vmode) == 32)
    {
      if (!TARGET_AVX)
	return false;
      if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
	return false;
      lane = nelt2;
    }
  else
    return false;

  /* The permutation must strictly alternate between the two operands
     (whichever one element 0 comes from).  */
  for (i = 1; i < nelt; i++)
    if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
      return false;

  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  /* Split d into two one-operand permutations that gather each
     operand's elements into the low half of every lane (duplicated
     into the high half), so a final unpack-low interleaves them.
     identN tracks whether the gathered order is already the identity,
     in which case that shuffle can be skipped.  */
  for (i = 0; i < nelt; i++)
    if (d->perm[i] >= nelt)
      {
	dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
	if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
	  ident2 = false;
	dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
	  = d->perm[i] - nelt;
      }
    else
      {
	dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
	if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
	  ident1 = false;
	dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
      }

  /* In two_insn mode at least one side must be an identity so the
     whole expansion stays at two instructions.  */
  if (two_insn && !ident1 && !ident2)
    return false;

  if (!d->testing_p)
    {
      if (!ident1)
	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
      /* If element 0 comes from op1, the interleave operands swap.  */
      if (d->perm[0] >= nelt)
	std::swap (dfinal.op0, dfinal.op1);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  /* Probe each non-identity gather into a pending sequence; emit only
     after both are known to be single insns.  */
  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = end_sequence ();

      if (!ok)
	return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = end_sequence ();

      if (!ok)
	return false;
    }

  if (d->testing_p)
    return true;

  /* Build the interleave pattern: 0, nelt, 1, nelt+1, ... per lane.  */
  for (i = 0; i < nelt; i++)
    {
      dfinal.perm[i] = i / 2;
      if (i >= lane)
	dfinal.perm[i] += lane / 2;
      if ((i & 1) != 0)
	dfinal.perm[i] += nelt;
    }
  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
			       dfinal.perm, dfinal.nelt, false);
  gcc_assert (ok);
  return true;
}
23585 :
23586 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
23587 : the permutation using two single vector permutations and the SSE4_1 pblendv
23588 : instruction. If two_insn, succeed only if one of dfirst or dsecond is
23589 : identity permutation. */
23590 :
static bool
expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
{
  unsigned i, nelt = d->nelt;
  struct expand_vec_perm_d dfirst, dsecond, dfinal;
  machine_mode vmode = d->vmode;
  bool ident1 = true, ident2 = true;

  /* Use the same checks as in expand_vec_perm_blend.  */
  if (d->one_operand_p)
    return false;
  if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
    ;
  else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
    ;
  else if (TARGET_SSE4_1
	   && (GET_MODE_SIZE (vmode) == 16
	       || (TARGET_MMX_WITH_SSE && GET_MODE_SIZE (vmode) == 8)
	       || GET_MODE_SIZE (vmode) == 4))
    ;
  else
    return false;

  /* Split d into two one-operand in-place shuffles (dfirst for op0's
     elements, dsecond for op1's); positions owned by the other operand
     are marked 0xff for now.  identN tracks whether that side is a
     plain identity and so needs no shuffle at all.  */
  dfirst = *d;
  dsecond = *d;
  dfinal = *d;
  dfirst.op1 = dfirst.op0;
  dfirst.one_operand_p = true;
  dsecond.op0 = dsecond.op1;
  dsecond.one_operand_p = true;

  for (i = 0; i < nelt; ++i)
    if (d->perm[i] >= nelt)
      {
	dfirst.perm[i] = 0xff;
	dsecond.perm[i] = d->perm[i] - nelt;
	if (d->perm[i] != i + nelt)
	  ident2 = false;
      }
    else
      {
	dsecond.perm[i] = 0xff;
	dfirst.perm[i] = d->perm[i];
	if (d->perm[i] != i)
	  ident1 = false;
      }

  if (two_insn && !ident1 && !ident2)
    return false;

  /* For now.  Ideally treat 0xff as a wildcard.  */
  for (i = 0; i < nelt; ++i)
    if (dfirst.perm[i] == 0xff)
      {
	/* For 32-byte modes, mirror the constrained element from the
	   other 128-bit lane (keeping the shuffle lane-symmetric and
	   thus cheaper), otherwise fall back to the identity.  */
	if (GET_MODE_SIZE (vmode) == 32
	    && dfirst.perm[i ^ (nelt / 2)] != 0xff)
	  dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dfirst.perm[i] = i;
      }
    else
      {
	if (GET_MODE_SIZE (vmode) == 32
	    && dsecond.perm[i ^ (nelt / 2)] != 0xff)
	  dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
	else
	  dsecond.perm[i] = i;
      }

  if (!d->testing_p)
    {
      if (!ident1)
	dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
      if (!ident2)
	dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
    }

  bool ok;
  rtx_insn *seq1 = NULL, *seq2 = NULL;

  /* Probe each non-identity shuffle into a pending sequence; emit only
     once both are known to be single insns.  */
  if (!ident1)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dfirst);
      seq1 = end_sequence ();

      if (!ok)
	return false;
    }

  if (!ident2)
    {
      start_sequence ();
      ok = expand_vec_perm_1 (&dsecond);
      seq2 = end_sequence ();

      if (!ok)
	return false;
    }

  if (d->testing_p)
    return true;

  /* The final blend simply picks, per position, whichever shuffled
     vector owns that element.  */
  for (i = 0; i < nelt; ++i)
    dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);

  emit_insn (seq1);
  emit_insn (seq2);
  ok = expand_vec_perm_blend (&dfinal);
  gcc_assert (ok);
  return true;
}
23703 :
23704 : /* A subroutine of ix86_expand_vec_perm_const_1.
23705 : Implement a permutation with psrlw, psllw and por.
23706 : It handles case:
23707 : __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14);
23708 : __builtin_shufflevector (v,v,1,0,3,2,5,4,7,6); */
23709 :
23710 : static bool
23711 25992 : expand_vec_perm_psrlw_psllw_por (struct expand_vec_perm_d *d)
23712 : {
23713 25992 : unsigned i;
23714 25992 : rtx (*gen_shr) (rtx, rtx, rtx);
23715 25992 : rtx (*gen_shl) (rtx, rtx, rtx);
23716 25992 : rtx (*gen_or) (rtx, rtx, rtx);
23717 25992 : machine_mode mode = VOIDmode;
23718 :
23719 25992 : if (!TARGET_SSE2 || !d->one_operand_p)
23720 : return false;
23721 :
23722 5185 : switch (d->vmode)
23723 : {
23724 1395 : case E_V8QImode:
23725 1395 : if (!TARGET_MMX_WITH_SSE)
23726 : return false;
23727 : mode = V4HImode;
23728 : gen_shr = gen_lshrv4hi3;
23729 : gen_shl = gen_ashlv4hi3;
23730 : gen_or = gen_iorv4hi3;
23731 : break;
23732 : case E_V16QImode:
23733 : mode = V8HImode;
23734 : gen_shr = gen_lshrv8hi3;
23735 : gen_shl = gen_ashlv8hi3;
23736 : gen_or = gen_iorv8hi3;
23737 : break;
23738 : default: return false;
23739 : }
23740 :
23741 3082 : if (!rtx_equal_p (d->op0, d->op1))
23742 : return false;
23743 :
23744 12122 : for (i = 0; i < d->nelt; i += 2)
23745 10684 : if (d->perm[i] != i + 1 || d->perm[i + 1] != i)
23746 : return false;
23747 :
23748 1438 : if (d->testing_p)
23749 : return true;
23750 :
23751 26 : rtx tmp1 = gen_reg_rtx (mode);
23752 26 : rtx tmp2 = gen_reg_rtx (mode);
23753 26 : rtx op0 = force_reg (d->vmode, d->op0);
23754 :
23755 26 : emit_move_insn (tmp1, lowpart_subreg (mode, op0, d->vmode));
23756 26 : emit_move_insn (tmp2, lowpart_subreg (mode, op0, d->vmode));
23757 26 : emit_insn (gen_shr (tmp1, tmp1, GEN_INT (8)));
23758 26 : emit_insn (gen_shl (tmp2, tmp2, GEN_INT (8)));
23759 26 : emit_insn (gen_or (tmp1, tmp1, tmp2));
23760 26 : emit_move_insn (d->target, lowpart_subreg (d->vmode, tmp1, mode));
23761 :
23762 26 : return true;
23763 : }
23764 :
23765 : /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
23766 : permutation using two vperm2f128, followed by a vshufpd insn blending
23767 : the two vectors together. */
23768 :
23769 : static bool
23770 29841 : expand_vec_perm_2vperm2f128_vshuf (struct expand_vec_perm_d *d)
23771 : {
23772 29841 : struct expand_vec_perm_d dfirst, dsecond, dthird;
23773 29841 : bool ok;
23774 :
23775 29841 : if (!TARGET_AVX || (d->vmode != V4DFmode))
23776 : return false;
23777 :
23778 1277 : if (d->testing_p)
23779 : return true;
23780 :
23781 206 : dfirst = *d;
23782 206 : dsecond = *d;
23783 206 : dthird = *d;
23784 :
23785 206 : dfirst.perm[0] = (d->perm[0] & ~1);
23786 206 : dfirst.perm[1] = (d->perm[0] & ~1) + 1;
23787 206 : dfirst.perm[2] = (d->perm[2] & ~1);
23788 206 : dfirst.perm[3] = (d->perm[2] & ~1) + 1;
23789 206 : dsecond.perm[0] = (d->perm[1] & ~1);
23790 206 : dsecond.perm[1] = (d->perm[1] & ~1) + 1;
23791 206 : dsecond.perm[2] = (d->perm[3] & ~1);
23792 206 : dsecond.perm[3] = (d->perm[3] & ~1) + 1;
23793 206 : dthird.perm[0] = (d->perm[0] % 2);
23794 206 : dthird.perm[1] = (d->perm[1] % 2) + 4;
23795 206 : dthird.perm[2] = (d->perm[2] % 2) + 2;
23796 206 : dthird.perm[3] = (d->perm[3] % 2) + 6;
23797 :
23798 206 : dfirst.target = gen_reg_rtx (dfirst.vmode);
23799 206 : dsecond.target = gen_reg_rtx (dsecond.vmode);
23800 206 : dthird.op0 = dfirst.target;
23801 206 : dthird.op1 = dsecond.target;
23802 206 : dthird.one_operand_p = false;
23803 :
23804 206 : canonicalize_perm (&dfirst);
23805 206 : canonicalize_perm (&dsecond);
23806 :
23807 206 : ok = expand_vec_perm_1 (&dfirst)
23808 206 : && expand_vec_perm_1 (&dsecond)
23809 412 : && expand_vec_perm_1 (&dthird);
23810 :
23811 0 : gcc_assert (ok);
23812 :
23813 : return true;
23814 : }
23815 :
23816 : static bool ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *);
23817 :
23818 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
23819 : a two vector permutation using two intra-lane vector
23820 : permutations, vperm2f128 swapping the lanes and vblend* insn blending
23821 : the non-swapped and swapped vectors together. */
23822 :
static bool
expand_vec_perm2_vperm2f128_vblend (struct expand_vec_perm_d *d)
{
  struct expand_vec_perm_d dfirst, dsecond, dthird;
  unsigned i, j, msk, nelt = d->nelt, nelt2 = nelt / 2, which1 = 0, which2 = 0;
  rtx_insn *seq1, *seq2;
  bool ok;
  rtx (*blend) (rtx, rtx, rtx, rtx) = NULL;

  /* Only useful for plain AVX float/double modes; with AVX2 cheaper
     alternatives exist, so bail out there.  */
  if (!TARGET_AVX
      || TARGET_AVX2
      || (d->vmode != V8SFmode && d->vmode != V4DFmode)
      || d->one_operand_p)
    return false;

  /* Partition the requests: slot i gets an element already destined
     for its own lane (dfirst), slot i^nelt2 one that must cross lanes
     (dsecond, later lane-swapped).  0xff marks unassigned slots;
     whichN records which operands each side draws from, msk the
     destinations served by the swapped copy.  */
  dfirst = *d;
  dsecond = *d;
  for (i = 0; i < nelt; i++)
    {
      dfirst.perm[i] = 0xff;
      dsecond.perm[i] = 0xff;
    }
  for (i = 0, msk = 0; i < nelt; i++)
    {
      j = (d->perm[i] & nelt2) ? i | nelt2 : i & ~nelt2;
      if (j == i)
	{
	  dfirst.perm[j] = d->perm[i];
	  which1 |= (d->perm[i] < nelt ? 1 : 2);
	}
      else
	{
	  dsecond.perm[j] = d->perm[i];
	  which2 |= (d->perm[i] < nelt ? 1 : 2);
	  msk |= (1U << i);
	}
    }
  /* All-zero or all-ones msk means no lane crossing at all (or only
     crossing); other strategies handle those cases.  */
  if (msk == 0 || msk == (1U << nelt) - 1)
    return false;

  if (!d->testing_p)
    {
      dfirst.target = gen_reg_rtx (dfirst.vmode);
      dsecond.target = gen_reg_rtx (dsecond.vmode);
    }

  /* Fill unassigned slots with an identity element, taken from op1
     when that side reads exclusively from op1 (whichN == 2).  */
  for (i = 0; i < nelt; i++)
    {
      if (dfirst.perm[i] == 0xff)
	dfirst.perm[i] = (which1 == 2 ? i + nelt : i);
      if (dsecond.perm[i] == 0xff)
	dsecond.perm[i] = (which2 == 2 ? i + nelt : i);
    }
  /* Recursively expand each side into a pending sequence so nothing is
     emitted unless both succeed.  */
  canonicalize_perm (&dfirst);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dfirst);
  seq1 = end_sequence ();

  if (!ok)
    return false;

  canonicalize_perm (&dsecond);
  start_sequence ();
  ok = ix86_expand_vec_perm_const_1 (&dsecond);
  seq2 = end_sequence ();

  if (!ok)
    return false;

  if (d->testing_p)
    return true;

  emit_insn (seq1);
  emit_insn (seq2);

  /* dthird swaps the two 128-bit lanes of dsecond's result.  */
  dthird = *d;
  dthird.op0 = dsecond.target;
  dthird.op1 = dsecond.target;
  dthird.one_operand_p = true;
  dthird.target = gen_reg_rtx (dthird.vmode);
  for (i = 0; i < nelt; i++)
    dthird.perm[i] = i ^ nelt2;

  ok = expand_vec_perm_1 (&dthird);
  gcc_assert (ok);

  /* Blend per msk: swapped elements where the request crossed lanes.  */
  blend = d->vmode == V8SFmode ? gen_avx_blendps256 : gen_avx_blendpd256;
  emit_insn (blend (d->target, dfirst.target, dthird.target, GEN_INT (msk)));
  return true;
}
23913 :
23914 : /* A subroutine of expand_vec_perm_even_odd_1. Implement the double-word
23915 : permutation with two pshufb insns and an ior. We should have already
23916 : failed all two instruction sequences. */
23917 :
static bool
expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
  rtx rperm[2][16], vperm, l, h, op, m128;
  unsigned int i, nelt, eltsz;
  machine_mode mode;
  rtx (*gen) (rtx, rtx, rtx);

  /* pshufb exists for 16-, 8- and 4-byte vectors (the latter two via
     the MMX-with-SSE forms).  */
  if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
			&& GET_MODE_SIZE (d->vmode) != 8
			&& GET_MODE_SIZE (d->vmode) != 4))
    return false;
  gcc_assert (!d->one_operand_p);

  if (d->testing_p)
    return true;

  /* Select the QImode vector view and pshufb generator matching the
     vector width.  */
  switch (GET_MODE_SIZE (d->vmode))
    {
    case 4:
      mode = V4QImode;
      gen = gen_mmx_pshufbv4qi3;
      break;
    case 8:
      mode = V8QImode;
      gen = gen_mmx_pshufbv8qi3;
      break;
    case 16:
      mode = V16QImode;
      gen = gen_ssse3_pshufbv16qi3;
      break;
    default:
      gcc_unreachable ();
    }

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the given vector it is shuffled into the proper lane.  If the required
     element is in the other vector, force a zero into the lane by setting
     bit 7 in the permutation mask.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, k, e = d->perm[i];
      unsigned which = (e >= nelt);
      if (e >= nelt)
	e -= nelt;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
	  rperm[1-which][i*eltsz + j] = m128;
	}

      /* Pad both masks out to 16 bytes with the zeroing value, for
	 vectors narrower than V16QImode.  */
      for (k = i*eltsz + j; k < 16; ++k)
	rperm[0][k] = rperm[1][k] = m128;
    }

  /* Shuffle op0 by the first mask...  */
  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
  vperm = force_reg (V16QImode, vperm);

  l = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op0);
  emit_insn (gen (l, op, vperm));

  /* ... op1 by the second ...  */
  vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
  vperm = force_reg (V16QImode, vperm);

  h = gen_reg_rtx (mode);
  op = gen_lowpart (mode, d->op1);
  emit_insn (gen (h, op, vperm));

  /* ... and or the results, going through a fresh register when the
     target mode differs from the byte-vector view.  */
  op = d->target;
  if (d->vmode != mode)
    op = gen_reg_rtx (mode);
  ix86_emit_vec_binop (IOR, mode, op, l, h);
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
24001 :
24002 : /* Implement arbitrary permutation of one V32QImode and V16QImode operand
24003 : with two vpshufb insns, vpermq and vpor. We should have already failed
24004 : all two or three instruction sequences. */
24005 :
static bool
expand_vec_perm_vpshufb2_vpermq (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, hp, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || !d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      /* e is the in-lane byte group; which is nonzero (scaled to a
	 byte offset) when the element crosses lanes.  */
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[!!which][(i * eltsz + j) ^ which] = GEN_INT (e * eltsz + j);
	  rperm[!which][(i * eltsz + j) ^ (which ^ 16)] = m128;
	}
    }

  /* rperm[1] is the cross-lane mask: shuffle into the opposite lane,
     to be corrected by the vpermq lane swap below.  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* Swap the 128-bit lanes of h into hp.  */
  hp = gen_reg_rtx (V4DImode);
  op = gen_lowpart (V4DImode, h);
  emit_insn (gen_avx2_permv4di_1 (hp, op, const2_rtx, GEN_INT (3), const0_rtx,
				  const1_rtx));

  /* rperm[0] handles the purely in-lane elements.  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  /* Or the two halves together; each mask zeroed the positions the
     other one filled.  */
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l, gen_lowpart (V32QImode, hp)));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
24072 :
24073 : /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
24074 : and extract-odd permutations of two V32QImode and V16QImode operand
24075 : with two vpshufb insns, vpor and vpermq. We should have already
24076 : failed all two or three instruction sequences. */
24077 :
static bool
expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
{
  rtx rperm[2][32], vperm, l, h, ior, op, m128;
  unsigned int i, nelt, eltsz;

  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  /* Require the extract-even/odd shape: element i's operand bit and
     lane-half bit must match those of 2*i (the low in-lane index bits
     are handled by the pshufb masks built below).  */
  for (i = 0; i < d->nelt; ++i)
    if ((d->perm[i] ^ (i * 2)) & (3 * d->nelt / 2))
      return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate two permutation masks.  In the first permutation mask
     the first quarter will contain indexes for the first half
     of the op0, the second quarter will contain bit 7 set, third quarter
     will contain indexes for the second half of the op0 and the
     last quarter bit 7 set.  In the second permutation mask
     the first quarter will contain bit 7 set, the second quarter
     indexes for the first half of the op1, the third quarter bit 7 set
     and last quarter indexes for the second half of the op1.
     I.e. the first mask e.g. for V32QImode extract even will be:
     0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128
     (all values masked with 0xf except for -128) and second mask
     for extract even will be
     -128, ..., -128, 0, 2, ..., 0xe, -128, ..., -128, 0, 2, ..., 0xe.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      unsigned which = d->perm[i] >= nelt;
      /* xorv flips the middle quarters so the selected bytes land in
	 the quarter layout described above.  */
      unsigned xorv = (i >= nelt / 4 && i < 3 * nelt / 4) ? 24 : 0;

      for (j = 0; j < eltsz; ++j)
	{
	  rperm[which][(i * eltsz + j) ^ xorv] = GEN_INT (e * eltsz + j);
	  rperm[1 - which][(i * eltsz + j) ^ xorv] = m128;
	}
    }

  /* Shuffle op0 with the first mask...  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[0]));
  vperm = force_reg (V32QImode, vperm);

  l = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op0);
  emit_insn (gen_avx2_pshufbv32qi3 (l, op, vperm));

  /* ... op1 with the second ...  */
  vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[1]));
  vperm = force_reg (V32QImode, vperm);

  h = gen_reg_rtx (V32QImode);
  op = gen_lowpart (V32QImode, d->op1);
  emit_insn (gen_avx2_pshufbv32qi3 (h, op, vperm));

  /* ... and combine; the masks zeroed complementary positions.  */
  ior = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (ior, l, h));

  /* Permute the V4DImode quarters using { 0, 2, 1, 3 } permutation.  */
  op = gen_reg_rtx (V4DImode);
  ior = gen_lowpart (V4DImode, ior);
  emit_insn (gen_avx2_permv4di_1 (op, ior, const0_rtx, const2_rtx,
				  const1_rtx, GEN_INT (3)));
  emit_move_insn (d->target, gen_lowpart (d->vmode, op));

  return true;
}
24152 :
24153 : /* A subroutine of ix86_expand_vec_perm_const_1. Try to implement a
   permutation (which is a blend) with and, andnot and or when pshufb is not available.
24155 :
24156 : It handles case:
24157 : __builtin_shufflevector (v1, v2, 0, 9, 2, 11, 4, 13, 6, 15);
24158 : __builtin_shufflevector (v1, v2, 8, 1, 2, 11, 4, 13, 6, 15);
24159 :
24160 : An element[i] must be chosen between op0[i] and op1[i] to satisfy the
24161 : requirement.
24162 : */
24163 :
24164 : static bool
24165 24554 : expand_vec_perm_pand_pandn_por (struct expand_vec_perm_d *d)
24166 : {
24167 24554 : rtx rperm[16], vperm;
24168 24554 : unsigned int i, nelt = d->nelt;
24169 :
24170 24554 : if (!TARGET_SSE2
24171 24554 : || d->one_operand_p
24172 20807 : || (d->vmode != V16QImode && d->vmode != V8HImode))
24173 : return false;
24174 :
24175 7585 : if (d->perm[0] != 0)
24176 : return false;
24177 :
24178 : /* The dest[i] must select an element between op0[i] and op1[i]. */
24179 15483 : for (i = 1; i < nelt; i++)
24180 14466 : if ((d->perm[i] % nelt) != i)
24181 : return false;
24182 :
24183 1017 : if (d->testing_p)
24184 : return true;
24185 :
24186 : /* Generates a blend mask for the operators AND and ANDNOT. */
24187 108 : machine_mode inner_mode = GET_MODE_INNER (d->vmode);
24188 1148 : for (i = 0; i < nelt; i++)
24189 1581 : rperm[i] = (d->perm[i] < nelt) ? CONSTM1_RTX (inner_mode)
24190 541 : : CONST0_RTX (inner_mode);
24191 :
24192 108 : vperm = gen_rtx_CONST_VECTOR (d->vmode, gen_rtvec_v (nelt, rperm));
24193 108 : vperm = force_reg (d->vmode, vperm);
24194 :
24195 108 : ix86_expand_sse_movcc (d->target, vperm, d->op0, d->op1);
24196 :
24197 108 : return true;
24198 : }
24199 :
24200 : /* Implement permutation with pslldq + psrldq + por when pshufb is not
24201 : available. */
static bool
expand_vec_perm_pslldq_psrldq_por (struct expand_vec_perm_d *d, bool pandn)
{
  unsigned i, nelt = d->nelt;
  unsigned start1, end1 = -1;
  machine_mode vmode = d->vmode, imode;
  int start2 = -1;
  bool clear_op0, clear_op1;
  unsigned inner_size;
  rtx op0, op1, dop1;
  rtx (*gen_vec_shr) (rtx, rtx, rtx);
  rtx (*gen_vec_shl) (rtx, rtx, rtx);

  /* pshufd can be used for V4SI/V2DI under TARGET_SSE2.  */
  if (!TARGET_SSE2 || (vmode != E_V16QImode && vmode != E_V8HImode))
    return false;

  /* The permutation must consist of exactly two consecutive runs:
     indices START1..END1 followed by START2...; a second break in
     consecutiveness disqualifies it.  */
  start1 = d->perm[0];
  for (i = 1; i < nelt; i++)
    {
      if (d->perm[i] != d->perm[i-1] + 1
	  || d->perm[i] == nelt)
	{
	  if (start2 == -1)
	    {
	      start2 = d->perm[i];
	      end1 = d->perm[i-1];
	    }
	  else
	    return false;
	}
    }

  /* A masking step is needed unless the first run ends exactly at the
     top of an operand and the second starts exactly at the bottom.  */
  clear_op0 = end1 != nelt - 1;
  clear_op1 = start2 % nelt != 0;
  /* pandn/pand is needed to clear upper/lower bits of op0/op1.  */
  if (!pandn && (clear_op0 || clear_op1))
    return false;

  if (d->testing_p)
    return true;

  gen_vec_shr = vmode == E_V16QImode ? gen_vec_shr_v16qi : gen_vec_shr_v8hi;
  gen_vec_shl = vmode == E_V16QImode ? gen_vec_shl_v16qi : gen_vec_shl_v8hi;
  imode = GET_MODE_INNER (vmode);
  inner_size = GET_MODE_BITSIZE (imode);
  op0 = gen_reg_rtx (vmode);
  op1 = gen_reg_rtx (vmode);

  /* Shift the first run down to element 0 (psrldq).  */
  if (start1)
    emit_insn (gen_vec_shr (op0, d->op0, GEN_INT (start1 * inner_size)));
  else
    emit_move_insn (op0, d->op0);

  dop1 = d->op1;
  if (d->one_operand_p)
    dop1 = d->op0;

  /* Shift the second run up so it lands right after the first (pslldq).  */
  int shl_offset = end1 - start1 + 1 - start2 % nelt;
  if (shl_offset)
    emit_insn (gen_vec_shl (op1, dop1, GEN_INT (shl_offset * inner_size)));
  else
    emit_move_insn (op1, dop1);

  /* Clear lower/upper bits for op0/op1.  */
  if (clear_op0 || clear_op1)
    {
      rtx vec[16];
      rtx const_vec;
      rtx clear;
      /* Build a mask that is all-ones over the first run's extent and
	 zero elsewhere.  */
      for (i = 0; i != nelt; i++)
	{
	  if (i < (end1 - start1 + 1))
	    vec[i] = gen_int_mode ((HOST_WIDE_INT_1U << inner_size) - 1, imode);
	  else
	    vec[i] = CONST0_RTX (imode);
	}
      const_vec = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, vec));
      const_vec = validize_mem (force_const_mem (vmode, const_vec));
      clear = force_reg (vmode, const_vec);

      if (clear_op0)
	emit_move_insn (op0, gen_rtx_AND (vmode, op0, clear));
      if (clear_op1)
	emit_move_insn (op1, gen_rtx_AND (vmode,
					  gen_rtx_NOT (vmode, clear),
					  op1));
    }

  /* Combine the two shifted runs.  */
  emit_move_insn (d->target, gen_rtx_IOR (vmode, op0, op1));
  return true;
}
24294 :
24295 : /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
24296 : and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
24297 : operands with two "and" and "pack" or two "shift" and "pack" insns.
24298 : We should have already failed all two instruction sequences. */
24299 :
static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
{
  rtx op, dop0, dop1, t;
  unsigned i, odd, c, s, nelt = d->nelt;
  int pblendw_i = 0;
  bool end_perm = false;
  machine_mode half_mode;
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_pack) (rtx, rtx, rtx);
  rtx (*gen_shift) (rtx, rtx, rtx);

  if (d->one_operand_p)
    return false;

  /* Pick per-mode constants and generators: C masks the low half of a
     double-width element, S is the shift that extracts the high half,
     HALF_MODE views the operands as double-width elements.  */
  switch (d->vmode)
    {
    case E_V4HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V2SImode;
      gen_and = gen_andv2si3;
      gen_pack = gen_mmx_packusdw;
      gen_shift = gen_lshrv2si3;
      pblendw_i = 0x5;
      break;
    case E_V8HImode:
      /* Required for "pack".  */
      if (!TARGET_SSE4_1)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V4SImode;
      gen_and = gen_andv4si3;
      gen_pack = gen_sse4_1_packusdw;
      gen_shift = gen_lshrv4si3;
      pblendw_i = 0x55;
      break;
    case E_V8QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V4HImode;
      gen_and = gen_andv4hi3;
      gen_pack = gen_mmx_packuswb;
      gen_shift = gen_lshrv4hi3;
      break;
    case E_V16QImode:
      /* No check as all instructions are SSE2.  */
      c = 0xff;
      s = 8;
      half_mode = V8HImode;
      gen_and = gen_andv8hi3;
      gen_pack = gen_sse2_packuswb;
      gen_shift = gen_lshrv8hi3;
      break;
    case E_V16HImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xffff;
      s = 16;
      half_mode = V8SImode;
      gen_and = gen_andv8si3;
      gen_pack = gen_avx2_packusdw;
      gen_shift = gen_lshrv8si3;
      pblendw_i = 0x5555;
      end_perm = true;
      break;
    case E_V32QImode:
      if (!TARGET_AVX2)
	return false;
      c = 0xff;
      s = 8;
      half_mode = V16HImode;
      gen_and = gen_andv16hi3;
      gen_pack = gen_avx2_packuswb;
      gen_shift = gen_lshrv16hi3;
      end_perm = true;
      break;
    default:
      /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
	 are more profitable than general shuffles.  */
      return false;
    }

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;

  dop0 = gen_reg_rtx (half_mode);
  dop1 = gen_reg_rtx (half_mode);
  if (odd == 0)
    {
      /* Extract-even: keep the low half of each double-width element.  */
      /* Use pblendw since const_vector 0 should be cheaper than
	 const_vector 0xffff.  */
      if (d->vmode == V4HImode
	  || d->vmode == E_V8HImode
	  || d->vmode == E_V16HImode)
	{
	  rtx dop0_t = gen_reg_rtx (d->vmode);
	  rtx dop1_t = gen_reg_rtx (d->vmode);
	  t = gen_reg_rtx (d->vmode);
	  emit_move_insn (t, CONST0_RTX (d->vmode));

	  emit_move_insn (dop0_t, gen_rtx_VEC_MERGE (d->vmode, d->op0, t,
						     GEN_INT (pblendw_i)));
	  emit_move_insn (dop1_t, gen_rtx_VEC_MERGE (d->vmode, d->op1, t,
						     GEN_INT (pblendw_i)));

	  emit_move_insn (dop0, gen_lowpart (half_mode, dop0_t));
	  emit_move_insn (dop1, gen_lowpart (half_mode, dop1_t));
	}
      else
	{
	  t = gen_const_vec_duplicate (half_mode, GEN_INT (c));
	  t = force_reg (half_mode, t);
	  emit_insn (gen_and (dop0, t, gen_lowpart (half_mode, d->op0)));
	  emit_insn (gen_and (dop1, t, gen_lowpart (half_mode, d->op1)));
	}
    }
  else
    {
      /* Extract-odd: shift the high half of each double-width element
	 down into the low half.  */
      emit_insn (gen_shift (dop0,
			    gen_lowpart (half_mode, d->op0),
			    GEN_INT (s)));
      emit_insn (gen_shift (dop1,
			    gen_lowpart (half_mode, d->op1),
			    GEN_INT (s)));
    }
  /* In AVX2 for 256 bit case we need to permute pack result.  */
  if (TARGET_AVX2 && end_perm)
    {
      op = gen_reg_rtx (d->vmode);
      t = gen_reg_rtx (V4DImode);
      emit_insn (gen_pack (op, dop0, dop1));
      emit_insn (gen_avx2_permv4di_1 (t,
				     gen_lowpart (V4DImode, op),
				     const0_rtx,
				     const2_rtx,
				     const1_rtx,
				     GEN_INT (3)));
      emit_move_insn (d->target, gen_lowpart (d->vmode, t));
    }
  else
    emit_insn (gen_pack (d->target, dop0, dop1));

  return true;
}
24459 :
24460 : /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
24461 : and extract-odd permutations of two V64QI operands
24462 : with two "shifts", two "truncs" and one "concat" insns for "odd"
24463 : and two "truncs" and one concat insn for "even."
24464 : Have already failed all two instruction sequences. */
24465 :
static bool
expand_vec_perm_even_odd_trunc (struct expand_vec_perm_d *d)
{
  rtx t1, t2, t3, t4;
  unsigned i, odd, nelt = d->nelt;

  /* Needs AVX512BW (for the V32HI -> V32QI truncation) and two
     distinct V64QI operands.  */
  if (!TARGET_AVX512BW
      || d->one_operand_p
      || d->vmode != V64QImode)
    return false;

  /* Check that permutation is even or odd.  */
  odd = d->perm[0];
  if (odd > 1)
    return false;

  for (i = 1; i < nelt; ++i)
    if (d->perm[i] != 2 * i + odd)
      return false;

  if (d->testing_p)
    return true;


  if (odd)
    {
      /* For extract-odd, shift each 16-bit element right by 8 so the
	 odd bytes move into the low byte positions first.  */
      t1 = gen_reg_rtx (V32HImode);
      t2 = gen_reg_rtx (V32HImode);
      emit_insn (gen_lshrv32hi3 (t1,
				 gen_lowpart (V32HImode, d->op0),
				 GEN_INT (8)));
      emit_insn (gen_lshrv32hi3 (t2,
				 gen_lowpart (V32HImode, d->op1),
				 GEN_INT (8)));
    }
  else
    {
      /* For extract-even, the wanted bytes are already in the low byte
	 of each 16-bit element.  */
      t1 = gen_lowpart (V32HImode, d->op0);
      t2 = gen_lowpart (V32HImode, d->op1);
    }

  /* Truncate each operand's 16-bit elements to bytes, then concatenate
     the two 32-byte halves into the V64QI result.  */
  t3 = gen_reg_rtx (V32QImode);
  t4 = gen_reg_rtx (V32QImode);
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t3, t1));
  emit_insn (gen_avx512bw_truncatev32hiv32qi2 (t4, t2));
  emit_insn (gen_avx_vec_concatv64qi (d->target, t3, t4));

  return true;
}
24515 :
24516 : /* A subroutine of ix86_expand_vec_perm_const_1. Implement extract-even
24517 : and extract-odd permutations. */
24518 :
static bool
expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
{
  rtx t1, t2, t3, t4, t5;

  /* Dispatch on the vector mode; ODD is 0 for extract-even and 1 for
     extract-odd.  Callers have already verified the permutation shape.  */
  switch (d->vmode)
    {
    case E_V4DFmode:
      if (d->testing_p)
	break;
      t1 = gen_reg_rtx (V4DFmode);
      t2 = gen_reg_rtx (V4DFmode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx_vperm2f128v4df3 (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx_vperm2f128v4df3 (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an unpck[lh]pd will produce the result required.  */
      if (odd)
	t3 = gen_avx_unpckhpd256 (d->target, t1, t2);
      else
	t3 = gen_avx_unpcklpd256 (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SFmode:
      {
	int mask = odd ? 0xdd : 0x88;

	if (d->testing_p)
	  break;
	t1 = gen_reg_rtx (V8SFmode);
	t2 = gen_reg_rtx (V8SFmode);
	t3 = gen_reg_rtx (V8SFmode);

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 8 a 4 6 c e } | { 1 3 9 b 5 7 d f }.  */
	emit_insn (gen_avx_shufps256 (t1, d->op0, d->op1,
				      GEN_INT (mask)));

	/* Shuffle the lanes around to produce:
	   { 4 6 c e 0 2 8 a } and { 5 7 d f 1 3 9 b }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (t2, t1, t1,
					    GEN_INT (0x3)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 0 2 4 6 4 6 0 2 } | { 1 3 5 7 5 7 1 3 }.  */
	emit_insn (gen_avx_shufps256 (t3, t1, t2, GEN_INT (0x44)));

	/* Shuffle within the 128-bit lanes to produce:
	   { 8 a c e c e 8 a } | { 9 b d f d f 9 b }.  */
	emit_insn (gen_avx_shufps256 (t2, t1, t2, GEN_INT (0xee)));

	/* Shuffle the lanes around to produce:
	   { 0 2 4 6 8 a c e } | { 1 3 5 7 9 b d f }.  */
	emit_insn (gen_avx_vperm2f128v8sf3 (d->target, t3, t2,
					    GEN_INT (0x20)));
      }
      break;

    case E_V2DFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
      /* These are always directly implementable by expand_vec_perm_1.  */
      gcc_unreachable ();

    case E_V2SFmode:
      gcc_assert (TARGET_MMX_WITH_SSE);
      /* We have no suitable instructions.  */
      if (d->testing_p)
	return false;
      break;

    case E_V4QImode:
      if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V4QImode);
	  emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V4HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V4HImode);
	  emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
	  emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
	  if (odd)
	    t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
	  else
	    t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
	  emit_insn (t2);
	}
      break;

    case E_V8HImode:
      if (TARGET_SSE4_1)
	return expand_vec_perm_even_odd_pack (d);
      else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
	return expand_vec_perm_pshufb2 (d);
      else
	{
	  if (d->testing_p)
	    break;
	  /* We need 2*log2(N)-1 operations to achieve odd/even
	     with interleave.  */
	  t1 = gen_reg_rtx (V8HImode);
	  t2 = gen_reg_rtx (V8HImode);
	  emit_insn (gen_vec_interleave_highv8hi (t1, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->op0, d->op1));
	  emit_insn (gen_vec_interleave_highv8hi (t2, d->target, t1));
	  emit_insn (gen_vec_interleave_lowv8hi (d->target, d->target, t1));
	  if (odd)
	    t3 = gen_vec_interleave_highv8hi (d->target, d->target, t2);
	  else
	    t3 = gen_vec_interleave_lowv8hi (d->target, d->target, t2);
	  emit_insn (t3);
	}
      break;

    case E_V8QImode:
    case E_V16QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V16HImode:
    case E_V32QImode:
      return expand_vec_perm_even_odd_pack (d);

    case E_V64QImode:
      return expand_vec_perm_even_odd_trunc (d);

    case E_V4DImode:
      if (!TARGET_AVX2)
	{
	  /* Without AVX2, retry as V4DF on bitcast copies of the
	     operands and move the result back.  */
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V4DFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V4DFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V4DFmode);
	  d_copy.op0 = gen_lowpart (V4DFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V4DFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V4DImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into { 0 1 4 5 } and { 2 3 6 7 }.  */
      emit_insn (gen_avx2_permv2ti (t1, d->op0, d->op1, GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t2, d->op0, d->op1, GEN_INT (0x31)));

      /* Now an vpunpck[lh]qdq will produce the result required.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (d->target, t1, t2);
      else
	t3 = gen_avx2_interleave_lowv4di (d->target, t1, t2);
      emit_insn (t3);
      break;

    case E_V8SImode:
      if (!TARGET_AVX2)
	{
	  /* Without AVX2, retry as V8SF on bitcast copies of the
	     operands and move the result back.  */
	  struct expand_vec_perm_d d_copy = *d;
	  d_copy.vmode = V8SFmode;
	  if (d->testing_p)
	    d_copy.target = gen_raw_REG (V8SFmode, LAST_VIRTUAL_REGISTER + 1);
	  else
	    d_copy.target = gen_reg_rtx (V8SFmode);
	  d_copy.op0 = gen_lowpart (V8SFmode, d->op0);
	  d_copy.op1 = gen_lowpart (V8SFmode, d->op1);
	  if (expand_vec_perm_even_odd_1 (&d_copy, odd))
	    {
	      if (!d->testing_p)
		emit_move_insn (d->target,
				gen_lowpart (V8SImode, d_copy.target));
	      return true;
	    }
	  return false;
	}

      if (d->testing_p)
	break;

      t1 = gen_reg_rtx (V8SImode);
      t2 = gen_reg_rtx (V8SImode);
      t3 = gen_reg_rtx (V4DImode);
      t4 = gen_reg_rtx (V4DImode);
      t5 = gen_reg_rtx (V4DImode);

      /* Shuffle the lanes around into
	 { 0 1 2 3 8 9 a b } and { 4 5 6 7 c d e f }.  */
      emit_insn (gen_avx2_permv2ti (t3, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x20)));
      emit_insn (gen_avx2_permv2ti (t4, gen_lowpart (V4DImode, d->op0),
				    gen_lowpart (V4DImode, d->op1),
				    GEN_INT (0x31)));

      /* Swap the 2nd and 3rd position in each lane into
	 { 0 2 1 3 8 a 9 b } and { 4 6 5 7 c e d f }.  */
      emit_insn (gen_avx2_pshufdv3 (t1, gen_lowpart (V8SImode, t3),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));
      emit_insn (gen_avx2_pshufdv3 (t2, gen_lowpart (V8SImode, t4),
				    GEN_INT (2 * 4 + 1 * 16 + 3 * 64)));

      /* Now an vpunpck[lh]qdq will produce
	 { 0 2 4 6 8 a c e } resp. { 1 3 5 7 9 b d f }.  */
      if (odd)
	t3 = gen_avx2_interleave_highv4di (t5, gen_lowpart (V4DImode, t1),
					   gen_lowpart (V4DImode, t2));
      else
	t3 = gen_avx2_interleave_lowv4di (t5, gen_lowpart (V4DImode, t1),
					  gen_lowpart (V4DImode, t2));
      emit_insn (t3);
      emit_move_insn (d->target, gen_lowpart (V8SImode, t5));
      break;

    default:
      gcc_unreachable ();
    }

  return true;
}
24776 :
24777 : /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
24778 : extract-even and extract-odd permutations. */
24779 :
24780 : static bool
24781 23446 : expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
24782 : {
24783 23446 : unsigned i, odd, nelt = d->nelt;
24784 :
24785 23446 : odd = d->perm[0];
24786 23446 : if (odd != 0 && odd != 1)
24787 : return false;
24788 :
24789 63629 : for (i = 1; i < nelt; ++i)
24790 56127 : if (d->perm[i] != 2 * i + odd)
24791 : return false;
24792 :
24793 7502 : if (d->vmode == E_V32HImode
24794 12 : && d->testing_p
24795 12 : && !TARGET_AVX512BW)
24796 : return false;
24797 :
24798 7490 : return expand_vec_perm_even_odd_1 (d, odd);
24799 : }
24800 :
24801 : /* A subroutine of ix86_expand_vec_perm_const_1. Implement broadcast
24802 : permutations. We assume that expand_vec_perm_1 has already failed. */
24803 :
static bool
expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
  /* ELT is the index being broadcast; NELT2 starts at half the element
     count and, in the interleave loops below, is halved each step as
     ELT is renormalized to the narrower remaining range.  */
  unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
  machine_mode vmode = d->vmode;
  rtx (*gen) (rtx, rtx, rtx);
  unsigned char perm2[4];
  rtx op0 = d->op0, dest;
  bool ok;

  switch (vmode)
    {
    case E_V4DFmode:
    case E_V8SFmode:
      /* These are special-cased in sse.md so that we can optionally
	 use the vbroadcast instruction.  They expand to two insns
	 if the input happens to be in a register.  */
      gcc_unreachable ();

    case E_V2DFmode:
    case E_V2SFmode:
    case E_V4SFmode:
    case E_V2DImode:
    case E_V2SImode:
    case E_V4SImode:
    case E_V2HImode:
    case E_V4HImode:
      /* These are always implementable using standard shuffle patterns.  */
      gcc_unreachable ();

    case E_V4QImode:
      /* This can be implemented via interleave and pshuflw.  */
      if (d->testing_p)
	return true;

      /* One interleave doubles the element width; pick high/low
	 depending on which half holds ELT.  */
      if (elt >= nelt2)
	{
	  gen = gen_mmx_punpckhbw_low;
	  elt -= nelt2;
	}
      else
	gen = gen_mmx_punpcklbw_low;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen (dest, op0, op0));
      vmode = get_mode_wider_vector (vmode);
      op0 = gen_lowpart (vmode, dest);

      /* Finish with a 2-element select of the duplicated element.  */
      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8QImode:
      /* This can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V2SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      do
	{
	  if (elt >= nelt2)
	    {
	      gen = vmode == V8QImode ? gen_mmx_punpckhbw
				      : gen_mmx_punpckhwd;
	      elt -= nelt2;
	    }
	  else
	    gen = vmode == V8QImode ? gen_mmx_punpcklbw
				    : gen_mmx_punpcklwd;
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V2SImode);

      memset (perm2, elt, 2);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HImode:
    case E_V16QImode:
      /* These can be implemented via interleave.  We save one insn by
	 stopping once we have promoted to V4SImode and then use pshufd.  */
      if (d->testing_p)
	return true;
      do
	{
	  if (elt >= nelt2)
	    {
	      gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
				       : gen_vec_interleave_highv8hi;
	      elt -= nelt2;
	    }
	  else
	    gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
				     : gen_vec_interleave_lowv8hi;
	  nelt2 /= 2;

	  dest = gen_reg_rtx (vmode);
	  emit_insn (gen (dest, op0, op0));
	  vmode = get_mode_wider_vector (vmode);
	  op0 = gen_lowpart (vmode, dest);
	}
      while (vmode != V4SImode);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V8HFmode:
    case E_V8BFmode:
      /* This can be implemented via interleave and pshufd.  */
      if (d->testing_p)
	return true;

      rtx (*gen_interleave) (machine_mode, rtx, rtx, rtx);
      if (elt >= nelt2)
	{
	  gen_interleave = gen_vec_interleave_high;
	  elt -= nelt2;
	}
      else
	gen_interleave = gen_vec_interleave_low;
      nelt2 /= 2;

      dest = gen_reg_rtx (vmode);
      emit_insn (gen_interleave (vmode, dest, op0, op0));

      vmode = V4SImode;
      op0 = gen_lowpart (vmode, dest);

      memset (perm2, elt, 4);
      dest = gen_reg_rtx (vmode);
      ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
      gcc_assert (ok);

      emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
      return true;

    case E_V32QImode:
    case E_V16HImode:
    case E_V8SImode:
    case E_V4DImode:
      /* For AVX2 broadcasts of the first element vpbroadcast* or
	 vpermq should be used by expand_vec_perm_1.  */
      gcc_assert (!TARGET_AVX2 || d->perm[0]);
      return false;

    case E_V64QImode:
      gcc_assert (!TARGET_AVX512BW || d->perm[0]);
      return false;

    case E_V32HImode:
      gcc_assert (!TARGET_AVX512BW);
      return false;

    default:
      gcc_unreachable ();
    }
}
24978 :
24979 : /* A subroutine of ix86_expand_vec_perm_const_1. Pattern match
24980 : broadcast permutations. */
24981 :
24982 : static bool
24983 90348 : expand_vec_perm_broadcast (struct expand_vec_perm_d *d)
24984 : {
24985 90348 : unsigned i, elt, nelt = d->nelt;
24986 :
24987 90348 : if (!d->one_operand_p)
24988 : return false;
24989 :
24990 5332 : elt = d->perm[0];
24991 8175 : for (i = 1; i < nelt; ++i)
24992 8067 : if (d->perm[i] != elt)
24993 : return false;
24994 :
24995 108 : return expand_vec_perm_broadcast_1 (d);
24996 : }
24997 :
24998 : /* Implement arbitrary permutations of two V64QImode operands
24999 : with 2 vperm[it]2w, 2 vpshufb and one vpor instruction. */
static bool
expand_vec_perm_vpermt2_vpshub2 (struct expand_vec_perm_d *d)
{
  /* Only for V64QImode under AVX512BW.  */
  if (!TARGET_AVX512BW || !(d->vmode == V64QImode))
    return false;

  if (d->testing_p)
    return true;

  struct expand_vec_perm_d ds[2];
  rtx rperm[128], vperm, target0, target1;
  unsigned int i, nelt;
  machine_mode vmode;

  nelt = d->nelt;
  vmode = V64QImode;

  /* Set up two V32HI sub-permutations over word-sized views of the
     same two operands.  */
  for (i = 0; i < 2; i++)
    {
      ds[i] = *d;
      ds[i].vmode = V32HImode;
      ds[i].nelt = 32;
      ds[i].target = gen_reg_rtx (V32HImode);
      ds[i].op0 = gen_lowpart (V32HImode, d->op0);
      ds[i].op1 = gen_lowpart (V32HImode, d->op1);
    }

  /* Prepare permutations such that the first one takes care of
     putting the even bytes into the right positions or one higher
     positions (ds[0]) and the second one takes care of
     putting the odd bytes into the right positions or one below
     (ds[1]).  */

  /* rperm[0..63] is the byte fix-up mask for ds[0], rperm[64..127]
     for ds[1]; constm1_rtx bytes zero the positions owned by the
     other half.  */
  for (i = 0; i < nelt; i++)
    {
      ds[i & 1].perm[i / 2] = d->perm[i] / 2;
      if (i & 1)
	{
	  rperm[i] = constm1_rtx;
	  rperm[i + 64] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	}
      else
	{
	  rperm[i] = GEN_INT ((i & 14) + (d->perm[i] & 1));
	  rperm[i + 64] = constm1_rtx;
	}
    }

  /* Expand the two word-level permutations; they must succeed.  */
  bool ok = expand_vec_perm_1 (&ds[0]);
  gcc_assert (ok);
  ds[0].target = gen_lowpart (V64QImode, ds[0].target);

  ok = expand_vec_perm_1 (&ds[1]);
  gcc_assert (ok);
  ds[1].target = gen_lowpart (V64QImode, ds[1].target);

  /* Fix up the byte within each word with vpshufb; the complementary
     positions are cleared via the -1 mask bytes.  */
  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm));
  vperm = force_reg (vmode, vperm);
  target0 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target0, ds[0].target, vperm));

  vperm = gen_rtx_CONST_VECTOR (V64QImode, gen_rtvec_v (64, rperm + 64));
  vperm = force_reg (vmode, vperm);
  target1 = gen_reg_rtx (V64QImode);
  emit_insn (gen_avx512bw_pshufbv64qi3 (target1, ds[1].target, vperm));

  /* Merge the two halves into the final result.  */
  emit_insn (gen_iorv64qi3 (d->target, target0, target1));
  return true;
}
25069 :
25070 : /* Implement arbitrary permutation of two V32QImode and V16QImode operands
25071 : with 4 vpshufb insns, 2 vpermq and 3 vpor. We should have already failed
25072 : all the shorter instruction sequences. */
25073 :
static bool
expand_vec_perm_vpshufb4_vpermq2 (struct expand_vec_perm_d *d)
{
  rtx rperm[4][32], vperm, l[2], h[2], op, m128;
  unsigned int i, nelt, eltsz;
  bool used[4];

  /* Requires AVX2, a genuinely two-operand permutation, and a 256-bit
     byte or word vector.  */
  if (!TARGET_AVX2
      || d->one_operand_p
      || (d->vmode != V32QImode && d->vmode != V16HImode))
    return false;

  if (d->testing_p)
    return true;

  nelt = d->nelt;
  eltsz = GET_MODE_UNIT_SIZE (d->vmode);

  /* Generate 4 permutation masks.  If the required element is within
     the same lane, it is shuffled in.  If the required element from the
     other lane, force a zero by setting bit 7 in the permutation mask.
     In the other mask the mask has non-negative elements if element
     is requested from the other lane, but also moved to the other lane,
     so that the result of vpshufb can have the two V2TImode halves
     swapped.  */
  m128 = GEN_INT (-128);
  for (i = 0; i < 32; ++i)
    {
      /* Initialize every selector lane to -128, which makes vpshufb
	 write zero to that lane.  */
      rperm[0][i] = m128;
      rperm[1][i] = m128;
      rperm[2][i] = m128;
      rperm[3][i] = m128;
    }
  used[0] = false;
  used[1] = false;
  used[2] = false;
  used[3] = false;
  for (i = 0; i < nelt; ++i)
    {
      unsigned j, e = d->perm[i] & (nelt / 2 - 1);
      /* Non-zero if the element has to cross a 128-bit lane.  */
      unsigned xlane = ((d->perm[i] ^ i) & (nelt / 2)) * eltsz;
      /* Mask index: bit 1 selects op1 vs op0, bit 0 cross-lane.  */
      unsigned int which = ((d->perm[i] & nelt) ? 2 : 0) + (xlane ? 1 : 0);

      for (j = 0; j < eltsz; ++j)
	rperm[which][(i * eltsz + j) ^ xlane] = GEN_INT (e * eltsz + j);
      used[which] = true;
    }

  /* Emit the cross-lane vpshufb for each operand that needs one.  */
  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i + 1])
	{
	  h[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode,
				    gen_rtvec_v (32, rperm[2 * i + 1]));
      vperm = force_reg (V32QImode, vperm);
      h[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (h[i], op, vperm));
    }

  /* Swap the 128-bit lanes of h[X] with vpermq.  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] == NULL_RTX)
	continue;
      op = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (op, gen_lowpart (V4DImode, h[i]),
				      const2_rtx, GEN_INT (3), const0_rtx,
				      const1_rtx));
      h[i] = gen_lowpart (V32QImode, op);
    }

  /* Emit the in-lane vpshufb for each operand that needs one.  */
  for (i = 0; i < 2; ++i)
    {
      if (!used[2 * i])
	{
	  l[i] = NULL_RTX;
	  continue;
	}
      vperm = gen_rtx_CONST_VECTOR (V32QImode, gen_rtvec_v (32, rperm[2 * i]));
      vperm = force_reg (V32QImode, vperm);
      l[i] = gen_reg_rtx (V32QImode);
      op = gen_lowpart (V32QImode, i ? d->op1 : d->op0);
      emit_insn (gen_avx2_pshufbv32qi3 (l[i], op, vperm));
    }

  /* Merge the in-lane and cross-lane contributions per operand; the
     unused lanes are zero so a plain IOR suffices.  */
  for (i = 0; i < 2; ++i)
    {
      if (h[i] && l[i])
	{
	  op = gen_reg_rtx (V32QImode);
	  emit_insn (gen_iorv32qi3 (op, l[i], h[i]));
	  l[i] = op;
	}
      else if (h[i])
	l[i] = h[i];
    }

  /* Finally combine the two operands' contributions, going through a
     fresh V32QI register when the destination has a different mode.  */
  gcc_assert (l[0] && l[1]);
  op = d->target;
  if (d->vmode != V32QImode)
    op = gen_reg_rtx (V32QImode);
  emit_insn (gen_iorv32qi3 (op, l[0], l[1]));
  if (op != d->target)
    emit_move_insn (d->target, gen_lowpart (d->vmode, op));
  return true;
}
25184 :
25185 : /* The guts of ix86_vectorize_vec_perm_const. With all of the interface bits
25186 : taken care of, perform the expansion in D and return true on success. */
25187 :
static bool
ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
{
  /* The expanders below are tried in order of increasing sequence
     length; the first one that recognizes D wins, so the ordering is
     load-bearing for code quality.  Each expander returns true without
     emitting code when d->testing_p is set.  */

  /* Try a single instruction expansion. */
  if (expand_vec_perm_1 (d))
    return true;

  /* Try sequences of two instructions. */

  if (expand_vec_perm_pshuflw_pshufhw (d))
    return true;

  if (expand_vec_perm_palignr (d, false))
    return true;

  if (expand_vec_perm_interleave2 (d))
    return true;

  if (expand_vec_perm_broadcast (d))
    return true;

  if (expand_vec_perm_vpermq_perm_1 (d))
    return true;

  if (expand_vec_perm_vperm2f128 (d))
    return true;

  if (expand_vec_perm_pblendv (d))
    return true;

  /* The TRUE argument restricts these two to their two-insn forms;
     they are retried below with FALSE for the three-insn forms.  */
  if (expand_vec_perm_2perm_interleave (d, true))
    return true;

  if (expand_vec_perm_2perm_pblendv (d, true))
    return true;

  if (expand_vec_perm_shufps_shufps (d))
    return true;

  if (expand_vec_perm_punpckldq_pshuf (d))
    return true;

  /* Try sequences of three instructions. */

  if (expand_vec_perm_even_odd_pack (d))
    return true;

  if (expand_vec_perm_2vperm2f128_vshuf (d))
    return true;

  if (expand_vec_perm_pshufb2 (d))
    return true;

  if (expand_vec_perm_pslldq_psrldq_por (d, false))
    return true;

  if (expand_vec_perm_interleave3 (d))
    return true;

  if (expand_vec_perm_vperm2f128_vblend (d))
    return true;

  if (expand_vec_perm_2perm_interleave (d, false))
    return true;

  if (expand_vec_perm_2perm_pblendv (d, false))
    return true;

  if (expand_vec_perm_psrlw_psllw_por (d))
    return true;

  if (expand_vec_perm_pand_pandn_por (d))
    return true;

  /* Try sequences of four instructions. */

  if (expand_vec_perm_even_odd_trunc (d))
    return true;
  if (expand_vec_perm_vpshufb2_vpermq (d))
    return true;

  if (expand_vec_perm_vpshufb2_vpermq_even_odd (d))
    return true;

  if (expand_vec_perm_vpermt2_vpshub2 (d))
    return true;

  /* ??? Look for narrow permutations whose element orderings would
     allow the promotion to a wider mode. */

  /* ??? Look for sequences of interleave or a wider permute that place
     the data into the correct lanes for a half-vector shuffle like
     pshuf[lh]w or vpermilps. */

  /* ??? Look for sequences of interleave that produce the desired results.
     The combinatorics of punpck[lh] get pretty ugly... */

  if (expand_vec_perm_even_odd (d))
    return true;

  /* Generate four or five instructions. */
  if (expand_vec_perm_pslldq_psrldq_por (d, true))
    return true;

  /* Even longer sequences. */
  if (expand_vec_perm_vpshufb4_vpermq2 (d))
    return true;

  /* See if we can get the same permutation in different vector integer
     mode. */
  struct expand_vec_perm_d nd;
  if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
    {
      if (!d->testing_p)
	emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
      return true;
    }

  /* Even longer, including recursion to ix86_expand_vec_perm_const_1. */
  if (expand_vec_perm2_vperm2f128_vblend (d))
    return true;

  return false;
}
25312 :
25313 : /* If a permutation only uses one operand, make it clear. Returns true
25314 : if the permutation references both operands. */
25315 :
25316 : static bool
25317 74642 : canonicalize_perm (struct expand_vec_perm_d *d)
25318 : {
25319 74642 : int i, which, nelt = d->nelt;
25320 :
25321 449508 : for (i = which = 0; i < nelt; ++i)
25322 509485 : which |= (d->perm[i] < nelt ? 1 : 2);
25323 :
25324 74642 : d->one_operand_p = true;
25325 74642 : switch (which)
25326 : {
25327 0 : default:
25328 0 : gcc_unreachable();
25329 :
25330 55521 : case 3:
25331 55521 : if (!rtx_equal_p (d->op0, d->op1))
25332 : {
25333 55470 : d->one_operand_p = false;
25334 55470 : break;
25335 : }
25336 : /* The elements of PERM do not suggest that only the first operand
25337 : is used, but both operands are identical. Allow easier matching
25338 : of the permutation by folding the permutation into the single
25339 : input vector. */
25340 : /* FALLTHRU */
25341 :
25342 : case 2:
25343 2913 : for (i = 0; i < nelt; ++i)
25344 2576 : d->perm[i] &= nelt - 1;
25345 337 : d->op0 = d->op1;
25346 337 : break;
25347 :
25348 18835 : case 1:
25349 18835 : d->op1 = d->op0;
25350 18835 : break;
25351 : }
25352 :
25353 74642 : return (which == 3);
25354 : }
25355 :
25356 : /* Implement TARGET_VECTORIZE_VEC_PERM_CONST. */
25357 :
bool
ix86_vectorize_vec_perm_const (machine_mode vmode, machine_mode op_mode,
			       rtx target, rtx op0, rtx op1,
			       const vec_perm_indices &sel)
{
  /* Permutations whose operands have a different mode from the result
     are not supported by this target.  */
  if (vmode != op_mode)
    return false;

  struct expand_vec_perm_d d;
  unsigned char perm[MAX_VECT_LEN];
  unsigned int i, nelt, which;
  bool two_args;

  /* For HF and BF mode vector, convert it to HI using subreg.  The
     permutation itself is bit-pattern-preserving, so the element type
     does not matter, only its size.  */
  if (GET_MODE_INNER (vmode) == HFmode || GET_MODE_INNER (vmode) == BFmode)
    {
      machine_mode orig_mode = vmode;
      vmode = mode_for_vector (HImode,
			       GET_MODE_NUNITS (vmode)).require ();
      /* TARGET/OP0/OP1 may be null for testing-only queries.  */
      if (target)
	target = lowpart_subreg (vmode, target, orig_mode);
      if (op0)
	op0 = lowpart_subreg (vmode, op0, orig_mode);
      if (op1)
	op1 = lowpart_subreg (vmode, op1, orig_mode);
    }

  d.target = target;
  d.op0 = op0;
  d.op1 = op1;

  d.vmode = vmode;
  gcc_assert (VECTOR_MODE_P (d.vmode));
  d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
  /* A null TARGET means the caller only asks whether the permutation
     is representable, without emitting any code.  */
  d.testing_p = !target;

  gcc_assert (sel.length () == nelt);
  gcc_checking_assert (sizeof (d.perm) == sizeof (perm));

  /* Given sufficient ISA support we can just return true here
     for selected vector modes. */
  switch (d.vmode)
    {
    case E_V16SFmode:
    case E_V16SImode:
    case E_V8DImode:
    case E_V8DFmode:
      if (!TARGET_AVX512F)
	return false;
      /* All implementable with a single vperm[it]2 insn. */
      if (d.testing_p)
	return true;
      break;
    case E_V32HImode:
      if (!TARGET_AVX512F)
	return false;
      if (d.testing_p && TARGET_AVX512BW)
	/* All implementable with a single vperm[it]2 insn. */
	return true;
      break;
    case E_V64QImode:
      if (!TARGET_AVX512F)
	return false;
      if (d.testing_p && TARGET_AVX512BW)
	/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
	return true;
      break;
    case E_V8SImode:
    case E_V8SFmode:
    case E_V4DFmode:
    case E_V4DImode:
      if (!TARGET_AVX)
	return false;
      if (d.testing_p && TARGET_AVX512VL)
	/* All implementable with a single vperm[it]2 insn. */
	return true;
      break;
    case E_V16HImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
	return true;
      break;
    case E_V32QImode:
      if (!TARGET_SSE2)
	return false;
      if (d.testing_p && TARGET_AVX2)
	/* Implementable with 4 vpshufb insns, 2 vpermq and 3 vpor insns. */
	return true;
      break;
    case E_V8HImode:
    case E_V16QImode:
      if (!TARGET_SSE2)
	return false;
      /* Fall through. */
    case E_V4SImode:
    case E_V4SFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with a single vpperm insn. */
      if (d.testing_p && TARGET_XOP)
	return true;
      /* All implementable with 2 pshufb + 1 ior. */
      if (d.testing_p && TARGET_SSSE3)
	return true;
      break;
    case E_V2SFmode:
    case E_V2SImode:
    case E_V4HImode:
    case E_V8QImode:
      if (!TARGET_MMX_WITH_SSE)
	return false;
      break;
    case E_V2HImode:
      if (!TARGET_SSE2)
	return false;
      /* All implementable with *punpckwd. */
      if (d.testing_p)
	return true;
      break;
    case E_V4QImode:
      if (!TARGET_SSE2)
	return false;
      break;
    case E_V2DImode:
    case E_V2DFmode:
      if (!TARGET_SSE)
	return false;
      /* All implementable with shufpd or unpck[lh]pd. */
      if (d.testing_p)
	return true;
      break;
    default:
      return false;
    }

  /* Copy the selector, validating each index and recording in WHICH
     whether the first (bit 0) and/or second (bit 1) operand is used.  */
  for (i = which = 0; i < nelt; ++i)
    {
      unsigned char e = sel[i];
      gcc_assert (e < 2 * nelt);
      d.perm[i] = e;
      perm[i] = e;
      which |= (e < nelt ? 1 : 2);
    }

  if (d.testing_p)
    {
      /* For all elements from second vector, fold the elements to first. */
      if (which == 2)
	for (i = 0; i < nelt; ++i)
	  d.perm[i] -= nelt;

      /* Check whether the mask can be applied to the vector type. */
      d.one_operand_p = (which != 3);

      /* Implementable with shufps, pshufd or pshuflw. */
      if (d.one_operand_p
	  && (d.vmode == V4SFmode || d.vmode == V2SFmode
	      || d.vmode == V4SImode || d.vmode == V2SImode
	      || d.vmode == V4HImode || d.vmode == V2HImode))
	return true;

      /* Otherwise we have to go through the motions and see if we can
	 figure out how to generate the requested permutation.  Use raw
	 dummy registers and discard the generated sequence.  */
      d.target = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 1);
      d.op1 = d.op0 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 2);
      if (!d.one_operand_p)
	d.op1 = gen_raw_REG (d.vmode, LAST_VIRTUAL_REGISTER + 3);

      start_sequence ();
      bool ret = ix86_expand_vec_perm_const_1 (&d);
      end_sequence ();

      return ret;
    }

  two_args = canonicalize_perm (&d);

  /* If one of the operands is a zero vector, try to match pmovzx. */
  if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
    {
      struct expand_vec_perm_d dzero = d;
      if (d.op0 == CONST0_RTX (vmode))
	{
	  /* Swap so the zero vector is the second operand and flip the
	     selector's operand bits accordingly.  */
	  d.op1 = dzero.op1 = force_reg (vmode, d.op1);
	  std::swap (dzero.op0, dzero.op1);
	  for (i = 0; i < nelt; ++i)
	    dzero.perm[i] ^= nelt;
	}
      else
	d.op0 = dzero.op0 = force_reg (vmode, d.op0);

      if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
				  dzero.perm, nelt, dzero.testing_p))
	return true;
    }

  /* Force operands into registers. */
  rtx nop0 = force_reg (vmode, d.op0);
  if (d.op0 == d.op1)
    d.op1 = nop0;
  d.op0 = nop0;
  d.op1 = force_reg (vmode, d.op1);

  if (ix86_expand_vec_perm_const_1 (&d))
    return true;

  /* If the selector says both arguments are needed, but the operands are the
     same, the above tried to expand with one_operand_p and flattened selector.
     If that didn't work, retry without one_operand_p; we succeeded with that
     during testing.  PERM still holds the unflattened selector.  */
  if (two_args && d.one_operand_p)
    {
      d.one_operand_p = false;
      memcpy (d.perm, perm, sizeof (perm));
      return ix86_expand_vec_perm_const_1 (&d);
    }

  return false;
}
25579 :
25580 : void
25581 8190 : ix86_expand_vec_extract_even_odd (rtx targ, rtx op0, rtx op1, unsigned odd)
25582 : {
25583 8190 : struct expand_vec_perm_d d;
25584 8190 : unsigned i, nelt;
25585 :
25586 8190 : d.target = targ;
25587 8190 : d.op0 = op0;
25588 8190 : d.op1 = op1;
25589 8190 : d.vmode = GET_MODE (targ);
25590 8190 : d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
25591 8190 : d.one_operand_p = false;
25592 8190 : d.testing_p = false;
25593 :
25594 77926 : for (i = 0; i < nelt; ++i)
25595 69736 : d.perm[i] = i * 2 + odd;
25596 :
25597 : /* We'll either be able to implement the permutation directly... */
25598 8190 : if (expand_vec_perm_1 (&d))
25599 3176 : return;
25600 :
25601 : /* ... or we use the special-case patterns. */
25602 5014 : expand_vec_perm_even_odd_1 (&d, odd);
25603 : }
25604 :
25605 : static void
25606 924 : ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
25607 : {
25608 924 : struct expand_vec_perm_d d;
25609 924 : unsigned i, nelt, base;
25610 924 : bool ok;
25611 :
25612 924 : d.target = targ;
25613 924 : d.op0 = op0;
25614 924 : d.op1 = op1;
25615 924 : d.vmode = GET_MODE (targ);
25616 924 : d.nelt = nelt = GET_MODE_NUNITS (d.vmode);
25617 924 : d.one_operand_p = false;
25618 924 : d.testing_p = false;
25619 :
25620 924 : base = high_p ? nelt / 2 : 0;
25621 3652 : for (i = 0; i < nelt / 2; ++i)
25622 : {
25623 2728 : d.perm[i * 2] = i + base;
25624 2728 : d.perm[i * 2 + 1] = i + base + nelt;
25625 : }
25626 :
25627 : /* Note that for AVX this isn't one instruction. */
25628 924 : ok = ix86_expand_vec_perm_const_1 (&d);
25629 924 : gcc_assert (ok);
25630 924 : }
25631 :
25632 : /* Expand a vector operation shift by constant for a V*QImode in terms of the
25633 : same operation on V*HImode. Return true if success. */
static bool
ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
				     rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode, himode;
  HOST_WIDE_INT and_constant, xor_constant;
  HOST_WIDE_INT shift_amount;
  rtx vec_const_and, vec_const_xor;
  rtx tmp, op1_subreg;
  rtx (*gen_shift) (rtx, rtx, rtx);
  rtx (*gen_and) (rtx, rtx, rtx);
  rtx (*gen_xor) (rtx, rtx, rtx);
  rtx (*gen_sub) (rtx, rtx, rtx);

  /* Only optimize shift by constant. */
  if (!CONST_INT_P (op2))
    return false;

  qimode = GET_MODE (dest);
  shift_amount = INTVAL (op2);
  /* Do nothing when shift amount greater equal 8. */
  if (shift_amount > 7)
    return false;

  gcc_assert (code == ASHIFT || code == ASHIFTRT || code == LSHIFTRT);


  /* Arithmetic right shift by 7 yields 0 or -1 per byte, i.e. a byte
     sign mask, which has cheaper dedicated expansions.  */
  if (shift_amount == 7
      && code == ASHIFTRT)
    {
      if (qimode == V16QImode
	  || qimode == V32QImode)
	{
	  /* Emit 0 > op1 as a vector byte comparison.  */
	  rtx zero = gen_reg_rtx (qimode);
	  emit_move_insn (zero, CONST0_RTX (qimode));
	  emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
	}
      else
	{
	  /* For 512-bit vectors round-trip the sign bits through a
	     mask register: vpmovb2m + vpmovm2b.  */
	  gcc_assert (qimode == V64QImode);
	  rtx kmask = gen_reg_rtx (DImode);
	  emit_insn (gen_avx512bw_cvtb2maskv64qi (kmask, op1));
	  emit_insn (gen_avx512bw_cvtmask2bv64qi (dest, kmask));
	}
      return true;
    }

  /* Record sign bit. */
  xor_constant = 1 << (8 - shift_amount - 1);

  /* Zero upper/lower bits shift from left/right element. */
  and_constant
    = (code == ASHIFT ? 256 - (1 << shift_amount)
       : (1 << (8 - shift_amount)) - 1);

  /* Pick the word-mode shift and the byte-mode logic generators for
     the vector width at hand.  */
  switch (qimode)
    {
    case V16QImode:
      himode = V8HImode;
      gen_shift =
	((code == ASHIFT)
	 ? gen_ashlv8hi3
	 : (code == ASHIFTRT) ? gen_ashrv8hi3 : gen_lshrv8hi3);
      gen_and = gen_andv16qi3;
      gen_xor = gen_xorv16qi3;
      gen_sub = gen_subv16qi3;
      break;
    case V32QImode:
      himode = V16HImode;
      gen_shift =
	((code == ASHIFT)
	 ? gen_ashlv16hi3
	 : (code == ASHIFTRT) ? gen_ashrv16hi3 : gen_lshrv16hi3);
      gen_and = gen_andv32qi3;
      gen_xor = gen_xorv32qi3;
      gen_sub = gen_subv32qi3;
      break;
    case V64QImode:
      himode = V32HImode;
      gen_shift =
	((code == ASHIFT)
	 ? gen_ashlv32hi3
	 : (code == ASHIFTRT) ? gen_ashrv32hi3 : gen_lshrv32hi3);
      gen_and = gen_andv64qi3;
      gen_xor = gen_xorv64qi3;
      gen_sub = gen_subv64qi3;
      break;
    default:
      gcc_unreachable ();
    }

  tmp = gen_reg_rtx (himode);
  vec_const_and = gen_reg_rtx (qimode);
  op1_subreg = lowpart_subreg (himode, op1, qimode);

  /* For ASHIFT and LSHIFTRT, perform operation like
     vpsllw/vpsrlw $shift_amount, %op1, %dest.
     vpand %vec_const_and, %dest. */
  emit_insn (gen_shift (tmp, op1_subreg, op2));
  emit_move_insn (dest, simplify_gen_subreg (qimode, tmp, himode, 0));
  emit_move_insn (vec_const_and,
		  ix86_build_const_vector (qimode, true,
					   gen_int_mode (and_constant, QImode)));
  emit_insn (gen_and (dest, dest, vec_const_and));

  /* For ASHIFTRT, perform extra operation like
     vpxor %vec_const_xor, %dest, %dest
     vpsubb %vec_const_xor, %dest, %dest */
  if (code == ASHIFTRT)
    {
      vec_const_xor = gen_reg_rtx (qimode);
      emit_move_insn (vec_const_xor,
		      ix86_build_const_vector (qimode, true,
					       gen_int_mode (xor_constant, QImode)));
      emit_insn (gen_xor (dest, dest, vec_const_xor));
      emit_insn (gen_sub (dest, dest, vec_const_xor));
    }
  return true;
}
25753 :
/* Expand a vector operation CODE on a partial (V4QI/V8QI) byte vector
   DEST = OP1 CODE OP2 by widening into a full V16QI vector, performing
   the operation in V8HImode, and narrowing the result back.  */

void
ix86_expand_vecop_qihi_partial (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  rtx qop1, qop2, hop1, hop2, qdest, hdest;
  /* OP2 is a vector for mult/vashr/vlshr/vashl, a scalar shift count
     otherwise.  */
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  bool uns_p = code != ASHIFTRT;

  switch (qimode)
    {
    case E_V4QImode:
    case E_V8QImode:
      break;
    default:
      gcc_unreachable ();
    }

  /* View the partial vectors as full V16QI vectors.  */
  qop1 = lowpart_subreg (V16QImode, force_reg (qimode, op1), qimode);

  if (op2vec)
    qop2 = lowpart_subreg (V16QImode, force_reg (qimode, op2), qimode);
  else
    qop2 = op2;

  qdest = gen_reg_rtx (V16QImode);

  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      /* With AVX512 it's cheaper to do vpmovsxbw/op/vpmovwb.
	 Even with SSE4.1 the alternative is better.  */
      && !TARGET_SSE4_1
      && ix86_expand_vec_shift_qihi_constant (code, qdest, qop1, qop2))
    {
      emit_move_insn (dest, gen_lowpart (qimode, qdest));
      return;
    }

  /* Arithmetic shift right by 7 is just the byte sign mask; emit it
     as a 0 > op1 comparison instead.  */
  if (CONST_INT_P (op2)
      && code == ASHIFTRT
      && INTVAL (op2) == 7)
    {
      rtx zero = gen_reg_rtx (qimode);
      emit_move_insn (zero, CONST0_RTX (qimode));
      emit_move_insn (dest, gen_rtx_fmt_ee (GT, qimode, zero, op1));
      return;
    }

  /* Widen the operands to V8HImode.  */
  switch (code)
    {
    case MULT:
      gcc_assert (op2vec);
      if (!TARGET_SSE4_1)
	{
	  /* Unpack data such that we've got a source byte in each low byte
	     of each word.  We don't care what goes into the high byte of
	     each word.  Rather than trying to get zero in there, most
	     convenient is to let it be a copy of the low byte.  */
	  hop1 = copy_to_reg (qop1);
	  hop2 = copy_to_reg (qop2);
	  emit_insn (gen_vec_interleave_lowv16qi (hop1, hop1, hop1));
	  emit_insn (gen_vec_interleave_lowv16qi (hop2, hop2, hop2));
	  break;
	}
      /* FALLTHRU */
    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      hop1 = gen_reg_rtx (V8HImode);
      ix86_expand_sse_unpack (hop1, qop1, uns_p, false);
      /* mult/vashr/vlshr/vashl  */
      if (op2vec)
	{
	  hop2 = gen_reg_rtx (V8HImode);
	  ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
	}
      else
	hop2 = qop2;

      break;
    default:
      gcc_unreachable ();
    }

  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      hdest = gen_reg_rtx (V8HImode);
      emit_insn (gen_rtx_SET (hdest,
			      simplify_gen_binary (code, V8HImode,
						   hop1, hop2)));
    }
  else
    /* Expand mult/ashr/lshr/ashl.  */
    hdest = expand_simple_binop (V8HImode, code, hop1, hop2,
				 NULL_RTX, 1, OPTAB_DIRECT);

  /* Narrow the V8HI result back to bytes.  */
  if (TARGET_AVX512BW && TARGET_AVX512VL)
    {
      /* With vpmovwb a single truncation suffices.  */
      if (qimode == V8QImode)
	qdest = dest;
      else
	qdest = gen_reg_rtx (V8QImode);

      emit_insn (gen_truncv8hiv8qi2 (qdest, hdest));
    }
  else
    {
      struct expand_vec_perm_d d;
      rtx qres = gen_lowpart (V16QImode, hdest);
      bool ok;
      int i;

      /* Merge the data back into the right place.  */
      d.target = qdest;
      d.op0 = d.op1 = qres;
      d.vmode = V16QImode;
      d.nelt = 16;
      d.one_operand_p = TARGET_SSSE3;
      d.testing_p = false;

      /* Select the low byte of each word.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;

      ok = ix86_expand_vec_perm_const_1 (&d);
      gcc_assert (ok);
    }

  if (qdest != dest)
    emit_move_insn (dest, gen_lowpart (qimode, qdest));
}
25884 :
25885 : /* Emit instruction in 2x wider mode. For example, optimize
25886 : vector MUL generation like
25887 :
25888 : vpmovzxbw ymm2, xmm0
25889 : vpmovzxbw ymm3, xmm1
25890 : vpmullw ymm4, ymm2, ymm3
25891 : vpmovwb xmm0, ymm4
25892 :
25893 : it would take less instructions than ix86_expand_vecop_qihi.
25894 : Return true if success. */
25895 :
static bool
ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode himode, qimode = GET_MODE (dest);
  machine_mode wqimode;
  rtx qop1, qop2, hop1, hop2, hdest;
  rtx (*gen_truncate)(rtx, rtx) = NULL;
  /* OP2 is a vector for mult/vashr/vlshr/vashl, a scalar shift count
     otherwise.  */
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  bool uns_p = code != ASHIFTRT;

  /* Without VPMOVWB (provided by AVX512BW ISA), the expansion uses the
     generic permutation to merge the data back into the right place.  This
     permutation results in VPERMQ, which is slow, so better fall back to
     ix86_expand_vecop_qihi.  */
  if (!TARGET_AVX512BW
      || (qimode == V16QImode && !TARGET_AVX512VL)
      /* There are no V64HImode instructions.  */
      || qimode == V64QImode)
    return false;

  /* Do not generate ymm/zmm instructions when
     target prefers 128/256 bit vector width.  */
  if ((qimode == V16QImode && TARGET_PREFER_AVX128)
      || (qimode == V32QImode && TARGET_PREFER_AVX256))
    return false;

  /* Pick the doubled-width word mode and the matching vpmovwb
     truncation pattern.  */
  switch (qimode)
    {
    case E_V16QImode:
      himode = V16HImode;
      gen_truncate = gen_truncv16hiv16qi2;
      break;
    case E_V32QImode:
      himode = V32HImode;
      gen_truncate = gen_truncv32hiv32qi2;
      break;
    default:
      gcc_unreachable ();
    }

  /* View the operands as vectors of twice as many bytes so the sign/zero
     extension below can fill a full HImode vector.  */
  wqimode = GET_MODE_2XWIDER_MODE (qimode).require ();
  qop1 = lowpart_subreg (wqimode, force_reg (qimode, op1), qimode);

  if (op2vec)
    qop2 = lowpart_subreg (wqimode, force_reg (qimode, op2), qimode);
  else
    qop2 = op2;

  /* Extend to words: vpmovzxbw, or vpmovsxbw for arithmetic shifts.  */
  hop1 = gen_reg_rtx (himode);
  ix86_expand_sse_unpack (hop1, qop1, uns_p, false);

  if (op2vec)
    {
      hop2 = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (hop2, qop2, uns_p, false);
    }
  else
    hop2 = qop2;

  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      hdest = gen_reg_rtx (himode);
      emit_insn (gen_rtx_SET (hdest,
			      simplify_gen_binary (code, himode,
						   hop1, hop2)));
    }
  else
    /* Expand mult/ashr/lshr/ashl.  */
    hdest = expand_simple_binop (himode, code, hop1, hop2,
				 NULL_RTX, 1, OPTAB_DIRECT);

  /* Narrow the word result back to bytes with vpmovwb.  */
  emit_insn (gen_truncate (dest, hdest));
  return true;
}
25971 :
25972 : /* Expand a vector operation CODE for a V*QImode in terms of the
25973 : same operation on V*HImode. */
25974 :
void
ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
  machine_mode qimode = GET_MODE (dest);
  machine_mode himode;
  /* Generators for the low/high interleave of the QImode operands.  */
  rtx (*gen_il) (rtx, rtx, rtx);
  rtx (*gen_ih) (rtx, rtx, rtx);
  rtx op1_l, op1_h, op2_l, op2_h, res_l, res_h;
  /* True when OP2 is itself an integer vector (MULT, or a
     vector-by-vector shift), as opposed to a scalar shift count.  */
  bool op2vec = GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT;
  struct expand_vec_perm_d d;
  bool full_interleave = true;
  /* Only arithmetic right shift needs sign extension when unpacking.  */
  bool uns_p = code != ASHIFTRT;
  bool ok;
  int i;

  /* A shift by a constant can sometimes be done directly on the QImode
     vector without widening at all.  */
  if (CONST_INT_P (op2)
      && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
      && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
    return;

  /* Next preference: single widen / operate / truncate sequence.  */
  if (ix86_expand_vecop_qihi2 (code, dest, op1, op2))
    return;

  /* Pick the HImode vector of the same total width as QIMODE.  */
  switch (qimode)
    {
    case E_V16QImode:
      himode = V8HImode;
      break;
    case E_V32QImode:
      himode = V16HImode;
      break;
    case E_V64QImode:
      himode = V32HImode;
      break;
    default:
      gcc_unreachable ();
    }

  switch (code)
    {
    case MULT:
      gcc_assert (op2vec);
      /* Unpack data such that we've got a source byte in each low byte of
	 each word.  We don't care what goes into the high byte of each word.
	 Rather than trying to get zero in there, most convenient is to let
	 it be a copy of the low byte.  */
      switch (qimode)
	{
	case E_V16QImode:
	  gen_il = gen_vec_interleave_lowv16qi;
	  gen_ih = gen_vec_interleave_highv16qi;
	  break;
	case E_V32QImode:
	  gen_il = gen_avx2_interleave_lowv32qi;
	  gen_ih = gen_avx2_interleave_highv32qi;
	  /* These interleaves work within 128-bit lanes, so the final
	     permutation must compensate (see below).  */
	  full_interleave = false;
	  break;
	case E_V64QImode:
	  gen_il = gen_avx512bw_interleave_lowv64qi;
	  gen_ih = gen_avx512bw_interleave_highv64qi;
	  full_interleave = false;
	  break;
	default:
	  gcc_unreachable ();
	}

      op2_l = gen_reg_rtx (qimode);
      op2_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op2_l, op2, op2));
      emit_insn (gen_ih (op2_h, op2, op2));

      op1_l = gen_reg_rtx (qimode);
      op1_h = gen_reg_rtx (qimode);
      emit_insn (gen_il (op1_l, op1, op1));
      emit_insn (gen_ih (op1_h, op1, op1));
      break;

    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      /* For shifts, properly widen OP1 (zero- or sign-extended
	 depending on UNS_P).  */
      op1_l = gen_reg_rtx (himode);
      op1_h = gen_reg_rtx (himode);
      ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
      ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
      /* vashr/vlshr/vashl  */
      if (op2vec)
	{
	  rtx tmp = force_reg (qimode, op2);
	  op2_l = gen_reg_rtx (himode);
	  op2_h = gen_reg_rtx (himode);
	  ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
	  ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
	}
      else
	/* A scalar shift count is shared by both halves.  */
	op2_l = op2_h = op2;

      break;
    default:
      gcc_unreachable ();
    }

  if (code != MULT && op2vec)
    {
      /* Expand vashr/vlshr/vashl.  */
      res_l = gen_reg_rtx (himode);
      res_h = gen_reg_rtx (himode);
      emit_insn (gen_rtx_SET (res_l,
			      simplify_gen_binary (code, himode,
						   op1_l, op2_l)));
      emit_insn (gen_rtx_SET (res_h,
			      simplify_gen_binary (code, himode,
						   op1_h, op2_h)));
    }
  else
    {
      /* Expand mult/ashr/lshr/ashl.  */
      res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
				   1, OPTAB_DIRECT);
      res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
				   1, OPTAB_DIRECT);
    }

  gcc_assert (res_l && res_h);

  /* Merge the data back into the right place.  */
  d.target = dest;
  d.op0 = gen_lowpart (qimode, res_l);
  d.op1 = gen_lowpart (qimode, res_h);
  d.vmode = qimode;
  d.nelt = GET_MODE_NUNITS (qimode);
  d.one_operand_p = false;
  d.testing_p = false;

  if (full_interleave)
    {
      /* We used the full interleave, the desired
	 results are in the even elements.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = i * 2;
    }
  else
    {
      /* For AVX, the interleave used above was not cross-lane.  So the
	 extraction is evens but with the second and third quarter swapped.
	 Happily, that is even one insn shorter than even extraction.
	 For AVX512BW we have 4 lanes.  We extract evens from within a lane,
	 always first from the first and then from the second source operand,
	 the index bits above the low 4 bits remains the same.
	 Thus, for d.nelt == 32 we want permutation
	 0,2,4,..14, 32,34,36,..46, 16,18,20,..30, 48,50,52,..62
	 and for d.nelt == 64 we want permutation
	 0,2,4,..14, 64,66,68,..78, 16,18,20,..30, 80,82,84,..94,
	 32,34,36,..46, 96,98,100,..110, 48,50,52,..62, 112,114,116,..126.  */
      for (i = 0; i < d.nelt; ++i)
	d.perm[i] = ((i * 2) & 14) + ((i & 8) ? d.nelt : 0) + (i & ~15);
    }

  ok = ix86_expand_vec_perm_const_1 (&d);
  gcc_assert (ok);
}
26135 :
26136 : /* Helper function of ix86_expand_mul_widen_evenodd. Return true
26137 : if op is CONST_VECTOR with all odd elements equal to their
26138 : preceding element. */
26139 :
26140 : static bool
26141 8756 : const_vector_equal_evenodd_p (rtx op)
26142 : {
26143 8756 : machine_mode mode = GET_MODE (op);
26144 8756 : int i, nunits = GET_MODE_NUNITS (mode);
26145 8756 : if (!CONST_VECTOR_P (op)
26146 8756 : || nunits != CONST_VECTOR_NUNITS (op))
26147 : return false;
26148 3574 : for (i = 0; i < nunits; i += 2)
26149 2882 : if (CONST_VECTOR_ELT (op, i) != CONST_VECTOR_ELT (op, i + 1))
26150 : return false;
26151 : return true;
26152 : }
26153 :
/* Widening multiply of the even (ODD_P false) or odd (ODD_P true)
   SImode elements of OP1 and OP2, leaving the double-width products
   in DEST.  UNS_P selects unsigned multiplication.  */

void
ix86_expand_mul_widen_evenodd (rtx dest, rtx op1, rtx op2,
			       bool uns_p, bool odd_p)
{
  machine_mode mode = GET_MODE (op1);
  machine_mode wmode = GET_MODE (dest);
  rtx x;
  rtx orig_op1 = op1, orig_op2 = op2;

  if (!nonimmediate_operand (op1, mode))
    op1 = force_reg (mode, op1);
  if (!nonimmediate_operand (op2, mode))
    op2 = force_reg (mode, op2);

  /* We only play even/odd games with vectors of SImode.  */
  gcc_assert (mode == V4SImode || mode == V8SImode || mode == V16SImode);

  /* If we're looking for the odd results, shift those members down to
     the even slots.  For some cpus this is faster than a PSHUFD.  */
  if (odd_p)
    {
      /* For XOP use vpmacsdqh, but only for smult, as it is only
	 signed.  */
      if (TARGET_XOP && mode == V4SImode && !uns_p)
	{
	  x = force_reg (wmode, CONST0_RTX (wmode));
	  emit_insn (gen_xop_pmacsdqh (dest, op1, op2, x));
	  return;
	}

      x = GEN_INT (GET_MODE_UNIT_BITSIZE (mode));
      /* The shift is redundant for a constant vector whose odd elements
	 already duplicate the even ones.  */
      if (!const_vector_equal_evenodd_p (orig_op1))
	op1 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op1),
			    x, NULL, 1, OPTAB_DIRECT);
      if (!const_vector_equal_evenodd_p (orig_op2))
	op2 = expand_binop (wmode, lshr_optab, gen_lowpart (wmode, op2),
			    x, NULL, 1, OPTAB_DIRECT);
      op1 = gen_lowpart (mode, op1);
      op2 = gen_lowpart (mode, op2);
    }

  /* Pick the even-multiply pattern matching the mode and signedness.  */
  if (mode == V16SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v16si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v16si (dest, op1, op2);
    }
  else if (mode == V8SImode)
    {
      if (uns_p)
	x = gen_vec_widen_umult_even_v8si (dest, op1, op2);
      else
	x = gen_vec_widen_smult_even_v8si (dest, op1, op2);
    }
  else if (uns_p)
    x = gen_vec_widen_umult_even_v4si (dest, op1, op2);
  else if (TARGET_SSE4_1)
    x = gen_sse4_1_mulv2siv2di3 (dest, op1, op2);
  else
    {
      rtx s1, s2, t0, t1, t2;

      /* The easiest way to implement this without PMULDQ is to go through
	 the motions as if we are performing a full 64-bit multiply.  With
	 the exception that we need to do less shuffling of the elements.  */

      /* Compute the sign-extension, aka highparts, of the two operands.  */
      s1 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op1, pc_rtx, pc_rtx);
      s2 = ix86_expand_sse_cmp (gen_reg_rtx (mode), GT, CONST0_RTX (mode),
				op2, pc_rtx, pc_rtx);

      /* Multiply LO(A) * HI(B), and vice-versa.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t1, s1, op2));
      emit_insn (gen_vec_widen_umult_even_v4si (t2, s2, op1));

      /* Multiply LO(A) * LO(B).  */
      t0 = gen_reg_rtx (wmode);
      emit_insn (gen_vec_widen_umult_even_v4si (t0, op1, op2));

      /* Combine and shift the highparts into place.  */
      t1 = expand_binop (wmode, add_optab, t1, t2, t1, 1, OPTAB_DIRECT);
      t1 = expand_binop (wmode, ashl_optab, t1, GEN_INT (32), t1,
			 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (wmode, add_optab, t0, t1, dest, 1, OPTAB_DIRECT);
      return;
    }
  emit_insn (x);
}
26248 :
/* Widening multiply of the high (HIGH_P true) or low half of the
   elements of OP1 and OP2, leaving the double-width products in DEST.
   UNS_P selects unsigned multiplication.  */

void
ix86_expand_mul_widen_hilo (rtx dest, rtx op1, rtx op2,
			    bool uns_p, bool high_p)
{
  machine_mode wmode = GET_MODE (dest);
  machine_mode mode = GET_MODE (op1);
  rtx t1, t2, t3, t4, mask;

  switch (mode)
    {
    case E_V4SImode:
      t1 = gen_reg_rtx (mode);
      t2 = gen_reg_rtx (mode);
      if (TARGET_XOP && !uns_p)
	{
	  /* With XOP, we have pmacsdqh, aka mul_widen_odd.  In this case,
	     shuffle the elements once so that all elements are in the right
	     place for immediate use: { A C B D }.  */
	  emit_insn (gen_sse2_pshufd_1 (t1, op1, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	  emit_insn (gen_sse2_pshufd_1 (t2, op2, const0_rtx, const2_rtx,
					const1_rtx, GEN_INT (3)));
	}
      else
	{
	  /* Put the elements into place for the multiply.  */
	  ix86_expand_vec_interleave (t1, op1, op1, high_p);
	  ix86_expand_vec_interleave (t2, op2, op2, high_p);
	  /* After interleaving, the even-element multiply gives the
	     desired half directly.  */
	  high_p = false;
	}
      ix86_expand_mul_widen_evenodd (dest, t1, t2, uns_p, high_p);
      break;

    case E_V8SImode:
      /* Shuffle the elements between the lanes.  After this we
	 have { A B E F | C D G H } for each operand.  */
      t1 = gen_reg_rtx (V4DImode);
      t2 = gen_reg_rtx (V4DImode);
      emit_insn (gen_avx2_permv4di_1 (t1, gen_lowpart (V4DImode, op1),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));
      emit_insn (gen_avx2_permv4di_1 (t2, gen_lowpart (V4DImode, op2),
				      const0_rtx, const2_rtx,
				      const1_rtx, GEN_INT (3)));

      /* Shuffle the elements within the lanes.  After this we
	 have { A A B B | C C D D } or { E E F F | G G H H }.  */
      t3 = gen_reg_rtx (V8SImode);
      t4 = gen_reg_rtx (V8SImode);
      mask = GEN_INT (high_p
		      ? 2 + (2 << 2) + (3 << 4) + (3 << 6)
		      : 0 + (0 << 2) + (1 << 4) + (1 << 6));
      emit_insn (gen_avx2_pshufdv3 (t3, gen_lowpart (V8SImode, t1), mask));
      emit_insn (gen_avx2_pshufdv3 (t4, gen_lowpart (V8SImode, t2), mask));

      ix86_expand_mul_widen_evenodd (dest, t3, t4, uns_p, false);
      break;

    case E_V8HImode:
    case E_V16HImode:
      /* Compute the low and high halves of the products separately,
	 then interleave them to pick the requested half.  */
      t1 = expand_binop (mode, smul_optab, op1, op2, NULL_RTX,
			 uns_p, OPTAB_DIRECT);
      t2 = expand_binop (mode,
			 uns_p ? umul_highpart_optab : smul_highpart_optab,
			 op1, op2, NULL_RTX, uns_p, OPTAB_DIRECT);
      gcc_assert (t1 && t2);

      t3 = gen_reg_rtx (mode);
      ix86_expand_vec_interleave (t3, t1, t2, high_p);
      emit_move_insn (dest, gen_lowpart (wmode, t3));
      break;

    case E_V16QImode:
    case E_V32QImode:
    case E_V32HImode:
    case E_V16SImode:
    case E_V64QImode:
      /* Widen both operands and emit a full-width multiply.  */
      t1 = gen_reg_rtx (wmode);
      t2 = gen_reg_rtx (wmode);
      ix86_expand_sse_unpack (t1, op1, uns_p, high_p);
      ix86_expand_sse_unpack (t2, op2, uns_p, high_p);

      emit_insn (gen_rtx_SET (dest, gen_rtx_MULT (wmode, t1, t2)));
      break;

    default:
      gcc_unreachable ();
    }
}
26338 :
/* Expand a V4SImode multiply OP0 = OP1 * OP2 via two even/odd
   widening multiplies and an interleave.  */

void
ix86_expand_sse2_mulv4si3 (rtx op0, rtx op1, rtx op2)
{
  rtx res_1, res_2, res_3, res_4;

  res_1 = gen_reg_rtx (V4SImode);
  res_2 = gen_reg_rtx (V4SImode);
  res_3 = gen_reg_rtx (V2DImode);
  res_4 = gen_reg_rtx (V2DImode);
  /* Products of the even and of the odd elements; the low 32 bits of
     each DImode element hold the desired SImode products.  */
  ix86_expand_mul_widen_evenodd (res_3, op1, op2, true, false);
  ix86_expand_mul_widen_evenodd (res_4, op1, op2, true, true);

  /* Move the results in element 2 down to element 1; we don't care
     what goes in elements 2 and 3.  Then we can merge the parts
     back together with an interleave.

     Note that two other sequences were tried:
     (1) Use interleaves at the start instead of psrldq, which allows
     us to use a single shufps to merge things back at the end.
     (2) Use shufps here to combine the two vectors, then pshufd to
     put the elements in the correct order.
     In both cases the cost of the reformatting stall was too high
     and the overall sequence slower.  */

  emit_insn (gen_sse2_pshufd_1 (res_1, gen_lowpart (V4SImode, res_3),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  emit_insn (gen_sse2_pshufd_1 (res_2, gen_lowpart (V4SImode, res_4),
				const0_rtx, const2_rtx,
				const0_rtx, const0_rtx));
  res_1 = emit_insn (gen_vec_interleave_lowv4si (op0, res_1, res_2));

  /* Record the whole sequence as a plain multiply for later passes.  */
  set_unique_reg_note (res_1, REG_EQUAL, gen_rtx_MULT (V4SImode, op1, op2));
}
26373 :
/* Expand a V2DI/V4DI/V8DI multiply OP0 = OP1 * OP2.  Uses a direct
   DImode vector multiply when the target provides one, otherwise
   builds the product from 32x32->64-bit widening multiplies, shifts
   and adds.  */

void
ix86_expand_sse2_mulvxdi3 (rtx op0, rtx op1, rtx op2)
{
  machine_mode mode = GET_MODE (op0);
  rtx t1, t2, t3, t4, t5, t6;

  if (TARGET_AVX512DQ && mode == V8DImode)
    emit_insn (gen_avx512dq_mulv8di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V4DImode)
    emit_insn (gen_avx512dq_mulv4di3 (op0, op1, op2));
  else if (TARGET_AVX512DQ && TARGET_AVX512VL && mode == V2DImode)
    emit_insn (gen_avx512dq_mulv2di3 (op0, op1, op2));
  else if (TARGET_XOP && mode == V2DImode)
    {
      /* op1: A,B,C,D, op2: E,F,G,H */
      op1 = gen_lowpart (V4SImode, op1);
      op2 = gen_lowpart (V4SImode, op2);

      t1 = gen_reg_rtx (V4SImode);
      t2 = gen_reg_rtx (V4SImode);
      t3 = gen_reg_rtx (V2DImode);
      t4 = gen_reg_rtx (V2DImode);

      /* t1: B,A,D,C */
      emit_insn (gen_sse2_pshufd_1 (t1, op1,
				    GEN_INT (1),
				    GEN_INT (0),
				    GEN_INT (3),
				    GEN_INT (2)));

      /* t2: (B*E),(A*F),(D*G),(C*H) */
      emit_insn (gen_mulv4si3 (t2, t1, op2));

      /* t3: (B*E)+(A*F), (D*G)+(C*H) */
      emit_insn (gen_xop_phadddq (t3, t2));

      /* t4: ((B*E)+(A*F))<<32, ((D*G)+(C*H))<<32 */
      emit_insn (gen_ashlv2di3 (t4, t3, GEN_INT (32)));

      /* Multiply lower parts and add all */
      t5 = gen_reg_rtx (V2DImode);
      emit_insn (gen_vec_widen_umult_even_v4si (t5,
						gen_lowpart (V4SImode, op1),
						gen_lowpart (V4SImode, op2)));
      force_expand_binop (mode, add_optab, t5, t4, op0, 1, OPTAB_DIRECT);
    }
  else
    {
      machine_mode nmode;
      rtx (*umul) (rtx, rtx, rtx);

      /* Select the even-element widening multiply of matching width.  */
      if (mode == V2DImode)
	{
	  umul = gen_vec_widen_umult_even_v4si;
	  nmode = V4SImode;
	}
      else if (mode == V4DImode)
	{
	  umul = gen_vec_widen_umult_even_v8si;
	  nmode = V8SImode;
	}
      else if (mode == V8DImode)
	{
	  umul = gen_vec_widen_umult_even_v16si;
	  nmode = V16SImode;
	}
      else
	gcc_unreachable ();


      /* Multiply low parts.  */
      t1 = gen_reg_rtx (mode);
      emit_insn (umul (t1, gen_lowpart (nmode, op1), gen_lowpart (nmode, op2)));

      /* Shift input vectors right 32 bits so we can multiply high parts.  */
      t6 = GEN_INT (32);
      t2 = expand_binop (mode, lshr_optab, op1, t6, NULL, 1, OPTAB_DIRECT);
      t3 = expand_binop (mode, lshr_optab, op2, t6, NULL, 1, OPTAB_DIRECT);

      /* Multiply high parts by low parts.  */
      t4 = gen_reg_rtx (mode);
      t5 = gen_reg_rtx (mode);
      emit_insn (umul (t4, gen_lowpart (nmode, t2), gen_lowpart (nmode, op2)));
      emit_insn (umul (t5, gen_lowpart (nmode, t3), gen_lowpart (nmode, op1)));

      /* Combine and shift the highparts back.  */
      t4 = expand_binop (mode, add_optab, t4, t5, t4, 1, OPTAB_DIRECT);
      t4 = expand_binop (mode, ashl_optab, t4, t6, t4, 1, OPTAB_DIRECT);

      /* Combine high and low parts.  */
      force_expand_binop (mode, add_optab, t1, t4, op0, 1, OPTAB_DIRECT);
    }

  /* Record the whole sequence as a plain multiply for later passes.  */
  set_unique_reg_note (get_last_insn (), REG_EQUAL,
		       gen_rtx_MULT (mode, op1, op2));
}
26470 :
26471 : /* Return true if control transfer instruction INSN
26472 : should be encoded with notrack prefix. */
26473 :
26474 : bool
26475 14857979 : ix86_notrack_prefixed_insn_p (rtx_insn *insn)
26476 : {
26477 14857979 : if (!insn || !((flag_cf_protection & CF_BRANCH)))
26478 : return false;
26479 :
26480 3983300 : if (CALL_P (insn))
26481 : {
26482 1376770 : rtx call = get_call_rtx_from (insn);
26483 1376770 : gcc_assert (call != NULL_RTX);
26484 1376770 : rtx addr = XEXP (call, 0);
26485 :
26486 : /* Do not emit 'notrack' if it's not an indirect call. */
26487 1376770 : if (MEM_P (addr)
26488 1376770 : && SYMBOL_REF_P (XEXP (addr, 0)))
26489 : return false;
26490 : else
26491 68936 : return find_reg_note (insn, REG_CALL_NOCF_CHECK, 0);
26492 : }
26493 :
26494 2606530 : if (JUMP_P (insn) && !flag_cet_switch)
26495 : {
26496 2593159 : rtx target = JUMP_LABEL (insn);
26497 2593159 : if (target == NULL_RTX || ANY_RETURN_P (target))
26498 : return false;
26499 :
26500 : /* Check the jump is a switch table. */
26501 2593121 : rtx_insn *label = as_a<rtx_insn *> (target);
26502 2593121 : rtx_insn *table = next_insn (label);
26503 2593121 : if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
26504 : return false;
26505 : else
26506 : return true;
26507 : }
26508 : return false;
26509 : }
26510 :
26511 : /* Calculate integer abs() using only SSE2 instructions. */
26512 :
/* Compute TARGET = abs (INPUT) element-wise; the strategy depends on
   the vector mode and the available ISA extensions.  */

void
ix86_expand_sse2_abs (rtx target, rtx input)
{
  machine_mode mode = GET_MODE (target);
  rtx tmp0, tmp1, x;

  switch (mode)
    {
    case E_V2DImode:
    case E_V4DImode:
      /* For 64-bit signed integer X, with SSE4.2 use
	 pxor t0, t0; pcmpgtq X, t0; pxor t0, X; psubq t0, X.
	 Otherwise handle it similarly to V4SImode, except use 64 as W instead of
	 32 and use logical instead of arithmetic right shift (which is
	 unimplemented) and subtract.  */
      if (TARGET_SSE4_2)
	{
	  /* tmp0 = all-ones where input < 0, else all-zeros.  */
	  tmp0 = gen_reg_rtx (mode);
	  tmp1 = gen_reg_rtx (mode);
	  emit_move_insn (tmp1, CONST0_RTX (mode));
	  if (mode == E_V2DImode)
	    emit_insn (gen_sse4_2_gtv2di3 (tmp0, tmp1, input));
	  else
	    emit_insn (gen_avx2_gtv4di3 (tmp0, tmp1, input));
	}
      else
	{
	  /* tmp0 = -(input >>u 63), i.e. the broadcast sign bit.  */
	  tmp0 = expand_simple_binop (mode, LSHIFTRT, input,
				      GEN_INT (GET_MODE_UNIT_BITSIZE (mode)
					       - 1), NULL, 0, OPTAB_DIRECT);
	  tmp0 = expand_simple_unop (mode, NEG, tmp0, NULL, false);
	}

      /* abs = (input ^ mask) - mask with mask the sign replication.  */
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V4SImode:
      /* For 32-bit signed integer X, the best way to calculate the absolute
	 value of X is (((signed) X >> (W-1)) ^ X) - ((signed) X >> (W-1)).  */
      tmp0 = expand_simple_binop (mode, ASHIFTRT, input,
				  GEN_INT (GET_MODE_UNIT_BITSIZE (mode) - 1),
				  NULL, 0, OPTAB_DIRECT);
      tmp1 = expand_simple_binop (mode, XOR, tmp0, input,
				  NULL, 0, OPTAB_DIRECT);
      x = expand_simple_binop (mode, MINUS, tmp1, tmp0,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V8HImode:
      /* For 16-bit signed integer X, the best way to calculate the absolute
	 value of X is max (X, -X), as SSE2 provides the PMAXSW insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (mode, SMAX, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    case E_V16QImode:
      /* For 8-bit signed integer X, the best way to calculate the absolute
	 value of X is min ((unsigned char) X, (unsigned char) (-X)),
	 as SSE2 provides the PMINUB insn.  */
      tmp0 = expand_unop (mode, neg_optab, input, NULL_RTX, 0);

      x = expand_simple_binop (V16QImode, UMIN, tmp0, input,
			       target, 0, OPTAB_DIRECT);
      break;

    default:
      gcc_unreachable ();
    }

  if (x != target)
    emit_move_insn (target, x);
}
26590 :
26591 : /* Expand an extract from a vector register through pextr insn.
26592 : Return true if successful. */
26593 :
bool
ix86_expand_pextr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[1];

  /* operands[2] is the extraction width in bits, operands[3] the bit
     position of the field within SRC.  */
  unsigned int size = INTVAL (operands[2]);
  unsigned int pos = INTVAL (operands[3]);

  if (SUBREG_P (dst))
    {
      /* Reject non-lowpart subregs.  */
      if (SUBREG_BYTE (dst) > 0)
	return false;
      dst = SUBREG_REG (dst);
    }

  if (SUBREG_P (src))
    {
      /* Fold a source subreg offset into the extraction position.  */
      pos += SUBREG_BYTE (src) * BITS_PER_UNIT;
      src = SUBREG_REG (src);
    }

  switch (GET_MODE (src))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx d, pat;

	/* The extraction width must correspond to an integer mode.  */
	if (!int_mode_for_size (size, 0).exists (&dstmode))
	  return false;

	/* Map the element width onto the vector mode the pextr insn
	   operates on, checking ISA availability.  */
	switch (dstmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V16QImode;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    srcmode = V8HImode;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V4SImode;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    srcmode = V2DImode;
	    break;

	  default:
	    return false;
	  }

	/* Reject extractions from misaligned positions.  */
	if (pos & (size-1))
	  return false;

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* Construct insn pattern.  */
	pat = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, GEN_INT (pos / size)));
	pat = gen_rtx_VEC_SELECT (dstmode, gen_lowpart (srcmode, src), pat);

	/* Let the rtl optimizers know about the zero extension performed.  */
	if (dstmode == QImode || dstmode == HImode)
	  {
	    pat = gen_rtx_ZERO_EXTEND (SImode, pat);
	    d = gen_lowpart (SImode, d);
	  }

	emit_insn (gen_rtx_SET (d, pat));

	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
26693 :
26694 : /* Expand an insert into a vector register through pinsr insn.
26695 : Return true if successful. */
26696 :
bool
ix86_expand_pinsr (rtx *operands)
{
  rtx dst = operands[0];
  rtx src = operands[3];

  /* operands[1] is the insertion width in bits, operands[2] the bit
     position of the field within DST.  */
  unsigned int size = INTVAL (operands[1]);
  unsigned int pos = INTVAL (operands[2]);

  if (SUBREG_P (dst))
    {
      /* Fold a destination subreg offset into the insertion position.  */
      pos += SUBREG_BYTE (dst) * BITS_PER_UNIT;
      dst = SUBREG_REG (dst);
    }

  switch (GET_MODE (dst))
    {
    case E_V16QImode:
    case E_V8HImode:
    case E_V4SImode:
    case E_V2DImode:
    case E_V1TImode:
      {
	machine_mode srcmode, dstmode;
	rtx (*pinsr)(rtx, rtx, rtx, rtx);
	rtx d;

	/* The insertion width must correspond to an integer mode.  */
	if (!int_mode_for_size (size, 0).exists (&srcmode))
	  return false;

	/* Map the element width onto the vector mode the pinsr insn
	   operates on, checking ISA availability.  */
	switch (srcmode)
	  {
	  case E_QImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V16QImode;
	    pinsr = gen_sse4_1_pinsrb;
	    break;

	  case E_HImode:
	    if (!TARGET_SSE2)
	      return false;
	    dstmode = V8HImode;
	    pinsr = gen_sse2_pinsrw;
	    break;

	  case E_SImode:
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V4SImode;
	    pinsr = gen_sse4_1_pinsrd;
	    break;

	  case E_DImode:
	    gcc_assert (TARGET_64BIT);
	    if (!TARGET_SSE4_1)
	      return false;
	    dstmode = V2DImode;
	    pinsr = gen_sse4_1_pinsrq;
	    break;

	  default:
	    return false;
	  }

	/* Reject insertions to misaligned positions.  */
	if (pos & (size-1))
	  return false;

	if (SUBREG_P (src))
	  {
	    unsigned int srcpos = SUBREG_BYTE (src);

	    if (srcpos > 0)
	      {
		/* A non-lowpart source subreg needs an explicit extract
		   of the field first.  */
		rtx extr_ops[4];

		extr_ops[0] = gen_reg_rtx (srcmode);
		extr_ops[1] = gen_lowpart (srcmode, SUBREG_REG (src));
		extr_ops[2] = GEN_INT (size);
		extr_ops[3] = GEN_INT (srcpos * BITS_PER_UNIT);

		if (!ix86_expand_pextr (extr_ops))
		  return false;

		src = extr_ops[0];
	      }
	    else
	      src = gen_lowpart (srcmode, SUBREG_REG (src));
	  }

	if (GET_MODE (dst) == dstmode)
	  d = dst;
	else
	  d = gen_reg_rtx (dstmode);

	/* The pinsr patterns take the insertion slot as a mask bit.  */
	emit_insn (pinsr (d, gen_lowpart (dstmode, dst),
			  gen_lowpart (srcmode, src),
			  GEN_INT (1 << (pos / size))));
	if (d != dst)
	  emit_move_insn (dst, gen_lowpart (GET_MODE (dst), d));
	return true;
      }

    default:
      return false;
    }
}
26805 :
26806 : /* All CPUs prefer to avoid cross-lane operations so perform reductions
26807 : upper against lower halves up to SSE reg size. */
26808 :
26809 : machine_mode
26810 1892 : ix86_split_reduction (machine_mode mode)
26811 : {
26812 : /* Reduce lowpart against highpart until we reach SSE reg width to
26813 : avoid cross-lane operations. */
26814 1892 : switch (mode)
26815 : {
26816 : case E_V8DImode:
26817 : case E_V4DImode:
26818 : return V2DImode;
26819 9 : case E_V16SImode:
26820 9 : case E_V8SImode:
26821 9 : return V4SImode;
26822 8 : case E_V32HImode:
26823 8 : case E_V16HImode:
26824 8 : return V8HImode;
26825 4 : case E_V64QImode:
26826 4 : case E_V32QImode:
26827 4 : return V16QImode;
26828 5 : case E_V16SFmode:
26829 5 : case E_V8SFmode:
26830 5 : return V4SFmode;
26831 16 : case E_V8DFmode:
26832 16 : case E_V4DFmode:
26833 16 : return V2DFmode;
26834 1845 : default:
26835 1845 : return mode;
26836 : }
26837 : }
26838 :
26839 : /* Generate call to __divmoddi4. */
26840 :
26841 : void
26842 897 : ix86_expand_divmod_libfunc (rtx libfunc, machine_mode mode,
26843 : rtx op0, rtx op1,
26844 : rtx *quot_p, rtx *rem_p)
26845 : {
26846 1794 : rtx rem = assign_stack_temp (mode, GET_MODE_SIZE (mode));
26847 :
26848 897 : rtx quot = emit_library_call_value (libfunc, NULL_RTX, LCT_NORMAL,
26849 : mode, op0, mode, op1, mode,
26850 897 : XEXP (rem, 0), Pmode);
26851 897 : *quot_p = quot;
26852 897 : *rem_p = rem;
26853 897 : }
26854 :
/* Expand a compare-and-swap loop implementing an atomic fetch-CODE
   (AFTER false) or CODE-fetch (AFTER true) operation of VAL on MEM,
   leaving the result in TARGET.  DOUBLEWORD selects the double-word
   cmpxchg expansion.  */

void
ix86_expand_atomic_fetch_op_loop (rtx target, rtx mem, rtx val,
				  enum rtx_code code, bool after,
				  bool doubleword)
{
  rtx old_reg, new_reg, old_mem, success;
  machine_mode mode = GET_MODE (target);
  rtx_code_label *loop_label = NULL;

  old_reg = gen_reg_rtx (mode);
  new_reg = old_reg;
  /* Snapshot the current memory value; the cmpxchg loop retries from
     LOOP_LABEL when it turns out to be stale.  */
  old_mem = copy_to_reg (mem);
  loop_label = gen_label_rtx ();
  emit_label (loop_label);
  emit_move_insn (old_reg, old_mem);

  /* return value for atomic_fetch_op.  */
  if (!after)
    emit_move_insn (target, old_reg);

  if (code == NOT)
    {
      /* NOT encodes the nand-style update ~(old & val): do the AND,
	 then invert.  */
      new_reg = expand_simple_binop (mode, AND, new_reg, val, NULL_RTX,
				     true, OPTAB_LIB_WIDEN);
      new_reg = expand_simple_unop (mode, code, new_reg, NULL_RTX, true);
    }
  else
    new_reg = expand_simple_binop (mode, code, new_reg, val, NULL_RTX,
				   true, OPTAB_LIB_WIDEN);

  /* return value for atomic_op_fetch.  */
  if (after)
    emit_move_insn (target, new_reg);

  success = NULL_RTX;

  ix86_expand_cmpxchg_loop (&success, old_mem, mem, old_reg, new_reg,
			    gen_int_mode (MEMMODEL_SYNC_SEQ_CST,
					  SImode),
			    doubleword, loop_label);
}
26896 :
/* Relax cmpxchg instruction, param loop_label indicates whether
   the instruction should be relaxed with a pause loop.  If not,
   it will be relaxed to an atomic load + compare, and skip
   cmpxchg instruction if mem != exp_input.

   PTARGET_BOOL is in/out: on entry *PTARGET_BOOL may be NULL, in which
   case a fresh QImode register is allocated for the success flag and
   stored back through PTARGET_BOOL on exit.
   TARGET_VAL receives the value observed in MEM.
   MEM is the atomic memory operand; EXP_INPUT the expected value and
   NEW_INPUT the replacement value.
   MEM_MODEL is the memory-model rtx passed through to the cmpxchg
   pattern.  DOUBLEWORD selects the cmpxchg8b/cmpxchg16b style pattern
   that takes the new value as two half-mode operands.
   LOOP_LABEL, when non-NULL, is the head of the caller's retry loop;
   a pause + jump back to it is emitted on failure.  */

void
ix86_expand_cmpxchg_loop (rtx *ptarget_bool, rtx target_val,
			  rtx mem, rtx exp_input, rtx new_input,
			  rtx mem_model, bool doubleword,
			  rtx_code_label *loop_label)
{
  rtx_code_label *cmp_label = NULL;
  rtx_code_label *done_label = NULL;
  rtx target_bool = NULL_RTX, new_mem = NULL_RTX;
  rtx (*gen) (rtx, rtx, rtx, rtx, rtx) = NULL;
  rtx (*gendw) (rtx, rtx, rtx, rtx, rtx, rtx) = NULL;
  machine_mode mode = GET_MODE (target_val), hmode = mode;

  /* Allocate the success flag lazily so callers may pass a NULL slot.  */
  if (*ptarget_bool == NULL)
    target_bool = gen_reg_rtx (QImode);
  else
    target_bool = *ptarget_bool;

  cmp_label = gen_label_rtx ();
  done_label = gen_label_rtx ();

  new_mem = gen_reg_rtx (mode);
  /* Load memory first.  */
  expand_atomic_load (new_mem, mem, MEMMODEL_SEQ_CST);

  /* Pick the cmpxchg expander for MODE; for doubleword cases HMODE is
     the half-width mode used to split operands.  */
  switch (mode)
    {
    case E_TImode:
      gendw = gen_atomic_compare_and_swapti_doubleword;
      hmode = DImode;
      break;
    case E_DImode:
      if (doubleword)
	{
	  gendw = gen_atomic_compare_and_swapdi_doubleword;
	  hmode = SImode;
	}
      else
	gen = gen_atomic_compare_and_swapdi_1;
      break;
    case E_SImode:
      gen = gen_atomic_compare_and_swapsi_1;
      break;
    case E_HImode:
      gen = gen_atomic_compare_and_swaphi_1;
      break;
    case E_QImode:
      gen = gen_atomic_compare_and_swapqi_1;
      break;
    default:
      gcc_unreachable ();
    }

  /* Compare mem value with expected value.  Doubleword values are
     compared half by half; any mismatch branches to cmp_label.  */
  if (doubleword)
    {
      rtx low_new_mem = gen_lowpart (hmode, new_mem);
      rtx low_exp_input = gen_lowpart (hmode, exp_input);
      rtx high_new_mem = gen_highpart (hmode, new_mem);
      rtx high_exp_input = gen_highpart (hmode, exp_input);
      emit_cmp_and_jump_insns (low_new_mem, low_exp_input, NE, NULL_RTX,
			       hmode, 1, cmp_label,
			       profile_probability::guessed_never ());
      emit_cmp_and_jump_insns (high_new_mem, high_exp_input, NE, NULL_RTX,
			       hmode, 1, cmp_label,
			       profile_probability::guessed_never ());
    }
  else
    emit_cmp_and_jump_insns (new_mem, exp_input, NE, NULL_RTX,
			     GET_MODE (exp_input), 1, cmp_label,
			     profile_probability::guessed_never ());

  /* Directly emits cmpxchg here.  */
  if (doubleword)
    emit_insn (gendw (target_val, mem, exp_input,
		      gen_lowpart (hmode, new_input),
		      gen_highpart (hmode, new_input),
		      mem_model));
  else
    emit_insn (gen (target_val, mem, exp_input, new_input, mem_model));

  if (!loop_label)
    {
      /* No retry loop: on mismatch just publish the loaded value and
	 derive the success flag from the flags set by cmpxchg/compare.  */
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_label (done_label);
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
			 const0_rtx);
    }
  else
    {
      /* Retry loop: cmpxchg failure loops back to loop_label.  */
      ix86_expand_setcc (target_bool, EQ, gen_rtx_REG (CCZmode, FLAGS_REG),
			 const0_rtx);
      emit_cmp_and_jump_insns (target_bool, const0_rtx, EQ, const0_rtx,
			       GET_MODE (target_bool), 1, loop_label,
			       profile_probability::guessed_never ());
      emit_jump_insn (gen_jump (done_label));
      emit_barrier ();

      /* If mem is not expected, pause and loop back.  */
      emit_label (cmp_label);
      emit_move_insn (target_val, new_mem);
      emit_insn (gen_pause ());
      emit_jump_insn (gen_jump (loop_label));
      emit_barrier ();
      emit_label (done_label);
    }

  *ptarget_bool = target_bool;
}
27014 :
27015 : /* Convert a BFmode VAL to SFmode without signaling sNaNs.
27016 : This is done by returning SF SUBREG of ((HI SUBREG) (VAL)) << 16. */
27017 :
27018 : rtx
27019 2832 : ix86_expand_fast_convert_bf_to_sf (rtx val)
27020 : {
27021 2832 : rtx op = gen_lowpart (HImode, val), ret;
27022 2832 : if (CONST_INT_P (op))
27023 : {
27024 514 : ret = simplify_const_unary_operation (FLOAT_EXTEND, SFmode,
27025 : val, BFmode);
27026 514 : if (ret)
27027 : return ret;
27028 : /* FLOAT_EXTEND simplification will fail if VAL is a sNaN. */
27029 1 : ret = gen_reg_rtx (SImode);
27030 1 : emit_move_insn (ret, GEN_INT (INTVAL (op) & 0xffff));
27031 1 : emit_insn (gen_ashlsi3 (ret, ret, GEN_INT (16)));
27032 1 : return gen_lowpart (SFmode, ret);
27033 : }
27034 :
27035 2318 : ret = gen_reg_rtx (SFmode);
27036 2318 : emit_insn (gen_extendbfsf2_1 (ret, force_reg (BFmode, val)));
27037 2318 : return ret;
27038 : }
27039 :
/* Expand the first comparison of a conditional-compare (ccmp) chain
   for APX CCMP.  CODE compares TREEOP0 with TREEOP1.  On success,
   stores the operand-preparation insns in *PREP_SEQ and the compare
   itself in *GEN_SEQ, and returns the comparison rtx; returns NULL_RTX
   (with any started sequence ended) when the comparison cannot be
   expressed as a single ccmp-compatible instruction.  */

rtx
ix86_gen_ccmp_first (rtx_insn **prep_seq, rtx_insn **gen_seq,
		     rtx_code code, tree treeop0, tree treeop1)
{
  if (!TARGET_APX_CCMP)
    return NULL_RTX;

  rtx op0, op1, res;
  machine_mode op_mode;

  start_sequence ();
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  op_mode = GET_MODE (op0);
  if (op_mode == VOIDmode)
    op_mode = GET_MODE (op1);

  /* We only support the following scalar comparisons that use just 1
     instruction: DI/SI/QI/HI/DF/SF/HF.
     Unordered/Ordered compare cannot be correctly identified by
     ccmp so they are not supported.  */
  if (!(op_mode == DImode || op_mode == SImode || op_mode == HImode
	|| op_mode == QImode || op_mode == DFmode || op_mode == SFmode
	|| op_mode == HFmode)
      || code == ORDERED
      || code == UNORDERED)
    {
      end_sequence ();
      return NULL_RTX;
    }

  /* Canonicalize the operands according to mode.  */
  if (SCALAR_INT_MODE_P (op_mode))
    {
      if (!nonimmediate_operand (op0, op_mode))
	op0 = force_reg (op_mode, op0);
      if (!x86_64_general_operand (op1, op_mode))
	op1 = force_reg (op_mode, op1);
    }
  else
    {
      /* op0/op1 can be canonicalized from expand_fp_compare, so
	 just adjust the code to make it generate a supported fp
	 condition.  */
      if (ix86_fp_compare_code_to_integer (code) == UNKNOWN)
	{
	  /* First try to split condition if we don't need to honor
	     NaNs, as the ORDERED/UNORDERED check always falls
	     through.  */
	  if (!HONOR_NANS (op_mode))
	    {
	      rtx_code first_code;
	      split_comparison (code, op_mode, &first_code, &code);
	    }
	  /* Otherwise try to swap the operand order and check if
	     the comparison is supported.  */
	  else
	    {
	      code = swap_condition (code);
	      std::swap (op0, op1);
	    }

	  if (ix86_fp_compare_code_to_integer (code) == UNKNOWN)
	    {
	      end_sequence ();
	      return NULL_RTX;
	    }
	}
    }

  /* Operand preparation is complete; capture it for the caller.  */
  *prep_seq = end_sequence ();

  start_sequence ();

  res = ix86_expand_compare (code, op0, op1);

  if (!res)
    {
      end_sequence ();
      return NULL_RTX;
    }
  *gen_seq = end_sequence ();

  return res;
}
27125 :
/* Expand a subsequent comparison of a ccmp chain for APX CCMP.
   PREV is the rtx for the previous comparison in the chain; CMP_CODE
   compares TREEOP0 with TREEOP1, and BIT_CODE (AND or IOR) is how the
   two results combine.  Appends to *PREP_SEQ / *GEN_SEQ and returns
   the combined comparison rtx, or NULL_RTX when the ccmp pattern
   cannot be used.  */

rtx
ix86_gen_ccmp_next (rtx_insn **prep_seq, rtx_insn **gen_seq, rtx prev,
		    rtx_code cmp_code, tree treeop0, tree treeop1,
		    rtx_code bit_code)
{
  if (!TARGET_APX_CCMP)
    return NULL_RTX;

  rtx op0, op1, target;
  machine_mode op_mode, cmp_mode, cc_mode = CCmode;
  int unsignedp = TYPE_UNSIGNED (TREE_TYPE (treeop0));
  insn_code icode;
  rtx_code prev_code;
  struct expand_operand ops[5];
  int dfv;

  /* Exit early for non integer modes to avoid O(n^2) part of
     expand_operands.  */
  cmp_mode = op_mode = TYPE_MODE (TREE_TYPE (treeop0));

  if (!(op_mode == DImode || op_mode == SImode || op_mode == HImode
	|| op_mode == QImode))
    return NULL_RTX;

  push_to_sequence (*prep_seq);
  expand_operands (treeop0, treeop1, NULL_RTX, &op0, &op1, EXPAND_NORMAL);

  icode = code_for_ccmp (op_mode);

  op0 = prepare_operand (icode, op0, 2, op_mode, cmp_mode, unsignedp);
  op1 = prepare_operand (icode, op1, 3, op_mode, cmp_mode, unsignedp);
  if (!op0 || !op1)
    {
      end_sequence ();
      return NULL_RTX;
    }

  *prep_seq = end_sequence ();

  target = gen_rtx_REG (cc_mode, FLAGS_REG);
  /* Default-flags value encoding the desired condition.  */
  dfv = ix86_get_flags_cc ((rtx_code) cmp_code);

  prev_code = GET_CODE (prev);
  /* Fixup FP compare code here.  */
  if (GET_MODE (XEXP (prev, 0)) == CCFPmode)
    prev_code = ix86_fp_compare_code_to_integer (prev_code);

  /* ccmp only evaluates when the previous condition holds, so for an
     IOR combination invert the previous condition; for AND flip the
     default-flags value instead.  */
  if (bit_code != AND)
    prev_code = reverse_condition (prev_code);
  else
    dfv = (int)(dfv ^ 1);

  prev = gen_rtx_fmt_ee (prev_code, VOIDmode, XEXP (prev, 0),
			 const0_rtx);

  create_fixed_operand (&ops[0], target);
  create_fixed_operand (&ops[1], prev);
  create_fixed_operand (&ops[2], op0);
  create_fixed_operand (&ops[3], op1);
  create_fixed_operand (&ops[4], GEN_INT (dfv));

  push_to_sequence (*gen_seq);
  if (!maybe_expand_insn (icode, 5, ops))
    {
      end_sequence ();
      return NULL_RTX;
    }

  *gen_seq = end_sequence ();

  return gen_rtx_fmt_ee ((rtx_code) cmp_code, VOIDmode, target, const0_rtx);
}
27197 :
27198 : /* Attempt to convert a CONST_VECTOR into a bcst_mem_operand.
27199 : Returns NULL_RTX if X is cannot be expressed as a suitable
27200 : VEC_DUPLICATE in mode MODE. */
27201 :
27202 : static rtx
27203 48 : ix86_gen_bcst_mem (machine_mode mode, rtx x)
27204 : {
27205 48 : if (!TARGET_AVX512F
27206 48 : || !CONST_VECTOR_P (x)
27207 64 : || (!TARGET_AVX512VL && GET_MODE_SIZE (mode) != 64)
27208 147 : || !VALID_BCST_MODE_P (GET_MODE_INNER (mode))
27209 : /* Disallow HFmode broadcast. */
27210 126 : || GET_MODE_SIZE (GET_MODE_INNER (mode)) < 4)
27211 : return NULL_RTX;
27212 :
27213 21 : rtx cst = CONST_VECTOR_ELT (x, 0);
27214 21 : if (!CONST_SCALAR_INT_P (cst)
27215 15 : && !CONST_DOUBLE_P (cst)
27216 0 : && !CONST_FIXED_P (cst))
27217 : return NULL_RTX;
27218 :
27219 21 : int n_elts = GET_MODE_NUNITS (mode);
27220 42 : if (CONST_VECTOR_NUNITS (x) != n_elts)
27221 : return NULL_RTX;
27222 :
27223 150 : for (int i = 1; i < n_elts; i++)
27224 129 : if (!rtx_equal_p (cst, CONST_VECTOR_ELT (x, i)))
27225 : return NULL_RTX;
27226 :
27227 42 : rtx mem = force_const_mem (GET_MODE_INNER (mode), cst);
27228 21 : return gen_rtx_VEC_DUPLICATE (mode, validize_mem (mem));
27229 : }
27230 :
/* Determine the ternlog immediate index that implements 3-operand
   ternary logic expression OP.  This uses and modifies the 3 element
   array ARGS to record and check the leaves, either 3 REGs, or 2 REGs
   and MEM.  Returns an index between 0 and 255 for a valid ternlog,
   or -1 if the expression isn't suitable.

   Leaves are assigned truth-table masks as they are discovered:
   args[0] -> 0xf0, args[1] -> 0xcc, args[2] -> 0xaa; logical
   operators then combine the masks of their sub-expressions.  */

int
ix86_ternlog_idx (rtx op, rtx *args)
{
  int idx0, idx1;

  if (!op)
    return -1;

  switch (GET_CODE (op))
    {
    case SUBREG:
      if (!register_operand (op, GET_MODE (op)))
	return -1;
      /* FALLTHRU */

    case REG:
      /* Match against a previously recorded leaf, or record OP in the
	 first free slot.  */
      if (!args[0])
	{
	  args[0] = op;
	  return 0xf0;
	}
      if (rtx_equal_p (op, args[0]))
	return 0xf0;
      if (!args[1])
	{
	  args[1] = op;
	  return 0xcc;
	}
      if (rtx_equal_p (op, args[1]))
	return 0xcc;
      if (!args[2])
	{
	  args[2] = op;
	  return 0xaa;
	}
      if (rtx_equal_p (op, args[2]))
	return 0xaa;
      /* More than three distinct register leaves: not a ternlog.  */
      return -1;

    case VEC_DUPLICATE:
      if (!bcst_mem_operand (op, GET_MODE (op)))
	return -1;
      goto do_mem_operand;

    case MEM:
      if (!memory_operand (op, GET_MODE (op)))
	return -1;
      if (MEM_P (op)
	  && MEM_VOLATILE_P (op)
	  && !volatile_ok)
	return -1;
      /* FALLTHRU */

    case CONST_VECTOR:
    do_mem_operand:
      /* Memory-like leaves prefer the args[2] slot, since ternlog
	 takes its memory operand there.  */
      if (!args[2])
	{
	  args[2] = op;
	  return 0xaa;
	}
      /* Maximum of one volatile memory reference per expression.  */
      if (side_effects_p (op))
	return -1;
      if (rtx_equal_p (op, args[2]))
	return 0xaa;
      /* Check if CONST_VECTOR is the ones-complement of args[2].  */
      if (CONST_VECTOR_P (op)
	  && CONST_VECTOR_P (args[2])
	  && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
							  op, GET_MODE (op)),
			  args[2]))
	return 0x55;
      if (!args[0])
	{
	  args[0] = op;
	  return 0xf0;
	}
      if (rtx_equal_p (op, args[0]))
	return 0xf0;
      /* Check if CONST_VECTOR is the ones-complement of args[0].  */
      if (CONST_VECTOR_P (op)
	  && CONST_VECTOR_P (args[0])
	  && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
							  op, GET_MODE (op)),
			  args[0]))
	return 0x0f;
      if (!args[1])
	{
	  args[1] = op;
	  return 0xcc;
	}
      if (rtx_equal_p (op, args[1]))
	return 0xcc;
      /* Check if CONST_VECTOR is the ones-complement of args[1].  */
      if (CONST_VECTOR_P (op)
	  && CONST_VECTOR_P (args[1])
	  && rtx_equal_p (simplify_const_unary_operation (NOT, GET_MODE (op),
							  op, GET_MODE (op)),
			  args[1]))
	return 0x33;
      return -1;

    case NOT:
      /* NOT complements the truth table.  */
      idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
      return (idx0 >= 0) ? idx0 ^ 0xff : -1;

    case AND:
      idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
      if (idx0 < 0)
	return -1;
      idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
      return (idx1 >= 0) ? idx0 & idx1 : -1;

    case IOR:
      idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
      if (idx0 < 0)
	return -1;
      idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
      return (idx1 >= 0) ? idx0 | idx1 : -1;

    case XOR:
      idx0 = ix86_ternlog_idx (XEXP (op, 0), args);
      if (idx0 < 0)
	return -1;
      /* XOR with all-ones is a complement; don't consume a leaf slot.  */
      if (vector_all_ones_operand (XEXP (op, 1), GET_MODE (op)))
	return idx0 ^ 0xff;
      idx1 = ix86_ternlog_idx (XEXP (op, 1), args);
      return (idx1 >= 0) ? idx0 ^ idx1 : -1;

    case UNSPEC:
      /* An existing VTERNLOG whose operands appear in canonical order
	 contributes its own immediate.  */
      if (XINT (op, 1) != UNSPEC_VTERNLOG
	  || XVECLEN (op, 0) != 4
	  || !CONST_INT_P (XVECEXP (op, 0, 3)))
	return -1;

      /* TODO: Handle permuted operands.  */
      if (ix86_ternlog_idx (XVECEXP (op, 0, 0), args) != 0xf0
	  || ix86_ternlog_idx (XVECEXP (op, 0, 1), args) != 0xcc
	  || ix86_ternlog_idx (XVECEXP (op, 0, 2), args) != 0xaa)
	return -1;
      return INTVAL (XVECEXP (op, 0, 3));

    default:
      return -1;
    }
}
27383 :
27384 : /* Return TRUE if OP (in mode MODE) is the leaf of a ternary logic
27385 : expression, such as a register or a memory reference. */
27386 :
27387 : bool
27388 3308975 : ix86_ternlog_leaf_p (rtx op, machine_mode mode)
27389 : {
27390 : /* We can't use memory_operand here, as it may return a different
27391 : value before and after reload (for volatile MEMs) which creates
27392 : problems splitting instructions. */
27393 3308975 : return register_operand (op, mode)
27394 723565 : || MEM_P (op)
27395 374707 : || CONST_VECTOR_P (op)
27396 3582958 : || bcst_mem_operand (op, mode);
27397 : }
27398 :
27399 : /* Test whether OP is a 3-operand ternary logic expression suitable
27400 : for use in a ternlog instruction. */
27401 :
27402 : bool
27403 2196426 : ix86_ternlog_operand_p (rtx op)
27404 : {
27405 2196426 : rtx op0, op1;
27406 2196426 : rtx args[3];
27407 :
27408 2196426 : args[0] = NULL_RTX;
27409 2196426 : args[1] = NULL_RTX;
27410 2196426 : args[2] = NULL_RTX;
27411 2196426 : int idx = ix86_ternlog_idx (op, args);
27412 2196426 : if (idx < 0)
27413 : return false;
27414 :
27415 : /* Don't match simple (binary or unary) expressions. */
27416 1785438 : machine_mode mode = GET_MODE (op);
27417 1785438 : switch (GET_CODE (op))
27418 : {
27419 819686 : case AND:
27420 819686 : op0 = XEXP (op, 0);
27421 819686 : op1 = XEXP (op, 1);
27422 :
27423 : /* Prefer pand. */
27424 819686 : if (ix86_ternlog_leaf_p (op0, mode)
27425 819686 : && ix86_ternlog_leaf_p (op1, mode))
27426 : return false;
27427 : /* Prefer pandn. */
27428 103796 : if (GET_CODE (op0) == NOT
27429 73091 : && register_operand (XEXP (op0, 0), mode)
27430 173296 : && ix86_ternlog_leaf_p (op1, mode))
27431 : return false;
27432 : break;
27433 :
27434 618722 : case IOR:
27435 : /* Prefer por. */
27436 618722 : if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
27437 618722 : && ix86_ternlog_leaf_p (XEXP (op, 1), mode))
27438 : return false;
27439 : break;
27440 :
27441 317481 : case XOR:
27442 317481 : op1 = XEXP (op, 1);
27443 : /* Prefer pxor, or one_cmpl<vmode>2. */
27444 317481 : if (ix86_ternlog_leaf_p (XEXP (op, 0), mode)
27445 317481 : && ix86_ternlog_leaf_p (XEXP (op, 1), mode))
27446 : return false;
27447 : break;
27448 :
27449 : default:
27450 : break;
27451 : }
27452 : return true;
27453 : }
27454 :
/* Helper function for ix86_expand_ternlog: emit a simple two-operand
   logic operation CODE (AND/IOR/XOR) of OP0 and OP1 in MODE into
   TARGET and return TARGET.  Operands are converted to MODE, constant
   vectors are spilled to the constant pool, and at least one operand
   is forced into a register as the patterns require.  */
static rtx
ix86_expand_ternlog_binop (enum rtx_code code, machine_mode mode,
			   rtx op0, rtx op1, rtx target)
{
  if (GET_MODE (op0) != mode)
    op0 = gen_lowpart (mode, op0);
  if (GET_MODE (op1) != mode)
    op1 = gen_lowpart (mode, op1);

  if (CONST_VECTOR_P (op0))
    op0 = validize_mem (force_const_mem (mode, op0));
  if (CONST_VECTOR_P (op1))
    op1 = validize_mem (force_const_mem (mode, op1));

  if (!register_operand (op0, mode))
    {
      if (!register_operand (op1, mode))
	{
	  /* We can't use force_reg (op0, mode) here: OP0's mode may
	     differ, so load it with an explicit move instead.  */
	  rtx reg = gen_reg_rtx (mode);
	  emit_move_insn (reg, op0);
	  op0 = reg;
	}
      else
	/* Commutative operation: put the register operand first.  */
	std::swap (op0, op1);
    }
  rtx ops[3] = { target, op0, op1 };
  ix86_expand_vector_logical_operator (code, mode, ops);
  return target;
}
27486 :
27487 :
27488 : /* Helper function for ix86_expand_ternlog. */
27489 : static rtx
27490 0 : ix86_expand_ternlog_andnot (machine_mode mode, rtx op0, rtx op1, rtx target)
27491 : {
27492 0 : if (GET_MODE (op0) != mode)
27493 0 : op0 = gen_lowpart (mode, op0);
27494 0 : op0 = gen_rtx_NOT (mode, op0);
27495 0 : if (GET_MODE (op1) != mode)
27496 0 : op1 = gen_lowpart (mode, op1);
27497 0 : if (CONST_VECTOR_P (op1))
27498 0 : op1 = validize_mem (force_const_mem (mode, op1));
27499 0 : emit_move_insn (target, gen_rtx_AND (mode, op0, op1));
27500 0 : return target;
27501 : }
27502 :
/* Expand a 3-operand ternary logic expression.  Return TARGET.
   MODE is the vector mode; OP0/OP1/OP2 are the three leaves (any may
   be NULL when unused by IDX); IDX is the 8-bit ternlog truth table
   (op0 -> 0xf0, op1 -> 0xcc, op2 -> 0xaa).  TARGET may be NULL, in
   which case a fresh register is allocated.  Degenerate truth tables
   are lowered to cheaper moves/logic ops; otherwise a VTERNLOG unspec
   is emitted.  */
rtx
ix86_expand_ternlog (machine_mode mode, rtx op0, rtx op1, rtx op2, int idx,
		     rtx target)
{
  rtx tmp0, tmp1, tmp2;

  if (!target)
    target = gen_reg_rtx (mode);

  /* Canonicalize ternlog index for degenerate (duplicated) operands.  */
  if (rtx_equal_p (op0, op1) && rtx_equal_p (op0, op2))
    switch (idx & 0x81)
      {
      case 0x00:
	idx = 0x00;
	break;
      case 0x01:
	idx = 0x0f;
	break;
      case 0x80:
	idx = 0xf0;
	break;
      case 0x81:
	idx = 0xff;
	break;
      }

  /* Special cases that are cheaper than a vpternlog.  Each arm checks
     that the unused operands have no side effects before dropping
     them.  */
  switch (idx & 0xff)
    {
    case 0x00:
      if ((!op0 || !side_effects_p (op0))
	  && (!op1 || !side_effects_p (op1))
	  && (!op2 || !side_effects_p (op2)))
	{
	  emit_move_insn (target, CONST0_RTX (mode));
	  return target;
	}
      break;

    case 0x0a: /* ~a&c */
      if ((!op1 || !side_effects_p (op1))
	  && op0 && register_operand (op0, mode)
	  && op2 && ix86_ternlog_leaf_p (op2, mode))
	return ix86_expand_ternlog_andnot (mode, op0, op2, target);
      break;

    case 0x0c: /* ~a&b */
      if ((!op2 || !side_effects_p (op2))
	  && op0 && register_operand (op0, mode)
	  && op1 && ix86_ternlog_leaf_p (op1, mode))
	return ix86_expand_ternlog_andnot (mode, op0, op1, target);
      break;

    case 0x0f:  /* ~a */
      if ((!op1 || !side_effects_p (op1))
	  && (!op2 || !side_effects_p (op2))
	  && op0)
	{
	  emit_move_insn (target, gen_rtx_XOR (mode, op0, CONSTM1_RTX (mode)));
	  return target;
	}
      break;

    case 0x22: /* ~b&c */
      if ((!op0 || !side_effects_p (op0))
	  && op1 && register_operand (op1, mode)
	  && op2 && ix86_ternlog_leaf_p (op2, mode))
	return ix86_expand_ternlog_andnot (mode, op1, op2, target);
      break;

    case 0x30: /* ~b&a */
      if ((!op2 || !side_effects_p (op2))
	  && op0 && ix86_ternlog_leaf_p (op0, mode)
	  && op1 && register_operand (op1, mode))
	return ix86_expand_ternlog_andnot (mode, op1, op0, target);
      break;

    case 0x33: /* ~b */
      if ((!op0 || !side_effects_p (op0))
	  && (!op2 || !side_effects_p (op2))
	  && op1)
	{
	  emit_move_insn (target, gen_rtx_XOR (mode, op1, CONSTM1_RTX (mode)));
	  return target;
	}
      break;

    case 0x3c: /* a^b */
      if (op0 && ix86_ternlog_leaf_p (op0, mode)
	  && op1 && ix86_ternlog_leaf_p (op1, mode)
	  && (!op2 || !side_effects_p (op2)))
	return ix86_expand_ternlog_binop (XOR, mode, op0, op1, target);
      break;

    case 0x44: /* ~c&b */
      if ((!op0 || !side_effects_p (op0))
	  && op1 && ix86_ternlog_leaf_p (op1, mode)
	  && op2 && register_operand (op2, mode))
	return ix86_expand_ternlog_andnot (mode, op2, op1, target);
      break;

    case 0x50: /* ~c&a */
      if ((!op1 || !side_effects_p (op1))
	  && op0 && ix86_ternlog_leaf_p (op0, mode)
	  && op2 && register_operand (op2, mode))
	return ix86_expand_ternlog_andnot (mode, op2, op0, target);
      break;

    case 0x55: /* ~c */
      if ((!op0 || !side_effects_p (op0))
	  && (!op1 || !side_effects_p (op1))
	  && op2)
	{
	  emit_move_insn (target, gen_rtx_XOR (mode, op2, CONSTM1_RTX (mode)));
	  return target;
	}
      break;

    case 0x5a: /* a^c */
      if (op0 && ix86_ternlog_leaf_p (op0, mode)
	  && op2 && ix86_ternlog_leaf_p (op2, mode)
	  && (!op1 || !side_effects_p (op1)))
	return ix86_expand_ternlog_binop (XOR, mode, op0, op2, target);
      break;

    case 0x66: /* b^c */
      if ((!op0 || !side_effects_p (op0))
	  && op1 && ix86_ternlog_leaf_p (op1, mode)
	  && op2 && ix86_ternlog_leaf_p (op2, mode))
	return ix86_expand_ternlog_binop (XOR, mode, op1, op2, target);
      break;

    case 0x88: /* b&c */
      if ((!op0 || !side_effects_p (op0))
	  && op1 && ix86_ternlog_leaf_p (op1, mode)
	  && op2 && ix86_ternlog_leaf_p (op2, mode))
	return ix86_expand_ternlog_binop (AND, mode, op1, op2, target);
      break;

    case 0xa0: /* a&c */
      if ((!op1 || !side_effects_p (op1))
	  && op0 && ix86_ternlog_leaf_p (op0, mode)
	  && op2 && ix86_ternlog_leaf_p (op2, mode))
	return ix86_expand_ternlog_binop (AND, mode, op0, op2, target);
      break;

    case 0xaa: /* c */
      if ((!op0 || !side_effects_p (op0))
	  && (!op1 || !side_effects_p (op1))
	  && op2)
	{
	  if (GET_MODE (op2) != mode)
	    op2 = gen_lowpart (mode, op2);
	  emit_move_insn (target, op2);
	  return target;
	}
      break;

    case 0xc0: /* a&b */
      if (op0 && ix86_ternlog_leaf_p (op0, mode)
	  && op1 && ix86_ternlog_leaf_p (op1, mode)
	  && (!op2 || !side_effects_p (op2)))
	return ix86_expand_ternlog_binop (AND, mode, op0, op1, target);
      break;

    case 0xcc: /* b */
      if ((!op0 || !side_effects_p (op0))
	  && op1
	  && (!op2 || !side_effects_p (op2)))
	{
	  if (GET_MODE (op1) != mode)
	    op1 = gen_lowpart (mode, op1);
	  emit_move_insn (target, op1);
	  return target;
	}
      break;

    case 0xee: /* b|c */
      if ((!op0 || !side_effects_p (op0))
	  && op1 && ix86_ternlog_leaf_p (op1, mode)
	  && op2 && ix86_ternlog_leaf_p (op2, mode))
	return ix86_expand_ternlog_binop (IOR, mode, op1, op2, target);
      break;

    case 0xf0: /* a */
      if (op0
	  && (!op1 || !side_effects_p (op1))
	  && (!op2 || !side_effects_p (op2)))
	{
	  if (GET_MODE (op0) != mode)
	    op0 = gen_lowpart (mode, op0);
	  emit_move_insn (target, op0);
	  return target;
	}
      break;

    case 0xfa: /* a|c */
      if (op0 && ix86_ternlog_leaf_p (op0, mode)
	  && op2 && ix86_ternlog_leaf_p (op2, mode)
	  && (!op1 || !side_effects_p (op1)))
	return ix86_expand_ternlog_binop (IOR, mode, op0, op2, target);
      break;

    case 0xfc: /* a|b */
      if (op0 && ix86_ternlog_leaf_p (op0, mode)
	  && op1 && ix86_ternlog_leaf_p (op1, mode)
	  && (!op2 || !side_effects_p (op2)))
	return ix86_expand_ternlog_binop (IOR, mode, op0, op1, target);
      break;

    case 0xff:
      if ((!op0 || !side_effects_p (op0))
	  && (!op1 || !side_effects_p (op1))
	  && (!op2 || !side_effects_p (op2)))
	{
	  emit_move_insn (target, CONSTM1_RTX (mode));
	  return target;
	}
      break;
    }

  /* General case: canonicalize the three operands into registers (or,
     for op2, a broadcast/memory operand) and emit a VTERNLOG unspec.  */
  if (!register_operand (op0, mode))
    {
      /* We can't use force_reg (mode, op0).  */
      tmp0 = gen_reg_rtx (GET_MODE (op0));
      emit_move_insn (tmp0,op0);
    }
  else
    tmp0 = op0;
  if (GET_MODE (tmp0) != mode)
    tmp0 = gen_lowpart (mode, tmp0);

  /* Reuse tmp0 when op1 duplicates op0.  */
  if (!op1 || rtx_equal_p (op0, op1))
    tmp1 = copy_rtx (tmp0);
  else if (!register_operand (op1, mode))
    {
      /* We can't use force_reg (mode, op1).  */
      tmp1 = gen_reg_rtx (GET_MODE (op1));
      emit_move_insn (tmp1, op1);
    }
  else
    tmp1 = op1;
  if (GET_MODE (tmp1) != mode)
    tmp1 = gen_lowpart (mode, tmp1);

  if (!op2 || rtx_equal_p (op0, op2))
    tmp2 = copy_rtx (tmp0);
  else if (rtx_equal_p (op1, op2))
    tmp2 = copy_rtx (tmp1);
  else if (CONST_VECTOR_P (op2))
    {
      /* Try to express the constant as an embedded broadcast, possibly
	 after reinterpreting the vector in a 32- or 64-bit element
	 mode; fall back to a constant-pool load.  */
      if (GET_MODE (op2) != mode)
	op2 = gen_lowpart (mode, op2);
      tmp2 = ix86_gen_bcst_mem (mode, op2);
      if (!tmp2)
	{
	  machine_mode bcst32_mode = mode;
	  machine_mode bcst64_mode = mode;
	  switch (mode)
	    {
	    case V1TImode:
	    case V4SImode:
	    case V4SFmode:
	    case V8HImode:
	    case V16QImode:
	      bcst32_mode = V4SImode;
	      bcst64_mode = V2DImode;
	      break;

	    case V2TImode:
	    case V8SImode:
	    case V8SFmode:
	    case V16HImode:
	    case V32QImode:
	      bcst32_mode = V8SImode;
	      bcst64_mode = V4DImode;
	      break;

	    case V4TImode:
	    case V16SImode:
	    case V16SFmode:
	    case V32HImode:
	    case V64QImode:
	      bcst32_mode = V16SImode;
	      bcst64_mode = V8DImode;
	      break;

	    default:
	      break;
	    }

	  /* Retry the ternlog in the 32-bit-element mode when the
	     constant broadcasts there.  */
	  if (bcst32_mode != mode)
	    {
	      tmp2 = gen_lowpart (bcst32_mode, op2);
	      if (ix86_gen_bcst_mem (bcst32_mode, tmp2))
		{
		  tmp2 = ix86_expand_ternlog (bcst32_mode,
					      gen_lowpart (bcst32_mode, tmp0),
					      gen_lowpart (bcst32_mode, tmp1),
					      tmp2, idx, NULL_RTX);
		  emit_move_insn (target, gen_lowpart (mode, tmp2));
		  return target;
		}
	    }

	  /* Likewise for the 64-bit-element mode.  */
	  if (bcst64_mode != mode)
	    {
	      tmp2 = gen_lowpart (bcst64_mode, op2);
	      if (ix86_gen_bcst_mem (bcst64_mode, tmp2))
		{
		  tmp2 = ix86_expand_ternlog (bcst64_mode,
					      gen_lowpart (bcst64_mode, tmp0),
					      gen_lowpart (bcst64_mode, tmp1),
					      tmp2, idx, NULL_RTX);
		  emit_move_insn (target, gen_lowpart (mode, tmp2));
		  return target;
		}
	    }

	  /* Constant-pool fallback; prefer materializing a broadcast
	     into a register when the constant allows it.  */
	  tmp2 = force_const_mem (mode, op2);
	  rtx bcast = ix86_broadcast_from_constant (mode, tmp2);
	  tmp2 = validize_mem (tmp2);
	  if (bcast)
	    {
	      rtx reg2 = gen_reg_rtx (mode);
	      bool ok = ix86_expand_vector_init_duplicate (false, mode,
							   reg2, bcast);
	      if (ok)
		tmp2 = reg2;
	    }
	}
    }
  else
    tmp2 = op2;
  if (GET_MODE (tmp2) != mode)
    tmp2 = gen_lowpart (mode, tmp2);
  /* Some memory_operands are not vector_memory_operands.  */
  if (!bcst_vector_operand (tmp2, mode))
    tmp2 = force_reg (mode, tmp2);

  rtvec vec = gen_rtvec (4, tmp0, tmp1, tmp2, GEN_INT (idx));
  emit_move_insn (target, gen_rtx_UNSPEC (mode, vec, UNSPEC_VTERNLOG));
  return target;
}
27848 :
/* GF2P8AFFINEQB matrices to implement per-byte shift and rotate.
   Each 64-bit value is an 8x8 bit-matrix (one row per byte lane bit);
   index is the shift/rotate count 1..7, entry 0 is unused.  */

static const uint64_t matrix_ashift[8] =
{
  0,
  0x0001020408102040, /* 1 l */
  0x0000010204081020, /* 2 l */
  0x0000000102040810, /* 3 l */
  0x0000000001020408, /* 4 l */
  0x0000000000010204, /* 5 l */
  0x0000000000000102, /* 6 l */
  0x0000000000000001  /* 7 l */
};

static const uint64_t matrix_lshiftrt[8] =
{
  0,
  0x0204081020408000, /* 1 r */
  0x0408102040800000, /* 2 r */
  0x0810204080000000, /* 3 r */
  0x1020408000000000, /* 4 r */
  0x2040800000000000, /* 5 r */
  0x4080000000000000, /* 6 r */
  0x8000000000000000  /* 7 r */
};

/* Arithmetic right shift: like matrix_lshiftrt but replicating the
   sign bit into the vacated positions.  */
static const uint64_t matrix_ashiftrt[8] =
{
  0,
  0x0204081020408080, /* 1 r */
  0x0408102040808080, /* 2 r */
  0x0810204080808080, /* 3 r */
  0x1020408080808080, /* 4 r */
  0x2040808080808080, /* 5 r */
  0x4080808080808080, /* 6 r */
  0x8080808080808080  /* 7 r */
};

static const uint64_t matrix_rotate[8] =
{
  0,
  0x8001020408102040, /* 1 rol8 */
  0x4080010204081020, /* 2 rol8 */
  0x2040800102040810, /* 3 rol8 */
  0x1020408001020408, /* 4 rol8 */
  0x0810204080010204, /* 5 rol8 */
  0x0408102040800102, /* 6 rol8 */
  0x0204081020408001  /* 7 rol8 */
};

static const uint64_t matrix_rotatert[8] =
{
  0,
  0x0204081020408001, /* 1 ror8 */
  0x0408102040800102, /* 2 ror8 */
  0x0810204080010204, /* 3 ror8 */
  0x1020408001020408, /* 4 ror8 */
  0x2040800102040810, /* 5 ror8 */
  0x4080010204081020, /* 6 ror8 */
  0x8001020408102040  /* 7 ror8 */
};
27910 :
27911 : /* Return rtx to load a 64bit GF2P8AFFINE GP(2) matrix implementing a shift
27912 : for CODE and shift count COUNT into register with vector of size of SRC. */
27913 :
27914 : rtx
27915 189 : ix86_vgf2p8affine_shift_matrix (rtx src, rtx count, enum rtx_code code)
27916 : {
27917 189 : machine_mode mode = GET_MODE (src);
27918 189 : const uint64_t *matrix;
27919 189 : unsigned shift = INTVAL (count) & 7;
27920 189 : gcc_assert (shift > 0 && shift < 8);
27921 :
27922 189 : switch (code)
27923 : {
27924 : case ASHIFT:
27925 : matrix = matrix_ashift;
27926 : break;
27927 26 : case ASHIFTRT:
27928 26 : matrix = matrix_ashiftrt;
27929 26 : break;
27930 28 : case LSHIFTRT:
27931 28 : matrix = matrix_lshiftrt;
27932 28 : break;
27933 32 : case ROTATE:
27934 32 : matrix = matrix_rotate;
27935 32 : break;
27936 33 : case ROTATERT:
27937 33 : matrix = matrix_rotatert;
27938 33 : break;
27939 0 : default:
27940 0 : gcc_unreachable ();
27941 : }
27942 :
27943 189 : int nelts = GET_MODE_NUNITS (mode);
27944 189 : rtvec vec = rtvec_alloc (nelts);
27945 189 : uint64_t ma = matrix[shift];
27946 7741 : for (int i = 0; i < nelts; i++)
27947 7552 : RTVEC_ELT (vec, i) = gen_int_mode ((ma >> ((i % 8) * 8)) & 0xff, QImode);
27948 :
27949 189 : return force_reg (mode, gen_rtx_CONST_VECTOR (mode, vec));
27950 : }
27951 :
27952 : /* Trunc a vector to a narrow vector, like v4di -> v4si. */
27953 :
27954 : void
27955 63 : ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input, machine_mode cvt_mode)
27956 : {
27957 63 : machine_mode out_mode = GET_MODE (output);
27958 63 : machine_mode in_mode = GET_MODE (input);
27959 63 : int len = GET_MODE_SIZE (in_mode);
27960 252 : gcc_assert (len == GET_MODE_SIZE (cvt_mode)
27961 : && GET_MODE_INNER (out_mode) == GET_MODE_INNER (cvt_mode)
27962 : && (REG_P (input) || SUBREG_P (input)));
27963 63 : scalar_mode inner_out_mode = GET_MODE_INNER (out_mode);
27964 126 : int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
27965 63 : int out_innersize = GET_MODE_SIZE (inner_out_mode);
27966 :
27967 63 : struct expand_vec_perm_d d;
27968 63 : d.target = gen_reg_rtx (cvt_mode);
27969 63 : d.op0 = lowpart_subreg (cvt_mode, force_reg(in_mode, input), in_mode);
27970 63 : d.op1 = d.op0;
27971 63 : d.vmode = cvt_mode;
27972 63 : d.nelt = GET_MODE_NUNITS (cvt_mode);
27973 63 : d.testing_p = false;
27974 63 : d.one_operand_p = true;
27975 :
27976 : /* Init perm. Put the needed bits of input in order and
27977 : fill the rest of bits by default. */
27978 687 : for (int i = 0; i < d.nelt; ++i)
27979 : {
27980 624 : d.perm[i] = i;
27981 1248 : if (i < GET_MODE_NUNITS (out_mode))
27982 246 : d.perm[i] = i * (in_innersize / out_innersize);
27983 : }
27984 :
27985 63 : bool ok = ix86_expand_vec_perm_const_1(&d);
27986 63 : gcc_assert (ok);
27987 63 : emit_move_insn (output, gen_lowpart (out_mode, d.target));
27988 63 : }
27989 :
27990 : /* Implement truncv8sfv8bf2 with vector permutation. */
27991 : void
27992 8 : ix86_expand_vector_sf2bf_with_vec_perm (rtx dest, rtx src)
27993 : {
27994 8 : machine_mode vperm_mode, src_mode = GET_MODE (src);
27995 8 : switch (src_mode)
27996 : {
27997 : case V16SFmode:
27998 : vperm_mode = V32BFmode;
27999 : break;
28000 2 : case V8SFmode:
28001 2 : vperm_mode = V16BFmode;
28002 2 : break;
28003 4 : case V4SFmode:
28004 4 : vperm_mode = V8BFmode;
28005 4 : break;
28006 0 : default:
28007 0 : gcc_unreachable ();
28008 : }
28009 :
28010 8 : int nelt = GET_MODE_NUNITS (vperm_mode);
28011 8 : vec_perm_builder sel (nelt, nelt, 1);
28012 8 : sel.quick_grow (nelt);
28013 136 : for (int i = 0; i != nelt; i++)
28014 128 : sel[i] = (2 * i + 1) % nelt;
28015 16 : vec_perm_indices indices (sel, 1, nelt);
28016 :
28017 8 : rtx target = gen_reg_rtx (vperm_mode);
28018 8 : rtx op0 = lowpart_subreg (vperm_mode,
28019 : force_reg (src_mode, src),
28020 : src_mode);
28021 8 : bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
28022 : target, op0, op0, indices);
28023 8 : gcc_assert (ok);
28024 8 : emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
28025 8 : }
28026 :
28027 : /* Implement extendv8bf2v8sf2 with vector permutation. */
28028 : void
28029 8 : ix86_expand_vector_bf2sf_with_vec_perm (rtx dest, rtx src)
28030 : {
28031 8 : machine_mode vperm_mode, src_mode = GET_MODE (src);
28032 8 : switch (src_mode)
28033 : {
28034 : case V16BFmode:
28035 : vperm_mode = V32BFmode;
28036 : break;
28037 2 : case V8BFmode:
28038 2 : vperm_mode = V16BFmode;
28039 2 : break;
28040 4 : case V4BFmode:
28041 4 : vperm_mode = V8BFmode;
28042 4 : break;
28043 0 : default:
28044 0 : gcc_unreachable ();
28045 : }
28046 :
28047 8 : int nelt = GET_MODE_NUNITS (vperm_mode);
28048 8 : vec_perm_builder sel (nelt, nelt, 1);
28049 8 : sel.quick_grow (nelt);
28050 136 : for (int i = 0, k = 0, j = nelt; i != nelt; i++)
28051 128 : sel[i] = i & 1 ? j++ : k++;
28052 :
28053 16 : vec_perm_indices indices (sel, 2, nelt);
28054 :
28055 8 : rtx target = gen_reg_rtx (vperm_mode);
28056 8 : rtx op1 = lowpart_subreg (vperm_mode,
28057 : force_reg (src_mode, src),
28058 : src_mode);
28059 8 : rtx op0 = CONST0_RTX (vperm_mode);
28060 8 : bool ok = targetm.vectorize.vec_perm_const (vperm_mode, vperm_mode,
28061 : target, op0, op1, indices);
28062 8 : gcc_assert (ok);
28063 8 : emit_move_insn (dest, lowpart_subreg (GET_MODE (dest), target, vperm_mode));
28064 8 : }
28065 :
28066 :
28067 : #include "gt-i386-expand.h"
|