Line data Source code
1 : /* Copyright (C) 1988-2026 Free Software Foundation, Inc.
2 :
3 : This file is part of GCC.
4 :
5 : GCC is free software; you can redistribute it and/or modify
6 : it under the terms of the GNU General Public License as published by
7 : the Free Software Foundation; either version 3, or (at your option)
8 : any later version.
9 :
10 : GCC is distributed in the hope that it will be useful,
11 : but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : GNU General Public License for more details.
14 :
15 : You should have received a copy of the GNU General Public License
16 : along with GCC; see the file COPYING3. If not see
17 : <http://www.gnu.org/licenses/>. */
18 :
19 : #define IN_TARGET_CODE 1
20 :
21 : #include "config.h"
22 : #include "system.h"
23 : #include "coretypes.h"
24 : #include "backend.h"
25 : #include "rtl.h"
26 : #include "tree.h"
27 : #include "memmodel.h"
28 : #include "gimple.h"
29 : #include "cfghooks.h"
30 : #include "cfgloop.h"
31 : #include "df.h"
32 : #include "tm_p.h"
33 : #include "stringpool.h"
34 : #include "expmed.h"
35 : #include "optabs.h"
36 : #include "regs.h"
37 : #include "emit-rtl.h"
38 : #include "recog.h"
39 : #include "cgraph.h"
40 : #include "diagnostic.h"
41 : #include "cfgbuild.h"
42 : #include "alias.h"
43 : #include "fold-const.h"
44 : #include "attribs.h"
45 : #include "calls.h"
46 : #include "stor-layout.h"
47 : #include "varasm.h"
48 : #include "output.h"
49 : #include "insn-attr.h"
50 : #include "flags.h"
51 : #include "except.h"
52 : #include "explow.h"
53 : #include "expr.h"
54 : #include "cfgrtl.h"
55 : #include "common/common-target.h"
56 : #include "langhooks.h"
57 : #include "reload.h"
58 : #include "gimplify.h"
59 : #include "dwarf2.h"
60 : #include "tm-constrs.h"
61 : #include "cselib.h"
62 : #include "sched-int.h"
63 : #include "opts.h"
64 : #include "tree-pass.h"
65 : #include "context.h"
66 : #include "pass_manager.h"
67 : #include "target-globals.h"
68 : #include "gimple-iterator.h"
69 : #include "shrink-wrap.h"
70 : #include "builtins.h"
71 : #include "rtl-iter.h"
72 : #include "tree-iterator.h"
73 : #include "dbgcnt.h"
74 : #include "case-cfn-macros.h"
75 : #include "dojump.h"
76 : #include "fold-const-call.h"
77 : #include "tree-vrp.h"
78 : #include "tree-ssanames.h"
79 : #include "selftest.h"
80 : #include "selftest-rtl.h"
81 : #include "print-rtl.h"
82 : #include "intl.h"
83 : #include "ifcvt.h"
84 : #include "symbol-summary.h"
85 : #include "sreal.h"
86 : #include "ipa-cp.h"
87 : #include "ipa-prop.h"
88 : #include "ipa-fnsummary.h"
89 : #include "wide-int-bitmask.h"
90 : #include "tree-vector-builder.h"
91 : #include "debug.h"
92 : #include "dwarf2out.h"
93 : #include "i386-builtins.h"
94 : #include "i386-features.h"
95 : #include "i386-expand.h"
96 :
/* Base names of the out-of-line save/restore stubs.  get_stub_name indexes
   this table by stub id and combines the entry with an "avx"/"sse" prefix
   and a register count to form the final symbol name.  */
97 : const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
98 : "savms64",
99 : "resms64",
100 : "resms64x",
101 : "savms64f",
102 : "resms64f",
103 : "resms64fx"
104 : };
105 :
106 : const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
107 : /* The below offset values are where each register is stored for the layout
108 :    relative to incoming stack pointer.  The value of each m_regs[].offset will
109 :    be relative to the incoming base pointer (rax or rsi) used by the stub.
110 :
111 :    s_instances:  0          1              2                3
112 :    Offset:                                 realigned or     aligned + 8
113 :    Register      aligned    aligned + 8    aligned w/HFP    w/HFP        */
114 : XMM15_REG, /*    0x10       0x18           0x10             0x18 */
115 : XMM14_REG, /*    0x20       0x28           0x20             0x28 */
116 : XMM13_REG, /*    0x30       0x38           0x30             0x38 */
117 : XMM12_REG, /*    0x40       0x48           0x40             0x48 */
118 : XMM11_REG, /*    0x50       0x58           0x50             0x58 */
119 : XMM10_REG, /*    0x60       0x68           0x60             0x68 */
120 : XMM9_REG, /*     0x70       0x78           0x70             0x78 */
121 : XMM8_REG, /*     0x80       0x88           0x80             0x88 */
122 : XMM7_REG, /*     0x90       0x98           0x90             0x98 */
123 : XMM6_REG, /*     0xa0       0xa8           0xa0             0xa8 */
124 : SI_REG, /*       0xa8       0xb0           0xa8             0xb0 */
125 : DI_REG, /*       0xb0       0xb8           0xb0             0xb8 */
126 : BX_REG, /*       0xb8       0xc0           0xb8             0xc0 */
127 : BP_REG, /*       0xc0       0xc8           N/A              N/A */
128 : R12_REG, /*      0xc8       0xd0           0xc0             0xc8 */
129 : R13_REG, /*      0xd0       0xd8           0xc8             0xd0 */
130 : R14_REG, /*      0xd8       0xe0           0xd0             0xd8 */
131 : R15_REG, /*      0xe0       0xe8           0xd8             0xe0 */
132 : };
133 :
134 : /* Instantiate static const values.  These are the out-of-class
   definitions; none of them carries an initializer here.  */
135 : const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
136 : const unsigned xlogue_layout::MIN_REGS;
137 : const unsigned xlogue_layout::MAX_REGS;
138 : const unsigned xlogue_layout::MAX_EXTRA_REGS;
139 : const unsigned xlogue_layout::VARIANT_COUNT;
140 : const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
141 :
142 : /* Initialize xlogue_layout::s_stub_names to zero.  This is the cache of
   generated stub names that get_stub_name fills lazily; it is indexed as
   [AVX enabled][stub][number of extra registers].  */
143 : char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
144 : [STUB_NAME_MAX_LEN];
145 :
146 : /* Instantiates all xlogue_layout instances.  The constructor arguments
   are (stack_align_off_in, hfp).  get_instance indexes this array with an
   enum xlogue_stub_sets value, so the order here presumably mirrors that
   enum -- verify against its declaration before reordering.  */
147 : const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
148 : xlogue_layout (0, false),
149 : xlogue_layout (8, false),
150 : xlogue_layout (0, true),
151 : xlogue_layout (8, true)
152 : };
153 :
154 : /* Return an appropriate const instance of xlogue_layout based upon values
155 : in cfun->machine and crtl. */
156 : const class xlogue_layout &
157 49891 : xlogue_layout::get_instance ()
158 : {
159 49891 : enum xlogue_stub_sets stub_set;
160 49891 : bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
161 :
162 49891 : if (stack_realign_fp)
163 : stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
164 40910 : else if (frame_pointer_needed)
165 25246 : stub_set = aligned_plus_8
166 31552 : ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
167 : : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
168 : else
169 9358 : stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
170 :
171 49891 : return s_instances[stub_set];
172 : }
173 :
174 : /* Determine how many clobbered registers can be saved by the stub.
175 : Returns the count of registers the stub will save and restore. */
176 : unsigned
177 35225 : xlogue_layout::count_stub_managed_regs ()
178 : {
179 35225 : bool hfp = frame_pointer_needed || stack_realign_fp;
180 35225 : unsigned i, count;
181 35225 : unsigned regno;
182 :
183 94890 : for (count = i = MIN_REGS; i < MAX_REGS; ++i)
184 : {
185 93670 : regno = REG_ORDER[i];
186 93670 : if (regno == BP_REG && hfp)
187 18200 : continue;
188 75470 : if (!ix86_save_reg (regno, false, false))
189 : break;
190 41465 : ++count;
191 : }
192 35225 : return count;
193 : }
194 :
195 : /* Determine if register REGNO is a stub managed register given the
196 : total COUNT of stub managed registers. */
197 : bool
198 2641728 : xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
199 : {
200 2641728 : bool hfp = frame_pointer_needed || stack_realign_fp;
201 2641728 : unsigned i;
202 :
203 34456982 : for (i = 0; i < count; ++i)
204 : {
205 32315123 : gcc_assert (i < MAX_REGS);
206 32315123 : if (REG_ORDER[i] == BP_REG && hfp)
207 519694 : ++count;
208 31795429 : else if (REG_ORDER[i] == regno)
209 : return true;
210 : }
211 : return false;
212 : }
213 :
214 : /* Constructor for xlogue_layout. */
215 1150544 : xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
216 1150544 : : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
217 1150544 : m_stack_align_off_in (stack_align_off_in)
218 : {
219 1150544 : HOST_WIDE_INT offset = stack_align_off_in;
220 1150544 : unsigned i, j;
221 :
222 21860336 : for (i = j = 0; i < MAX_REGS; ++i)
223 : {
224 20709792 : unsigned regno = REG_ORDER[i];
225 :
226 20709792 : if (regno == BP_REG && hfp)
227 575272 : continue;
228 20134520 : if (SSE_REGNO_P (regno))
229 : {
230 11505440 : offset += 16;
231 : /* Verify that SSE regs are always aligned. */
232 11505440 : gcc_assert (!((stack_align_off_in + offset) & 15));
233 : }
234 : else
235 8629080 : offset += 8;
236 :
237 20134520 : m_regs[j].regno = regno;
238 20134520 : m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
239 : }
240 1150544 : gcc_assert (j == m_nregs);
241 1150544 : }
242 :
243 : const char *
244 14666 : xlogue_layout::get_stub_name (enum xlogue_stub stub,
245 : unsigned n_extra_regs)
246 : {
247 14666 : const int have_avx = TARGET_AVX;
248 14666 : char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
249 :
250 : /* Lazy init */
251 14666 : if (!*name)
252 : {
253 362 : int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
254 : (have_avx ? "avx" : "sse"),
255 181 : STUB_BASE_NAMES[stub],
256 : MIN_REGS + n_extra_regs);
257 181 : gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
258 : }
259 :
260 14666 : return name;
261 : }
262 :
263 : /* Return rtx of a symbol ref for the entry point (based upon
264 : cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
265 : rtx
266 14666 : xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
267 : {
268 14666 : const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
269 14666 : gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
270 14666 : gcc_assert (stub < XLOGUE_STUB_COUNT);
271 14666 : gcc_assert (crtl->stack_realign_finalized);
272 :
273 14666 : return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
274 : }
275 :
/* Last chain id handed out.  The scalar_chain constructor pre-increments
   this to number each new chain.  */
276 : unsigned scalar_chain::max_id = 0;
277 :
278 : namespace {
279 :
280 : /* Initialize new chain. */
281 :
282 6389111 : scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
283 : {
284 6389111 : smode = smode_;
285 6389111 : vmode = vmode_;
286 :
287 6389111 : chain_id = ++max_id;
288 :
289 6389111 : if (dump_file)
290 136 : fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
291 :
292 6389111 : bitmap_obstack_initialize (NULL);
293 6389111 : insns = BITMAP_ALLOC (NULL);
294 6389111 : defs = BITMAP_ALLOC (NULL);
295 6389111 : defs_conv = BITMAP_ALLOC (NULL);
296 6389111 : insns_conv = BITMAP_ALLOC (NULL);
297 6389111 : queue = NULL;
298 :
299 6389111 : cost_sse_integer = 0;
300 6389111 : weighted_cost_sse_integer = 0 ;
301 6389111 : max_visits = x86_stv_max_visits;
302 6389111 : }
303 :
304 : /* Free chain's data. */
305 :
306 6389111 : scalar_chain::~scalar_chain ()
307 : {
308 6389111 : BITMAP_FREE (insns);
309 6389111 : BITMAP_FREE (defs);
310 6389111 : BITMAP_FREE (defs_conv);
311 6389111 : BITMAP_FREE (insns_conv);
312 6389111 : bitmap_obstack_release (NULL);
313 6389111 : }
314 :
315 : /* Add instruction into chains' queue. */
316 :
317 : void
318 8232309 : scalar_chain::add_to_queue (unsigned insn_uid)
319 : {
320 8232309 : if (!bitmap_set_bit (queue, insn_uid))
321 : return;
322 :
323 6211958 : if (dump_file)
324 141 : fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
325 : insn_uid, chain_id);
326 : }
327 :
328 : /* For DImode conversion, mark register defined by DEF as requiring
329 : conversion. */
330 :
331 : void
332 9317542 : scalar_chain::mark_dual_mode_def (df_ref def)
333 : {
334 9317542 : gcc_assert (DF_REF_REG_DEF_P (def));
335 :
336 : /* Record the def/insn pair so we can later efficiently iterate over
337 : the defs to convert on insns not in the chain. */
338 9317542 : bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
339 9317542 : basic_block bb = BLOCK_FOR_INSN (DF_REF_INSN (def));
340 9317542 : profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
341 9317542 : bool speed_p = optimize_bb_for_speed_p (bb);
342 9317542 : int cost = 0;
343 :
344 9317542 : if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
345 : {
346 2707235 : if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
347 2707235 : && !reg_new)
348 1386803 : return;
349 :
350 : /* Cost integer to sse moves. */
351 2465013 : if (speed_p)
352 2183962 : cost = COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2;
353 281051 : else if (TARGET_64BIT || smode == SImode)
354 : cost = COSTS_N_BYTES (4);
355 : /* vmovd (4 bytes) + vpinsrd (6 bytes). */
356 18650 : else if (TARGET_SSE4_1)
357 : cost = COSTS_N_BYTES (10);
358 : /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes). */
359 : else
360 7930739 : cost = COSTS_N_BYTES (12);
361 : }
362 : else
363 : {
364 6610307 : if (!reg_new)
365 : return;
366 :
367 : /* Cost sse to integer moves. */
368 5465726 : if (speed_p)
369 4907580 : cost = COSTS_N_INSNS (ix86_cost->sse_to_integer) / 2;
370 558146 : else if (TARGET_64BIT || smode == SImode)
371 : cost = COSTS_N_BYTES (4);
372 : /* vmovd (4 bytes) + vpextrd (6 bytes). */
373 2973 : else if (TARGET_SSE4_1)
374 : cost = COSTS_N_BYTES (10);
375 : /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes). */
376 : else
377 7930739 : cost = COSTS_N_BYTES (13);
378 : }
379 :
380 7930739 : if (speed_p)
381 7091542 : weighted_cost_sse_integer += bb->count.to_sreal_scale (entry_count) * cost;
382 :
383 7930739 : cost_sse_integer += cost;
384 :
385 7930739 : if (dump_file)
386 240 : fprintf (dump_file,
387 : " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
388 240 : DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
389 : }
390 :
391 : /* Check REF's chain to add new insns into a queue
392 : and find registers requiring conversion. Return true if OK, false
393 : if the analysis was aborted. */
394 :
395 : bool
396 17746546 : scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref,
397 : bitmap disallowed)
398 : {
399 17746546 : df_link *chain;
400 17746546 : bool mark_def = false;
401 :
402 17746546 : gcc_checking_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)));
403 :
404 61491507 : for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
405 : {
406 43748830 : unsigned uid = DF_REF_INSN_UID (chain->ref);
407 :
408 43748830 : if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
409 7851978 : continue;
410 :
411 35896852 : if (--max_visits == 0)
412 : return false;
413 :
414 35896292 : if (!DF_REF_REG_MEM_P (chain->ref))
415 : {
416 29965617 : if (bitmap_bit_p (insns, uid))
417 9540541 : continue;
418 :
419 20425076 : if (bitmap_bit_p (candidates, uid))
420 : {
421 8232309 : add_to_queue (uid);
422 8232309 : continue;
423 : }
424 :
425 : /* If we run into parts of an aborted chain discovery abort. */
426 12192767 : if (bitmap_bit_p (disallowed, uid))
427 : return false;
428 : }
429 :
430 18120133 : if (DF_REF_REG_DEF_P (chain->ref))
431 : {
432 2707235 : if (dump_file)
433 125 : fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
434 : DF_REF_REGNO (chain->ref), uid);
435 2707235 : mark_dual_mode_def (chain->ref);
436 : }
437 : else
438 : {
439 15412898 : if (dump_file)
440 524 : fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
441 : DF_REF_REGNO (chain->ref), uid);
442 : mark_def = true;
443 : }
444 : }
445 :
446 17742677 : if (mark_def)
447 6610307 : mark_dual_mode_def (ref);
448 :
449 : return true;
450 : }
451 :
452 : /* Check whether X is a convertible *concatditi_? variant. X is known
453 : to be any_or_plus:TI, i.e. PLUS:TI, IOR:TI or XOR:TI. */
454 :
455 : static bool
456 29932 : timode_concatdi_p (rtx x)
457 : {
458 29932 : rtx op0 = XEXP (x, 0);
459 29932 : rtx op1 = XEXP (x, 1);
460 :
461 29932 : if (GET_CODE (op1) == ASHIFT)
462 948 : std::swap (op0, op1);
463 :
464 29932 : return GET_CODE (op0) == ASHIFT
465 21050 : && GET_CODE (XEXP (op0, 0)) == ZERO_EXTEND
466 21050 : && GET_MODE (XEXP (XEXP (op0, 0), 0)) == DImode
467 21050 : && REG_P (XEXP (XEXP (op0, 0), 0))
468 20919 : && CONST_INT_P (XEXP (op0, 1))
469 20919 : && INTVAL (XEXP (op0, 1)) == 64
470 20919 : && GET_CODE (op1) == ZERO_EXTEND
471 19971 : && GET_MODE (XEXP (op1, 0)) == DImode
472 49903 : && REG_P (XEXP (op1, 0));
473 : }
474 :
475 :
476 : /* Add instruction into a chain. Return true if OK, false if the search
477 : was aborted. */
478 :
479 : bool
480 12596937 : scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid,
481 : bitmap disallowed)
482 : {
483 12596937 : if (!bitmap_set_bit (insns, insn_uid))
484 : return true;
485 :
486 12596937 : if (dump_file)
487 277 : fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
488 :
489 12596937 : rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
490 12596937 : rtx def_set = single_set (insn);
491 12596937 : if (def_set && REG_P (SET_DEST (def_set))
492 22301691 : && !HARD_REGISTER_P (SET_DEST (def_set)))
493 9679858 : bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
494 :
495 : /* ??? The following is quadratic since analyze_register_chain
496 : iterates over all refs to look for dual-mode regs. Instead this
497 : should be done separately for all regs mentioned in the chain once. */
498 12596937 : df_ref ref;
499 25730377 : for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
500 13134837 : if (!HARD_REGISTER_P (DF_REF_REG (ref)))
501 9679858 : if (!analyze_register_chain (candidates, ref, disallowed))
502 : return false;
503 :
504 : /* The operand(s) of VEC_SELECT, ZERO_EXTEND and similar ops don't need
505 : to be converted/convertible. */
506 12595540 : if (def_set)
507 12595540 : switch (GET_CODE (SET_SRC (def_set)))
508 : {
509 3749670 : case REG:
510 3749670 : if (HARD_REGISTER_P (SET_SRC (def_set)))
511 : return true;
512 : break;
513 : case VEC_SELECT:
514 : return true;
515 260 : case ZERO_EXTEND:
516 260 : if (GET_MODE (XEXP (SET_SRC (def_set), 0)) == DImode)
517 : return true;
518 : break;
519 2359271 : case PLUS:
520 2359271 : case IOR:
521 2359271 : case XOR:
522 2359271 : if (smode == TImode && timode_concatdi_p (SET_SRC (def_set)))
523 : return true;
524 : break;
525 : default:
526 : break;
527 : }
528 :
529 27550204 : for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
530 15023481 : if (DF_REF_TYPE (ref) == DF_REF_REG_USE
531 8066691 : && !SUBREG_P (DF_REF_REG (ref)))
532 8066688 : if (!analyze_register_chain (candidates, ref, disallowed))
533 : return false;
534 :
535 : return true;
536 : }
537 :
538 : /* Build new chain starting from insn INSN_UID recursively
539 : adding all dependent uses and definitions. Return true if OK, false
540 : if the chain discovery was aborted. */
541 :
542 : bool
543 6389111 : scalar_chain::build (bitmap candidates, unsigned insn_uid, bitmap disallowed)
544 : {
545 6389111 : queue = BITMAP_ALLOC (NULL);
546 6389111 : bitmap_set_bit (queue, insn_uid);
547 :
548 6389111 : if (dump_file)
549 136 : fprintf (dump_file, "Building chain #%d...\n", chain_id);
550 :
551 18982179 : while (!bitmap_empty_p (queue))
552 : {
553 12596937 : insn_uid = bitmap_first_set_bit (queue);
554 12596937 : bitmap_clear_bit (queue, insn_uid);
555 12596937 : bitmap_clear_bit (candidates, insn_uid);
556 12596937 : if (!add_insn (candidates, insn_uid, disallowed))
557 : {
558 : /* If we aborted the search put sofar found insn on the set of
559 : disallowed insns so that further searches reaching them also
560 : abort and thus we abort the whole but yet undiscovered chain. */
561 3869 : bitmap_ior_into (disallowed, insns);
562 3869 : if (dump_file)
563 0 : fprintf (dump_file, "Aborted chain #%d discovery\n", chain_id);
564 3869 : BITMAP_FREE (queue);
565 3869 : return false;
566 : }
567 : }
568 :
569 6385242 : if (dump_file)
570 : {
571 136 : fprintf (dump_file, "Collected chain #%d...\n", chain_id);
572 136 : fprintf (dump_file, " insns: ");
573 136 : dump_bitmap (dump_file, insns);
574 136 : if (!bitmap_empty_p (defs_conv))
575 : {
576 136 : bitmap_iterator bi;
577 136 : unsigned id;
578 136 : const char *comma = "";
579 136 : fprintf (dump_file, " defs to convert: ");
580 366 : EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
581 : {
582 230 : fprintf (dump_file, "%sr%d", comma, id);
583 230 : comma = ", ";
584 : }
585 136 : fprintf (dump_file, "\n");
586 : }
587 : }
588 :
589 6385242 : BITMAP_FREE (queue);
590 :
591 6385242 : return true;
592 : }
593 :
594 : /* Return a cost of building a vector constant
595 : instead of using a scalar one. */
596 :
597 : int
598 2637382 : general_scalar_chain::vector_const_cost (rtx exp, basic_block bb)
599 : {
600 2637382 : gcc_assert (CONST_INT_P (exp));
601 :
602 2637382 : if (standard_sse_constant_p (exp, vmode))
603 616063 : return ix86_cost->sse_op;
604 2021319 : if (optimize_bb_for_size_p (bb))
605 : return COSTS_N_BYTES (8);
606 : /* We have separate costs for SImode and DImode, use SImode costs
607 : for smaller modes. */
608 2401649 : return COSTS_N_INSNS (ix86_cost->sse_load[smode == DImode ? 1 : 0]) / 2;
609 : }
610 :
611 : /* Return true if it's cost profitable for chain conversion.

   For every insn in the chain an estimated gain IGAIN (scalar cost minus
   vector cost) is computed and summed twice: unweighted into GAIN, and,
   for blocks optimized for speed, scaled by the block frequency into
   WEIGHTED_GAIN.  The totals are compared with the register conversion
   costs collected in cost_sse_integer and weighted_cost_sse_integer.  */
612 :
613 : bool
614 5885375 : general_scalar_chain::compute_convert_gain ()
615 : {
616 5885375 : bitmap_iterator bi;
617 5885375 : unsigned insn_uid;
618 5885375 : int gain = 0;
619 5885375 : sreal weighted_gain = 0;
620 :
621 5885375 : if (dump_file)
622 136 : fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
623 :
624 : /* SSE costs distinguish between SImode and DImode loads/stores, for
625 : int costs factor in the number of GPRs involved. When supporting
626 : smaller modes than SImode the int load/store costs need to be
627 : adjusted as well. */
628 5885375 : unsigned sse_cost_idx = smode == DImode ? 1 : 0;
/* M is the GPR factor applied to integer costs: 2 for DImode when not
   TARGET_64BIT, otherwise 1.  */
629 5885375 : int m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
630 :
631 17472900 : EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
632 : {
633 11587525 : rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
634 11587525 : rtx def_set = single_set (insn);
635 11587525 : rtx src = SET_SRC (def_set);
636 11587525 : rtx dst = SET_DEST (def_set);
637 11587525 : basic_block bb = BLOCK_FOR_INSN (insn);
638 11587525 : int igain = 0;
639 11587525 : profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
640 11587525 : bool speed_p = optimize_bb_for_speed_p (bb);
641 11587525 : sreal bb_freq = bb->count.to_sreal_scale (entry_count);
642 :
/* Register copies, stores and loads are costed directly; every other
   source form goes through the switch on the source code below.  */
643 11587525 : if (REG_P (src) && REG_P (dst))
644 : {
645 911982 : if (!speed_p)
646 : /* reg-reg move is 2 bytes, while SSE 3. */
647 187201 : igain += COSTS_N_BYTES (2 * m - 3);
648 : else
649 : /* Move costs are normalized to reg-reg move having cost 2. */
650 724781 : igain += COSTS_N_INSNS (2 * m - ix86_cost->xmm_move) / 2;
651 : }
652 10675543 : else if (REG_P (src) && MEM_P (dst))
653 : {
654 2314578 : if (!speed_p)
655 : /* Integer load/store is 3+ bytes and SSE 4+. */
656 192017 : igain += COSTS_N_BYTES (3 * m - 4);
657 : else
658 2122561 : igain
659 2122561 : += COSTS_N_INSNS (m * ix86_cost->int_store[2]
660 : - ix86_cost->sse_store[sse_cost_idx]) / 2;
661 : }
662 8360965 : else if (MEM_P (src) && REG_P (dst))
663 : {
664 3775094 : if (!speed_p)
665 360427 : igain += COSTS_N_BYTES (3 * m - 4);
666 : else
667 3414667 : igain += COSTS_N_INSNS (m * ix86_cost->int_load[2]
668 : - ix86_cost->sse_load[sse_cost_idx]) / 2;
669 : }
670 : else
671 : {
672 : /* For operations on memory operands, include the overhead
673 : of explicit load and store instructions. */
674 4585871 : if (MEM_P (dst))
675 : {
676 66596 : if (!speed_p)
677 : /* ??? This probably should account size difference
678 : of SSE and integer load rather than full SSE load. */
679 : igain -= COSTS_N_BYTES (8);
680 : else
681 : {
682 57321 : int cost = (m * (ix86_cost->int_load[2]
683 57321 : + ix86_cost->int_store[2])
684 57321 : - (ix86_cost->sse_load[sse_cost_idx] +
685 57321 : ix86_cost->sse_store[sse_cost_idx]));
686 57321 : igain += COSTS_N_INSNS (cost) / 2;
687 : }
688 : }
689 :
690 4585871 : switch (GET_CODE (src))
691 : {
692 480092 : case ASHIFT:
693 480092 : case ASHIFTRT:
694 480092 : case LSHIFTRT:
/* With M == 2 the shift count is read with INTVAL, i.e. it is taken to
   be a CONST_INT here.  */
695 480092 : if (m == 2)
696 : {
697 16992 : if (INTVAL (XEXP (src, 1)) >= 32)
698 11522 : igain += ix86_cost->add;
699 : /* Gain for extend highpart case. */
700 5470 : else if (GET_CODE (XEXP (src, 0)) == ASHIFT)
701 0 : igain += ix86_cost->shift_const - ix86_cost->sse_op;
702 : else
703 5470 : igain += ix86_cost->shift_const;
704 : }
705 :
706 480092 : igain += ix86_cost->shift_const - ix86_cost->sse_op;
707 :
708 480092 : if (CONST_INT_P (XEXP (src, 0)))
709 0 : igain -= vector_const_cost (XEXP (src, 0), bb);
710 : break;
711 :
712 3646 : case ROTATE:
713 3646 : case ROTATERT:
/* The scalar rotate is costed as M constant shifts; the number of SSE
   operations charged against it depends on the target and on the
   rotate count.  */
714 3646 : igain += m * ix86_cost->shift_const;
715 3646 : if (TARGET_AVX512VL)
716 204 : igain -= ix86_cost->sse_op;
717 3442 : else if (smode == DImode)
718 : {
719 590 : int bits = INTVAL (XEXP (src, 1));
720 590 : if ((bits & 0x0f) == 0)
721 106 : igain -= ix86_cost->sse_op;
722 484 : else if ((bits & 0x07) == 0)
723 27 : igain -= 2 * ix86_cost->sse_op;
724 : else
725 457 : igain -= 3 * ix86_cost->sse_op;
726 : }
727 2852 : else if (INTVAL (XEXP (src, 1)) == 16)
728 139 : igain -= ix86_cost->sse_op;
729 : else
730 2713 : igain -= 2 * ix86_cost->sse_op;
731 : break;
732 :
733 2834700 : case AND:
734 2834700 : case IOR:
735 2834700 : case XOR:
736 2834700 : case PLUS:
737 2834700 : case MINUS:
738 2834700 : igain += m * ix86_cost->add - ix86_cost->sse_op;
739 : /* Additional gain for andnot for targets without BMI. */
740 2834700 : if (GET_CODE (XEXP (src, 0)) == NOT
741 3598 : && !TARGET_BMI)
742 3589 : igain += m * ix86_cost->add;
743 :
744 2834700 : if (CONST_INT_P (XEXP (src, 0)))
745 0 : igain -= vector_const_cost (XEXP (src, 0), bb);
746 2834700 : if (CONST_INT_P (XEXP (src, 1)))
747 1687550 : igain -= vector_const_cost (XEXP (src, 1), bb);
748 2834700 : if (MEM_P (XEXP (src, 1)))
749 : {
750 87376 : if (!speed_p)
751 20625 : igain -= COSTS_N_BYTES (m == 2 ? 3 : 5);
752 : else
753 77059 : igain += COSTS_N_INSNS
754 : (m * ix86_cost->int_load[2]
755 : - ix86_cost->sse_load[sse_cost_idx]) / 2;
756 : }
757 : break;
758 :
759 50935 : case NEG:
760 50935 : case NOT:
761 50935 : igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);
762 :
/* Only a NEG/NOT whose operand is an ABS falls through to the ABS
   costing below; anything else is costed as M adds and stops here.  */
763 50935 : if (GET_CODE (XEXP (src, 0)) != ABS)
764 : {
765 50935 : igain += m * ix86_cost->add;
766 50935 : break;
767 : }
768 : /* FALLTHRU */
769 :
770 998 : case ABS:
771 998 : case SMAX:
772 998 : case SMIN:
773 998 : case UMAX:
774 998 : case UMIN:
775 : /* We do not have any conditional move cost, estimate it as a
776 : reg-reg move. Comparisons are costed as adds. */
777 998 : igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
778 : /* Integer SSE ops are all costed the same. */
779 998 : igain -= ix86_cost->sse_op;
780 998 : break;
781 :
782 0 : case COMPARE:
783 0 : if (XEXP (src, 1) != const0_rtx)
784 : {
785 : /* cmp vs. pxor;pshufd;ptest. */
786 0 : igain += COSTS_N_INSNS (m - 3);
787 : }
788 0 : else if (GET_CODE (XEXP (src, 0)) != AND)
789 : {
790 : /* test vs. pshufd;ptest. */
791 0 : igain += COSTS_N_INSNS (m - 2);
792 : }
793 0 : else if (GET_CODE (XEXP (XEXP (src, 0), 0)) != NOT)
794 : {
795 : /* and;test vs. pshufd;ptest. */
796 0 : igain += COSTS_N_INSNS (2 * m - 2);
797 : }
798 0 : else if (TARGET_BMI)
799 : {
800 : /* andn;test vs. pandn;pshufd;ptest. */
801 0 : igain += COSTS_N_INSNS (2 * m - 3);
802 : }
803 : else
804 : {
805 : /* not;and;test vs. pandn;pshufd;ptest. */
806 0 : igain += COSTS_N_INSNS (3 * m - 3);
807 : }
808 : break;
809 :
810 1178536 : case CONST_INT:
811 1178536 : if (REG_P (dst))
812 : {
813 1178536 : if (!speed_p)
814 : {
815 : /* xor (2 bytes) vs. xorps (3 bytes). */
816 228704 : if (src == const0_rtx)
817 121416 : igain -= COSTS_N_BYTES (1);
818 : /* movdi_internal vs. movv2di_internal. */
819 : /* => mov (5 bytes) vs. movaps (7 bytes). */
820 107288 : else if (x86_64_immediate_operand (src, SImode))
821 95294 : igain -= COSTS_N_BYTES (2);
822 : else
823 : /* ??? Larger immediate constants are placed in the
824 : constant pool, where the size benefit/impact of
825 : STV conversion is affected by whether and how
826 : often each constant pool entry is shared/reused.
827 : The value below is empirically derived from the
828 : CSiBE benchmark (and the optimal value may drift
829 : over time). */
830 : igain += COSTS_N_BYTES (0);
831 : }
832 : else
833 : {
834 : /* DImode can be immediate for TARGET_64BIT
835 : and SImode always. */
836 949832 : igain += m * COSTS_N_INSNS (1);
837 949832 : igain -= vector_const_cost (src, bb);
838 : }
839 : }
840 0 : else if (MEM_P (dst))
841 : {
/* NOTE(review): unlike the REG-to-MEM store case above, this term is not
   wrapped in COSTS_N_INSNS (...) / 2 and does not look at SPEED_P --
   confirm that this is intended.  */
842 0 : igain += (m * ix86_cost->int_store[2]
843 0 : - ix86_cost->sse_store[sse_cost_idx]);
844 0 : igain -= vector_const_cost (src, bb);
845 : }
846 : break;
847 :
848 36964 : case VEC_SELECT:
849 36964 : if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
850 : {
851 : // movd (4 bytes) replaced with movdqa (4 bytes).
/* Same size either way, so a gain is credited only when optimizing for
   speed.  NOTE(review): `!!speed_p' is equivalent to plain `speed_p'.  */
852 27088 : if (!!speed_p)
853 25309 : igain += COSTS_N_INSNS (ix86_cost->sse_to_integer
854 : - ix86_cost->xmm_move) / 2;
855 : }
856 : else
857 : {
858 : // pshufd; movd replaced with pshufd.
859 9876 : if (!speed_p)
860 648 : igain += COSTS_N_BYTES (4);
861 : else
862 9228 : igain += ix86_cost->sse_to_integer;
863 : }
864 : break;
865 :
866 0 : default:
867 0 : gcc_unreachable ();
868 : }
869 : }
870 :
/* Only insns in blocks optimized for speed contribute to the
   frequency-weighted total; the unweighted total counts every insn.  */
871 11585746 : if (speed_p)
872 10323843 : weighted_gain += bb_freq * igain;
873 11587525 : gain += igain;
874 :
875 11587525 : if (igain != 0 && dump_file)
876 : {
877 93 : fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for",
878 : igain, bb_freq.to_double ());
879 93 : dump_insn_slim (dump_file, insn);
880 : }
881 : }
882 :
883 5885375 : if (dump_file)
884 : {
885 136 : fprintf (dump_file, " Instruction conversion gain: %d, \n",
886 : gain);
887 136 : fprintf (dump_file, " Registers conversion cost: %d\n",
888 : cost_sse_integer);
889 136 : fprintf (dump_file, " Weighted instruction conversion gain: %.2f, \n",
890 : weighted_gain.to_double ());
891 136 : fprintf (dump_file, " Weighted registers conversion cost: %.2f\n",
892 : weighted_cost_sse_integer.to_double ());
893 : }
894 :
/* The frequency-weighted comparison decides whenever the weighted gain
   and cost differ; the unweighted totals are used only to break a tie.  */
895 5885375 : if (weighted_gain != weighted_cost_sse_integer)
896 4750083 : return weighted_gain > weighted_cost_sse_integer;
897 : else
898 1135292 : return gain > cost_sse_integer;;
899 : }
900 :
901 : /* Insert generated conversion instruction sequence INSNS
902 : after instruction AFTER. New BB may be required in case
903 : instruction has EH region attached. */
904 :
905 : void
906 31208 : scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
907 : {
908 31208 : if (!control_flow_insn_p (after))
909 : {
910 30995 : emit_insn_after (insns, after);
911 30995 : return;
912 : }
913 :
914 213 : basic_block bb = BLOCK_FOR_INSN (after);
915 213 : edge e = find_fallthru_edge (bb->succs);
916 213 : gcc_assert (e);
917 :
918 213 : basic_block new_bb = split_edge (e);
919 213 : emit_insn_after (insns, BB_HEAD (new_bb));
920 : }
921 :
922 : } // anon namespace
923 :
924 : /* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
925 : zeroing the upper parts. */
926 :
927 : static rtx
928 173164 : gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
929 : {
930 346328 : switch (GET_MODE_NUNITS (vmode))
931 : {
932 45 : case 1:
933 45 : return gen_rtx_SUBREG (vmode, gpr, 0);
934 172557 : case 2:
935 345114 : return gen_rtx_VEC_CONCAT (vmode, gpr,
936 : CONST0_RTX (GET_MODE_INNER (vmode)));
937 562 : default:
938 562 : return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
939 : CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U));
940 : }
941 : }
942 :
943 : /* Copy scalar register REG into the vector register that defs_map
944 : records for it.  The copy is built as an insn sequence and handed to
emit_conversion_insns together with INSN.  One of three strategies is
used:
  - without TARGET_INTER_UNIT_MOVES_TO_VEC, REG is first stored to the
    SLOT_STV_TEMP stack slot and the vector register is set from that
    slot;
  - for a DImode chain on !TARGET_64BIT, the SImode parts of REG at
    offsets 0 and 4 are loaded separately and combined;
  - otherwise a single SET built by gen_gpr_to_xmm_move_src is emitted.
When dump_file is set, the copy is reported there.  */
945 :
946 : void
947 8414 : scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
948 : {
949 8414 : rtx vreg = *defs_map.get (reg);
950 :
951 8414 : start_sequence ();
952 8414 : if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
953 : {
/* Go through a stack temporary; a DImode value on !TARGET_64BIT is
   stored as two SImode moves.  */
954 0 : rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
955 0 : if (smode == DImode && !TARGET_64BIT)
956 : {
957 0 : emit_move_insn (adjust_address (tmp, SImode, 0),
958 : gen_rtx_SUBREG (SImode, reg, 0));
959 0 : emit_move_insn (adjust_address (tmp, SImode, 4),
960 : gen_rtx_SUBREG (SImode, reg, 4));
961 : }
962 : else
963 0 : emit_move_insn (copy_rtx (tmp), reg);
964 0 : emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
965 : gen_gpr_to_xmm_move_src (vmode, tmp)));
966 : }
967 8414 : else if (!TARGET_64BIT && smode == DImode)
968 : {
969 8278 : if (TARGET_SSE4_1)
970 : {
/* gen_sse2_loadld for the part at offset 0, then gen_sse4_1_pinsrd
   for the part at offset 4.  */
971 356 : emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
972 : CONST0_RTX (V4SImode),
973 : gen_rtx_SUBREG (SImode, reg, 0)));
974 356 : emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
975 : gen_rtx_SUBREG (V4SImode, vreg, 0),
976 : gen_rtx_SUBREG (SImode, reg, 4),
977 : GEN_INT (2)));
978 : }
979 : else
980 : {
/* Load each SImode part into its own register with gen_sse2_loadld
   and combine them with gen_vec_interleave_lowv4si.  */
981 7922 : rtx tmp = gen_reg_rtx (DImode);
982 7922 : emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
983 : CONST0_RTX (V4SImode),
984 : gen_rtx_SUBREG (SImode, reg, 0)));
985 7922 : emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
986 : CONST0_RTX (V4SImode),
987 : gen_rtx_SUBREG (SImode, reg, 4)));
988 7922 : emit_insn (gen_vec_interleave_lowv4si
989 : (gen_rtx_SUBREG (V4SImode, vreg, 0),
990 : gen_rtx_SUBREG (V4SImode, vreg, 0),
991 : gen_rtx_SUBREG (V4SImode, tmp, 0)));
992 : }
993 : }
994 : else
995 136 : emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
996 : gen_gpr_to_xmm_move_src (vmode, reg)));
997 8414 : rtx_insn *seq = end_sequence ();
998 8414 : emit_conversion_insns (seq, insn);
999 :
1000 8414 : if (dump_file)
1001 0 : fprintf (dump_file,
1002 : " Copied r%d to a vector register r%d for insn %d\n",
1003 0 : REGNO (reg), REGNO (vreg), INSN_UID (insn));
1004 8414 : }
1005 :
1006 : /* Copy the definition SRC of INSN inside the chain to DST for
1007 : scalar uses outside of the chain.  The copy is built as an insn
sequence and handed to emit_conversion_insns together with INSN.  One
of three strategies is used:
  - without TARGET_INTER_UNIT_MOVES_FROM_VEC, SRC is stored to the
    SLOT_STV_TEMP stack slot and DST is loaded from that slot;
  - for a DImode chain on !TARGET_64BIT, the SImode parts of DST at
    offsets 0 and 4 are set separately;
  - otherwise a plain move from SRC to DST is emitted.
When dump_file is set, the copy is reported there.  */
1008 :
1009 : void
1010 22036 : scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src)
1011 : {
1012 22036 : start_sequence ();
1013 22036 : if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1014 : {
/* Go through a stack temporary; a DImode value on !TARGET_64BIT is
   read back as two SImode moves.  */
1015 0 : rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
1016 0 : emit_move_insn (tmp, src);
1017 0 : if (!TARGET_64BIT && smode == DImode)
1018 : {
1019 0 : emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
1020 : adjust_address (tmp, SImode, 0));
1021 0 : emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
1022 : adjust_address (tmp, SImode, 4));
1023 : }
1024 : else
1025 0 : emit_move_insn (dst, copy_rtx (tmp));
1026 : }
1027 22036 : else if (!TARGET_64BIT && smode == DImode)
1028 : {
1029 21135 : if (TARGET_SSE4_1)
1030 : {
/* Extract V4SImode elements 0 and 1 of SRC with VEC_SELECT into the
   SImode parts of DST at offsets 0 and 4.  */
1031 0 : rtx tmp = gen_rtx_PARALLEL (VOIDmode,
1032 : gen_rtvec (1, const0_rtx));
1033 0 : emit_insn
1034 0 : (gen_rtx_SET
1035 : (gen_rtx_SUBREG (SImode, dst, 0),
1036 : gen_rtx_VEC_SELECT (SImode,
1037 : gen_rtx_SUBREG (V4SImode, src, 0),
1038 : tmp)));
1039 :
1040 0 : tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1041 0 : emit_insn
1042 0 : (gen_rtx_SET
1043 : (gen_rtx_SUBREG (SImode, dst, 4),
1044 : gen_rtx_VEC_SELECT (SImode,
1045 : gen_rtx_SUBREG (V4SImode, src, 0),
1046 : tmp)));
1047 : }
1048 : else
1049 : {
/* Work on a V2DImode copy of SRC: move out its SImode part at offset
   0, shift the copy right by 32 bits, then move out the same part
   again for DST's offset 4.  */
1050 21135 : rtx vcopy = gen_reg_rtx (V2DImode);
1051 21135 : emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0));
1052 21135 : emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
1053 : gen_rtx_SUBREG (SImode, vcopy, 0));
1054 21135 : emit_move_insn (vcopy,
1055 : gen_rtx_LSHIFTRT (V2DImode,
1056 : vcopy, GEN_INT (32)));
1057 21135 : emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
1058 : gen_rtx_SUBREG (SImode, vcopy, 0));
1059 : }
1060 : }
1061 : else
1062 901 : emit_move_insn (dst, src);
1063 :
1064 22036 : rtx_insn *seq = end_sequence ();
1065 22036 : emit_conversion_insns (seq, insn);
1066 :
1067 22036 : if (dump_file)
1068 0 : fprintf (dump_file,
1069 : " Copied r%d to a scalar register r%d for insn %d\n",
1070 0 : REGNO (src), REGNO (dst), INSN_UID (insn));
1071 22036 : }
1072 :
1073 : /* Helper function to convert immediate constant X to VMODE.
Returns CONSTM1_RTX (VMODE) when X satisfies constm1_operand.
Otherwise returns a CONST_VECTOR of VMODE whose element 0 is X and
whose remaining elements are const0_rtx.  */
1074 : static rtx
1075 36105 : smode_convert_cst (rtx x, enum machine_mode vmode)
1076 : {
1077 : /* Prefer all ones vector in case of -1. */
1078 36105 : if (constm1_operand (x, GET_MODE (x)))
1079 625 : return CONSTM1_RTX (vmode);
1080 :
/* Build the element array on the stack: X first, zeros after it.  */
1081 35480 : unsigned n = GET_MODE_NUNITS (vmode);
1082 35480 : rtx *v = XALLOCAVEC (rtx, n);
1083 35480 : v[0] = x;
1084 41240 : for (unsigned i = 1; i < n; ++i)
1085 5760 : v[i] = const0_rtx;
1086 35480 : return gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
1087 : }
1088 :
1089 : /* Convert operand OP in INSN. We should handle
1090 : memory operands and uninitialized registers.
1091 : All other register uses are converted during
1092 : registers conversion.

*OP is rewritten in place; any helper insns are emitted before INSN.
The cases handled are:
  - an operand already in V1TImode is left untouched;
  - NOT and ASHIFT: operand 0 is converted recursively and the
    expression itself is given mode vmode;
  - MEM: the value is loaded into a fresh pseudo, viewed as a vmode
    SUBREG, by a SET whose source comes from gen_gpr_to_xmm_move_src;
  - REG: wrapped in a vmode SUBREG;
  - constant scalar integers: converted with smode_convert_cst and moved
    into a fresh pseudo viewed as a vmode SUBREG;
  - anything else must be a SUBREG and is brought to vmode with
    gen_lowpart if its mode differs.  */
1093 :
1094 : void
1095 247574 : scalar_chain::convert_op (rtx *op, rtx_insn *insn)
1096 : {
1097 247574 : rtx tmp;
1098 :
1099 247574 : if (GET_MODE (*op) == V1TImode)
1100 : return;
1101 :
1102 247391 : *op = copy_rtx_if_shared (*op);
1103 :
1104 247391 : if (GET_CODE (*op) == NOT
1105 247391 : || GET_CODE (*op) == ASHIFT)
1106 : {
1107 3493 : convert_op (&XEXP (*op, 0), insn);
1108 3493 : PUT_MODE (*op, vmode);
1109 : }
1110 : else if (MEM_P (*op))
1111 : {
1112 173028 : rtx_insn *movabs = NULL;
1113 :
1114 : /* Emit MOVABS to load from a 64-bit absolute address to a GPR. */
1115 173028 : if (!memory_operand (*op, GET_MODE (*op)))
1116 : {
1117 0 : tmp = gen_reg_rtx (GET_MODE (*op));
1118 0 : movabs = emit_insn_before (gen_rtx_SET (tmp, *op), insn);
1119 :
1120 0 : *op = tmp;
1121 : }
1122 :
1123 173028 : tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (GET_MODE (*op)), 0);
1124 :
1125 173028 : rtx_insn *eh_insn
1126 173028 : = emit_insn_before (gen_rtx_SET (copy_rtx (tmp),
1127 : gen_gpr_to_xmm_move_src (vmode, *op)),
1128 173028 : insn);
1129 :
1130 173028 : if (cfun->can_throw_non_call_exceptions)
1131 : {
1132 : /* Handle REG_EH_REGION note. */
/* If INSN carries such a note, copy it to the emitted load (the MOVABS
   insn when one was emitted) and record that insn in
   control_flow_insns.  */
1133 168761 : rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
1134 168761 : if (note)
1135 : {
1136 3588 : if (movabs)
1137 0 : eh_insn = movabs;
1138 3588 : control_flow_insns.safe_push (eh_insn);
1139 3588 : add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
1140 : }
1141 : }
1142 :
1143 173028 : *op = tmp;
1144 :
1145 173028 : if (dump_file)
1146 0 : fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
1147 0 : INSN_UID (insn), reg_or_subregno (tmp));
1148 : }
1149 : else if (REG_P (*op))
1150 64750 : *op = gen_rtx_SUBREG (vmode, *op, 0);
1151 : else if (CONST_SCALAR_INT_P (*op))
1152 : {
1153 6117 : rtx vec_cst = smode_convert_cst (*op, vmode);
1154 :
/* A vector constant that is not a standard SSE constant is forced
   into constant memory; insns needed to validize that memory
   reference are emitted before INSN.  */
1155 6117 : if (!standard_sse_constant_p (vec_cst, vmode))
1156 : {
1157 2718 : start_sequence ();
1158 2718 : vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
1159 2718 : rtx_insn *seq = end_sequence ();
1160 2718 : emit_insn_before (seq, insn);
1161 : }
1162 :
1163 6117 : tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);
1164 :
1165 6117 : emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
1166 6117 : *op = tmp;
1167 : }
1168 : else
1169 : {
1170 0 : gcc_assert (SUBREG_P (*op));
1171 3 : if (GET_MODE (*op) != vmode)
1172 3 : *op = gen_lowpart (vmode, *op);
1173 : }
1174 : }
1175 :
1176 : /* Convert CCZmode COMPARE to vector mode.
OP1 and OP2 are the operands of the COMPARE in INSN.  Helper insns are
emitted before INSN and the returned rtx is an UNSPEC_PTEST in CCZmode.
A REG_EQUAL/REG_EQUIV note on INSN is either updated in place or
removed first.  The value to test is then formed as follows:
  - OP2 is not const0_rtx: the XOR of the converted operands;
  - OP1 is an AND whose first operand is a NOT: the and-not of the
    converted operands;
  - OP1 is any other AND: the two converted operands are tested
    directly, and the function returns early;
  - otherwise: the converted OP1.  */
1177 :
1178 : rtx
1179 12 : scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
1180 : {
1181 12 : rtx src, tmp;
1182 :
1183 : /* Handle any REG_EQUAL notes. */
/* A note of the form (compare:CCZ (reg) X) is kept when X is a REG, and
   kept with X rewritten to a vmode constant when X is a constant scalar
   integer.  Any other note is removed.  */
1184 12 : tmp = find_reg_equal_equiv_note (insn);
1185 12 : if (tmp)
1186 : {
1187 1 : if (GET_CODE (XEXP (tmp, 0)) == COMPARE
1188 1 : && GET_MODE (XEXP (tmp, 0)) == CCZmode
1189 1 : && REG_P (XEXP (XEXP (tmp, 0), 0)))
1190 : {
1191 1 : rtx *op = &XEXP (XEXP (tmp, 0), 1);
1192 1 : if (CONST_SCALAR_INT_P (*op))
1193 : {
1194 1 : if (constm1_operand (*op, GET_MODE (*op)))
1195 0 : *op = CONSTM1_RTX (vmode);
1196 : else
1197 : {
1198 1 : unsigned n = GET_MODE_NUNITS (vmode);
1199 1 : rtx *v = XALLOCAVEC (rtx, n);
1200 1 : v[0] = *op;
1201 1 : for (unsigned i = 1; i < n; ++i)
1202 0 : v[i] = const0_rtx;
1203 1 : *op = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
1204 : }
1205 : tmp = NULL_RTX;
1206 : }
1207 0 : else if (REG_P (*op))
1208 : tmp = NULL_RTX;
1209 : }
1210 :
1211 : if (tmp)
1212 0 : remove_note (insn, tmp);
1213 : }
1214 :
1215 : /* Comparison against anything other than zero, requires an XOR. */
1216 12 : if (op2 != const0_rtx)
1217 : {
1218 6 : convert_op (&op1, insn);
1219 6 : convert_op (&op2, insn);
1220 : /* If both operands are MEMs, explicitly load the OP1 into TMP. */
1221 6 : if (MEM_P (op1) && MEM_P (op2))
1222 : {
1223 0 : tmp = gen_reg_rtx (vmode);
1224 0 : emit_insn_before (gen_rtx_SET (tmp, op1), insn);
1225 0 : src = tmp;
1226 : }
1227 : else
1228 : src = op1;
1229 6 : src = gen_rtx_XOR (vmode, src, op2);
1230 : }
1231 6 : else if (GET_CODE (op1) == AND
1232 0 : && GET_CODE (XEXP (op1, 0)) == NOT)
1233 : {
/* (and (not A) B) compared with zero: rebuild the and-not in vmode,
   with A forced into a register.  */
1234 0 : rtx op11 = XEXP (XEXP (op1, 0), 0);
1235 0 : rtx op12 = XEXP (op1, 1);
1236 0 : convert_op (&op11, insn);
1237 0 : convert_op (&op12, insn);
1238 0 : if (!REG_P (op11))
1239 : {
1240 0 : tmp = gen_reg_rtx (vmode);
1241 0 : emit_insn_before (gen_rtx_SET (tmp, op11), insn);
1242 0 : op11 = tmp;
1243 : }
1244 0 : src = gen_rtx_AND (vmode, gen_rtx_NOT (vmode, op11), op12);
1245 0 : }
1246 6 : else if (GET_CODE (op1) == AND)
1247 : {
/* (and A B) compared with zero: return a PTEST of the two converted
   operands, with A forced into a register.  */
1248 0 : rtx op11 = XEXP (op1, 0);
1249 0 : rtx op12 = XEXP (op1, 1);
1250 0 : convert_op (&op11, insn);
1251 0 : convert_op (&op12, insn);
1252 0 : if (!REG_P (op11))
1253 : {
1254 0 : tmp = gen_reg_rtx (vmode);
1255 0 : emit_insn_before (gen_rtx_SET (tmp, op11), insn);
1256 0 : op11 = tmp;
1257 : }
1258 0 : return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, op11, op12),
1259 : UNSPEC_PTEST);
1260 : }
1261 : else
1262 : {
1263 6 : convert_op (&op1, insn);
1264 6 : src = op1;
1265 : }
1266 :
/* The value to test must be in a register.  */
1267 12 : if (!REG_P (src))
1268 : {
1269 8 : tmp = gen_reg_rtx (vmode);
1270 8 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
1271 8 : src = tmp;
1272 : }
1273 :
/* For V2DImode and V4SImode, shuffle SRC into a fresh register first:
   gen_vec_interleave_lowv2di of SRC with itself, or gen_sse2_pshufd
   with selector 0.  */
1274 12 : if (vmode == V2DImode)
1275 : {
1276 0 : tmp = gen_reg_rtx (vmode);
1277 0 : emit_insn_before (gen_vec_interleave_lowv2di (tmp, src, src), insn);
1278 0 : src = tmp;
1279 : }
1280 12 : else if (vmode == V4SImode)
1281 : {
1282 0 : tmp = gen_reg_rtx (vmode);
1283 0 : emit_insn_before (gen_sse2_pshufd (tmp, src, const0_rtx), insn);
1284 0 : src = tmp;
1285 : }
1286 :
1287 12 : return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, src, src), UNSPEC_PTEST);
1288 : }
1289 :
1290 : /* Helper function for converting INSN to vector mode.
It performs two steps.  First, for every register defined by INSN that
is in defs_conv, it either emits a scalar copy with convert_reg, when
some non-debug use needs it, or otherwise adjusts the debug insns that
use the register.  Second, every non-memory register use in INSN that
has an entry in defs_map is replaced by the mapped register, along with
a matching REG_DEAD note.  */
1291 :
1292 : void
1293 1318701 : scalar_chain::convert_insn_common (rtx_insn *insn)
1294 : {
1295 : /* Generate copies for out-of-chain uses of defs and adjust debug uses. */
1296 2019780 : for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref))
1297 701079 : if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
1298 : {
/* Look for a non-debug use that is either a DF_REF_REG_MEM_P use or
   belongs to an insn outside the insns bitmap.  */
1299 23468 : df_link *use;
1300 44151 : for (use = DF_REF_CHAIN (ref); use; use = use->next)
1301 42719 : if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref))
1302 42719 : && (DF_REF_REG_MEM_P (use->ref)
1303 38640 : || !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))))
1304 : break;
1305 23468 : if (use)
1306 22036 : convert_reg (insn, DF_REF_REG (ref),
1307 22036 : *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]));
1308 1432 : else if (MAY_HAVE_DEBUG_BIND_INSNS)
1309 : {
1310 : /* If we generated a scalar copy we can leave debug-insns
1311 : as-is, if not, we have to adjust them. */
1312 1310 : auto_vec<rtx_insn *, 5> to_reset_debug_insns;
1313 3877 : for (use = DF_REF_CHAIN (ref); use; use = use->next)
1314 2567 : if (DEBUG_INSN_P (DF_REF_INSN (use->ref)))
1315 : {
1316 825 : rtx_insn *debug_insn = DF_REF_INSN (use->ref);
1317 : /* If there's a reaching definition outside of the
1318 : chain we have to reset. */
1319 825 : df_link *def;
1320 2934 : for (def = DF_REF_CHAIN (use->ref); def; def = def->next)
1321 2292 : if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref)))
1322 : break;
1323 825 : if (def)
1324 183 : to_reset_debug_insns.safe_push (debug_insn);
1325 : else
1326 : {
/* All reaching definitions are in the chain: point the debug use at
   the mapped register and rescan the debug insn.  */
1327 642 : *DF_REF_REAL_LOC (use->ref)
1328 642 : = *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]);
1329 642 : df_insn_rescan (debug_insn);
1330 : }
1331 : }
1332 : /* Have to do the reset outside of the DF_CHAIN walk to not
1333 : disrupt it. */
1334 2803 : while (!to_reset_debug_insns.is_empty ())
1335 : {
1336 183 : rtx_insn *debug_insn = to_reset_debug_insns.pop ();
1337 183 : INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC ();
1338 183 : df_insn_rescan_debug_internal (debug_insn);
1339 : }
1340 1310 : }
1341 : }
1342 :
1343 : /* Replace uses in this insn with the defs we use in the chain. */
1344 3293523 : for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref))
1345 1974822 : if (!DF_REF_REG_MEM_P (ref))
1346 711677 : if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]))
1347 : {
1348 : /* Also update a corresponding REG_DEAD note. */
1349 35367 : rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref));
1350 35367 : if (note)
1351 23379 : XEXP (note, 0) = *vreg;
1352 35367 : *DF_REF_REAL_LOC (ref) = *vreg;
1353 : }
1354 1318701 : }
1355 :
1356 : /* Convert INSN which is an SImode or DImode rotation by a constant
1357 : to vector mode. CODE is either ROTATE or ROTATERT with operands
1358 : OP0 and OP1. Returns the SET_SRC of the last instruction in the
1359 : resulting sequence, which is emitted before INSN.
OP0 is first converted with convert_op, and a rotation count of zero
returns the converted OP0 unchanged.  OP1 is read with INTVAL.  With
TARGET_AVX512VL, unless an earlier special case applies, the rotate is
rebuilt in the vector mode by simplify_gen_binary.  Otherwise it is
expanded into shuffle and shift insns.  */
1360 :
1361 : rtx
1362 92 : general_scalar_chain::convert_rotate (enum rtx_code code, rtx op0, rtx op1,
1363 : rtx_insn *insn)
1364 : {
1365 92 : int bits = INTVAL (op1);
1366 92 : rtx pat, result;
1367 :
1368 92 : convert_op (&op0, insn);
1369 92 : if (bits == 0)
1370 0 : return op0;
1371 :
1372 92 : if (smode == DImode)
1373 : {
/* For ROTATE, replace the count by its complement to 64.  The
   expansions below are written in terms of that adjusted count.  */
1374 92 : if (code == ROTATE)
1375 45 : bits = 64 - bits;
1376 92 : if (bits == 32)
1377 : {
/* A count of 32 needs only one gen_sse2_pshufd.  */
1378 0 : rtx tmp1 = gen_reg_rtx (V4SImode);
1379 0 : pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
1380 : GEN_INT (225));
1381 0 : emit_insn_before (pat, insn);
1382 0 : result = gen_lowpart (V2DImode, tmp1);
1383 : }
1384 92 : else if (TARGET_AVX512VL)
1385 0 : result = simplify_gen_binary (code, V2DImode, op0, op1);
1386 92 : else if (bits == 16 || bits == 48)
1387 : {
/* Counts of 16 and 48 need only one gen_sse2_pshuflw.  */
1388 0 : rtx tmp1 = gen_reg_rtx (V8HImode);
1389 0 : pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0),
1390 : GEN_INT (bits == 16 ? 57 : 147));
1391 0 : emit_insn_before (pat, insn);
1392 0 : result = gen_lowpart (V2DImode, tmp1);
1393 : }
1394 92 : else if ((bits & 0x07) == 0)
1395 : {
/* Other multiples of 8: gen_sse2_pshufd followed by a V1TImode
   logical right shift.  */
1396 0 : rtx tmp1 = gen_reg_rtx (V4SImode);
1397 0 : pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
1398 : GEN_INT (68));
1399 0 : emit_insn_before (pat, insn);
1400 0 : rtx tmp2 = gen_reg_rtx (V1TImode);
1401 0 : pat = gen_sse2_lshrv1ti3 (tmp2, gen_lowpart (V1TImode, tmp1),
1402 : GEN_INT (bits));
1403 0 : emit_insn_before (pat, insn);
1404 0 : result = gen_lowpart (V2DImode, tmp2);
1405 : }
1406 : else
1407 : {
/* General case: gen_sse2_pshufd, a V2DImode logical right shift by the
   count modulo 32, and a final gen_sse2_pshufd whose selector depends
   on whether the count exceeds 32.  */
1408 92 : rtx tmp1 = gen_reg_rtx (V4SImode);
1409 92 : pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
1410 : GEN_INT (20));
1411 92 : emit_insn_before (pat, insn);
1412 92 : rtx tmp2 = gen_reg_rtx (V2DImode);
1413 92 : pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
1414 : GEN_INT (bits & 31));
1415 92 : emit_insn_before (pat, insn);
1416 92 : rtx tmp3 = gen_reg_rtx (V4SImode);
1417 139 : pat = gen_sse2_pshufd (tmp3, gen_lowpart (V4SImode, tmp2),
1418 : GEN_INT (bits > 32 ? 34 : 136));
1419 92 : emit_insn_before (pat, insn);
1420 92 : result = gen_lowpart (V2DImode, tmp3);
1421 : }
1422 : }
/* The remaining branches handle the case where smode is not DImode.  */
1423 0 : else if (bits == 16)
1424 : {
1425 0 : rtx tmp1 = gen_reg_rtx (V8HImode);
1426 0 : pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0), GEN_INT (225));
1427 0 : emit_insn_before (pat, insn);
1428 0 : result = gen_lowpart (V4SImode, tmp1);
1429 : }
1430 0 : else if (TARGET_AVX512VL)
1431 0 : result = simplify_gen_binary (code, V4SImode, op0, op1);
1432 : else
1433 : {
/* For ROTATE, replace the count by its complement to 32, then use
   gen_sse2_pshufd followed by a V2DImode logical right shift.  */
1434 0 : if (code == ROTATE)
1435 0 : bits = 32 - bits;
1436 :
1437 0 : rtx tmp1 = gen_reg_rtx (V4SImode);
1438 0 : emit_insn_before (gen_sse2_pshufd (tmp1, op0, GEN_INT (224)), insn);
1439 0 : rtx tmp2 = gen_reg_rtx (V2DImode);
1440 0 : pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
1441 : GEN_INT (bits));
1442 0 : emit_insn_before (pat, insn);
1443 0 : result = gen_lowpart (V4SImode, tmp2);
1444 : }
1445 :
1446 : return result;
1447 : }
1448 :
1449 : /* Convert INSN to vector mode.
The single_set of INSN is rewritten in place: the destination becomes a
vmode SUBREG (or the flags register for a COMPARE), the source is
converted according to its rtx code, and the pattern of INSN is
replaced by that single SET.  INSN is then re-recognized, with
fatal_insn_not_found called on failure, and rescanned with
df_insn_rescan.  A source code not listed in the switch is
gcc_unreachable.  */
1450 :
1451 : void
1452 411616 : general_scalar_chain::convert_insn (rtx_insn *insn)
1453 : {
1454 411616 : rtx def_set = single_set (insn);
1455 411616 : rtx src = SET_SRC (def_set);
1456 411616 : rtx dst = SET_DEST (def_set);
1457 411616 : rtx subreg;
1458 :
1459 411616 : if (MEM_P (dst) && !REG_P (src))
1460 : {
1461 : /* There are no scalar integer instructions and therefore
1462 : temporary register usage is required. */
/* The move from the temporary to the original MEM is emitted through
   emit_conversion_insns; INSN itself now sets the temporary.  */
1463 758 : rtx tmp = gen_reg_rtx (smode);
1464 758 : emit_conversion_insns (gen_move_insn (dst, tmp), insn);
1465 758 : dst = gen_rtx_SUBREG (vmode, tmp, 0);
1466 758 : }
1467 410858 : else if (REG_P (dst) && GET_MODE (dst) == smode)
1468 : {
1469 : /* Replace the definition with a SUBREG to the definition we
1470 : use inside the chain. */
1471 215545 : rtx *vdef = defs_map.get (dst);
1472 215545 : if (vdef)
1473 23468 : dst = *vdef;
1474 215545 : dst = gen_rtx_SUBREG (vmode, dst, 0);
1475 : /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
1476 : is a non-REG_P. So kill those off. */
1477 215545 : rtx note = find_reg_equal_equiv_note (insn);
1478 215545 : if (note)
1479 9727 : remove_note (insn, note);
1480 : }
1481 :
1482 411616 : switch (GET_CODE (src))
1483 : {
/* Binary operations: convert operand 1 here, then fall through to
   convert operand 0 and retag the expression with vmode.  */
1484 30092 : case PLUS:
1485 30092 : case MINUS:
1486 30092 : case IOR:
1487 30092 : case XOR:
1488 30092 : case AND:
1489 30092 : case SMAX:
1490 30092 : case SMIN:
1491 30092 : case UMAX:
1492 30092 : case UMIN:
1493 30092 : convert_op (&XEXP (src, 1), insn);
1494 : /* FALLTHRU */
1495 :
1496 37480 : case ABS:
1497 37480 : case ASHIFT:
1498 37480 : case ASHIFTRT:
1499 37480 : case LSHIFTRT:
1500 37480 : convert_op (&XEXP (src, 0), insn);
1501 37480 : PUT_MODE (src, vmode);
1502 37480 : break;
1503 :
1504 92 : case ROTATE:
1505 92 : case ROTATERT:
1506 92 : src = convert_rotate (GET_CODE (src), XEXP (src, 0), XEXP (src, 1),
1507 : insn);
1508 92 : break;
1509 :
/* NEG is rewritten as a subtraction from a zero vector.  A negated ABS
   first computes the ABS into a fresh register.  */
1510 391 : case NEG:
1511 391 : src = XEXP (src, 0);
1512 :
1513 391 : if (GET_CODE (src) == ABS)
1514 : {
1515 0 : src = XEXP (src, 0);
1516 0 : convert_op (&src, insn);
1517 0 : subreg = gen_reg_rtx (vmode);
1518 0 : emit_insn_before (gen_rtx_SET (subreg,
1519 : gen_rtx_ABS (vmode, src)), insn);
1520 0 : src = subreg;
1521 : }
1522 : else
1523 391 : convert_op (&src, insn);
1524 :
1525 391 : subreg = gen_reg_rtx (vmode);
1526 391 : emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
1527 391 : src = gen_rtx_MINUS (vmode, subreg, src);
1528 391 : break;
1529 :
/* NOT is rewritten as an XOR with an all-ones vector.  */
1530 250 : case NOT:
1531 250 : src = XEXP (src, 0);
1532 250 : convert_op (&src, insn);
1533 250 : subreg = gen_reg_rtx (vmode);
1534 250 : emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
1535 250 : src = gen_rtx_XOR (vmode, src, subreg);
1536 250 : break;
1537 :
1538 170860 : case MEM:
1539 170860 : if (!REG_P (dst))
1540 170860 : convert_op (&src, insn);
1541 : break;
1542 :
1543 196623 : case REG:
1544 196623 : if (!MEM_P (dst))
1545 1310 : convert_op (&src, insn);
1546 : break;
1547 :
1548 0 : case SUBREG:
1549 0 : gcc_assert (GET_MODE (src) == vmode);
1550 : break;
1551 :
/* A COMPARE sets the flags register from the rtx built by
   convert_compare.  */
1552 0 : case COMPARE:
1553 0 : dst = gen_rtx_REG (CCZmode, FLAGS_REG);
1554 0 : src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
1555 0 : break;
1556 :
1557 3362 : case CONST_INT:
1558 3362 : convert_op (&src, insn);
1559 3362 : break;
1560 :
/* VEC_SELECT: selecting element 0 uses the vector operand directly.
   Otherwise a DImode chain uses a V1TImode logical right shift by 64,
   and other chains a VEC_SELECT that repeats the selected index four
   times.  */
1561 2558 : case VEC_SELECT:
1562 2558 : if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
1563 1565 : src = XEXP (src, 0);
1564 993 : else if (smode == DImode)
1565 : {
1566 748 : rtx tmp = gen_lowpart (V1TImode, XEXP (src, 0));
1567 748 : dst = gen_lowpart (V1TImode, dst);
1568 748 : src = gen_rtx_LSHIFTRT (V1TImode, tmp, GEN_INT (64));
1569 : }
1570 : else
1571 : {
1572 245 : rtx tmp = XVECEXP (XEXP (src, 1), 0, 0);
1573 245 : rtvec vec = gen_rtvec (4, tmp, tmp, tmp, tmp);
1574 245 : rtx par = gen_rtx_PARALLEL (VOIDmode, vec);
1575 245 : src = gen_rtx_VEC_SELECT (vmode, XEXP (src, 0), par);
1576 : }
1577 : break;
1578 :
1579 0 : default:
1580 0 : gcc_unreachable ();
1581 : }
1582 :
1583 411616 : SET_SRC (def_set) = src;
1584 411616 : SET_DEST (def_set) = dst;
1585 :
1586 : /* Drop possible dead definitions. */
1587 411616 : PATTERN (insn) = def_set;
1588 :
/* Force re-recognition of the rewritten pattern.  */
1589 411616 : INSN_CODE (insn) = -1;
1590 411616 : int patt = recog_memoized (insn);
1591 411616 : if (patt == -1)
1592 0 : fatal_insn_not_found (insn);
1593 411616 : df_insn_rescan (insn);
1594 411616 : }
1595 :
1596 : /* Helper function to compute gain for loading an immediate constant.
1597 : Typically, two movabsq for TImode vs. vmovdqa for V1TImode, but
1598 : with numerous special cases.
CST is the constant and BB the basic block containing its use.  The
result is negative (a loss) only when CST is a two-element
CONST_WIDE_INT whose elements are equal: -COSTS_N_BYTES (9) when BB is
optimized for size and -COSTS_N_INSNS (2) otherwise.  In every other
case the result is 0.  */
1599 :
1600 : static int
1601 19 : timode_immed_const_gain (rtx cst, basic_block bb)
1602 : {
1603 : /* movabsq vs. movabsq+vmovq+vunpacklqdq. */
1604 19 : if (CONST_WIDE_INT_P (cst)
1605 7 : && CONST_WIDE_INT_NUNITS (cst) == 2
1606 26 : && CONST_WIDE_INT_ELT (cst, 0) == CONST_WIDE_INT_ELT (cst, 1))
1607 0 : return optimize_bb_for_size_p (bb) ? -COSTS_N_BYTES (9)
1608 : : -COSTS_N_INSNS (2);
1609 : /* 2x movabsq ~ vmovdqa. */
1610 : return 0;
1611 : }
1612 :
1613 : /* Return true if converting this chain is profitable.
The function returns false immediately when cost_sse_integer is
nonzero.  Otherwise it walks every insn in the insns bitmap and derives
a per-insn gain, igain, from the code of the SET_SRC of the insn's
single_set.  A positive igain favours the vector form.  Byte-based
costs are used when the insn's basic block is not optimized for speed,
insn-based costs otherwise.  Each igain is added to the integer total
gain and, for blocks optimized for speed, also to weighted_gain after
scaling by the block's frequency relative to the function entry block.
The result is true if weighted_gain is positive, and otherwise whether
gain is positive.  */
1614 :
1615 : bool
1616 499867 : timode_scalar_chain::compute_convert_gain ()
1617 : {
1618 : /* Assume that if we have to move TImode values between units,
1619 : then transforming this chain isn't worth it. */
1620 499867 : if (cost_sse_integer)
1621 : return false;
1622 :
1623 499867 : bitmap_iterator bi;
1624 499867 : unsigned insn_uid;
1625 :
1626 : /* Split ties to prefer V1TImode when not optimizing for size. */
1627 499867 : int gain = optimize_size ? 0 : 1;
1628 499867 : sreal weighted_gain = 0;
1629 :
1630 499867 : if (dump_file)
1631 0 : fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1632 :
1633 1495389 : EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1634 : {
1635 995522 : rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1636 995522 : rtx def_set = single_set (insn);
1637 995522 : rtx src = SET_SRC (def_set);
1638 995522 : rtx dst = SET_DEST (def_set);
1639 995522 : HOST_WIDE_INT op1val;
1640 995522 : basic_block bb = BLOCK_FOR_INSN (insn);
1641 995522 : int scost, vcost;
1642 995522 : int igain = 0;
1643 995522 : profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
1644 995522 : bool speed_p = optimize_bb_for_speed_p (bb);
1645 995522 : sreal bb_freq = bb->count.to_sreal_scale (entry_count);
1646 :
1647 995522 : switch (GET_CODE (src))
1648 : {
1649 519981 : case REG:
1650 519981 : if (GENERAL_REGNO_P (REGNO (src)))
1651 : {
1652 24979 : if (TARGET_AVX)
1653 : /* vmovq + vpinsrq */
1654 26 : igain = speed_p ? -ix86_cost->integer_to_sse
1655 : - COSTS_N_INSNS (1)
1656 : : -COSTS_N_BYTES (11);
1657 : else
1658 : /* movq + movq + punpcklqdq */
1659 24953 : igain = speed_p ? -ix86_cost->integer_to_sse
1660 : - COSTS_N_INSNS (2)
1661 : : -COSTS_N_BYTES (14);
1662 : }
1663 495002 : else if (GENERAL_REG_P (dst))
1664 : {
1665 24503 : if (TARGET_AVX)
1666 : /* vpextrq + vmovq */
1667 26 : igain = speed_p ? -ix86_cost->sse_to_integer
1668 : - COSTS_N_INSNS (1)
1669 : : -COSTS_N_BYTES (11);
1670 : else
1671 : /* movhlps + movq + movq */
1672 24477 : igain = speed_p ? -ix86_cost->sse_to_integer
1673 : - COSTS_N_INSNS (2)
1674 : : -COSTS_N_BYTES (13);
1675 : }
1676 470499 : else if (!speed_p)
1677 14482 : igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3);
1678 : else
1679 : igain = COSTS_N_INSNS (1);
1680 : break;
1681 :
1682 429606 : case MEM:
1683 429606 : igain = !speed_p ? COSTS_N_BYTES (7) : COSTS_N_INSNS (1);
1684 : break;
1685 :
1686 10593 : case CONST_INT:
1687 10593 : if (MEM_P (dst)
1688 10593 : && standard_sse_constant_p (src, V1TImode))
1689 10058 : igain = !speed_p ? COSTS_N_BYTES (11) : 1;
1690 : break;
1691 :
1692 30350 : case CONST_WIDE_INT:
1693 : /* 2 x mov vs. vmovdqa. */
1694 30350 : if (MEM_P (dst))
1695 29878 : igain = !speed_p ? COSTS_N_BYTES (3) : COSTS_N_INSNS (1);
1696 : break;
1697 :
1698 78 : case NOT:
1699 78 : if (MEM_P (dst))
1700 66323 : igain = -COSTS_N_INSNS (1);
1701 : break;
1702 :
1703 38 : case AND:
1704 38 : if (!MEM_P (dst))
1705 27 : igain = COSTS_N_INSNS (1);
1706 38 : if (CONST_SCALAR_INT_P (XEXP (src, 1)))
1707 10 : igain += timode_immed_const_gain (XEXP (src, 1), bb);
1708 : break;
1709 :
1710 4262 : case XOR:
1711 4262 : case IOR:
1712 4262 : if (timode_concatdi_p (src))
1713 : {
1714 : /* vmovq;vpinsrq (11 bytes). */
1715 4136 : igain = speed_p ? -ix86_cost->integer_to_sse - COSTS_N_INSNS (1)
1716 : : -COSTS_N_BYTES (11);
1717 : break;
1718 : }
1719 126 : if (!MEM_P (dst))
1720 118 : igain = COSTS_N_INSNS (1);
1721 126 : if (CONST_SCALAR_INT_P (XEXP (src, 1)))
1722 9 : igain += timode_immed_const_gain (XEXP (src, 1), bb);
1723 : break;
1724 :
1725 0 : case PLUS:
1726 0 : if (timode_concatdi_p (src))
1727 : /* vmovq;vpinsrq (11 bytes). */
1728 0 : igain = speed_p ? -ix86_cost->integer_to_sse - COSTS_N_INSNS (1)
1729 : : -COSTS_N_BYTES (11);
1730 : break;
1731 :
/* For the shift and rotate cases below, igain is the scalar cost scost
   minus the vector cost vcost, both chosen from the constant count
   op1val.  */
1732 206 : case ASHIFT:
1733 206 : case LSHIFTRT:
1734 : /* See ix86_expand_v1ti_shift. */
1735 206 : op1val = INTVAL (XEXP (src, 1));
1736 206 : if (!speed_p)
1737 : {
1738 19 : if (op1val == 64 || op1val == 65)
1739 : scost = COSTS_N_BYTES (5);
1740 13 : else if (op1val >= 66)
1741 : scost = COSTS_N_BYTES (6);
1742 13 : else if (op1val == 1)
1743 : scost = COSTS_N_BYTES (8);
1744 : else
1745 : scost = COSTS_N_BYTES (9);
1746 :
1747 17 : if ((op1val & 7) == 0)
1748 : vcost = COSTS_N_BYTES (5);
1749 13 : else if (op1val > 64)
1750 : vcost = COSTS_N_BYTES (10);
1751 : else
1752 13 : vcost = TARGET_AVX ? COSTS_N_BYTES (19) : COSTS_N_BYTES (23);
1753 : }
1754 : else
1755 : {
1756 187 : scost = COSTS_N_INSNS (2);
1757 187 : if ((op1val & 7) == 0)
1758 : vcost = COSTS_N_INSNS (1);
1759 133 : else if (op1val > 64)
1760 : vcost = COSTS_N_INSNS (2);
1761 : else
1762 133 : vcost = TARGET_AVX ? COSTS_N_INSNS (4) : COSTS_N_INSNS (5);
1763 : }
1764 206 : igain = scost - vcost;
1765 206 : break;
1766 :
1767 123 : case ASHIFTRT:
1768 : /* See ix86_expand_v1ti_ashiftrt. */
1769 123 : op1val = INTVAL (XEXP (src, 1));
1770 123 : if (!speed_p)
1771 : {
1772 9 : if (op1val == 64 || op1val == 127)
1773 : scost = COSTS_N_BYTES (7);
1774 9 : else if (op1val == 1)
1775 : scost = COSTS_N_BYTES (8);
1776 8 : else if (op1val == 65)
1777 : scost = COSTS_N_BYTES (10);
1778 8 : else if (op1val >= 66)
1779 : scost = COSTS_N_BYTES (11);
1780 : else
1781 : scost = COSTS_N_BYTES (9);
1782 :
1783 0 : if (op1val == 127)
1784 : vcost = COSTS_N_BYTES (10);
1785 9 : else if (op1val == 64)
1786 : vcost = COSTS_N_BYTES (14);
1787 9 : else if (op1val == 96)
1788 : vcost = COSTS_N_BYTES (18);
1789 9 : else if (op1val >= 111)
1790 : vcost = COSTS_N_BYTES (15);
1791 9 : else if (TARGET_AVX2 && op1val == 32)
1792 : vcost = COSTS_N_BYTES (16);
1793 9 : else if (TARGET_SSE4_1 && op1val == 32)
1794 : vcost = COSTS_N_BYTES (20);
1795 9 : else if (op1val >= 96)
1796 : vcost = COSTS_N_BYTES (23);
1797 9 : else if ((op1val & 7) == 0)
1798 : vcost = COSTS_N_BYTES (28);
1799 9 : else if (TARGET_AVX2 && op1val < 32)
1800 : vcost = COSTS_N_BYTES (30);
1801 9 : else if (op1val == 1 || op1val >= 64)
1802 : vcost = COSTS_N_BYTES (42);
1803 : else
1804 8 : vcost = COSTS_N_BYTES (47);
1805 : }
1806 : else
1807 : {
1808 114 : if (op1val >= 65 && op1val <= 126)
1809 : scost = COSTS_N_INSNS (3);
1810 : else
1811 114 : scost = COSTS_N_INSNS (2);
1812 :
1813 114 : if (op1val == 127)
1814 : vcost = COSTS_N_INSNS (2);
1815 113 : else if (op1val == 64)
1816 : vcost = COSTS_N_INSNS (3);
1817 113 : else if (op1val == 96)
1818 : vcost = COSTS_N_INSNS (3);
1819 113 : else if (op1val >= 111)
1820 : vcost = COSTS_N_INSNS (3);
1821 113 : else if (TARGET_SSE4_1 && op1val == 32)
1822 : vcost = COSTS_N_INSNS (3);
1823 113 : else if (TARGET_SSE4_1
1824 0 : && (op1val == 8 || op1val == 16 || op1val == 24))
1825 : vcost = COSTS_N_INSNS (3);
1826 113 : else if (op1val >= 96)
1827 : vcost = COSTS_N_INSNS (4);
1828 113 : else if (TARGET_SSE4_1 && (op1val == 28 || op1val == 80))
1829 : vcost = COSTS_N_INSNS (4);
1830 113 : else if ((op1val & 7) == 0)
1831 : vcost = COSTS_N_INSNS (5);
1832 113 : else if (TARGET_AVX2 && op1val < 32)
1833 : vcost = COSTS_N_INSNS (6);
1834 113 : else if (TARGET_SSE4_1 && op1val < 15)
1835 : vcost = COSTS_N_INSNS (6);
1836 113 : else if (op1val == 1 || op1val >= 64)
1837 : vcost = COSTS_N_INSNS (8);
1838 : else
1839 16 : vcost = COSTS_N_INSNS (9);
1840 : }
1841 123 : igain = scost - vcost;
1842 123 : break;
1843 :
1844 6 : case ROTATE:
1845 6 : case ROTATERT:
1846 : /* See ix86_expand_v1ti_rotate. */
1847 6 : op1val = INTVAL (XEXP (src, 1));
1848 6 : if (!speed_p)
1849 : {
1850 0 : scost = COSTS_N_BYTES (13);
1851 0 : if ((op1val & 31) == 0)
1852 : vcost = COSTS_N_BYTES (5);
1853 0 : else if ((op1val & 7) == 0)
1854 0 : vcost = TARGET_AVX ? COSTS_N_BYTES (13) : COSTS_N_BYTES (18);
1855 0 : else if (op1val > 32 && op1val < 96)
1856 : vcost = COSTS_N_BYTES (24);
1857 : else
1858 0 : vcost = COSTS_N_BYTES (19);
1859 : }
1860 : else
1861 : {
1862 6 : scost = COSTS_N_INSNS (3);
1863 6 : if ((op1val & 31) == 0)
1864 : vcost = COSTS_N_INSNS (1);
1865 4 : else if ((op1val & 7) == 0)
1866 1 : vcost = TARGET_AVX ? COSTS_N_INSNS (3) : COSTS_N_INSNS (4);
1867 3 : else if (op1val > 32 && op1val < 96)
1868 : vcost = COSTS_N_INSNS (5);
1869 : else
1870 3 : vcost = COSTS_N_INSNS (4);
1871 : }
1872 6 : igain = scost - vcost;
1873 6 : break;
1874 :
1875 19 : case COMPARE:
1876 19 : if (XEXP (src, 1) == const0_rtx)
1877 : {
1878 8 : if (GET_CODE (XEXP (src, 0)) == AND)
1879 : /* and;and;or (9 bytes) vs. ptest (5 bytes). */
1880 : igain = !speed_p ? COSTS_N_BYTES (4) : COSTS_N_INSNS (2);
1881 : /* or (3 bytes) vs. ptest (5 bytes). */
1882 8 : else if (!speed_p)
1883 0 : igain = -COSTS_N_BYTES (2);
1884 : }
/* NOTE(review): the comment below mentions "cmp -1" while the test is
   against const1_rtx -- confirm which constant is intended.  */
1885 11 : else if (XEXP (src, 1) == const1_rtx)
1886 : /* and;cmp -1 (7 bytes) vs. pcmpeqd;pxor;ptest (13 bytes). */
1887 0 : igain = !speed_p ? -COSTS_N_BYTES (6) : -COSTS_N_INSNS (1);
1888 : break;
1889 :
1890 260 : case ZERO_EXTEND:
1891 260 : if (GET_MODE (XEXP (src, 0)) == DImode)
1892 : /* xor (2 bytes) vs. vmovq (5 bytes). */
1893 260 : igain = speed_p ? COSTS_N_INSNS (1) - ix86_cost->sse_to_integer
1894 : : -COSTS_N_BYTES (3);
1895 : break;
1896 :
1897 : default:
1898 : break;
1899 : }
1900 :
/* Accumulate the plain total always; the frequency-weighted total only
   for blocks optimized for speed.  */
1901 1959692 : gain += igain;
1902 995514 : if (speed_p)
1903 964178 : weighted_gain += bb_freq * igain;
1904 :
1905 995522 : if (igain != 0 && dump_file)
1906 : {
1907 0 : fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for ",
1908 : igain, bb_freq.to_double ());
1909 0 : dump_insn_slim (dump_file, insn);
1910 : }
1911 : }
1912 :
1913 499867 : if (dump_file)
1914 0 : fprintf (dump_file, " Total gain: %d, weighted gain %.2f\n",
1915 : gain, weighted_gain.to_double ());
1916 :
/* A positive weighted gain decides on its own; otherwise fall back to
   the unweighted total.  */
1917 499867 : if (weighted_gain > (sreal) 0)
1918 : return true;
1919 : else
1920 54212 : return gain > 0;
1921 : }
1922 :
1923 : /* Fix uses of converted REG in debug insns.
Does nothing unless flag_var_tracking is set.  Otherwise walks the use
chain of REG and, in every debug insn, wraps each V1TImode register
location of that chain in a TImode SUBREG.  A debug insn changed in
this way is rescanned with df_insn_rescan.  */
1924 :
1925 : void
1926 419412 : timode_scalar_chain::fix_debug_reg_uses (rtx reg)
1927 : {
1928 419412 : if (!flag_var_tracking)
1929 : return;
1930 :
1931 370692 : df_ref ref, next;
1932 760135 : for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
1933 : {
1934 389443 : rtx_insn *insn = DF_REF_INSN (ref);
1935 : /* Make sure the next ref is for a different instruction,
1936 : so that we're not affected by the rescan. */
1937 389443 : next = DF_REF_NEXT_REG (ref);
1938 389443 : while (next && DF_REF_INSN (next) == insn)
1939 0 : next = DF_REF_NEXT_REG (next);
1940 :
1941 389443 : if (DEBUG_INSN_P (insn))
1942 : {
1943 : /* It may be a debug insn with a TImode variable in
1944 : register. */
1945 : bool changed = false;
/* Visit every ref of this insn, i.e. all refs up to NEXT.  */
1946 228 : for (; ref != next; ref = DF_REF_NEXT_REG (ref))
1947 : {
1948 114 : rtx *loc = DF_REF_LOC (ref);
1949 114 : if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
1950 : {
1951 105 : *loc = gen_rtx_SUBREG (TImode, *loc, 0);
1952 105 : changed = true;
1953 : }
1954 : }
1955 114 : if (changed)
1956 105 : df_insn_rescan (insn);
1957 : }
1958 : }
1959 : }
1960 :
1961 : /* Convert SRC, a *concatditi3 pattern, into a vec_concatv2di instruction.
1962 : Insert this before INSN, and return the result as a V1TImode subreg.
The two operands of SRC may appear in either order.  HI is taken from
beneath whichever operand is the ASHIFT, and LO from beneath the other
operand.  */
1963 :
1964 : static rtx
1965 266 : timode_convert_concatdi (rtx src, rtx_insn *insn)
1966 : {
1967 266 : rtx hi, lo;
1968 266 : rtx tmp = gen_reg_rtx (V2DImode);
1969 266 : if (GET_CODE (XEXP (src, 0)) == ASHIFT)
1970 : {
1971 266 : hi = XEXP (XEXP (XEXP (src, 0), 0), 0);
1972 266 : lo = XEXP (XEXP (src, 1), 0);
1973 : }
1974 : else
1975 : {
1976 0 : hi = XEXP (XEXP (XEXP (src, 1), 0), 0);
1977 0 : lo = XEXP (XEXP (src, 0), 0);
1978 : }
/* LO is passed first and HI second to gen_vec_concatv2di.  */
1979 266 : emit_insn_before (gen_vec_concatv2di (tmp, lo, hi), insn);
1980 266 : return gen_rtx_SUBREG (V1TImode, tmp, 0);
1981 : }
1982 :
1983 : /* Convert INSN from TImode to V1T1mode. */
1984 :
1985 : void
1986 907085 : timode_scalar_chain::convert_insn (rtx_insn *insn)
1987 : {
1988 907085 : rtx def_set = single_set (insn);
1989 907085 : rtx src = SET_SRC (def_set);
1990 907085 : rtx dst = SET_DEST (def_set);
1991 907085 : rtx tmp;
1992 :
1993 907085 : switch (GET_CODE (dst))
1994 : {
1995 419929 : case REG:
1996 419929 : if (GET_MODE (dst) == TImode)
1997 : {
1998 419300 : if (!HARD_REGISTER_NUM_P (REGNO (dst)))
1999 : {
2000 418795 : PUT_MODE (dst, V1TImode);
2001 418795 : fix_debug_reg_uses (dst);
2002 : }
2003 505 : else if (!GENERAL_REGNO_P (REGNO (dst)))
2004 359 : dst = gen_raw_REG (V1TImode, REGNO (dst));
2005 : }
2006 419929 : if (GET_MODE (dst) == V1TImode)
2007 : {
2008 : /* It might potentially be helpful to convert REG_EQUAL notes,
2009 : but for now we just remove them. */
2010 419771 : rtx note = find_reg_equal_equiv_note (insn);
2011 419771 : if (note)
2012 470 : remove_note (insn, note);
2013 : }
2014 : break;
2015 487156 : case MEM:
2016 487156 : PUT_MODE (dst, V1TImode);
2017 487156 : break;
2018 :
2019 0 : default:
2020 0 : gcc_unreachable ();
2021 : }
2022 :
2023 907085 : switch (GET_CODE (src))
2024 : {
2025 448038 : case REG:
2026 448038 : if (GET_MODE (src) == TImode)
2027 : {
2028 823 : if (GENERAL_REGNO_P (REGNO (src)))
2029 : {
2030 201 : rtx lo = gen_reg_rtx (DImode);
2031 201 : rtx hi = gen_reg_rtx (DImode);
2032 201 : emit_insn_before (gen_rtx_SET (lo, gen_lowpart (DImode, src)),
2033 : insn);
2034 201 : emit_insn_before (gen_rtx_SET (hi, gen_highpart (DImode, src)),
2035 : insn);
2036 201 : src = gen_reg_rtx (V2DImode);
2037 201 : emit_insn_before (gen_vec_concatv2di (src, lo, hi), insn);
2038 201 : src = gen_lowpart (V1TImode, src);
2039 : }
2040 622 : else if (!HARD_REGISTER_NUM_P (REGNO (src)))
2041 : {
2042 617 : PUT_MODE (src, V1TImode);
2043 617 : fix_debug_reg_uses (src);
2044 : }
2045 : else
2046 5 : src = gen_raw_REG (V1TImode, REGNO (src));
2047 : }
2048 448038 : if (GENERAL_REG_P (dst))
2049 : {
2050 146 : rtx tmp = gen_reg_rtx (V2DImode);
2051 146 : src = gen_lowpart (V2DImode, src);
2052 146 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2053 : /* Extracting hi before lo helps register allocation. */
2054 146 : rtx hi = gen_reg_rtx (DImode);
2055 146 : rtx lo = gen_reg_rtx (DImode);
2056 146 : emit_insn_before (gen_vec_extractv2didi (hi, tmp, const1_rtx), insn);
2057 146 : emit_insn_before (gen_vec_extractv2didi (lo, tmp, const0_rtx), insn);
2058 :
2059 : /* Construct *concatditi3 pattern from lo and hi. */
2060 146 : hi = gen_rtx_ZERO_EXTEND (TImode, hi);
2061 146 : hi = gen_rtx_ASHIFT (TImode, hi, GEN_INT (64));
2062 146 : lo = gen_rtx_ZERO_EXTEND (TImode, lo);
2063 146 : src = gen_rtx_PLUS (TImode, hi, lo);
2064 : }
2065 : break;
2066 :
2067 417759 : case MEM:
2068 417759 : PUT_MODE (src, V1TImode);
2069 417759 : break;
2070 :
2071 30182 : case CONST_WIDE_INT:
2072 30182 : if (NONDEBUG_INSN_P (insn))
2073 : {
2074 : /* Since there are no instructions to store 128-bit constant,
2075 : temporary register usage is required. */
2076 30182 : bool use_move;
2077 30182 : start_sequence ();
2078 30182 : tmp = ix86_convert_const_wide_int_to_broadcast (TImode, src);
2079 30182 : if (tmp)
2080 : {
2081 194 : src = lowpart_subreg (V1TImode, tmp, TImode);
2082 194 : use_move = true;
2083 : }
2084 : else
2085 : {
2086 29988 : src = smode_convert_cst (src, V1TImode);
2087 29988 : src = validize_mem (force_const_mem (V1TImode, src));
2088 29988 : use_move = MEM_P (dst);
2089 : }
2090 30182 : rtx_insn *seq = end_sequence ();
2091 30182 : if (seq)
2092 195 : emit_insn_before (seq, insn);
2093 30182 : if (use_move)
2094 : {
2095 29879 : tmp = gen_reg_rtx (V1TImode);
2096 29879 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2097 29879 : src = tmp;
2098 : }
2099 : }
2100 : break;
2101 :
2102 10593 : case CONST_INT:
2103 10593 : switch (standard_sse_constant_p (src, TImode))
2104 : {
2105 10370 : case 1:
2106 10370 : src = CONST0_RTX (GET_MODE (dst));
2107 10370 : break;
2108 223 : case 2:
2109 223 : src = CONSTM1_RTX (GET_MODE (dst));
2110 223 : break;
2111 0 : default:
2112 0 : gcc_unreachable ();
2113 : }
2114 10593 : if (MEM_P (dst))
2115 : {
2116 10058 : tmp = gen_reg_rtx (V1TImode);
2117 10058 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2118 10058 : src = tmp;
2119 : }
2120 : break;
2121 :
2122 13 : case AND:
2123 13 : if (GET_CODE (XEXP (src, 0)) == NOT)
2124 : {
2125 0 : convert_op (&XEXP (XEXP (src, 0), 0), insn);
2126 0 : convert_op (&XEXP (src, 1), insn);
2127 0 : PUT_MODE (XEXP (src, 0), V1TImode);
2128 0 : PUT_MODE (src, V1TImode);
2129 0 : break;
2130 : }
2131 13 : convert_op (&XEXP (src, 0), insn);
2132 13 : convert_op (&XEXP (src, 1), insn);
2133 13 : PUT_MODE (src, V1TImode);
2134 13 : if (MEM_P (dst))
2135 : {
2136 10 : tmp = gen_reg_rtx (V1TImode);
2137 10 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2138 10 : src = tmp;
2139 : }
2140 : break;
2141 :
2142 343 : case XOR:
2143 343 : case IOR:
2144 343 : if (timode_concatdi_p (src))
2145 : {
2146 266 : src = timode_convert_concatdi (src, insn);
2147 266 : break;
2148 : }
2149 77 : convert_op (&XEXP (src, 0), insn);
2150 77 : convert_op (&XEXP (src, 1), insn);
2151 77 : PUT_MODE (src, V1TImode);
2152 77 : if (MEM_P (dst))
2153 : {
2154 8 : tmp = gen_reg_rtx (V1TImode);
2155 8 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2156 8 : src = tmp;
2157 : }
2158 : break;
2159 :
2160 3 : case NOT:
2161 3 : src = XEXP (src, 0);
2162 3 : convert_op (&src, insn);
2163 3 : tmp = gen_reg_rtx (V1TImode);
2164 3 : emit_insn_before (gen_move_insn (tmp, CONSTM1_RTX (V1TImode)), insn);
2165 3 : src = gen_rtx_XOR (V1TImode, src, tmp);
2166 3 : if (MEM_P (dst))
2167 : {
2168 0 : tmp = gen_reg_rtx (V1TImode);
2169 0 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2170 0 : src = tmp;
2171 : }
2172 : break;
2173 :
2174 12 : case COMPARE:
2175 12 : dst = gen_rtx_REG (CCZmode, FLAGS_REG);
2176 12 : src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
2177 12 : break;
2178 :
2179 43 : case ASHIFT:
2180 43 : case LSHIFTRT:
2181 43 : case ASHIFTRT:
2182 43 : case ROTATERT:
2183 43 : case ROTATE:
2184 43 : convert_op (&XEXP (src, 0), insn);
2185 43 : PUT_MODE (src, V1TImode);
2186 43 : break;
2187 :
2188 99 : case ZERO_EXTEND:
2189 99 : if (GET_MODE (XEXP (src, 0)) == DImode)
2190 : {
2191 : /* Convert to *vec_concatv2di_0. */
2192 99 : rtx tmp = gen_reg_rtx (V2DImode);
2193 99 : rtx pat = gen_rtx_VEC_CONCAT (V2DImode, XEXP (src, 0), const0_rtx);
2194 99 : emit_insn_before (gen_move_insn (tmp, pat), insn);
2195 99 : src = gen_rtx_SUBREG (vmode, tmp, 0);
2196 : }
2197 : else
2198 0 : gcc_unreachable ();
2199 99 : break;
2200 :
2201 0 : case PLUS:
2202 0 : if (timode_concatdi_p (src))
2203 0 : src = timode_convert_concatdi (src, insn);
2204 : else
2205 0 : gcc_unreachable ();
2206 0 : break;
2207 :
2208 0 : default:
2209 0 : gcc_unreachable ();
2210 : }
2211 :
2212 907085 : SET_SRC (def_set) = src;
2213 907085 : SET_DEST (def_set) = dst;
2214 :
2215 : /* Drop possible dead definitions. */
2216 907085 : PATTERN (insn) = def_set;
2217 :
2218 907085 : INSN_CODE (insn) = -1;
2219 907085 : recog_memoized (insn);
2220 907085 : df_insn_rescan (insn);
2221 907085 : }
2222 :
2223 : /* Generate copies from defs used by the chain but not defined therein.
2224 : Also populates defs_map which is used later by convert_insn. */
2225 :
2226 : void
2227 631957 : scalar_chain::convert_registers ()
2228 : {
2229 631957 : bitmap_iterator bi;
2230 631957 : unsigned id;
2231 658127 : EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
2232 : {
2233 26170 : rtx chain_reg = gen_reg_rtx (smode);
2234 26170 : defs_map.put (regno_reg_rtx[id], chain_reg);
2235 : }
2236 640371 : EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
2237 21067 : for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
2238 12653 : if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
2239 8414 : make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
2240 631957 : }
2241 :
2242 : /* Convert whole chain creating required register
2243 : conversions and copies. */
2244 :
2245 : int
2246 631957 : scalar_chain::convert ()
2247 : {
2248 631957 : bitmap_iterator bi;
2249 631957 : unsigned id;
2250 631957 : int converted_insns = 0;
2251 :
2252 631957 : if (!dbg_cnt (stv_conversion))
2253 : return 0;
2254 :
2255 631957 : if (dump_file)
2256 0 : fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2257 :
2258 631957 : convert_registers ();
2259 :
2260 1950658 : EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2261 : {
2262 1318701 : rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
2263 1318701 : convert_insn_common (insn);
2264 1318701 : convert_insn (insn);
2265 1318701 : converted_insns++;
2266 : }
2267 :
2268 : return converted_insns;
2269 : }
2270 :
2271 : /* Return the SET expression if INSN doesn't reference hard register.
2272 : Return NULL if INSN uses or defines a hard register, excluding
2273 : pseudo register pushes, hard register uses in a memory address,
2274 : clobbers and flags definitions. */
2275 :
2276 : static rtx
2277 338790339 : pseudo_reg_set (rtx_insn *insn)
2278 : {
2279 338790339 : rtx set = single_set (insn);
2280 338790339 : if (!set)
2281 : return NULL;
2282 :
2283 : /* Check pseudo register push first. */
2284 134994096 : machine_mode mode = TARGET_64BIT ? TImode : DImode;
2285 134994096 : if (REG_P (SET_SRC (set))
2286 38213537 : && !HARD_REGISTER_P (SET_SRC (set))
2287 164804080 : && push_operand (SET_DEST (set), mode))
2288 : return set;
2289 :
2290 134740953 : df_ref ref;
2291 218062626 : FOR_EACH_INSN_DEF (ref, insn)
2292 120178038 : if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2293 64697182 : && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2294 170433143 : && DF_REF_REGNO (ref) != FLAGS_REG)
2295 : return NULL;
2296 :
2297 187740355 : FOR_EACH_INSN_USE (ref, insn)
2298 115178667 : if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2299 : return NULL;
2300 :
2301 : return set;
2302 : }
2303 :
2304 : /* Return true if the register REG is defined in a single DEF chain.
2305 : If it is defined in more than one DEF chains, we may not be able
2306 : to convert it in all chains. */
2307 :
2308 : static bool
2309 1240630 : single_def_chain_p (rtx reg)
2310 : {
2311 1240630 : df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
2312 1240630 : if (!ref)
2313 : return false;
2314 1240610 : return DF_REF_NEXT_REG (ref) == nullptr;
2315 : }
2316 :
2317 : /* Check if comparison INSN may be transformed into vector comparison.
2318 : Currently we transform equality/inequality checks which look like:
2319 : (set (reg:CCZ 17 flags) (compare:CCZ (reg:TI x) (reg:TI y))) */
2320 :
2321 : static bool
2322 12785335 : convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
2323 : {
2324 14195677 : if (mode != (TARGET_64BIT ? TImode : DImode))
2325 : return false;
2326 :
2327 4672554 : if (!TARGET_SSE4_1)
2328 : return false;
2329 :
2330 164004 : rtx def_set = single_set (insn);
2331 :
2332 164004 : gcc_assert (def_set);
2333 :
2334 164004 : rtx src = SET_SRC (def_set);
2335 164004 : rtx dst = SET_DEST (def_set);
2336 :
2337 164004 : gcc_assert (GET_CODE (src) == COMPARE);
2338 :
2339 164004 : if (!REG_P (dst)
2340 164004 : || REGNO (dst) != FLAGS_REG
2341 328008 : || GET_MODE (dst) != CCZmode)
2342 : return false;
2343 :
2344 114364 : rtx op1 = XEXP (src, 0);
2345 114364 : rtx op2 = XEXP (src, 1);
2346 :
2347 : /* *cmp<dwi>_doubleword. */
2348 114364 : if (general_operand (op1, mode)
2349 114364 : && general_operand (op2, mode))
2350 : return true;
2351 :
2352 : /* *testti_doubleword. */
2353 114308 : if (op2 == const0_rtx
2354 38179 : && GET_CODE (op1) == AND
2355 142 : && REG_P (XEXP (op1, 0)))
2356 : {
2357 142 : rtx op12 = XEXP (op1, 1);
2358 142 : return GET_MODE (XEXP (op1, 0)) == TImode
2359 142 : && (CONST_SCALAR_INT_P (op12)
2360 0 : || ((REG_P (op12) || MEM_P (op12))
2361 0 : && GET_MODE (op12) == TImode));
2362 : }
2363 :
2364 : /* *test<dwi>_not_doubleword. */
2365 114166 : if (op2 == const0_rtx
2366 38037 : && GET_CODE (op1) == AND
2367 0 : && GET_CODE (XEXP (op1, 0)) == NOT)
2368 : {
2369 0 : rtx op11 = XEXP (XEXP (op1, 0), 0);
2370 0 : rtx op12 = XEXP (op1, 1);
2371 0 : return (REG_P (op11) || MEM_P (op11))
2372 0 : && (REG_P (op12) || MEM_P (op12))
2373 0 : && GET_MODE (op11) == mode
2374 0 : && GET_MODE (op12) == mode;
2375 : }
2376 :
2377 : return false;
2378 : }
2379 :
2380 : /* The general version of scalar_to_vector_candidate_p. */
2381 :
2382 : static bool
2383 236964414 : general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
2384 : {
2385 236964414 : rtx def_set = pseudo_reg_set (insn);
2386 :
2387 236964414 : if (!def_set)
2388 : return false;
2389 :
2390 49246740 : rtx src = SET_SRC (def_set);
2391 49246740 : rtx dst = SET_DEST (def_set);
2392 :
2393 49246740 : if (GET_CODE (src) == COMPARE)
2394 8817952 : return convertible_comparison_p (insn, mode);
2395 :
2396 : /* We are interested in "mode" only. */
2397 40428788 : if ((GET_MODE (src) != mode
2398 27618172 : && !CONST_INT_P (src))
2399 17909006 : || GET_MODE (dst) != mode)
2400 : return false;
2401 :
2402 15069205 : if (!REG_P (dst) && !MEM_P (dst))
2403 : return false;
2404 :
2405 14812226 : switch (GET_CODE (src))
2406 : {
2407 530461 : case ASHIFT:
2408 530461 : case LSHIFTRT:
2409 530461 : case ASHIFTRT:
2410 530461 : case ROTATE:
2411 530461 : case ROTATERT:
2412 530461 : if (!CONST_INT_P (XEXP (src, 1))
2413 1024985 : || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
2414 : return false;
2415 :
2416 : /* Check for extend highpart case. */
2417 494520 : if (mode != DImode
2418 361009 : || GET_CODE (src) != ASHIFTRT
2419 81904 : || GET_CODE (XEXP (src, 0)) != ASHIFT)
2420 : break;
2421 :
2422 3682276 : src = XEXP (src, 0);
2423 : break;
2424 :
2425 87166 : case SMAX:
2426 87166 : case SMIN:
2427 87166 : case UMAX:
2428 87166 : case UMIN:
2429 87166 : if ((mode == DImode && !TARGET_AVX512VL)
2430 17860 : || (mode == SImode && !TARGET_SSE4_1))
2431 : return false;
2432 : /* Fallthru. */
2433 :
2434 3226132 : case AND:
2435 3226132 : case IOR:
2436 3226132 : case XOR:
2437 3226132 : case PLUS:
2438 3226132 : case MINUS:
2439 3226132 : if (!REG_P (XEXP (src, 1))
2440 : && !MEM_P (XEXP (src, 1))
2441 : && !CONST_INT_P (XEXP (src, 1)))
2442 : return false;
2443 :
2444 3135024 : if (GET_MODE (XEXP (src, 1)) != mode
2445 1826635 : && !CONST_INT_P (XEXP (src, 1)))
2446 : return false;
2447 :
2448 : /* Check for andnot case. */
2449 3135024 : if (GET_CODE (src) != AND
2450 177746 : || GET_CODE (XEXP (src, 0)) != NOT)
2451 : break;
2452 :
2453 3682276 : src = XEXP (src, 0);
2454 : /* FALLTHRU */
2455 :
2456 : case NOT:
2457 : break;
2458 :
2459 24839 : case NEG:
2460 : /* Check for nabs case. */
2461 24839 : if (GET_CODE (XEXP (src, 0)) != ABS)
2462 : break;
2463 :
2464 : src = XEXP (src, 0);
2465 : /* FALLTHRU */
2466 :
2467 3793 : case ABS:
2468 3793 : if ((mode == DImode && !TARGET_AVX512VL)
2469 1431 : || (mode == SImode && !TARGET_SSSE3))
2470 : return false;
2471 : break;
2472 :
2473 : case REG:
2474 : return true;
2475 :
2476 5962011 : case MEM:
2477 5962011 : case CONST_INT:
2478 5962011 : return REG_P (dst);
2479 :
2480 57285 : case VEC_SELECT:
2481 : /* Excluding MEM_P (dst) avoids intefering with vpextr[dq]. */
2482 57285 : return REG_P (dst)
2483 46815 : && REG_P (XEXP (src, 0))
2484 53404 : && GET_MODE (XEXP (src, 0)) == (mode == DImode ? V2DImode
2485 : : V4SImode)
2486 36964 : && GET_CODE (XEXP (src, 1)) == PARALLEL
2487 36964 : && XVECLEN (XEXP (src, 1), 0) == 1
2488 94249 : && CONST_INT_P (XVECEXP (XEXP (src, 1), 0, 0));
2489 :
2490 : default:
2491 : return false;
2492 : }
2493 :
2494 3682276 : if (!REG_P (XEXP (src, 0))
2495 : && !MEM_P (XEXP (src, 0))
2496 : && !CONST_INT_P (XEXP (src, 0)))
2497 : return false;
2498 :
2499 3376138 : if (GET_MODE (XEXP (src, 0)) != mode
2500 0 : && !CONST_INT_P (XEXP (src, 0)))
2501 : return false;
2502 :
2503 : return true;
2504 : }
2505 :
2506 : /* Check for a suitable TImode memory operand. */
2507 :
2508 : static bool
2509 1582 : timode_mem_p (rtx x)
2510 : {
2511 1582 : return MEM_P (x)
2512 1582 : && (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
2513 0 : || !misaligned_operand (x, TImode));
2514 : }
2515 :
2516 : /* The TImode version of scalar_to_vector_candidate_p. */
2517 :
2518 : static bool
2519 101825925 : timode_scalar_to_vector_candidate_p (rtx_insn *insn)
2520 : {
2521 101825925 : rtx def_set = pseudo_reg_set (insn);
2522 :
2523 : /* We allow two exceptions to the pseudo registers only rule.
2524 : Setting a hard register from a pseudo, and setting a pseudo
2525 : from a hard register. */
2526 101825925 : if (!def_set)
2527 : {
2528 78257834 : def_set = single_set (insn);
2529 78257834 : if (def_set)
2530 : {
2531 17876965 : rtx src = SET_SRC (def_set);
2532 17876965 : rtx dst = SET_DEST (def_set);
2533 17876965 : if (GET_MODE (dst) == TImode
2534 220539 : && REG_P (src) && REG_P (dst))
2535 : {
2536 101506 : if (HARD_REGISTER_P (dst)
2537 52167 : && !HARD_REGISTER_P (src)
2538 153673 : && single_def_chain_p (src))
2539 : return true;
2540 72881 : if (HARD_REGISTER_P (src)
2541 49339 : && !HARD_REGISTER_P (dst)
2542 122220 : && single_def_chain_p (dst))
2543 : return true;
2544 : }
2545 : }
2546 : return false;
2547 : }
2548 :
2549 23568091 : rtx src = SET_SRC (def_set);
2550 23568091 : rtx dst = SET_DEST (def_set);
2551 :
2552 23568091 : if (GET_CODE (src) == COMPARE)
2553 3967383 : return convertible_comparison_p (insn, TImode);
2554 :
2555 19600708 : if (GET_MODE (dst) != TImode
2556 1182149 : || (GET_MODE (src) != TImode
2557 59090 : && !CONST_SCALAR_INT_P (src)))
2558 : return false;
2559 :
2560 1182149 : if (!REG_P (dst) && !MEM_P (dst))
2561 : return false;
2562 :
2563 1180696 : if (MEM_P (dst)
2564 523673 : && misaligned_operand (dst, TImode)
2565 1487671 : && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
2566 : return false;
2567 :
2568 1180691 : if (REG_P (dst) && !single_def_chain_p (dst))
2569 : return false;
2570 :
2571 1029417 : switch (GET_CODE (src))
2572 : {
2573 482101 : case REG:
2574 482101 : return single_def_chain_p (src);
2575 :
2576 : case CONST_WIDE_INT:
2577 : return true;
2578 :
2579 12482 : case CONST_INT:
2580 : /* ??? Verify performance impact before enabling CONST_INT for
2581 : __int128 store. */
2582 12482 : return standard_sse_constant_p (src, TImode);
2583 :
2584 439787 : case MEM:
2585 : /* Memory must be aligned or unaligned load is optimal. */
2586 439787 : return (REG_P (dst)
2587 439787 : && (!misaligned_operand (src, TImode)
2588 141320 : || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
2589 :
2590 3109 : case AND:
2591 3109 : if (!MEM_P (dst)
2592 3068 : && GET_CODE (XEXP (src, 0)) == NOT
2593 0 : && REG_P (XEXP (XEXP (src, 0), 0))
2594 3109 : && (REG_P (XEXP (src, 1))
2595 0 : || CONST_SCALAR_INT_P (XEXP (src, 1))
2596 0 : || timode_mem_p (XEXP (src, 1))))
2597 0 : return true;
2598 3109 : return (REG_P (XEXP (src, 0))
2599 46 : || timode_mem_p (XEXP (src, 0)))
2600 3155 : && (REG_P (XEXP (src, 1))
2601 1280 : || CONST_SCALAR_INT_P (XEXP (src, 1))
2602 35 : || timode_mem_p (XEXP (src, 1)));
2603 :
2604 14048 : case IOR:
2605 14048 : case XOR:
2606 14048 : if (timode_concatdi_p (src))
2607 : return true;
2608 2722 : return (REG_P (XEXP (src, 0))
2609 1431 : || timode_mem_p (XEXP (src, 0)))
2610 2739 : && (REG_P (XEXP (src, 1))
2611 290 : || CONST_SCALAR_INT_P (XEXP (src, 1))
2612 54 : || timode_mem_p (XEXP (src, 1)));
2613 :
2614 509 : case NOT:
2615 509 : return REG_P (XEXP (src, 0)) || timode_mem_p (XEXP (src, 0));
2616 :
2617 11664 : case ASHIFT:
2618 11664 : case LSHIFTRT:
2619 11664 : case ASHIFTRT:
2620 11664 : case ROTATERT:
2621 11664 : case ROTATE:
2622 : /* Handle shifts/rotates by integer constants between 0 and 127. */
2623 11664 : return REG_P (XEXP (src, 0))
2624 11632 : && CONST_INT_P (XEXP (src, 1))
2625 22936 : && (INTVAL (XEXP (src, 1)) & ~0x7f) == 0;
2626 :
2627 7017 : case PLUS:
2628 7017 : return timode_concatdi_p (src);
2629 :
2630 3751 : case ZERO_EXTEND:
2631 3751 : return REG_P (XEXP (src, 0))
2632 3751 : && GET_MODE (XEXP (src, 0)) == DImode;
2633 :
2634 : default:
2635 : return false;
2636 : }
2637 : }
2638 :
2639 : /* For a register REGNO, scan instructions for its defs and uses.
2640 : Put REGNO in REGS if a def or use isn't in CANDIDATES. */
2641 :
2642 : static void
2643 1221400 : timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
2644 : unsigned int regno)
2645 : {
2646 : /* Do nothing if REGNO is already in REGS or is a hard reg. */
2647 1221400 : if (bitmap_bit_p (regs, regno)
2648 1221400 : || HARD_REGISTER_NUM_P (regno))
2649 : return;
2650 :
2651 1213358 : for (df_ref def = DF_REG_DEF_CHAIN (regno);
2652 2416377 : def;
2653 1203019 : def = DF_REF_NEXT_REG (def))
2654 : {
2655 1213338 : if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2656 : {
2657 10319 : if (dump_file)
2658 0 : fprintf (dump_file,
2659 : "r%d has non convertible def in insn %d\n",
2660 0 : regno, DF_REF_INSN_UID (def));
2661 :
2662 10319 : bitmap_set_bit (regs, regno);
2663 10319 : break;
2664 : }
2665 : }
2666 :
2667 1213358 : for (df_ref ref = DF_REG_USE_CHAIN (regno);
2668 2687554 : ref;
2669 1474196 : ref = DF_REF_NEXT_REG (ref))
2670 : {
2671 : /* Debug instructions are skipped. */
2672 1543826 : if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
2673 1543826 : && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
2674 : {
2675 69630 : if (dump_file)
2676 0 : fprintf (dump_file,
2677 : "r%d has non convertible use in insn %d\n",
2678 0 : regno, DF_REF_INSN_UID (ref));
2679 :
2680 69630 : bitmap_set_bit (regs, regno);
2681 69630 : break;
2682 : }
2683 : }
2684 : }
2685 :
2686 : /* For a given bitmap of insn UIDs scans all instructions and
2687 : remove insn from CANDIDATES in case it has both convertible
2688 : and not convertible definitions.
2689 :
2690 : All insns in a bitmap are conversion candidates according to
2691 : scalar_to_vector_candidate_p. Currently it implies all insns
2692 : are single_set. */
2693 :
2694 : static void
2695 834120 : timode_remove_non_convertible_regs (bitmap candidates)
2696 : {
2697 834120 : bitmap_iterator bi;
2698 834120 : unsigned id;
2699 834120 : bitmap regs = BITMAP_ALLOC (NULL);
2700 855231 : bool changed;
2701 :
2702 855231 : do {
2703 855231 : changed = false;
2704 2099843 : EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
2705 : {
2706 1244612 : rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
2707 1244612 : df_ref ref;
2708 :
2709 1946193 : FOR_EACH_INSN_DEF (ref, insn)
2710 701581 : if (!DF_REF_REG_MEM_P (ref)
2711 701581 : && GET_MODE (DF_REF_REG (ref)) == TImode)
2712 613896 : timode_check_non_convertible_regs (candidates, regs,
2713 : DF_REF_REGNO (ref));
2714 :
2715 3069629 : FOR_EACH_INSN_USE (ref, insn)
2716 1825017 : if (DF_REF_TYPE (ref) == DF_REF_REG_USE
2717 743614 : && GET_MODE (DF_REF_REG (ref)) == TImode
2718 607509 : && !SUBREG_P (DF_REF_REG (ref)))
2719 607504 : timode_check_non_convertible_regs (candidates, regs,
2720 : DF_REF_REGNO (ref));
2721 : }
2722 :
2723 1030662 : EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
2724 : {
2725 175431 : for (df_ref def = DF_REG_DEF_CHAIN (id);
2726 356686 : def;
2727 181255 : def = DF_REF_NEXT_REG (def))
2728 181255 : if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2729 : {
2730 56075 : if (dump_file)
2731 0 : fprintf (dump_file, "Removing insn %d from candidates list\n",
2732 0 : DF_REF_INSN_UID (def));
2733 :
2734 56075 : bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
2735 56075 : changed = true;
2736 : }
2737 :
2738 175431 : for (df_ref ref = DF_REG_USE_CHAIN (id);
2739 521357 : ref;
2740 345926 : ref = DF_REF_NEXT_REG (ref))
2741 345926 : if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
2742 : {
2743 16235 : if (dump_file)
2744 0 : fprintf (dump_file, "Removing insn %d from candidates list\n",
2745 0 : DF_REF_INSN_UID (ref));
2746 :
2747 16235 : bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
2748 16235 : changed = true;
2749 : }
2750 : }
2751 : } while (changed);
2752 :
2753 834120 : BITMAP_FREE (regs);
2754 834120 : }
2755 :
2756 : /* Main STV pass function. Find and convert scalar
2757 : instructions into vector mode when profitable. */
2758 :
2759 : static unsigned int
2760 1794454 : convert_scalars_to_vector (bool timode_p)
2761 : {
2762 1794454 : basic_block bb;
2763 1794454 : int converted_insns = 0;
2764 1794454 : auto_vec<rtx_insn *> control_flow_insns;
2765 :
2766 1794454 : bitmap_obstack_initialize (NULL);
2767 1794454 : const machine_mode cand_mode[3] = { SImode, DImode, TImode };
2768 1794454 : const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
2769 5383362 : bitmap_head candidates[3]; /* { SImode, DImode, TImode } */
2770 7177816 : for (unsigned i = 0; i < 3; ++i)
2771 5383362 : bitmap_initialize (&candidates[i], &bitmap_default_obstack);
2772 :
2773 1794454 : calculate_dominance_info (CDI_DOMINATORS);
2774 1794454 : df_set_flags (DF_DEFER_INSN_RESCAN | DF_RD_PRUNE_DEAD_DEFS);
2775 1794454 : df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2776 1794454 : df_analyze ();
2777 :
2778 : /* Find all instructions we want to convert into vector mode. */
2779 1794454 : if (dump_file)
2780 44 : fprintf (dump_file, "Searching for mode conversion candidates...\n");
2781 :
2782 19693860 : FOR_EACH_BB_FN (bb, cfun)
2783 : {
2784 17899406 : rtx_insn *insn;
2785 239966144 : FOR_BB_INSNS (bb, insn)
2786 222066738 : if (timode_p
2787 222066738 : && timode_scalar_to_vector_candidate_p (insn))
2788 : {
2789 1067832 : if (dump_file)
2790 0 : fprintf (dump_file, " insn %d is marked as a TImode candidate\n",
2791 0 : INSN_UID (insn));
2792 :
2793 1067832 : bitmap_set_bit (&candidates[2], INSN_UID (insn));
2794 : }
2795 220998906 : else if (!timode_p)
2796 : {
2797 : /* Check {SI,DI}mode. */
2798 345603812 : for (unsigned i = 0; i <= 1; ++i)
2799 236964414 : if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
2800 : {
2801 11601415 : if (dump_file)
2802 554 : fprintf (dump_file, " insn %d is marked as a %s candidate\n",
2803 277 : INSN_UID (insn), i == 0 ? "SImode" : "DImode");
2804 :
2805 11601415 : bitmap_set_bit (&candidates[i], INSN_UID (insn));
2806 11601415 : break;
2807 : }
2808 : }
2809 : }
2810 :
2811 1794454 : if (timode_p)
2812 834120 : timode_remove_non_convertible_regs (&candidates[2]);
2813 :
2814 5685229 : for (unsigned i = 0; i <= 2; ++i)
2815 4530779 : if (!bitmap_empty_p (&candidates[i]))
2816 : break;
2817 3890775 : else if (i == 2 && dump_file)
2818 23 : fprintf (dump_file, "There are no candidates for optimization.\n");
2819 :
2820 7177816 : for (unsigned i = 0; i <= 2; ++i)
2821 : {
2822 5383362 : auto_bitmap disallowed;
2823 5383362 : bitmap_tree_view (&candidates[i]);
2824 17155835 : while (!bitmap_empty_p (&candidates[i]))
2825 : {
2826 6389111 : unsigned uid = bitmap_first_set_bit (&candidates[i]);
2827 6389111 : scalar_chain *chain;
2828 :
2829 6389111 : if (cand_mode[i] == TImode)
2830 499867 : chain = new timode_scalar_chain;
2831 : else
2832 5889244 : chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);
2833 :
2834 : /* Find instructions chain we want to convert to vector mode.
2835 : Check all uses and definitions to estimate all required
2836 : conversions. */
2837 6389111 : if (chain->build (&candidates[i], uid, disallowed))
2838 : {
2839 6385242 : if (chain->compute_convert_gain ())
2840 631957 : converted_insns += chain->convert ();
2841 5753285 : else if (dump_file)
2842 136 : fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2843 : chain->chain_id);
2844 : }
2845 :
2846 6389111 : rtx_insn* iter_insn;
2847 6389111 : unsigned int ii;
2848 6392699 : FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
2849 3588 : control_flow_insns.safe_push (iter_insn);
2850 :
2851 6389111 : delete chain;
2852 : }
2853 5383362 : }
2854 :
2855 1794454 : if (dump_file)
2856 44 : fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2857 :
2858 7177816 : for (unsigned i = 0; i <= 2; ++i)
2859 5383362 : bitmap_release (&candidates[i]);
2860 1794454 : bitmap_obstack_release (NULL);
2861 1794454 : df_process_deferred_rescans ();
2862 :
2863 : /* Conversion means we may have 128bit register spills/fills
2864 : which require aligned stack. */
2865 1794454 : if (converted_insns)
2866 : {
2867 104035 : if (crtl->stack_alignment_needed < 128)
2868 2324 : crtl->stack_alignment_needed = 128;
2869 104035 : if (crtl->stack_alignment_estimated < 128)
2870 221 : crtl->stack_alignment_estimated = 128;
2871 :
2872 104035 : crtl->stack_realign_needed
2873 104035 : = INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
2874 104035 : crtl->stack_realign_tried = crtl->stack_realign_needed;
2875 :
2876 104035 : crtl->stack_realign_processed = true;
2877 :
2878 104035 : if (!crtl->drap_reg)
2879 : {
2880 103858 : rtx drap_rtx = targetm.calls.get_drap_rtx ();
2881 :
2882 : /* stack_realign_drap and drap_rtx must match. */
2883 103858 : gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));
2884 :
2885 : /* Do nothing if NULL is returned,
2886 : which means DRAP is not needed. */
2887 103858 : if (drap_rtx != NULL)
2888 : {
2889 0 : crtl->args.internal_arg_pointer = drap_rtx;
2890 :
2891 : /* Call fixup_tail_calls to clean up
2892 : REG_EQUIV note if DRAP is needed. */
2893 0 : fixup_tail_calls ();
2894 : }
2895 : }
2896 :
2897 : /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2898 104035 : if (TARGET_64BIT)
2899 65538 : for (tree parm = DECL_ARGUMENTS (current_function_decl);
2900 179247 : parm; parm = DECL_CHAIN (parm))
2901 : {
2902 113709 : if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2903 98034 : continue;
2904 15675 : if (DECL_RTL_SET_P (parm)
2905 31350 : && GET_MODE (DECL_RTL (parm)) == V1TImode)
2906 : {
2907 611 : rtx r = DECL_RTL (parm);
2908 611 : if (REG_P (r))
2909 611 : SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2910 : }
2911 15675 : if (DECL_INCOMING_RTL (parm)
2912 15675 : && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2913 : {
2914 0 : rtx r = DECL_INCOMING_RTL (parm);
2915 0 : if (REG_P (r))
2916 0 : DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2917 : }
2918 : }
2919 :
2920 104035 : if (!control_flow_insns.is_empty ())
2921 : {
2922 1130 : free_dominance_info (CDI_DOMINATORS);
2923 :
2924 1130 : unsigned int i;
2925 1130 : rtx_insn* insn;
2926 5848 : FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
2927 3588 : if (control_flow_insn_p (insn))
2928 : {
2929 : /* Split the block after insn. There will be a fallthru
2930 : edge, which is OK so we keep it. We have to create
2931 : the exception edges ourselves. */
2932 3588 : bb = BLOCK_FOR_INSN (insn);
2933 3588 : split_block (bb, insn);
2934 3588 : rtl_make_eh_edge (NULL, bb, BB_END (bb));
2935 : }
2936 : }
2937 : }
2938 :
2939 1794454 : return 0;
2940 1794454 : }
2941 :
2942 : static unsigned int
2943 74570 : rest_of_handle_insert_vzeroupper (void)
2944 : {
2945 : /* vzeroupper instructions are inserted immediately after reload and
2946 : postreload_cse to clean up after it a little bit to account for possible
2947 : spills from 256bit or 512bit registers. The pass reuses mode switching
2948 : infrastructure by re-running mode insertion pass, so disable entities
2949 : that have already been processed. */
2950 521990 : for (int i = 0; i < MAX_386_ENTITIES; i++)
2951 447420 : ix86_optimize_mode_switching[i] = 0;
2952 :
2953 74570 : ix86_optimize_mode_switching[AVX_U128] = 1;
2954 :
2955 : /* Call optimize_mode_switching. */
2956 74570 : g->get_passes ()->execute_pass_mode_switching ();
2957 :
2958 : /* LRA removes all REG_DEAD/REG_UNUSED notes and normally they
2959 : reappear in the IL only at the start of pass_rtl_dse2, which does
2960 : df_note_add_problem (); df_analyze ();
2961 : The vzeroupper is scheduled after postreload_cse pass and mode
2962 : switching computes the notes as well, the problem is that e.g.
2963 : pass_gcse2 doesn't maintain the notes, see PR113059 and
2964 : PR112760. Remove the notes now to restore status quo ante
2965 : until we figure out how to maintain the notes or what else
2966 : to do. */
2967 74570 : basic_block bb;
2968 74570 : rtx_insn *insn;
2969 407172 : FOR_EACH_BB_FN (bb, cfun)
2970 4262225 : FOR_BB_INSNS (bb, insn)
2971 3929623 : if (NONDEBUG_INSN_P (insn))
2972 : {
2973 2090925 : rtx *pnote = ®_NOTES (insn);
2974 3881958 : while (*pnote != 0)
2975 : {
2976 1791033 : if (REG_NOTE_KIND (*pnote) == REG_DEAD
2977 818796 : || REG_NOTE_KIND (*pnote) == REG_UNUSED)
2978 1283144 : *pnote = XEXP (*pnote, 1);
2979 : else
2980 507889 : pnote = &XEXP (*pnote, 1);
2981 : }
2982 : }
2983 :
2984 74570 : df_remove_problem (df_note);
2985 74570 : df_analyze ();
2986 74570 : return 0;
2987 : }
2988 :
2989 : namespace {
2990 :
2991 : const pass_data pass_data_insert_vzeroupper =
2992 : {
2993 : RTL_PASS, /* type */
2994 : "vzeroupper", /* name */
2995 : OPTGROUP_NONE, /* optinfo_flags */
2996 : TV_MACH_DEP, /* tv_id */
2997 : 0, /* properties_required */
2998 : 0, /* properties_provided */
2999 : 0, /* properties_destroyed */
3000 : 0, /* todo_flags_start */
3001 : TODO_df_finish, /* todo_flags_finish */
3002 : };
3003 :
3004 : class pass_insert_vzeroupper : public rtl_opt_pass
3005 : {
3006 : public:
3007 288767 : pass_insert_vzeroupper(gcc::context *ctxt)
3008 577534 : : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
3009 : {}
3010 :
3011 : /* opt_pass methods: */
3012 1481491 : bool gate (function *) final override
3013 : {
3014 1481491 : return TARGET_AVX && TARGET_VZEROUPPER;
3015 : }
3016 :
3017 74570 : unsigned int execute (function *) final override
3018 : {
3019 74570 : return rest_of_handle_insert_vzeroupper ();
3020 : }
3021 :
3022 : }; // class pass_insert_vzeroupper
3023 :
3024 : const pass_data pass_data_stv =
3025 : {
3026 : RTL_PASS, /* type */
3027 : "stv", /* name */
3028 : OPTGROUP_NONE, /* optinfo_flags */
3029 : TV_MACH_DEP, /* tv_id */
3030 : 0, /* properties_required */
3031 : 0, /* properties_provided */
3032 : 0, /* properties_destroyed */
3033 : 0, /* todo_flags_start */
3034 : TODO_df_finish, /* todo_flags_finish */
3035 : };
3036 :
3037 : class pass_stv : public rtl_opt_pass
3038 : {
3039 : public:
3040 577534 : pass_stv (gcc::context *ctxt)
3041 577534 : : rtl_opt_pass (pass_data_stv, ctxt),
3042 1155068 : timode_p (false)
3043 : {}
3044 :
3045 : /* opt_pass methods: */
3046 2962982 : bool gate (function *) final override
3047 : {
3048 1481491 : return ((!timode_p || TARGET_64BIT)
3049 4317938 : && TARGET_STV && TARGET_SSE2 && optimize > 1);
3050 : }
3051 :
3052 1794454 : unsigned int execute (function *) final override
3053 : {
3054 1794454 : return convert_scalars_to_vector (timode_p);
3055 : }
3056 :
3057 288767 : opt_pass *clone () final override
3058 : {
3059 288767 : return new pass_stv (m_ctxt);
3060 : }
3061 :
3062 577534 : void set_pass_param (unsigned int n, bool param) final override
3063 : {
3064 577534 : gcc_assert (n == 0);
3065 577534 : timode_p = param;
3066 577534 : }
3067 :
3068 : private:
3069 : bool timode_p;
3070 : }; // class pass_stv
3071 :
3072 : } // anon namespace
3073 :
3074 : rtl_opt_pass *
3075 288767 : make_pass_insert_vzeroupper (gcc::context *ctxt)
3076 : {
3077 288767 : return new pass_insert_vzeroupper (ctxt);
3078 : }
3079 :
3080 : rtl_opt_pass *
3081 288767 : make_pass_stv (gcc::context *ctxt)
3082 : {
3083 288767 : return new pass_stv (ctxt);
3084 : }
3085 :
3086 : /* Inserting ENDBR and pseudo patchable-area instructions. */
3087 :
3088 : static void
3089 197428 : rest_of_insert_endbr_and_patchable_area (bool need_endbr,
3090 : unsigned int patchable_area_size)
3091 : {
3092 197428 : rtx endbr;
3093 197428 : rtx_insn *insn;
3094 197428 : rtx_insn *endbr_insn = NULL;
3095 197428 : basic_block bb;
3096 :
3097 197428 : if (need_endbr)
3098 : {
3099 : /* Currently emit EB if it's a tracking function, i.e. 'nocf_check'
3100 : is absent among function attributes. Later an optimization will
3101 : be introduced to make analysis if an address of a static function
3102 : is taken. A static function whose address is not taken will get
3103 : a nocf_check attribute. This will allow to reduce the number of
3104 : EB. */
3105 197383 : if (!lookup_attribute ("nocf_check",
3106 197383 : TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
3107 197365 : && (!flag_manual_endbr
3108 8 : || lookup_attribute ("cf_check",
3109 8 : DECL_ATTRIBUTES (cfun->decl)))
3110 394747 : && (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
3111 28556 : || ix86_cmodel == CM_LARGE
3112 28555 : || ix86_cmodel == CM_LARGE_PIC
3113 28554 : || flag_force_indirect_call
3114 28554 : || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
3115 : && DECL_DLLIMPORT_P (cfun->decl))))
3116 : {
3117 168811 : if (crtl->profile && flag_fentry)
3118 : {
3119 : /* Queue ENDBR insertion to x86_function_profiler.
3120 : NB: Any patchable-area insn will be inserted after
3121 : ENDBR. */
3122 6 : cfun->machine->insn_queued_at_entrance = TYPE_ENDBR;
3123 : }
3124 : else
3125 : {
3126 168805 : endbr = gen_nop_endbr ();
3127 168805 : bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
3128 168805 : rtx_insn *insn = BB_HEAD (bb);
3129 168805 : endbr_insn = emit_insn_before (endbr, insn);
3130 : }
3131 : }
3132 : }
3133 :
3134 197428 : if (patchable_area_size)
3135 : {
3136 51 : if (crtl->profile && flag_fentry)
3137 : {
3138 : /* Queue patchable-area insertion to x86_function_profiler.
3139 : NB: If there is a queued ENDBR, x86_function_profiler
3140 : will also handle patchable-area. */
3141 2 : if (!cfun->machine->insn_queued_at_entrance)
3142 1 : cfun->machine->insn_queued_at_entrance = TYPE_PATCHABLE_AREA;
3143 : }
3144 : else
3145 : {
3146 49 : rtx patchable_area
3147 49 : = gen_patchable_area (GEN_INT (patchable_area_size),
3148 49 : GEN_INT (crtl->patch_area_entry == 0));
3149 49 : if (endbr_insn)
3150 3 : emit_insn_after (patchable_area, endbr_insn);
3151 : else
3152 : {
3153 46 : bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
3154 46 : insn = BB_HEAD (bb);
3155 46 : emit_insn_before (patchable_area, insn);
3156 : }
3157 : }
3158 : }
3159 :
3160 197428 : if (!need_endbr)
3161 : return;
3162 :
3163 197383 : bb = 0;
3164 4017437 : FOR_EACH_BB_FN (bb, cfun)
3165 : {
3166 74049696 : for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
3167 70229642 : insn = NEXT_INSN (insn))
3168 : {
3169 70229642 : if (CALL_P (insn))
3170 : {
3171 1388266 : need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
3172 1388266 : if (!need_endbr && !SIBLING_CALL_P (insn))
3173 : {
3174 1338198 : rtx call = get_call_rtx_from (insn);
3175 1338198 : rtx fnaddr = XEXP (call, 0);
3176 1338198 : tree fndecl = NULL_TREE;
3177 :
3178 : /* Also generate ENDBRANCH for non-tail call which
3179 : may return via indirect branch. */
3180 1338198 : if (SYMBOL_REF_P (XEXP (fnaddr, 0)))
3181 1280518 : fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
3182 1280518 : if (fndecl == NULL_TREE)
3183 58048 : fndecl = MEM_EXPR (fnaddr);
3184 58048 : if (fndecl
3185 1335940 : && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
3186 566483 : && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
3187 : fndecl = NULL_TREE;
3188 1338198 : if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
3189 : {
3190 1297565 : tree fntype = TREE_TYPE (fndecl);
3191 1297565 : if (lookup_attribute ("indirect_return",
3192 1297565 : TYPE_ATTRIBUTES (fntype)))
3193 : need_endbr = true;
3194 : }
3195 : }
3196 1388254 : if (!need_endbr)
3197 1388246 : continue;
3198 : /* Generate ENDBRANCH after CALL, which can return more than
3199 : twice, setjmp-like functions. */
3200 :
3201 20 : endbr = gen_nop_endbr ();
3202 20 : emit_insn_after_setloc (endbr, insn, INSN_LOCATION (insn));
3203 20 : continue;
3204 20 : }
3205 :
3206 68841376 : if (JUMP_P (insn) && flag_cet_switch)
3207 : {
3208 9 : rtx target = JUMP_LABEL (insn);
3209 9 : if (target == NULL_RTX || ANY_RETURN_P (target))
3210 5 : continue;
3211 :
3212 : /* Check the jump is a switch table. */
3213 4 : rtx_insn *label = as_a<rtx_insn *> (target);
3214 4 : rtx_insn *table = next_insn (label);
3215 4 : if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
3216 2 : continue;
3217 :
3218 : /* For the indirect jump find out all places it jumps and insert
3219 : ENDBRANCH there. It should be done under a special flag to
3220 : control ENDBRANCH generation for switch stmts. */
3221 2 : edge_iterator ei;
3222 2 : edge e;
3223 2 : basic_block dest_blk;
3224 :
3225 24 : FOR_EACH_EDGE (e, ei, bb->succs)
3226 : {
3227 22 : rtx_insn *insn;
3228 :
3229 22 : dest_blk = e->dest;
3230 22 : insn = BB_HEAD (dest_blk);
3231 22 : gcc_assert (LABEL_P (insn));
3232 22 : endbr = gen_nop_endbr ();
3233 22 : emit_insn_after (endbr, insn);
3234 : }
3235 2 : continue;
3236 2 : }
3237 :
3238 68841367 : if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
3239 : {
3240 142093 : endbr = gen_nop_endbr ();
3241 142093 : emit_insn_after (endbr, insn);
3242 142093 : continue;
3243 : }
3244 : }
3245 : }
3246 :
3247 : return;
3248 : }
3249 :
3250 : namespace {
3251 :
3252 : const pass_data pass_data_insert_endbr_and_patchable_area =
3253 : {
3254 : RTL_PASS, /* type. */
3255 : "endbr_and_patchable_area", /* name. */
3256 : OPTGROUP_NONE, /* optinfo_flags. */
3257 : TV_MACH_DEP, /* tv_id. */
3258 : 0, /* properties_required. */
3259 : 0, /* properties_provided. */
3260 : 0, /* properties_destroyed. */
3261 : 0, /* todo_flags_start. */
3262 : 0, /* todo_flags_finish. */
3263 : };
3264 :
3265 : class pass_insert_endbr_and_patchable_area : public rtl_opt_pass
3266 : {
3267 : public:
3268 288767 : pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
3269 577534 : : rtl_opt_pass (pass_data_insert_endbr_and_patchable_area, ctxt)
3270 : {}
3271 :
3272 : /* opt_pass methods: */
3273 1481491 : bool gate (function *) final override
3274 : {
3275 1481491 : need_endbr = (flag_cf_protection & CF_BRANCH) != 0;
3276 1481491 : patchable_area_size = crtl->patch_area_size - crtl->patch_area_entry;
3277 1481491 : return need_endbr || patchable_area_size;
3278 : }
3279 :
3280 197428 : unsigned int execute (function *) final override
3281 : {
3282 197428 : timevar_push (TV_MACH_DEP);
3283 197428 : rest_of_insert_endbr_and_patchable_area (need_endbr,
3284 : patchable_area_size);
3285 197428 : timevar_pop (TV_MACH_DEP);
3286 197428 : return 0;
3287 : }
3288 :
3289 : private:
3290 : bool need_endbr;
3291 : unsigned int patchable_area_size;
3292 : }; // class pass_insert_endbr_and_patchable_area
3293 :
3294 : } // anon namespace
3295 :
3296 : rtl_opt_pass *
3297 288767 : make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
3298 : {
3299 288767 : return new pass_insert_endbr_and_patchable_area (ctxt);
3300 : }
3301 :
3302 : bool
3303 6061569 : ix86_rpad_gate ()
3304 : {
3305 6061569 : return (TARGET_AVX
3306 385907 : && TARGET_SSE_PARTIAL_REG_DEPENDENCY
3307 291012 : && TARGET_SSE_MATH
3308 290698 : && optimize
3309 6347041 : && optimize_function_for_speed_p (cfun));
3310 : }
3311 :
/* Kind of redundant load tracked in redundant_pattern.  */

enum x86_cse_kind
{
  /* All-zeros vector constant.  */
  X86_CSE_CONST0_VECTOR,
  /* All-ones vector constant.  */
  X86_CSE_CONSTM1_VECTOR,
  /* Other CONST_VECTOR load.  */
  X86_CSE_CONST_VECTOR,
  /* Broadcast (vec_duplicate) of an inner scalar.  */
  X86_CSE_VEC_DUP,
  /* TLS access sequences.  NOTE(review): the three kinds below are
     described from their names (general dynamic, local dynamic base,
     TLS descriptor); confirm against their users.  */
  X86_CSE_TLS_GD,
  X86_CSE_TLS_LD_BASE,
  X86_CSE_TLSDESC
};
3322 :
3323 154859 : struct redundant_pattern
3324 : {
3325 : /* Bitmap of basic blocks with broadcast instructions. */
3326 : auto_bitmap bbs;
3327 : /* Bitmap of broadcast instructions. */
3328 : auto_bitmap insns;
3329 : /* The broadcast inner scalar. */
3330 : rtx val;
3331 : /* The actual redundant source value for UNSPEC_TLSDESC. */
3332 : rtx tlsdesc_val;
3333 : /* The inner scalar mode. */
3334 : machine_mode mode;
3335 : /* The destination mode which can be changed to the integer mode of
3336 : the same time. */
3337 : machine_mode dest_mode;
3338 : /* The instruction which sets the inner scalar. Nullptr if the inner
3339 : scalar is applied to the whole function, instead of within the same
3340 : block. */
3341 : rtx_insn *def_insn;
3342 : /* The widest broadcast source. */
3343 : rtx broadcast_source;
3344 : /* The widest broadcast register. */
3345 : rtx broadcast_reg;
3346 : /* The basic block of the broadcast instruction. */
3347 : basic_block bb;
3348 : /* The number of broadcast instructions with the same inner scalar. */
3349 : unsigned HOST_WIDE_INT count;
3350 : /* The threshold of broadcast instructions with the same inner
3351 : scalar. */
3352 : unsigned int threshold;
3353 : /* The widest broadcast size in bytes. */
3354 : unsigned int size;
3355 : /* Load kind. */
3356 : x86_cse_kind kind;
3357 : };
3358 :
3359 : /* Generate a vector set, DEST = SRC, at entry of the nearest dominator
3360 : for basic block map BBS, which is in the fake loop that contains the
3361 : whole function, so that there is only a single vector set in the
3362 : whole function. If not nullptr, LOAD is a pointer to the load. */
3363 :
3364 : static void
3365 43324 : ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
3366 : redundant_pattern *load = nullptr)
3367 : {
3368 43324 : basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
3369 : /* For X86_CSE_VEC_DUP and X86_CSE_CONST_VECTOR, don't place the vector
3370 : set outside of the loop to avoid extra spills. */
3371 43324 : if (!load
3372 42302 : || (load->kind != X86_CSE_VEC_DUP
3373 42302 : && load->kind != X86_CSE_CONST_VECTOR))
3374 : {
3375 23860 : while (bb->loop_father->latch
3376 23860 : != EXIT_BLOCK_PTR_FOR_FN (cfun))
3377 1361 : bb = get_immediate_dominator (CDI_DOMINATORS,
3378 : bb->loop_father->header);
3379 : }
3380 :
3381 43324 : if (CONST_INT_P (src))
3382 10493 : dest = gen_rtx_SUBREG (load->dest_mode, dest, 0);
3383 32831 : else if (CONST_VECTOR_P (src))
3384 : {
3385 : /* The only possible CONST_VECTORs of SRC are CONST0_RTX and
3386 : CONSTM1_RTX. Otherwise,
3387 :
3388 : rtx set = gen_rtx_SET (dest, src);
3389 :
3390 : won't be a valid instruction. CONST0_RTX always works. It
3391 : can comes from:
3392 :
3393 : 1. remove_partial_avx_dependency with LOAD == NULL.
3394 : 2. X86_CSE_VEC_DUP with
3395 :
3396 : (insn 48 58 16 3 (set (reg:V4HI 123)
3397 : (const_vector:V4HI [
3398 : (const_int 0 [0]) repeated x4
3399 : ])) 2065 {*movv4hi_internal} (nil))
3400 :
3401 : 3. X86_CSE_CONST0_VECTOR.
3402 : */
3403 22499 : machine_mode mode = GET_MODE (dest);
3404 22499 : if (!(src == CONST0_RTX (mode)
3405 1578 : || (src == CONSTM1_RTX (mode)
3406 1578 : && load->kind == X86_CSE_CONSTM1_VECTOR)))
3407 0 : gcc_unreachable ();
3408 : }
3409 43324 : rtx set = gen_rtx_SET (dest, src);
3410 :
3411 43324 : rtx_insn *insn = BB_HEAD (bb);
3412 170033 : while (insn && !NONDEBUG_INSN_P (insn))
3413 : {
3414 126713 : if (insn == BB_END (bb))
3415 : {
3416 : insn = NULL;
3417 : break;
3418 : }
3419 126709 : insn = NEXT_INSN (insn);
3420 : }
3421 :
3422 43324 : rtx_insn *set_insn;
3423 43324 : if (insn == BB_HEAD (bb))
3424 : {
3425 0 : set_insn = emit_insn_before (set, insn);
3426 0 : if (dump_file)
3427 : {
3428 0 : fprintf (dump_file, "\nPlace:\n\n");
3429 0 : print_rtl_single (dump_file, set_insn);
3430 0 : fprintf (dump_file, "\nbefore:\n\n");
3431 0 : print_rtl_single (dump_file, insn);
3432 0 : fprintf (dump_file, "\n");
3433 : }
3434 : }
3435 : else
3436 : {
3437 43324 : rtx_insn *after = insn ? PREV_INSN (insn) : BB_END (bb);
3438 43324 : set_insn = emit_insn_after (set, after);
3439 43324 : if (dump_file)
3440 : {
3441 2 : fprintf (dump_file, "\nPlace:\n\n");
3442 2 : print_rtl_single (dump_file, set_insn);
3443 2 : fprintf (dump_file, "\nafter:\n\n");
3444 2 : print_rtl_single (dump_file, after);
3445 2 : fprintf (dump_file, "\n");
3446 : }
3447 : }
3448 :
3449 43324 : if (load && load->kind == X86_CSE_VEC_DUP)
3450 : {
3451 : /* Get the source from LOAD as (reg:SI 99) in
3452 :
3453 : (vec_duplicate:V4SI (reg:SI 99))
3454 :
3455 : */
3456 10332 : rtx inner_scalar = load->val;
3457 : /* Set the source in (vec_duplicate:V4SI (reg:SI 99)). */
3458 10332 : rtx reg = XEXP (src, 0);
3459 10332 : machine_mode reg_mode = GET_MODE (reg);
3460 10332 : if (reg_mode != GET_MODE (inner_scalar))
3461 : {
3462 10051 : if (REG_P (inner_scalar) || MEM_P (inner_scalar))
3463 0 : inner_scalar = gen_rtx_SUBREG (reg_mode, inner_scalar, 0);
3464 10051 : else if (!SCALAR_INT_MODE_P (reg_mode))
3465 : {
3466 : /* For non-int load with integer constant, generate
3467 :
3468 : (set (subreg:SI (reg/v:SF 105 [ f ]) 0)
3469 : (const_int 1313486336 [0x4e4a3600]))
3470 :
3471 : */
3472 1 : gcc_assert (CONST_INT_P (inner_scalar));
3473 1 : unsigned int bits = GET_MODE_BITSIZE (reg_mode);
3474 1 : machine_mode mode = int_mode_for_size (bits, 0).require ();
3475 1 : reg = gen_rtx_SUBREG (mode, reg, 0);
3476 : }
3477 : }
3478 10332 : rtx set = gen_rtx_SET (reg, inner_scalar);
3479 10332 : insn = emit_insn_before (set, set_insn);
3480 10332 : if (dump_file)
3481 : {
3482 0 : fprintf (dump_file, "\nAdd:\n\n");
3483 0 : print_rtl_single (dump_file, insn);
3484 0 : fprintf (dump_file, "\nbefore:\n\n");
3485 0 : print_rtl_single (dump_file, set_insn);
3486 0 : fprintf (dump_file, "\n");
3487 : }
3488 : }
3489 43324 : }
3490 :
3491 : /* At entry of the nearest common dominator for basic blocks with
3492 : conversions/rcp/sqrt/rsqrt/round, generate a single
3493 : vxorps %xmmN, %xmmN, %xmmN
3494 : for all
3495 : vcvtss2sd op, %xmmN, %xmmX
3496 : vcvtsd2ss op, %xmmN, %xmmX
3497 : vcvtsi2ss op, %xmmN, %xmmX
3498 : vcvtsi2sd op, %xmmN, %xmmX
3499 :
3500 : NB: We want to generate only a single vxorps to cover the whole
3501 : function. The LCM algorithm isn't appropriate here since it may
3502 : place a vxorps inside the loop. */
3503 :
3504 : static unsigned int
3505 33407 : remove_partial_avx_dependency (void)
3506 : {
3507 33407 : timevar_push (TV_MACH_DEP);
3508 :
3509 33407 : bitmap_obstack_initialize (NULL);
3510 33407 : bitmap convert_bbs = BITMAP_ALLOC (NULL);
3511 :
3512 33407 : basic_block bb;
3513 33407 : rtx_insn *insn, *set_insn;
3514 33407 : rtx set;
3515 33407 : rtx v4sf_const0 = NULL_RTX;
3516 :
3517 33407 : auto_vec<rtx_insn *> control_flow_insns;
3518 :
3519 : /* We create invalid RTL initially so defer rescans. */
3520 33407 : df_set_flags (DF_DEFER_INSN_RESCAN);
3521 :
3522 311853 : FOR_EACH_BB_FN (bb, cfun)
3523 : {
3524 3493404 : FOR_BB_INSNS (bb, insn)
3525 : {
3526 3214958 : if (!NONDEBUG_INSN_P (insn))
3527 1437683 : continue;
3528 :
3529 1777275 : set = single_set (insn);
3530 1777275 : if (!set)
3531 70966 : continue;
3532 :
3533 1706309 : if (get_attr_avx_partial_xmm_update (insn)
3534 : != AVX_PARTIAL_XMM_UPDATE_TRUE)
3535 1703130 : continue;
3536 :
3537 : /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
3538 : SI -> SF, SI -> DF, DI -> SF, DI -> DF, sqrt, rsqrt, rcp,
3539 : round, to vec_dup and vec_merge with subreg. */
3540 3179 : rtx src = SET_SRC (set);
3541 3179 : rtx dest = SET_DEST (set);
3542 3179 : machine_mode dest_mode = GET_MODE (dest);
3543 3179 : bool convert_p = false;
3544 3179 : switch (GET_CODE (src))
3545 : {
3546 3114 : case FLOAT:
3547 3114 : case FLOAT_EXTEND:
3548 3114 : case FLOAT_TRUNCATE:
3549 3114 : case UNSIGNED_FLOAT:
3550 3114 : convert_p = true;
3551 3114 : break;
3552 : default:
3553 : break;
3554 : }
3555 :
3556 : /* Only handle conversion here. */
3557 3114 : machine_mode src_mode
3558 3114 : = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
3559 3114 : switch (src_mode)
3560 : {
3561 153 : case E_SFmode:
3562 153 : case E_DFmode:
3563 153 : if (TARGET_USE_VECTOR_FP_CONVERTS
3564 147 : || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
3565 8 : continue;
3566 : break;
3567 2961 : case E_SImode:
3568 2961 : case E_DImode:
3569 2961 : if (TARGET_USE_VECTOR_CONVERTS
3570 2949 : || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
3571 14 : continue;
3572 : break;
3573 65 : case E_VOIDmode:
3574 65 : gcc_assert (!convert_p);
3575 : break;
3576 0 : default:
3577 0 : gcc_unreachable ();
3578 : }
3579 :
3580 3157 : if (!v4sf_const0)
3581 1022 : v4sf_const0 = gen_reg_rtx (V4SFmode);
3582 :
3583 3157 : rtx zero;
3584 3157 : machine_mode dest_vecmode;
3585 3157 : switch (dest_mode)
3586 : {
3587 50 : case E_HFmode:
3588 50 : dest_vecmode = V8HFmode;
3589 50 : zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0);
3590 50 : break;
3591 : case E_SFmode:
3592 : dest_vecmode = V4SFmode;
3593 : zero = v4sf_const0;
3594 : break;
3595 1167 : case E_DFmode:
3596 1167 : dest_vecmode = V2DFmode;
3597 1167 : zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
3598 1167 : break;
3599 0 : default:
3600 0 : gcc_unreachable ();
3601 : }
3602 :
3603 : /* Change source to vector mode. */
3604 3157 : src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
3605 3157 : src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
3606 : GEN_INT (HOST_WIDE_INT_1U));
3607 : /* Change destination to vector mode. */
3608 3157 : rtx vec = gen_reg_rtx (dest_vecmode);
3609 : /* Generate an XMM vector SET. */
3610 3157 : set = gen_rtx_SET (vec, src);
3611 3157 : set_insn = emit_insn_before (set, insn);
3612 :
3613 3157 : if (cfun->can_throw_non_call_exceptions)
3614 : {
3615 : /* Handle REG_EH_REGION note. */
3616 0 : rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
3617 0 : if (note)
3618 : {
3619 0 : control_flow_insns.safe_push (set_insn);
3620 0 : add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
3621 : }
3622 : }
3623 :
3624 3157 : src = gen_rtx_SUBREG (dest_mode, vec, 0);
3625 3157 : set = gen_rtx_SET (dest, src);
3626 :
3627 : /* Drop possible dead definitions. */
3628 3157 : PATTERN (insn) = set;
3629 :
3630 3157 : INSN_CODE (insn) = -1;
3631 3157 : recog_memoized (insn);
3632 3157 : df_insn_rescan (insn);
3633 3157 : bitmap_set_bit (convert_bbs, bb->index);
3634 : }
3635 : }
3636 :
3637 33407 : if (v4sf_const0)
3638 : {
3639 : /* (Re-)discover loops so that bb->loop_father can be used in the
3640 : analysis below. */
3641 1022 : calculate_dominance_info (CDI_DOMINATORS);
3642 1022 : loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
3643 :
3644 1022 : ix86_place_single_vector_set (v4sf_const0,
3645 : CONST0_RTX (V4SFmode),
3646 : convert_bbs);
3647 :
3648 1022 : loop_optimizer_finalize ();
3649 :
3650 1022 : if (!control_flow_insns.is_empty ())
3651 : {
3652 0 : free_dominance_info (CDI_DOMINATORS);
3653 :
3654 0 : unsigned int i;
3655 0 : FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
3656 0 : if (control_flow_insn_p (insn))
3657 : {
3658 : /* Split the block after insn. There will be a fallthru
3659 : edge, which is OK so we keep it. We have to create
3660 : the exception edges ourselves. */
3661 0 : bb = BLOCK_FOR_INSN (insn);
3662 0 : split_block (bb, insn);
3663 0 : rtl_make_eh_edge (NULL, bb, BB_END (bb));
3664 : }
3665 : }
3666 : }
3667 :
3668 33407 : df_process_deferred_rescans ();
3669 33407 : df_clear_flags (DF_DEFER_INSN_RESCAN);
3670 33407 : bitmap_obstack_release (NULL);
3671 33407 : BITMAP_FREE (convert_bbs);
3672 :
3673 33407 : timevar_pop (TV_MACH_DEP);
3674 33407 : return 0;
3675 33407 : }
3676 :
3677 : namespace {
3678 :
3679 : const pass_data pass_data_remove_partial_avx_dependency =
3680 : {
3681 : RTL_PASS, /* type */
3682 : "rpad", /* name */
3683 : OPTGROUP_NONE, /* optinfo_flags */
3684 : TV_MACH_DEP, /* tv_id */
3685 : 0, /* properties_required */
3686 : 0, /* properties_provided */
3687 : 0, /* properties_destroyed */
3688 : 0, /* todo_flags_start */
3689 : 0, /* todo_flags_finish */
3690 : };
3691 :
3692 : class pass_remove_partial_avx_dependency : public rtl_opt_pass
3693 : {
3694 : public:
3695 288767 : pass_remove_partial_avx_dependency (gcc::context *ctxt)
3696 577534 : : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
3697 : {}
3698 :
3699 : /* opt_pass methods: */
3700 1481491 : bool gate (function *) final override
3701 : {
3702 1481491 : return ix86_rpad_gate ();
3703 : }
3704 :
3705 33407 : unsigned int execute (function *) final override
3706 : {
3707 33407 : return remove_partial_avx_dependency ();
3708 : }
3709 : }; // class pass_rpad
3710 :
3711 : } // anon namespace
3712 :
3713 : rtl_opt_pass *
3714 288767 : make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
3715 : {
3716 288767 : return new pass_remove_partial_avx_dependency (ctxt);
3717 : }
3718 :
3719 : /* Return a machine mode suitable for vector SIZE with SMODE inner
3720 : mode. */
3721 :
3722 : static machine_mode
3723 64127 : ix86_get_vector_cse_mode (unsigned int size, machine_mode smode)
3724 : {
3725 : /* Use the inner scalar mode of vector broadcast source in:
3726 :
3727 : (set (reg:V8DF 394)
3728 : (vec_duplicate:V8DF (reg:V2DF 190 [ alpha ])))
3729 :
3730 : to compute the vector mode for broadcast from vector source.
3731 : */
3732 64127 : if (VECTOR_MODE_P (smode))
3733 31026 : smode = GET_MODE_INNER (smode);
3734 64127 : scalar_mode s_mode = as_a <scalar_mode> (smode);
3735 128254 : poly_uint64 nunits = size / GET_MODE_SIZE (smode);
3736 64127 : machine_mode mode = mode_for_vector (s_mode, nunits).require ();
3737 64127 : return mode;
3738 : }
3739 :
3740 : /* Replace the source operand of instructions in VECTOR_INSNS with
3741 : VECTOR_CONST in VECTOR_MODE. */
3742 :
3743 : static void
3744 63656 : replace_vector_const (machine_mode vector_mode, rtx vector_const,
3745 : auto_bitmap &vector_insns,
3746 : machine_mode scalar_mode)
3747 : {
3748 63656 : bitmap_iterator bi;
3749 63656 : unsigned int id;
3750 :
3751 222461 : EXECUTE_IF_SET_IN_BITMAP (vector_insns, 0, id, bi)
3752 : {
3753 158805 : rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
3754 :
3755 : /* Get the single SET instruction. */
3756 158805 : rtx set = single_set (insn);
3757 158805 : rtx src = SET_SRC (set);
3758 158805 : rtx dest = SET_DEST (set);
3759 158805 : machine_mode mode = GET_MODE (dest);
3760 :
3761 158805 : rtx replace;
3762 : /* Replace the source operand with VECTOR_CONST. */
3763 158805 : if (SUBREG_P (src)
3764 158805 : || mode == vector_mode
3765 60165 : || CONST_INT_P (vector_const))
3766 : replace = vector_const;
3767 : else
3768 : {
3769 60165 : unsigned int size = GET_MODE_SIZE (mode);
3770 60165 : if (size < ix86_regmode_natural_size (mode))
3771 : {
3772 : /* If the mode size is smaller than its natural size,
3773 : first insert an extra move with a QI vector SUBREG
3774 : of the same size to avoid validate_subreg failure. */
3775 471 : machine_mode vmode
3776 471 : = ix86_get_vector_cse_mode (size, scalar_mode);
3777 471 : rtx vreg;
3778 471 : if (mode == vmode)
3779 : vreg = vector_const;
3780 : else
3781 : {
3782 59 : vreg = gen_reg_rtx (vmode);
3783 59 : rtx vsubreg = gen_rtx_SUBREG (vmode, vector_const, 0);
3784 59 : rtx pat = gen_rtx_SET (vreg, vsubreg);
3785 59 : rtx_insn *vinsn = emit_insn_before (pat, insn);
3786 59 : if (dump_file)
3787 : {
3788 0 : fprintf (dump_file, "\nInsert an extra move:\n\n");
3789 0 : print_rtl_single (dump_file, vinsn);
3790 0 : fprintf (dump_file, "\nbefore:\n\n");
3791 0 : print_rtl_single (dump_file, insn);
3792 0 : fprintf (dump_file, "\n");
3793 : }
3794 : }
3795 471 : replace = gen_rtx_SUBREG (mode, vreg, 0);
3796 : }
3797 : else
3798 59694 : replace = gen_rtx_SUBREG (mode, vector_const, 0);
3799 : }
3800 :
3801 158805 : if (dump_file)
3802 : {
3803 3 : fprintf (dump_file, "\nReplace:\n\n");
3804 3 : print_rtl_single (dump_file, insn);
3805 : }
3806 158805 : SET_SRC (set) = replace;
3807 158805 : if (CONST_INT_P (replace))
3808 : {
3809 23527 : dest = gen_rtx_SUBREG (scalar_mode, dest, 0);
3810 23527 : SET_DEST (set) = dest;
3811 : }
3812 : /* Drop possible dead definitions. */
3813 158805 : PATTERN (insn) = set;
3814 158805 : INSN_CODE (insn) = -1;
3815 158805 : recog_memoized (insn);
3816 158805 : if (dump_file)
3817 : {
3818 3 : fprintf (dump_file, "\nwith:\n\n");
3819 3 : print_rtl_single (dump_file, insn);
3820 3 : fprintf (dump_file, "\n");
3821 : }
3822 158805 : df_insn_rescan (insn);
3823 : }
3824 63656 : }
3825 :
3826 : /* Return the inner scalar if OP is a broadcast, else return nullptr. */
3827 :
3828 : static rtx
3829 2202109 : ix86_broadcast_inner (rtx op, machine_mode mode,
3830 : machine_mode *scalar_mode_p,
3831 : x86_cse_kind *kind_p, rtx_insn **insn_p)
3832 : {
3833 2202109 : switch (standard_sse_constant_p (op, mode))
3834 : {
3835 114524 : case 1:
3836 114524 : *scalar_mode_p = QImode;
3837 114524 : *kind_p = X86_CSE_CONST0_VECTOR;
3838 114524 : *insn_p = nullptr;
3839 114524 : return const0_rtx;
3840 12130 : case 2:
3841 12130 : *scalar_mode_p = QImode;
3842 12130 : *kind_p = X86_CSE_CONSTM1_VECTOR;
3843 12130 : *insn_p = nullptr;
3844 12130 : return constm1_rtx;
3845 2075455 : default:
3846 2075455 : break;
3847 : }
3848 :
3849 2075455 : mode = GET_MODE (op);
3850 2075455 : int nunits = GET_MODE_NUNITS (mode);
3851 2075455 : if (nunits < 2)
3852 : return nullptr;
3853 :
3854 1600386 : bool const_vector_p = CONST_VECTOR_P (op);
3855 1600386 : bool duplicated = GET_CODE (op) == VEC_DUPLICATE;
3856 1600386 : rtx orig_op = op;
3857 1600386 : if (!const_vector_p)
3858 : {
3859 : /* Check CONST_VECTOR in REG_EQUAL note. */
3860 1600366 : rtx equal = find_reg_equal_equiv_note (*insn_p);
3861 1600366 : if (equal)
3862 : {
3863 371383 : equal = XEXP (equal, 0);
3864 371383 : const_vector_p = CONST_VECTOR_P (equal);
3865 : /* Use CONST_VECTOR in REG_EQUAL note. */
3866 371383 : if (const_vector_p)
3867 : {
3868 : /* Handle REG_EQUAL note in:
3869 :
3870 : (insn 7 5 12 2 (set (subreg:V8SI (reg:V4DI 100) 0)
3871 : (vec_duplicate:V8SI (reg:SI 102)))
3872 : (expr_list:REG_DEAD (reg:SI 102)
3873 : (expr_list:REG_EQUAL (const_vector:V4DI [
3874 : (const_int -1 [0xffffffffffffffff]) repeated x4]) (nil))))
3875 :
3876 : NB: Don't treat it as CONST_VECTOR since EQUAL isn't
3877 : supported by ISAs as in gcc.target/i386/pr40957.c. */
3878 260409 : if (GET_MODE (equal) != mode)
3879 : const_vector_p = false;
3880 : else
3881 1600386 : op = equal;
3882 : }
3883 : }
3884 : }
3885 :
3886 1600386 : machine_mode inner_mode = GET_MODE_INNER (mode);
3887 :
3888 1600386 : if (const_vector_p)
3889 : {
3890 520790 : bool int_load_p = GET_MODE_SIZE (mode) <= UNITS_PER_WORD;
3891 260395 : *kind_p = X86_CSE_CONST_VECTOR;
3892 260395 : if (int_load_p)
3893 : {
3894 : /* This CONST_VECTOR load can be converted to constant
3895 : integer load. */
3896 34643 : *scalar_mode_p = mode;
3897 34643 : *insn_p = nullptr;
3898 34643 : return op;
3899 : }
3900 :
3901 : /* This CONST_VECTOR is wider than the integer register. */
3902 225752 : rtx first = XVECEXP (op, 0, 0);
3903 :
3904 225752 : if (duplicated)
3905 : {
3906 : /* Check if CONST_VECTOR in REG_EQUAL note is duplicated in
3907 :
3908 : (insn 10 7 12 2 (set (reg:V8SI 128)
3909 : (vec_duplicate:V8SI (vec_select:V2SI (reg:V4SI 180)
3910 : (parallel [(const_int 0 [0])
3911 : (const_int 1 [0x1])]))))
3912 : (expr_list:REG_EQUAL (const_vector:V8SI [
3913 : (const_int 0 [0])
3914 : (const_int 34 [0x22])
3915 : (const_int 0 [0])
3916 : (const_int 34 [0x22])
3917 : (const_int 0 [0])
3918 : (const_int 34 [0x22])
3919 : (const_int 0 [0])
3920 : (const_int 34 [0x22])])(nil)))
3921 :
3922 : */
3923 :
3924 211066 : bool duplicated_const_vector = true;
3925 211066 : for (int i = 1; i < nunits; ++i)
3926 : {
3927 138153 : rtx tmp = XVECEXP (op, 0, i);
3928 138153 : if (!rtx_equal_p (tmp, first))
3929 : {
3930 : duplicated_const_vector = false;
3931 : break;
3932 : }
3933 : }
3934 :
3935 72929 : if (duplicated_const_vector)
3936 : {
3937 72913 : bool const_double_p = CONST_DOUBLE_P (first);
3938 : /* Force the floating point constant to memory. */
3939 72913 : if (const_double_p)
3940 5534 : first = validize_mem (force_const_mem (inner_mode, first));
3941 :
3942 72913 : if (const_double_p || CONST_INT_P (first))
3943 : {
3944 : /* Handle
3945 :
3946 : (insn 7 6 8 2 (set (reg:V4SF 99)
3947 : (vec_duplicate:V4SF (mem/u/c:SF (symbol_ref/u:DI ("*.LC2") [flags 0x2]) [0 S4 A32])))
3948 : (expr_list:REG_EQUAL (const_vector:V4SF [
3949 : (const_double:SF 3.4e+1 [0x0.88p+6]) repeated x4]) (nil)))
3950 :
3951 : and
3952 :
3953 : (insn 14 15 16 3 (set (reg:V4SI 116)
3954 : (vec_duplicate:V4SI (reg:SI 117)))
3955 : (expr_list:REG_EQUAL (const_vector:V4SI [
3956 : (const_int 34 [0x22]) repeated x4]) (nil)))
3957 :
3958 : */
3959 72913 : *kind_p = X86_CSE_VEC_DUP;
3960 72913 : *insn_p = nullptr;
3961 72913 : *scalar_mode_p = inner_mode;
3962 72913 : return first;
3963 : }
3964 : }
3965 :
3966 : op = orig_op;
3967 : }
3968 : else
3969 : {
3970 : /* Only native CONST_VECTOR is allowed. */
3971 152823 : if (orig_op != op)
3972 : return nullptr;
3973 :
3974 : /* Check if VEC_DUPLICATE can be used. */
3975 48 : for (int i = 1; i < nunits; ++i)
3976 : {
3977 48 : rtx tmp = XVECEXP (op, 0, i);
3978 : /* Vector duplicate value. */
3979 48 : if (!rtx_equal_p (tmp, first))
3980 : return nullptr;
3981 : }
3982 :
3983 : /* Use the inner mode to handle
3984 : (const_vector:V2QI [(const_int 0 [0]) repeated x2])
3985 : */
3986 0 : *scalar_mode_p = inner_mode;
3987 0 : *insn_p = nullptr;
3988 0 : return first;
3989 : }
3990 : }
3991 :
3992 1340007 : if (!duplicated)
3993 : return nullptr;
3994 :
3995 22642 : *kind_p = X86_CSE_VEC_DUP;
3996 :
3997 : /* Only
3998 :
3999 : (vec_duplicate:V4SI (reg:SI 99))
4000 : (vec_duplicate:V2DF (mem/u/c:DF (symbol_ref/u:DI ("*.LC1") [flags 0x2]) [0 S8 A64]))
4001 :
4002 : are supported. Set OP to the broadcast source by default. */
4003 22642 : op = XEXP (op, 0);
4004 22642 : rtx reg = op;
4005 22642 : if (SUBREG_P (op)
4006 401 : && SUBREG_BYTE (op) == 0
4007 23043 : && !paradoxical_subreg_p (op))
4008 401 : reg = SUBREG_REG (op);
4009 22642 : if (!REG_P (reg))
4010 : {
4011 2301 : if (MEM_P (op)
4012 2045 : && SYMBOL_REF_P (XEXP (op, 0))
4013 2538 : && CONSTANT_POOL_ADDRESS_P (XEXP (op, 0)))
4014 : {
4015 : /* Handle constant broadcast from memory. */
4016 11 : *scalar_mode_p = inner_mode;
4017 11 : *insn_p = nullptr;
4018 11 : return op;
4019 : }
4020 : return nullptr;
4021 : }
4022 :
4023 20341 : machine_mode orig_mode = mode;
4024 20341 : mode = GET_MODE (op);
4025 :
4026 : /* Only single def chain is supported. */
4027 20341 : df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
4028 20341 : if (!ref
4029 20340 : || DF_REF_IS_ARTIFICIAL (ref)
4030 20340 : || DF_REF_NEXT_REG (ref) != nullptr)
4031 : return nullptr;
4032 :
4033 14863 : rtx_insn *insn = DF_REF_INSN (ref);
4034 14863 : rtx set = single_set (insn);
4035 14863 : if (!set)
4036 : return nullptr;
4037 :
4038 14824 : rtx src = SET_SRC (set);
4039 :
4040 14824 : if (CONST_INT_P (src))
4041 : {
4042 : /* Handle sequences like
4043 :
4044 : (set (subreg:SI (reg/v:SF 105 [ f ]) 0)
4045 : (const_int 0 [0]))
4046 : (set (reg:V4SF 110)
4047 : (vec_duplicate:V4SF (reg/v:SF 105 [ f ])))
4048 :
4049 : and
4050 :
4051 : (set (reg:SI 99)
4052 : (const_int 34 [0x22]))
4053 : (set (reg:V4SI 98)
4054 : (vec_duplicate:V4SI (reg:SI 99)))
4055 :
4056 : Set *INSN_P to nullptr and return SET_SRC if SET_SRC is an
4057 : integer constant. */
4058 234 : op = src;
4059 234 : if (SCALAR_INT_MODE_P (mode) && mode != GET_MODE (reg))
4060 0 : op = gen_int_mode (INTVAL (src), mode);
4061 234 : if (op == const0_rtx)
4062 : {
4063 6 : if (standard_sse_constant_p (CONST0_RTX (orig_mode),
4064 : orig_mode) == 1)
4065 : {
4066 6 : *scalar_mode_p = QImode;
4067 6 : *kind_p = X86_CSE_CONST0_VECTOR;
4068 6 : *insn_p = nullptr;
4069 6 : return const0_rtx;
4070 : }
4071 0 : op = CONST0_RTX (mode);
4072 : }
4073 228 : else if (op == constm1_rtx
4074 228 : && standard_sse_constant_p (CONSTM1_RTX (orig_mode),
4075 : orig_mode) == 2)
4076 : {
4077 0 : *scalar_mode_p = QImode;
4078 0 : *kind_p = X86_CSE_CONSTM1_VECTOR;
4079 0 : *insn_p = nullptr;
4080 0 : return constm1_rtx;
4081 : }
4082 :
4083 : /* Check if we can convert:
4084 :
4085 : (insn 14 465 412 3 (set (reg:SI 507 [ j_lsm.26 ])
4086 : (const_int 2 [0x2])) "foo.c":10:12 discrim 2 100 {*movsi_internal} (nil))
4087 : ...
4088 : (insn 518 507 434 16 (set (reg:V2SI 493)
4089 : (vec_duplicate:V2SI (reg:SI 507 [ j_lsm.26 ]))) 2395 {*vec_dupv2si} (nil))
4090 :
4091 : to constant integer load:
4092 :
4093 : (insn 566 55 56 6 (set (subreg:DI (reg:V2SI 517) 0)
4094 : (const_int 8589934594 [0x200000002])) -1 (nil))
4095 : ...
4096 : (insn 518 507 434 16 (set (reg:V2SI 493)
4097 : (reg:V2SI 517)) 2066 {*movv2si_internal} (nil))
4098 :
4099 : */
4100 456 : if (GET_MODE_SIZE (orig_mode) <= UNITS_PER_WORD)
4101 6 : *kind_p = X86_CSE_CONST_VECTOR;
4102 :
4103 228 : *insn_p = nullptr;
4104 : }
4105 : else
4106 : {
4107 : /* Handle sequences like
4108 :
4109 : (set (reg:QI 105 [ c ])
4110 : (reg:QI 5 di [ c ]))
4111 : (set (reg:V64QI 102 [ _1 ])
4112 : (vec_duplicate:V64QI (reg:QI 105 [ c ])))
4113 :
4114 : (set (reg/v:SI 116 [ argc ])
4115 : (mem/c:SI (reg:SI 135) [2 argc+0 S4 A32]))
4116 : (set (reg:V4SI 119 [ _45 ])
4117 : (vec_duplicate:V4SI (reg/v:SI 116 [ argc ])))
4118 :
4119 : (set (reg:SI 98 [ _1 ])
4120 : (sign_extend:SI (reg:QI 106 [ c ])))
4121 : (set (reg:V16SI 103 [ _2 ])
4122 : (vec_duplicate:V16SI (reg:SI 98 [ _1 ])))
4123 :
4124 : (set (reg:SI 102 [ cost ])
4125 : (mem/c:SI (symbol_ref:DI ("cost") [flags 0x40])))
4126 : (set (reg:V4HI 103 [ _16 ])
4127 : (vec_duplicate:V4HI (subreg:HI (reg:SI 102 [ cost ]) 0)))
4128 :
4129 : (set (subreg:SI (reg/v:HI 107 [ cr_val ]) 0)
4130 : (ashift:SI (reg:SI 158)
4131 : (subreg:QI (reg:SI 156 [ _2 ]) 0)))
4132 : (set (reg:V16HI 183 [ _61 ])
4133 : (vec_duplicate:V16HI (reg/v:HI 107 [ cr_val ])))
4134 :
4135 : Set *INSN_P to INSN and return the broadcast source otherwise. */
4136 14590 : *insn_p = insn;
4137 : }
4138 :
4139 14818 : *scalar_mode_p = mode;
4140 14818 : return op;
4141 : }
4142 :
4143 : /* For each instruction whose UID is set in TLS_CALL_INSNS, emit a SET of
4144 : its destination from SRC after it, put the new SET instruction in UPDATED_TLS_INSNS, delete the original instruction and rescan the new one. */
4145 :
4146 : static void
4147 313 : replace_tls_call (rtx src, auto_bitmap &tls_call_insns,
4148 : auto_bitmap &updated_tls_insns)
4149 : {
4150 313 : bitmap_iterator bi;
4151 313 : unsigned int id;
4152 :
4153 1739 : EXECUTE_IF_SET_IN_BITMAP (tls_call_insns, 0, id, bi)
4154 : {
4155 1426 : rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
4156 :
4157 : /* If this isn't a CALL, only GNU2 TLS implicit CALL patterns are
4158 : allowed. */
4159 1426 : if (!CALL_P (insn))
4160 : {
4161 47 : attr_tls64 tls64 = get_attr_tls64 (insn);
4162 47 : if (tls64 != TLS64_CALL && tls64 != TLS64_COMBINE)
4163 0 : gcc_unreachable ();
4164 : }
4165 :
4166 1426 : rtx pat = PATTERN (insn);
4167 1426 : gcc_assert (GET_CODE (pat) == PARALLEL);
4168 1426 : rtx set = XVECEXP (pat, 0, 0); /* The first element of the PARALLEL must be a SET; only its destination is reused. */
4169 1426 : gcc_assert (GET_CODE (set) == SET);
4170 1426 : rtx dest = SET_DEST (set);
4171 : /* Build the replacement SET, emit it after INSN and make sure that it is recognized. */
4172 1426 : set = gen_rtx_SET (dest, src);
4173 1426 : rtx_insn *set_insn = emit_insn_after (set, insn);
4174 1426 : if (recog_memoized (set_insn) < 0)
4175 0 : gcc_unreachable ();
4176 :
4177 : /* Put SET_INSN in UPDATED_TLS_INSNS. */
4178 1426 : bitmap_set_bit (updated_tls_insns, INSN_UID (set_insn));
4179 :
4180 1426 : if (dump_file)
4181 : {
4182 0 : fprintf (dump_file, "\nReplace:\n\n");
4183 0 : print_rtl_single (dump_file, insn);
4184 0 : fprintf (dump_file, "\nwith:\n\n");
4185 0 : print_rtl_single (dump_file, set_insn);
4186 0 : fprintf (dump_file, "\n");
4187 : }
4188 :
4189 : /* Delete the original instruction, which may be a CALL or a GNU2 TLS implicit CALL pattern. */
4190 1426 : delete_insn (insn);
4191 :
4192 1426 : df_insn_rescan (set_insn);
4193 : }
4194 313 : }
4195 :
4196 : /* Return the nearest common dominator of the basic blocks which set
4197 : hard register REGNO and dominate basic block BB. Artificial definitions and definitions flagged as clobbers are ignored. */
4198 :
4199 : static basic_block
4200 2 : ix86_get_dominator_for_reg (unsigned int regno, basic_block bb)
4201 : {
4202 2 : basic_block set_bb;
4203 2 : auto_bitmap set_bbs;
4204 :
4205 : /* Get all BBs which set REGNO and dominate the current BB from all
4206 : DEFs of REGNO. DEFs which are artificial or which are flagged DF_REF_MAY_CLOBBER or DF_REF_MUST_CLOBBER are skipped. */
4207 2 : for (df_ref def = DF_REG_DEF_CHAIN (regno);
4208 18 : def;
4209 16 : def = DF_REF_NEXT_REG (def))
4210 16 : if (!DF_REF_IS_ARTIFICIAL (def)
4211 16 : && !DF_REF_FLAGS_IS_SET (def, DF_REF_MAY_CLOBBER)
4212 6 : && !DF_REF_FLAGS_IS_SET (def, DF_REF_MUST_CLOBBER))
4213 : {
4214 4 : set_bb = DF_REF_BB (def);
4215 4 : if (dominated_by_p (CDI_DOMINATORS, bb, set_bb))
4216 2 : bitmap_set_bit (set_bbs, set_bb->index);
4217 : }
4218 : /* NOTE(review): this assumes that at least one such BB was recorded in SET_BBS; confirm how nearest_common_dominator_for_set handles an empty set. */
4219 2 : bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
4220 2 : return bb;
4221 2 : }
4222 :
4223 : /* Callback for note_stores. Mark FLAGS register as live in DATA, a pointer to an auto_bitmap of live caller-saved
4224 : registers, if DEST is FLAGS register and X, the store expression, isn't a CLOBBER. */
4225 :
4226 : static void
4227 381 : ix86_check_flags_reg (rtx dest, const_rtx x, void *data)
4228 : {
4229 381 : if (GET_CODE (x) == CLOBBER)
4230 : return;
4231 : /* Only a store other than a CLOBBER marks FLAGS as live. */
4232 374 : auto_bitmap *live_caller_saved_regs = (auto_bitmap *) data;
4233 374 : if (REG_P (dest) && REGNO (dest) == FLAGS_REG)
4234 0 : bitmap_set_bit (*live_caller_saved_regs, FLAGS_REG);
4235 : }
4236 :
4237 : /* Emit a TLS_SET instruction of KIND in basic block BB, or in a dominating basic block when no insertion point is found in BB, and return the emitted instruction. Store the
4238 : insertion point in *BEFORE_P for emit_insn_before or in *AFTER_P
4239 : for emit_insn_after. UPDATED_GNU_TLS_INSNS contains instructions
4240 : which replace the GNU TLS instructions. UPDATED_GNU2_TLS_INSNS
4241 : contains instructions which replace the GNU2 TLS instructions. */
4242 :
4243 : static rtx_insn *
4244 313 : ix86_emit_tls_call (rtx tls_set, x86_cse_kind kind, basic_block bb,
4245 : rtx_insn **before_p, rtx_insn **after_p,
4246 : auto_bitmap &updated_gnu_tls_insns,
4247 : auto_bitmap &updated_gnu2_tls_insns)
4248 : {
4249 315 : rtx_insn *tls_insn;
4250 : /* Each iteration tries to place TLS_SET in BB. If no insertion point is found, BB is replaced by a dominator and the search starts over. */
4251 315 : do
4252 : {
4253 315 : rtx_insn *insn = BB_HEAD (bb); /* Find the first nondebug instruction in BB. INSN is set to NULL if the end of BB is reached first. */
4254 1297 : while (insn && !NONDEBUG_INSN_P (insn))
4255 : {
4256 986 : if (insn == BB_END (bb))
4257 : {
4258 : /* This must be the beginning basic block:
4259 :
4260 : (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
4261 : (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)
4262 :
4263 : or a basic block with only a label:
4264 :
4265 : (code_label 78 11 77 3 14 (nil) [1 uses])
4266 : (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
4267 :
4268 : or a basic block with only a debug marker:
4269 :
4270 : (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
4271 : (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
4272 : (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))
4273 :
4274 : or a basic block with only deleted instructions:
4275 :
4276 : (code_label 348 23 349 45 3 (nil) [0 uses])
4277 : (note 349 348 436 45 [bb 45] NOTE_INSN_BASIC_BLOCK)
4278 : (note 436 349 362 45 NOTE_INSN_DELETED)
4279 :
4280 : */
4281 4 : gcc_assert (DEBUG_INSN_P (insn)
4282 : || (NOTE_P (insn)
4283 : && ((NOTE_KIND (insn)
4284 : == NOTE_INSN_FUNCTION_BEG)
4285 : || (NOTE_KIND (insn)
4286 : == NOTE_INSN_DELETED)
4287 : || (NOTE_KIND (insn)
4288 : == NOTE_INSN_BASIC_BLOCK))));
4289 : insn = NULL;
4290 : break;
4291 : }
4292 982 : insn = NEXT_INSN (insn);
4293 : }
4294 :
4295 : /* TLS_GD and TLS_LD_BASE instructions are normal functions which
4296 : clobber caller-saved registers. TLSDESC instructions only
4297 : clobber FLAGS. If any registers clobbered by TLS instructions
4298 : are live in this basic block, we must insert TLS instructions
4299 : after all live registers clobbered are dead. */
4300 :
4301 315 : auto_bitmap live_caller_saved_regs;
4302 630 : bitmap in = df_live ? DF_LIVE_IN (bb) : DF_LR_IN (bb);
4303 : /* FLAGS is tracked for every KIND. */
4304 315 : if (bitmap_bit_p (in, FLAGS_REG))
4305 4 : bitmap_set_bit (live_caller_saved_regs, FLAGS_REG);
4306 :
4307 315 : unsigned int i;
4308 :
4309 : /* Get all live caller-saved registers for TLS_GD and TLS_LD_BASE
4310 : instructions. Fixed registers are excluded. */
4311 315 : if (kind != X86_CSE_TLSDESC)
4312 27249 : for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4313 26956 : if (call_used_regs[i]
4314 25198 : && !fixed_regs[i]
4315 38993 : && bitmap_bit_p (in, i))
4316 344 : bitmap_set_bit (live_caller_saved_regs, i);
4317 : /* Nothing clobbered by the TLS instruction is live on entry to BB: place it at the start of BB. */
4318 315 : if (bitmap_empty_p (live_caller_saved_regs))
4319 : {
4320 82 : if (insn == BB_HEAD (bb))
4321 : {
4322 0 : *before_p = insn;
4323 0 : tls_insn = emit_insn_before (tls_set, insn);
4324 : }
4325 : else
4326 : {
4327 : /* Emit the TLS call after NOTE_INSN_FUNCTION_BEG in the
4328 : beginning basic block:
4329 :
4330 : (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
4331 : (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)
4332 :
4333 : or after NOTE_INSN_BASIC_BLOCK in a basic block with
4334 : only a label:
4335 :
4336 : (code_label 78 11 77 3 14 (nil) [1 uses])
4337 : (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
4338 :
4339 : or after debug marker in a basic block with only a
4340 : debug marker:
4341 :
4342 : (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
4343 : (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
4344 : (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))
4345 :
4346 : */
4347 82 : insn = insn ? PREV_INSN (insn) : BB_END (bb);
4348 82 : *after_p = insn;
4349 82 : tls_insn = emit_insn_after (tls_set, insn);
4350 : }
4351 82 : return tls_insn;
4352 : }
4353 :
4354 233 : bool repeat = false;
4355 :
4356 : /* Search for REG_DEAD notes in this basic block. */
4357 661 : FOR_BB_INSNS (bb, insn)
4358 : {
4359 661 : if (!NONDEBUG_INSN_P (insn))
4360 283 : continue;
4361 :
4362 : /* NB: Conditional jump is the only instruction which reads
4363 : flags register and changes control flow. We can never
4364 : place the TLS call after unconditional jump. */
4365 378 : if (JUMP_P (insn))
4366 : {
4367 : /* This must be a conditional jump. */
4368 2 : rtx label = JUMP_LABEL (insn);
4369 2 : if (label == nullptr
4370 2 : || ANY_RETURN_P (label)
4371 2 : || !(LABEL_P (label) || SYMBOL_REF_P (label)))
4372 0 : gcc_unreachable ();
4373 :
4374 : /* Place the call before all FLAGS_REG setting BBs since
4375 : we can place a call neither before nor after a conditional
4376 : jump. */
4377 2 : bb = ix86_get_dominator_for_reg (FLAGS_REG, bb);
4378 :
4379 : /* Start over again. */
4380 2 : repeat = true;
4381 2 : break;
4382 : }
4383 :
4384 376 : if (bitmap_bit_p (updated_gnu_tls_insns, INSN_UID (insn)))
4385 : {
4386 : /* Insert the __tls_get_addr call before INSN which
4387 : replaces a __tls_get_addr call. */
4388 1 : *before_p = insn;
4389 1 : tls_insn = emit_insn_before (tls_set, insn);
4390 1 : return tls_insn;
4391 : }
4392 :
4393 375 : if (bitmap_bit_p (updated_gnu2_tls_insns, INSN_UID (insn)))
4394 : {
4395 : /* Mark FLAGS register as dead since FLAGS register
4396 : would be clobbered by the GNU2 TLS instruction. */
4397 1 : bitmap_clear_bit (live_caller_saved_regs, FLAGS_REG);
4398 1 : continue;
4399 : }
4400 :
4401 : /* Check if FLAGS register is live. */
4402 374 : note_stores (insn, ix86_check_flags_reg,
4403 : &live_caller_saved_regs);
4404 : /* A REG_DEAD note, or a REG_UNUSED note for FLAGS register, ends the live range of its register. */
4405 374 : rtx link;
4406 515 : for (link = REG_NOTES (insn); link; link = XEXP (link, 1))
4407 371 : if ((REG_NOTE_KIND (link) == REG_DEAD
4408 9 : || (REG_NOTE_KIND (link) == REG_UNUSED
4409 7 : && REGNO (XEXP (link, 0)) == FLAGS_REG))
4410 378 : && REG_P (XEXP (link, 0)))
4411 : {
4412 : /* Mark the live caller-saved register as dead. Only hard registers are tracked in the bitmap. */
4413 743 : for (i = REGNO (XEXP (link, 0));
4414 743 : i < END_REGNO (XEXP (link, 0));
4415 : i++)
4416 374 : if (i < FIRST_PSEUDO_REGISTER)
4417 351 : bitmap_clear_bit (live_caller_saved_regs, i);
4418 : /* Place the TLS instruction after INSN once all tracked registers are dead. */
4419 369 : if (bitmap_empty_p (live_caller_saved_regs))
4420 : {
4421 230 : *after_p = insn;
4422 230 : tls_insn = emit_insn_after (tls_set, insn);
4423 230 : return tls_insn;
4424 : }
4425 : }
4426 : }
4427 :
4428 : /* NB: Start over again for conditional jump. */
4429 2 : if (repeat)
4430 2 : continue;
4431 :
4432 0 : gcc_assert (!bitmap_empty_p (live_caller_saved_regs));
4433 :
4434 : /* If any live caller-saved registers aren't dead at the end of
4435 : this basic block, get the basic block which dominates all
4436 : basic blocks which set the remaining live registers. */
4437 0 : auto_bitmap set_bbs;
4438 0 : bitmap_iterator bi;
4439 0 : unsigned int id;
4440 0 : EXECUTE_IF_SET_IN_BITMAP (live_caller_saved_regs, 0, id, bi)
4441 : {
4442 0 : basic_block set_bb = ix86_get_dominator_for_reg (id, bb);
4443 0 : bitmap_set_bit (set_bbs, set_bb->index);
4444 : }
4445 0 : bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
4446 2 : }
4447 : while (true);
4448 : }
4449 :
4450 : /* Generate a TLS call of KIND with VAL and copy the call result to DEST,
4451 : at entry of the nearest dominator for basic block map BBS, which is in
4452 : the fake loop that contains the whole function, so that there is only
4453 : a single TLS CALL of KIND with VAL in the whole function.
4454 : UPDATED_GNU_TLS_INSNS contains instructions which replace the GNU TLS
4455 : instructions. UPDATED_GNU2_TLS_INSNS contains instructions which
4456 : replace the GNU2 TLS instructions. If TLSDESC_SET isn't nullptr,
4457 : insert a copy of it before the TLS call. For X86_CSE_TLSDESC, DEST is set directly by the emitted pattern and no separate copy is emitted. */
4458 :
4459 : static void
4460 313 : ix86_place_single_tls_call (rtx dest, rtx val, x86_cse_kind kind,
4461 : auto_bitmap &bbs,
4462 : auto_bitmap &updated_gnu_tls_insns,
4463 : auto_bitmap &updated_gnu2_tls_insns,
4464 : rtx tlsdesc_set = nullptr)
4465 : {
4466 313 : basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
4467 313 : while (bb->loop_father->latch
4468 322 : != EXIT_BLOCK_PTR_FOR_FN (cfun)) /* Move BB to the immediate dominator of the enclosing loop header until BB is in the loop whose latch is the exit block. */
4469 9 : bb = get_immediate_dominator (CDI_DOMINATORS,
4470 : bb->loop_father->header);
4471 : /* RAX and EQV are only set for X86_CSE_TLS_GD and X86_CSE_TLS_LD_BASE. */
4472 313 : rtx rax = nullptr, rdi;
4473 313 : rtx eqv = nullptr;
4474 313 : rtx caddr;
4475 313 : rtx set;
4476 313 : rtx clob;
4477 313 : rtx symbol;
4478 313 : rtx tls;
4479 :
4480 313 : switch (kind)
4481 : {
4482 262 : case X86_CSE_TLS_GD:
4483 262 : rax = gen_rtx_REG (Pmode, AX_REG);
4484 262 : rdi = gen_rtx_REG (Pmode, DI_REG);
4485 262 : caddr = ix86_tls_get_addr ();
4486 : /* The TLS symbol is the first operand of the UNSPEC in VAL. */
4487 262 : symbol = XVECEXP (val, 0, 0);
4488 262 : tls = gen_tls_global_dynamic_64 (Pmode, rax, symbol, caddr, rdi);
4489 : /* Use the symbol, zero-extended to Pmode if needed, as the REG_EQUAL value. */
4490 262 : if (GET_MODE (symbol) != Pmode)
4491 0 : symbol = gen_rtx_ZERO_EXTEND (Pmode, symbol);
4492 : eqv = symbol;
4493 : break;
4494 :
4495 30 : case X86_CSE_TLS_LD_BASE:
4496 30 : rax = gen_rtx_REG (Pmode, AX_REG);
4497 30 : rdi = gen_rtx_REG (Pmode, DI_REG);
4498 30 : caddr = ix86_tls_get_addr ();
4499 :
4500 30 : tls = gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi);
4501 :
4502 : /* Attach a unique REG_EQUAL to DEST, to allow the RTL optimizers
4503 : to share the LD_BASE result with other LD model accesses. */
4504 30 : eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
4505 : UNSPEC_TLS_LD_BASE);
4506 :
4507 30 : break;
4508 : /* Build a PARALLEL of a SET of DEST from VAL and a CLOBBER of FLAGS register. */
4509 21 : case X86_CSE_TLSDESC:
4510 21 : set = gen_rtx_SET (dest, val);
4511 21 : clob = gen_rtx_CLOBBER (VOIDmode,
4512 : gen_rtx_REG (CCmode, FLAGS_REG));
4513 21 : tls = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clob));
4514 21 : break;
4515 :
4516 0 : default:
4517 0 : gcc_unreachable ();
4518 : }
4519 :
4520 : /* Emit the TLS CALL insn. Exactly one of BEFORE and AFTER is expected to be set by ix86_emit_tls_call. */
4521 313 : rtx_insn *before = nullptr;
4522 313 : rtx_insn *after = nullptr;
4523 313 : rtx_insn *tls_insn = ix86_emit_tls_call (tls, kind, bb, &before,
4524 : &after,
4525 : updated_gnu_tls_insns,
4526 : updated_gnu2_tls_insns);
4527 :
4528 313 : rtx_insn *tlsdesc_insn = nullptr;
4529 313 : if (tlsdesc_set)
4530 : { /* NB: DEST below shadows the DEST parameter within this block. */
4531 16 : rtx dest = copy_rtx (SET_DEST (tlsdesc_set));
4532 16 : rtx src = copy_rtx (SET_SRC (tlsdesc_set));
4533 16 : tlsdesc_set = gen_rtx_SET (dest, src);
4534 16 : tlsdesc_insn = emit_insn_before (tlsdesc_set, tls_insn);
4535 : }
4536 :
4537 313 : if (kind != X86_CSE_TLSDESC)
4538 : {
4539 292 : RTL_CONST_CALL_P (tls_insn) = 1;
4540 :
4541 : /* Indicate that this function can't jump to non-local gotos. */
4542 292 : make_reg_eh_region_note_nothrow_nononlocal (tls_insn);
4543 : }
4544 :
4545 313 : if (recog_memoized (tls_insn) < 0)
4546 0 : gcc_unreachable ();
4547 :
4548 313 : if (dump_file)
4549 : {
4550 0 : if (after)
4551 : {
4552 0 : fprintf (dump_file, "\nPlace:\n\n");
4553 0 : if (tlsdesc_insn)
4554 0 : print_rtl_single (dump_file, tlsdesc_insn);
4555 0 : print_rtl_single (dump_file, tls_insn);
4556 0 : fprintf (dump_file, "\nafter:\n\n");
4557 0 : print_rtl_single (dump_file, after);
4558 0 : fprintf (dump_file, "\n");
4559 : }
4560 : else
4561 : {
4562 0 : fprintf (dump_file, "\nPlace:\n\n");
4563 0 : if (tlsdesc_insn)
4564 0 : print_rtl_single (dump_file, tlsdesc_insn);
4565 0 : print_rtl_single (dump_file, tls_insn);
4566 0 : fprintf (dump_file, "\nbefore:\n\n");
4567 0 : print_rtl_single (dump_file, before);
4568 0 : fprintf (dump_file, "\n");
4569 : }
4570 : }
4571 :
4572 313 : if (kind != X86_CSE_TLSDESC)
4573 : {
4574 : /* Copy RAX to DEST and attach EQV to the copy as a REG_EQUAL note. */
4575 292 : set = gen_rtx_SET (dest, rax);
4576 292 : rtx_insn *set_insn = emit_insn_after (set, tls_insn);
4577 292 : set_dst_reg_note (set_insn, REG_EQUAL, copy_rtx (eqv), dest);
4578 292 : if (dump_file)
4579 : {
4580 0 : fprintf (dump_file, "\nPlace:\n\n");
4581 0 : print_rtl_single (dump_file, set_insn);
4582 0 : fprintf (dump_file, "\nafter:\n\n");
4583 0 : print_rtl_single (dump_file, tls_insn);
4584 0 : fprintf (dump_file, "\n");
4585 : }
4586 : }
4587 313 : }
4588 :
4589 : namespace {
4590 : /* Pass descriptor for the x86_cse RTL pass. It is passed to the rtl_opt_pass constructor by pass_x86_cse. */
4591 : const pass_data pass_data_x86_cse =
4592 : {
4593 : RTL_PASS, /* type */
4594 : "x86_cse", /* name */
4595 : OPTGROUP_NONE, /* optinfo_flags */
4596 : TV_MACH_DEP, /* tv_id */
4597 : 0, /* properties_required */
4598 : 0, /* properties_provided */
4599 : 0, /* properties_destroyed */
4600 : 0, /* todo_flags_start */
4601 : 0, /* todo_flags_finish */
4602 : };
4603 : /* RTL pass whose execute method runs x86_cse. The private data members hold the description of the candidate instruction currently being examined and are filled in by the candidate_*_p methods. */
4604 : class pass_x86_cse : public rtl_opt_pass
4605 : {
4606 : public:
4607 288767 : pass_x86_cse (gcc::context *ctxt)
4608 577534 : : rtl_opt_pass (pass_data_x86_cse, ctxt)
4609 : {}
4610 :
4611 : /* opt_pass methods: */
4612 1481491 : bool gate (function *fun) final override
4613 : { /* Run only when optimizing and FUN is optimized for speed. */
4614 1481491 : return optimize && optimize_function_for_speed_p (fun);
4615 : }
4616 :
4617 981264 : unsigned int execute (function *) final override
4618 : {
4619 981264 : return x86_cse ();
4620 : }
4621 :
4622 : private:
4623 : /* The redundant source value. */
4624 : rtx val;
4625 : /* The actual redundant source value for UNSPEC_TLSDESC. */
4626 : rtx tlsdesc_val;
4627 : /* The instruction which defines the redundant value, or nullptr. */
4628 : rtx_insn *def_insn;
4629 : /* Mode of the destination of the candidate redundant instruction. */
4630 : machine_mode mode;
4631 : /* Mode of the source of the candidate redundant instruction. */
4632 : machine_mode scalar_mode;
4633 : /* The classification of the candidate redundant instruction. */
4634 : x86_cse_kind kind;
4635 : /* x86_cse is the pass body. The other methods below are helpers which recognize candidate instructions. */
4636 : unsigned int x86_cse (void);
4637 : bool candidate_gnu_tls_p (rtx_insn *, attr_tls64);
4638 : bool candidate_gnu2_tls_p (rtx, attr_tls64);
4639 : bool candidate_vector_p (rtx, rtx_insn *);
4640 : rtx_insn *tls_set_insn_from_symbol (const_rtx, const_rtx);
4641 : }; // class pass_x86_cse
4642 :
4643 : /* Return the instruction which sets REG from TLS_SYMBOL. All definitions of REG are checked: return nullptr if any of them is artificial, isn't a TLS64_LEA instruction or has a TLS symbol different from TLS_SYMBOL. Otherwise return the instruction of the last definition visited, or nullptr if REG has no definition. */
4644 :
4645 : rtx_insn *
4646 42 : pass_x86_cse::tls_set_insn_from_symbol (const_rtx reg,
4647 : const_rtx tls_symbol)
4648 : {
4649 42 : rtx_insn *set_insn = nullptr;
4650 42 : for (df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
4651 111 : ref;
4652 69 : ref = DF_REF_NEXT_REG (ref))
4653 : {
4654 69 : if (DF_REF_IS_ARTIFICIAL (ref))
4655 : return nullptr;
4656 :
4657 69 : set_insn = DF_REF_INSN (ref);
4658 69 : if (get_attr_tls64 (set_insn) != TLS64_LEA)
4659 : return nullptr;
4660 : /* The pattern of a TLS64_LEA instruction is accessed as a SET whose source has the TLS symbol as its first vector operand. */
4661 69 : rtx tls_set = PATTERN (set_insn);
4662 69 : rtx tls_src = XVECEXP (SET_SRC (tls_set), 0, 0);
4663 69 : if (!rtx_equal_p (tls_symbol, tls_src))
4664 : return nullptr;
4665 : }
4666 :
4667 : return set_insn;
4668 : }
4669 :
4670 : /* Return true and output def_insn, val, mode, scalar_mode and kind if
4671 : INSN is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE. TLS64 is the tls64 attribute of INSN: TLS64_GD selects X86_CSE_TLS_GD and any other value selects X86_CSE_TLS_LD_BASE. Return false without changing any output unless this is a 64-bit target and cfun->machine->tls_descriptor_call_multiple_p is set. */
4672 :
4673 : bool
4674 2185 : pass_x86_cse::candidate_gnu_tls_p (rtx_insn *insn, attr_tls64 tls64)
4675 : {
4676 2185 : if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
4677 : return false;
4678 :
4679 : /* Record the redundant TLS CALLs for 64-bit:
4680 :
4681 : (parallel [
4682 : (set (reg:DI 0 ax)
4683 : (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
4684 : (const_int 0 [0])))
4685 : (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
4686 : (reg/f:DI 7 sp)] UNSPEC_TLS_GD)
4687 : (clobber (reg:DI 5 di))])
4688 :
4689 :
4690 : and
4691 :
4692 : (parallel [
4693 : (set (reg:DI 0 ax)
4694 : (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
4695 : (const_int 0 [0])))
4696 : (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)])
4697 :
4698 : */
4699 : /* Element 0 of the PARALLEL is the SET and element 1 is the UNSPEC which is used as VAL. */
4700 2022 : rtx pat = PATTERN (insn);
4701 2022 : rtx set = XVECEXP (pat, 0, 0);
4702 2022 : gcc_assert (GET_CODE (set) == SET);
4703 2022 : rtx dest = SET_DEST (set);
4704 2022 : scalar_mode = mode = GET_MODE (dest);
4705 2022 : val = XVECEXP (pat, 0, 1);
4706 2022 : gcc_assert (GET_CODE (val) == UNSPEC);
4707 :
4708 2022 : if (tls64 == TLS64_GD)
4709 1921 : kind = X86_CSE_TLS_GD;
4710 : else
4711 101 : kind = X86_CSE_TLS_LD_BASE;
4712 : /* No defining instruction is recorded for these kinds. */
4713 2022 : def_insn = nullptr;
4714 2022 : return true;
4715 : }
4716 :
4717 : /* Return true and output def_insn, val, tlsdesc_val, mode, scalar_mode and kind if
4718 : SET is UNSPEC_TLSDESC. TLS64 is TLS64_COMBINE for the TLS64_COMBINE pattern; any other value is handled as TLS64_CALL. NB: val, tlsdesc_val, mode, scalar_mode and kind may have been modified when false is returned from the TLS64_CALL path. */
4719 :
4720 : bool
4721 56 : pass_x86_cse::candidate_gnu2_tls_p (rtx set, attr_tls64 tls64)
4722 : {
4723 56 : if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
4724 : return false;
4725 :
4726 54 : rtx tls_symbol;
4727 54 : rtx_insn *set_insn;
4728 54 : rtx src = SET_SRC (set);
4729 54 : val = src;
4730 54 : tlsdesc_val = src; /* Keep the whole source in tlsdesc_val; val is narrowed below. */
4731 54 : kind = X86_CSE_TLSDESC;
4732 :
4733 54 : if (tls64 == TLS64_COMBINE)
4734 : {
4735 : /* Record 64-bit TLS64_COMBINE:
4736 :
4737 : (set (reg/f:DI 104)
4738 : (plus:DI (unspec:DI [
4739 : (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
4740 : (reg:DI 114)
4741 : (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
4742 : (const:DI (unspec:DI [
4743 : (symbol_ref:DI ("e") [flags 0x1a])
4744 : ] UNSPEC_DTPOFF))))
4745 :
4746 : (set (reg/f:DI 104)
4747 : (plus:DI (unspec:DI [
4748 : (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
4749 : (unspec:DI [
4750 : (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
4751 : ] UNSPEC_TLSDESC)
4752 : (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
4753 : (const:DI (unspec:DI [
4754 : (symbol_ref:DI ("e") [flags 0x1a])
4755 : ] UNSPEC_DTPOFF))))
4756 : */
4757 :
4758 12 : scalar_mode = mode = GET_MODE (src);
4759 :
4760 : /* Since the first operand of PLUS in the source TLS_COMBINE
4761 : pattern is unused, use the second operand of PLUS:
4762 :
4763 : (const:DI (unspec:DI [
4764 : (symbol_ref:DI ("e") [flags 0x1a])
4765 : ] UNSPEC_DTPOFF))
4766 :
4767 : as VAL to check if 2 TLS_COMBINE patterns have the same
4768 : source. */
4769 12 : val = XEXP (src, 1);
4770 12 : gcc_assert (GET_CODE (val) == CONST
4771 : && GET_CODE (XEXP (val, 0)) == UNSPEC
4772 : && XINT (XEXP (val, 0), 1) == UNSPEC_DTPOFF
4773 : && SYMBOL_REF_P (XVECEXP (XEXP (val, 0), 0, 0)));
4774 12 : def_insn = nullptr;
4775 12 : return true;
4776 : }
4777 :
4778 : /* Record 64-bit TLS_CALL:
4779 :
4780 : (set (reg:DI 101)
4781 : (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
4782 : (reg:DI 112)
4783 : (reg/f:DI 7 sp)] UNSPEC_TLSDESC))
4784 :
4785 : */
4786 : /* From here on SRC is the register operand of the UNSPEC, not the source of SET. */
4787 42 : gcc_assert (GET_CODE (src) == UNSPEC);
4788 42 : tls_symbol = XVECEXP (src, 0, 0);
4789 42 : src = XVECEXP (src, 0, 1);
4790 42 : scalar_mode = mode = GET_MODE (src);
4791 42 : gcc_assert (REG_P (src));
4792 :
4793 : /* All definitions of reg:DI 129 in
4794 :
4795 : (set (reg:DI 110)
4796 : (unspec:DI [(symbol_ref:DI ("foo"))
4797 : (reg:DI 129)
4798 : (reg/f:DI 7 sp)] UNSPEC_TLSDESC))
4799 :
4800 : should have the same source as in
4801 :
4802 : (set (reg:DI 129)
4803 : (unspec:DI [(symbol_ref:DI ("foo"))] UNSPEC_TLSDESC))
4804 :
4805 : */
4806 :
4807 42 : set_insn = tls_set_insn_from_symbol (src, tls_symbol);
4808 42 : if (!set_insn)
4809 : return false;
4810 :
4811 : /* Use TLS_SYMBOL as VAL to check if 2 patterns have the same source. */
4812 42 : val = tls_symbol;
4813 42 : def_insn = set_insn;
4814 42 : return true;
4815 : }
4816 :
4817 : /* Return true and output def_insn, val, mode, scalar_mode and kind if
4818 : SET, a set in INSN, has a vector mode register or subreg destination and ix86_broadcast_inner returns a non-null value for its source. NB: mode, def_insn and val are modified even when false is returned. */
4819 :
4820 : bool
4821 50034109 : pass_x86_cse::candidate_vector_p (rtx set, rtx_insn *insn)
4822 : {
4823 50034109 : rtx src = SET_SRC (set);
4824 50034109 : rtx dest = SET_DEST (set);
4825 50034109 : mode = GET_MODE (dest);
4826 : /* Skip non-vector instruction. */
4827 50034109 : if (!VECTOR_MODE_P (mode))
4828 : return false;
4829 :
4830 : /* Skip instruction whose destination is neither a register nor a subreg. */
4831 3715770 : if (!REG_P (dest) && !SUBREG_P (dest))
4832 : return false;
4833 : /* def_insn starts as INSN. ix86_broadcast_inner may replace it through the pointer. */
4834 2202109 : def_insn = insn;
4835 2202109 : val = ix86_broadcast_inner (src, mode, &scalar_mode, &kind,
4836 : &def_insn);
4837 2202109 : return val ? true : false;
4838 : }
4839 :
4840 : /* At entry of the nearest common dominator for basic blocks with
4841 :
4842 : 1. Vector CONST0_RTX patterns.
4843 : 2. Vector CONSTM1_RTX patterns.
4844 : 3. Vector broadcast patterns.
4845 : 4. UNSPEC_TLS_GD patterns.
4846 : 5. UNSPEC_TLS_LD_BASE patterns.
4847 : 6. UNSPEC_TLSDESC patterns.
4848 :
4849 : generate a single pattern whose destination is used to replace the
4850 : source in all identical patterns.
4851 :
4852 : NB: We want to generate a pattern, which is executed only once, to
4853 : cover the whole function. The LCM algorithm isn't appropriate here
4854 : since it may place a pattern inside the loop. */
4855 :
4856 : unsigned int
4857 981264 : pass_x86_cse::x86_cse (void)
4858 : {
4859 981264 : timevar_push (TV_MACH_DEP);
4860 :
4861 981264 : auto_vec<redundant_pattern *> loads;
4862 981264 : redundant_pattern *load;
4863 981264 : basic_block bb;
4864 981264 : rtx_insn *insn;
4865 981264 : unsigned int i;
4866 981264 : auto_bitmap updated_gnu_tls_insns;
4867 981264 : auto_bitmap updated_gnu2_tls_insns;
4868 981264 : auto_bitmap call_bbs;
4869 :
4870 981264 : df_set_flags (DF_DEFER_INSN_RESCAN);
4871 :
4872 981264 : bool recursive_call_p = cfun->machine->recursive_function;
4873 :
4874 10941498 : FOR_EACH_BB_FN (bb, cfun)
4875 : {
4876 132278937 : FOR_BB_INSNS (bb, insn)
4877 : {
4878 122318703 : if (!NONDEBUG_INSN_P (insn))
4879 68601410 : continue;
4880 :
4881 53717293 : bool matched = false;
4882 : /* Remove redundant patterns if there are more than 2 of
4883 : them. */
4884 53717293 : unsigned int threshold = 2;
4885 :
4886 53717293 : bool call_p = CALL_P (insn);
4887 53717293 : rtx set = single_set (insn);
4888 53717293 : if (!set && !call_p)
4889 1105914 : continue;
4890 :
4891 52611379 : tlsdesc_val = nullptr;
4892 :
4893 52611379 : attr_tls64 tls64 = get_attr_tls64 (insn);
4894 :
4895 : /* NB: TLS calls preserve all registers. */
4896 52611379 : if (call_p && tls64 == TLS64_NONE)
4897 4446590 : bitmap_set_bit (call_bbs, BLOCK_FOR_INSN (insn)->index);
4898 :
4899 52611379 : switch (tls64)
4900 : {
4901 2185 : case TLS64_GD:
4902 2185 : case TLS64_LD_BASE:
4903 : /* Verify UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE. */
4904 2185 : if (candidate_gnu_tls_p (insn, tls64))
4905 : break;
4906 163 : continue;
4907 :
4908 56 : case TLS64_CALL:
4909 56 : case TLS64_COMBINE:
4910 : /* Verify UNSPEC_TLSDESC. */
4911 56 : if (candidate_gnu2_tls_p (set, tls64))
4912 : break;
4913 2 : continue;
4914 :
4915 38 : case TLS64_LEA:
4916 : /* Skip TLS64_LEA. */
4917 38 : continue;
4918 :
4919 52609100 : case TLS64_NONE:
4920 52609100 : if (!set)
4921 2574991 : continue;
4922 :
4923 : /* Check for vector broadcast. */
4924 50034109 : if (candidate_vector_p (set, insn))
4925 : break;
4926 49785064 : continue;
4927 : }
4928 :
4929 : /* Check if there is a matching redundant load. */
4930 592668 : FOR_EACH_VEC_ELT (loads, i, load)
4931 437809 : if (load->val
4932 437809 : && load->kind == kind
4933 291799 : && load->mode == scalar_mode
4934 256144 : && (load->bb == bb
4935 196938 : || (kind != X86_CSE_VEC_DUP
4936 196938 : && kind != X86_CSE_CONST_VECTOR)
4937 : /* Non all 0s/1s vector load must be in the same
4938 : basic block if it is in a recursive call. */
4939 137251 : || !recursive_call_p)
4940 691826 : && rtx_equal_p (load->val, val))
4941 : {
4942 : /* Record instruction. */
4943 96262 : bitmap_set_bit (load->insns, INSN_UID (insn));
4944 :
4945 : /* Record the maximum vector size. */
4946 96262 : if (kind <= X86_CSE_VEC_DUP
4947 191411 : && load->size < GET_MODE_SIZE (mode))
4948 1012 : load->size = GET_MODE_SIZE (mode);
4949 :
4950 : /* Record the basic block. */
4951 96262 : bitmap_set_bit (load->bbs, bb->index);
4952 :
4953 : /* Increment the count. */
4954 96262 : load->count++;
4955 :
4956 96262 : matched = true;
4957 96262 : break;
4958 : }
4959 :
4960 251121 : if (matched)
4961 96262 : continue;
4962 :
4963 : /* We see this instruction the first time. Record the
4964 : redundant source value, its mode, the destination size,
4965 : instruction which defines the redundant source value,
4966 : instruction basic block and the instruction kind. */
4967 154859 : load = new redundant_pattern;
4968 :
4969 : /* Convert CONST_VECTOR load no larger than integer register
4970 : to constant integer load even if there is no redundant
4971 : CONST_VECTOR load. */
4972 154859 : if (CONST_VECTOR_P (val))
4973 31025 : threshold = 1;
4974 :
4975 154859 : load->val = copy_rtx (val);
4976 154859 : if (tlsdesc_val)
4977 28 : load->tlsdesc_val = copy_rtx (tlsdesc_val);
4978 : else
4979 154831 : load->tlsdesc_val = nullptr;
4980 154859 : load->mode = scalar_mode;
4981 154859 : load->dest_mode = mode;
4982 154859 : load->size = GET_MODE_SIZE (mode);
4983 154859 : load->def_insn = def_insn;
4984 154859 : load->count = 1;
4985 154859 : load->threshold = threshold;
4986 154859 : load->bb = BLOCK_FOR_INSN (insn);
4987 154859 : load->kind = kind;
4988 :
4989 154859 : bitmap_set_bit (load->insns, INSN_UID (insn));
4990 154859 : bitmap_set_bit (load->bbs, bb->index);
4991 :
4992 154859 : loads.safe_push (load);
4993 : }
4994 : }
4995 :
4996 : bool replaced = false;
4997 1136123 : FOR_EACH_VEC_ELT (loads, i, load)
4998 154859 : if (load->count >= load->threshold)
4999 : {
5000 63969 : machine_mode mode;
5001 63969 : rtx reg, broadcast_reg;
5002 63969 : rtx broadcast_source = nullptr;
5003 63969 : replaced = true;
5004 63969 : switch (load->kind)
5005 : {
5006 313 : case X86_CSE_TLS_GD:
5007 313 : case X86_CSE_TLS_LD_BASE:
5008 313 : case X86_CSE_TLSDESC:
5009 313 : broadcast_reg = gen_reg_rtx (load->mode);
5010 313 : replace_tls_call (broadcast_reg, load->insns,
5011 313 : (load->kind == X86_CSE_TLSDESC
5012 : ? updated_gnu2_tls_insns
5013 : : updated_gnu_tls_insns));
5014 313 : load->broadcast_reg = broadcast_reg;
5015 313 : break;
5016 :
5017 11153 : case X86_CSE_VEC_DUP:
5018 11153 : if (CONST_INT_P (load->val)
5019 10051 : && (load->val == CONST0_RTX (load->mode)
5020 10075 : || load->size <= UNITS_PER_WORD))
5021 : {
5022 : /* Generate CONST_VECTOR load. */
5023 31026 : case X86_CSE_CONST_VECTOR:
5024 31026 : mode = ix86_get_vector_cse_mode (load->size,
5025 : load->mode);
5026 :
5027 31026 : if (CONST_VECTOR_P (load->val))
5028 : broadcast_source = load->val;
5029 1 : else if (load->val == CONST0_RTX (load->mode))
5030 0 : broadcast_source = CONST0_RTX (mode);
5031 1 : else if (load->val == CONSTM1_RTX (load->mode))
5032 0 : broadcast_source = CONSTM1_RTX (mode);
5033 : else
5034 : {
5035 1 : int nunits = GET_MODE_NUNITS (mode);
5036 1 : rtvec v = rtvec_alloc (nunits);
5037 3 : for (int j = 0; j < nunits ; j++)
5038 2 : RTVEC_ELT (v, j) = load->val;
5039 1 : broadcast_source = gen_rtx_CONST_VECTOR (mode, v);
5040 : }
5041 :
5042 : /* NB: Zero CONST_VECTOR load works for MMX and XMM
5043 : registers. */
5044 32437 : if (load->size <= UNITS_PER_WORD)
5045 : {
5046 : /* Convert CONST_VECTOR load no larger than integer
5047 : register:
5048 :
5049 : (set (reg:V2SI 106)
5050 : (const_vector:V2SI [(const_int 1 [1]) repeated x2]))
5051 :
5052 : to constant integer load:
5053 :
5054 : (set (subreg:DI (reg:V2SI 106 [ _20 ]) 0)
5055 : (const_int 4294967297 [0x100000001]))
5056 : */
5057 31026 : machine_mode int_mode
5058 31026 : = int_mode_for_mode (mode).require ();
5059 31026 : load->dest_mode = int_mode;
5060 31026 : broadcast_source = simplify_subreg (int_mode,
5061 : broadcast_source,
5062 : mode, 0);
5063 31026 : gcc_assert (broadcast_source != nullptr);
5064 :
5065 31026 : bool keep_const_int_load = false;
5066 31026 : if (!bitmap_empty_p (call_bbs))
5067 : {
5068 27798 : bitmap_iterator bi;
5069 27798 : unsigned int id;
5070 36194 : EXECUTE_IF_SET_IN_BITMAP (load->bbs, 0, id, bi)
5071 28929 : if (bitmap_bit_p (call_bbs, id))
5072 : {
5073 : /* NB: Constant integer load is faster
5074 : than save and restore an integer
5075 : register when crossing a function call.
5076 : */
5077 : keep_const_int_load = true;
5078 : break;
5079 : }
5080 : }
5081 :
5082 27798 : if (keep_const_int_load)
5083 : {
5084 : /* Keep constant integer load. */
5085 20533 : replace_vector_const (mode, broadcast_source,
5086 20533 : load->insns, int_mode);
5087 20533 : load->broadcast_source = nullptr;
5088 20533 : load->broadcast_reg = nullptr;
5089 : }
5090 : else
5091 : {
5092 10493 : broadcast_reg = gen_reg_rtx (mode);
5093 10493 : reg = gen_reg_rtx (load->mode);
5094 10493 : replace_vector_const (mode, broadcast_reg,
5095 10493 : load->insns, load->mode);
5096 10493 : load->broadcast_source = broadcast_source;
5097 10493 : load->broadcast_reg = broadcast_reg;
5098 : }
5099 : break;
5100 : }
5101 : }
5102 : /* FALLTHRU */
5103 :
5104 32630 : case X86_CSE_CONST0_VECTOR:
5105 32630 : case X86_CSE_CONSTM1_VECTOR:
5106 32630 : mode = ix86_get_vector_cse_mode (load->size, load->mode);
5107 32630 : broadcast_reg = gen_reg_rtx (mode);
5108 32630 : if (load->def_insn)
5109 : {
5110 : /* Replace redundant vector loads with a single vector
5111 : load in the same basic block. */
5112 821 : reg = load->val;
5113 821 : if (load->mode != GET_MODE (reg))
5114 0 : reg = gen_rtx_SUBREG (load->mode, reg, 0);
5115 821 : broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
5116 : }
5117 : else
5118 : /* This is a constant integer/double vector. If the
5119 : inner scalar is 0 or -1, set vector to CONST0_RTX
5120 : or CONSTM1_RTX directly. */
5121 31809 : switch (load->kind)
5122 : {
5123 19899 : case X86_CSE_CONST0_VECTOR:
5124 19899 : broadcast_source = CONST0_RTX (mode);
5125 19899 : break;
5126 1578 : case X86_CSE_CONSTM1_VECTOR:
5127 1578 : broadcast_source = CONSTM1_RTX (mode);
5128 1578 : break;
5129 10332 : case X86_CSE_CONST_VECTOR:
5130 10332 : case X86_CSE_VEC_DUP:
5131 10332 : if (!broadcast_source)
5132 : {
5133 10332 : reg = gen_reg_rtx (load->mode);
5134 10332 : broadcast_source = gen_rtx_VEC_DUPLICATE (mode,
5135 : reg);
5136 : }
5137 : break;
5138 0 : default:
5139 0 : gcc_unreachable ();
5140 : }
5141 32630 : replace_vector_const (mode, broadcast_reg, load->insns,
5142 : load->mode);
5143 32630 : load->broadcast_source = broadcast_source;
5144 32630 : load->broadcast_reg = broadcast_reg;
5145 32630 : break;
5146 : }
5147 : }
5148 :
5149 981264 : if (replaced)
5150 : {
5151 41558 : auto_vec<rtx_insn *> control_flow_insns;
5152 :
5153 : /* (Re-)discover loops so that bb->loop_father can be used in the
5154 : analysis below. */
5155 41558 : calculate_dominance_info (CDI_DOMINATORS);
5156 41558 : loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
5157 :
5158 126131 : FOR_EACH_VEC_ELT (loads, i, load)
5159 84573 : if (load->count >= load->threshold)
5160 : {
5161 63969 : rtx set;
5162 63969 : if (load->def_insn)
5163 837 : switch (load->kind)
5164 : {
5165 16 : case X86_CSE_TLSDESC:
5166 16 : ix86_place_single_tls_call (load->broadcast_reg,
5167 : load->tlsdesc_val,
5168 : load->kind,
5169 16 : load->bbs,
5170 : updated_gnu_tls_insns,
5171 : updated_gnu2_tls_insns,
5172 16 : PATTERN (load->def_insn));
5173 16 : break;
5174 821 : case X86_CSE_VEC_DUP:
5175 : /* Insert a broadcast after the original scalar
5176 : definition. */
5177 821 : set = gen_rtx_SET (load->broadcast_reg,
5178 : load->broadcast_source);
5179 821 : insn = emit_insn_after (set, load->def_insn);
5180 :
5181 821 : if (cfun->can_throw_non_call_exceptions)
5182 : {
5183 : /* Handle REG_EH_REGION note in DEF_INSN. */
5184 4 : rtx note = find_reg_note (load->def_insn,
5185 : REG_EH_REGION, nullptr);
5186 4 : if (note)
5187 : {
5188 1 : control_flow_insns.safe_push (load->def_insn);
5189 1 : add_reg_note (insn, REG_EH_REGION,
5190 : XEXP (note, 0));
5191 : }
5192 : }
5193 :
5194 821 : if (dump_file)
5195 : {
5196 0 : fprintf (dump_file, "\nAdd:\n\n");
5197 0 : print_rtl_single (dump_file, insn);
5198 0 : fprintf (dump_file, "\nafter:\n\n");
5199 0 : print_rtl_single (dump_file, load->def_insn);
5200 0 : fprintf (dump_file, "\n");
5201 : }
5202 : break;
5203 0 : default:
5204 0 : gcc_unreachable ();
5205 : }
5206 : else
5207 63132 : switch (load->kind)
5208 : {
5209 297 : case X86_CSE_TLS_GD:
5210 297 : case X86_CSE_TLS_LD_BASE:
5211 297 : case X86_CSE_TLSDESC:
5212 297 : ix86_place_single_tls_call (load->broadcast_reg,
5213 : (load->kind == X86_CSE_TLSDESC
5214 : ? load->tlsdesc_val
5215 : : load->val),
5216 : load->kind,
5217 297 : load->bbs,
5218 : updated_gnu_tls_insns,
5219 : updated_gnu2_tls_insns);
5220 297 : break;
5221 41358 : case X86_CSE_CONST_VECTOR:
5222 41358 : case X86_CSE_VEC_DUP:
5223 : /* Keep redundant constant integer load. */
5224 41358 : if (!load->broadcast_reg)
5225 : break;
5226 : /* FALLTHRU */
5227 42302 : case X86_CSE_CONST0_VECTOR:
5228 42302 : case X86_CSE_CONSTM1_VECTOR:
5229 42302 : ix86_place_single_vector_set (load->broadcast_reg,
5230 : load->broadcast_source,
5231 : load->bbs,
5232 : load);
5233 42302 : break;
5234 : }
5235 : }
5236 :
5237 41558 : loop_optimizer_finalize ();
5238 :
5239 41558 : if (!control_flow_insns.is_empty ())
5240 : {
5241 1 : free_dominance_info (CDI_DOMINATORS);
5242 :
5243 3 : FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
5244 1 : if (control_flow_insn_p (insn))
5245 : {
5246 : /* Split the block after insn. There will be a fallthru
5247 : edge, which is OK so we keep it. We have to create
5248 : the exception edges ourselves. */
5249 1 : bb = BLOCK_FOR_INSN (insn);
5250 1 : split_block (bb, insn);
5251 1 : rtl_make_eh_edge (NULL, bb, BB_END (bb));
5252 : }
5253 : }
5254 :
5255 41558 : df_process_deferred_rescans ();
5256 41558 : }
5257 :
5258 1136123 : FOR_EACH_VEC_ELT (loads, i, load)
5259 309718 : delete load;
5260 :
5261 981264 : df_clear_flags (DF_DEFER_INSN_RESCAN);
5262 :
5263 981264 : timevar_pop (TV_MACH_DEP);
5264 981264 : return 0;
5265 981264 : }
5266 :
5267 : } // anon namespace
5268 :
5269 : rtl_opt_pass *
5270 288767 : make_pass_x86_cse (gcc::context *ctxt)
5271 : {
5272 288767 : return new pass_x86_cse (ctxt);
5273 : }
5274 :
5275 : /* Convert legacy instructions that clobber EFLAGS to APX_NF
5276 : instructions when there is no flag set between a flag
5277 : producer and user. */
/* The function works in two phases.  It first walks every basic block
   and records candidate insns together with their patterns in
   CONVERTING_MAP.  It then rewrites each recorded pattern with the
   FLAGS_REG clobber removed.  The return value is always 0.  */
5278 :
5279 : static unsigned int
5280 370 : ix86_apx_nf_convert (void)
5281 : {
5282 370 : timevar_push (TV_MACH_DEP);
5283 :
5284 370 : basic_block bb;
5285 370 : rtx_insn *insn;
/* CONVERTING_MAP maps a candidate insn to its pattern.
   CURRENT_CONVERT_LIST holds the candidates of the sequence being
   scanned so that they can be revoked from the map as a group.  */
5286 370 : hash_map <rtx_insn *, rtx> converting_map;
5287 370 : auto_vec <rtx_insn *> current_convert_list;
5288 :
5289 370 : bool converting_seq = false;
/* CC is the FLAGS_REG rtx used by all the overlap/set queries below.  */
5290 370 : rtx cc = gen_rtx_REG (CCmode, FLAGS_REG);
5291 :
5292 792 : FOR_EACH_BB_FN (bb, cfun)
5293 : {
5294 : /* Reset conversion for each bb. */
5295 422 : converting_seq = false;
5296 5071 : FOR_BB_INSNS (bb, insn)
5297 : {
5298 4649 : if (!NONDEBUG_INSN_P (insn))
5299 4988 : continue;
5300 :
/* Insns that are not recognized cannot be queried for attributes.  */
5301 3707 : if (recog_memoized (insn) < 0)
5302 336 : continue;
5303 :
5304 : /* Convert candidate insns after cstore, which should
5305 : satisfy the two conditions:
5306 : 1. Is not flag user or producer, only clobbers
5307 : FLAGS_REG.
5308 : 2. Have corresponding nf pattern. */
5309 :
5310 3371 : rtx pat = PATTERN (insn);
5311 :
5312 : /* Starting conversion at first cstorecc. */
/* That is: a single set whose source is a comparison that mentions
   FLAGS_REG and whose destination does not mention it.  The candidate
   list is emptied whenever a new sequence starts.  */
5313 3371 : rtx set = NULL_RTX;
5314 3371 : if (!converting_seq
5315 2790 : && (set = single_set (insn))
5316 2714 : && ix86_comparison_operator (SET_SRC (set), VOIDmode)
5317 126 : && reg_overlap_mentioned_p (cc, SET_SRC (set))
5318 3494 : && !reg_overlap_mentioned_p (cc, SET_DEST (set)))
5319 : {
5320 123 : converting_seq = true;
5321 123 : current_convert_list.truncate (0);
5322 : }
5323 : /* Terminate at the next explicit flag set. */
/* A mere CLOBBER of FLAGS_REG does not terminate the sequence.  */
5324 3248 : else if (reg_set_p (cc, pat)
5325 3248 : && GET_CODE (set_of (cc, pat)) != CLOBBER)
5326 : converting_seq = false;
5327 :
5328 3151 : if (!converting_seq)
5329 2768 : continue;
5330 :
/* Only PARALLEL patterns are recorded; the rewrite phase below reads
   the pattern with XVECLEN/XVECEXP.  */
5331 603 : if (get_attr_has_nf (insn)
5332 603 : && GET_CODE (pat) == PARALLEL)
5333 : {
5334 : /* Record the insn to candidate map. */
5335 72 : current_convert_list.safe_push (insn);
5336 72 : converting_map.put (insn, pat);
5337 : }
5338 : /* If the insn clobbers flags but has no nf_attr,
5339 : revoke all previous candidates. */
/* Only the map entries are removed here; the list itself is emptied
   when the next sequence starts.  */
5340 531 : else if (!get_attr_has_nf (insn)
5341 530 : && reg_set_p (cc, pat)
5342 534 : && GET_CODE (set_of (cc, pat)) == CLOBBER)
5343 : {
5344 3 : for (auto item : current_convert_list)
5345 0 : converting_map.remove (item);
5346 3 : converting_seq = false;
5347 : }
5348 : }
5349 : }
5350 :
/* Rewrite phase: rebuild the pattern of every surviving candidate
   without its FLAGS_REG clobber.  */
5351 370 : if (!converting_map.is_empty ())
5352 : {
5353 85 : for (auto iter = converting_map.begin ();
5354 170 : iter != converting_map.end (); ++iter)
5355 : {
5356 72 : rtx_insn *replace = (*iter).first;
5357 72 : rtx pat = (*iter).second;
5358 72 : int i, n = 0, len = XVECLEN (pat, 0);
5359 72 : rtx *new_elems = XALLOCAVEC (rtx, len);
5360 72 : rtx new_pat;
/* Keep every element of the PARALLEL except CLOBBERs whose operand
   overlaps FLAGS_REG; N counts the elements kept.  */
5361 216 : for (i = 0; i < len; i++)
5362 : {
5363 144 : rtx temp = XVECEXP (pat, 0, i);
5364 216 : if (! (GET_CODE (temp) == CLOBBER
5365 72 : && reg_overlap_mentioned_p (cc,
5366 72 : XEXP (temp, 0))))
5367 : {
5368 72 : new_elems[n] = temp;
5369 72 : n++;
5370 : }
5371 : }
5372 :
/* A single remaining element becomes the pattern itself; otherwise a
   new PARALLEL is built from the remaining elements.  */
5373 72 : if (n == 1)
5374 72 : new_pat = new_elems[0];
5375 : else
5376 0 : new_pat =
5377 0 : gen_rtx_PARALLEL (VOIDmode,
5378 : gen_rtvec_v (n,
5379 : new_elems));
5380 :
/* Install the new pattern, reset the cached insn code so the insn is
   recognized again, and rescan it for DF.
   NOTE(review): the result of recog_memoized is not checked here;
   presumably the nf pattern always exists when has_nf is set --
   confirm.  */
5381 72 : PATTERN (replace) = new_pat;
5382 72 : INSN_CODE (replace) = -1;
5383 72 : recog_memoized (replace);
5384 72 : df_insn_rescan (replace);
5385 : }
5386 : }
5387 :
5388 370 : timevar_pop (TV_MACH_DEP);
5389 370 : return 0;
5390 370 : }
5391 :
5392 :
5393 : namespace {
5394 :
5395 : const pass_data pass_data_apx_nf_convert =
5396 : {
5397 : RTL_PASS, /* type */
5398 : "apx_nfcvt", /* name */
5399 : OPTGROUP_NONE, /* optinfo_flags */
5400 : TV_MACH_DEP, /* tv_id */
5401 : 0, /* properties_required */
5402 : 0, /* properties_provided */
5403 : 0, /* properties_destroyed */
5404 : 0, /* todo_flags_start */
5405 : 0, /* todo_flags_finish */
5406 : };
5407 :
5408 : class pass_apx_nf_convert : public rtl_opt_pass
5409 : {
5410 : public:
5411 288767 : pass_apx_nf_convert (gcc::context *ctxt)
5412 577534 : : rtl_opt_pass (pass_data_apx_nf_convert, ctxt)
5413 : {}
5414 :
5415 : /* opt_pass methods: */
5416 1481491 : bool gate (function *) final override
5417 : {
5418 1481491 : return (TARGET_APX_NF
5419 462 : && optimize
5420 1481945 : && optimize_function_for_speed_p (cfun));
5421 : }
5422 :
5423 370 : unsigned int execute (function *) final override
5424 : {
5425 370 : return ix86_apx_nf_convert ();
5426 : }
5427 : }; // class pass_apx_nf_convert
5428 :
5429 : } // anon namespace
5430 :
5431 : rtl_opt_pass *
5432 288767 : make_pass_apx_nf_convert (gcc::context *ctxt)
5433 : {
5434 288767 : return new pass_apx_nf_convert (ctxt);
5435 : }
5436 :
5437 : /* When a hot loop can be fit into one cacheline,
5438 : force align the loop without considering the max skip. */
/* For every basic block that starts with a label and passes the
   hotness test below, the function walks the consecutive blocks of the
   same loop, summing minimum insn sizes.  If the walk finds an edge
   back to the starting block before the size exceeds
   ix86_cost->prefetch_block, a max_skip_align insn is emitted in front
   of the label.  */
5439 : static void
5440 980785 : ix86_align_loops ()
5441 : {
5442 980785 : basic_block bb;
5443 :
5444 : /* Don't do this when we don't know cache line size. */
5445 980785 : if (ix86_cost->prefetch_block == 0)
5446 9 : return;
5447 :
5448 980776 : loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
/* Blocks must be hotter than this fraction of the maximum count in
   the function to be considered in the fallthru/branch tests below.  */
5449 980776 : profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
5450 11408539 : FOR_EACH_BB_FN (bb, cfun)
5451 : {
5452 10427763 : rtx_insn *label = BB_HEAD (bb);
5453 10427763 : bool has_fallthru = 0;
5454 10427763 : edge e;
5455 10427763 : edge_iterator ei;
5456 :
/* Only blocks that begin with a label are candidates.  */
5457 10427763 : if (!LABEL_P (label))
5458 5311390 : continue;
5459 :
5460 5121185 : profile_count fallthru_count = profile_count::zero ();
5461 5121185 : profile_count branch_count = profile_count::zero ();
5462 :
/* Split the incoming execution count into the part arriving by
   fallthru and the part arriving by a branch.  */
5463 14880616 : FOR_EACH_EDGE (e, ei, bb->preds)
5464 : {
5465 9759431 : if (e->flags & EDGE_FALLTHRU)
5466 2490903 : has_fallthru = 1, fallthru_count += e->count ();
5467 : else
5468 7268528 : branch_count += e->count ();
5469 : }
5470 :
5471 5121185 : if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
5472 4812 : continue;
5473 :
/* Hotness test.  With a fallthru predecessor the block must not be a
   single-successor block leading to the exit block, must be optimized
   for speed, must be hotter than COUNT_THRESHOLD and must be entered
   by branch more than param_align_loop_iterations times as often as
   by fallthru.  Without a fallthru predecessor either the branch count
   exceeds the threshold, or the block is more than 10 times hotter
   than the previous block while that block runs at most half as often
   as the function entry.  */
5474 5116373 : if (bb->loop_father
5475 5116373 : && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
5476 6461192 : && (has_fallthru
5477 1344819 : ? (!(single_succ_p (bb)
5478 146997 : && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
5479 937108 : && optimize_bb_for_speed_p (bb)
5480 857035 : && branch_count + fallthru_count > count_threshold
5481 732431 : && (branch_count > fallthru_count * param_align_loop_iterations))
5482 : /* In case there's no fallthru for the loop.
5483 : Nops inserted won't be executed. */
5484 407711 : : (branch_count > count_threshold
5485 136494 : || (bb->count > bb->prev_bb->count * 10
5486 12745 : && (bb->prev_bb->count
5487 4582721 : <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
5488 : {
5489 546397 : rtx_insn* insn, *end_insn;
/* SIZE accumulates the minimum insn sizes seen so far; -1 marks an
   asm or a call.  PADDING_P is cleared as soon as the loop is found
   unsuitable.  TBB is the block currently being walked.  */
5490 546397 : HOST_WIDE_INT size = 0;
5491 546397 : bool padding_p = true;
5492 546397 : basic_block tbb = bb;
5493 546397 : unsigned cond_branch_num = 0;
5494 546397 : bool detect_tight_loop_p = false;
5495 :
/* Walk at most loop_father->num_nodes blocks, following next_bb.  */
5496 862184 : for (unsigned int i = 0; i != bb->loop_father->num_nodes;
5497 315787 : i++, tbb = tbb->next_bb)
5498 : {
5499 : /* Only handle continuous cfg layout. */
5500 862184 : if (bb->loop_father != tbb->loop_father)
5501 : {
5502 : padding_p = false;
5503 : break;
5504 : }
5505 :
5506 10298585 : FOR_BB_INSNS (tbb, insn)
5507 : {
5508 9635786 : if (!NONDEBUG_INSN_P (insn))
5509 5594458 : continue;
5510 4041328 : size += ix86_min_insn_size (insn);
5511 :
5512 : /* We don't know size of inline asm.
5513 : Don't align loop for call. */
5514 4041328 : if (asm_noperands (PATTERN (insn)) >= 0
5515 4041328 : || CALL_P (insn))
5516 : {
5517 : size = -1;
5518 : break;
5519 : }
5520 : }
5521 :
/* Give up when the size is unknown or the blocks walked so far no
   longer fit into prefetch_block bytes.  */
5522 821146 : if (size == -1 || size > ix86_cost->prefetch_block)
5523 : {
5524 : padding_p = false;
5525 : break;
5526 : }
5527 :
/* An edge from TBB back to BB closes the loop.  */
5528 1459687 : FOR_EACH_EDGE (e, ei, tbb->succs)
5529 : {
5530 : /* It could be part of the loop. */
5531 1007069 : if (e->dest == bb)
5532 : {
5533 : detect_tight_loop_p = true;
5534 : break;
5535 : }
5536 : }
5537 :
5538 637234 : if (detect_tight_loop_p)
5539 : break;
5540 :
5541 452618 : end_insn = BB_END (tbb);
5542 452618 : if (JUMP_P (end_insn))
5543 : {
5544 : /* For decoded icache:
5545 : 1. Up to two branches are allowed per Way.
5546 : 2. A non-conditional branch is the last micro-op in a Way.
5547 : */
/* So stop at a plain jump that is unconditional or ends a block with
   a single successor, and stop at the second other jump counted in
   COND_BRANCH_NUM.  */
5548 364524 : if (onlyjump_p (end_insn)
5549 364524 : && (any_uncondjump_p (end_insn)
5550 308814 : || single_succ_p (tbb)))
5551 : {
5552 : padding_p = false;
5553 : break;
5554 : }
5555 308814 : else if (++cond_branch_num >= 2)
5556 : {
5557 : padding_p = false;
5558 : break;
5559 : }
5560 : }
5561 :
5562 : }
5563 :
5564 546397 : if (padding_p && detect_tight_loop_p)
5565 : {
/* The operands are ceil_log2 (SIZE) and 0.  */
5566 369232 : emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
5567 : GEN_INT (0)), label);
5568 : /* End of function. */
5569 184616 : if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
5570 : break;
5571 : /* Skip bb which already fits into one cacheline. */
5572 : bb = tbb;
5573 : }
5574 : }
5575 : }
5576 :
5577 980776 : loop_optimizer_finalize ();
5578 980776 : free_dominance_info (CDI_DOMINATORS);
5579 : }
5580 :
5581 : namespace {
5582 :
5583 : const pass_data pass_data_align_tight_loops =
5584 : {
5585 : RTL_PASS, /* type */
5586 : "align_tight_loops", /* name */
5587 : OPTGROUP_NONE, /* optinfo_flags */
5588 : TV_MACH_DEP, /* tv_id */
5589 : 0, /* properties_required */
5590 : 0, /* properties_provided */
5591 : 0, /* properties_destroyed */
5592 : 0, /* todo_flags_start */
5593 : 0, /* todo_flags_finish */
5594 : };
5595 :
5596 : class pass_align_tight_loops : public rtl_opt_pass
5597 : {
5598 : public:
5599 288767 : pass_align_tight_loops (gcc::context *ctxt)
5600 577534 : : rtl_opt_pass (pass_data_align_tight_loops, ctxt)
5601 : {}
5602 :
5603 : /* opt_pass methods: */
5604 1481491 : bool gate (function *) final override
5605 : {
5606 1481491 : return TARGET_ALIGN_TIGHT_LOOPS
5607 1481005 : && optimize
5608 2527287 : && optimize_function_for_speed_p (cfun);
5609 : }
5610 :
5611 980785 : unsigned int execute (function *) final override
5612 : {
5613 980785 : timevar_push (TV_MACH_DEP);
5614 : #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
5615 980785 : ix86_align_loops ();
5616 : #endif
5617 980785 : timevar_pop (TV_MACH_DEP);
5618 980785 : return 0;
5619 : }
5620 : }; // class pass_align_tight_loops
5621 :
5622 : } // anon namespace
5623 :
5624 : rtl_opt_pass *
5625 288767 : make_pass_align_tight_loops (gcc::context *ctxt)
5626 : {
5627 288767 : return new pass_align_tight_loops (ctxt);
5628 : }
5629 :
5630 : /* This compares the priority of target features in function DECL1
5631 : and DECL2. It returns positive value if DECL1 is higher priority,
5632 : negative value if DECL2 is higher priority and 0 if they are the
5633 : same. */
5634 :
5635 : int
5636 5772 : ix86_compare_version_priority (tree decl1, tree decl2)
5637 : {
5638 5772 : unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
5639 5772 : unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
5640 :
5641 5772 : return (int)priority1 - (int)priority2;
5642 : }
5643 :
5644 : /* V1 and V2 point to function versions with different priorities
5645 : based on the target ISA. This function compares their priorities. */
5646 :
5647 : static int
5648 6860 : feature_compare (const void *v1, const void *v2)
5649 : {
5650 6860 : typedef struct _function_version_info
5651 : {
5652 : tree version_decl;
5653 : tree predicate_chain;
5654 : unsigned int dispatch_priority;
5655 : } function_version_info;
5656 :
5657 6860 : const function_version_info c1 = *(const function_version_info *)v1;
5658 6860 : const function_version_info c2 = *(const function_version_info *)v2;
5659 6860 : return (c2.dispatch_priority - c1.dispatch_priority);
5660 : }
5661 :
5662 : /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
5663 : to return a pointer to VERSION_DECL if the outcome of the expression
5664 : formed by PREDICATE_CHAIN is true. This function will be called during
5665 : version dispatch to decide which function version to execute. It returns
5666 : the basic block at the end, to which more conditions can be added. */
5667 :
5668 : static basic_block
5669 834 : add_condition_to_bb (tree function_decl, tree version_decl,
5670 : tree predicate_chain, basic_block new_bb)
5671 : {
5672 834 : gimple *return_stmt;
5673 834 : tree convert_expr, result_var;
5674 834 : gimple *convert_stmt;
5675 834 : gimple *call_cond_stmt;
5676 834 : gimple *if_else_stmt;
5677 :
5678 834 : basic_block bb1, bb2, bb3;
5679 834 : edge e12, e23;
5680 :
5681 834 : tree cond_var, and_expr_var = NULL_TREE;
5682 834 : gimple_seq gseq;
5683 :
5684 834 : tree predicate_decl, predicate_arg;
5685 :
5686 834 : push_cfun (DECL_STRUCT_FUNCTION (function_decl));
5687 :
5688 834 : gcc_assert (new_bb != NULL);
5689 834 : gseq = bb_seq (new_bb);
5690 :
5691 :
5692 834 : convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
5693 : build_fold_addr_expr (version_decl));
5694 834 : result_var = create_tmp_var (ptr_type_node);
5695 834 : convert_stmt = gimple_build_assign (result_var, convert_expr);
5696 834 : return_stmt = gimple_build_return (result_var);
5697 :
5698 834 : if (predicate_chain == NULL_TREE)
5699 : {
5700 200 : gimple_seq_add_stmt (&gseq, convert_stmt);
5701 200 : gimple_seq_add_stmt (&gseq, return_stmt);
5702 200 : set_bb_seq (new_bb, gseq);
5703 200 : gimple_set_bb (convert_stmt, new_bb);
5704 200 : gimple_set_bb (return_stmt, new_bb);
5705 200 : pop_cfun ();
5706 200 : return new_bb;
5707 : }
5708 :
5709 1307 : while (predicate_chain != NULL)
5710 : {
5711 673 : cond_var = create_tmp_var (integer_type_node);
5712 673 : predicate_decl = TREE_PURPOSE (predicate_chain);
5713 673 : predicate_arg = TREE_VALUE (predicate_chain);
5714 673 : call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
5715 673 : gimple_call_set_lhs (call_cond_stmt, cond_var);
5716 :
5717 673 : gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
5718 673 : gimple_set_bb (call_cond_stmt, new_bb);
5719 673 : gimple_seq_add_stmt (&gseq, call_cond_stmt);
5720 :
5721 673 : predicate_chain = TREE_CHAIN (predicate_chain);
5722 :
5723 673 : if (and_expr_var == NULL)
5724 : and_expr_var = cond_var;
5725 : else
5726 : {
5727 39 : gimple *assign_stmt;
5728 : /* Use MIN_EXPR to check if any integer is zero?.
5729 : and_expr_var = min_expr <cond_var, and_expr_var> */
5730 39 : assign_stmt = gimple_build_assign (and_expr_var,
5731 : build2 (MIN_EXPR, integer_type_node,
5732 : cond_var, and_expr_var));
5733 :
5734 39 : gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
5735 39 : gimple_set_bb (assign_stmt, new_bb);
5736 39 : gimple_seq_add_stmt (&gseq, assign_stmt);
5737 : }
5738 : }
5739 :
5740 634 : if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
5741 : integer_zero_node,
5742 : NULL_TREE, NULL_TREE);
5743 634 : gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
5744 634 : gimple_set_bb (if_else_stmt, new_bb);
5745 634 : gimple_seq_add_stmt (&gseq, if_else_stmt);
5746 :
5747 634 : gimple_seq_add_stmt (&gseq, convert_stmt);
5748 634 : gimple_seq_add_stmt (&gseq, return_stmt);
5749 634 : set_bb_seq (new_bb, gseq);
5750 :
5751 634 : bb1 = new_bb;
5752 634 : e12 = split_block (bb1, if_else_stmt);
5753 634 : bb2 = e12->dest;
5754 634 : e12->flags &= ~EDGE_FALLTHRU;
5755 634 : e12->flags |= EDGE_TRUE_VALUE;
5756 :
5757 634 : e23 = split_block (bb2, return_stmt);
5758 :
5759 634 : gimple_set_bb (convert_stmt, bb2);
5760 634 : gimple_set_bb (return_stmt, bb2);
5761 :
5762 634 : bb3 = e23->dest;
5763 634 : make_edge (bb1, bb3, EDGE_FALSE_VALUE);
5764 :
5765 634 : remove_edge (e23);
5766 634 : make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
5767 :
5768 634 : pop_cfun ();
5769 :
5770 634 : return bb3;
5771 : }
5772 :
5773 : /* This function generates the dispatch function for
5774 : multi-versioned functions. DISPATCH_DECL is the function which will
5775 : contain the dispatch logic. FNDECLS are the function choices for
5776 : dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
5777 : in DISPATCH_DECL in which the dispatch code is generated. */
5778 :
5779 : static int
5780 200 : dispatch_function_versions (tree dispatch_decl,
5781 : void *fndecls_p,
5782 : basic_block *empty_bb)
5783 : {
5784 200 : tree default_decl;
5785 200 : gimple *ifunc_cpu_init_stmt;
5786 200 : gimple_seq gseq;
5787 200 : int ix;
5788 200 : tree ele;
5789 200 : vec<tree> *fndecls;
5790 200 : unsigned int num_versions = 0;
5791 200 : unsigned int actual_versions = 0;
5792 200 : unsigned int i;
5793 :
5794 200 : struct _function_version_info
5795 : {
5796 : tree version_decl;
5797 : tree predicate_chain;
5798 : unsigned int dispatch_priority;
5799 : }*function_version_info;
5800 :
5801 200 : gcc_assert (dispatch_decl != NULL
5802 : && fndecls_p != NULL
5803 : && empty_bb != NULL);
5804 :
5805 : /*fndecls_p is actually a vector. */
5806 200 : fndecls = static_cast<vec<tree> *> (fndecls_p);
5807 :
5808 : /* At least one more version other than the default. */
5809 200 : num_versions = fndecls->length ();
5810 200 : gcc_assert (num_versions >= 2);
5811 :
5812 200 : function_version_info = (struct _function_version_info *)
5813 200 : XNEWVEC (struct _function_version_info, (num_versions - 1));
5814 :
5815 : /* The first version in the vector is the default decl. */
5816 200 : default_decl = (*fndecls)[0];
5817 :
5818 200 : push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
5819 :
5820 200 : gseq = bb_seq (*empty_bb);
5821 : /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
5822 : constructors, so explicity call __builtin_cpu_init here. */
5823 200 : ifunc_cpu_init_stmt
5824 200 : = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
5825 200 : gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
5826 200 : gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
5827 200 : set_bb_seq (*empty_bb, gseq);
5828 :
5829 200 : pop_cfun ();
5830 :
5831 :
5832 991 : for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
5833 : {
5834 791 : tree version_decl = ele;
5835 791 : tree predicate_chain = NULL_TREE;
5836 791 : unsigned int priority;
5837 : /* Get attribute string, parse it and find the right predicate decl.
5838 : The predicate function could be a lengthy combination of many
5839 : features, like arch-type and various isa-variants. */
5840 791 : priority = get_builtin_code_for_version (version_decl,
5841 : &predicate_chain);
5842 :
5843 791 : if (predicate_chain == NULL_TREE)
5844 157 : continue;
5845 :
5846 634 : function_version_info [actual_versions].version_decl = version_decl;
5847 634 : function_version_info [actual_versions].predicate_chain
5848 634 : = predicate_chain;
5849 634 : function_version_info [actual_versions].dispatch_priority = priority;
5850 634 : actual_versions++;
5851 : }
5852 :
5853 : /* Sort the versions according to descending order of dispatch priority. The
5854 : priority is based on the ISA. This is not a perfect solution. There
5855 : could still be ambiguity. If more than one function version is suitable
5856 : to execute, which one should be dispatched? In future, allow the user
5857 : to specify a dispatch priority next to the version. */
5858 200 : qsort (function_version_info, actual_versions,
5859 : sizeof (struct _function_version_info), feature_compare);
5860 :
5861 1034 : for (i = 0; i < actual_versions; ++i)
5862 634 : *empty_bb = add_condition_to_bb (dispatch_decl,
5863 : function_version_info[i].version_decl,
5864 634 : function_version_info[i].predicate_chain,
5865 : *empty_bb);
5866 :
5867 : /* dispatch default version at the end. */
5868 200 : *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
5869 : NULL, *empty_bb);
5870 :
5871 200 : free (function_version_info);
5872 200 : return 0;
5873 : }
5874 :
5875 : /* This function changes the assembler name for functions that are
5876 : versions. If DECL is a function version and has a "target"
5877 : attribute, it appends the attribute string to its assembler name. */
5878 :
5879 : static tree
5880 1113 : ix86_mangle_function_version_assembler_name (tree decl, tree id)
5881 : {
5882 1113 : tree version_attr;
5883 1113 : char *attr_str;
5884 :
5885 1113 : if (DECL_DECLARED_INLINE_P (decl)
5886 1162 : && lookup_attribute ("gnu_inline",
5887 49 : DECL_ATTRIBUTES (decl)))
5888 0 : error_at (DECL_SOURCE_LOCATION (decl),
5889 : "function versions cannot be marked as %<gnu_inline%>,"
5890 : " bodies have to be generated");
5891 :
5892 1113 : if (DECL_VIRTUAL_P (decl)
5893 2226 : || DECL_VINDEX (decl))
5894 0 : sorry ("virtual function multiversioning not supported");
5895 :
5896 1113 : version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
5897 :
5898 : /* target attribute string cannot be NULL. */
5899 1113 : gcc_assert (version_attr != NULL_TREE);
5900 :
5901 1113 : attr_str = sorted_attr_string (TREE_VALUE (version_attr));
5902 :
5903 : /* Allow assembler name to be modified if already set. */
5904 1113 : if (DECL_ASSEMBLER_NAME_SET_P (decl))
5905 1098 : SET_DECL_RTL (decl, NULL);
5906 :
5907 1113 : tree ret = clone_identifier (id, attr_str, true);
5908 :
5909 1113 : XDELETEVEC (attr_str);
5910 :
5911 1113 : return ret;
5912 : }
5913 :
5914 : tree
5915 485380750 : ix86_mangle_decl_assembler_name (tree decl, tree id)
5916 : {
5917 : /* For function version, add the target suffix to the assembler name. */
5918 485380750 : if (TREE_CODE (decl) == FUNCTION_DECL)
5919 : {
5920 451127040 : cgraph_node *node = cgraph_node::get (decl);
5921 : /* Mangle all versions when annotated with target_clones, but only
5922 : non-default versions when annotated with target attributes. */
5923 451127040 : if (DECL_FUNCTION_VERSIONED (decl)
5924 451127040 : && (node->is_target_clone
5925 1089 : || !is_function_default_version (node->decl)))
5926 1113 : id = ix86_mangle_function_version_assembler_name (decl, id);
5927 : /* Mangle the dispatched symbol but only in the case of target clones. */
5928 451125927 : else if (node && node->dispatcher_function && !node->is_target_clone)
5929 117 : id = clone_identifier (id, "ifunc");
5930 64075049 : else if (node && node->dispatcher_resolver_function)
5931 200 : id = clone_identifier (id, "resolver");
5932 : }
5933 : #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
5934 : id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
5935 : #endif
5936 :
5937 485380750 : return id;
5938 : }
5939 :
5940 : /* Make a dispatcher declaration for the multi-versioned function DECL.
5941 : Calls to DECL function will be replaced with calls to the dispatcher
5942 : by the front-end. Returns the decl of the dispatcher function. */
5943 :
5944 : tree
5945 326 : ix86_get_function_versions_dispatcher (void *decl)
5946 : {
5947 326 : tree fn = (tree) decl;
5948 326 : struct cgraph_node *node = NULL;
5949 326 : struct cgraph_node *default_node = NULL;
5950 326 : struct cgraph_function_version_info *node_v = NULL;
5951 :
5952 326 : tree dispatch_decl = NULL;
5953 :
5954 326 : struct cgraph_function_version_info *default_version_info = NULL;
5955 :
5956 652 : gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
5957 :
5958 326 : node = cgraph_node::get (fn);
5959 326 : gcc_assert (node != NULL);
5960 :
5961 326 : node_v = node->function_version ();
5962 326 : gcc_assert (node_v != NULL);
5963 :
5964 326 : if (node_v->dispatcher_resolver != NULL)
5965 : return node_v->dispatcher_resolver;
5966 :
5967 : /* The default node is always the beginning of the chain. */
5968 : default_version_info = node_v;
5969 674 : while (default_version_info->prev != NULL)
5970 : default_version_info = default_version_info->prev;
5971 212 : default_node = default_version_info->this_node;
5972 :
5973 : /* If there is no default node, just return NULL. */
5974 212 : if (!is_function_default_version (default_node->decl))
5975 : return NULL;
5976 :
5977 : #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
5978 203 : if (targetm.has_ifunc_p ())
5979 : {
5980 203 : struct cgraph_function_version_info *it_v = NULL;
5981 :
5982 : /* Right now, the dispatching is done via ifunc. */
5983 203 : dispatch_decl = make_dispatcher_decl (default_node->decl);
5984 :
5985 : /* Set the dispatcher for all the versions. */
5986 203 : it_v = default_version_info;
5987 1403 : while (it_v != NULL)
5988 : {
5989 997 : it_v->dispatcher_resolver = dispatch_decl;
5990 997 : it_v = it_v->next;
5991 : }
5992 : }
5993 : else
5994 : #endif
5995 : {
5996 0 : error_at (DECL_SOURCE_LOCATION (default_node->decl),
5997 : "multiversioning needs %<ifunc%> which is not supported "
5998 : "on this target");
5999 : }
6000 :
6001 : return dispatch_decl;
6002 : }
6003 :
6004 : /* Make the resolver function decl to dispatch the versions of
6005 : a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
6006 : ifunc alias that will point to the created resolver. Create an
6007 : empty basic block in the resolver and store the pointer in
6008 : EMPTY_BB. Return the decl of the resolver function. */
6009 :
6010 : static tree
6011 200 : make_resolver_func (const tree default_decl,
6012 : const tree ifunc_alias_decl,
6013 : basic_block *empty_bb)
6014 : {
6015 200 : tree decl, type, t;
6016 :
6017 : /* The resolver function should return a (void *). */
6018 200 : type = build_function_type_list (ptr_type_node, NULL_TREE);
6019 :
6020 200 : cgraph_node *node = cgraph_node::get (default_decl);
6021 200 : gcc_assert (node && node->function_version ());
6022 :
6023 200 : decl = build_fn_decl (IDENTIFIER_POINTER (DECL_NAME (default_decl)), type);
6024 :
6025 : /* Set the assembler name to prevent cgraph_node attempting to mangle. */
6026 200 : SET_DECL_ASSEMBLER_NAME (decl, DECL_ASSEMBLER_NAME (default_decl));
6027 :
6028 200 : cgraph_node *resolver_node = cgraph_node::get_create (decl);
6029 200 : resolver_node->dispatcher_resolver_function = true;
6030 :
6031 200 : if (node->is_target_clone)
6032 86 : resolver_node->is_target_clone = true;
6033 :
6034 200 : tree id = ix86_mangle_decl_assembler_name
6035 200 : (decl, node->function_version ()->assembler_name);
6036 200 : symtab->change_decl_assembler_name (decl, id);
6037 :
6038 200 : DECL_NAME (decl) = DECL_NAME (default_decl);
6039 200 : TREE_USED (decl) = 1;
6040 200 : DECL_ARTIFICIAL (decl) = 1;
6041 200 : DECL_IGNORED_P (decl) = 1;
6042 200 : TREE_PUBLIC (decl) = 0;
6043 200 : DECL_UNINLINABLE (decl) = 1;
6044 :
6045 : /* Resolver is not external, body is generated. */
6046 200 : DECL_EXTERNAL (decl) = 0;
6047 200 : DECL_EXTERNAL (ifunc_alias_decl) = 0;
6048 :
6049 200 : DECL_CONTEXT (decl) = NULL_TREE;
6050 200 : DECL_INITIAL (decl) = make_node (BLOCK);
6051 200 : DECL_STATIC_CONSTRUCTOR (decl) = 0;
6052 :
6053 200 : if (DECL_COMDAT_GROUP (default_decl)
6054 200 : || TREE_PUBLIC (default_decl))
6055 : {
6056 : /* In this case, each translation unit with a call to this
6057 : versioned function will put out a resolver. Ensure it
6058 : is comdat to keep just one copy. */
6059 176 : DECL_COMDAT (decl) = 1;
6060 176 : make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
6061 : }
6062 : else
6063 24 : TREE_PUBLIC (ifunc_alias_decl) = 0;
6064 :
6065 : /* Build result decl and add to function_decl. */
6066 200 : t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
6067 200 : DECL_CONTEXT (t) = decl;
6068 200 : DECL_ARTIFICIAL (t) = 1;
6069 200 : DECL_IGNORED_P (t) = 1;
6070 200 : DECL_RESULT (decl) = t;
6071 :
6072 200 : gimplify_function_tree (decl);
6073 200 : push_cfun (DECL_STRUCT_FUNCTION (decl));
6074 200 : *empty_bb = init_lowered_empty_function (decl, false,
6075 : profile_count::uninitialized ());
6076 :
6077 200 : cgraph_node::add_new_function (decl, true);
6078 200 : symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
6079 :
6080 200 : pop_cfun ();
6081 :
6082 200 : gcc_assert (ifunc_alias_decl != NULL);
6083 : /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
6084 200 : DECL_ATTRIBUTES (ifunc_alias_decl)
6085 200 : = make_attribute ("ifunc", IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)),
6086 200 : DECL_ATTRIBUTES (ifunc_alias_decl));
6087 :
6088 : /* Create the alias for dispatch to resolver here. */
6089 200 : cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
6090 200 : return decl;
6091 : }
6092 :
6093 : /* Generate the dispatching code body to dispatch multi-versioned function
6094 : DECL. The target hook is called to process the "target" attributes and
6095 : provide the code to dispatch the right function at run-time. NODE points
6096 : to the dispatcher decl whose body will be created. */
6097 :
6098 : tree
6099 200 : ix86_generate_version_dispatcher_body (void *node_p)
6100 : {
6101 200 : tree resolver_decl;
6102 200 : basic_block empty_bb;
6103 200 : tree default_ver_decl;
6104 200 : struct cgraph_node *versn;
6105 200 : struct cgraph_node *node;
6106 :
6107 200 : struct cgraph_function_version_info *node_version_info = NULL;
6108 200 : struct cgraph_function_version_info *versn_info = NULL;
6109 :
6110 200 : node = (cgraph_node *)node_p;
6111 :
6112 200 : node_version_info = node->function_version ();
6113 200 : gcc_assert (node->dispatcher_function
6114 : && node_version_info != NULL);
6115 :
6116 200 : if (node_version_info->dispatcher_resolver)
6117 : return node_version_info->dispatcher_resolver;
6118 :
6119 : /* The first version in the chain corresponds to the default version. */
6120 200 : default_ver_decl = node_version_info->next->this_node->decl;
6121 :
6122 : /* node is going to be an alias, so remove the finalized bit. */
6123 200 : node->definition = false;
6124 :
6125 200 : resolver_decl = make_resolver_func (default_ver_decl,
6126 : node->decl, &empty_bb);
6127 :
6128 200 : node_version_info->dispatcher_resolver = resolver_decl;
6129 :
6130 200 : push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
6131 :
6132 200 : auto_vec<tree, 2> fn_ver_vec;
6133 :
6134 1191 : for (versn_info = node_version_info->next; versn_info;
6135 991 : versn_info = versn_info->next)
6136 : {
6137 991 : versn = versn_info->this_node;
6138 : /* Check for virtual functions here again, as by this time it should
6139 : have been determined if this function needs a vtable index or
6140 : not. This happens for methods in derived classes that override
6141 : virtual methods in base classes but are not explicitly marked as
6142 : virtual. */
6143 991 : if (DECL_VIRTUAL_P (versn->decl))
6144 0 : sorry ("virtual function multiversioning not supported");
6145 :
6146 991 : fn_ver_vec.safe_push (versn->decl);
6147 : }
6148 :
6149 200 : dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
6150 200 : cgraph_edge::rebuild_edges ();
6151 200 : pop_cfun ();
6152 200 : return resolver_decl;
6153 200 : }
6154 :
6155 :
|