Line data Source code
1 : /* Copyright (C) 1988-2026 Free Software Foundation, Inc.
2 :
3 : This file is part of GCC.
4 :
5 : GCC is free software; you can redistribute it and/or modify
6 : it under the terms of the GNU General Public License as published by
7 : the Free Software Foundation; either version 3, or (at your option)
8 : any later version.
9 :
10 : GCC is distributed in the hope that it will be useful,
11 : but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : GNU General Public License for more details.
14 :
15 : You should have received a copy of the GNU General Public License
16 : along with GCC; see the file COPYING3. If not see
17 : <http://www.gnu.org/licenses/>. */
18 :
19 : #define IN_TARGET_CODE 1
20 :
21 : #include "config.h"
22 : #include "system.h"
23 : #include "coretypes.h"
24 : #include "backend.h"
25 : #include "rtl.h"
26 : #include "tree.h"
27 : #include "memmodel.h"
28 : #include "gimple.h"
29 : #include "cfghooks.h"
30 : #include "cfgloop.h"
31 : #include "df.h"
32 : #include "tm_p.h"
33 : #include "stringpool.h"
34 : #include "expmed.h"
35 : #include "optabs.h"
36 : #include "regs.h"
37 : #include "emit-rtl.h"
38 : #include "recog.h"
39 : #include "cgraph.h"
40 : #include "diagnostic.h"
41 : #include "cfgbuild.h"
42 : #include "alias.h"
43 : #include "fold-const.h"
44 : #include "attribs.h"
45 : #include "calls.h"
46 : #include "stor-layout.h"
47 : #include "varasm.h"
48 : #include "output.h"
49 : #include "insn-attr.h"
50 : #include "flags.h"
51 : #include "except.h"
52 : #include "explow.h"
53 : #include "expr.h"
54 : #include "cfgrtl.h"
55 : #include "common/common-target.h"
56 : #include "langhooks.h"
57 : #include "reload.h"
58 : #include "gimplify.h"
59 : #include "dwarf2.h"
60 : #include "tm-constrs.h"
61 : #include "cselib.h"
62 : #include "sched-int.h"
63 : #include "opts.h"
64 : #include "tree-pass.h"
65 : #include "context.h"
66 : #include "pass_manager.h"
67 : #include "target-globals.h"
68 : #include "gimple-iterator.h"
69 : #include "shrink-wrap.h"
70 : #include "builtins.h"
71 : #include "rtl-iter.h"
72 : #include "tree-iterator.h"
73 : #include "dbgcnt.h"
74 : #include "case-cfn-macros.h"
75 : #include "dojump.h"
76 : #include "fold-const-call.h"
77 : #include "tree-vrp.h"
78 : #include "tree-ssanames.h"
79 : #include "selftest.h"
80 : #include "selftest-rtl.h"
81 : #include "print-rtl.h"
82 : #include "intl.h"
83 : #include "ifcvt.h"
84 : #include "symbol-summary.h"
85 : #include "sreal.h"
86 : #include "ipa-cp.h"
87 : #include "ipa-prop.h"
88 : #include "ipa-fnsummary.h"
89 : #include "wide-int-bitmask.h"
90 : #include "tree-vector-builder.h"
91 : #include "debug.h"
92 : #include "dwarf2out.h"
93 : #include "i386-builtins.h"
94 : #include "i386-features.h"
95 : #include "i386-expand.h"
96 :
/* Base names of the ms2sysv save/restore stubs.  get_stub_name combines a
   base name with an ISA prefix (sse/avx) and a register count to form the
   full stub symbol, e.g. "__sse_savms64_12".  */
const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
  "savms64",
  "resms64",
  "resms64x",
  "savms64f",
  "resms64f",
  "resms64fx"
};
105 :
/* Order in which registers are assigned stub-managed save slots: the ten
   XMM registers first (16-byte slots), then the GPRs (8-byte slots).  */
const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
/* The below offset values are where each register is stored for the layout
   relative to incoming stack pointer.  The value of each m_regs[].offset will
   be relative to the incoming base pointer (rax or rsi) used by the stub.

    s_instances:	0		1		2		3
    Offset:				realigned or	aligned + 8
    Register		aligned		aligned + 8	aligned w/HFP	w/HFP */
  XMM15_REG,		/* 0x10		0x18		0x10		0x18 */
  XMM14_REG,		/* 0x20		0x28		0x20		0x28 */
  XMM13_REG,		/* 0x30		0x38		0x30		0x38 */
  XMM12_REG,		/* 0x40		0x48		0x40		0x48 */
  XMM11_REG,		/* 0x50		0x58		0x50		0x58 */
  XMM10_REG,		/* 0x60		0x68		0x60		0x68 */
  XMM9_REG,		/* 0x70		0x78		0x70		0x78 */
  XMM8_REG,		/* 0x80		0x88		0x80		0x88 */
  XMM7_REG,		/* 0x90		0x98		0x90		0x98 */
  XMM6_REG,		/* 0xa0		0xa8		0xa0		0xa8 */
  SI_REG,		/* 0xa8		0xb0		0xa8		0xb0 */
  DI_REG,		/* 0xb0		0xb8		0xb0		0xb8 */
  BX_REG,		/* 0xb8		0xc0		0xb8		0xc0 */
  BP_REG,		/* 0xc0		0xc8		N/A		N/A  */
  R12_REG,		/* 0xc8		0xd0		0xc0		0xc8 */
  R13_REG,		/* 0xd0		0xd8		0xc8		0xd0 */
  R14_REG,		/* 0xd8		0xe0		0xd0		0xd8 */
  R15_REG,		/* 0xe0		0xe8		0xd8		0xe0 */
};
133 :
/* Instantiate static const values.  Out-of-line definitions for the
   in-class initialized static const members, required when they are
   odr-used (pre-C++17 rules).  */
const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
const unsigned xlogue_layout::MIN_REGS;
const unsigned xlogue_layout::MAX_REGS;
const unsigned xlogue_layout::MAX_EXTRA_REGS;
const unsigned xlogue_layout::VARIANT_COUNT;
const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
141 :
/* Initialize xlogue_layout::s_stub_names to zero.  Indexed by
   [AVX-in-use][stub kind][variant]; each entry is a lazily built stub
   name (an empty string means "not built yet").  */
char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
				[STUB_NAME_MAX_LEN];
145 :
/* Instantiates all xlogue_layout instances: the four layouts cover
   {aligned, aligned + 8} x {without, with} a hard frame pointer.  */
const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
  xlogue_layout (0, false),
  xlogue_layout (8, false),
  xlogue_layout (0, true),
  xlogue_layout (8, true)
};
153 :
154 : /* Return an appropriate const instance of xlogue_layout based upon values
155 : in cfun->machine and crtl. */
156 : const class xlogue_layout &
157 49891 : xlogue_layout::get_instance ()
158 : {
159 49891 : enum xlogue_stub_sets stub_set;
160 49891 : bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
161 :
162 49891 : if (stack_realign_fp)
163 : stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
164 40910 : else if (frame_pointer_needed)
165 25246 : stub_set = aligned_plus_8
166 31552 : ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
167 : : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
168 : else
169 9358 : stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
170 :
171 49891 : return s_instances[stub_set];
172 : }
173 :
174 : /* Determine how many clobbered registers can be saved by the stub.
175 : Returns the count of registers the stub will save and restore. */
176 : unsigned
177 35225 : xlogue_layout::count_stub_managed_regs ()
178 : {
179 35225 : bool hfp = frame_pointer_needed || stack_realign_fp;
180 35225 : unsigned i, count;
181 35225 : unsigned regno;
182 :
183 94890 : for (count = i = MIN_REGS; i < MAX_REGS; ++i)
184 : {
185 93670 : regno = REG_ORDER[i];
186 93670 : if (regno == BP_REG && hfp)
187 18200 : continue;
188 75470 : if (!ix86_save_reg (regno, false, false))
189 : break;
190 41465 : ++count;
191 : }
192 35225 : return count;
193 : }
194 :
195 : /* Determine if register REGNO is a stub managed register given the
196 : total COUNT of stub managed registers. */
197 : bool
198 2650688 : xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
199 : {
200 2650688 : bool hfp = frame_pointer_needed || stack_realign_fp;
201 2650688 : unsigned i;
202 :
203 34587805 : for (i = 0; i < count; ++i)
204 : {
205 32436986 : gcc_assert (i < MAX_REGS);
206 32436986 : if (REG_ORDER[i] == BP_REG && hfp)
207 522627 : ++count;
208 31914359 : else if (REG_ORDER[i] == regno)
209 : return true;
210 : }
211 : return false;
212 : }
213 :
/* Constructor for xlogue_layout.  STACK_ALIGN_OFF_IN is the incoming
   stack misalignment (0 or 8 bytes) and HFP whether a hard frame pointer
   is in use; with an HFP, BP_REG gets no stub slot so only 17 registers
   are laid out.  */
xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
  : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
    m_stack_align_off_in (stack_align_off_in)
{
  HOST_WIDE_INT offset = stack_align_off_in;
  unsigned i, j;

  /* Assign each stub-managed register its slot offset: 16 bytes per XMM
     register, 8 per GPR, in REG_ORDER.  */
  for (i = j = 0; i < MAX_REGS; ++i)
    {
      unsigned regno = REG_ORDER[i];

      /* With a hard frame pointer BP is saved by the normal prologue,
	 not by the stub; skip its slot.  */
      if (regno == BP_REG && hfp)
	continue;
      if (SSE_REGNO_P (regno))
	{
	  offset += 16;
	  /* Verify that SSE regs are always aligned.  */
	  gcc_assert (!((stack_align_off_in + offset) & 15));
	}
      else
	offset += 8;

      /* Stored offsets are relative to the stub's base pointer.  */
      m_regs[j].regno = regno;
      m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
    }
  gcc_assert (j == m_nregs);
}
242 :
243 : const char *
244 14666 : xlogue_layout::get_stub_name (enum xlogue_stub stub,
245 : unsigned n_extra_regs)
246 : {
247 14666 : const int have_avx = TARGET_AVX;
248 14666 : char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
249 :
250 : /* Lazy init */
251 14666 : if (!*name)
252 : {
253 362 : int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
254 : (have_avx ? "avx" : "sse"),
255 181 : STUB_BASE_NAMES[stub],
256 : MIN_REGS + n_extra_regs);
257 181 : gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
258 : }
259 :
260 14666 : return name;
261 : }
262 :
263 : /* Return rtx of a symbol ref for the entry point (based upon
264 : cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
265 : rtx
266 14666 : xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
267 : {
268 14666 : const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
269 14666 : gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
270 14666 : gcc_assert (stub < XLOGUE_STUB_COUNT);
271 14666 : gcc_assert (crtl->stack_realign_finalized);
272 :
273 14666 : return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
274 : }
275 :
276 : unsigned scalar_chain::max_id = 0;
277 :
278 : namespace {
279 :
280 : /* Initialize new chain. */
281 :
282 6377907 : scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
283 : {
284 6377907 : smode = smode_;
285 6377907 : vmode = vmode_;
286 :
287 6377907 : chain_id = ++max_id;
288 :
289 6377907 : if (dump_file)
290 136 : fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
291 :
292 6377907 : bitmap_obstack_initialize (NULL);
293 6377907 : insns = BITMAP_ALLOC (NULL);
294 6377907 : defs = BITMAP_ALLOC (NULL);
295 6377907 : defs_conv = BITMAP_ALLOC (NULL);
296 6377907 : insns_conv = BITMAP_ALLOC (NULL);
297 6377907 : queue = NULL;
298 :
299 6377907 : cost_sse_integer = 0;
300 6377907 : weighted_cost_sse_integer = 0 ;
301 6377907 : max_visits = x86_stv_max_visits;
302 6377907 : }
303 :
304 : /* Free chain's data. */
305 :
306 6377907 : scalar_chain::~scalar_chain ()
307 : {
308 6377907 : BITMAP_FREE (insns);
309 6377907 : BITMAP_FREE (defs);
310 6377907 : BITMAP_FREE (defs_conv);
311 6377907 : BITMAP_FREE (insns_conv);
312 6377907 : bitmap_obstack_release (NULL);
313 6377907 : }
314 :
315 : /* Add instruction into chains' queue. */
316 :
317 : void
318 8280191 : scalar_chain::add_to_queue (unsigned insn_uid)
319 : {
320 8280191 : if (!bitmap_set_bit (queue, insn_uid))
321 : return;
322 :
323 6246698 : if (dump_file)
324 141 : fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
325 : insn_uid, chain_id);
326 : }
327 :
/* For DImode conversion, mark register defined by DEF as requiring
   conversion.  Accumulates the estimated cost of the GPR<->SSE moves
   needed at the chain boundary into cost_sse_integer and (for
   speed-optimized blocks) weighted_cost_sse_integer.  */

void
scalar_chain::mark_dual_mode_def (df_ref def)
{
  gcc_assert (DF_REF_REG_DEF_P (def));

  /* Record the def/insn pair so we can later efficiently iterate over
     the defs to convert on insns not in the chain.  */
  bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
  basic_block bb = BLOCK_FOR_INSN (DF_REF_INSN (def));
  profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
  bool speed_p = optimize_bb_for_speed_p (bb);
  int cost = 0;

  if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
    {
      /* Defining insn is outside the chain: costs an int->SSE move.
	 Bail out if both the insn and the reg were already recorded.  */
      if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
	  && !reg_new)
	return;

      /* Cost integer to sse moves.  */
      if (speed_p)
	cost = COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2;
      else if (TARGET_64BIT || smode == SImode)
	cost = COSTS_N_BYTES (4);
      /* vmovd (4 bytes) + vpinsrd (6 bytes).  */
      else if (TARGET_SSE4_1)
	cost = COSTS_N_BYTES (10);
      /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes).  */
      else
	cost = COSTS_N_BYTES (12);
    }
  else
    {
      /* Defining insn is inside the chain: only a new reg adds cost.  */
      if (!reg_new)
	return;

      /* Cost sse to integer moves.  */
      if (speed_p)
	cost = COSTS_N_INSNS (ix86_cost->sse_to_integer) / 2;
      else if (TARGET_64BIT || smode == SImode)
	cost = COSTS_N_BYTES (4);
      /* vmovd (4 bytes) + vpextrd (6 bytes).  */
      else if (TARGET_SSE4_1)
	cost = COSTS_N_BYTES (10);
      /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes).  */
      else
	cost = COSTS_N_BYTES (13);
    }

  /* Weight the cost by the block's execution frequency relative to the
     function entry when optimizing this block for speed.  */
  if (speed_p)
    weighted_cost_sse_integer += bb->count.to_sreal_scale (entry_count) * cost;

  cost_sse_integer += cost;

  if (dump_file)
    fprintf (dump_file,
	     " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
	     DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
}
390 :
/* Check REF's chain to add new insns into a queue
   and find registers requiring conversion.  Return true if OK, false
   if the analysis was aborted (visit budget exhausted or a previously
   aborted chain was reached).  */

bool
scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref,
				      bitmap disallowed)
{
  df_link *chain;
  bool mark_def = false;

  /* REF must belong to an insn already in this chain.  */
  gcc_checking_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)));

  for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
    {
      unsigned uid = DF_REF_INSN_UID (chain->ref);

      if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
	continue;

      /* Bound the total work across the whole chain discovery.  */
      if (--max_visits == 0)
	return false;

      if (!DF_REF_REG_MEM_P (chain->ref))
	{
	  if (bitmap_bit_p (insns, uid))
	    continue;

	  if (bitmap_bit_p (candidates, uid))
	    {
	      add_to_queue (uid);
	      continue;
	    }

	  /* If we run into parts of an aborted chain discovery abort.  */
	  if (bitmap_bit_p (disallowed, uid))
	    return false;
	}

      /* The related insn is not convertible: record the dual-mode
	 boundary (def side immediately, use side after the loop).  */
      if (DF_REF_REG_DEF_P (chain->ref))
	{
	  if (dump_file)
	    fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  mark_dual_mode_def (chain->ref);
	}
      else
	{
	  if (dump_file)
	    fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  mark_def = true;
	}
    }

  if (mark_def)
    mark_dual_mode_def (ref);

  return true;
}
451 :
452 : /* Check whether X is a convertible *concatditi_? variant. X is known
453 : to be any_or_plus:TI, i.e. PLUS:TI, IOR:TI or XOR:TI. */
454 :
455 : static bool
456 26876 : timode_concatdi_p (rtx x)
457 : {
458 26876 : rtx op0 = XEXP (x, 0);
459 26876 : rtx op1 = XEXP (x, 1);
460 :
461 26876 : if (GET_CODE (op1) == ASHIFT)
462 957 : std::swap (op0, op1);
463 :
464 26876 : return GET_CODE (op0) == ASHIFT
465 18050 : && GET_CODE (XEXP (op0, 0)) == ZERO_EXTEND
466 18050 : && GET_MODE (XEXP (XEXP (op0, 0), 0)) == DImode
467 18050 : && REG_P (XEXP (XEXP (op0, 0), 0))
468 17915 : && CONST_INT_P (XEXP (op0, 1))
469 17915 : && INTVAL (XEXP (op0, 1)) == 64
470 17915 : && GET_CODE (op1) == ZERO_EXTEND
471 16958 : && GET_MODE (XEXP (op1, 0)) == DImode
472 43834 : && REG_P (XEXP (op1, 0));
473 : }
474 :
475 :
/* Add instruction into a chain.  Return true if OK, false if the search
   was aborted.  */

bool
scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid,
			bitmap disallowed)
{
  /* Already a member of this chain: nothing to do.  */
  if (!bitmap_set_bit (insns, insn_uid))
    return true;

  if (dump_file)
    fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);

  /* Record pseudo registers defined by the insn as chain defs.  */
  rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
  rtx def_set = single_set (insn);
  if (def_set && REG_P (SET_DEST (def_set))
      && !HARD_REGISTER_P (SET_DEST (def_set)))
    bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));

  /* ??? The following is quadratic since analyze_register_chain
     iterates over all refs to look for dual-mode regs.  Instead this
     should be done separately for all regs mentioned in the chain once.  */
  df_ref ref;
  for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!HARD_REGISTER_P (DF_REF_REG (ref)))
      if (!analyze_register_chain (candidates, ref, disallowed))
	return false;

  /* The operand(s) of VEC_SELECT, ZERO_EXTEND and similar ops don't need
     to be converted/convertible.  */
  if (def_set)
    switch (GET_CODE (SET_SRC (def_set)))
      {
      case VEC_SELECT:
	return true;
      case ZERO_EXTEND:
	if (GET_MODE (XEXP (SET_SRC (def_set), 0)) == DImode)
	  return true;
	break;
      case PLUS:
      case IOR:
      case XOR:
	/* DI->TI concatenations keep their DImode inputs unconverted.  */
	if (smode == TImode && timode_concatdi_p (SET_SRC (def_set)))
	  return true;
	break;
      default:
	break;
      }

  /* Follow the use chains of non-memory operands.  */
  for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!DF_REF_REG_MEM_P (ref))
      if (!analyze_register_chain (candidates, ref, disallowed))
	return false;

  return true;
}
532 :
/* Build new chain starting from insn INSN_UID recursively
   adding all dependent uses and definitions.  Return true if OK, false
   if the chain discovery was aborted.  Uses a worklist: add_insn may
   queue further candidate insns via analyze_register_chain.  */

bool
scalar_chain::build (bitmap candidates, unsigned insn_uid, bitmap disallowed)
{
  queue = BITMAP_ALLOC (NULL);
  bitmap_set_bit (queue, insn_uid);

  if (dump_file)
    fprintf (dump_file, "Building chain #%d...\n", chain_id);

  while (!bitmap_empty_p (queue))
    {
      insn_uid = bitmap_first_set_bit (queue);
      bitmap_clear_bit (queue, insn_uid);
      /* An insn joins at most one chain; remove it from the candidates.  */
      bitmap_clear_bit (candidates, insn_uid);
      if (!add_insn (candidates, insn_uid, disallowed))
	{
	  /* If we aborted the search put sofar found insn on the set of
	     disallowed insns so that further searches reaching them also
	     abort and thus we abort the whole but yet undiscovered chain.  */
	  bitmap_ior_into (disallowed, insns);
	  if (dump_file)
	    fprintf (dump_file, "Aborted chain #%d discovery\n", chain_id);
	  BITMAP_FREE (queue);
	  return false;
	}
    }

  if (dump_file)
    {
      fprintf (dump_file, "Collected chain #%d...\n", chain_id);
      fprintf (dump_file, " insns: ");
      dump_bitmap (dump_file, insns);
      if (!bitmap_empty_p (defs_conv))
	{
	  bitmap_iterator bi;
	  unsigned id;
	  const char *comma = "";
	  fprintf (dump_file, " defs to convert: ");
	  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
	    {
	      fprintf (dump_file, "%sr%d", comma, id);
	      comma = ", ";
	    }
	  fprintf (dump_file, "\n");
	}
    }

  BITMAP_FREE (queue);

  return true;
}
588 :
/* Return a cost of building a vector constant
   instead of using a scalar one.  EXP is the scalar CONST_INT and BB the
   block using it (for the size-vs-speed decision).  */

int
general_scalar_chain::vector_const_cost (rtx exp, basic_block bb)
{
  gcc_assert (CONST_INT_P (exp));

  /* All-zeros/all-ones style constants are materialized by one SSE op.  */
  if (standard_sse_constant_p (exp, vmode))
    return ix86_cost->sse_op;
  if (optimize_bb_for_size_p (bb))
    return COSTS_N_BYTES (8);
  /* We have separate costs for SImode and DImode, use SImode costs
     for smaller modes.  */
  return COSTS_N_INSNS (ix86_cost->sse_load[smode == DImode ? 1 : 0]) / 2;
}
605 :
/* Return true if it's cost profitable for chain conversion.  Sums a
   per-insn gain estimate (positive = vector form cheaper) over the chain
   and compares it against the boundary-move costs accumulated by
   mark_dual_mode_def; profile-weighted values are preferred, with the
   raw sums as tie-breaker.  */

bool
general_scalar_chain::compute_convert_gain ()
{
  bitmap_iterator bi;
  unsigned insn_uid;
  int gain = 0;
  sreal weighted_gain = 0;

  if (dump_file)
    fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);

  /* SSE costs distinguish between SImode and DImode loads/stores, for
     int costs factor in the number of GPRs involved.  When supporting
     smaller modes than SImode the int load/store costs need to be
     adjusted as well.  */
  unsigned sse_cost_idx = smode == DImode ? 1 : 0;
  /* M = number of GPRs a scalar op needs (2 for DImode on 32-bit).  */
  int m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;

  EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
    {
      rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
      rtx def_set = single_set (insn);
      rtx src = SET_SRC (def_set);
      rtx dst = SET_DEST (def_set);
      basic_block bb = BLOCK_FOR_INSN (insn);
      int igain = 0;
      profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
      bool speed_p = optimize_bb_for_speed_p (bb);
      sreal bb_freq = bb->count.to_sreal_scale (entry_count);

      if (REG_P (src) && REG_P (dst))
	{
	  if (!speed_p)
	    /* reg-reg move is 2 bytes, while SSE 3.  */
	    igain += COSTS_N_BYTES (2 * m - 3);
	  else
	    /* Move costs are normalized to reg-reg move having cost 2.  */
	    igain += COSTS_N_INSNS (2 * m - ix86_cost->xmm_move) / 2;
	}
      else if (REG_P (src) && MEM_P (dst))
	{
	  if (!speed_p)
	    /* Integer load/store is 3+ bytes and SSE 4+.  */
	    igain += COSTS_N_BYTES (3 * m - 4);
	  else
	    igain
	      += COSTS_N_INSNS (m * ix86_cost->int_store[2]
				- ix86_cost->sse_store[sse_cost_idx]) / 2;
	}
      else if (MEM_P (src) && REG_P (dst))
	{
	  if (!speed_p)
	    igain += COSTS_N_BYTES (3 * m - 4);
	  else
	    igain += COSTS_N_INSNS (m * ix86_cost->int_load[2]
				    - ix86_cost->sse_load[sse_cost_idx]) / 2;
	}
      else
	{
	  /* For operations on memory operands, include the overhead
	     of explicit load and store instructions.  */
	  if (MEM_P (dst))
	    {
	      if (!speed_p)
		/* ??? This probably should account size difference
		   of SSE and integer load rather than full SSE load.  */
		igain -= COSTS_N_BYTES (8);
	      else
		{
		  int cost = (m * (ix86_cost->int_load[2]
				   + ix86_cost->int_store[2])
			      - (ix86_cost->sse_load[sse_cost_idx] +
				 ix86_cost->sse_store[sse_cost_idx]));
		  igain += COSTS_N_INSNS (cost) / 2;
		}
	    }

	  /* Per-operation gain, by the RTL code of the source.  */
	  switch (GET_CODE (src))
	    {
	    case ASHIFT:
	    case ASHIFTRT:
	    case LSHIFTRT:
	      if (m == 2)
		{
		  if (INTVAL (XEXP (src, 1)) >= 32)
		    igain += ix86_cost->add;
		  /* Gain for extend highpart case.  */
		  else if (GET_CODE (XEXP (src, 0)) == ASHIFT)
		    igain += ix86_cost->shift_const - ix86_cost->sse_op;
		  else
		    igain += ix86_cost->shift_const;
		}

	      igain += ix86_cost->shift_const - ix86_cost->sse_op;

	      if (CONST_INT_P (XEXP (src, 0)))
		igain -= vector_const_cost (XEXP (src, 0), bb);
	      break;

	    case ROTATE:
	    case ROTATERT:
	      igain += m * ix86_cost->shift_const;
	      if (TARGET_AVX512VL)
		igain -= ix86_cost->sse_op;
	      else if (smode == DImode)
		{
		  /* Cost of emulating the rotate depends on how the
		     rotate count decomposes into shuffles/shifts.  */
		  int bits = INTVAL (XEXP (src, 1));
		  if ((bits & 0x0f) == 0)
		    igain -= ix86_cost->sse_op;
		  else if ((bits & 0x07) == 0)
		    igain -= 2 * ix86_cost->sse_op;
		  else
		    igain -= 3 * ix86_cost->sse_op;
		}
	      else if (INTVAL (XEXP (src, 1)) == 16)
		igain -= ix86_cost->sse_op;
	      else
		igain -= 2 * ix86_cost->sse_op;
	      break;

	    case AND:
	    case IOR:
	    case XOR:
	    case PLUS:
	    case MINUS:
	      igain += m * ix86_cost->add - ix86_cost->sse_op;
	      /* Additional gain for andnot for targets without BMI.  */
	      if (GET_CODE (XEXP (src, 0)) == NOT
		  && !TARGET_BMI)
		igain += m * ix86_cost->add;

	      if (CONST_INT_P (XEXP (src, 0)))
		igain -= vector_const_cost (XEXP (src, 0), bb);
	      if (CONST_INT_P (XEXP (src, 1)))
		igain -= vector_const_cost (XEXP (src, 1), bb);
	      if (MEM_P (XEXP (src, 1)))
		{
		  if (!speed_p)
		    igain -= COSTS_N_BYTES (m == 2 ? 3 : 5);
		  else
		    igain += COSTS_N_INSNS
			       (m * ix86_cost->int_load[2]
				- ix86_cost->sse_load[sse_cost_idx]) / 2;
		}
	      break;

	    case NEG:
	    case NOT:
	      igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);

	      if (GET_CODE (XEXP (src, 0)) != ABS)
		{
		  igain += m * ix86_cost->add;
		  break;
		}
	      /* FALLTHRU */

	    case ABS:
	    case SMAX:
	    case SMIN:
	    case UMAX:
	    case UMIN:
	      /* We do not have any conditional move cost, estimate it as a
		 reg-reg move.  Comparisons are costed as adds.  */
	      igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
	      /* Integer SSE ops are all costed the same.  */
	      igain -= ix86_cost->sse_op;
	      break;

	    case COMPARE:
	      if (XEXP (src, 1) != const0_rtx)
		{
		  /* cmp vs. pxor;pshufd;ptest.  */
		  igain += COSTS_N_INSNS (m - 3);
		}
	      else if (GET_CODE (XEXP (src, 0)) != AND)
		{
		  /* test vs. pshufd;ptest.  */
		  igain += COSTS_N_INSNS (m - 2);
		}
	      else if (GET_CODE (XEXP (XEXP (src, 0), 0)) != NOT)
		{
		  /* and;test vs. pshufd;ptest.  */
		  igain += COSTS_N_INSNS (2 * m - 2);
		}
	      else if (TARGET_BMI)
		{
		  /* andn;test vs. pandn;pshufd;ptest.  */
		  igain += COSTS_N_INSNS (2 * m - 3);
		}
	      else
		{
		  /* not;and;test vs. pandn;pshufd;ptest.  */
		  igain += COSTS_N_INSNS (3 * m - 3);
		}
	      break;

	    case CONST_INT:
	      if (REG_P (dst))
		{
		  if (!speed_p)
		    {
		      /* xor (2 bytes) vs. xorps (3 bytes).  */
		      if (src == const0_rtx)
			igain -= COSTS_N_BYTES (1);
		      /* movdi_internal vs. movv2di_internal.  */
		      /* => mov (5 bytes) vs. movaps (7 bytes).  */
		      else if (x86_64_immediate_operand (src, SImode))
			igain -= COSTS_N_BYTES (2);
		      else
			/* ??? Larger immediate constants are placed in the
			   constant pool, where the size benefit/impact of
			   STV conversion is affected by whether and how
			   often each constant pool entry is shared/reused.
			   The value below is empirically derived from the
			   CSiBE benchmark (and the optimal value may drift
			   over time).  */
			igain += COSTS_N_BYTES (0);
		    }
		  else
		    {
		      /* DImode can be immediate for TARGET_64BIT
			 and SImode always.  */
		      igain += m * COSTS_N_INSNS (1);
		      igain -= vector_const_cost (src, bb);
		    }
		}
	      else if (MEM_P (dst))
		{
		  igain += (m * ix86_cost->int_store[2]
			    - ix86_cost->sse_store[sse_cost_idx]);
		  igain -= vector_const_cost (src, bb);
		}
	      break;

	    case VEC_SELECT:
	      if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
		{
		  // movd (4 bytes) replaced with movdqa (4 bytes).
		  if (!!speed_p)
		    igain += COSTS_N_INSNS (ix86_cost->sse_to_integer
					    - ix86_cost->xmm_move) / 2;
		}
	      else
		{
		  // pshufd; movd replaced with pshufd.
		  if (!speed_p)
		    igain += COSTS_N_BYTES (4);
		  else
		    igain += ix86_cost->sse_to_integer;
		}
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}

      if (speed_p)
	weighted_gain += bb_freq * igain;
      gain += igain;

      if (igain != 0 && dump_file)
	{
	  fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for",
		   igain, bb_freq.to_double ());
	  dump_insn_slim (dump_file, insn);
	}
    }

  if (dump_file)
    {
      fprintf (dump_file, " Instruction conversion gain: %d, \n",
	       gain);
      fprintf (dump_file, " Registers conversion cost: %d\n",
	       cost_sse_integer);
      fprintf (dump_file, " Weighted instruction conversion gain: %.2f, \n",
	       weighted_gain.to_double ());
      fprintf (dump_file, " Weighted registers conversion cost: %.2f\n",
	       weighted_cost_sse_integer.to_double ());
    }

  /* Prefer the profile-weighted comparison; the unweighted sums break
     ties (e.g. when no block was optimized for speed).  */
  if (weighted_gain != weighted_cost_sse_integer)
    return weighted_gain > weighted_cost_sse_integer;
  else
    return gain > cost_sse_integer;;
}
895 :
896 : /* Insert generated conversion instruction sequence INSNS
897 : after instruction AFTER. New BB may be required in case
898 : instruction has EH region attached. */
899 :
900 : void
901 30192 : scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
902 : {
903 30192 : if (!control_flow_insn_p (after))
904 : {
905 29979 : emit_insn_after (insns, after);
906 29979 : return;
907 : }
908 :
909 213 : basic_block bb = BLOCK_FOR_INSN (after);
910 213 : edge e = find_fallthru_edge (bb->succs);
911 213 : gcc_assert (e);
912 :
913 213 : basic_block new_bb = split_edge (e);
914 213 : emit_insn_after (insns, BB_HEAD (new_bb));
915 : }
916 :
917 : } // anon namespace
918 :
919 : /* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
920 : zeroing the upper parts. */
921 :
922 : static rtx
923 173048 : gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
924 : {
925 346096 : switch (GET_MODE_NUNITS (vmode))
926 : {
927 25 : case 1:
928 25 : return gen_rtx_SUBREG (vmode, gpr, 0);
929 172466 : case 2:
930 344932 : return gen_rtx_VEC_CONCAT (vmode, gpr,
931 : CONST0_RTX (GET_MODE_INNER (vmode)));
932 557 : default:
933 557 : return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
934 : CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U));
935 : }
936 : }
937 :
/* Make vector copies for all register REGNO definitions
   and replace its uses in a chain.  Emits (after INSN) the moves that
   copy scalar REG into its vector twin from defs_map, choosing the
   sequence by target capability.  */

void
scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
{
  rtx vreg = *defs_map.get (reg);

  start_sequence ();
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    {
      /* Direct GPR->XMM moves are disabled: bounce through a stack slot.  */
      rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
      if (smode == DImode && !TARGET_64BIT)
	{
	  /* Store the DImode value as two SImode halves.  */
	  emit_move_insn (adjust_address (tmp, SImode, 0),
			  gen_rtx_SUBREG (SImode, reg, 0));
	  emit_move_insn (adjust_address (tmp, SImode, 4),
			  gen_rtx_SUBREG (SImode, reg, 4));
	}
      else
	emit_move_insn (copy_rtx (tmp), reg);
      emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
			      gen_gpr_to_xmm_move_src (vmode, tmp)));
    }
  else if (!TARGET_64BIT && smode == DImode)
    {
      if (TARGET_SSE4_1)
	{
	  /* movd the low half, pinsrd the high half.  */
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 0)));
	  emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
					gen_rtx_SUBREG (V4SImode, vreg, 0),
					gen_rtx_SUBREG (SImode, reg, 4),
					GEN_INT (2)));
	}
      else
	{
	  /* Pre-SSE4.1: movd each half and interleave the low words.  */
	  rtx tmp = gen_reg_rtx (DImode);
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 0)));
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 4)));
	  emit_insn (gen_vec_interleave_lowv4si
		     (gen_rtx_SUBREG (V4SImode, vreg, 0),
		      gen_rtx_SUBREG (V4SImode, vreg, 0),
		      gen_rtx_SUBREG (V4SImode, tmp, 0)));
	}
    }
  else
    /* Direct inter-unit move, zero-extending into the vector.  */
    emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
			    gen_gpr_to_xmm_move_src (vmode, reg)));
  rtx_insn *seq = end_sequence ();
  emit_conversion_insns (seq, insn);

  if (dump_file)
    fprintf (dump_file,
	     " Copied r%d to a vector register r%d for insn %d\n",
	     REGNO (reg), REGNO (vreg), INSN_UID (insn));
}
1000 :
1001 : /* Copy the definition SRC of INSN inside the chain to DST for
1002 : scalar uses outside of the chain. */
1003 :
1004 : void
1005 21370 : scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src)
1006 : {
1007 21370 : start_sequence ();
1008 21370 : if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1009 : {
1010 0 : rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
1011 0 : emit_move_insn (tmp, src);
1012 0 : if (!TARGET_64BIT && smode == DImode)
1013 : {
1014 0 : emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
1015 : adjust_address (tmp, SImode, 0));
1016 0 : emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
1017 : adjust_address (tmp, SImode, 4));
1018 : }
1019 : else
1020 0 : emit_move_insn (dst, copy_rtx (tmp));
1021 : }
1022 21370 : else if (!TARGET_64BIT && smode == DImode)
1023 : {
1024 21002 : if (TARGET_SSE4_1)
1025 : {
1026 0 : rtx tmp = gen_rtx_PARALLEL (VOIDmode,
1027 : gen_rtvec (1, const0_rtx));
1028 0 : emit_insn
1029 0 : (gen_rtx_SET
1030 : (gen_rtx_SUBREG (SImode, dst, 0),
1031 : gen_rtx_VEC_SELECT (SImode,
1032 : gen_rtx_SUBREG (V4SImode, src, 0),
1033 : tmp)));
1034 :
1035 0 : tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1036 0 : emit_insn
1037 0 : (gen_rtx_SET
1038 : (gen_rtx_SUBREG (SImode, dst, 4),
1039 : gen_rtx_VEC_SELECT (SImode,
1040 : gen_rtx_SUBREG (V4SImode, src, 0),
1041 : tmp)));
1042 : }
1043 : else
1044 : {
1045 21002 : rtx vcopy = gen_reg_rtx (V2DImode);
1046 21002 : emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0));
1047 21002 : emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
1048 : gen_rtx_SUBREG (SImode, vcopy, 0));
1049 21002 : emit_move_insn (vcopy,
1050 : gen_rtx_LSHIFTRT (V2DImode,
1051 : vcopy, GEN_INT (32)));
1052 21002 : emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
1053 : gen_rtx_SUBREG (SImode, vcopy, 0));
1054 : }
1055 : }
1056 : else
1057 368 : emit_move_insn (dst, src);
1058 :
1059 21370 : rtx_insn *seq = end_sequence ();
1060 21370 : emit_conversion_insns (seq, insn);
1061 :
1062 21370 : if (dump_file)
1063 0 : fprintf (dump_file,
1064 : " Copied r%d to a scalar register r%d for insn %d\n",
1065 0 : REGNO (src), REGNO (dst), INSN_UID (insn));
1066 21370 : }
1067 :
1068 : /* Helper function to convert immediate constant X to vmode. */
1069 : static rtx
1070 39460 : smode_convert_cst (rtx x, enum machine_mode vmode)
1071 : {
1072 : /* Prefer all ones vector in case of -1. */
1073 39460 : if (constm1_operand (x, GET_MODE (x)))
1074 894 : return CONSTM1_RTX (vmode);
1075 :
1076 38566 : unsigned n = GET_MODE_NUNITS (vmode);
1077 38566 : rtx *v = XALLOCAVEC (rtx, n);
1078 38566 : v[0] = x;
1079 44350 : for (unsigned i = 1; i < n; ++i)
1080 5784 : v[i] = const0_rtx;
1081 38566 : return gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
1082 : }
1083 :
1084 : /* Convert operand OP in INSN. We should handle
1085 : memory operands and uninitialized registers.
1086 : All other register uses are converted during
1087 : registers conversion. */
1088 :
1089 : void
1090 246449 : scalar_chain::convert_op (rtx *op, rtx_insn *insn)
1091 : {
1092 246449 : rtx tmp;
1093 :
1094 246449 : if (GET_MODE (*op) == V1TImode)
1095 : return;
1096 :
1097 246298 : *op = copy_rtx_if_shared (*op);
1098 :
1099 246298 : if (GET_CODE (*op) == NOT
1100 246298 : || GET_CODE (*op) == ASHIFT)
1101 : {
1102 3490 : convert_op (&XEXP (*op, 0), insn);
1103 3490 : PUT_MODE (*op, vmode);
1104 : }
1105 : else if (MEM_P (*op))
1106 : {
1107 172937 : rtx_insn *movabs = NULL;
1108 :
1109 : /* Emit MOVABS to load from a 64-bit absolute address to a GPR. */
1110 172937 : if (!memory_operand (*op, GET_MODE (*op)))
1111 : {
1112 0 : tmp = gen_reg_rtx (GET_MODE (*op));
1113 0 : movabs = emit_insn_before (gen_rtx_SET (tmp, *op), insn);
1114 :
1115 0 : *op = tmp;
1116 : }
1117 :
1118 172937 : tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (GET_MODE (*op)), 0);
1119 :
1120 172937 : rtx_insn *eh_insn
1121 172937 : = emit_insn_before (gen_rtx_SET (copy_rtx (tmp),
1122 : gen_gpr_to_xmm_move_src (vmode, *op)),
1123 172937 : insn);
1124 :
1125 172937 : if (cfun->can_throw_non_call_exceptions)
1126 : {
1127 : /* Handle REG_EH_REGION note. */
1128 168856 : rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
1129 168856 : if (note)
1130 : {
1131 3597 : if (movabs)
1132 0 : eh_insn = movabs;
1133 3597 : control_flow_insns.safe_push (eh_insn);
1134 3597 : add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
1135 : }
1136 : }
1137 :
1138 172937 : *op = tmp;
1139 :
1140 172937 : if (dump_file)
1141 0 : fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
1142 0 : INSN_UID (insn), reg_or_subregno (tmp));
1143 : }
1144 : else if (REG_P (*op))
1145 63459 : *op = gen_rtx_SUBREG (vmode, *op, 0);
1146 : else if (CONST_SCALAR_INT_P (*op))
1147 : {
1148 6412 : rtx vec_cst = smode_convert_cst (*op, vmode);
1149 :
1150 6412 : if (!standard_sse_constant_p (vec_cst, vmode))
1151 : {
1152 2698 : start_sequence ();
1153 2698 : vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
1154 2698 : rtx_insn *seq = end_sequence ();
1155 2698 : emit_insn_before (seq, insn);
1156 : }
1157 :
1158 6412 : tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);
1159 :
1160 6412 : emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
1161 6412 : *op = tmp;
1162 : }
1163 : else
1164 : {
1165 0 : gcc_assert (SUBREG_P (*op));
1166 0 : gcc_assert (GET_MODE (*op) == vmode);
1167 : }
1168 : }
1169 :
1170 : /* Convert CCZmode COMPARE to vector mode. */
1171 :
1172 : rtx
1173 10 : scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
1174 : {
1175 10 : rtx src, tmp;
1176 :
1177 : /* Handle any REG_EQUAL notes. */
1178 10 : tmp = find_reg_equal_equiv_note (insn);
1179 10 : if (tmp)
1180 : {
1181 1 : if (GET_CODE (XEXP (tmp, 0)) == COMPARE
1182 1 : && GET_MODE (XEXP (tmp, 0)) == CCZmode
1183 1 : && REG_P (XEXP (XEXP (tmp, 0), 0)))
1184 : {
1185 1 : rtx *op = &XEXP (XEXP (tmp, 0), 1);
1186 1 : if (CONST_SCALAR_INT_P (*op))
1187 : {
1188 1 : if (constm1_operand (*op, GET_MODE (*op)))
1189 0 : *op = CONSTM1_RTX (vmode);
1190 : else
1191 : {
1192 1 : unsigned n = GET_MODE_NUNITS (vmode);
1193 1 : rtx *v = XALLOCAVEC (rtx, n);
1194 1 : v[0] = *op;
1195 1 : for (unsigned i = 1; i < n; ++i)
1196 0 : v[i] = const0_rtx;
1197 1 : *op = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
1198 : }
1199 : tmp = NULL_RTX;
1200 : }
1201 0 : else if (REG_P (*op))
1202 : tmp = NULL_RTX;
1203 : }
1204 :
1205 : if (tmp)
1206 0 : remove_note (insn, tmp);
1207 : }
1208 :
1209 : /* Comparison against anything other than zero, requires an XOR. */
1210 10 : if (op2 != const0_rtx)
1211 : {
1212 4 : convert_op (&op1, insn);
1213 4 : convert_op (&op2, insn);
1214 : /* If both operands are MEMs, explicitly load the OP1 into TMP. */
1215 4 : if (MEM_P (op1) && MEM_P (op2))
1216 : {
1217 0 : tmp = gen_reg_rtx (vmode);
1218 0 : emit_insn_before (gen_rtx_SET (tmp, op1), insn);
1219 0 : src = tmp;
1220 : }
1221 : else
1222 : src = op1;
1223 4 : src = gen_rtx_XOR (vmode, src, op2);
1224 : }
1225 6 : else if (GET_CODE (op1) == AND
1226 0 : && GET_CODE (XEXP (op1, 0)) == NOT)
1227 : {
1228 0 : rtx op11 = XEXP (XEXP (op1, 0), 0);
1229 0 : rtx op12 = XEXP (op1, 1);
1230 0 : convert_op (&op11, insn);
1231 0 : convert_op (&op12, insn);
1232 0 : if (!REG_P (op11))
1233 : {
1234 0 : tmp = gen_reg_rtx (vmode);
1235 0 : emit_insn_before (gen_rtx_SET (tmp, op11), insn);
1236 0 : op11 = tmp;
1237 : }
1238 0 : src = gen_rtx_AND (vmode, gen_rtx_NOT (vmode, op11), op12);
1239 0 : }
1240 6 : else if (GET_CODE (op1) == AND)
1241 : {
1242 0 : rtx op11 = XEXP (op1, 0);
1243 0 : rtx op12 = XEXP (op1, 1);
1244 0 : convert_op (&op11, insn);
1245 0 : convert_op (&op12, insn);
1246 0 : if (!REG_P (op11))
1247 : {
1248 0 : tmp = gen_reg_rtx (vmode);
1249 0 : emit_insn_before (gen_rtx_SET (tmp, op11), insn);
1250 0 : op11 = tmp;
1251 : }
1252 0 : return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, op11, op12),
1253 : UNSPEC_PTEST);
1254 : }
1255 : else
1256 : {
1257 6 : convert_op (&op1, insn);
1258 6 : src = op1;
1259 : }
1260 :
1261 10 : if (!REG_P (src))
1262 : {
1263 6 : tmp = gen_reg_rtx (vmode);
1264 6 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
1265 6 : src = tmp;
1266 : }
1267 :
1268 10 : if (vmode == V2DImode)
1269 : {
1270 0 : tmp = gen_reg_rtx (vmode);
1271 0 : emit_insn_before (gen_vec_interleave_lowv2di (tmp, src, src), insn);
1272 0 : src = tmp;
1273 : }
1274 10 : else if (vmode == V4SImode)
1275 : {
1276 0 : tmp = gen_reg_rtx (vmode);
1277 0 : emit_insn_before (gen_sse2_pshufd (tmp, src, const0_rtx), insn);
1278 0 : src = tmp;
1279 : }
1280 :
1281 10 : return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, src, src), UNSPEC_PTEST);
1282 : }
1283 :
1284 : /* Helper function for converting INSN to vector mode. */
1285 :
1286 : void
1287 1336784 : scalar_chain::convert_insn_common (rtx_insn *insn)
1288 : {
1289 : /* Generate copies for out-of-chain uses of defs and adjust debug uses. */
1290 2043450 : for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref))
1291 706666 : if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
1292 : {
1293 22795 : df_link *use;
1294 43884 : for (use = DF_REF_CHAIN (ref); use; use = use->next)
1295 42459 : if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref))
1296 42459 : && (DF_REF_REG_MEM_P (use->ref)
1297 37941 : || !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))))
1298 : break;
1299 22795 : if (use)
1300 21370 : convert_reg (insn, DF_REF_REG (ref),
1301 21370 : *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]));
1302 1425 : else if (MAY_HAVE_DEBUG_BIND_INSNS)
1303 : {
1304 : /* If we generated a scalar copy we can leave debug-insns
1305 : as-is, if not, we have to adjust them. */
1306 1305 : auto_vec<rtx_insn *, 5> to_reset_debug_insns;
1307 3903 : for (use = DF_REF_CHAIN (ref); use; use = use->next)
1308 2598 : if (DEBUG_INSN_P (DF_REF_INSN (use->ref)))
1309 : {
1310 849 : rtx_insn *debug_insn = DF_REF_INSN (use->ref);
1311 : /* If there's a reaching definition outside of the
1312 : chain we have to reset. */
1313 849 : df_link *def;
1314 2972 : for (def = DF_REF_CHAIN (use->ref); def; def = def->next)
1315 2307 : if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref)))
1316 : break;
1317 849 : if (def)
1318 184 : to_reset_debug_insns.safe_push (debug_insn);
1319 : else
1320 : {
1321 665 : *DF_REF_REAL_LOC (use->ref)
1322 665 : = *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]);
1323 665 : df_insn_rescan (debug_insn);
1324 : }
1325 : }
1326 : /* Have to do the reset outside of the DF_CHAIN walk to not
1327 : disrupt it. */
1328 2794 : while (!to_reset_debug_insns.is_empty ())
1329 : {
1330 184 : rtx_insn *debug_insn = to_reset_debug_insns.pop ();
1331 184 : INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC ();
1332 184 : df_insn_rescan_debug_internal (debug_insn);
1333 : }
1334 1305 : }
1335 : }
1336 :
1337 : /* Replace uses in this insn with the defs we use in the chain. */
1338 3344151 : for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref))
1339 2007367 : if (!DF_REF_REG_MEM_P (ref))
1340 716366 : if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]))
1341 : {
1342 : /* Also update a corresponding REG_DEAD note. */
1343 35087 : rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref));
1344 35087 : if (note)
1345 23178 : XEXP (note, 0) = *vreg;
1346 35087 : *DF_REF_REAL_LOC (ref) = *vreg;
1347 : }
1348 1336784 : }
1349 :
1350 : /* Convert INSN which is an SImode or DImode rotation by a constant
1351 : to vector mode. CODE is either ROTATE or ROTATERT with operands
1352 : OP0 and OP1. Returns the SET_SRC of the last instruction in the
1353 : resulting sequence, which is emitted before INSN. */
1354 :
1355 : rtx
1356 92 : general_scalar_chain::convert_rotate (enum rtx_code code, rtx op0, rtx op1,
1357 : rtx_insn *insn)
1358 : {
1359 92 : int bits = INTVAL (op1);
1360 92 : rtx pat, result;
1361 :
1362 92 : convert_op (&op0, insn);
1363 92 : if (bits == 0)
1364 0 : return op0;
1365 :
1366 92 : if (smode == DImode)
1367 : {
1368 92 : if (code == ROTATE)
1369 45 : bits = 64 - bits;
1370 92 : if (bits == 32)
1371 : {
1372 0 : rtx tmp1 = gen_reg_rtx (V4SImode);
1373 0 : pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
1374 : GEN_INT (225));
1375 0 : emit_insn_before (pat, insn);
1376 0 : result = gen_lowpart (V2DImode, tmp1);
1377 : }
1378 92 : else if (TARGET_AVX512VL)
1379 0 : result = simplify_gen_binary (code, V2DImode, op0, op1);
1380 92 : else if (bits == 16 || bits == 48)
1381 : {
1382 0 : rtx tmp1 = gen_reg_rtx (V8HImode);
1383 0 : pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0),
1384 : GEN_INT (bits == 16 ? 57 : 147));
1385 0 : emit_insn_before (pat, insn);
1386 0 : result = gen_lowpart (V2DImode, tmp1);
1387 : }
1388 92 : else if ((bits & 0x07) == 0)
1389 : {
1390 0 : rtx tmp1 = gen_reg_rtx (V4SImode);
1391 0 : pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
1392 : GEN_INT (68));
1393 0 : emit_insn_before (pat, insn);
1394 0 : rtx tmp2 = gen_reg_rtx (V1TImode);
1395 0 : pat = gen_sse2_lshrv1ti3 (tmp2, gen_lowpart (V1TImode, tmp1),
1396 : GEN_INT (bits));
1397 0 : emit_insn_before (pat, insn);
1398 0 : result = gen_lowpart (V2DImode, tmp2);
1399 : }
1400 : else
1401 : {
1402 92 : rtx tmp1 = gen_reg_rtx (V4SImode);
1403 92 : pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
1404 : GEN_INT (20));
1405 92 : emit_insn_before (pat, insn);
1406 92 : rtx tmp2 = gen_reg_rtx (V2DImode);
1407 92 : pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
1408 : GEN_INT (bits & 31));
1409 92 : emit_insn_before (pat, insn);
1410 92 : rtx tmp3 = gen_reg_rtx (V4SImode);
1411 139 : pat = gen_sse2_pshufd (tmp3, gen_lowpart (V4SImode, tmp2),
1412 : GEN_INT (bits > 32 ? 34 : 136));
1413 92 : emit_insn_before (pat, insn);
1414 92 : result = gen_lowpart (V2DImode, tmp3);
1415 : }
1416 : }
1417 0 : else if (bits == 16)
1418 : {
1419 0 : rtx tmp1 = gen_reg_rtx (V8HImode);
1420 0 : pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0), GEN_INT (225));
1421 0 : emit_insn_before (pat, insn);
1422 0 : result = gen_lowpart (V4SImode, tmp1);
1423 : }
1424 0 : else if (TARGET_AVX512VL)
1425 0 : result = simplify_gen_binary (code, V4SImode, op0, op1);
1426 : else
1427 : {
1428 0 : if (code == ROTATE)
1429 0 : bits = 32 - bits;
1430 :
1431 0 : rtx tmp1 = gen_reg_rtx (V4SImode);
1432 0 : emit_insn_before (gen_sse2_pshufd (tmp1, op0, GEN_INT (224)), insn);
1433 0 : rtx tmp2 = gen_reg_rtx (V2DImode);
1434 0 : pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
1435 : GEN_INT (bits));
1436 0 : emit_insn_before (pat, insn);
1437 0 : result = gen_lowpart (V4SImode, tmp2);
1438 : }
1439 :
1440 : return result;
1441 : }
1442 :
1443 : /* Convert INSN to vector mode. */
1444 :
1445 : void
1446 410511 : general_scalar_chain::convert_insn (rtx_insn *insn)
1447 : {
1448 410511 : rtx def_set = single_set (insn);
1449 410511 : rtx src = SET_SRC (def_set);
1450 410511 : rtx dst = SET_DEST (def_set);
1451 410511 : rtx subreg;
1452 :
1453 410511 : if (MEM_P (dst) && !REG_P (src))
1454 : {
1455 : /* There are no scalar integer instructions and therefore
1456 : temporary register usage is required. */
1457 762 : rtx tmp = gen_reg_rtx (smode);
1458 762 : emit_conversion_insns (gen_move_insn (dst, tmp), insn);
1459 762 : dst = gen_rtx_SUBREG (vmode, tmp, 0);
1460 762 : }
1461 409749 : else if (REG_P (dst) && GET_MODE (dst) == smode)
1462 : {
1463 : /* Replace the definition with a SUBREG to the definition we
1464 : use inside the chain. */
1465 214157 : rtx *vdef = defs_map.get (dst);
1466 214157 : if (vdef)
1467 22795 : dst = *vdef;
1468 214157 : dst = gen_rtx_SUBREG (vmode, dst, 0);
1469 : /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
1470 : is a non-REG_P. So kill those off. */
1471 214157 : rtx note = find_reg_equal_equiv_note (insn);
1472 214157 : if (note)
1473 9538 : remove_note (insn, note);
1474 : }
1475 :
1476 410511 : switch (GET_CODE (src))
1477 : {
1478 29592 : case PLUS:
1479 29592 : case MINUS:
1480 29592 : case IOR:
1481 29592 : case XOR:
1482 29592 : case AND:
1483 29592 : case SMAX:
1484 29592 : case SMIN:
1485 29592 : case UMAX:
1486 29592 : case UMIN:
1487 29592 : convert_op (&XEXP (src, 1), insn);
1488 : /* FALLTHRU */
1489 :
1490 36870 : case ABS:
1491 36870 : case ASHIFT:
1492 36870 : case ASHIFTRT:
1493 36870 : case LSHIFTRT:
1494 36870 : convert_op (&XEXP (src, 0), insn);
1495 36870 : PUT_MODE (src, vmode);
1496 36870 : break;
1497 :
1498 92 : case ROTATE:
1499 92 : case ROTATERT:
1500 92 : src = convert_rotate (GET_CODE (src), XEXP (src, 0), XEXP (src, 1),
1501 : insn);
1502 92 : break;
1503 :
1504 400 : case NEG:
1505 400 : src = XEXP (src, 0);
1506 :
1507 400 : if (GET_CODE (src) == ABS)
1508 : {
1509 0 : src = XEXP (src, 0);
1510 0 : convert_op (&src, insn);
1511 0 : subreg = gen_reg_rtx (vmode);
1512 0 : emit_insn_before (gen_rtx_SET (subreg,
1513 : gen_rtx_ABS (vmode, src)), insn);
1514 0 : src = subreg;
1515 : }
1516 : else
1517 400 : convert_op (&src, insn);
1518 :
1519 400 : subreg = gen_reg_rtx (vmode);
1520 400 : emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
1521 400 : src = gen_rtx_MINUS (vmode, subreg, src);
1522 400 : break;
1523 :
1524 250 : case NOT:
1525 250 : src = XEXP (src, 0);
1526 250 : convert_op (&src, insn);
1527 250 : subreg = gen_reg_rtx (vmode);
1528 250 : emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
1529 250 : src = gen_rtx_XOR (vmode, src, subreg);
1530 250 : break;
1531 :
1532 170787 : case MEM:
1533 170787 : if (!REG_P (dst))
1534 170787 : convert_op (&src, insn);
1535 : break;
1536 :
1537 196972 : case REG:
1538 196972 : if (!MEM_P (dst))
1539 1380 : convert_op (&src, insn);
1540 : break;
1541 :
1542 0 : case SUBREG:
1543 0 : gcc_assert (GET_MODE (src) == vmode);
1544 : break;
1545 :
1546 0 : case COMPARE:
1547 0 : dst = gen_rtx_REG (CCZmode, FLAGS_REG);
1548 0 : src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
1549 0 : break;
1550 :
1551 3400 : case CONST_INT:
1552 3400 : convert_op (&src, insn);
1553 3400 : break;
1554 :
1555 1740 : case VEC_SELECT:
1556 1740 : if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
1557 1325 : src = XEXP (src, 0);
1558 415 : else if (smode == DImode)
1559 : {
1560 172 : rtx tmp = gen_lowpart (V1TImode, XEXP (src, 0));
1561 172 : dst = gen_lowpart (V1TImode, dst);
1562 172 : src = gen_rtx_LSHIFTRT (V1TImode, tmp, GEN_INT (64));
1563 : }
1564 : else
1565 : {
1566 243 : rtx tmp = XVECEXP (XEXP (src, 1), 0, 0);
1567 243 : rtvec vec = gen_rtvec (4, tmp, tmp, tmp, tmp);
1568 243 : rtx par = gen_rtx_PARALLEL (VOIDmode, vec);
1569 243 : src = gen_rtx_VEC_SELECT (vmode, XEXP (src, 0), par);
1570 : }
1571 : break;
1572 :
1573 0 : default:
1574 0 : gcc_unreachable ();
1575 : }
1576 :
1577 410511 : SET_SRC (def_set) = src;
1578 410511 : SET_DEST (def_set) = dst;
1579 :
1580 : /* Drop possible dead definitions. */
1581 410511 : PATTERN (insn) = def_set;
1582 :
1583 410511 : INSN_CODE (insn) = -1;
1584 410511 : int patt = recog_memoized (insn);
1585 410511 : if (patt == -1)
1586 0 : fatal_insn_not_found (insn);
1587 410511 : df_insn_rescan (insn);
1588 410511 : }
1589 :
1590 : /* Helper function to compute gain for loading an immediate constant.
1591 : Typically, two movabsq for TImode vs. vmovdqa for V1TImode, but
1592 : with numerous special cases. */
1593 :
1594 : static int
1595 8 : timode_immed_const_gain (rtx cst, basic_block bb)
1596 : {
1597 : /* movabsq vs. movabsq+vmovq+vunpacklqdq. */
1598 8 : if (CONST_WIDE_INT_P (cst)
1599 5 : && CONST_WIDE_INT_NUNITS (cst) == 2
1600 13 : && CONST_WIDE_INT_ELT (cst, 0) == CONST_WIDE_INT_ELT (cst, 1))
1601 0 : return optimize_bb_for_size_p (bb) ? -COSTS_N_BYTES (9)
1602 : : -COSTS_N_INSNS (2);
1603 : /* 2x movabsq ~ vmovdqa. */
1604 : return 0;
1605 : }
1606 :
1607 : /* Return true it's cost profitable for for chain conversion. */
1608 :
1609 : bool
1610 473775 : timode_scalar_chain::compute_convert_gain ()
1611 : {
1612 : /* Assume that if we have to move TImode values between units,
1613 : then transforming this chain isn't worth it. */
1614 473775 : if (cost_sse_integer)
1615 : return false;
1616 :
1617 473775 : bitmap_iterator bi;
1618 473775 : unsigned insn_uid;
1619 :
1620 : /* Split ties to prefer V1TImode when not optimizing for size. */
1621 473775 : int gain = optimize_size ? 0 : 1;
1622 473775 : sreal weighted_gain = 0;
1623 :
1624 473775 : if (dump_file)
1625 0 : fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1626 :
1627 1406144 : EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1628 : {
1629 932369 : rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1630 932369 : rtx def_set = single_set (insn);
1631 932369 : rtx src = SET_SRC (def_set);
1632 932369 : rtx dst = SET_DEST (def_set);
1633 932369 : HOST_WIDE_INT op1val;
1634 932369 : basic_block bb = BLOCK_FOR_INSN (insn);
1635 932369 : int scost, vcost;
1636 932369 : int igain = 0;
1637 932369 : profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
1638 932369 : bool speed_p = optimize_bb_for_speed_p (bb);
1639 932369 : sreal bb_freq = bb->count.to_sreal_scale (entry_count);
1640 :
1641 932369 : switch (GET_CODE (src))
1642 : {
1643 458213 : case REG:
1644 458213 : if (!speed_p)
1645 20482 : igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3);
1646 : else
1647 : igain = COSTS_N_INSNS (1);
1648 : break;
1649 :
1650 426596 : case MEM:
1651 426596 : igain = !speed_p ? COSTS_N_BYTES (7) : COSTS_N_INSNS (1);
1652 : break;
1653 :
1654 11192 : case CONST_INT:
1655 11192 : if (MEM_P (dst)
1656 11192 : && standard_sse_constant_p (src, V1TImode))
1657 10688 : igain = !speed_p ? COSTS_N_BYTES (11) : 1;
1658 : break;
1659 :
1660 33243 : case CONST_WIDE_INT:
1661 : /* 2 x mov vs. vmovdqa. */
1662 33243 : if (MEM_P (dst))
1663 33059 : igain = !speed_p ? COSTS_N_BYTES (3) : COSTS_N_INSNS (1);
1664 : break;
1665 :
1666 19 : case NOT:
1667 19 : if (MEM_P (dst))
1668 24266 : igain = -COSTS_N_INSNS (1);
1669 : break;
1670 :
1671 14 : case AND:
1672 14 : if (!MEM_P (dst))
1673 3 : igain = COSTS_N_INSNS (1);
1674 14 : if (CONST_SCALAR_INT_P (XEXP (src, 1)))
1675 5 : igain += timode_immed_const_gain (XEXP (src, 1), bb);
1676 : break;
1677 :
1678 2692 : case XOR:
1679 2692 : case IOR:
1680 2692 : if (timode_concatdi_p (src))
1681 : {
1682 : /* vmovq;vpinsrq (11 bytes). */
1683 2641 : igain = speed_p ? -2 * ix86_cost->sse_to_integer
1684 : : -COSTS_N_BYTES (11);
1685 : break;
1686 : }
1687 51 : if (!MEM_P (dst))
1688 43 : igain = COSTS_N_INSNS (1);
1689 51 : if (CONST_SCALAR_INT_P (XEXP (src, 1)))
1690 3 : igain += timode_immed_const_gain (XEXP (src, 1), bb);
1691 : break;
1692 :
1693 0 : case PLUS:
1694 0 : if (timode_concatdi_p (src))
1695 : /* vmovq;vpinsrq (11 bytes). */
1696 0 : igain = speed_p ? -2 * ix86_cost->sse_to_integer
1697 : : -COSTS_N_BYTES (11);
1698 : break;
1699 :
1700 158 : case ASHIFT:
1701 158 : case LSHIFTRT:
1702 : /* See ix86_expand_v1ti_shift. */
1703 158 : op1val = INTVAL (XEXP (src, 1));
1704 158 : if (!speed_p)
1705 : {
1706 15 : if (op1val == 64 || op1val == 65)
1707 : scost = COSTS_N_BYTES (5);
1708 10 : else if (op1val >= 66)
1709 : scost = COSTS_N_BYTES (6);
1710 10 : else if (op1val == 1)
1711 : scost = COSTS_N_BYTES (8);
1712 : else
1713 : scost = COSTS_N_BYTES (9);
1714 :
1715 14 : if ((op1val & 7) == 0)
1716 : vcost = COSTS_N_BYTES (5);
1717 10 : else if (op1val > 64)
1718 : vcost = COSTS_N_BYTES (10);
1719 : else
1720 10 : vcost = TARGET_AVX ? COSTS_N_BYTES (19) : COSTS_N_BYTES (23);
1721 : }
1722 : else
1723 : {
1724 143 : scost = COSTS_N_INSNS (2);
1725 143 : if ((op1val & 7) == 0)
1726 : vcost = COSTS_N_INSNS (1);
1727 110 : else if (op1val > 64)
1728 : vcost = COSTS_N_INSNS (2);
1729 : else
1730 110 : vcost = TARGET_AVX ? COSTS_N_INSNS (4) : COSTS_N_INSNS (5);
1731 : }
1732 158 : igain = scost - vcost;
1733 158 : break;
1734 :
1735 103 : case ASHIFTRT:
1736 : /* See ix86_expand_v1ti_ashiftrt. */
1737 103 : op1val = INTVAL (XEXP (src, 1));
1738 103 : if (!speed_p)
1739 : {
1740 7 : if (op1val == 64 || op1val == 127)
1741 : scost = COSTS_N_BYTES (7);
1742 7 : else if (op1val == 1)
1743 : scost = COSTS_N_BYTES (8);
1744 7 : else if (op1val == 65)
1745 : scost = COSTS_N_BYTES (10);
1746 7 : else if (op1val >= 66)
1747 : scost = COSTS_N_BYTES (11);
1748 : else
1749 : scost = COSTS_N_BYTES (9);
1750 :
1751 0 : if (op1val == 127)
1752 : vcost = COSTS_N_BYTES (10);
1753 7 : else if (op1val == 64)
1754 : vcost = COSTS_N_BYTES (14);
1755 7 : else if (op1val == 96)
1756 : vcost = COSTS_N_BYTES (18);
1757 7 : else if (op1val >= 111)
1758 : vcost = COSTS_N_BYTES (15);
1759 7 : else if (TARGET_AVX2 && op1val == 32)
1760 : vcost = COSTS_N_BYTES (16);
1761 7 : else if (TARGET_SSE4_1 && op1val == 32)
1762 : vcost = COSTS_N_BYTES (20);
1763 7 : else if (op1val >= 96)
1764 : vcost = COSTS_N_BYTES (23);
1765 7 : else if ((op1val & 7) == 0)
1766 : vcost = COSTS_N_BYTES (28);
1767 7 : else if (TARGET_AVX2 && op1val < 32)
1768 : vcost = COSTS_N_BYTES (30);
1769 7 : else if (op1val == 1 || op1val >= 64)
1770 : vcost = COSTS_N_BYTES (42);
1771 : else
1772 7 : vcost = COSTS_N_BYTES (47);
1773 : }
1774 : else
1775 : {
1776 96 : if (op1val >= 65 && op1val <= 126)
1777 : scost = COSTS_N_INSNS (3);
1778 : else
1779 96 : scost = COSTS_N_INSNS (2);
1780 :
1781 96 : if (op1val == 127)
1782 : vcost = COSTS_N_INSNS (2);
1783 96 : else if (op1val == 64)
1784 : vcost = COSTS_N_INSNS (3);
1785 96 : else if (op1val == 96)
1786 : vcost = COSTS_N_INSNS (3);
1787 96 : else if (op1val >= 111)
1788 : vcost = COSTS_N_INSNS (3);
1789 96 : else if (TARGET_SSE4_1 && op1val == 32)
1790 : vcost = COSTS_N_INSNS (3);
1791 96 : else if (TARGET_SSE4_1
1792 0 : && (op1val == 8 || op1val == 16 || op1val == 24))
1793 : vcost = COSTS_N_INSNS (3);
1794 96 : else if (op1val >= 96)
1795 : vcost = COSTS_N_INSNS (4);
1796 96 : else if (TARGET_SSE4_1 && (op1val == 28 || op1val == 80))
1797 : vcost = COSTS_N_INSNS (4);
1798 96 : else if ((op1val & 7) == 0)
1799 : vcost = COSTS_N_INSNS (5);
1800 96 : else if (TARGET_AVX2 && op1val < 32)
1801 : vcost = COSTS_N_INSNS (6);
1802 96 : else if (TARGET_SSE4_1 && op1val < 15)
1803 : vcost = COSTS_N_INSNS (6);
1804 96 : else if (op1val == 1 || op1val >= 64)
1805 : vcost = COSTS_N_INSNS (8);
1806 : else
1807 0 : vcost = COSTS_N_INSNS (9);
1808 : }
1809 103 : igain = scost - vcost;
1810 103 : break;
1811 :
1812 5 : case ROTATE:
1813 5 : case ROTATERT:
1814 : /* See ix86_expand_v1ti_rotate. */
1815 5 : op1val = INTVAL (XEXP (src, 1));
1816 5 : if (!speed_p)
1817 : {
1818 0 : scost = COSTS_N_BYTES (13);
1819 0 : if ((op1val & 31) == 0)
1820 : vcost = COSTS_N_BYTES (5);
1821 0 : else if ((op1val & 7) == 0)
1822 0 : vcost = TARGET_AVX ? COSTS_N_BYTES (13) : COSTS_N_BYTES (18);
1823 0 : else if (op1val > 32 && op1val < 96)
1824 : vcost = COSTS_N_BYTES (24);
1825 : else
1826 0 : vcost = COSTS_N_BYTES (19);
1827 : }
1828 : else
1829 : {
1830 5 : scost = COSTS_N_INSNS (3);
1831 5 : if ((op1val & 31) == 0)
1832 : vcost = COSTS_N_INSNS (1);
1833 3 : else if ((op1val & 7) == 0)
1834 1 : vcost = TARGET_AVX ? COSTS_N_INSNS (3) : COSTS_N_INSNS (4);
1835 2 : else if (op1val > 32 && op1val < 96)
1836 : vcost = COSTS_N_INSNS (5);
1837 : else
1838 2 : vcost = COSTS_N_INSNS (1);
1839 : }
1840 5 : igain = scost - vcost;
1841 5 : break;
1842 :
1843 12 : case COMPARE:
1844 12 : if (XEXP (src, 1) == const0_rtx)
1845 : {
1846 8 : if (GET_CODE (XEXP (src, 0)) == AND)
1847 : /* and;and;or (9 bytes) vs. ptest (5 bytes). */
1848 : igain = !speed_p ? COSTS_N_BYTES (4) : COSTS_N_INSNS (2);
1849 : /* or (3 bytes) vs. ptest (5 bytes). */
1850 8 : else if (!speed_p)
1851 0 : igain = -COSTS_N_BYTES (2);
1852 : }
1853 4 : else if (XEXP (src, 1) == const1_rtx)
1854 : /* and;cmp -1 (7 bytes) vs. pcmpeqd;pxor;ptest (13 bytes). */
1855 0 : igain = !speed_p ? -COSTS_N_BYTES (6) : -COSTS_N_INSNS (1);
1856 : break;
1857 :
1858 122 : case ZERO_EXTEND:
1859 122 : if (GET_MODE (XEXP (src, 0)) == DImode)
1860 : /* xor (2 bytes) vs. vmovq (5 bytes). */
1861 122 : igain = speed_p ? COSTS_N_INSNS (1) - ix86_cost->sse_to_integer
1862 : : -COSTS_N_BYTES (3);
1863 : break;
1864 :
1865 : default:
1866 : break;
1867 : }
1868 :
1869 1823066 : gain += igain;
1870 932361 : if (speed_p)
1871 890705 : weighted_gain += bb_freq * igain;
1872 :
1873 932369 : if (igain != 0 && dump_file)
1874 : {
1875 0 : fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for ",
1876 : igain, bb_freq.to_double ());
1877 0 : dump_insn_slim (dump_file, insn);
1878 : }
1879 : }
1880 :
1881 473775 : if (dump_file)
1882 0 : fprintf (dump_file, " Total gain: %d, weighted gain %.2f\n",
1883 : gain, weighted_gain.to_double ());
1884 :
1885 473775 : if (weighted_gain > (sreal) 0)
1886 : return true;
1887 : else
1888 24307 : return gain > 0;
1889 : }
1890 :
1891 : /* Fix uses of converted REG in debug insns. */
1892 :
1893 : void
1894 427804 : timode_scalar_chain::fix_debug_reg_uses (rtx reg)
1895 : {
1896 427804 : if (!flag_var_tracking)
1897 : return;
1898 :
1899 375770 : df_ref ref, next;
1900 769301 : for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
1901 : {
1902 393531 : rtx_insn *insn = DF_REF_INSN (ref);
1903 : /* Make sure the next ref is for a different instruction,
1904 : so that we're not affected by the rescan. */
1905 393531 : next = DF_REF_NEXT_REG (ref);
1906 393531 : while (next && DF_REF_INSN (next) == insn)
1907 0 : next = DF_REF_NEXT_REG (next);
1908 :
1909 393531 : if (DEBUG_INSN_P (insn))
1910 : {
1911 : /* It may be a debug insn with a TImode variable in
1912 : register. */
1913 : bool changed = false;
1914 178 : for (; ref != next; ref = DF_REF_NEXT_REG (ref))
1915 : {
1916 89 : rtx *loc = DF_REF_LOC (ref);
1917 89 : if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
1918 : {
1919 85 : *loc = gen_rtx_SUBREG (TImode, *loc, 0);
1920 85 : changed = true;
1921 : }
1922 : }
1923 89 : if (changed)
1924 85 : df_insn_rescan (insn);
1925 : }
1926 : }
1927 : }
1928 :
1929 : /* Convert SRC, a *concatditi3 pattern, into a vec_concatv2di instruction.
1930 : Insert this before INSN, and return the result as a V1TImode subreg. */
1931 :
1932 : static rtx
1933 253 : timode_convert_concatdi (rtx src, rtx_insn *insn)
1934 : {
1935 253 : rtx hi, lo;
1936 253 : rtx tmp = gen_reg_rtx (V2DImode);
1937 253 : if (GET_CODE (XEXP (src, 0)) == ASHIFT)
1938 : {
1939 253 : hi = XEXP (XEXP (XEXP (src, 0), 0), 0);
1940 253 : lo = XEXP (XEXP (src, 1), 0);
1941 : }
1942 : else
1943 : {
1944 0 : hi = XEXP (XEXP (XEXP (src, 1), 0), 0);
1945 0 : lo = XEXP (XEXP (src, 0), 0);
1946 : }
1947 253 : emit_insn_before (gen_vec_concatv2di (tmp, lo, hi), insn);
1948 253 : return gen_rtx_SUBREG (V1TImode, tmp, 0);
1949 : }
1950 :
/* Convert INSN, a single_set in the chain, from TImode to V1TImode.
   The destination is retyped in place; the source is rewritten per its
   RTX code, emitting any needed setup instructions before INSN.  */

void
timode_scalar_chain::convert_insn (rtx_insn *insn)
{
  rtx def_set = single_set (insn);
  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);
  rtx tmp;

  /* First retype the destination.  */
  switch (GET_CODE (dst))
    {
    case REG:
      if (GET_MODE (dst) == TImode)
	{
	  /* Retype the pseudo and patch any debug insns that still
	     reference it in TImode.  */
	  PUT_MODE (dst, V1TImode);
	  fix_debug_reg_uses (dst);
	}
      if (GET_MODE (dst) == V1TImode)
	{
	  /* It might potentially be helpful to convert REG_EQUAL notes,
	     but for now we just remove them.  */
	  rtx note = find_reg_equal_equiv_note (insn);
	  if (note)
	    remove_note (insn, note);
	}
      break;
    case MEM:
      PUT_MODE (dst, V1TImode);
      break;

    default:
      gcc_unreachable ();
    }

  /* Now rewrite the source to match.  */
  switch (GET_CODE (src))
    {
    case REG:
      if (GET_MODE (src) == TImode)
	{
	  PUT_MODE (src, V1TImode);
	  fix_debug_reg_uses (src);
	}
      break;

    case MEM:
      PUT_MODE (src, V1TImode);
      break;

    case CONST_WIDE_INT:
      if (NONDEBUG_INSN_P (insn))
	{
	  /* Since there are no instructions to store 128-bit constant,
	     temporary register usage is required.  */
	  bool use_move;
	  start_sequence ();
	  /* Prefer a broadcast form of the constant when one exists;
	     otherwise fall back to a constant-pool load.  */
	  tmp = ix86_convert_const_wide_int_to_broadcast (TImode, src);
	  if (tmp)
	    {
	      src = lowpart_subreg (V1TImode, tmp, TImode);
	      use_move = true;
	    }
	  else
	    {
	      src = smode_convert_cst (src, V1TImode);
	      src = validize_mem (force_const_mem (V1TImode, src));
	      /* A mem-to-mem set is invalid; go through a register
		 when storing to memory.  */
	      use_move = MEM_P (dst);
	    }
	  rtx_insn *seq = end_sequence ();
	  if (seq)
	    emit_insn_before (seq, insn);
	  if (use_move)
	    {
	      tmp = gen_reg_rtx (V1TImode);
	      emit_insn_before (gen_rtx_SET (tmp, src), insn);
	      src = tmp;
	    }
	}
      break;

    case CONST_INT:
      /* Only all-zeros/all-ones survive the candidate check.  */
      switch (standard_sse_constant_p (src, TImode))
	{
	case 1:
	  src = CONST0_RTX (GET_MODE (dst));
	  break;
	case 2:
	  src = CONSTM1_RTX (GET_MODE (dst));
	  break;
	default:
	  gcc_unreachable ();
	}
      if (MEM_P (dst))
	{
	  /* Materialize the constant in a register for a store.  */
	  tmp = gen_reg_rtx (V1TImode);
	  emit_insn_before (gen_rtx_SET (tmp, src), insn);
	  src = tmp;
	}
      break;

    case AND:
      if (GET_CODE (XEXP (src, 0)) == NOT)
	{
	  /* andnot form: (and (not x) y).  */
	  convert_op (&XEXP (XEXP (src, 0), 0), insn);
	  convert_op (&XEXP (src, 1), insn);
	  PUT_MODE (XEXP (src, 0), V1TImode);
	  PUT_MODE (src, V1TImode);
	  break;
	}
      convert_op (&XEXP (src, 0), insn);
      convert_op (&XEXP (src, 1), insn);
      PUT_MODE (src, V1TImode);
      if (MEM_P (dst))
	{
	  tmp = gen_reg_rtx (V1TImode);
	  emit_insn_before (gen_rtx_SET (tmp, src), insn);
	  src = tmp;
	}
      break;

    case XOR:
    case IOR:
      /* A DImode-pair concatenation becomes a vec_concat.  */
      if (timode_concatdi_p (src))
	{
	  src = timode_convert_concatdi (src, insn);
	  break;
	}
      convert_op (&XEXP (src, 0), insn);
      convert_op (&XEXP (src, 1), insn);
      PUT_MODE (src, V1TImode);
      if (MEM_P (dst))
	{
	  tmp = gen_reg_rtx (V1TImode);
	  emit_insn_before (gen_rtx_SET (tmp, src), insn);
	  src = tmp;
	}
      break;

    case NOT:
      /* There is no vector one's complement; use xor with all-ones.  */
      src = XEXP (src, 0);
      convert_op (&src, insn);
      tmp = gen_reg_rtx (V1TImode);
      emit_insn_before (gen_move_insn (tmp, CONSTM1_RTX (V1TImode)), insn);
      src = gen_rtx_XOR (V1TImode, src, tmp);
      if (MEM_P (dst))
	{
	  tmp = gen_reg_rtx (V1TImode);
	  emit_insn_before (gen_rtx_SET (tmp, src), insn);
	  src = tmp;
	}
      break;

    case COMPARE:
      /* Equality compare against the flags register.  */
      dst = gen_rtx_REG (CCZmode, FLAGS_REG);
      src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
      break;

    case ASHIFT:
    case LSHIFTRT:
    case ASHIFTRT:
    case ROTATERT:
    case ROTATE:
      /* Shift amount is a validated CONST_INT; only the value operand
	 needs conversion.  */
      convert_op (&XEXP (src, 0), insn);
      PUT_MODE (src, V1TImode);
      break;

    case ZERO_EXTEND:
      if (GET_MODE (XEXP (src, 0)) == DImode)
	{
	  /* Convert to *vec_concatv2di_0.  */
	  rtx tmp = gen_reg_rtx (V2DImode);
	  rtx pat = gen_rtx_VEC_CONCAT (V2DImode, XEXP (src, 0), const0_rtx);
	  emit_insn_before (gen_move_insn (tmp, pat), insn);
	  src = gen_rtx_SUBREG (vmode, tmp, 0);
	}
      else
	gcc_unreachable ();
      break;

    case PLUS:
      /* Only the concatdi form of PLUS is a candidate.  */
      if (timode_concatdi_p (src))
	src = timode_convert_concatdi (src, insn);
      else
	gcc_unreachable ();
      break;

    default:
      gcc_unreachable ();
    }

  SET_SRC (def_set) = src;
  SET_DEST (def_set) = dst;

  /* Drop possible dead definitions.  */
  PATTERN (insn) = def_set;

  INSN_CODE (insn) = -1;
  recog_memoized (insn);
  df_insn_rescan (insn);
}
2151 :
2152 : /* Generate copies from defs used by the chain but not defined therein.
2153 : Also populates defs_map which is used later by convert_insn. */
2154 :
2155 : void
2156 643914 : scalar_chain::convert_registers ()
2157 : {
2158 643914 : bitmap_iterator bi;
2159 643914 : unsigned id;
2160 669336 : EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
2161 : {
2162 25422 : rtx chain_reg = gen_reg_rtx (smode);
2163 25422 : defs_map.put (regno_reg_rtx[id], chain_reg);
2164 : }
2165 651974 : EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
2166 20379 : for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
2167 12319 : if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
2168 8060 : make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
2169 643914 : }
2170 :
2171 : /* Convert whole chain creating required register
2172 : conversions and copies. */
2173 :
2174 : int
2175 643914 : scalar_chain::convert ()
2176 : {
2177 643914 : bitmap_iterator bi;
2178 643914 : unsigned id;
2179 643914 : int converted_insns = 0;
2180 :
2181 643914 : if (!dbg_cnt (stv_conversion))
2182 : return 0;
2183 :
2184 643914 : if (dump_file)
2185 0 : fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2186 :
2187 643914 : convert_registers ();
2188 :
2189 1980698 : EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2190 : {
2191 1336784 : rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
2192 1336784 : convert_insn_common (insn);
2193 1336784 : convert_insn (insn);
2194 1336784 : converted_insns++;
2195 : }
2196 :
2197 : return converted_insns;
2198 : }
2199 :
2200 : /* Return the SET expression if INSN doesn't reference hard register.
2201 : Return NULL if INSN uses or defines a hard register, excluding
2202 : pseudo register pushes, hard register uses in a memory address,
2203 : clobbers and flags definitions. */
2204 :
2205 : static rtx
2206 339055620 : pseudo_reg_set (rtx_insn *insn)
2207 : {
2208 339055620 : rtx set = single_set (insn);
2209 339055620 : if (!set)
2210 : return NULL;
2211 :
2212 : /* Check pseudo register push first. */
2213 135502964 : machine_mode mode = TARGET_64BIT ? TImode : DImode;
2214 135502964 : if (REG_P (SET_SRC (set))
2215 38238551 : && !HARD_REGISTER_P (SET_SRC (set))
2216 165331507 : && push_operand (SET_DEST (set), mode))
2217 : return set;
2218 :
2219 135250625 : df_ref ref;
2220 219146869 : FOR_EACH_INSN_DEF (ref, insn)
2221 120638935 : if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2222 64768779 : && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2223 170908075 : && DF_REF_REGNO (ref) != FLAGS_REG)
2224 : return NULL;
2225 :
2226 188714818 : FOR_EACH_INSN_USE (ref, insn)
2227 115658630 : if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2228 : return NULL;
2229 :
2230 : return set;
2231 : }
2232 :
2233 : /* Return true if the register REG is defined in a single DEF chain.
2234 : If it is defined in more than one DEF chains, we may not be able
2235 : to convert it in all chains. */
2236 :
2237 : static bool
2238 1159377 : single_def_chain_p (rtx reg)
2239 : {
2240 1159377 : df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
2241 1159377 : if (!ref)
2242 : return false;
2243 1159361 : return DF_REF_NEXT_REG (ref) == nullptr;
2244 : }
2245 :
/* Check if comparison INSN may be transformed into vector comparison.
   Currently we transform equality/inequality checks which look like:
   (set (reg:CCZ 17 flags) (compare:CCZ (reg:TI x) (reg:TI y)))  */

static bool
convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
{
  /* Only double-word compares qualify: TImode on 64-bit targets,
     DImode on 32-bit targets.  */
  if (mode != (TARGET_64BIT ? TImode : DImode))
    return false;

  /* The vector replacement requires SSE4.1 (presumably for the
     ptest-based sequence emitted by convert_compare — confirm there).  */
  if (!TARGET_SSE4_1)
    return false;

  rtx def_set = single_set (insn);

  gcc_assert (def_set);

  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  gcc_assert (GET_CODE (src) == COMPARE);

  /* Only equality/inequality results (CCZmode in the flags reg).  */
  if (!REG_P (dst)
      || REGNO (dst) != FLAGS_REG
      || GET_MODE (dst) != CCZmode)
    return false;

  rtx op1 = XEXP (src, 0);
  rtx op2 = XEXP (src, 1);

  /* *cmp<dwi>_doubleword: both operands are constants, registers or
     memory of the double-word mode.  */
  if ((CONST_SCALAR_INT_P (op1)
       || ((REG_P (op1) || MEM_P (op1))
	   && GET_MODE (op1) == mode))
      && (CONST_SCALAR_INT_P (op2)
	  || ((REG_P (op2) || MEM_P (op2))
	      && GET_MODE (op2) == mode)))
    return true;

  /* *testti_doubleword: (compare (and (reg:TI) x) 0).  Note this form
     is TImode-only by design, regardless of MODE.  */
  if (op2 == const0_rtx
      && GET_CODE (op1) == AND
      && REG_P (XEXP (op1, 0)))
    {
      rtx op12 = XEXP (op1, 1);
      return GET_MODE (XEXP (op1, 0)) == TImode
	     && (CONST_SCALAR_INT_P (op12)
		 || ((REG_P (op12) || MEM_P (op12))
		     && GET_MODE (op12) == TImode));
    }

  /* *test<dwi>_not_doubleword: (compare (and (not x) y) 0).  */
  if (op2 == const0_rtx
      && GET_CODE (op1) == AND
      && GET_CODE (XEXP (op1, 0)) == NOT)
    {
      rtx op11 = XEXP (XEXP (op1, 0), 0);
      rtx op12 = XEXP (op1, 1);
      return (REG_P (op11) || MEM_P (op11))
	     && (REG_P (op12) || MEM_P (op12))
	     && GET_MODE (op11) == mode
	     && GET_MODE (op12) == mode;
    }

  return false;
}
2312 :
/* The general version of scalar_to_vector_candidate_p.  Return true if
   INSN, a single_set free of hard-register references, is convertible
   to vector form in MODE (SImode or DImode).  */

static bool
general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
{
  rtx def_set = pseudo_reg_set (insn);

  if (!def_set)
    return false;

  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  if (GET_CODE (src) == COMPARE)
    return convertible_comparison_p (insn, mode);

  /* We are interested in "mode" only.  */
  if ((GET_MODE (src) != mode
       && !CONST_INT_P (src))
      || GET_MODE (dst) != mode)
    return false;

  if (!REG_P (dst) && !MEM_P (dst))
    return false;

  /* Cases that `break' out of this switch have their first operand
     validated by the common code after the switch; cases that `return'
     are fully decided here.  */
  switch (GET_CODE (src))
    {
    case ASHIFT:
    case LSHIFTRT:
    case ASHIFTRT:
    case ROTATE:
    case ROTATERT:
      /* Only constant shift counts within the mode's bit width.  */
      if (!CONST_INT_P (XEXP (src, 1))
	  || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
	return false;

      /* Check for extend highpart case.  */
      if (mode != DImode
	  || GET_CODE (src) != ASHIFTRT
	  || GET_CODE (XEXP (src, 0)) != ASHIFT)
	break;

      /* (ashiftrt (ashift x ..) ..): validate the inner shift's operand.  */
      src = XEXP (src, 0);
      break;

    case SMAX:
    case SMIN:
    case UMAX:
    case UMIN:
      /* Vector min/max need AVX512VL for DImode, SSE4.1 for SImode.  */
      if ((mode == DImode && !TARGET_AVX512VL)
	  || (mode == SImode && !TARGET_SSE4_1))
	return false;
      /* Fallthru.  */

    case AND:
    case IOR:
    case XOR:
    case PLUS:
    case MINUS:
      if (!REG_P (XEXP (src, 1))
	  && !MEM_P (XEXP (src, 1))
	  && !CONST_INT_P (XEXP (src, 1)))
	return false;

      if (GET_MODE (XEXP (src, 1)) != mode
	  && !CONST_INT_P (XEXP (src, 1)))
	return false;

      /* Check for andnot case.  */
      if (GET_CODE (src) != AND
	  || GET_CODE (XEXP (src, 0)) != NOT)
	break;

      /* (and (not x) y): validate x via the NOT case below.  */
      src = XEXP (src, 0);
      /* FALLTHRU */

    case NOT:
      break;

    case NEG:
      /* Check for nabs case.  */
      if (GET_CODE (XEXP (src, 0)) != ABS)
	break;

      src = XEXP (src, 0);
      /* FALLTHRU */

    case ABS:
      /* Vector abs needs AVX512VL for DImode, SSSE3 for SImode.  */
      if ((mode == DImode && !TARGET_AVX512VL)
	  || (mode == SImode && !TARGET_SSSE3))
	return false;
      break;

    case REG:
      return true;

    case MEM:
    case CONST_INT:
      /* Plain loads/constant moves only into a register destination.  */
      return REG_P (dst);

    case VEC_SELECT:
      /* Excluding MEM_P (dst) avoids intefering with vpextr[dq].  */
      return REG_P (dst)
	     && REG_P (XEXP (src, 0))
	     && GET_MODE (XEXP (src, 0)) == (mode == DImode ? V2DImode
							    : V4SImode)
	     && GET_CODE (XEXP (src, 1)) == PARALLEL
	     && XVECLEN (XEXP (src, 1), 0) == 1
	     && CONST_INT_P (XVECEXP (XEXP (src, 1), 0, 0));

    default:
      return false;
    }

  /* Common validation of the (possibly rewritten) first operand.  */
  if (!REG_P (XEXP (src, 0))
      && !MEM_P (XEXP (src, 0))
      && !CONST_INT_P (XEXP (src, 0)))
    return false;

  if (GET_MODE (XEXP (src, 0)) != mode
      && !CONST_INT_P (XEXP (src, 0)))
    return false;

  return true;
}
2438 :
2439 : /* Check for a suitable TImode memory operand. */
2440 :
2441 : static bool
2442 1565 : timode_mem_p (rtx x)
2443 : {
2444 1565 : return MEM_P (x)
2445 1565 : && (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
2446 0 : || !misaligned_operand (x, TImode));
2447 : }
2448 :
/* The TImode version of scalar_to_vector_candidate_p.  Return true if
   INSN is a single_set convertible to V1TImode vector form.  */

static bool
timode_scalar_to_vector_candidate_p (rtx_insn *insn)
{
  rtx def_set = pseudo_reg_set (insn);

  if (!def_set)
    return false;

  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  if (GET_CODE (src) == COMPARE)
    return convertible_comparison_p (insn, TImode);

  /* Destination must be TImode; source TImode or an integer constant.  */
  if (GET_MODE (dst) != TImode
      || (GET_MODE (src) != TImode
	  && !CONST_SCALAR_INT_P (src)))
    return false;

  if (!REG_P (dst) && !MEM_P (dst))
    return false;

  /* A misaligned store is only worthwhile when unaligned SSE stores
     are optimal.  */
  if (MEM_P (dst)
      && misaligned_operand (dst, TImode)
      && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
    return false;

  /* Multiply-defined destinations may not be convertible everywhere.  */
  if (REG_P (dst) && !single_def_chain_p (dst))
    return false;

  switch (GET_CODE (src))
    {
    case REG:
      return single_def_chain_p (src);

    case CONST_WIDE_INT:
      return true;

    case CONST_INT:
      /* ??? Verify performance impact before enabling CONST_INT for
	 __int128 store.  */
      return standard_sse_constant_p (src, TImode);

    case MEM:
      /* Memory must be aligned or unaligned load is optimal.  */
      return (REG_P (dst)
	      && (!misaligned_operand (src, TImode)
		  || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));

    case AND:
      /* andnot form: (and (not reg) reg/const/mem), register dest.  */
      if (!MEM_P (dst)
	  && GET_CODE (XEXP (src, 0)) == NOT
	  && REG_P (XEXP (XEXP (src, 0), 0))
	  && (REG_P (XEXP (src, 1))
	      || CONST_SCALAR_INT_P (XEXP (src, 1))
	      || timode_mem_p (XEXP (src, 1))))
	return true;
      return (REG_P (XEXP (src, 0))
	      || timode_mem_p (XEXP (src, 0)))
	     && (REG_P (XEXP (src, 1))
		 || CONST_SCALAR_INT_P (XEXP (src, 1))
		 || timode_mem_p (XEXP (src, 1)));

    case IOR:
    case XOR:
      /* A DImode-pair concatenation is always convertible.  */
      if (timode_concatdi_p (src))
	return true;
      return (REG_P (XEXP (src, 0))
	      || timode_mem_p (XEXP (src, 0)))
	     && (REG_P (XEXP (src, 1))
		 || CONST_SCALAR_INT_P (XEXP (src, 1))
		 || timode_mem_p (XEXP (src, 1)));

    case NOT:
      return REG_P (XEXP (src, 0)) || timode_mem_p (XEXP (src, 0));

    case ASHIFT:
    case LSHIFTRT:
    case ASHIFTRT:
    case ROTATERT:
    case ROTATE:
      /* Handle shifts/rotates by integer constants between 0 and 127.  */
      return REG_P (XEXP (src, 0))
	     && CONST_INT_P (XEXP (src, 1))
	     && (INTVAL (XEXP (src, 1)) & ~0x7f) == 0;

    case PLUS:
      /* PLUS qualifies only in its concatdi form.  */
      return timode_concatdi_p (src);

    case ZERO_EXTEND:
      return REG_P (XEXP (src, 0))
	     && GET_MODE (XEXP (src, 0)) == DImode;

    default:
      return false;
    }
}
2548 :
2549 : /* For a register REGNO, scan instructions for its defs and uses.
2550 : Put REGNO in REGS if a def or use isn't in CANDIDATES. */
2551 :
2552 : static void
2553 1284857 : timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
2554 : unsigned int regno)
2555 : {
2556 : /* Do nothing if REGNO is already in REGS or is a hard reg. */
2557 1284857 : if (bitmap_bit_p (regs, regno)
2558 1284857 : || HARD_REGISTER_NUM_P (regno))
2559 : return;
2560 :
2561 1272245 : for (df_ref def = DF_REG_DEF_CHAIN (regno);
2562 2520025 : def;
2563 1247780 : def = DF_REF_NEXT_REG (def))
2564 : {
2565 1272225 : if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2566 : {
2567 24445 : if (dump_file)
2568 0 : fprintf (dump_file,
2569 : "r%d has non convertible def in insn %d\n",
2570 0 : regno, DF_REF_INSN_UID (def));
2571 :
2572 24445 : bitmap_set_bit (regs, regno);
2573 24445 : break;
2574 : }
2575 : }
2576 :
2577 1272245 : for (df_ref ref = DF_REG_USE_CHAIN (regno);
2578 2795321 : ref;
2579 1523076 : ref = DF_REF_NEXT_REG (ref))
2580 : {
2581 : /* Debug instructions are skipped. */
2582 1587371 : if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
2583 1587371 : && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
2584 : {
2585 64295 : if (dump_file)
2586 0 : fprintf (dump_file,
2587 : "r%d has non convertible use in insn %d\n",
2588 0 : regno, DF_REF_INSN_UID (ref));
2589 :
2590 64295 : bitmap_set_bit (regs, regno);
2591 64295 : break;
2592 : }
2593 : }
2594 : }
2595 :
/* For a given bitmap of insn UIDs scans all instructions and
   remove insn from CANDIDATES in case it has both convertible
   and not convertible definitions.

   All insns in a bitmap are conversion candidates according to
   scalar_to_vector_candidate_p.  Currently it implies all insns
   are single_set.  */

static void
timode_remove_non_convertible_regs (bitmap candidates)
{
  bitmap_iterator bi;
  unsigned id;
  bitmap regs = BITMAP_ALLOC (NULL);
  bool changed;

  /* Iterate to a fixed point: removing one candidate can expose
     further registers with non-convertible defs/uses.  */
  do {
    changed = false;
    /* Collect into REGS every TImode register touched by a candidate
       insn that also has a def or use outside CANDIDATES.  */
    EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
      {
	rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
	df_ref ref;

	FOR_EACH_INSN_DEF (ref, insn)
	  if (!DF_REF_REG_MEM_P (ref)
	      && GET_MODE (DF_REF_REG (ref)) == TImode)
	    timode_check_non_convertible_regs (candidates, regs,
					       DF_REF_REGNO (ref));

	FOR_EACH_INSN_USE (ref, insn)
	  if (!DF_REF_REG_MEM_P (ref)
	      && GET_MODE (DF_REF_REG (ref)) == TImode)
	    timode_check_non_convertible_regs (candidates, regs,
					       DF_REF_REGNO (ref));
      }

    /* Drop every candidate insn that defines or uses a flagged reg.  */
    EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
      {
	for (df_ref def = DF_REG_DEF_CHAIN (id);
	     def;
	     def = DF_REF_NEXT_REG (def))
	  if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
	    {
	      if (dump_file)
		fprintf (dump_file, "Removing insn %d from candidates list\n",
			 DF_REF_INSN_UID (def));

	      bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
	      changed = true;
	    }

	for (df_ref ref = DF_REG_USE_CHAIN (id);
	     ref;
	     ref = DF_REF_NEXT_REG (ref))
	  if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
	    {
	      if (dump_file)
		fprintf (dump_file, "Removing insn %d from candidates list\n",
			 DF_REF_INSN_UID (ref));

	      bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
	      changed = true;
	    }
      }
  } while (changed);

  BITMAP_FREE (regs);
}
2664 :
/* Main STV pass function.  Find and convert scalar
   instructions into vector mode when profitable.
   TIMODE_P selects the TImode flavor of the pass (64-bit only);
   otherwise SImode/DImode candidates are considered.  */

static unsigned int
convert_scalars_to_vector (bool timode_p)
{
  basic_block bb;
  int converted_insns = 0;
  auto_vec<rtx_insn *> control_flow_insns;

  bitmap_obstack_initialize (NULL);
  const machine_mode cand_mode[3] = { SImode, DImode, TImode };
  const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
  bitmap_head candidates[3];  /* { SImode, DImode, TImode } */
  for (unsigned i = 0; i < 3; ++i)
    bitmap_initialize (&candidates[i], &bitmap_default_obstack);

  /* Set up def-use/use-def chains; rescans are deferred so candidate
     scanning sees a stable df state.  */
  calculate_dominance_info (CDI_DOMINATORS);
  df_set_flags (DF_DEFER_INSN_RESCAN | DF_RD_PRUNE_DEAD_DEFS);
  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
  df_analyze ();

  /* Find all instructions we want to convert into vector mode.  */
  if (dump_file)
    fprintf (dump_file, "Searching for mode conversion candidates...\n");

  FOR_EACH_BB_FN (bb, cfun)
    {
      rtx_insn *insn;
      FOR_BB_INSNS (bb, insn)
	if (timode_p
	    && timode_scalar_to_vector_candidate_p (insn))
	  {
	    if (dump_file)
	      fprintf (dump_file, "  insn %d is marked as a TImode candidate\n",
		       INSN_UID (insn));

	    bitmap_set_bit (&candidates[2], INSN_UID (insn));
	  }
	else if (!timode_p)
	  {
	    /* Check {SI,DI}mode.  */
	    for (unsigned i = 0; i <= 1; ++i)
	      if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
		{
		  if (dump_file)
		    fprintf (dump_file, "  insn %d is marked as a %s candidate\n",
			     INSN_UID (insn), i == 0 ? "SImode" : "DImode");

		  bitmap_set_bit (&candidates[i], INSN_UID (insn));
		  break;
		}
	  }
    }

  if (timode_p)
    timode_remove_non_convertible_regs (&candidates[2]);

  /* Dump-only scan: report when no candidate set is non-empty.  */
  for (unsigned i = 0; i <= 2; ++i)
    if (!bitmap_empty_p (&candidates[i]))
      break;
    else if (i == 2 && dump_file)
      fprintf (dump_file, "There are no candidates for optimization.\n");

  /* Build and convert chains, one candidate set at a time.  */
  for (unsigned i = 0; i <= 2; ++i)
    {
      auto_bitmap disallowed;
      bitmap_tree_view (&candidates[i]);
      while (!bitmap_empty_p (&candidates[i]))
	{
	  unsigned uid = bitmap_first_set_bit (&candidates[i]);
	  scalar_chain *chain;

	  if (cand_mode[i] == TImode)
	    chain = new timode_scalar_chain;
	  else
	    chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);

	  /* Find instructions chain we want to convert to vector mode.
	     Check all uses and definitions to estimate all required
	     conversions.  */
	  if (chain->build (&candidates[i], uid, disallowed))
	    {
	      if (chain->compute_convert_gain ())
		converted_insns += chain->convert ();
	      else if (dump_file)
		fprintf (dump_file, "Chain #%d conversion is not profitable\n",
			 chain->chain_id);
	    }

	  /* Remember insns that became control flow (e.g. may trap) so
	     their blocks can be split afterwards.  */
	  rtx_insn* iter_insn;
	  unsigned int ii;
	  FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
	    control_flow_insns.safe_push (iter_insn);

	  delete chain;
	}
    }

  if (dump_file)
    fprintf (dump_file, "Total insns converted: %d\n", converted_insns);

  for (unsigned i = 0; i <= 2; ++i)
    bitmap_release (&candidates[i]);
  bitmap_obstack_release (NULL);
  df_process_deferred_rescans ();

  /* Conversion means we may have 128bit register spills/fills
     which require aligned stack.  */
  if (converted_insns)
    {
      if (crtl->stack_alignment_needed < 128)
	crtl->stack_alignment_needed = 128;
      if (crtl->stack_alignment_estimated < 128)
	crtl->stack_alignment_estimated = 128;

      crtl->stack_realign_needed
	= INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
      crtl->stack_realign_tried = crtl->stack_realign_needed;

      crtl->stack_realign_processed = true;

      if (!crtl->drap_reg)
	{
	  rtx drap_rtx = targetm.calls.get_drap_rtx ();

	  /* stack_realign_drap and drap_rtx must match.  */
	  gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));

	  /* Do nothing if NULL is returned,
	     which means DRAP is not needed.  */
	  if (drap_rtx != NULL)
	    {
	      crtl->args.internal_arg_pointer = drap_rtx;

	      /* Call fixup_tail_calls to clean up
		 REG_EQUIV note if DRAP is needed.  */
	      fixup_tail_calls ();
	    }
	}

      /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments.  */
      if (TARGET_64BIT)
	for (tree parm = DECL_ARGUMENTS (current_function_decl);
	     parm; parm = DECL_CHAIN (parm))
	  {
	    if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
	      continue;
	    /* A converted argument register is re-exposed to debug/RTL
	       consumers as a TImode subreg of the V1TImode pseudo.  */
	    if (DECL_RTL_SET_P (parm)
		&& GET_MODE (DECL_RTL (parm)) == V1TImode)
	      {
		rtx r = DECL_RTL (parm);
		if (REG_P (r))
		  SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
	      }
	    if (DECL_INCOMING_RTL (parm)
		&& GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
	      {
		rtx r = DECL_INCOMING_RTL (parm);
		if (REG_P (r))
		  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
	      }
	  }

      if (!control_flow_insns.is_empty ())
	{
	  free_dominance_info (CDI_DOMINATORS);

	  unsigned int i;
	  rtx_insn* insn;
	  FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
	    if (control_flow_insn_p (insn))
	      {
		/* Split the block after insn.  There will be a fallthru
		   edge, which is OK so we keep it.  We have to create
		   the exception edges ourselves.  */
		bb = BLOCK_FOR_INSN (insn);
		split_block (bb, insn);
		rtl_make_eh_edge (NULL, bb, BB_END (bb));
	      }
	}
    }

  return 0;
}
2850 :
2851 : static unsigned int
2852 74443 : rest_of_handle_insert_vzeroupper (void)
2853 : {
2854 : /* vzeroupper instructions are inserted immediately after reload and
2855 : postreload_cse to clean up after it a little bit to account for possible
2856 : spills from 256bit or 512bit registers. The pass reuses mode switching
2857 : infrastructure by re-running mode insertion pass, so disable entities
2858 : that have already been processed. */
2859 521101 : for (int i = 0; i < MAX_386_ENTITIES; i++)
2860 446658 : ix86_optimize_mode_switching[i] = 0;
2861 :
2862 74443 : ix86_optimize_mode_switching[AVX_U128] = 1;
2863 :
2864 : /* Call optimize_mode_switching. */
2865 74443 : g->get_passes ()->execute_pass_mode_switching ();
2866 :
2867 : /* LRA removes all REG_DEAD/REG_UNUSED notes and normally they
2868 : reappear in the IL only at the start of pass_rtl_dse2, which does
2869 : df_note_add_problem (); df_analyze ();
2870 : The vzeroupper is scheduled after postreload_cse pass and mode
2871 : switching computes the notes as well, the problem is that e.g.
2872 : pass_gcse2 doesn't maintain the notes, see PR113059 and
2873 : PR112760. Remove the notes now to restore status quo ante
2874 : until we figure out how to maintain the notes or what else
2875 : to do. */
2876 74443 : basic_block bb;
2877 74443 : rtx_insn *insn;
2878 409381 : FOR_EACH_BB_FN (bb, cfun)
2879 4319679 : FOR_BB_INSNS (bb, insn)
2880 3984741 : if (NONDEBUG_INSN_P (insn))
2881 : {
2882 2121773 : rtx *pnote = ®_NOTES (insn);
2883 3934874 : while (*pnote != 0)
2884 : {
2885 1813101 : if (REG_NOTE_KIND (*pnote) == REG_DEAD
2886 830088 : || REG_NOTE_KIND (*pnote) == REG_UNUSED)
2887 1300583 : *pnote = XEXP (*pnote, 1);
2888 : else
2889 512518 : pnote = &XEXP (*pnote, 1);
2890 : }
2891 : }
2892 :
2893 74443 : df_remove_problem (df_note);
2894 74443 : df_analyze ();
2895 74443 : return 0;
2896 : }
2897 :
2898 : namespace {
2899 :
2900 : const pass_data pass_data_insert_vzeroupper =
2901 : {
2902 : RTL_PASS, /* type */
2903 : "vzeroupper", /* name */
2904 : OPTGROUP_NONE, /* optinfo_flags */
2905 : TV_MACH_DEP, /* tv_id */
2906 : 0, /* properties_required */
2907 : 0, /* properties_provided */
2908 : 0, /* properties_destroyed */
2909 : 0, /* todo_flags_start */
2910 : TODO_df_finish, /* todo_flags_finish */
2911 : };
2912 :
/* RTL pass that inserts vzeroupper instructions; see
   rest_of_handle_insert_vzeroupper for the actual work.  */
class pass_insert_vzeroupper : public rtl_opt_pass
{
public:
  pass_insert_vzeroupper(gcc::context *ctxt)
    : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
  {}

  /* opt_pass methods: */
  /* Run only for AVX targets with vzeroupper generation enabled.  */
  bool gate (function *) final override
  {
    return TARGET_AVX && TARGET_VZEROUPPER;
  }

  unsigned int execute (function *) final override
  {
    return rest_of_handle_insert_vzeroupper ();
  }

}; // class pass_insert_vzeroupper
2932 :
/* Pass descriptor for the scalar-to-vector (STV) pass; shared by both
   the SImode/DImode and the TImode instances (see pass_stv::timode_p).  */
const pass_data pass_data_stv =
{
  RTL_PASS, /* type */
  "stv", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
};
2945 :
/* The scalar-to-vector RTL pass.  Instantiated twice via clone(); the
   pass manager selects the TImode flavor through set_pass_param.  */
class pass_stv : public rtl_opt_pass
{
public:
  pass_stv (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_stv, ctxt),
      timode_p (false)
  {}

  /* opt_pass methods: */
  /* The TImode flavor requires a 64-bit target; both flavors require
     -mstv, SSE2 and -O2 or higher.  */
  bool gate (function *) final override
  {
    return ((!timode_p || TARGET_64BIT)
	    && TARGET_STV && TARGET_SSE2 && optimize > 1);
  }

  unsigned int execute (function *) final override
  {
    return convert_scalars_to_vector (timode_p);
  }

  opt_pass *clone () final override
  {
    return new pass_stv (m_ctxt);
  }

  /* Single bool parameter: true selects the TImode instance.  */
  void set_pass_param (unsigned int n, bool param) final override
  {
    gcc_assert (n == 0);
    timode_p = param;
  }

private:
  bool timode_p;
}; // class pass_stv
2980 :
2981 : } // anon namespace
2982 :
2983 : rtl_opt_pass *
2984 285722 : make_pass_insert_vzeroupper (gcc::context *ctxt)
2985 : {
2986 285722 : return new pass_insert_vzeroupper (ctxt);
2987 : }
2988 :
2989 : rtl_opt_pass *
2990 285722 : make_pass_stv (gcc::context *ctxt)
2991 : {
2992 285722 : return new pass_stv (ctxt);
2993 : }
2994 :
/* Inserting ENDBR and pseudo patchable-area instructions.

   NEED_ENDBR: emit CET ENDBR markers (an entry ENDBR unless the
   function can only be reached by direct calls, plus ENDBRs after
   setjmp-like calls, after calls to "indirect_return" functions, at
   switch-table targets under flag_cet_switch, and after preserved
   labels).  PATCHABLE_AREA_SIZE: number of remaining patchable-area
   bytes to emit at the function entry, 0 if none.  When profiling
   with -mfentry, entry insertions are queued for
   x86_function_profiler instead of being emitted here.  */

static void
rest_of_insert_endbr_and_patchable_area (bool need_endbr,
					 unsigned int patchable_area_size)
{
  rtx endbr;
  rtx_insn *insn;
  /* The entry ENDBR, if emitted here; a patchable area goes right
     after it.  */
  rtx_insn *endbr_insn = NULL;
  basic_block bb;

  if (need_endbr)
    {
      /* Currently emit EB if it's a tracking function, i.e. 'nocf_check'
	 is absent among function attributes.  Later an optimization will
	 be introduced to make analysis if an address of a static function
	 is taken.  A static function whose address is not taken will get
	 a nocf_check attribute.  This will allow to reduce the number of
	 EB.  */
      if (!lookup_attribute ("nocf_check",
			     TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
	  && (!flag_manual_endbr
	      || lookup_attribute ("cf_check",
				   DECL_ATTRIBUTES (cfun->decl)))
	  && (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
	      || ix86_cmodel == CM_LARGE
	      || ix86_cmodel == CM_LARGE_PIC
	      || flag_force_indirect_call
	      || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
		  && DECL_DLLIMPORT_P (cfun->decl))))
	{
	  if (crtl->profile && flag_fentry)
	    {
	      /* Queue ENDBR insertion to x86_function_profiler.
		 NB: Any patchable-area insn will be inserted after
		 ENDBR.  */
	      cfun->machine->insn_queued_at_entrance = TYPE_ENDBR;
	    }
	  else
	    {
	      /* Emit the entry ENDBR at the head of the first real
		 basic block.  NB: this INSN intentionally shadows the
		 outer one.  */
	      endbr = gen_nop_endbr ();
	      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
	      rtx_insn *insn = BB_HEAD (bb);
	      endbr_insn = emit_insn_before (endbr, insn);
	    }
	}
    }

  if (patchable_area_size)
    {
      if (crtl->profile && flag_fentry)
	{
	  /* Queue patchable-area insertion to x86_function_profiler.
	     NB: If there is a queued ENDBR, x86_function_profiler
	     will also handle patchable-area.  */
	  if (!cfun->machine->insn_queued_at_entrance)
	    cfun->machine->insn_queued_at_entrance = TYPE_PATCHABLE_AREA;
	}
      else
	{
	  /* The second operand records whether the area is at the very
	     function entry (no bytes were emitted before the label).  */
	  rtx patchable_area
	    = gen_patchable_area (GEN_INT (patchable_area_size),
				  GEN_INT (crtl->patch_area_entry == 0));
	  if (endbr_insn)
	    emit_insn_after (patchable_area, endbr_insn);
	  else
	    {
	      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
	      insn = BB_HEAD (bb);
	      emit_insn_before (patchable_area, insn);
	    }
	}
    }

  if (!need_endbr)
    return;

  /* Second phase: scan the whole function body for places that need
     an ENDBR besides the function entry.  */
  bb = 0;
  FOR_EACH_BB_FN (bb, cfun)
    {
      for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
	   insn = NEXT_INSN (insn))
	{
	  if (CALL_P (insn))
	    {
	      /* A setjmp-like call (REG_SETJMP note) can return via an
		 indirect branch, so its return address needs an
		 ENDBR.  */
	      need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
	      if (!need_endbr && !SIBLING_CALL_P (insn))
		{
		  rtx call = get_call_rtx_from (insn);
		  rtx fnaddr = XEXP (call, 0);
		  tree fndecl = NULL_TREE;

		  /* Also generate ENDBRANCH for non-tail call which
		     may return via indirect branch.  */
		  if (SYMBOL_REF_P (XEXP (fnaddr, 0)))
		    fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
		  if (fndecl == NULL_TREE)
		    fndecl = MEM_EXPR (fnaddr);
		  /* Only FUNCTION_TYPE/METHOD_TYPE decls can carry the
		     attribute checked below.  */
		  if (fndecl
		      && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
		      && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
		    fndecl = NULL_TREE;
		  if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
		    {
		      tree fntype = TREE_TYPE (fndecl);
		      if (lookup_attribute ("indirect_return",
					    TYPE_ATTRIBUTES (fntype)))
			need_endbr = true;
		    }
		}
	      if (!need_endbr)
		continue;
	      /* Generate ENDBRANCH after CALL, which can return more than
		 twice, setjmp-like functions.  */

	      endbr = gen_nop_endbr ();
	      emit_insn_after_setloc (endbr, insn, INSN_LOCATION (insn));
	      continue;
	    }

	  if (JUMP_P (insn) && flag_cet_switch)
	    {
	      rtx target = JUMP_LABEL (insn);
	      if (target == NULL_RTX || ANY_RETURN_P (target))
		continue;

	      /* Check the jump is a switch table.  */
	      rtx_insn *label = as_a<rtx_insn *> (target);
	      rtx_insn *table = next_insn (label);
	      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
		continue;

	      /* For the indirect jump find out all places it jumps and insert
		 ENDBRANCH there.  It should be done under a special flag to
		 control ENDBRANCH generation for switch stmts.  */
	      edge_iterator ei;
	      edge e;
	      basic_block dest_blk;

	      FOR_EACH_EDGE (e, ei, bb->succs)
		{
		  /* NB: deliberately shadows the outer INSN; the outer
		     iteration state is untouched.  */
		  rtx_insn *insn;

		  dest_blk = e->dest;
		  insn = BB_HEAD (dest_blk);
		  gcc_assert (LABEL_P (insn));
		  endbr = gen_nop_endbr ();
		  emit_insn_after (endbr, insn);
		}
	      continue;
	    }

	  /* Preserved labels may be reached indirectly (e.g. their
	     address is taken), so they need an ENDBR too.  */
	  if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
	    {
	      endbr = gen_nop_endbr ();
	      emit_insn_after (endbr, insn);
	      continue;
	    }
	}
    }

  return;
}
3158 :
3159 : namespace {
3160 :
/* Pass descriptor for the ENDBR/patchable-area insertion pass (see
   pass_insert_endbr_and_patchable_area below).  */
const pass_data pass_data_insert_endbr_and_patchable_area =
{
  RTL_PASS, /* type.  */
  "endbr_and_patchable_area", /* name.  */
  OPTGROUP_NONE, /* optinfo_flags.  */
  TV_MACH_DEP, /* tv_id.  */
  0, /* properties_required.  */
  0, /* properties_provided.  */
  0, /* properties_destroyed.  */
  0, /* todo_flags_start.  */
  0, /* todo_flags_finish.  */
};
3173 :
3174 : class pass_insert_endbr_and_patchable_area : public rtl_opt_pass
3175 : {
3176 : public:
3177 285722 : pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
3178 571444 : : rtl_opt_pass (pass_data_insert_endbr_and_patchable_area, ctxt)
3179 : {}
3180 :
3181 : /* opt_pass methods: */
3182 1471370 : bool gate (function *) final override
3183 : {
3184 1471370 : need_endbr = (flag_cf_protection & CF_BRANCH) != 0;
3185 1471370 : patchable_area_size = crtl->patch_area_size - crtl->patch_area_entry;
3186 1471370 : return need_endbr || patchable_area_size;
3187 : }
3188 :
3189 198192 : unsigned int execute (function *) final override
3190 : {
3191 198192 : timevar_push (TV_MACH_DEP);
3192 198192 : rest_of_insert_endbr_and_patchable_area (need_endbr,
3193 : patchable_area_size);
3194 198192 : timevar_pop (TV_MACH_DEP);
3195 198192 : return 0;
3196 : }
3197 :
3198 : private:
3199 : bool need_endbr;
3200 : unsigned int patchable_area_size;
3201 : }; // class pass_insert_endbr_and_patchable_area
3202 :
3203 : } // anon namespace
3204 :
3205 : rtl_opt_pass *
3206 285722 : make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
3207 : {
3208 285722 : return new pass_insert_endbr_and_patchable_area (ctxt);
3209 : }
3210 :
3211 : bool
3212 6116892 : ix86_rpad_gate ()
3213 : {
3214 6116892 : return (TARGET_AVX
3215 403983 : && TARGET_SSE_PARTIAL_REG_DEPENDENCY
3216 309088 : && TARGET_SSE_MATH
3217 308858 : && optimize
3218 6420547 : && optimize_function_for_speed_p (cfun));
3219 : }
3220 :
/* Kinds of redundant instructions eliminated by the x86 CSE
   machinery (see struct redundant_pattern).  */
enum x86_cse_kind
{
  /* All-zero vector constant.  */
  X86_CSE_CONST0_VECTOR,
  /* All-ones vector constant.  */
  X86_CSE_CONSTM1_VECTOR,
  /* Broadcast (vec_duplicate) of a scalar.  */
  X86_CSE_VEC_DUP,
  /* TLS global-dynamic call sequence.  */
  X86_CSE_TLS_GD,
  /* TLS local-dynamic base call sequence.  */
  X86_CSE_TLS_LD_BASE,
  /* GNU2 TLS descriptor (UNSPEC_TLSDESC) sequence.  */
  X86_CSE_TLSDESC
};
3230 :
/* One equivalence class of redundant instructions — broadcasts of the
   same inner scalar, or matching TLS call sequences — which can be
   replaced by a single instruction placed at a dominating point (see
   ix86_place_single_vector_set).  */
struct redundant_pattern
{
  /* Bitmap of basic blocks with broadcast instructions.  */
  auto_bitmap bbs;
  /* Bitmap of broadcast instructions.  */
  auto_bitmap insns;
  /* The broadcast inner scalar.  */
  rtx val;
  /* The actual redundant source value for UNSPEC_TLSDESC.  */
  rtx tlsdesc_val;
  /* The inner scalar mode.  */
  machine_mode mode;
  /* The instruction which sets the inner scalar.  Nullptr if the inner
     scalar is applied to the whole function, instead of within the same
     block.  */
  rtx_insn *def_insn;
  /* The widest broadcast source.  */
  rtx broadcast_source;
  /* The widest broadcast register.  */
  rtx broadcast_reg;
  /* The basic block of the broadcast instruction.  */
  basic_block bb;
  /* The number of broadcast instructions with the same inner scalar.  */
  unsigned HOST_WIDE_INT count;
  /* The threshold of broadcast instructions with the same inner
     scalar.  */
  unsigned int threshold;
  /* The widest broadcast size in bytes.  */
  unsigned int size;
  /* Load kind.  */
  x86_cse_kind kind;
};
3263 :
/* Generate a vector set, DEST = SRC, at entry of the nearest dominator
   for basic block map BBS, which is in the fake loop that contains the
   whole function, so that there is only a single vector set in the
   whole function.  If not nullptr, LOAD is a pointer to the load.  */

static void
ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
			      redundant_pattern *load = nullptr)
{
  basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
  /* For X86_CSE_VEC_DUP, don't place the vector set outside of the loop
     to avoid extra spills.  */
  if (!load || load->kind != X86_CSE_VEC_DUP)
    {
      /* Hoist out of any real loop: the fake whole-function loop is
	 recognized by its latch being the exit block.  */
      while (bb->loop_father->latch
	     != EXIT_BLOCK_PTR_FOR_FN (cfun))
	bb = get_immediate_dominator (CDI_DOMINATORS,
				      bb->loop_father->header);
    }

  rtx set = gen_rtx_SET (dest, src);

  /* Find the first non-debug insn of BB; INSN becomes NULL if the
     block contains none (only notes/labels/debug insns).  */
  rtx_insn *insn = BB_HEAD (bb);
  while (insn && !NONDEBUG_INSN_P (insn))
    {
      if (insn == BB_END (bb))
	{
	  insn = NULL;
	  break;
	}
      insn = NEXT_INSN (insn);
    }

  rtx_insn *set_insn;
  if (insn == BB_HEAD (bb))
    {
      /* The block starts directly with a real insn; emit before it.  */
      set_insn = emit_insn_before (set, insn);
      if (dump_file)
	{
	  fprintf (dump_file, "\nPlace:\n\n");
	  print_rtl_single (dump_file, set_insn);
	  fprintf (dump_file, "\nbefore:\n\n");
	  print_rtl_single (dump_file, insn);
	  fprintf (dump_file, "\n");
	}
    }
  else
    {
      /* Otherwise emit after the notes/labels that precede the first
	 real insn (or at the end of an insn-free block).  */
      rtx_insn *after = insn ? PREV_INSN (insn) : BB_END (bb);
      set_insn = emit_insn_after (set, after);
      if (dump_file)
	{
	  fprintf (dump_file, "\nPlace:\n\n");
	  print_rtl_single (dump_file, set_insn);
	  fprintf (dump_file, "\nafter:\n\n");
	  print_rtl_single (dump_file, after);
	  fprintf (dump_file, "\n");
	}
    }

  if (load && load->kind == X86_CSE_VEC_DUP)
    {
      /* Get the source from LOAD as (reg:SI 99) in

	   (vec_duplicate:V4SI (reg:SI 99))

       */
      rtx inner_scalar = load->val;
      /* Set the source in (vec_duplicate:V4SI (reg:SI 99)).  */
      rtx reg = XEXP (src, 0);
      if ((REG_P (inner_scalar) || MEM_P (inner_scalar))
	  && GET_MODE (reg) != GET_MODE (inner_scalar))
	inner_scalar = gen_rtx_SUBREG (GET_MODE (reg), inner_scalar, 0);
      rtx set = gen_rtx_SET (reg, inner_scalar);
      insn = emit_insn_before (set, set_insn);
      if (dump_file)
	{
	  fprintf (dump_file, "\nAdd:\n\n");
	  print_rtl_single (dump_file, insn);
	  fprintf (dump_file, "\nbefore:\n\n");
	  print_rtl_single (dump_file, set_insn);
	  fprintf (dump_file, "\n");
	}
    }
}
3349 :
/* At entry of the nearest common dominator for basic blocks with
   conversions/rcp/sqrt/rsqrt/round, generate a single
	vxorps %xmmN, %xmmN, %xmmN
   for all
	vcvtss2sd  op, %xmmN, %xmmX
	vcvtsd2ss  op, %xmmN, %xmmX
	vcvtsi2ss  op, %xmmN, %xmmX
	vcvtsi2sd  op, %xmmN, %xmmX

   NB: We want to generate only a single vxorps to cover the whole
   function.  The LCM algorithm isn't appropriate here since it may
   place a vxorps inside the loop.

   Returns 0 (no extra TODO flags).  */

static unsigned int
remove_partial_avx_dependency (void)
{
  timevar_push (TV_MACH_DEP);

  bitmap_obstack_initialize (NULL);
  /* Blocks containing insns rewritten below; used to place the single
     vxorps at a dominating point.  */
  bitmap convert_bbs = BITMAP_ALLOC (NULL);

  basic_block bb;
  rtx_insn *insn, *set_insn;
  rtx set;
  /* The shared zeroed V4SF register, created lazily when the first
     insn to rewrite is found.  */
  rtx v4sf_const0 = NULL_RTX;

  /* New insns carrying a REG_EH_REGION note; their blocks may need to
     be split afterwards.  */
  auto_vec<rtx_insn *> control_flow_insns;

  /* We create invalid RTL initially so defer rescans.  */
  df_set_flags (DF_DEFER_INSN_RESCAN);

  FOR_EACH_BB_FN (bb, cfun)
    {
      FOR_BB_INSNS (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  set = single_set (insn);
	  if (!set)
	    continue;

	  /* Only insns flagged by the md attribute as partial-XMM
	     updates are candidates.  */
	  if (get_attr_avx_partial_xmm_update (insn)
	      != AVX_PARTIAL_XMM_UPDATE_TRUE)
	    continue;

	  /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
	     SI -> SF, SI -> DF, DI -> SF, DI -> DF, sqrt, rsqrt, rcp,
	     round, to vec_dup and vec_merge with subreg.  */
	  rtx src = SET_SRC (set);
	  rtx dest = SET_DEST (set);
	  machine_mode dest_mode = GET_MODE (dest);
	  bool convert_p = false;
	  switch (GET_CODE (src))
	    {
	    case FLOAT:
	    case FLOAT_EXTEND:
	    case FLOAT_TRUNCATE:
	    case UNSIGNED_FLOAT:
	      convert_p = true;
	      break;
	    default:
	      break;
	    }

	  /* Only handle conversion here.  */
	  machine_mode src_mode
	    = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
	  switch (src_mode)
	    {
	    case E_SFmode:
	    case E_DFmode:
	      /* Skip when the tuning says FP->FP conversions have no
		 partial-register dependency (or vector converts are
		 preferred anyway).  */
	      if (TARGET_USE_VECTOR_FP_CONVERTS
		  || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
		continue;
	      break;
	    case E_SImode:
	    case E_DImode:
	      /* Likewise for int->FP conversions.  */
	      if (TARGET_USE_VECTOR_CONVERTS
		  || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
		continue;
	      break;
	    case E_VOIDmode:
	      /* Non-conversion candidates (sqrt/rsqrt/rcp/round).  */
	      gcc_assert (!convert_p);
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  if (!v4sf_const0)
	    v4sf_const0 = gen_reg_rtx (V4SFmode);

	  /* ZERO is V4SF_CONST0 viewed in the destination's vector
	     mode.  */
	  rtx zero;
	  machine_mode dest_vecmode;
	  switch (dest_mode)
	    {
	    case E_HFmode:
	      dest_vecmode = V8HFmode;
	      zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0);
	      break;
	    case E_SFmode:
	      dest_vecmode = V4SFmode;
	      zero = v4sf_const0;
	      break;
	    case E_DFmode:
	      dest_vecmode = V2DFmode;
	      zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  /* Change source to vector mode.  */
	  src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
	  src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
				   GEN_INT (HOST_WIDE_INT_1U));
	  /* Change destination to vector mode.  */
	  rtx vec = gen_reg_rtx (dest_vecmode);
	  /* Generate an XMM vector SET.  */
	  set = gen_rtx_SET (vec, src);
	  set_insn = emit_insn_before (set, insn);

	  if (cfun->can_throw_non_call_exceptions)
	    {
	      /* Handle REG_EH_REGION note.  */
	      rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
	      if (note)
		{
		  control_flow_insns.safe_push (set_insn);
		  add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
		}
	    }

	  /* The original insn now just extracts the scalar result from
	     the vector register.  */
	  src = gen_rtx_SUBREG (dest_mode, vec, 0);
	  set = gen_rtx_SET (dest, src);

	  /* Drop possible dead definitions.  */
	  PATTERN (insn) = set;

	  INSN_CODE (insn) = -1;
	  recog_memoized (insn);
	  df_insn_rescan (insn);
	  bitmap_set_bit (convert_bbs, bb->index);
	}
    }

  if (v4sf_const0)
    {
      /* (Re-)discover loops so that bb->loop_father can be used in the
	 analysis below.  */
      calculate_dominance_info (CDI_DOMINATORS);
      loop_optimizer_init (AVOID_CFG_MODIFICATIONS);

      ix86_place_single_vector_set (v4sf_const0,
				    CONST0_RTX (V4SFmode),
				    convert_bbs);

      loop_optimizer_finalize ();

      if (!control_flow_insns.is_empty ())
	{
	  free_dominance_info (CDI_DOMINATORS);

	  unsigned int i;
	  FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
	    if (control_flow_insn_p (insn))
	      {
		/* Split the block after insn.  There will be a fallthru
		   edge, which is OK so we keep it.  We have to create
		   the exception edges ourselves.  */
		bb = BLOCK_FOR_INSN (insn);
		split_block (bb, insn);
		rtl_make_eh_edge (NULL, bb, BB_END (bb));
	      }
	}
    }

  df_process_deferred_rescans ();
  df_clear_flags (DF_DEFER_INSN_RESCAN);
  bitmap_obstack_release (NULL);
  BITMAP_FREE (convert_bbs);

  timevar_pop (TV_MACH_DEP);
  return 0;
}
3535 :
3536 : namespace {
3537 :
/* Pass descriptor for the remove-partial-avx-dependency pass (see
   pass_remove_partial_avx_dependency below).  */
const pass_data pass_data_remove_partial_avx_dependency =
{
  RTL_PASS, /* type */
  "rpad", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
3550 :
3551 : class pass_remove_partial_avx_dependency : public rtl_opt_pass
3552 : {
3553 : public:
3554 285722 : pass_remove_partial_avx_dependency (gcc::context *ctxt)
3555 571444 : : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
3556 : {}
3557 :
3558 : /* opt_pass methods: */
3559 1471370 : bool gate (function *) final override
3560 : {
3561 1471370 : return ix86_rpad_gate ();
3562 : }
3563 :
3564 33277 : unsigned int execute (function *) final override
3565 : {
3566 33277 : return remove_partial_avx_dependency ();
3567 : }
3568 : }; // class pass_rpad
3569 :
3570 : } // anon namespace
3571 :
3572 : rtl_opt_pass *
3573 285722 : make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
3574 : {
3575 285722 : return new pass_remove_partial_avx_dependency (ctxt);
3576 : }
3577 :
3578 : /* Return a machine mode suitable for vector SIZE with SMODE inner
3579 : mode. */
3580 :
3581 : static machine_mode
3582 32228 : ix86_get_vector_cse_mode (unsigned int size, machine_mode smode)
3583 : {
3584 : /* Use the inner scalar mode of vector broadcast source in:
3585 :
3586 : (set (reg:V8DF 394)
3587 : (vec_duplicate:V8DF (reg:V2DF 190 [ alpha ])))
3588 :
3589 : to compute the vector mode for broadcast from vector source.
3590 : */
3591 32228 : if (VECTOR_MODE_P (smode))
3592 1 : smode = GET_MODE_INNER (smode);
3593 32228 : scalar_mode s_mode = as_a <scalar_mode> (smode);
3594 64456 : poly_uint64 nunits = size / GET_MODE_SIZE (smode);
3595 32228 : machine_mode mode = mode_for_vector (s_mode, nunits).require ();
3596 32228 : return mode;
3597 : }
3598 :
/* Replace the source operand of instructions in VECTOR_INSNS with
   VECTOR_CONST in VECTOR_MODE.  SCALAR_MODE is the inner scalar mode
   of the broadcast, used to pick an intermediate vector mode when a
   direct SUBREG of VECTOR_CONST would be invalid.  */

static void
replace_vector_const (machine_mode vector_mode, rtx vector_const,
		      auto_bitmap &vector_insns,
		      machine_mode scalar_mode)
{
  bitmap_iterator bi;
  unsigned int id;

  EXECUTE_IF_SET_IN_BITMAP (vector_insns, 0, id, bi)
    {
      rtx_insn *insn = DF_INSN_UID_GET (id)->insn;

      /* Get the single SET instruction.  */
      rtx set = single_set (insn);
      rtx src = SET_SRC (set);
      rtx dest = SET_DEST (set);
      machine_mode mode = GET_MODE (dest);

      rtx replace;
      /* Replace the source operand with VECTOR_CONST.  */
      if (SUBREG_P (src) || mode == vector_mode)
	replace = vector_const;
      else
	{
	  unsigned int size = GET_MODE_SIZE (mode);
	  if (size < ix86_regmode_natural_size (mode))
	    {
	      /* If the mode size is smaller than its natural size,
		 first insert an extra move with a QI vector SUBREG
		 of the same size to avoid validate_subreg failure.  */
	      machine_mode vmode
		= ix86_get_vector_cse_mode (size, scalar_mode);
	      rtx vreg;
	      if (mode == vmode)
		vreg = vector_const;
	      else
		{
		  /* Copy VECTOR_CONST through a same-size vector
		     register in VMODE, then subreg that.  */
		  vreg = gen_reg_rtx (vmode);
		  rtx vsubreg = gen_rtx_SUBREG (vmode, vector_const, 0);
		  rtx pat = gen_rtx_SET (vreg, vsubreg);
		  rtx_insn *vinsn = emit_insn_before (pat, insn);
		  if (dump_file)
		    {
		      fprintf (dump_file, "\nInsert an extra move:\n\n");
		      print_rtl_single (dump_file, vinsn);
		      fprintf (dump_file, "\nbefore:\n\n");
		      print_rtl_single (dump_file, insn);
		      fprintf (dump_file, "\n");
		    }
		}
	      replace = gen_rtx_SUBREG (mode, vreg, 0);
	    }
	  else
	    replace = gen_rtx_SUBREG (mode, vector_const, 0);
	}

      if (dump_file)
	{
	  fprintf (dump_file, "\nReplace:\n\n");
	  print_rtl_single (dump_file, insn);
	}
      SET_SRC (set) = replace;
      /* Drop possible dead definitions.  */
      PATTERN (insn) = set;
      INSN_CODE (insn) = -1;
      recog_memoized (insn);
      if (dump_file)
	{
	  fprintf (dump_file, "\nwith:\n\n");
	  print_rtl_single (dump_file, insn);
	  fprintf (dump_file, "\n");
	}
      df_insn_rescan (insn);
    }
}
3677 :
/* Return the inner scalar if OP is a broadcast, else return nullptr.

   On success, *SCALAR_MODE_P is set to the inner scalar mode,
   *KIND_P to the classification (all-zeros, all-ones or vec_dup) and
   *INSN_P to the single insn defining the inner scalar, or nullptr
   when the returned value is usable function-wide (constants and
   constant-pool loads).  MODE is the candidate vector mode for the
   standard-constant check.  */

static rtx
ix86_broadcast_inner (rtx op, machine_mode mode,
		      machine_mode *scalar_mode_p,
		      x86_cse_kind *kind_p, rtx_insn **insn_p)
{
  /* All-zeros (1) and all-ones (2) vectors are classified directly.  */
  switch (standard_sse_constant_p (op, mode))
    {
    case 1:
      *scalar_mode_p = QImode;
      *kind_p = X86_CSE_CONST0_VECTOR;
      *insn_p = nullptr;
      return const0_rtx;
    case 2:
      *scalar_mode_p = QImode;
      *kind_p = X86_CSE_CONSTM1_VECTOR;
      *insn_p = nullptr;
      return constm1_rtx;
    default:
      break;
    }

  mode = GET_MODE (op);
  int nunits = GET_MODE_NUNITS (mode);
  if (nunits < 2)
    return nullptr;

  *kind_p = X86_CSE_VEC_DUP;

  rtx reg;
  if (GET_CODE (op) == VEC_DUPLICATE)
    {
      /* Only
	   (vec_duplicate:V4SI (reg:SI 99))
	   (vec_duplicate:V2DF (mem/u/c:DF (symbol_ref/u:DI ("*.LC1") [flags 0x2]) [0 S8 A64]))
	 are supported.  Set OP to the broadcast source by default.  */
      op = XEXP (op, 0);
      reg = op;
      /* Look through a lowpart, non-paradoxical SUBREG to the
	 underlying register.  */
      if (SUBREG_P (op)
	  && SUBREG_BYTE (op) == 0
	  && !paradoxical_subreg_p (op))
	reg = SUBREG_REG (op);
      if (!REG_P (reg))
	{
	  if (MEM_P (op)
	      && SYMBOL_REF_P (XEXP (op, 0))
	      && CONSTANT_POOL_ADDRESS_P (XEXP (op, 0)))
	    {
	      /* Handle constant broadcast from memory.  */
	      *scalar_mode_p = GET_MODE_INNER (mode);
	      *insn_p = nullptr;
	      return op;
	    }
	  return nullptr;
	}
    }
  else if (CONST_VECTOR_P (op))
    {
      rtx first = XVECEXP (op, 0, 0);
      /* A CONST_VECTOR counts as a broadcast only when all elements
	 are identical.  */
      for (int i = 1; i < nunits; ++i)
	{
	  rtx tmp = XVECEXP (op, 0, i);
	  /* Vector duplicate value.  */
	  if (!rtx_equal_p (tmp, first))
	    return nullptr;
	}
      *scalar_mode_p = GET_MODE (first);
      *insn_p = nullptr;
      return first;
    }
  else
    return nullptr;

  mode = GET_MODE (op);

  /* Only single def chain is supported.  */
  df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
  if (!ref
      || DF_REF_IS_ARTIFICIAL (ref)
      || DF_REF_NEXT_REG (ref) != nullptr)
    return nullptr;

  rtx_insn *insn = DF_REF_INSN (ref);
  rtx set = single_set (insn);
  if (!set)
    return nullptr;

  rtx src = SET_SRC (set);

  if (CONST_INT_P (src))
    {
      /* Handle sequences like

	   (set (reg:SI 99)
		(const_int 34 [0x22]))
	   (set (reg:V4SI 98)
		(vec_duplicate:V4SI (reg:SI 99)))

	 Set *INSN_P to nullptr and return SET_SRC if SET_SRC is an
	 integer constant.  */
      op = src;
      if (mode != GET_MODE (reg))
	op = gen_int_mode (INTVAL (src), mode);
      *insn_p = nullptr;
    }
  else
    {
      /* Handle sequences like

	   (set (reg:QI 105 [ c ])
		(reg:QI 5 di [ c ]))
	   (set (reg:V64QI 102 [ _1 ])
		(vec_duplicate:V64QI (reg:QI 105 [ c ])))

	   (set (reg/v:SI 116 [ argc ])
		(mem/c:SI (reg:SI 135) [2 argc+0 S4 A32]))
	   (set (reg:V4SI 119 [ _45 ])
		(vec_duplicate:V4SI (reg/v:SI 116 [ argc ])))

	   (set (reg:SI 98 [ _1 ])
		(sign_extend:SI (reg:QI 106 [ c ])))
	   (set (reg:V16SI 103 [ _2 ])
		(vec_duplicate:V16SI (reg:SI 98 [ _1 ])))

	   (set (reg:SI 102 [ cost ])
		(mem/c:SI (symbol_ref:DI ("cost") [flags 0x40])))
	   (set (reg:V4HI 103 [ _16 ])
		(vec_duplicate:V4HI (subreg:HI (reg:SI 102 [ cost ]) 0)))

	   (set (subreg:SI (reg/v:HI 107 [ cr_val ]) 0)
		(ashift:SI (reg:SI 158)
			   (subreg:QI (reg:SI 156 [ _2 ]) 0)))
	   (set (reg:V16HI 183 [ _61 ])
		(vec_duplicate:V16HI (reg/v:HI 107 [ cr_val ])))

	 Set *INSN_P to INSN and return the broadcast source otherwise.  */
      *insn_p = insn;
    }

  *scalar_mode_p = mode;
  return op;
}
3821 :
/* Replace CALL instruction in TLS_CALL_INSNS with SET from SRC and
   put the updated instruction in UPDATED_TLS_INSNS.  The original
   insns are deleted; each one must be a PARALLEL whose first element
   is the SET producing the TLS address.  */

static void
replace_tls_call (rtx src, auto_bitmap &tls_call_insns,
		  auto_bitmap &updated_tls_insns)
{
  bitmap_iterator bi;
  unsigned int id;

  EXECUTE_IF_SET_IN_BITMAP (tls_call_insns, 0, id, bi)
    {
      rtx_insn *insn = DF_INSN_UID_GET (id)->insn;

      /* If this isn't a CALL, only GNU2 TLS implicit CALL patterns are
	 allowed.  */
      if (!CALL_P (insn))
	{
	  attr_tls64 tls64 = get_attr_tls64 (insn);
	  if (tls64 != TLS64_CALL && tls64 != TLS64_COMBINE)
	    gcc_unreachable ();
	}

      /* Extract the destination register from the first SET of the
	 PARALLEL, then assign it SRC instead of making the call.  */
      rtx pat = PATTERN (insn);
      gcc_assert (GET_CODE (pat) == PARALLEL);
      rtx set = XVECEXP (pat, 0, 0);
      gcc_assert (GET_CODE (set) == SET);
      rtx dest = SET_DEST (set);

      set = gen_rtx_SET (dest, src);
      rtx_insn *set_insn = emit_insn_after (set, insn);
      if (recog_memoized (set_insn) < 0)
	gcc_unreachable ();

      /* Put SET_INSN in UPDATED_TLS_INSNS.  */
      bitmap_set_bit (updated_tls_insns, INSN_UID (set_insn));

      if (dump_file)
	{
	  fprintf (dump_file, "\nReplace:\n\n");
	  print_rtl_single (dump_file, insn);
	  fprintf (dump_file, "\nwith:\n\n");
	  print_rtl_single (dump_file, set_insn);
	  fprintf (dump_file, "\n");
	}

      /* Delete the CALL insn.  */
      delete_insn (insn);

      df_insn_rescan (set_insn);
    }
}
3874 :
3875 : /* Return the basic block which dominates all basic blocks which set
3876 : hard register REGNO used in basic block BB. */
3877 :
3878 : static basic_block
3879 2 : ix86_get_dominator_for_reg (unsigned int regno, basic_block bb)
3880 : {
3881 2 : basic_block set_bb;
3882 2 : auto_bitmap set_bbs;
3883 :
3884 : /* Get all BBs which set REGNO and dominate the current BB from all
3885 : DEFs of REGNO. */
3886 2 : for (df_ref def = DF_REG_DEF_CHAIN (regno);
3887 18 : def;
3888 16 : def = DF_REF_NEXT_REG (def))
3889 16 : if (!DF_REF_IS_ARTIFICIAL (def)
3890 16 : && !DF_REF_FLAGS_IS_SET (def, DF_REF_MAY_CLOBBER)
3891 6 : && !DF_REF_FLAGS_IS_SET (def, DF_REF_MUST_CLOBBER))
3892 : {
3893 4 : set_bb = DF_REF_BB (def);
3894 4 : if (dominated_by_p (CDI_DOMINATORS, bb, set_bb))
3895 2 : bitmap_set_bit (set_bbs, set_bb->index);
3896 : }
3897 :
3898 2 : bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
3899 2 : return bb;
3900 2 : }
3901 :
3902 : /* Mark FLAGS register as live in DATA, a bitmap of live caller-saved
3903 : registers, if DEST is FLAGS register. */
3904 :
3905 : static void
3906 381 : ix86_check_flags_reg (rtx dest, const_rtx x, void *data)
3907 : {
3908 381 : if (GET_CODE (x) == CLOBBER)
3909 : return;
3910 :
3911 374 : auto_bitmap *live_caller_saved_regs = (auto_bitmap *) data;
3912 374 : if (REG_P (dest) && REGNO (dest) == FLAGS_REG)
3913 0 : bitmap_set_bit (*live_caller_saved_regs, FLAGS_REG);
3914 : }
3915 :
/* Emit a TLS_SET instruction of KIND in basic block BB.  Store the
   insertion point in *BEFORE_P for emit_insn_before or in *AFTER_P
   for emit_insn_after.  UPDATED_GNU_TLS_INSNS contains instructions
   which replace the GNU TLS instructions.  UPDATED_GNU2_TLS_INSNS
   contains instructions which replace the GNU2 TLS instructions.
   Return the emitted TLS call instruction.  The loop below may move
   BB up the dominator tree and retry when no safe insertion point
   exists in the current BB.  */

static rtx_insn *
ix86_emit_tls_call (rtx tls_set, x86_cse_kind kind, basic_block bb,
		    rtx_insn **before_p, rtx_insn **after_p,
		    auto_bitmap &updated_gnu_tls_insns,
		    auto_bitmap &updated_gnu2_tls_insns)
{
  rtx_insn *tls_insn;

  do
    {
      /* Find the first real (non-debug) insn in BB; INSN becomes NULL
	 if BB contains only notes/labels/debug insns.  */
      rtx_insn *insn = BB_HEAD (bb);
      while (insn && !NONDEBUG_INSN_P (insn))
	{
	  if (insn == BB_END (bb))
	    {
	      /* This must be the beginning basic block:

		 (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
		 (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)

		 or a basic block with only a label:

		 (code_label 78 11 77 3 14 (nil) [1 uses])
		 (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)

		 or a basic block with only a debug marker:

		 (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
		 (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
		 (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))

		 or a basic block with only deleted instructions:

		 (code_label 348 23 349 45 3 (nil) [0 uses])
		 (note 349 348 436 45 [bb 45] NOTE_INSN_BASIC_BLOCK)
		 (note 436 349 362 45 NOTE_INSN_DELETED)

	       */
	      gcc_assert (DEBUG_INSN_P (insn)
			  || (NOTE_P (insn)
			      && ((NOTE_KIND (insn)
				   == NOTE_INSN_FUNCTION_BEG)
				  || (NOTE_KIND (insn)
				      == NOTE_INSN_DELETED)
				  || (NOTE_KIND (insn)
				      == NOTE_INSN_BASIC_BLOCK))));
	      insn = NULL;
	      break;
	    }
	  insn = NEXT_INSN (insn);
	}

      /* TLS_GD and TLS_LD_BASE instructions are normal functions which
	 clobber caller-saved registers.  TLSDESC instructions only
	 clobber FLAGS.  If any registers clobbered by TLS instructions
	 are live in this basic block, we must insert TLS instructions
	 after all live registers clobbered are dead.  */

      auto_bitmap live_caller_saved_regs;
      /* Use DF_LIVE info when available, otherwise fall back to LR.  */
      bitmap in = df_live ? DF_LIVE_IN (bb) : DF_LR_IN (bb);

      if (bitmap_bit_p (in, FLAGS_REG))
	bitmap_set_bit (live_caller_saved_regs, FLAGS_REG);

      unsigned int i;

      /* Get all live caller-saved registers for TLS_GD and TLS_LD_BASE
	 instructions.  */
      if (kind != X86_CSE_TLSDESC)
	for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
	  if (call_used_regs[i]
	      && !fixed_regs[i]
	      && bitmap_bit_p (in, i))
	    bitmap_set_bit (live_caller_saved_regs, i);

      /* Nothing clobbered by the TLS call is live on entry: emit the
	 call at the very start of BB.  */
      if (bitmap_empty_p (live_caller_saved_regs))
	{
	  if (insn == BB_HEAD (bb))
	    {
	      *before_p = insn;
	      tls_insn = emit_insn_before (tls_set, insn);
	    }
	  else
	    {
	      /* Emit the TLS call after NOTE_INSN_FUNCTION_BEG in the
		 beginning basic block:

		 (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
		 (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)

		 or after NOTE_INSN_BASIC_BLOCK in a basic block with
		 only a label:

		 (code_label 78 11 77 3 14 (nil) [1 uses])
		 (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)

		 or after debug marker in a basic block with only a
		 debug marker:

		 (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
		 (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
		 (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))

	       */
	      insn = insn ? PREV_INSN (insn) : BB_END (bb);
	      *after_p = insn;
	      tls_insn = emit_insn_after (tls_set, insn);
	    }
	  return tls_insn;
	}

      bool repeat = false;

      /* Search for REG_DEAD notes in this basic block.  */
      FOR_BB_INSNS (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  /* NB: Conditional jump is the only instruction which reads
	     flags register and changes control flow.  We can never
	     place the TLS call after unconditional jump.  */
	  if (JUMP_P (insn))
	    {
	      /* This must be a conditional jump.  */
	      rtx label = JUMP_LABEL (insn);
	      if (label == nullptr
		  || ANY_RETURN_P (label)
		  || !(LABEL_P (label) || SYMBOL_REF_P (label)))
		gcc_unreachable ();

	      /* Place the call before all FLAGS_REG setting BBs since
		 we can't place a call before nor after a conditional
		 jump.  */
	      bb = ix86_get_dominator_for_reg (FLAGS_REG, bb);

	      /* Start over again.  */
	      repeat = true;
	      break;
	    }

	  if (bitmap_bit_p (updated_gnu_tls_insns, INSN_UID (insn)))
	    {
	      /* Insert the __tls_get_addr call before INSN which
		 replaces a __tls_get_addr call.  */
	      *before_p = insn;
	      tls_insn = emit_insn_before (tls_set, insn);
	      return tls_insn;
	    }

	  if (bitmap_bit_p (updated_gnu2_tls_insns, INSN_UID (insn)))
	    {
	      /* Mark FLAGS register as dead since FLAGS register
		 would be clobbered by the GNU2 TLS instruction.  */
	      bitmap_clear_bit (live_caller_saved_regs, FLAGS_REG);
	      continue;
	    }

	  /* Check if FLAGS register is live.  */
	  note_stores (insn, ix86_check_flags_reg,
		       &live_caller_saved_regs);

	  /* Retire registers which die at INSN; once every tracked
	     register is dead, the call can go right after INSN.  */
	  rtx link;
	  for (link = REG_NOTES (insn); link; link = XEXP (link, 1))
	    if ((REG_NOTE_KIND (link) == REG_DEAD
		 || (REG_NOTE_KIND (link) == REG_UNUSED
		     && REGNO (XEXP (link, 0)) == FLAGS_REG))
		&& REG_P (XEXP (link, 0)))
	      {
		/* Mark the live caller-saved register as dead.  */
		for (i = REGNO (XEXP (link, 0));
		     i < END_REGNO (XEXP (link, 0));
		     i++)
		  if (i < FIRST_PSEUDO_REGISTER)
		    bitmap_clear_bit (live_caller_saved_regs, i);

		if (bitmap_empty_p (live_caller_saved_regs))
		  {
		    *after_p = insn;
		    tls_insn = emit_insn_after (tls_set, insn);
		    return tls_insn;
		  }
	      }
	}

      /* NB: Start over again for conditional jump.  */
      if (repeat)
	continue;

      gcc_assert (!bitmap_empty_p (live_caller_saved_regs));

      /* If any live caller-saved registers aren't dead at the end of
	 this basic block, get the basic block which dominates all
	 basic blocks which set the remaining live registers.  */
      auto_bitmap set_bbs;
      bitmap_iterator bi;
      unsigned int id;
      EXECUTE_IF_SET_IN_BITMAP (live_caller_saved_regs, 0, id, bi)
	{
	  basic_block set_bb = ix86_get_dominator_for_reg (id, bb);
	  bitmap_set_bit (set_bbs, set_bb->index);
	}
      bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
    }
  while (true);
}
4128 :
/* Generate a TLS call of KIND with VAL and copy the call result to DEST,
   at entry of the nearest dominator for basic block map BBS, which is in
   the fake loop that contains the whole function, so that there is only
   a single TLS CALL of KIND with VAL in the whole function.
   UPDATED_GNU_TLS_INSNS contains instructions which replace the GNU TLS
   instructions.  UPDATED_GNU2_TLS_INSNS contains instructions which
   replace the GNU2 TLS instructions.  If TLSDESC_SET isn't nullptr,
   insert it before the TLS call.  */

static void
ix86_place_single_tls_call (rtx dest, rtx val, x86_cse_kind kind,
			    auto_bitmap &bbs,
			    auto_bitmap &updated_gnu_tls_insns,
			    auto_bitmap &updated_gnu2_tls_insns,
			    rtx tlsdesc_set = nullptr)
{
  basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
  /* Hoist the call out of any real loop: walk up the dominator tree
     until BB's loop is the fake outermost loop, whose latch is the
     exit block.  */
  while (bb->loop_father->latch
	 != EXIT_BLOCK_PTR_FOR_FN (cfun))
    bb = get_immediate_dominator (CDI_DOMINATORS,
				  bb->loop_father->header);

  rtx rax = nullptr, rdi;
  rtx eqv = nullptr;
  rtx caddr;
  rtx set;
  rtx clob;
  rtx symbol;
  rtx tls;

  /* Build the TLS pattern to emit; for the GNU model kinds also set
     up EQV, the REG_EQUAL value attached to the DEST copy below.  */
  switch (kind)
    {
    case X86_CSE_TLS_GD:
      rax = gen_rtx_REG (Pmode, AX_REG);
      rdi = gen_rtx_REG (Pmode, DI_REG);
      caddr = ix86_tls_get_addr ();

      symbol = XVECEXP (val, 0, 0);
      tls = gen_tls_global_dynamic_64 (Pmode, rax, symbol, caddr, rdi);

      if (GET_MODE (symbol) != Pmode)
	symbol = gen_rtx_ZERO_EXTEND (Pmode, symbol);
      eqv = symbol;
      break;

    case X86_CSE_TLS_LD_BASE:
      rax = gen_rtx_REG (Pmode, AX_REG);
      rdi = gen_rtx_REG (Pmode, DI_REG);
      caddr = ix86_tls_get_addr ();

      tls = gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi);

      /* Attach a unique REG_EQUAL to DEST, to allow the RTL optimizers
	 to share the LD_BASE result with other LD model accesses.  */
      eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
			    UNSPEC_TLS_LD_BASE);

      break;

    case X86_CSE_TLSDESC:
      /* TLSDESC writes DEST directly and only clobbers FLAGS.  */
      set = gen_rtx_SET (dest, val);
      clob = gen_rtx_CLOBBER (VOIDmode,
			      gen_rtx_REG (CCmode, FLAGS_REG));
      tls = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clob));
      break;

    default:
      gcc_unreachable ();
    }

  /* Emit the TLS CALL insn.  */
  rtx_insn *before = nullptr;
  rtx_insn *after = nullptr;
  rtx_insn *tls_insn = ix86_emit_tls_call (tls, kind, bb, &before,
					   &after,
					   updated_gnu_tls_insns,
					   updated_gnu2_tls_insns);

  rtx_insn *tlsdesc_insn = nullptr;
  if (tlsdesc_set)
    {
      /* Copy the set so the original pattern isn't shared.  */
      rtx dest = copy_rtx (SET_DEST (tlsdesc_set));
      rtx src = copy_rtx (SET_SRC (tlsdesc_set));
      tlsdesc_set = gen_rtx_SET (dest, src);
      tlsdesc_insn = emit_insn_before (tlsdesc_set, tls_insn);
    }

  if (kind != X86_CSE_TLSDESC)
    {
      RTL_CONST_CALL_P (tls_insn) = 1;

      /* Indicate that this function can't jump to non-local gotos.  */
      make_reg_eh_region_note_nothrow_nononlocal (tls_insn);
    }

  /* The emitted pattern must match an insn in the machine
     description.  */
  if (recog_memoized (tls_insn) < 0)
    gcc_unreachable ();

  if (dump_file)
    {
      if (after)
	{
	  fprintf (dump_file, "\nPlace:\n\n");
	  if (tlsdesc_insn)
	    print_rtl_single (dump_file, tlsdesc_insn);
	  print_rtl_single (dump_file, tls_insn);
	  fprintf (dump_file, "\nafter:\n\n");
	  print_rtl_single (dump_file, after);
	  fprintf (dump_file, "\n");
	}
      else
	{
	  fprintf (dump_file, "\nPlace:\n\n");
	  if (tlsdesc_insn)
	    print_rtl_single (dump_file, tlsdesc_insn);
	  print_rtl_single (dump_file, tls_insn);
	  fprintf (dump_file, "\nbefore:\n\n");
	  print_rtl_single (dump_file, before);
	  fprintf (dump_file, "\n");
	}
    }

  if (kind != X86_CSE_TLSDESC)
    {
      /* Copy RAX to DEST.  */
      set = gen_rtx_SET (dest, rax);
      rtx_insn *set_insn = emit_insn_after (set, tls_insn);
      set_dst_reg_note (set_insn, REG_EQUAL, copy_rtx (eqv), dest);
      if (dump_file)
	{
	  fprintf (dump_file, "\nPlace:\n\n");
	  print_rtl_single (dump_file, set_insn);
	  fprintf (dump_file, "\nafter:\n\n");
	  print_rtl_single (dump_file, tls_insn);
	  fprintf (dump_file, "\n");
	}
    }
}
4267 :
4268 : namespace {
4269 :
/* Pass metadata for the x86_cse RTL pass defined below.  */

const pass_data pass_data_x86_cse =
{
  RTL_PASS, /* type */
  "x86_cse", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
4282 :
/* RTL pass which removes redundant vector load and TLS call patterns.
   The candidate_* helpers communicate their findings to x86_cse via
   the private members below.  */

class pass_x86_cse : public rtl_opt_pass
{
public:
  pass_x86_cse (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_x86_cse, ctxt)
  {}

  /* opt_pass methods: */
  /* Run only with SSE2, when optimizing, and only for functions
     optimized for speed.  */
  bool gate (function *fun) final override
  {
    return (TARGET_SSE2
	    && optimize
	    && optimize_function_for_speed_p (fun));
  }

  unsigned int execute (function *) final override
  {
    return x86_cse ();
  }

private:
  /* The redundant source value.  */
  rtx val;
  /* The actual redundant source value for UNSPEC_TLSDESC.  */
  rtx tlsdesc_val;
  /* The instruction which defines the redundant value.  */
  rtx_insn *def_insn;
  /* Mode of the destination of the candidate redundant instruction.  */
  machine_mode mode;
  /* Mode of the source of the candidate redundant instruction.  */
  machine_mode scalar_mode;
  /* The classification of the candidate redundant instruction.  */
  x86_cse_kind kind;

  unsigned int x86_cse (void);
  bool candidate_gnu_tls_p (rtx_insn *, attr_tls64);
  bool candidate_gnu2_tls_p (rtx, attr_tls64);
  bool candidate_vector_p (rtx);
  rtx_insn *tls_set_insn_from_symbol (const_rtx, const_rtx);
}; // class pass_x86_cse
4323 :
/* Return the instruction which sets REG from TLS_SYMBOL, or nullptr
   if some definition of REG is artificial, isn't a TLS64_LEA insn,
   or loads a different symbol.  */

rtx_insn *
pass_x86_cse::tls_set_insn_from_symbol (const_rtx reg,
					const_rtx tls_symbol)
{
  rtx_insn *set_insn = nullptr;
  /* Every definition of REG must be a TLS64_LEA whose source is
     TLS_SYMBOL.  */
  for (df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
       ref;
       ref = DF_REF_NEXT_REG (ref))
    {
      if (DF_REF_IS_ARTIFICIAL (ref))
	return nullptr;

      set_insn = DF_REF_INSN (ref);
      if (get_attr_tls64 (set_insn) != TLS64_LEA)
	return nullptr;

      /* The symbol is the first element of the UNSPEC vector in the
	 insn's SET_SRC.  */
      rtx tls_set = PATTERN (set_insn);
      rtx tls_src = XVECEXP (SET_SRC (tls_set), 0, 0);
      if (!rtx_equal_p (tls_symbol, tls_src))
	return nullptr;
    }

  return set_insn;
}
4350 :
/* Return true and output def_insn, val, mode, scalar_mode and kind if
   INSN is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE.  Only enabled for
   64-bit functions which contain multiple TLS descriptor calls.  */

bool
pass_x86_cse::candidate_gnu_tls_p (rtx_insn *insn, attr_tls64 tls64)
{
  if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
    return false;

  /* Record the redundant TLS CALLs for 64-bit:

     (parallel [
       (set (reg:DI 0 ax)
	    (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
		     (const_int 0 [0])))
       (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
		   (reg/f:DI 7 sp)] UNSPEC_TLS_GD)
       (clobber (reg:DI 5 di))])


     and

     (parallel [
       (set (reg:DI 0 ax)
	    (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
		     (const_int 0 [0])))
       (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)])

   */

  rtx pat = PATTERN (insn);
  /* The SET of RAX is the first element of the PARALLEL; the UNSPEC
     is the second and serves as the value to compare candidates.  */
  rtx set = XVECEXP (pat, 0, 0);
  gcc_assert (GET_CODE (set) == SET);
  rtx dest = SET_DEST (set);
  scalar_mode = mode = GET_MODE (dest);
  val = XVECEXP (pat, 0, 1);
  gcc_assert (GET_CODE (val) == UNSPEC);

  if (tls64 == TLS64_GD)
    kind = X86_CSE_TLS_GD;
  else
    kind = X86_CSE_TLS_LD_BASE;

  /* The value is produced by the call itself, not by a separate
     defining insn.  */
  def_insn = nullptr;
  return true;
}
4397 :
/* Return true and output def_insn, val, mode, scalar_mode and kind if
   SET is UNSPEC_TLSDESC.  Handles both the TLS64_COMBINE and the
   TLS64_CALL forms; only enabled for 64-bit functions which contain
   multiple TLS descriptor calls.  */

bool
pass_x86_cse::candidate_gnu2_tls_p (rtx set, attr_tls64 tls64)
{
  if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
    return false;

  rtx tls_symbol;
  rtx_insn *set_insn;
  rtx src = SET_SRC (set);
  val = src;
  tlsdesc_val = src;
  kind = X86_CSE_TLSDESC;

  if (tls64 == TLS64_COMBINE)
    {
      /* Record 64-bit TLS64_COMBINE:

	 (set (reg/f:DI 104)
	      (plus:DI (unspec:DI [
			 (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
			 (reg:DI 114)
			 (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
		       (const:DI (unspec:DI [
			 (symbol_ref:DI ("e") [flags 0x1a])
			 ] UNSPEC_DTPOFF))))

	 (set (reg/f:DI 104)
	      (plus:DI (unspec:DI [
			 (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
			 (unspec:DI [
			   (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
			 ] UNSPEC_TLSDESC)
			 (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
		       (const:DI (unspec:DI [
			 (symbol_ref:DI ("e") [flags 0x1a])
			 ] UNSPEC_DTPOFF))))
       */

      scalar_mode = mode = GET_MODE (src);

      /* Since the first operand of PLUS in the source TLS_COMBINE
	 pattern is unused, use the second operand of PLUS:

	 (const:DI (unspec:DI [
	   (symbol_ref:DI ("e") [flags 0x1a])
	   ] UNSPEC_DTPOFF))

	 as VAL to check if 2 TLS_COMBINE patterns have the same
	 source.  */
      val = XEXP (src, 1);
      gcc_assert (GET_CODE (val) == CONST
		  && GET_CODE (XEXP (val, 0)) == UNSPEC
		  && XINT (XEXP (val, 0), 1) == UNSPEC_DTPOFF
		  && SYMBOL_REF_P (XVECEXP (XEXP (val, 0), 0, 0)));
      def_insn = nullptr;
      return true;
    }

  /* Record 64-bit TLS_CALL:

     (set (reg:DI 101)
	  (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
		      (reg:DI 112)
		      (reg/f:DI 7 sp)] UNSPEC_TLSDESC))

   */

  gcc_assert (GET_CODE (src) == UNSPEC);
  tls_symbol = XVECEXP (src, 0, 0);
  src = XVECEXP (src, 0, 1);
  scalar_mode = mode = GET_MODE (src);
  gcc_assert (REG_P (src));

  /* All definitions of reg:DI 129 in

     (set (reg:DI 110)
	  (unspec:DI [(symbol_ref:DI ("foo"))
		      (reg:DI 129)
		      (reg/f:DI 7 sp)] UNSPEC_TLSDESC))

     should have the same source as in

     (set (reg:DI 129)
	  (unspec:DI [(symbol_ref:DI ("foo"))] UNSPEC_TLSDESC))

   */

  set_insn = tls_set_insn_from_symbol (src, tls_symbol);
  if (!set_insn)
    return false;

  /* Use TLS_SYMBOL as VAL to check if 2 patterns have the same source.  */
  val = tls_symbol;
  def_insn = set_insn;
  return true;
}
4497 :
4498 : /* Return true and output def_insn, val, mode, scalar_mode and kind if
4499 : INSN is a vector broadcast instruction. */
4500 :
4501 : bool
4502 50170450 : pass_x86_cse::candidate_vector_p (rtx set)
4503 : {
4504 50170450 : rtx src = SET_SRC (set);
4505 50170450 : rtx dest = SET_DEST (set);
4506 50170450 : mode = GET_MODE (dest);
4507 : /* Skip non-vector instruction. */
4508 50170450 : if (!VECTOR_MODE_P (mode))
4509 : return false;
4510 :
4511 : /* Skip non-vector load instruction. */
4512 3686356 : if (!REG_P (dest) && !SUBREG_P (dest))
4513 : return false;
4514 :
4515 2185915 : val = ix86_broadcast_inner (src, mode, &scalar_mode, &kind,
4516 : &def_insn);
4517 2185915 : return val ? true : false;
4518 : }
4519 :
/* At entry of the nearest common dominator for basic blocks with

   1. Vector CONST0_RTX patterns.
   2. Vector CONSTM1_RTX patterns.
   3. Vector broadcast patterns.
   4. UNSPEC_TLS_GD patterns.
   5. UNSPEC_TLS_LD_BASE patterns.
   6. UNSPEC_TLSDESC patterns.

   generate a single pattern whose destination is used to replace the
   source in all identical patterns.

   NB: We want to generate a pattern, which is executed only once, to
   cover the whole function.  The LCM algorithm isn't appropriate here
   since it may place a pattern inside the loop.  */

unsigned int
pass_x86_cse::x86_cse (void)
{
  timevar_push (TV_MACH_DEP);

  auto_vec<redundant_pattern *> loads;
  redundant_pattern *load;
  basic_block bb;
  rtx_insn *insn;
  unsigned int i;
  auto_bitmap updated_gnu_tls_insns;
  auto_bitmap updated_gnu2_tls_insns;

  df_set_flags (DF_DEFER_INSN_RESCAN);

  bool recursive_call_p = cfun->machine->recursive_function;

  /* Pass 1: scan every insn and group identical candidate patterns
     into LOADS.  */
  FOR_EACH_BB_FN (bb, cfun)
    {
      FOR_BB_INSNS (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  bool matched = false;
	  /* Remove redundant patterns if there are more than 2 of
	     them.  */
	  unsigned int threshold = 2;

	  rtx set = single_set (insn);
	  if (!set && !CALL_P (insn))
	    continue;

	  tlsdesc_val = nullptr;

	  /* Classify the insn; the candidate_* helpers set val, mode,
	     scalar_mode, kind and def_insn on success.  */
	  attr_tls64 tls64 = get_attr_tls64 (insn);
	  switch (tls64)
	    {
	    case TLS64_GD:
	    case TLS64_LD_BASE:
	      /* Verify UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE.  */
	      if (candidate_gnu_tls_p (insn, tls64))
		break;
	      continue;

	    case TLS64_CALL:
	    case TLS64_COMBINE:
	      /* Verify UNSPEC_TLSDESC.  */
	      if (candidate_gnu2_tls_p (set, tls64))
		break;
	      continue;

	    case TLS64_LEA:
	      /* Skip TLS64_LEA.  */
	      continue;

	    case TLS64_NONE:
	      if (!set)
		continue;

	      /* Check for vector broadcast.  */
	      if (candidate_vector_p (set))
		break;
	      continue;
	    }

	  /* Check if there is a matching redundant load.  */
	  FOR_EACH_VEC_ELT (loads, i, load)
	    if (load->val
		&& load->kind == kind
		&& load->mode == scalar_mode
		&& (load->bb == bb
		    || kind != X86_CSE_VEC_DUP
		    /* Non all 0s/1s vector load must be in the same
		       basic block if it is in a recursive call.  */
		    || !recursive_call_p)
		&& rtx_equal_p (load->val, val))
	      {
		/* Record instruction.  */
		bitmap_set_bit (load->insns, INSN_UID (insn));

		/* Record the maximum vector size.  */
		if (kind <= X86_CSE_VEC_DUP
		    && load->size < GET_MODE_SIZE (mode))
		  load->size = GET_MODE_SIZE (mode);

		/* Record the basic block.  */
		bitmap_set_bit (load->bbs, bb->index);

		/* Increment the count.  */
		load->count++;

		matched = true;
		break;
	      }

	  if (matched)
	    continue;

	  /* We see this instruction the first time.  Record the
	     redundant source value, its mode, the destination size,
	     instruction which defines the redundant source value,
	     instruction basic block and the instruction kind.  */
	  load = new redundant_pattern;

	  load->val = copy_rtx (val);
	  if (tlsdesc_val)
	    load->tlsdesc_val = copy_rtx (tlsdesc_val);
	  else
	    load->tlsdesc_val = nullptr;
	  load->mode = scalar_mode;
	  load->size = GET_MODE_SIZE (mode);
	  load->def_insn = def_insn;
	  load->count = 1;
	  load->threshold = threshold;
	  load->bb = BLOCK_FOR_INSN (insn);
	  load->kind = kind;

	  bitmap_set_bit (load->insns, INSN_UID (insn));
	  bitmap_set_bit (load->bbs, bb->index);

	  loads.safe_push (load);
	}
    }

  /* Pass 2: for each group seen often enough, rewrite its members to
     use a shared register and remember the replacement source.  */
  bool replaced = false;
  FOR_EACH_VEC_ELT (loads, i, load)
    if (load->count >= load->threshold)
      {
	machine_mode mode;
	rtx reg, broadcast_source, broadcast_reg;
	replaced = true;
	switch (load->kind)
	  {
	  case X86_CSE_TLS_GD:
	  case X86_CSE_TLS_LD_BASE:
	  case X86_CSE_TLSDESC:
	    broadcast_reg = gen_reg_rtx (load->mode);
	    replace_tls_call (broadcast_reg, load->insns,
			      (load->kind == X86_CSE_TLSDESC
			       ? updated_gnu2_tls_insns
			       : updated_gnu_tls_insns));
	    load->broadcast_reg = broadcast_reg;
	    break;

	  case X86_CSE_CONST0_VECTOR:
	  case X86_CSE_CONSTM1_VECTOR:
	  case X86_CSE_VEC_DUP:
	    mode = ix86_get_vector_cse_mode (load->size, load->mode);
	    broadcast_reg = gen_reg_rtx (mode);
	    if (load->def_insn)
	      {
		/* Replace redundant vector loads with a single vector
		   load in the same basic block.  */
		reg = load->val;
		if (load->mode != GET_MODE (reg))
		  reg = gen_rtx_SUBREG (load->mode, reg, 0);
		broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
	      }
	    else
	      /* This is a constant integer/double vector.  If the
		 inner scalar is 0 or -1, set vector to CONST0_RTX
		 or CONSTM1_RTX directly.  */
	      switch (load->kind)
		{
		case X86_CSE_CONST0_VECTOR:
		  broadcast_source = CONST0_RTX (mode);
		  break;
		case X86_CSE_CONSTM1_VECTOR:
		  broadcast_source = CONSTM1_RTX (mode);
		  break;
		case X86_CSE_VEC_DUP:
		  reg = gen_reg_rtx (load->mode);
		  broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
		  break;
		default:
		  gcc_unreachable ();
		}
	    replace_vector_const (mode, broadcast_reg, load->insns,
				  load->mode);
	    load->broadcast_source = broadcast_source;
	    load->broadcast_reg = broadcast_reg;
	    break;
	  }
      }

  /* Pass 3: place the single defining pattern for each rewritten
     group.  */
  if (replaced)
    {
      auto_vec<rtx_insn *> control_flow_insns;

      /* (Re-)discover loops so that bb->loop_father can be used in the
	 analysis below.  */
      calculate_dominance_info (CDI_DOMINATORS);
      loop_optimizer_init (AVOID_CFG_MODIFICATIONS);

      FOR_EACH_VEC_ELT (loads, i, load)
	if (load->count >= load->threshold)
	  {
	    rtx set;
	    if (load->def_insn)
	      switch (load->kind)
		{
		case X86_CSE_TLSDESC:
		  ix86_place_single_tls_call (load->broadcast_reg,
					      load->tlsdesc_val,
					      load->kind,
					      load->bbs,
					      updated_gnu_tls_insns,
					      updated_gnu2_tls_insns,
					      PATTERN (load->def_insn));
		  break;
		case X86_CSE_VEC_DUP:
		  /* Insert a broadcast after the original scalar
		     definition.  */
		  set = gen_rtx_SET (load->broadcast_reg,
				     load->broadcast_source);
		  insn = emit_insn_after (set, load->def_insn);

		  if (cfun->can_throw_non_call_exceptions)
		    {
		      /* Handle REG_EH_REGION note in DEF_INSN.  */
		      rtx note = find_reg_note (load->def_insn,
						REG_EH_REGION, nullptr);
		      if (note)
			{
			  control_flow_insns.safe_push (load->def_insn);
			  add_reg_note (insn, REG_EH_REGION,
					XEXP (note, 0));
			}
		    }

		  if (dump_file)
		    {
		      fprintf (dump_file, "\nAdd:\n\n");
		      print_rtl_single (dump_file, insn);
		      fprintf (dump_file, "\nafter:\n\n");
		      print_rtl_single (dump_file, load->def_insn);
		      fprintf (dump_file, "\n");
		    }
		  break;
		default:
		  gcc_unreachable ();
		}
	    else
	      switch (load->kind)
		{
		case X86_CSE_TLS_GD:
		case X86_CSE_TLS_LD_BASE:
		case X86_CSE_TLSDESC:
		  ix86_place_single_tls_call (load->broadcast_reg,
					      (load->kind == X86_CSE_TLSDESC
					       ? load->tlsdesc_val
					       : load->val),
					      load->kind,
					      load->bbs,
					      updated_gnu_tls_insns,
					      updated_gnu2_tls_insns);
		  break;
		case X86_CSE_CONST0_VECTOR:
		case X86_CSE_CONSTM1_VECTOR:
		case X86_CSE_VEC_DUP:
		  ix86_place_single_vector_set (load->broadcast_reg,
						load->broadcast_source,
						load->bbs,
						load);
		  break;
		}
	  }

      loop_optimizer_finalize ();

      if (!control_flow_insns.is_empty ())
	{
	  free_dominance_info (CDI_DOMINATORS);

	  FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
	    if (control_flow_insn_p (insn))
	      {
		/* Split the block after insn.  There will be a fallthru
		   edge, which is OK so we keep it.  We have to create
		   the exception edges ourselves.  */
		bb = BLOCK_FOR_INSN (insn);
		split_block (bb, insn);
		rtl_make_eh_edge (NULL, bb, BB_END (bb));
	      }
	}

      df_process_deferred_rescans ();
    }

  FOR_EACH_VEC_ELT (loads, i, load)
    delete load;

  df_clear_flags (DF_DEFER_INSN_RESCAN);

  timevar_pop (TV_MACH_DEP);
  return 0;
}
4834 :
4835 : } // anon namespace
4836 :
/* Entry point to create the x86_cse RTL pass.  */

rtl_opt_pass *
make_pass_x86_cse (gcc::context *ctxt)
{
  return new pass_x86_cse (ctxt);
}
4842 :
/* Convert legacy instructions that clobber EFLAGS to APX_NF
   instructions when there is no flag set between a flag
   producer and user.  */

static unsigned int
ix86_apx_nf_convert (void)
{
  timevar_push (TV_MACH_DEP);

  basic_block bb;
  rtx_insn *insn;
  /* Maps each convertible insn to its original PARALLEL pattern.  */
  hash_map <rtx_insn *, rtx> converting_map;
  /* Candidates collected since the last cstore; dropped wholesale if
     a flag-clobbering insn without an NF form is seen.  */
  auto_vec <rtx_insn *> current_convert_list;

  bool converting_seq = false;
  rtx cc = gen_rtx_REG (CCmode, FLAGS_REG);

  FOR_EACH_BB_FN (bb, cfun)
    {
      /* Reset conversion for each bb.  */
      converting_seq = false;
      FOR_BB_INSNS (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  if (recog_memoized (insn) < 0)
	    continue;

	  /* Convert candidate insns after cstore, which should
	     satisfy the two conditions:
	     1. Is not flag user or producer, only clobbers
	     FLAGS_REG.
	     2. Have corresponding nf pattern.  */

	  rtx pat = PATTERN (insn);

	  /* Starting conversion at first cstorecc.  */
	  rtx set = NULL_RTX;
	  if (!converting_seq
	      && (set = single_set (insn))
	      && ix86_comparison_operator (SET_SRC (set), VOIDmode)
	      && reg_overlap_mentioned_p (cc, SET_SRC (set))
	      && !reg_overlap_mentioned_p (cc, SET_DEST (set)))
	    {
	      converting_seq = true;
	      current_convert_list.truncate (0);
	    }
	  /* Terminate at the next explicit flag set.  */
	  else if (reg_set_p (cc, pat)
		   && GET_CODE (set_of (cc, pat)) != CLOBBER)
	    converting_seq = false;

	  if (!converting_seq)
	    continue;

	  if (get_attr_has_nf (insn)
	      && GET_CODE (pat) == PARALLEL)
	    {
	      /* Record the insn to candidate map.  */
	      current_convert_list.safe_push (insn);
	      converting_map.put (insn, pat);
	    }
	  /* If the insn clobbers flags but has no nf_attr,
	     revoke all previous candidates.  */
	  else if (!get_attr_has_nf (insn)
		   && reg_set_p (cc, pat)
		   && GET_CODE (set_of (cc, pat)) == CLOBBER)
	    {
	      for (auto item : current_convert_list)
		converting_map.remove (item);
	      converting_seq = false;
	    }
	}
    }

  /* Rewrite every surviving candidate: drop the FLAGS clobber from
     its PARALLEL and re-recognize, picking up the NF variant.  */
  if (!converting_map.is_empty ())
    {
      for (auto iter = converting_map.begin ();
	   iter != converting_map.end (); ++iter)
	{
	  rtx_insn *replace = (*iter).first;
	  rtx pat = (*iter).second;
	  int i, n = 0, len = XVECLEN (pat, 0);
	  rtx *new_elems = XALLOCAVEC (rtx, len);
	  rtx new_pat;
	  /* Keep every element except the FLAGS clobber.  */
	  for (i = 0; i < len; i++)
	    {
	      rtx temp = XVECEXP (pat, 0, i);
	      if (! (GET_CODE (temp) == CLOBBER
		     && reg_overlap_mentioned_p (cc,
						 XEXP (temp, 0))))
		{
		  new_elems[n] = temp;
		  n++;
		}
	    }

	  if (n == 1)
	    new_pat = new_elems[0];
	  else
	    new_pat =
	      gen_rtx_PARALLEL (VOIDmode,
				gen_rtvec_v (n,
					     new_elems));

	  PATTERN (replace) = new_pat;
	  INSN_CODE (replace) = -1;
	  recog_memoized (replace);
	  df_insn_rescan (replace);
	}
    }

  timevar_pop (TV_MACH_DEP);
  return 0;
}
4959 :
4960 :
4961 : namespace {
4962 :
4963 : const pass_data pass_data_apx_nf_convert =
4964 : {
4965 : RTL_PASS, /* type */
4966 : "apx_nfcvt", /* name */
4967 : OPTGROUP_NONE, /* optinfo_flags */
4968 : TV_MACH_DEP, /* tv_id */
4969 : 0, /* properties_required */
4970 : 0, /* properties_provided */
4971 : 0, /* properties_destroyed */
4972 : 0, /* todo_flags_start */
4973 : 0, /* todo_flags_finish */
4974 : };
4975 :
4976 : class pass_apx_nf_convert : public rtl_opt_pass
4977 : {
4978 : public:
4979 285722 : pass_apx_nf_convert (gcc::context *ctxt)
4980 571444 : : rtl_opt_pass (pass_data_apx_nf_convert, ctxt)
4981 : {}
4982 :
4983 : /* opt_pass methods: */
4984 1471370 : bool gate (function *) final override
4985 : {
4986 1471370 : return (TARGET_APX_NF
4987 459 : && optimize
4988 1471821 : && optimize_function_for_speed_p (cfun));
4989 : }
4990 :
4991 367 : unsigned int execute (function *) final override
4992 : {
4993 367 : return ix86_apx_nf_convert ();
4994 : }
4995 : }; // class pass_apx_nf_convert
4996 :
4997 : } // anon namespace
4998 :
4999 : rtl_opt_pass *
5000 285722 : make_pass_apx_nf_convert (gcc::context *ctxt)
5001 : {
5002 285722 : return new pass_apx_nf_convert (ctxt);
5003 : }
5004 :
/* When a hot loop can be fit into one cacheline,
   force align the loop without considering the max skip.

   For every labelled basic block, use the edge profile counts to decide
   whether the block heads a sufficiently hot loop, then sum the minimum
   encoded size of the loop body.  If the whole loop fits in one cache
   line (ix86_cost->prefetch_block bytes) emit an align directive before
   the loop label.  */
static void
ix86_align_loops ()
{
  basic_block bb;

  /* Don't do this when we don't know cache line size.  */
  if (ix86_cost->prefetch_block == 0)
    return;

  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
  /* Blocks colder than count_max / param_align_threshold are not worth
     aligning.  */
  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
  FOR_EACH_BB_FN (bb, cfun)
    {
      rtx_insn *label = BB_HEAD (bb);
      bool has_fallthru = 0;
      edge e;
      edge_iterator ei;

      /* Only a block starting with a label can be an align target.  */
      if (!LABEL_P (label))
	continue;

      profile_count fallthru_count = profile_count::zero ();
      profile_count branch_count = profile_count::zero ();

      /* Split the incoming execution count between the fallthru edge
	 and the branch (typically loop back-edge) predecessors.  */
      FOR_EACH_EDGE (e, ei, bb->preds)
	{
	  if (e->flags & EDGE_FALLTHRU)
	    has_fallthru = 1, fallthru_count += e->count ();
	  else
	    branch_count += e->count ();
	}

      /* Without reliable profile counts we cannot judge hotness.  */
      if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
	continue;

      /* The block must belong to a loop (with a real latch).  With a
	 fallthru predecessor, require the block to be hot and the branch
	 count to dominate the fallthru count, since the padding nops
	 before the label would be executed on the fallthru path.
	 Without a fallthru, the inserted nops are never executed, so a
	 simpler hotness test suffices.  */
      if (bb->loop_father
	  && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
	  && (has_fallthru
	      ? (!(single_succ_p (bb)
		   && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
		 && optimize_bb_for_speed_p (bb)
		 && branch_count + fallthru_count > count_threshold
		 && (branch_count > fallthru_count * param_align_loop_iterations))
	      /* In case there's no fallthru for the loop.
		 Nops inserted won't be executed.  */
	      : (branch_count > count_threshold
		 || (bb->count > bb->prev_bb->count * 10
		     && (bb->prev_bb->count
			 <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
	{
	  rtx_insn* insn, *end_insn;
	  HOST_WIDE_INT size = 0;
	  bool padding_p = true;
	  basic_block tbb = bb;
	  unsigned cond_branch_num = 0;
	  bool detect_tight_loop_p = false;

	  /* Walk the blocks of the loop in layout order, accumulating
	     their minimum encoded size, until we either return to BB
	     (a tight loop) or hit a disqualifying condition.  */
	  for (unsigned int i = 0; i != bb->loop_father->num_nodes;
	       i++, tbb = tbb->next_bb)
	    {
	      /* Only handle continuous cfg layout.  */
	      if (bb->loop_father != tbb->loop_father)
		{
		  padding_p = false;
		  break;
		}

	      FOR_BB_INSNS (tbb, insn)
		{
		  if (!NONDEBUG_INSN_P (insn))
		    continue;
		  size += ix86_min_insn_size (insn);

		  /* We don't know size of inline asm.
		     Don't align loop for call.  */
		  if (asm_noperands (PATTERN (insn)) >= 0
		      || CALL_P (insn))
		    {
		      /* -1 is a sentinel meaning "give up on this loop".  */
		      size = -1;
		      break;
		    }
		}

	      /* Loop no longer fits in one cache line: stop.  */
	      if (size == -1 || size > ix86_cost->prefetch_block)
		{
		  padding_p = false;
		  break;
		}

	      FOR_EACH_EDGE (e, ei, tbb->succs)
		{
		  /* It could be part of the loop.  */
		  if (e->dest == bb)
		    {
		      detect_tight_loop_p = true;
		      break;
		    }
		}

	      if (detect_tight_loop_p)
		break;

	      end_insn = BB_END (tbb);
	      if (JUMP_P (end_insn))
		{
		  /* For decoded icache:
		     1. Up to two branches are allowed per Way.
		     2. A non-conditional branch is the last micro-op in a Way.
		  */
		  if (onlyjump_p (end_insn)
		      && (any_uncondjump_p (end_insn)
			  || single_succ_p (tbb)))
		    {
		      padding_p = false;
		      break;
		    }
		  else if (++cond_branch_num >= 2)
		    {
		      padding_p = false;
		      break;
		    }
		}

	    }

	  /* Loop fits in one cache line and closes back to BB: align the
	     loop head to 2^ceil_log2 (size) without a max-skip limit.  */
	  if (padding_p && detect_tight_loop_p)
	    {
	      emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
						    GEN_INT (0)), label);
	      /* End of function.  */
	      if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
		break;
	      /* Skip bb which already fits into one cacheline.  */
	      bb = tbb;
	    }
	}
    }

  loop_optimizer_finalize ();
  free_dominance_info (CDI_DOMINATORS);
}
5148 :
5149 : namespace {
5150 :
5151 : const pass_data pass_data_align_tight_loops =
5152 : {
5153 : RTL_PASS, /* type */
5154 : "align_tight_loops", /* name */
5155 : OPTGROUP_NONE, /* optinfo_flags */
5156 : TV_MACH_DEP, /* tv_id */
5157 : 0, /* properties_required */
5158 : 0, /* properties_provided */
5159 : 0, /* properties_destroyed */
5160 : 0, /* todo_flags_start */
5161 : 0, /* todo_flags_finish */
5162 : };
5163 :
5164 : class pass_align_tight_loops : public rtl_opt_pass
5165 : {
5166 : public:
5167 285722 : pass_align_tight_loops (gcc::context *ctxt)
5168 571444 : : rtl_opt_pass (pass_data_align_tight_loops, ctxt)
5169 : {}
5170 :
5171 : /* opt_pass methods: */
5172 1471370 : bool gate (function *) final override
5173 : {
5174 1471370 : return TARGET_ALIGN_TIGHT_LOOPS
5175 1470884 : && optimize
5176 2514573 : && optimize_function_for_speed_p (cfun);
5177 : }
5178 :
5179 978644 : unsigned int execute (function *) final override
5180 : {
5181 978644 : timevar_push (TV_MACH_DEP);
5182 : #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
5183 978644 : ix86_align_loops ();
5184 : #endif
5185 978644 : timevar_pop (TV_MACH_DEP);
5186 978644 : return 0;
5187 : }
5188 : }; // class pass_align_tight_loops
5189 :
5190 : } // anon namespace
5191 :
5192 : rtl_opt_pass *
5193 285722 : make_pass_align_tight_loops (gcc::context *ctxt)
5194 : {
5195 285722 : return new pass_align_tight_loops (ctxt);
5196 : }
5197 :
5198 : /* This compares the priority of target features in function DECL1
5199 : and DECL2. It returns positive value if DECL1 is higher priority,
5200 : negative value if DECL2 is higher priority and 0 if they are the
5201 : same. */
5202 :
5203 : int
5204 5737 : ix86_compare_version_priority (tree decl1, tree decl2)
5205 : {
5206 5737 : unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
5207 5737 : unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
5208 :
5209 5737 : return (int)priority1 - (int)priority2;
5210 : }
5211 :
5212 : /* V1 and V2 point to function versions with different priorities
5213 : based on the target ISA. This function compares their priorities. */
5214 :
5215 : static int
5216 6830 : feature_compare (const void *v1, const void *v2)
5217 : {
5218 6830 : typedef struct _function_version_info
5219 : {
5220 : tree version_decl;
5221 : tree predicate_chain;
5222 : unsigned int dispatch_priority;
5223 : } function_version_info;
5224 :
5225 6830 : const function_version_info c1 = *(const function_version_info *)v1;
5226 6830 : const function_version_info c2 = *(const function_version_info *)v2;
5227 6830 : return (c2.dispatch_priority - c1.dispatch_priority);
5228 : }
5229 :
/* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
   to return a pointer to VERSION_DECL if the outcome of the expression
   formed by PREDICATE_CHAIN is true.  This function will be called during
   version dispatch to decide which function version to execute.  It returns
   the basic block at the end, to which more conditions can be added.

   PREDICATE_CHAIN is a TREE_LIST whose TREE_PURPOSE is a predicate
   function decl and TREE_VALUE its argument; all predicates in the
   chain must be non-zero for the version to be selected.  A NULL
   PREDICATE_CHAIN denotes the unconditional default version.  */

static basic_block
add_condition_to_bb (tree function_decl, tree version_decl,
		     tree predicate_chain, basic_block new_bb)
{
  gimple *return_stmt;
  tree convert_expr, result_var;
  gimple *convert_stmt;
  gimple *call_cond_stmt;
  gimple *if_else_stmt;

  basic_block bb1, bb2, bb3;
  edge e12, e23;

  tree cond_var, and_expr_var = NULL_TREE;
  gimple_seq gseq;

  tree predicate_decl, predicate_arg;

  /* All GIMPLE/CFG edits below happen inside the dispatcher body.  */
  push_cfun (DECL_STRUCT_FUNCTION (function_decl));

  gcc_assert (new_bb != NULL);
  gseq = bb_seq (new_bb);


  /* Build "result_var = (void *) &version_decl; return result_var;".  */
  convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
			 build_fold_addr_expr (version_decl));
  result_var = create_tmp_var (ptr_type_node);
  convert_stmt = gimple_build_assign (result_var, convert_expr);
  return_stmt = gimple_build_return (result_var);

  /* The default version: return it unconditionally, no CFG split.  */
  if (predicate_chain == NULL_TREE)
    {
      gimple_seq_add_stmt (&gseq, convert_stmt);
      gimple_seq_add_stmt (&gseq, return_stmt);
      set_bb_seq (new_bb, gseq);
      gimple_set_bb (convert_stmt, new_bb);
      gimple_set_bb (return_stmt, new_bb);
      pop_cfun ();
      return new_bb;
    }

  /* Emit one predicate call per chain entry, folding the results into
     AND_EXPR_VAR.  */
  while (predicate_chain != NULL)
    {
      cond_var = create_tmp_var (integer_type_node);
      predicate_decl = TREE_PURPOSE (predicate_chain);
      predicate_arg = TREE_VALUE (predicate_chain);
      call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
      gimple_call_set_lhs (call_cond_stmt, cond_var);

      gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
      gimple_set_bb (call_cond_stmt, new_bb);
      gimple_seq_add_stmt (&gseq, call_cond_stmt);

      predicate_chain = TREE_CHAIN (predicate_chain);

      if (and_expr_var == NULL)
	and_expr_var = cond_var;
      else
	{
	  gimple *assign_stmt;
	  /* Use MIN_EXPR as a logical AND: the result is zero iff any
	     predicate returned zero.
	     and_expr_var = min_expr <cond_var, and_expr_var> */
	  assign_stmt = gimple_build_assign (and_expr_var,
					     build2 (MIN_EXPR, integer_type_node,
						     cond_var, and_expr_var));

	  gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
	  gimple_set_bb (assign_stmt, new_bb);
	  gimple_seq_add_stmt (&gseq, assign_stmt);
	}
    }

  /* if (and_expr_var > 0) return version; else fall through.  */
  if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
				    integer_zero_node,
				    NULL_TREE, NULL_TREE);
  gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
  gimple_set_bb (if_else_stmt, new_bb);
  gimple_seq_add_stmt (&gseq, if_else_stmt);

  gimple_seq_add_stmt (&gseq, convert_stmt);
  gimple_seq_add_stmt (&gseq, return_stmt);
  set_bb_seq (new_bb, gseq);

  /* Split twice: bb1 ends at the condition, bb2 holds the return, and
     bb3 becomes the fall-through block for the next condition.  */
  bb1 = new_bb;
  e12 = split_block (bb1, if_else_stmt);
  bb2 = e12->dest;
  e12->flags &= ~EDGE_FALLTHRU;
  e12->flags |= EDGE_TRUE_VALUE;

  e23 = split_block (bb2, return_stmt);

  gimple_set_bb (convert_stmt, bb2);
  gimple_set_bb (return_stmt, bb2);

  bb3 = e23->dest;
  /* Failed predicate falls through to the next candidate in bb3.  */
  make_edge (bb1, bb3, EDGE_FALSE_VALUE);

  /* bb2 returns, so it exits the function rather than reaching bb3.  */
  remove_edge (e23);
  make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);

  pop_cfun ();

  return bb3;
}
5340 :
/* This function generates the dispatch function for
   multi-versioned functions.  DISPATCH_DECL is the function which will
   contain the dispatch logic.  FNDECLS are the function choices for
   dispatch, and is a tree chain.  EMPTY_BB is the basic block pointer
   in DISPATCH_DECL in which the dispatch code is generated.

   The first element of FNDECLS is the default version; the remaining
   versions are sorted by descending dispatch priority and a predicate
   test is emitted for each.  Always returns 0.  */

static int
dispatch_function_versions (tree dispatch_decl,
			    void *fndecls_p,
			    basic_block *empty_bb)
{
  tree default_decl;
  gimple *ifunc_cpu_init_stmt;
  gimple_seq gseq;
  int ix;
  tree ele;
  vec<tree> *fndecls;
  unsigned int num_versions = 0;
  unsigned int actual_versions = 0;
  unsigned int i;

  /* Same layout as the record read by feature_compare.  */
  struct _function_version_info
    {
      tree version_decl;
      tree predicate_chain;
      unsigned int dispatch_priority;
    }*function_version_info;

  gcc_assert (dispatch_decl != NULL
	      && fndecls_p != NULL
	      && empty_bb != NULL);

  /* fndecls_p is actually a vector.  */
  fndecls = static_cast<vec<tree> *> (fndecls_p);

  /* At least one more version other than the default.  */
  num_versions = fndecls->length ();
  gcc_assert (num_versions >= 2);

  function_version_info = (struct _function_version_info *)
    XNEWVEC (struct _function_version_info, (num_versions - 1));

  /* The first version in the vector is the default decl.  */
  default_decl = (*fndecls)[0];

  push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));

  gseq = bb_seq (*empty_bb);
  /* Function version dispatch is via IFUNC.  IFUNC resolvers fire before
     constructors, so explicitly call __builtin_cpu_init here.  */
  ifunc_cpu_init_stmt
    = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
  gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
  gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
  set_bb_seq (*empty_bb, gseq);

  pop_cfun ();


  /* Collect the non-default versions and their predicate chains;
     versions whose predicate chain is empty are skipped (they are
     handled by the default at the end).  */
  for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
    {
      tree version_decl = ele;
      tree predicate_chain = NULL_TREE;
      unsigned int priority;
      /* Get attribute string, parse it and find the right predicate decl.
	 The predicate function could be a lengthy combination of many
	 features, like arch-type and various isa-variants.  */
      priority = get_builtin_code_for_version (version_decl,
					       &predicate_chain);

      if (predicate_chain == NULL_TREE)
	continue;

      function_version_info [actual_versions].version_decl = version_decl;
      function_version_info [actual_versions].predicate_chain
	= predicate_chain;
      function_version_info [actual_versions].dispatch_priority = priority;
      actual_versions++;
    }

  /* Sort the versions according to descending order of dispatch priority.  The
     priority is based on the ISA.  This is not a perfect solution.  There
     could still be ambiguity.  If more than one function version is suitable
     to execute, which one should be dispatched?  In future, allow the user
     to specify a dispatch priority next to the version.  */
  qsort (function_version_info, actual_versions,
	 sizeof (struct _function_version_info), feature_compare);

  /* Emit one predicate-guarded return per version; each call advances
     EMPTY_BB to the fall-through block for the next test.  */
  for  (i = 0; i < actual_versions; ++i)
    *empty_bb = add_condition_to_bb (dispatch_decl,
				     function_version_info[i].version_decl,
				     function_version_info[i].predicate_chain,
				     *empty_bb);

  /* dispatch default version at the end.  */
  *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
				   NULL, *empty_bb);

  free (function_version_info);
  return 0;
}
5442 :
/* This function changes the assembler name for functions that are
   versions.  If DECL is a function version and has a "target"
   attribute, it appends the attribute string to its assembler name.

   Emits an error for gnu_inline versions (their bodies must exist) and
   a sorry () for virtual functions, which are not supported.  Returns
   the new identifier derived from ID.  */

static tree
ix86_mangle_function_version_assembler_name (tree decl, tree id)
{
  tree version_attr;
  char *attr_str;

  if (DECL_DECLARED_INLINE_P (decl)
      && lookup_attribute ("gnu_inline",
			   DECL_ATTRIBUTES (decl)))
    error_at (DECL_SOURCE_LOCATION (decl),
	      "function versions cannot be marked as %<gnu_inline%>,"
	      " bodies have to be generated");

  if (DECL_VIRTUAL_P (decl)
      || DECL_VINDEX (decl))
    sorry ("virtual function multiversioning not supported");

  version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));

  /* target attribute string cannot be NULL.  */
  gcc_assert (version_attr != NULL_TREE);

  /* Canonicalize the attribute string so equivalent target strings
     mangle identically.  */
  attr_str = sorted_attr_string (TREE_VALUE (version_attr));

  /* Allow assembler name to be modified if already set.  */
  if (DECL_ASSEMBLER_NAME_SET_P (decl))
    SET_DECL_RTL (decl, NULL);

  tree ret = clone_identifier (id, attr_str, true);

  /* sorted_attr_string returns heap memory; release it.  */
  XDELETEVEC (attr_str);

  return ret;
}
5481 :
/* Target hook: adjust the assembler name ID for DECL.  Function
   versions get the sorted target string appended; ifunc dispatchers
   and resolvers for target_clones get ".ifunc" / ".resolver"
   suffixes.  Non-function decls pass through unchanged (modulo the
   subtarget hook).  */
tree
ix86_mangle_decl_assembler_name (tree decl, tree id)
{
  /* For function version, add the target suffix to the assembler name.  */
  if (TREE_CODE (decl) == FUNCTION_DECL)
    {
      cgraph_node *node = cgraph_node::get (decl);
      /* Mangle all versions when annotated with target_clones, but only
	 non-default versions when annotated with target attributes.  */
      /* NOTE(review): this branch dereferences NODE without the NULL
	 check the branches below perform — presumably a cgraph node
	 always exists for versioned decls; confirm.  */
      if (DECL_FUNCTION_VERSIONED (decl)
	  && (node->is_target_clone
	      || !is_function_default_version (node->decl)))
	id = ix86_mangle_function_version_assembler_name (decl, id);
      /* Mangle the dispatched symbol but only in the case of target clones.  */
      else if (node && node->dispatcher_function && !node->is_target_clone)
	id = clone_identifier (id, "ifunc");
      else if (node && node->dispatcher_resolver_function)
	id = clone_identifier (id, "resolver");
    }
#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
  id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
#endif

  return id;
}
5507 :
/* Make a dispatcher declaration for the multi-versioned function DECL.
   Calls to DECL function will be replaced with calls to the dispatcher
   by the front-end.  Returns the decl of the dispatcher function.

   Returns the already-created dispatcher when one exists, NULL when
   the version chain lacks a default version, and errors out when the
   target does not support ifunc.  */

tree
ix86_get_function_versions_dispatcher (void *decl)
{
  tree fn = (tree) decl;
  struct cgraph_node *node = NULL;
  struct cgraph_node *default_node = NULL;
  struct cgraph_function_version_info *node_v = NULL;

  tree dispatch_decl = NULL;

  struct cgraph_function_version_info *default_version_info = NULL;

  gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));

  node = cgraph_node::get (fn);
  gcc_assert (node != NULL);

  node_v = node->function_version ();
  gcc_assert (node_v != NULL);

  /* Reuse a dispatcher created by an earlier call.  */
  if (node_v->dispatcher_resolver != NULL)
    return node_v->dispatcher_resolver;

  /* The default node is always the beginning of the chain.  */
  default_version_info = node_v;
  while (default_version_info->prev != NULL)
    default_version_info = default_version_info->prev;
  default_node = default_version_info->this_node;

  /* If there is no default node, just return NULL.  */
  if (!is_function_default_version (default_node->decl))
    return NULL;

#if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
  if (targetm.has_ifunc_p ())
    {
      struct cgraph_function_version_info *it_v = NULL;

      /* Right now, the dispatching is done via ifunc.  */
      dispatch_decl = make_dispatcher_decl (default_node->decl);

      /* Set the dispatcher for all the versions.  */
      it_v = default_version_info;
      while (it_v != NULL)
	{
	  it_v->dispatcher_resolver = dispatch_decl;
	  it_v = it_v->next;
	}
    }
  else
#endif
    {
      error_at (DECL_SOURCE_LOCATION (default_node->decl),
		"multiversioning needs %<ifunc%> which is not supported "
		"on this target");
    }

  /* NULL when ifunc is unavailable (the error path above).  */
  return dispatch_decl;
}
5571 :
/* Make the resolver function decl to dispatch the versions of
   a multi-versioned function, DEFAULT_DECL.  IFUNC_ALIAS_DECL is
   ifunc alias that will point to the created resolver.  Create an
   empty basic block in the resolver and store the pointer in
   EMPTY_BB.  Return the decl of the resolver function.  */

static tree
make_resolver_func (const tree default_decl,
		    const tree ifunc_alias_decl,
		    basic_block *empty_bb)
{
  tree decl, type, t;

  /* The resolver function should return a (void *).  */
  type = build_function_type_list (ptr_type_node, NULL_TREE);

  cgraph_node *node = cgraph_node::get (default_decl);
  gcc_assert (node && node->function_version ());

  decl = build_fn_decl (IDENTIFIER_POINTER (DECL_NAME (default_decl)), type);

  /* Set the assembler name to prevent cgraph_node attempting to mangle.  */
  SET_DECL_ASSEMBLER_NAME (decl, DECL_ASSEMBLER_NAME (default_decl));

  cgraph_node *resolver_node = cgraph_node::get_create (decl);
  resolver_node->dispatcher_resolver_function = true;

  /* Propagate the target_clones flag so mangling picks the right
     suffixing scheme.  */
  if (node->is_target_clone)
    resolver_node->is_target_clone = true;

  /* Now mangle for real: appends ".resolver" (or the clone suffix)
     via ix86_mangle_decl_assembler_name.  */
  tree id = ix86_mangle_decl_assembler_name
    (decl, node->function_version ()->assembler_name);
  SET_DECL_ASSEMBLER_NAME (decl, id);

  /* The resolver is compiler-generated: hidden from debug output,
     local, and never inlined.  */
  DECL_NAME (decl) = DECL_NAME (default_decl);
  TREE_USED (decl) = 1;
  DECL_ARTIFICIAL (decl) = 1;
  DECL_IGNORED_P (decl) = 1;
  TREE_PUBLIC (decl) = 0;
  DECL_UNINLINABLE (decl) = 1;

  /* Resolver is not external, body is generated.  */
  DECL_EXTERNAL (decl) = 0;
  DECL_EXTERNAL (ifunc_alias_decl) = 0;

  DECL_CONTEXT (decl) = NULL_TREE;
  DECL_INITIAL (decl) = make_node (BLOCK);
  DECL_STATIC_CONSTRUCTOR (decl) = 0;

  if (DECL_COMDAT_GROUP (default_decl)
      || TREE_PUBLIC (default_decl))
    {
      /* In this case, each translation unit with a call to this
	 versioned function will put out a resolver.  Ensure it
	 is comdat to keep just one copy.  */
      DECL_COMDAT (decl) = 1;
      make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
    }
  else
    TREE_PUBLIC (ifunc_alias_decl) = 0;

  /* Build result decl and add to function_decl.  */
  t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
  DECL_CONTEXT (t) = decl;
  DECL_ARTIFICIAL (t) = 1;
  DECL_IGNORED_P (t) = 1;
  DECL_RESULT (decl) = t;

  /* Lower the empty body and hand the caller its single basic block
     so dispatch code can be emitted into it.  */
  gimplify_function_tree (decl);
  push_cfun (DECL_STRUCT_FUNCTION (decl));
  *empty_bb = init_lowered_empty_function (decl, false,
					   profile_count::uninitialized ());

  cgraph_node::add_new_function (decl, true);
  symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));

  pop_cfun ();

  gcc_assert (ifunc_alias_decl != NULL);
  /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name.  */
  DECL_ATTRIBUTES (ifunc_alias_decl)
    = make_attribute ("ifunc", IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)),
		      DECL_ATTRIBUTES (ifunc_alias_decl));

  /* Create the alias for dispatch to resolver here.  */
  cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
  return decl;
}
5660 :
/* Generate the dispatching code body to dispatch multi-versioned function
   DECL.  The target hook is called to process the "target" attributes and
   provide the code to dispatch the right function at run-time.  NODE points
   to the dispatcher decl whose body will be created.

   Creates the resolver function (see make_resolver_func), fills it with
   predicate-guarded returns for every version, and returns the resolver
   decl.  Idempotent: a previously built resolver is returned as is.  */

tree
ix86_generate_version_dispatcher_body (void *node_p)
{
  tree resolver_decl;
  basic_block empty_bb;
  tree default_ver_decl;
  struct cgraph_node *versn;
  struct cgraph_node *node;

  struct cgraph_function_version_info *node_version_info = NULL;
  struct cgraph_function_version_info *versn_info = NULL;

  node = (cgraph_node *)node_p;

  node_version_info = node->function_version ();
  gcc_assert (node->dispatcher_function
	      && node_version_info != NULL);

  /* Already built: nothing to do.  */
  if (node_version_info->dispatcher_resolver)
    return node_version_info->dispatcher_resolver;

  /* The first version in the chain corresponds to the default version.  */
  default_ver_decl = node_version_info->next->this_node->decl;

  /* node is going to be an alias, so remove the finalized bit.  */
  node->definition = false;

  resolver_decl = make_resolver_func (default_ver_decl,
				      node->decl, &empty_bb);

  /* Record the resolver before emitting its body so recursive queries
     see it.  */
  node_version_info->dispatcher_resolver = resolver_decl;

  push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));

  auto_vec<tree, 2> fn_ver_vec;

  /* Gather all version decls (default first) for dispatch emission.  */
  for (versn_info = node_version_info->next; versn_info;
       versn_info = versn_info->next)
    {
      versn = versn_info->this_node;
      /* Check for virtual functions here again, as by this time it should
	 have been determined if this function needs a vtable index or
	 not.  This happens for methods in derived classes that override
	 virtual methods in base classes but are not explicitly marked as
	 virtual.  */
      if (DECL_VINDEX (versn->decl))
	sorry ("virtual function multiversioning not supported");

      fn_ver_vec.safe_push (versn->decl);
    }

  dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
  /* The emitted predicate calls are new edges; rebuild the callgraph.  */
  cgraph_edge::rebuild_edges ();
  pop_cfun ();
  return resolver_decl;
}
5722 :
5723 :
|