Line data Source code
1 : /* Copyright (C) 1988-2026 Free Software Foundation, Inc.
2 :
3 : This file is part of GCC.
4 :
5 : GCC is free software; you can redistribute it and/or modify
6 : it under the terms of the GNU General Public License as published by
7 : the Free Software Foundation; either version 3, or (at your option)
8 : any later version.
9 :
10 : GCC is distributed in the hope that it will be useful,
11 : but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : GNU General Public License for more details.
14 :
15 : You should have received a copy of the GNU General Public License
16 : along with GCC; see the file COPYING3. If not see
17 : <http://www.gnu.org/licenses/>. */
18 :
19 : #define IN_TARGET_CODE 1
20 :
21 : #include "config.h"
22 : #include "system.h"
23 : #include "coretypes.h"
24 : #include "backend.h"
25 : #include "rtl.h"
26 : #include "tree.h"
27 : #include "memmodel.h"
28 : #include "gimple.h"
29 : #include "cfghooks.h"
30 : #include "cfgloop.h"
31 : #include "df.h"
32 : #include "tm_p.h"
33 : #include "stringpool.h"
34 : #include "expmed.h"
35 : #include "optabs.h"
36 : #include "regs.h"
37 : #include "emit-rtl.h"
38 : #include "recog.h"
39 : #include "cgraph.h"
40 : #include "diagnostic.h"
41 : #include "cfgbuild.h"
42 : #include "alias.h"
43 : #include "fold-const.h"
44 : #include "attribs.h"
45 : #include "calls.h"
46 : #include "stor-layout.h"
47 : #include "varasm.h"
48 : #include "output.h"
49 : #include "insn-attr.h"
50 : #include "flags.h"
51 : #include "except.h"
52 : #include "explow.h"
53 : #include "expr.h"
54 : #include "cfgrtl.h"
55 : #include "common/common-target.h"
56 : #include "langhooks.h"
57 : #include "reload.h"
58 : #include "gimplify.h"
59 : #include "dwarf2.h"
60 : #include "tm-constrs.h"
61 : #include "cselib.h"
62 : #include "sched-int.h"
63 : #include "opts.h"
64 : #include "tree-pass.h"
65 : #include "context.h"
66 : #include "pass_manager.h"
67 : #include "target-globals.h"
68 : #include "gimple-iterator.h"
69 : #include "shrink-wrap.h"
70 : #include "builtins.h"
71 : #include "rtl-iter.h"
72 : #include "tree-iterator.h"
73 : #include "dbgcnt.h"
74 : #include "case-cfn-macros.h"
75 : #include "dojump.h"
76 : #include "fold-const-call.h"
77 : #include "tree-vrp.h"
78 : #include "tree-ssanames.h"
79 : #include "selftest.h"
80 : #include "selftest-rtl.h"
81 : #include "print-rtl.h"
82 : #include "intl.h"
83 : #include "ifcvt.h"
84 : #include "symbol-summary.h"
85 : #include "sreal.h"
86 : #include "ipa-cp.h"
87 : #include "ipa-prop.h"
88 : #include "ipa-fnsummary.h"
89 : #include "wide-int-bitmask.h"
90 : #include "tree-vector-builder.h"
91 : #include "debug.h"
92 : #include "dwarf2out.h"
93 : #include "i386-builtins.h"
94 : #include "i386-features.h"
95 : #include "i386-expand.h"
96 :
/* Base names of the out-of-line ms2sysv save/restore stubs, indexed by
   enum xlogue_stub.  get_stub_name combines each base name with an
   "avx"/"sse" prefix and the managed-register count to form the final
   symbol (e.g. "__avx_savms64_12").  */
const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
  "savms64",
  "resms64",
  "resms64x",
  "savms64f",
  "resms64f",
  "resms64fx"
};

/* Order in which the stubs save/restore registers; the constructor walks
   this table to assign each register its stack slot.  */
const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
/* The below offset values are where each register is stored for the layout
   relative to incoming stack pointer.  The value of each m_regs[].offset will
   be relative to the incoming base pointer (rax or rsi) used by the stub.

    s_instances:   0		1		2		3
    Offset:					realigned or	aligned + 8
    Register	   aligned	aligned + 8	aligned w/HFP	w/HFP
    XMM15 */ XMM15_REG, /* 0x10	0x18		0x10		0x18 */
  XMM14_REG,		/* 0x20	0x28		0x20		0x28 */
  XMM13_REG,		/* 0x30	0x38		0x30		0x38 */
  XMM12_REG,		/* 0x40	0x48		0x40		0x48 */
  XMM11_REG,		/* 0x50	0x58		0x50		0x58 */
  XMM10_REG,		/* 0x60	0x68		0x60		0x68 */
  XMM9_REG,		/* 0x70	0x78		0x70		0x78 */
  XMM8_REG,		/* 0x80	0x88		0x80		0x88 */
  XMM7_REG,		/* 0x90	0x98		0x90		0x98 */
  XMM6_REG,		/* 0xa0	0xa8		0xa0		0xa8 */
  SI_REG,		/* 0xa8	0xb0		0xa8		0xb0 */
  DI_REG,		/* 0xb0	0xb8		0xb0		0xb8 */
  BX_REG,		/* 0xb8	0xc0		0xb8		0xc0 */
  BP_REG,		/* 0xc0	0xc8		N/A		N/A  (skipped when a hard frame pointer is in use) */
  R12_REG,		/* 0xc8	0xd0		0xc0		0xc8 */
  R13_REG,		/* 0xd0	0xd8		0xc8		0xd0 */
  R14_REG,		/* 0xd8	0xe0		0xd0		0xd8 */
  R15_REG,		/* 0xe0	0xe8		0xd8		0xe0 */
};

/* Instantiate static const values.  */
const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
const unsigned xlogue_layout::MIN_REGS;
const unsigned xlogue_layout::MAX_REGS;
const unsigned xlogue_layout::MAX_EXTRA_REGS;
const unsigned xlogue_layout::VARIANT_COUNT;
const unsigned xlogue_layout::STUB_NAME_MAX_LEN;

/* Initialize xlogue_layout::s_stub_names to zero; names are filled in
   lazily by get_stub_name.  First index selects SSE (0) vs AVX (1).  */
char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
				[STUB_NAME_MAX_LEN];

/* Instantiates all xlogue_layout instances: (pad-in offset 0 or 8)
   crossed with (no hard frame pointer, hard frame pointer).  */
const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
  xlogue_layout (0, false),
  xlogue_layout (8, false),
  xlogue_layout (0, true),
  xlogue_layout (8, true)
};
153 :
154 : /* Return an appropriate const instance of xlogue_layout based upon values
155 : in cfun->machine and crtl. */
156 : const class xlogue_layout &
157 49891 : xlogue_layout::get_instance ()
158 : {
159 49891 : enum xlogue_stub_sets stub_set;
160 49891 : bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
161 :
162 49891 : if (stack_realign_fp)
163 : stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
164 40910 : else if (frame_pointer_needed)
165 25246 : stub_set = aligned_plus_8
166 31552 : ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
167 : : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
168 : else
169 9358 : stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
170 :
171 49891 : return s_instances[stub_set];
172 : }
173 :
174 : /* Determine how many clobbered registers can be saved by the stub.
175 : Returns the count of registers the stub will save and restore. */
176 : unsigned
177 35225 : xlogue_layout::count_stub_managed_regs ()
178 : {
179 35225 : bool hfp = frame_pointer_needed || stack_realign_fp;
180 35225 : unsigned i, count;
181 35225 : unsigned regno;
182 :
183 94890 : for (count = i = MIN_REGS; i < MAX_REGS; ++i)
184 : {
185 93670 : regno = REG_ORDER[i];
186 93670 : if (regno == BP_REG && hfp)
187 18200 : continue;
188 75470 : if (!ix86_save_reg (regno, false, false))
189 : break;
190 41465 : ++count;
191 : }
192 35225 : return count;
193 : }
194 :
/* Determine if register REGNO is a stub managed register given the
   total COUNT of stub managed registers.  */
bool
xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
{
  bool hfp = frame_pointer_needed || stack_realign_fp;
  unsigned i;

  for (i = 0; i < count; ++i)
    {
      gcc_assert (i < MAX_REGS);
      /* When a hard frame pointer is in use BP is not stub-managed, so
	 extend the scan by one slot to still examine COUNT managed
	 registers from REG_ORDER.  */
      if (REG_ORDER[i] == BP_REG && hfp)
	++count;
      else if (REG_ORDER[i] == regno)
	return true;
    }
  return false;
}
213 :
/* Constructor for xlogue_layout.  STACK_ALIGN_OFF_IN is the additional
   offset (0 or 8) of the incoming stack relative to alignment; HFP says
   whether a hard frame pointer is in use (in which case BP gets no slot
   and only 17 registers are managed).  Computes each managed register's
   offset relative to the stub's base pointer.  */
xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
  : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
    m_stack_align_off_in (stack_align_off_in)
{
  HOST_WIDE_INT offset = stack_align_off_in;
  unsigned i, j;

  /* I indexes REG_ORDER, J indexes m_regs; they diverge when BP is
     skipped for the HFP layouts.  */
  for (i = j = 0; i < MAX_REGS; ++i)
    {
      unsigned regno = REG_ORDER[i];

      if (regno == BP_REG && hfp)
	continue;
      if (SSE_REGNO_P (regno))
	{
	  /* SSE registers occupy a 16-byte slot.  */
	  offset += 16;
	  /* Verify that SSE regs are always aligned.  */
	  gcc_assert (!((stack_align_off_in + offset) & 15));
	}
      else
	/* GPRs occupy an 8-byte slot.  */
	offset += 8;

      m_regs[j].regno = regno;
      m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
    }
  gcc_assert (j == m_nregs);
}
242 :
/* Return the symbol name for stub STUB handling MIN_REGS + N_EXTRA_REGS
   registers, e.g. "__avx_savms64_12".  Names are formatted once into the
   static s_stub_names cache and reused on subsequent calls.  */
const char *
xlogue_layout::get_stub_name (enum xlogue_stub stub,
			      unsigned n_extra_regs)
{
  const int have_avx = TARGET_AVX;
  /* NAME points into the static cache; a leading NUL means "not yet
     formatted".  */
  char *name = s_stub_names[!!have_avx][stub][n_extra_regs];

  /* Lazy init */
  if (!*name)
    {
      int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
			  (have_avx ? "avx" : "sse"),
			  STUB_BASE_NAMES[stub],
			  MIN_REGS + n_extra_regs);
      /* snprintf returns the would-be length; ensure no truncation.  */
      gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
    }

  return name;
}
262 :
263 : /* Return rtx of a symbol ref for the entry point (based upon
264 : cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
265 : rtx
266 14666 : xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
267 : {
268 14666 : const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
269 14666 : gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
270 14666 : gcc_assert (stub < XLOGUE_STUB_COUNT);
271 14666 : gcc_assert (crtl->stack_realign_finalized);
272 :
273 14666 : return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
274 : }
275 :
276 : unsigned scalar_chain::max_id = 0;
277 :
278 : namespace {
279 :
280 : /* Initialize new chain. */
281 :
282 6369565 : scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
283 : {
284 6369565 : smode = smode_;
285 6369565 : vmode = vmode_;
286 :
287 6369565 : chain_id = ++max_id;
288 :
289 6369565 : if (dump_file)
290 136 : fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
291 :
292 6369565 : bitmap_obstack_initialize (NULL);
293 6369565 : insns = BITMAP_ALLOC (NULL);
294 6369565 : defs = BITMAP_ALLOC (NULL);
295 6369565 : defs_conv = BITMAP_ALLOC (NULL);
296 6369565 : insns_conv = BITMAP_ALLOC (NULL);
297 6369565 : queue = NULL;
298 :
299 6369565 : cost_sse_integer = 0;
300 6369565 : weighted_cost_sse_integer = 0 ;
301 6369565 : max_visits = x86_stv_max_visits;
302 6369565 : }
303 :
304 : /* Free chain's data. */
305 :
306 6369565 : scalar_chain::~scalar_chain ()
307 : {
308 6369565 : BITMAP_FREE (insns);
309 6369565 : BITMAP_FREE (defs);
310 6369565 : BITMAP_FREE (defs_conv);
311 6369565 : BITMAP_FREE (insns_conv);
312 6369565 : bitmap_obstack_release (NULL);
313 6369565 : }
314 :
315 : /* Add instruction into chains' queue. */
316 :
317 : void
318 8246227 : scalar_chain::add_to_queue (unsigned insn_uid)
319 : {
320 8246227 : if (!bitmap_set_bit (queue, insn_uid))
321 : return;
322 :
323 6221467 : if (dump_file)
324 141 : fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
325 : insn_uid, chain_id);
326 : }
327 :
/* For DImode conversion, mark register defined by DEF as requiring
   conversion.  Accumulates into cost_sse_integer and (for blocks
   optimized for speed) weighted_cost_sse_integer the cost of the
   inter-unit moves the conversion will need.  */

void
scalar_chain::mark_dual_mode_def (df_ref def)
{
  gcc_assert (DF_REF_REG_DEF_P (def));

  /* Record the def/insn pair so we can later efficiently iterate over
     the defs to convert on insns not in the chain.  */
  bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
  basic_block bb = BLOCK_FOR_INSN (DF_REF_INSN (def));
  profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
  bool speed_p = optimize_bb_for_speed_p (bb);
  int cost = 0;

  if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
    {
      /* Def is outside the chain: the value must be moved from the
	 integer side to the SSE side.  Nothing more to do when both
	 the insn and the register were already recorded.  */
      if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
	  && !reg_new)
	return;

      /* Cost integer to sse moves.  */
      if (speed_p)
	cost = COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2;
      else if (TARGET_64BIT || smode == SImode)
	cost = COSTS_N_BYTES (4);
      /* vmovd (4 bytes) + vpinsrd (6 bytes).  */
      else if (TARGET_SSE4_1)
	cost = COSTS_N_BYTES (10);
      /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes).  */
      else
	cost = COSTS_N_BYTES (12);
    }
  else
    {
      /* Def is inside the chain: the value must also be copied back to
	 the integer side; only costed the first time the register is
	 seen.  */
      if (!reg_new)
	return;

      /* Cost sse to integer moves.  */
      if (speed_p)
	cost = COSTS_N_INSNS (ix86_cost->sse_to_integer) / 2;
      else if (TARGET_64BIT || smode == SImode)
	cost = COSTS_N_BYTES (4);
      /* vmovd (4 bytes) + vpextrd (6 bytes).  */
      else if (TARGET_SSE4_1)
	cost = COSTS_N_BYTES (10);
      /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes).  */
      else
	cost = COSTS_N_BYTES (13);
    }

  /* Weight the cost by the block's execution frequency relative to the
     function entry when optimizing for speed.  */
  if (speed_p)
    weighted_cost_sse_integer += bb->count.to_sreal_scale (entry_count) * cost;

  cost_sse_integer += cost;

  if (dump_file)
    fprintf (dump_file,
	     " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
	     DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
}
390 :
/* Check REF's chain to add new insns into a queue
   and find registers requiring conversion.  Return true if OK, false
   if the analysis was aborted (visit budget exhausted or a previously
   disallowed insn was reached).  */

bool
scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref,
				      bitmap disallowed)
{
  df_link *chain;
  bool mark_def = false;

  /* REF must belong to an insn already in this chain.  */
  gcc_checking_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)));

  for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
    {
      unsigned uid = DF_REF_INSN_UID (chain->ref);

      if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
	continue;

      /* Bound the amount of work the discovery may do.  */
      if (--max_visits == 0)
	return false;

      if (!DF_REF_REG_MEM_P (chain->ref))
	{
	  /* Already part of this chain.  */
	  if (bitmap_bit_p (insns, uid))
	    continue;

	  /* A convertible candidate: queue it for inclusion.  */
	  if (bitmap_bit_p (candidates, uid))
	    {
	      add_to_queue (uid);
	      continue;
	    }

	  /* If we run into parts of an aborted chain discovery abort.  */
	  if (bitmap_bit_p (disallowed, uid))
	    return false;
	}

      /* The referenced insn is not convertible: the register will be
	 needed in both scalar and vector mode.  */
      if (DF_REF_REG_DEF_P (chain->ref))
	{
	  if (dump_file)
	    fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  mark_dual_mode_def (chain->ref);
	}
      else
	{
	  if (dump_file)
	    fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  /* Defer marking REF itself until the whole chain was walked.  */
	  mark_def = true;
	}
    }

  if (mark_def)
    mark_dual_mode_def (ref);

  return true;
}
451 :
452 : /* Check whether X is a convertible *concatditi_? variant. X is known
453 : to be any_or_plus:TI, i.e. PLUS:TI, IOR:TI or XOR:TI. */
454 :
455 : static bool
456 27276 : timode_concatdi_p (rtx x)
457 : {
458 27276 : rtx op0 = XEXP (x, 0);
459 27276 : rtx op1 = XEXP (x, 1);
460 :
461 27276 : if (GET_CODE (op1) == ASHIFT)
462 958 : std::swap (op0, op1);
463 :
464 27276 : return GET_CODE (op0) == ASHIFT
465 18423 : && GET_CODE (XEXP (op0, 0)) == ZERO_EXTEND
466 18423 : && GET_MODE (XEXP (XEXP (op0, 0), 0)) == DImode
467 18423 : && REG_P (XEXP (XEXP (op0, 0), 0))
468 18288 : && CONST_INT_P (XEXP (op0, 1))
469 18288 : && INTVAL (XEXP (op0, 1)) == 64
470 18288 : && GET_CODE (op1) == ZERO_EXTEND
471 17330 : && GET_MODE (XEXP (op1, 0)) == DImode
472 44606 : && REG_P (XEXP (op1, 0));
473 : }
474 :
475 :
476 : /* Add instruction into a chain. Return true if OK, false if the search
477 : was aborted. */
478 :
479 : bool
480 12582842 : scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid,
481 : bitmap disallowed)
482 : {
483 12582842 : if (!bitmap_set_bit (insns, insn_uid))
484 : return true;
485 :
486 12582842 : if (dump_file)
487 277 : fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
488 :
489 12582842 : rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
490 12582842 : rtx def_set = single_set (insn);
491 12582842 : if (def_set && REG_P (SET_DEST (def_set))
492 22292027 : && !HARD_REGISTER_P (SET_DEST (def_set)))
493 9709173 : bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
494 :
495 : /* ??? The following is quadratic since analyze_register_chain
496 : iterates over all refs to look for dual-mode regs. Instead this
497 : should be done separately for all regs mentioned in the chain once. */
498 12582842 : df_ref ref;
499 25706771 : for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
500 13126385 : if (!HARD_REGISTER_P (DF_REF_REG (ref)))
501 9709173 : if (!analyze_register_chain (candidates, ref, disallowed))
502 : return false;
503 :
504 : /* The operand(s) of VEC_SELECT, ZERO_EXTEND and similar ops don't need
505 : to be converted/convertible. */
506 12580386 : if (def_set)
507 12580386 : switch (GET_CODE (SET_SRC (def_set)))
508 : {
509 : case VEC_SELECT:
510 : return true;
511 122 : case ZERO_EXTEND:
512 122 : if (GET_MODE (XEXP (SET_SRC (def_set), 0)) == DImode)
513 : return true;
514 : break;
515 2371758 : case PLUS:
516 2371758 : case IOR:
517 2371758 : case XOR:
518 2371758 : if (smode == TImode && timode_concatdi_p (SET_SRC (def_set)))
519 : return true;
520 : break;
521 : default:
522 : break;
523 : }
524 :
525 27524962 : for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
526 14989281 : if (!DF_REF_REG_MEM_P (ref))
527 8057550 : if (!analyze_register_chain (candidates, ref, disallowed))
528 : return false;
529 :
530 : return true;
531 : }
532 :
/* Build new chain starting from insn INSN_UID recursively
   adding all dependent uses and definitions.  Return true if OK, false
   if the chain discovery was aborted.  Uses a worklist (QUEUE) that
   add_insn extends via add_to_queue.  */

bool
scalar_chain::build (bitmap candidates, unsigned insn_uid, bitmap disallowed)
{
  queue = BITMAP_ALLOC (NULL);
  bitmap_set_bit (queue, insn_uid);

  if (dump_file)
    fprintf (dump_file, "Building chain #%d...\n", chain_id);

  while (!bitmap_empty_p (queue))
    {
      insn_uid = bitmap_first_set_bit (queue);
      bitmap_clear_bit (queue, insn_uid);
      /* An insn can only be part of one chain.  */
      bitmap_clear_bit (candidates, insn_uid);
      if (!add_insn (candidates, insn_uid, disallowed))
	{
	  /* If we aborted the search put so far found insns on the set of
	     disallowed insns so that further searches reaching them also
	     abort and thus we abort the whole but yet undiscovered chain.  */
	  bitmap_ior_into (disallowed, insns);
	  if (dump_file)
	    fprintf (dump_file, "Aborted chain #%d discovery\n", chain_id);
	  BITMAP_FREE (queue);
	  return false;
	}
    }

  if (dump_file)
    {
      fprintf (dump_file, "Collected chain #%d...\n", chain_id);
      fprintf (dump_file, " insns: ");
      dump_bitmap (dump_file, insns);
      if (!bitmap_empty_p (defs_conv))
	{
	  bitmap_iterator bi;
	  unsigned id;
	  const char *comma = "";
	  fprintf (dump_file, " defs to convert: ");
	  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
	    {
	      fprintf (dump_file, "%sr%d", comma, id);
	      comma = ", ";
	    }
	  fprintf (dump_file, "\n");
	}
    }

  BITMAP_FREE (queue);

  return true;
}
588 :
589 : /* Return a cost of building a vector constant
590 : instead of using a scalar one. */
591 :
592 : int
593 2680965 : general_scalar_chain::vector_const_cost (rtx exp, basic_block bb)
594 : {
595 2680965 : gcc_assert (CONST_INT_P (exp));
596 :
597 2680965 : if (standard_sse_constant_p (exp, vmode))
598 619793 : return ix86_cost->sse_op;
599 2061172 : if (optimize_bb_for_size_p (bb))
600 : return COSTS_N_BYTES (8);
601 : /* We have separate costs for SImode and DImode, use SImode costs
602 : for smaller modes. */
603 2455091 : return COSTS_N_INSNS (ix86_cost->sse_load[smode == DImode ? 1 : 0]) / 2;
604 : }
605 :
/* Return true if it's cost profitable for chain conversion.  Sums a
   per-insn gain estimate (positive = conversion cheaper) and compares
   it, preferring the profile-weighted totals, against the register
   conversion cost accumulated by mark_dual_mode_def.  */

bool
general_scalar_chain::compute_convert_gain ()
{
  bitmap_iterator bi;
  unsigned insn_uid;
  int gain = 0;
  sreal weighted_gain = 0;

  if (dump_file)
    fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);

  /* SSE costs distinguish between SImode and DImode loads/stores, for
     int costs factor in the number of GPRs involved.  When supporting
     smaller modes than SImode the int load/store costs need to be
     adjusted as well.  */
  unsigned sse_cost_idx = smode == DImode ? 1 : 0;
  /* M is the number of GPRs one scalar value occupies.  */
  int m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;

  EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
    {
      rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
      rtx def_set = single_set (insn);
      rtx src = SET_SRC (def_set);
      rtx dst = SET_DEST (def_set);
      basic_block bb = BLOCK_FOR_INSN (insn);
      int igain = 0;
      profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
      bool speed_p = optimize_bb_for_speed_p (bb);
      sreal bb_freq = bb->count.to_sreal_scale (entry_count);

      if (REG_P (src) && REG_P (dst))
	{
	  if (!speed_p)
	    /* reg-reg move is 2 bytes, while SSE 3.  */
	    igain += COSTS_N_BYTES (2 * m - 3);
	  else
	    /* Move costs are normalized to reg-reg move having cost 2.  */
	    igain += COSTS_N_INSNS (2 * m - ix86_cost->xmm_move) / 2;
	}
      else if (REG_P (src) && MEM_P (dst))
	{
	  if (!speed_p)
	    /* Integer load/store is 3+ bytes and SSE 4+.  */
	    igain += COSTS_N_BYTES (3 * m - 4);
	  else
	    igain
	      += COSTS_N_INSNS (m * ix86_cost->int_store[2]
				- ix86_cost->sse_store[sse_cost_idx]) / 2;
	}
      else if (MEM_P (src) && REG_P (dst))
	{
	  if (!speed_p)
	    igain += COSTS_N_BYTES (3 * m - 4);
	  else
	    igain += COSTS_N_INSNS (m * ix86_cost->int_load[2]
				    - ix86_cost->sse_load[sse_cost_idx]) / 2;
	}
      else
	{
	  /* For operations on memory operands, include the overhead
	     of explicit load and store instructions.  */
	  if (MEM_P (dst))
	    {
	      if (!speed_p)
		/* ??? This probably should account size difference
		   of SSE and integer load rather than full SSE load.  */
		igain -= COSTS_N_BYTES (8);
	      else
		{
		  int cost = (m * (ix86_cost->int_load[2]
				   + ix86_cost->int_store[2])
			      - (ix86_cost->sse_load[sse_cost_idx] +
				 ix86_cost->sse_store[sse_cost_idx]));
		  igain += COSTS_N_INSNS (cost) / 2;
		}
	    }

	  /* Per-operation gain estimate, keyed on the RTL code.  */
	  switch (GET_CODE (src))
	    {
	    case ASHIFT:
	    case ASHIFTRT:
	    case LSHIFTRT:
	      /* For double-word shifts an extra add may be saved.  */
	      if (m == 2)
		{
		  if (INTVAL (XEXP (src, 1)) >= 32)
		    igain += ix86_cost->add;
		  /* Gain for extend highpart case.  */
		  else if (GET_CODE (XEXP (src, 0)) == ASHIFT)
		    igain += ix86_cost->shift_const - ix86_cost->sse_op;
		  else
		    igain += ix86_cost->shift_const;
		}

	      igain += ix86_cost->shift_const - ix86_cost->sse_op;

	      if (CONST_INT_P (XEXP (src, 0)))
		igain -= vector_const_cost (XEXP (src, 0), bb);
	      break;

	    case ROTATE:
	    case ROTATERT:
	      igain += m * ix86_cost->shift_const;
	      if (TARGET_AVX512VL)
		igain -= ix86_cost->sse_op;
	      else if (smode == DImode)
		{
		  /* Byte/word-aligned rotates need fewer SSE ops.  */
		  int bits = INTVAL (XEXP (src, 1));
		  if ((bits & 0x0f) == 0)
		    igain -= ix86_cost->sse_op;
		  else if ((bits & 0x07) == 0)
		    igain -= 2 * ix86_cost->sse_op;
		  else
		    igain -= 3 * ix86_cost->sse_op;
		}
	      else if (INTVAL (XEXP (src, 1)) == 16)
		igain -= ix86_cost->sse_op;
	      else
		igain -= 2 * ix86_cost->sse_op;
	      break;

	    case AND:
	    case IOR:
	    case XOR:
	    case PLUS:
	    case MINUS:
	      igain += m * ix86_cost->add - ix86_cost->sse_op;
	      /* Additional gain for andnot for targets without BMI.  */
	      if (GET_CODE (XEXP (src, 0)) == NOT
		  && !TARGET_BMI)
		igain += m * ix86_cost->add;

	      if (CONST_INT_P (XEXP (src, 0)))
		igain -= vector_const_cost (XEXP (src, 0), bb);
	      if (CONST_INT_P (XEXP (src, 1)))
		igain -= vector_const_cost (XEXP (src, 1), bb);
	      if (MEM_P (XEXP (src, 1)))
		{
		  if (!speed_p)
		    igain -= COSTS_N_BYTES (m == 2 ? 3 : 5);
		  else
		    igain += COSTS_N_INSNS
			       (m * ix86_cost->int_load[2]
				- ix86_cost->sse_load[sse_cost_idx]) / 2;
		}
	      break;

	    case NEG:
	    case NOT:
	      igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);

	      if (GET_CODE (XEXP (src, 0)) != ABS)
		{
		  igain += m * ix86_cost->add;
		  break;
		}
	      /* FALLTHRU */

	    case ABS:
	    case SMAX:
	    case SMIN:
	    case UMAX:
	    case UMIN:
	      /* We do not have any conditional move cost, estimate it as a
		 reg-reg move.  Comparisons are costed as adds.  */
	      igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
	      /* Integer SSE ops are all costed the same.  */
	      igain -= ix86_cost->sse_op;
	      break;

	    case COMPARE:
	      if (XEXP (src, 1) != const0_rtx)
		{
		  /* cmp vs. pxor;pshufd;ptest.  */
		  igain += COSTS_N_INSNS (m - 3);
		}
	      else if (GET_CODE (XEXP (src, 0)) != AND)
		{
		  /* test vs. pshufd;ptest.  */
		  igain += COSTS_N_INSNS (m - 2);
		}
	      else if (GET_CODE (XEXP (XEXP (src, 0), 0)) != NOT)
		{
		  /* and;test vs. pshufd;ptest.  */
		  igain += COSTS_N_INSNS (2 * m - 2);
		}
	      else if (TARGET_BMI)
		{
		  /* andn;test vs. pandn;pshufd;ptest.  */
		  igain += COSTS_N_INSNS (2 * m - 3);
		}
	      else
		{
		  /* not;and;test vs. pandn;pshufd;ptest.  */
		  igain += COSTS_N_INSNS (3 * m - 3);
		}
	      break;

	    case CONST_INT:
	      if (REG_P (dst))
		{
		  if (!speed_p)
		    {
		      /* xor (2 bytes) vs. xorps (3 bytes).  */
		      if (src == const0_rtx)
			igain -= COSTS_N_BYTES (1);
		      /* movdi_internal vs. movv2di_internal.  */
		      /* => mov (5 bytes) vs. movaps (7 bytes).  */
		      else if (x86_64_immediate_operand (src, SImode))
			igain -= COSTS_N_BYTES (2);
		      else
			/* ??? Larger immediate constants are placed in the
			   constant pool, where the size benefit/impact of
			   STV conversion is affected by whether and how
			   often each constant pool entry is shared/reused.
			   The value below is empirically derived from the
			   CSiBE benchmark (and the optimal value may drift
			   over time).  */
			igain += COSTS_N_BYTES (0);
		    }
		  else
		    {
		      /* DImode can be immediate for TARGET_64BIT
			 and SImode always.  */
		      igain += m * COSTS_N_INSNS (1);
		      igain -= vector_const_cost (src, bb);
		    }
		}
	      else if (MEM_P (dst))
		{
		  igain += (m * ix86_cost->int_store[2]
			    - ix86_cost->sse_store[sse_cost_idx]);
		  igain -= vector_const_cost (src, bb);
		}
	      break;

	    case VEC_SELECT:
	      if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
		{
		  // movd (4 bytes) replaced with movdqa (4 bytes).
		  if (!!speed_p)
		    igain += COSTS_N_INSNS (ix86_cost->sse_to_integer
					    - ix86_cost->xmm_move) / 2;
		}
	      else
		{
		  // pshufd; movd replaced with pshufd.
		  if (!speed_p)
		    igain += COSTS_N_BYTES (4);
		  else
		    igain += ix86_cost->sse_to_integer;
		}
	      break;

	    default:
	      gcc_unreachable ();
	    }
	}

      if (speed_p)
	weighted_gain += bb_freq * igain;
      gain += igain;

      if (igain != 0 && dump_file)
	{
	  fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for",
		   igain, bb_freq.to_double ());
	  dump_insn_slim (dump_file, insn);
	}
    }

  if (dump_file)
    {
      fprintf (dump_file, " Instruction conversion gain: %d, \n",
	       gain);
      fprintf (dump_file, " Registers conversion cost: %d\n",
	       cost_sse_integer);
      fprintf (dump_file, " Weighted instruction conversion gain: %.2f, \n",
	       weighted_gain.to_double ());
      fprintf (dump_file, " Weighted registers conversion cost: %.2f\n",
	       weighted_cost_sse_integer.to_double ());
    }

  /* Decide on the weighted totals when they differ; fall back to the
     unweighted totals to break ties.  */
  if (weighted_gain != weighted_cost_sse_integer)
    return weighted_gain > weighted_cost_sse_integer;
  else
    return gain > cost_sse_integer;;
}
895 :
896 : /* Insert generated conversion instruction sequence INSNS
897 : after instruction AFTER. New BB may be required in case
898 : instruction has EH region attached. */
899 :
900 : void
901 30742 : scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
902 : {
903 30742 : if (!control_flow_insn_p (after))
904 : {
905 30529 : emit_insn_after (insns, after);
906 30529 : return;
907 : }
908 :
909 213 : basic_block bb = BLOCK_FOR_INSN (after);
910 213 : edge e = find_fallthru_edge (bb->succs);
911 213 : gcc_assert (e);
912 :
913 213 : basic_block new_bb = split_edge (e);
914 213 : emit_insn_after (insns, BB_HEAD (new_bb));
915 : }
916 :
917 : } // anon namespace
918 :
919 : /* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
920 : zeroing the upper parts. */
921 :
922 : static rtx
923 173086 : gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
924 : {
925 346172 : switch (GET_MODE_NUNITS (vmode))
926 : {
927 25 : case 1:
928 25 : return gen_rtx_SUBREG (vmode, gpr, 0);
929 172504 : case 2:
930 345008 : return gen_rtx_VEC_CONCAT (vmode, gpr,
931 : CONST0_RTX (GET_MODE_INNER (vmode)));
932 557 : default:
933 557 : return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
934 : CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U));
935 : }
936 : }
937 :
/* Make vector copies for all register REGNO definitions
   and replace its uses in a chain.  Emits, before INSN, a sequence that
   copies scalar REG into its vector counterpart from defs_map.  */

void
scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
{
  rtx vreg = *defs_map.get (reg);

  start_sequence ();
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    {
      /* No direct GPR->XMM moves: bounce the value through a stack slot.  */
      rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
      if (smode == DImode && !TARGET_64BIT)
	{
	  /* Store the two 32-bit halves separately.  */
	  emit_move_insn (adjust_address (tmp, SImode, 0),
			  gen_rtx_SUBREG (SImode, reg, 0));
	  emit_move_insn (adjust_address (tmp, SImode, 4),
			  gen_rtx_SUBREG (SImode, reg, 4));
	}
      else
	emit_move_insn (copy_rtx (tmp), reg);
      emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
			      gen_gpr_to_xmm_move_src (vmode, tmp)));
    }
  else if (!TARGET_64BIT && smode == DImode)
    {
      /* 32-bit DImode: build the vector value from the two halves.  */
      if (TARGET_SSE4_1)
	{
	  /* Low half via movd, high half inserted with pinsrd.  */
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 0)));
	  emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
					gen_rtx_SUBREG (V4SImode, vreg, 0),
					gen_rtx_SUBREG (SImode, reg, 4),
					GEN_INT (2)));
	}
      else
	{
	  /* Load both halves with movd and combine them with unpckldq.  */
	  rtx tmp = gen_reg_rtx (DImode);
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 0)));
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 4)));
	  emit_insn (gen_vec_interleave_lowv4si
		     (gen_rtx_SUBREG (V4SImode, vreg, 0),
		      gen_rtx_SUBREG (V4SImode, vreg, 0),
		      gen_rtx_SUBREG (V4SImode, tmp, 0)));
	}
    }
  else
    /* Direct inter-unit move.  */
    emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
			    gen_gpr_to_xmm_move_src (vmode, reg)));
  rtx_insn *seq = end_sequence ();
  emit_conversion_insns (seq, insn);

  if (dump_file)
    fprintf (dump_file,
	     " Copied r%d to a vector register r%d for insn %d\n",
	     REGNO (reg), REGNO (vreg), INSN_UID (insn));
}
1000 :
1001 : /* Copy the definition SRC of INSN inside the chain to DST for
1002 : scalar uses outside of the chain. */
1003 :
void
scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src)
{
  start_sequence ();
  if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
    {
      /* Direct vector-to-GPR moves are disabled on this target, so
	 bounce the value through a dedicated stack slot.  */
      rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
      emit_move_insn (tmp, src);
      if (!TARGET_64BIT && smode == DImode)
	{
	  /* On 32-bit targets reload the DImode value as two SImode
	     halves.  */
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
			  adjust_address (tmp, SImode, 0));
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
			  adjust_address (tmp, SImode, 4));
	}
      else
	emit_move_insn (dst, copy_rtx (tmp));
    }
  else if (!TARGET_64BIT && smode == DImode)
    {
      if (TARGET_SSE4_1)
	{
	  /* With SSE4.1 extract both SImode halves with single-element
	     VEC_SELECTs (elements 0 and 1).  */
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode,
				      gen_rtvec (1, const0_rtx));
	  emit_insn
	    (gen_rtx_SET
	       (gen_rtx_SUBREG (SImode, dst, 0),
		gen_rtx_VEC_SELECT (SImode,
				    gen_rtx_SUBREG (V4SImode, src, 0),
				    tmp)));

	  tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
	  emit_insn
	    (gen_rtx_SET
	       (gen_rtx_SUBREG (SImode, dst, 4),
		gen_rtx_VEC_SELECT (SImode,
				    gen_rtx_SUBREG (V4SImode, src, 0),
				    tmp)));
	}
      else
	{
	  /* Without SSE4.1 copy the vector, move out the low dword,
	     shift the copy right by 32 bits and move out the next
	     dword.  */
	  rtx vcopy = gen_reg_rtx (V2DImode);
	  emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0));
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
			  gen_rtx_SUBREG (SImode, vcopy, 0));
	  emit_move_insn (vcopy,
			  gen_rtx_LSHIFTRT (V2DImode,
					    vcopy, GEN_INT (32)));
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
			  gen_rtx_SUBREG (SImode, vcopy, 0));
	}
    }
  else
    /* A single direct move suffices.  */
    emit_move_insn (dst, src);

  rtx_insn *seq = end_sequence ();
  emit_conversion_insns (seq, insn);

  if (dump_file)
    fprintf (dump_file,
	     "  Copied r%d to a scalar register r%d for insn %d\n",
	     REGNO (src), REGNO (dst), INSN_UID (insn));
}
1067 :
1068 : /* Helper function to convert immediate constant X to vmode. */
1069 : static rtx
1070 41474 : smode_convert_cst (rtx x, enum machine_mode vmode)
1071 : {
1072 : /* Prefer all ones vector in case of -1. */
1073 41474 : if (constm1_operand (x, GET_MODE (x)))
1074 894 : return CONSTM1_RTX (vmode);
1075 :
1076 40580 : unsigned n = GET_MODE_NUNITS (vmode);
1077 40580 : rtx *v = XALLOCAVEC (rtx, n);
1078 40580 : v[0] = x;
1079 46366 : for (unsigned i = 1; i < n; ++i)
1080 5786 : v[i] = const0_rtx;
1081 40580 : return gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
1082 : }
1083 :
1084 : /* Convert operand OP in INSN. We should handle
1085 : memory operands and uninitialized registers.
1086 : All other register uses are converted during
1087 : registers conversion. */
1088 :
void
scalar_chain::convert_op (rtx *op, rtx_insn *insn)
{
  rtx tmp;

  /* Already in vector mode; nothing to do.  */
  if (GET_MODE (*op) == V1TImode)
    return;

  /* Unshare the rtx so the in-place modifications below are safe.  */
  *op = copy_rtx_if_shared (*op);

  if (GET_CODE (*op) == NOT
      || GET_CODE (*op) == ASHIFT)
    {
      /* Convert the inner operand and retag the NOT/ASHIFT itself.  */
      convert_op (&XEXP (*op, 0), insn);
      PUT_MODE (*op, vmode);
    }
  else if (MEM_P (*op))
    {
      rtx_insn *movabs = NULL;

      /* Emit MOVABS to load from a 64-bit absolute address to a GPR.  */
      if (!memory_operand (*op, GET_MODE (*op)))
	{
	  tmp = gen_reg_rtx (GET_MODE (*op));
	  movabs = emit_insn_before (gen_rtx_SET (tmp, *op), insn);

	  *op = tmp;
	}

      /* Preload the memory operand into a fresh vector pseudo.  */
      tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (GET_MODE (*op)), 0);

      rtx_insn *eh_insn
	= emit_insn_before (gen_rtx_SET (copy_rtx (tmp),
					 gen_gpr_to_xmm_move_src (vmode, *op)),
			    insn);

      if (cfun->can_throw_non_call_exceptions)
	{
	  /* Handle REG_EH_REGION note.  */
	  rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
	  if (note)
	    {
	      /* The insn that touches memory first (MOVABS if one was
		 emitted) inherits INSN's EH region.  */
	      if (movabs)
		eh_insn = movabs;
	      control_flow_insns.safe_push (eh_insn);
	      add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
	    }
	}

      *op = tmp;

      if (dump_file)
	fprintf (dump_file, "  Preloading operand for insn %d into r%d\n",
		 INSN_UID (insn), reg_or_subregno (tmp));
    }
  else if (REG_P (*op))
    /* Bare register uses become vector SUBREGs; the defining insns
       are rewritten separately.  */
    *op = gen_rtx_SUBREG (vmode, *op, 0);
  else if (CONST_SCALAR_INT_P (*op))
    {
      rtx vec_cst = smode_convert_cst (*op, vmode);

      /* Constants the SSE unit cannot synthesize directly must come
	 from the constant pool.  */
      if (!standard_sse_constant_p (vec_cst, vmode))
	{
	  start_sequence ();
	  vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
	  rtx_insn *seq = end_sequence ();
	  emit_insn_before (seq, insn);
	}

      tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);

      emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
      *op = tmp;
    }
  else
    {
      /* Anything else must be an already-converted SUBREG.  */
      gcc_assert (SUBREG_P (*op));
      gcc_assert (GET_MODE (*op) == vmode);
    }
}
1169 :
1170 : /* Convert CCZmode COMPARE to vector mode. */
1171 :
1172 : rtx
1173 10 : scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
1174 : {
1175 10 : rtx src, tmp;
1176 :
1177 : /* Handle any REG_EQUAL notes. */
1178 10 : tmp = find_reg_equal_equiv_note (insn);
1179 10 : if (tmp)
1180 : {
1181 1 : if (GET_CODE (XEXP (tmp, 0)) == COMPARE
1182 1 : && GET_MODE (XEXP (tmp, 0)) == CCZmode
1183 1 : && REG_P (XEXP (XEXP (tmp, 0), 0)))
1184 : {
1185 1 : rtx *op = &XEXP (XEXP (tmp, 0), 1);
1186 1 : if (CONST_SCALAR_INT_P (*op))
1187 : {
1188 1 : if (constm1_operand (*op, GET_MODE (*op)))
1189 0 : *op = CONSTM1_RTX (vmode);
1190 : else
1191 : {
1192 1 : unsigned n = GET_MODE_NUNITS (vmode);
1193 1 : rtx *v = XALLOCAVEC (rtx, n);
1194 1 : v[0] = *op;
1195 1 : for (unsigned i = 1; i < n; ++i)
1196 0 : v[i] = const0_rtx;
1197 1 : *op = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
1198 : }
1199 : tmp = NULL_RTX;
1200 : }
1201 0 : else if (REG_P (*op))
1202 : tmp = NULL_RTX;
1203 : }
1204 :
1205 : if (tmp)
1206 0 : remove_note (insn, tmp);
1207 : }
1208 :
1209 : /* Comparison against anything other than zero, requires an XOR. */
1210 10 : if (op2 != const0_rtx)
1211 : {
1212 4 : convert_op (&op1, insn);
1213 4 : convert_op (&op2, insn);
1214 : /* If both operands are MEMs, explicitly load the OP1 into TMP. */
1215 4 : if (MEM_P (op1) && MEM_P (op2))
1216 : {
1217 0 : tmp = gen_reg_rtx (vmode);
1218 0 : emit_insn_before (gen_rtx_SET (tmp, op1), insn);
1219 0 : src = tmp;
1220 : }
1221 : else
1222 : src = op1;
1223 4 : src = gen_rtx_XOR (vmode, src, op2);
1224 : }
1225 6 : else if (GET_CODE (op1) == AND
1226 0 : && GET_CODE (XEXP (op1, 0)) == NOT)
1227 : {
1228 0 : rtx op11 = XEXP (XEXP (op1, 0), 0);
1229 0 : rtx op12 = XEXP (op1, 1);
1230 0 : convert_op (&op11, insn);
1231 0 : convert_op (&op12, insn);
1232 0 : if (!REG_P (op11))
1233 : {
1234 0 : tmp = gen_reg_rtx (vmode);
1235 0 : emit_insn_before (gen_rtx_SET (tmp, op11), insn);
1236 0 : op11 = tmp;
1237 : }
1238 0 : src = gen_rtx_AND (vmode, gen_rtx_NOT (vmode, op11), op12);
1239 0 : }
1240 6 : else if (GET_CODE (op1) == AND)
1241 : {
1242 0 : rtx op11 = XEXP (op1, 0);
1243 0 : rtx op12 = XEXP (op1, 1);
1244 0 : convert_op (&op11, insn);
1245 0 : convert_op (&op12, insn);
1246 0 : if (!REG_P (op11))
1247 : {
1248 0 : tmp = gen_reg_rtx (vmode);
1249 0 : emit_insn_before (gen_rtx_SET (tmp, op11), insn);
1250 0 : op11 = tmp;
1251 : }
1252 0 : return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, op11, op12),
1253 : UNSPEC_PTEST);
1254 : }
1255 : else
1256 : {
1257 6 : convert_op (&op1, insn);
1258 6 : src = op1;
1259 : }
1260 :
1261 10 : if (!REG_P (src))
1262 : {
1263 6 : tmp = gen_reg_rtx (vmode);
1264 6 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
1265 6 : src = tmp;
1266 : }
1267 :
1268 10 : if (vmode == V2DImode)
1269 : {
1270 0 : tmp = gen_reg_rtx (vmode);
1271 0 : emit_insn_before (gen_vec_interleave_lowv2di (tmp, src, src), insn);
1272 0 : src = tmp;
1273 : }
1274 10 : else if (vmode == V4SImode)
1275 : {
1276 0 : tmp = gen_reg_rtx (vmode);
1277 0 : emit_insn_before (gen_sse2_pshufd (tmp, src, const0_rtx), insn);
1278 0 : src = tmp;
1279 : }
1280 :
1281 10 : return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, src, src), UNSPEC_PTEST);
1282 : }
1283 :
1284 : /* Helper function for converting INSN to vector mode. */
1285 :
void
scalar_chain::convert_insn_common (rtx_insn *insn)
{
  /* Generate copies for out-of-chain uses of defs and adjust debug uses.  */
  for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref))
    if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
      {
	df_link *use;
	/* Look for a non-debug use of this def that is outside the
	   chain or inside a memory address.  */
	for (use = DF_REF_CHAIN (ref); use; use = use->next)
	  if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref))
	      && (DF_REF_REG_MEM_P (use->ref)
		  || !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))))
	    break;
	if (use)
	  /* Such a use exists: emit a scalar copy of the converted def.  */
	  convert_reg (insn, DF_REF_REG (ref),
		       *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]));
	else if (MAY_HAVE_DEBUG_BIND_INSNS)
	  {
	    /* If we generated a scalar copy we can leave debug-insns
	       as-is, if not, we have to adjust them.  */
	    auto_vec<rtx_insn *, 5> to_reset_debug_insns;
	    for (use = DF_REF_CHAIN (ref); use; use = use->next)
	      if (DEBUG_INSN_P (DF_REF_INSN (use->ref)))
		{
		  rtx_insn *debug_insn = DF_REF_INSN (use->ref);
		  /* If there's a reaching definition outside of the
		     chain we have to reset.  */
		  df_link *def;
		  for (def = DF_REF_CHAIN (use->ref); def; def = def->next)
		    if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref)))
		      break;
		  if (def)
		    to_reset_debug_insns.safe_push (debug_insn);
		  else
		    {
		      /* All reaching defs are inside the chain: point
			 the debug use at the converted pseudo.  */
		      *DF_REF_REAL_LOC (use->ref)
			= *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]);
		      df_insn_rescan (debug_insn);
		    }
		}
	    /* Have to do the reset outside of the DF_CHAIN walk to not
	       disrupt it.  */
	    while (!to_reset_debug_insns.is_empty ())
	      {
		rtx_insn *debug_insn = to_reset_debug_insns.pop ();
		INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC ();
		df_insn_rescan_debug_internal (debug_insn);
	      }
	  }
      }

  /* Replace uses in this insn with the defs we use in the chain.  */
  for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!DF_REF_REG_MEM_P (ref))
      if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]))
	{
	  /* Also update a corresponding REG_DEAD note.  */
	  rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref));
	  if (note)
	    XEXP (note, 0) = *vreg;
	  *DF_REF_REAL_LOC (ref) = *vreg;
	}
}
1349 :
1350 : /* Convert INSN which is an SImode or DImode rotation by a constant
1351 : to vector mode. CODE is either ROTATE or ROTATERT with operands
1352 : OP0 and OP1. Returns the SET_SRC of the last instruction in the
1353 : resulting sequence, which is emitted before INSN. */
1354 :
rtx
general_scalar_chain::convert_rotate (enum rtx_code code, rtx op0, rtx op1,
				      rtx_insn *insn)
{
  int bits = INTVAL (op1);
  rtx pat, result;

  convert_op (&op0, insn);
  /* Rotation by zero is a no-op.  */
  if (bits == 0)
    return op0;

  if (smode == DImode)
    {
      /* Canonicalize to a right-rotate count.  */
      if (code == ROTATE)
	bits = 64 - bits;
      if (bits == 32)
	{
	  /* Rotate by half the width: swap the two SImode halves with
	     one dword shuffle.  */
	  rtx tmp1 = gen_reg_rtx (V4SImode);
	  pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
				 GEN_INT (225));
	  emit_insn_before (pat, insn);
	  result = gen_lowpart (V2DImode, tmp1);
	}
      else if (TARGET_AVX512VL)
	/* AVX512VL provides vector rotate patterns directly.  */
	result = simplify_gen_binary (code, V2DImode, op0, op1);
      else if (bits == 16 || bits == 48)
	{
	  /* Word-granular rotate: a single low-word shuffle.  */
	  rtx tmp1 = gen_reg_rtx (V8HImode);
	  pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0),
				  GEN_INT (bits == 16 ? 57 : 147));
	  emit_insn_before (pat, insn);
	  result = gen_lowpart (V2DImode, tmp1);
	}
      else if ((bits & 0x07) == 0)
	{
	  /* Byte-granular rotate: duplicate the qword, then one
	     128-bit logical right shift.  */
	  rtx tmp1 = gen_reg_rtx (V4SImode);
	  pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
				 GEN_INT (68));
	  emit_insn_before (pat, insn);
	  rtx tmp2 = gen_reg_rtx (V1TImode);
	  pat = gen_sse2_lshrv1ti3 (tmp2, gen_lowpart (V1TImode, tmp1),
				    GEN_INT (bits));
	  emit_insn_before (pat, insn);
	  result = gen_lowpart (V2DImode, tmp2);
	}
      else
	{
	  /* General case: spread the dwords, shift each qword right by
	     the sub-32 remainder, then gather the surviving dwords.  */
	  rtx tmp1 = gen_reg_rtx (V4SImode);
	  pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
				 GEN_INT (20));
	  emit_insn_before (pat, insn);
	  rtx tmp2 = gen_reg_rtx (V2DImode);
	  pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
			       GEN_INT (bits & 31));
	  emit_insn_before (pat, insn);
	  rtx tmp3 = gen_reg_rtx (V4SImode);
	  pat = gen_sse2_pshufd (tmp3, gen_lowpart (V4SImode, tmp2),
				 GEN_INT (bits > 32 ? 34 : 136));
	  emit_insn_before (pat, insn);
	  result = gen_lowpart (V2DImode, tmp3);
	}
    }
  else if (bits == 16)
    {
      /* SImode rotate by 16: swap the two HImode halves.  */
      rtx tmp1 = gen_reg_rtx (V8HImode);
      pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0), GEN_INT (225));
      emit_insn_before (pat, insn);
      result = gen_lowpart (V4SImode, tmp1);
    }
  else if (TARGET_AVX512VL)
    result = simplify_gen_binary (code, V4SImode, op0, op1);
  else
    {
      /* Canonicalize to a right-rotate count.  */
      if (code == ROTATE)
	bits = 32 - bits;

      /* Duplicate the low dword, shift the qword right, and take the
	 low dword of the result.  */
      rtx tmp1 = gen_reg_rtx (V4SImode);
      emit_insn_before (gen_sse2_pshufd (tmp1, op0, GEN_INT (224)), insn);
      rtx tmp2 = gen_reg_rtx (V2DImode);
      pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
			   GEN_INT (bits));
      emit_insn_before (pat, insn);
      result = gen_lowpart (V4SImode, tmp2);
    }

  return result;
}
1442 :
1443 : /* Convert INSN to vector mode. */
1444 :
void
general_scalar_chain::convert_insn (rtx_insn *insn)
{
  rtx def_set = single_set (insn);
  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);
  rtx subreg;

  if (MEM_P (dst) && !REG_P (src))
    {
      /* There are no scalar integer instructions and therefore
	 temporary register usage is required.  */
      rtx tmp = gen_reg_rtx (smode);
      emit_conversion_insns (gen_move_insn (dst, tmp), insn);
      dst = gen_rtx_SUBREG (vmode, tmp, 0);
    }
  else if (REG_P (dst) && GET_MODE (dst) == smode)
    {
      /* Replace the definition with a SUBREG to the definition we
	 use inside the chain.  */
      rtx *vdef = defs_map.get (dst);
      if (vdef)
	dst = *vdef;
      dst = gen_rtx_SUBREG (vmode, dst, 0);
      /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
	 is a non-REG_P.  So kill those off.  */
      rtx note = find_reg_equal_equiv_note (insn);
      if (note)
	remove_note (insn, note);
    }

  switch (GET_CODE (src))
    {
    /* Binary operations: convert the second operand, then fall through
       to convert the first and retag the operation itself.  */
    case PLUS:
    case MINUS:
    case IOR:
    case XOR:
    case AND:
    case SMAX:
    case SMIN:
    case UMAX:
    case UMIN:
      convert_op (&XEXP (src, 1), insn);
      /* FALLTHRU */

    case ABS:
    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      convert_op (&XEXP (src, 0), insn);
      PUT_MODE (src, vmode);
      break;

    case ROTATE:
    case ROTATERT:
      /* Constant rotates are expanded to shuffle/shift sequences.  */
      src = convert_rotate (GET_CODE (src), XEXP (src, 0), XEXP (src, 1),
			    insn);
      break;

    case NEG:
      src = XEXP (src, 0);

      /* (neg (abs ...)) first materializes the ABS into a pseudo.  */
      if (GET_CODE (src) == ABS)
	{
	  src = XEXP (src, 0);
	  convert_op (&src, insn);
	  subreg = gen_reg_rtx (vmode);
	  emit_insn_before (gen_rtx_SET (subreg,
					 gen_rtx_ABS (vmode, src)), insn);
	  src = subreg;
	}
      else
	convert_op (&src, insn);

      /* Negation becomes subtraction from a zero vector.  */
      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
      src = gen_rtx_MINUS (vmode, subreg, src);
      break;

    case NOT:
      /* One's complement becomes XOR with an all-ones vector.  */
      src = XEXP (src, 0);
      convert_op (&src, insn);
      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
      src = gen_rtx_XOR (vmode, src, subreg);
      break;

    case MEM:
      if (!REG_P (dst))
	convert_op (&src, insn);
      break;

    case REG:
      if (!MEM_P (dst))
	convert_op (&src, insn);
      break;

    case SUBREG:
      gcc_assert (GET_MODE (src) == vmode);
      break;

    case COMPARE:
      dst = gen_rtx_REG (CCZmode, FLAGS_REG);
      src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
      break;

    case CONST_INT:
      convert_op (&src, insn);
      break;

    case VEC_SELECT:
      /* Selecting element 0 is just the source vector itself.  */
      if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
	src = XEXP (src, 0);
      else if (smode == DImode)
	{
	  /* Extract the high DImode element via a 64-bit V1TImode
	     logical right shift.  */
	  rtx tmp = gen_lowpart (V1TImode, XEXP (src, 0));
	  dst = gen_lowpart (V1TImode, dst);
	  src = gen_rtx_LSHIFTRT (V1TImode, tmp, GEN_INT (64));
	}
      else
	{
	  /* Broadcast the selected element to all four lanes.  */
	  rtx tmp = XVECEXP (XEXP (src, 1), 0, 0);
	  rtvec vec = gen_rtvec (4, tmp, tmp, tmp, tmp);
	  rtx par = gen_rtx_PARALLEL (VOIDmode, vec);
	  src = gen_rtx_VEC_SELECT (vmode, XEXP (src, 0), par);
	}
      break;

    default:
      gcc_unreachable ();
    }

  SET_SRC (def_set) = src;
  SET_DEST (def_set) = dst;

  /* Drop possible dead definitions.  */
  PATTERN (insn) = def_set;

  INSN_CODE (insn) = -1;
  int patt = recog_memoized (insn);
  if (patt == -1)
    fatal_insn_not_found (insn);
  df_insn_rescan (insn);
}
1589 :
1590 : /* Helper function to compute gain for loading an immediate constant.
1591 : Typically, two movabsq for TImode vs. vmovdqa for V1TImode, but
1592 : with numerous special cases. */
1593 :
1594 : static int
1595 8 : timode_immed_const_gain (rtx cst, basic_block bb)
1596 : {
1597 : /* movabsq vs. movabsq+vmovq+vunpacklqdq. */
1598 8 : if (CONST_WIDE_INT_P (cst)
1599 5 : && CONST_WIDE_INT_NUNITS (cst) == 2
1600 13 : && CONST_WIDE_INT_ELT (cst, 0) == CONST_WIDE_INT_ELT (cst, 1))
1601 0 : return optimize_bb_for_size_p (bb) ? -COSTS_N_BYTES (9)
1602 : : -COSTS_N_INSNS (2);
1603 : /* 2x movabsq ~ vmovdqa. */
1604 : return 0;
1605 : }
1606 :
/* Return true if it's cost profitable for chain conversion.  */
1608 :
bool
timode_scalar_chain::compute_convert_gain ()
{
  /* Assume that if we have to move TImode values between units,
     then transforming this chain isn't worth it.  */
  if (cost_sse_integer)
    return false;

  bitmap_iterator bi;
  unsigned insn_uid;

  /* Split ties to prefer V1TImode when not optimizing for size.  */
  int gain = optimize_size ? 0 : 1;
  sreal weighted_gain = 0;

  if (dump_file)
    fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);

  /* For every insn in the chain compute IGAIN, the benefit (scalar cost
     minus vector cost) of converting it; accumulate both a raw sum and
     a BB-frequency-weighted sum.  */
  EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
    {
      rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
      rtx def_set = single_set (insn);
      rtx src = SET_SRC (def_set);
      rtx dst = SET_DEST (def_set);
      HOST_WIDE_INT op1val;
      basic_block bb = BLOCK_FOR_INSN (insn);
      int scost, vcost;
      int igain = 0;
      profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
      bool speed_p = optimize_bb_for_speed_p (bb);
      sreal bb_freq = bb->count.to_sreal_scale (entry_count);

      switch (GET_CODE (src))
	{
	case REG:
	  if (!speed_p)
	    igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3);
	  else
	    igain = COSTS_N_INSNS (1);
	  break;

	case MEM:
	  igain = !speed_p ? COSTS_N_BYTES (7) : COSTS_N_INSNS (1);
	  break;

	case CONST_INT:
	  if (MEM_P (dst)
	      && standard_sse_constant_p (src, V1TImode))
	    igain = !speed_p ? COSTS_N_BYTES (11) : 1;
	  break;

	case CONST_WIDE_INT:
	  /* 2 x mov vs. vmovdqa.  */
	  if (MEM_P (dst))
	    igain = !speed_p ? COSTS_N_BYTES (3) : COSTS_N_INSNS (1);
	  break;

	case NOT:
	  if (MEM_P (dst))
	    igain = -COSTS_N_INSNS (1);
	  break;

	case AND:
	  if (!MEM_P (dst))
	    igain = COSTS_N_INSNS (1);
	  if (CONST_SCALAR_INT_P (XEXP (src, 1)))
	    igain += timode_immed_const_gain (XEXP (src, 1), bb);
	  break;

	case XOR:
	case IOR:
	  if (timode_concatdi_p (src))
	    {
	      /* vmovq;vpinsrq (11 bytes).  */
	      igain = speed_p ? -2 * ix86_cost->sse_to_integer
			      : -COSTS_N_BYTES (11);
	      break;
	    }
	  if (!MEM_P (dst))
	    igain = COSTS_N_INSNS (1);
	  if (CONST_SCALAR_INT_P (XEXP (src, 1)))
	    igain += timode_immed_const_gain (XEXP (src, 1), bb);
	  break;

	case PLUS:
	  if (timode_concatdi_p (src))
	    /* vmovq;vpinsrq (11 bytes).  */
	    igain = speed_p ? -2 * ix86_cost->sse_to_integer
			    : -COSTS_N_BYTES (11);
	  break;

	case ASHIFT:
	case LSHIFTRT:
	  /* See ix86_expand_v1ti_shift.  */
	  op1val = INTVAL (XEXP (src, 1));
	  if (!speed_p)
	    {
	      if (op1val == 64 || op1val == 65)
		scost = COSTS_N_BYTES (5);
	      else if (op1val >= 66)
		scost = COSTS_N_BYTES (6);
	      else if (op1val == 1)
		scost = COSTS_N_BYTES (8);
	      else
		scost = COSTS_N_BYTES (9);

	      if ((op1val & 7) == 0)
		vcost = COSTS_N_BYTES (5);
	      else if (op1val > 64)
		vcost = COSTS_N_BYTES (10);
	      else
		vcost = TARGET_AVX ? COSTS_N_BYTES (19) : COSTS_N_BYTES (23);
	    }
	  else
	    {
	      scost = COSTS_N_INSNS (2);
	      if ((op1val & 7) == 0)
		vcost = COSTS_N_INSNS (1);
	      else if (op1val > 64)
		vcost = COSTS_N_INSNS (2);
	      else
		vcost = TARGET_AVX ? COSTS_N_INSNS (4) : COSTS_N_INSNS (5);
	    }
	  igain = scost - vcost;
	  break;

	case ASHIFTRT:
	  /* See ix86_expand_v1ti_ashiftrt.  */
	  op1val = INTVAL (XEXP (src, 1));
	  if (!speed_p)
	    {
	      if (op1val == 64 || op1val == 127)
		scost = COSTS_N_BYTES (7);
	      else if (op1val == 1)
		scost = COSTS_N_BYTES (8);
	      else if (op1val == 65)
		scost = COSTS_N_BYTES (10);
	      else if (op1val >= 66)
		scost = COSTS_N_BYTES (11);
	      else
		scost = COSTS_N_BYTES (9);

	      if (op1val == 127)
		vcost = COSTS_N_BYTES (10);
	      else if (op1val == 64)
		vcost = COSTS_N_BYTES (14);
	      else if (op1val == 96)
		vcost = COSTS_N_BYTES (18);
	      else if (op1val >= 111)
		vcost = COSTS_N_BYTES (15);
	      else if (TARGET_AVX2 && op1val == 32)
		vcost = COSTS_N_BYTES (16);
	      else if (TARGET_SSE4_1 && op1val == 32)
		vcost = COSTS_N_BYTES (20);
	      else if (op1val >= 96)
		vcost = COSTS_N_BYTES (23);
	      else if ((op1val & 7) == 0)
		vcost = COSTS_N_BYTES (28);
	      else if (TARGET_AVX2 && op1val < 32)
		vcost = COSTS_N_BYTES (30);
	      else if (op1val == 1 || op1val >= 64)
		vcost = COSTS_N_BYTES (42);
	      else
		vcost = COSTS_N_BYTES (47);
	    }
	  else
	    {
	      if (op1val >= 65 && op1val <= 126)
		scost = COSTS_N_INSNS (3);
	      else
		scost = COSTS_N_INSNS (2);

	      if (op1val == 127)
		vcost = COSTS_N_INSNS (2);
	      else if (op1val == 64)
		vcost = COSTS_N_INSNS (3);
	      else if (op1val == 96)
		vcost = COSTS_N_INSNS (3);
	      else if (op1val >= 111)
		vcost = COSTS_N_INSNS (3);
	      else if (TARGET_SSE4_1 && op1val == 32)
		vcost = COSTS_N_INSNS (3);
	      else if (TARGET_SSE4_1
		       && (op1val == 8 || op1val == 16 || op1val == 24))
		vcost = COSTS_N_INSNS (3);
	      else if (op1val >= 96)
		vcost = COSTS_N_INSNS (4);
	      else if (TARGET_SSE4_1 && (op1val == 28 || op1val == 80))
		vcost = COSTS_N_INSNS (4);
	      else if ((op1val & 7) == 0)
		vcost = COSTS_N_INSNS (5);
	      else if (TARGET_AVX2 && op1val < 32)
		vcost = COSTS_N_INSNS (6);
	      else if (TARGET_SSE4_1 && op1val < 15)
		vcost = COSTS_N_INSNS (6);
	      else if (op1val == 1 || op1val >= 64)
		vcost = COSTS_N_INSNS (8);
	      else
		vcost = COSTS_N_INSNS (9);
	    }
	  igain = scost - vcost;
	  break;

	case ROTATE:
	case ROTATERT:
	  /* See ix86_expand_v1ti_rotate.  */
	  op1val = INTVAL (XEXP (src, 1));
	  if (!speed_p)
	    {
	      scost = COSTS_N_BYTES (13);
	      if ((op1val & 31) == 0)
		vcost = COSTS_N_BYTES (5);
	      else if ((op1val & 7) == 0)
		vcost = TARGET_AVX ? COSTS_N_BYTES (13) : COSTS_N_BYTES (18);
	      else if (op1val > 32 && op1val < 96)
		vcost = COSTS_N_BYTES (24);
	      else
		vcost = COSTS_N_BYTES (19);
	    }
	  else
	    {
	      scost = COSTS_N_INSNS (3);
	      if ((op1val & 31) == 0)
		vcost = COSTS_N_INSNS (1);
	      else if ((op1val & 7) == 0)
		vcost = TARGET_AVX ? COSTS_N_INSNS (3) : COSTS_N_INSNS (4);
	      else if (op1val > 32 && op1val < 96)
		vcost = COSTS_N_INSNS (5);
	      else
		vcost = COSTS_N_INSNS (1);
	    }
	  igain = scost - vcost;
	  break;

	case COMPARE:
	  if (XEXP (src, 1) == const0_rtx)
	    {
	      if (GET_CODE (XEXP (src, 0)) == AND)
		/* and;and;or (9 bytes) vs. ptest (5 bytes).  */
		igain = !speed_p ? COSTS_N_BYTES (4) : COSTS_N_INSNS (2);
	      /* or (3 bytes) vs. ptest (5 bytes).  */
	      else if (!speed_p)
		igain = -COSTS_N_BYTES (2);
	    }
	  else if (XEXP (src, 1) == const1_rtx)
	    /* and;cmp -1 (7 bytes) vs. pcmpeqd;pxor;ptest (13 bytes).  */
	    igain = !speed_p ? -COSTS_N_BYTES (6) : -COSTS_N_INSNS (1);
	  break;

	case ZERO_EXTEND:
	  if (GET_MODE (XEXP (src, 0)) == DImode)
	    /* xor (2 bytes) vs. vmovq (5 bytes).  */
	    igain = speed_p ? COSTS_N_INSNS (1) - ix86_cost->sse_to_integer
			    : -COSTS_N_BYTES (3);
	  break;

	default:
	  break;
	}

      gain += igain;
      /* Only speed-optimized blocks contribute to the weighted gain.  */
      if (speed_p)
	weighted_gain += bb_freq * igain;

      if (igain != 0 && dump_file)
	{
	  fprintf (dump_file, "  Instruction gain %d with bb_freq %.2f for ",
		   igain, bb_freq.to_double ());
	  dump_insn_slim (dump_file, insn);
	}
    }

  if (dump_file)
    fprintf (dump_file, "  Total gain: %d, weighted gain %.2f\n",
	     gain, weighted_gain.to_double ());

  /* Prefer the profile-weighted verdict; fall back to the raw sum when
     the weighted gain is not positive.  */
  if (weighted_gain > (sreal) 0)
    return true;
  else
    return gain > 0;
}
1890 :
1891 : /* Fix uses of converted REG in debug insns. */
1892 :
void
timode_scalar_chain::fix_debug_reg_uses (rtx reg)
{
  /* Without var-tracking there are no debug insns to fix up.  */
  if (!flag_var_tracking)
    return;

  df_ref ref, next;
  for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
    {
      rtx_insn *insn = DF_REF_INSN (ref);
      /* Make sure the next ref is for a different instruction,
	 so that we're not affected by the rescan.  */
      next = DF_REF_NEXT_REG (ref);
      while (next && DF_REF_INSN (next) == insn)
	next = DF_REF_NEXT_REG (next);

      if (DEBUG_INSN_P (insn))
	{
	  /* It may be a debug insn with a TImode variable in
	     register.  */
	  bool changed = false;
	  /* Walk all refs of this insn (they were skipped above).  */
	  for (; ref != next; ref = DF_REF_NEXT_REG (ref))
	    {
	      rtx *loc = DF_REF_LOC (ref);
	      if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
		{
		  /* Wrap the now-V1TImode register in a TImode SUBREG
		     so the debug expression keeps its scalar mode.  */
		  *loc = gen_rtx_SUBREG (TImode, *loc, 0);
		  changed = true;
		}
	    }
	  if (changed)
	    df_insn_rescan (insn);
	}
    }
}
1928 :
1929 : /* Convert SRC, a *concatditi3 pattern, into a vec_concatv2di instruction.
1930 : Insert this before INSN, and return the result as a V1TImode subreg. */
1931 :
1932 : static rtx
1933 253 : timode_convert_concatdi (rtx src, rtx_insn *insn)
1934 : {
1935 253 : rtx hi, lo;
1936 253 : rtx tmp = gen_reg_rtx (V2DImode);
1937 253 : if (GET_CODE (XEXP (src, 0)) == ASHIFT)
1938 : {
1939 253 : hi = XEXP (XEXP (XEXP (src, 0), 0), 0);
1940 253 : lo = XEXP (XEXP (src, 1), 0);
1941 : }
1942 : else
1943 : {
1944 0 : hi = XEXP (XEXP (XEXP (src, 1), 0), 0);
1945 0 : lo = XEXP (XEXP (src, 0), 0);
1946 : }
1947 253 : emit_insn_before (gen_vec_concatv2di (tmp, lo, hi), insn);
1948 253 : return gen_rtx_SUBREG (V1TImode, tmp, 0);
1949 : }
1950 :
/* Convert INSN from TImode to V1TImode.  */
1952 :
1953 : void
1954 921354 : timode_scalar_chain::convert_insn (rtx_insn *insn)
1955 : {
1956 921354 : rtx def_set = single_set (insn);
1957 921354 : rtx src = SET_SRC (def_set);
1958 921354 : rtx dst = SET_DEST (def_set);
1959 921354 : rtx tmp;
1960 :
1961 921354 : switch (GET_CODE (dst))
1962 : {
1963 423872 : case REG:
1964 423872 : if (GET_MODE (dst) == TImode)
1965 : {
1966 422111 : PUT_MODE (dst, V1TImode);
1967 422111 : fix_debug_reg_uses (dst);
1968 : }
1969 423872 : if (GET_MODE (dst) == V1TImode)
1970 : {
1971 : /* It might potentially be helpful to convert REG_EQUAL notes,
1972 : but for now we just remove them. */
1973 423862 : rtx note = find_reg_equal_equiv_note (insn);
1974 423862 : if (note)
1975 444 : remove_note (insn, note);
1976 : }
1977 : break;
1978 497482 : case MEM:
1979 497482 : PUT_MODE (dst, V1TImode);
1980 497482 : break;
1981 :
1982 0 : default:
1983 0 : gcc_unreachable ();
1984 : }
1985 :
1986 921354 : switch (GET_CODE (src))
1987 : {
1988 451690 : case REG:
1989 451690 : if (GET_MODE (src) == TImode)
1990 : {
1991 1751 : PUT_MODE (src, V1TImode);
1992 1751 : fix_debug_reg_uses (src);
1993 : }
1994 : break;
1995 :
1996 422588 : case MEM:
1997 422588 : PUT_MODE (src, V1TImode);
1998 422588 : break;
1999 :
2000 35254 : case CONST_WIDE_INT:
2001 35254 : if (NONDEBUG_INSN_P (insn))
2002 : {
2003 : /* Since there are no instructions to store 128-bit constant,
2004 : temporary register usage is required. */
2005 35254 : bool use_move;
2006 35254 : start_sequence ();
2007 35254 : tmp = ix86_convert_const_wide_int_to_broadcast (TImode, src);
2008 35254 : if (tmp)
2009 : {
2010 194 : src = lowpart_subreg (V1TImode, tmp, TImode);
2011 194 : use_move = true;
2012 : }
2013 : else
2014 : {
2015 35060 : src = smode_convert_cst (src, V1TImode);
2016 35060 : src = validize_mem (force_const_mem (V1TImode, src));
2017 35060 : use_move = MEM_P (dst);
2018 : }
2019 35254 : rtx_insn *seq = end_sequence ();
2020 35254 : if (seq)
2021 195 : emit_insn_before (seq, insn);
2022 35254 : if (use_move)
2023 : {
2024 35054 : tmp = gen_reg_rtx (V1TImode);
2025 35054 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2026 35054 : src = tmp;
2027 : }
2028 : }
2029 : break;
2030 :
2031 11350 : case CONST_INT:
2032 11350 : switch (standard_sse_constant_p (src, TImode))
2033 : {
2034 11125 : case 1:
2035 11125 : src = CONST0_RTX (GET_MODE (dst));
2036 11125 : break;
2037 225 : case 2:
2038 225 : src = CONSTM1_RTX (GET_MODE (dst));
2039 225 : break;
2040 0 : default:
2041 0 : gcc_unreachable ();
2042 : }
2043 11350 : if (MEM_P (dst))
2044 : {
2045 10846 : tmp = gen_reg_rtx (V1TImode);
2046 10846 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2047 10846 : src = tmp;
2048 : }
2049 : break;
2050 :
2051 13 : case AND:
2052 13 : if (GET_CODE (XEXP (src, 0)) == NOT)
2053 : {
2054 0 : convert_op (&XEXP (XEXP (src, 0), 0), insn);
2055 0 : convert_op (&XEXP (src, 1), insn);
2056 0 : PUT_MODE (XEXP (src, 0), V1TImode);
2057 0 : PUT_MODE (src, V1TImode);
2058 0 : break;
2059 : }
2060 13 : convert_op (&XEXP (src, 0), insn);
2061 13 : convert_op (&XEXP (src, 1), insn);
2062 13 : PUT_MODE (src, V1TImode);
2063 13 : if (MEM_P (dst))
2064 : {
2065 10 : tmp = gen_reg_rtx (V1TImode);
2066 10 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2067 10 : src = tmp;
2068 : }
2069 : break;
2070 :
2071 304 : case XOR:
2072 304 : case IOR:
2073 304 : if (timode_concatdi_p (src))
2074 : {
2075 253 : src = timode_convert_concatdi (src, insn);
2076 253 : break;
2077 : }
2078 51 : convert_op (&XEXP (src, 0), insn);
2079 51 : convert_op (&XEXP (src, 1), insn);
2080 51 : PUT_MODE (src, V1TImode);
2081 51 : if (MEM_P (dst))
2082 : {
2083 8 : tmp = gen_reg_rtx (V1TImode);
2084 8 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2085 8 : src = tmp;
2086 : }
2087 : break;
2088 :
2089 3 : case NOT:
2090 3 : src = XEXP (src, 0);
2091 3 : convert_op (&src, insn);
2092 3 : tmp = gen_reg_rtx (V1TImode);
2093 3 : emit_insn_before (gen_move_insn (tmp, CONSTM1_RTX (V1TImode)), insn);
2094 3 : src = gen_rtx_XOR (V1TImode, src, tmp);
2095 3 : if (MEM_P (dst))
2096 : {
2097 0 : tmp = gen_reg_rtx (V1TImode);
2098 0 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2099 0 : src = tmp;
2100 : }
2101 : break;
2102 :
2103 10 : case COMPARE:
2104 10 : dst = gen_rtx_REG (CCZmode, FLAGS_REG);
2105 10 : src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
2106 10 : break;
2107 :
2108 43 : case ASHIFT:
2109 43 : case LSHIFTRT:
2110 43 : case ASHIFTRT:
2111 43 : case ROTATERT:
2112 43 : case ROTATE:
2113 43 : convert_op (&XEXP (src, 0), insn);
2114 43 : PUT_MODE (src, V1TImode);
2115 43 : break;
2116 :
2117 99 : case ZERO_EXTEND:
2118 99 : if (GET_MODE (XEXP (src, 0)) == DImode)
2119 : {
2120 : /* Convert to *vec_concatv2di_0. */
2121 99 : rtx tmp = gen_reg_rtx (V2DImode);
2122 99 : rtx pat = gen_rtx_VEC_CONCAT (V2DImode, XEXP (src, 0), const0_rtx);
2123 99 : emit_insn_before (gen_move_insn (tmp, pat), insn);
2124 99 : src = gen_rtx_SUBREG (vmode, tmp, 0);
2125 : }
2126 : else
2127 0 : gcc_unreachable ();
2128 99 : break;
2129 :
2130 0 : case PLUS:
2131 0 : if (timode_concatdi_p (src))
2132 0 : src = timode_convert_concatdi (src, insn);
2133 : else
2134 0 : gcc_unreachable ();
2135 0 : break;
2136 :
2137 0 : default:
2138 0 : gcc_unreachable ();
2139 : }
2140 :
2141 921354 : SET_SRC (def_set) = src;
2142 921354 : SET_DEST (def_set) = dst;
2143 :
2144 : /* Drop possible dead definitions. */
2145 921354 : PATTERN (insn) = def_set;
2146 :
2147 921354 : INSN_CODE (insn) = -1;
2148 921354 : recog_memoized (insn);
2149 921354 : df_insn_rescan (insn);
2150 921354 : }
2151 :
/* Generate copies from defs used by the chain but not defined therein.
   Also populates defs_map which is used later by convert_insn.  */

void
scalar_chain::convert_registers ()
{
  bitmap_iterator bi;
  unsigned id;
  /* Allocate a fresh vector-mode pseudo for every register that needs
     conversion, keyed by the original pseudo.  */
  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
    {
      rtx chain_reg = gen_reg_rtx (smode);
      defs_map.put (regno_reg_rtx[id], chain_reg);
    }
  /* For each out-of-chain insn that defines a converted register, emit
     the scalar->vector copy after its definition.  */
  EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
    for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
      if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
        make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
}
2170 :
/* Convert whole chain creating required register
   conversions and copies.  Returns the number of insns converted
   (0 when the stv_conversion debug counter suppresses the chain).  */

int
scalar_chain::convert ()
{
  bitmap_iterator bi;
  unsigned id;
  int converted_insns = 0;

  /* Honor -fdbg-cnt=stv_conversion for bisecting miscompiles.  */
  if (!dbg_cnt (stv_conversion))
    return 0;

  if (dump_file)
    fprintf (dump_file, "Converting chain #%d...\n", chain_id);

  /* Set up defs_map and boundary copies before touching the insns.  */
  convert_registers ();

  EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
    {
      rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
      convert_insn_common (insn);
      convert_insn (insn);
      converted_insns++;
    }

  return converted_insns;
}
2199 :
/* Return the SET expression if INSN doesn't reference hard register.
   Return NULL if INSN uses or defines a hard register, excluding
   pseudo register pushes, hard register uses in a memory address,
   clobbers and flags definitions.  */

static rtx
pseudo_reg_set (rtx_insn *insn)
{
  rtx set = single_set (insn);
  if (!set)
    return NULL;

  /* Check pseudo register push first.  A push of a pseudo is OK even
     though it implicitly defines the (hard) stack pointer.  */
  machine_mode mode = TARGET_64BIT ? TImode : DImode;
  if (REG_P (SET_SRC (set))
      && !HARD_REGISTER_P (SET_SRC (set))
      && push_operand (SET_DEST (set), mode))
    return set;

  df_ref ref;
  /* Reject real hard-register definitions; must-clobbers and the flags
     register are tolerated.  */
  FOR_EACH_INSN_DEF (ref, insn)
    if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
        && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
        && DF_REF_REGNO (ref) != FLAGS_REG)
      return NULL;

  /* Reject hard-register uses, except those inside a memory address.  */
  FOR_EACH_INSN_USE (ref, insn)
    if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
      return NULL;

  return set;
}
2232 :
2233 : /* Return true if the register REG is defined in a single DEF chain.
2234 : If it is defined in more than one DEF chains, we may not be able
2235 : to convert it in all chains. */
2236 :
2237 : static bool
2238 1155736 : single_def_chain_p (rtx reg)
2239 : {
2240 1155736 : df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
2241 1155736 : if (!ref)
2242 : return false;
2243 1155720 : return DF_REF_NEXT_REG (ref) == nullptr;
2244 : }
2245 :
/* Check if comparison INSN may be transformed into vector comparison.
   Currently we transform equality/inequality checks which look like:
   (set (reg:CCZ 17 flags) (compare:CCZ (reg:TI x) (reg:TI y)))
   MODE is the scalar mode being vectorized (DImode on 32-bit targets,
   TImode on 64-bit).  The recognized shapes correspond to the named
   patterns *cmp<dwi>_doubleword, *testti_doubleword and
   *test<dwi>_not_doubleword.  */

static bool
convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
{
  /* Only double-word comparisons are handled.  */
  if (mode != (TARGET_64BIT ? TImode : DImode))
    return false;

  /* The conversion uses ptest, which requires SSE4.1.  */
  if (!TARGET_SSE4_1)
    return false;

  rtx def_set = single_set (insn);

  gcc_assert (def_set);

  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  gcc_assert (GET_CODE (src) == COMPARE);

  /* Only equality/inequality (CCZmode) results can be vectorized.  */
  if (!REG_P (dst)
      || REGNO (dst) != FLAGS_REG
      || GET_MODE (dst) != CCZmode)
    return false;

  rtx op1 = XEXP (src, 0);
  rtx op2 = XEXP (src, 1);

  /* *cmp<dwi>_doubleword.  */
  if ((CONST_SCALAR_INT_P (op1)
       || ((REG_P (op1) || MEM_P (op1))
           && GET_MODE (op1) == mode))
      && (CONST_SCALAR_INT_P (op2)
          || ((REG_P (op2) || MEM_P (op2))
              && GET_MODE (op2) == mode)))
    return true;

  /* *testti_doubleword.  */
  if (op2 == const0_rtx
      && GET_CODE (op1) == AND
      && REG_P (XEXP (op1, 0)))
    {
      rtx op12 = XEXP (op1, 1);
      return GET_MODE (XEXP (op1, 0)) == TImode
             && (CONST_SCALAR_INT_P (op12)
                 || ((REG_P (op12) || MEM_P (op12))
                     && GET_MODE (op12) == TImode));
    }

  /* *test<dwi>_not_doubleword.  */
  if (op2 == const0_rtx
      && GET_CODE (op1) == AND
      && GET_CODE (XEXP (op1, 0)) == NOT)
    {
      rtx op11 = XEXP (XEXP (op1, 0), 0);
      rtx op12 = XEXP (op1, 1);
      return (REG_P (op11) || MEM_P (op11))
             && (REG_P (op12) || MEM_P (op12))
             && GET_MODE (op11) == mode
             && GET_MODE (op12) == mode;
    }

  return false;
}
2312 :
/* The general version of scalar_to_vector_candidate_p.  Returns true if
   INSN is a single_set in MODE (SImode or DImode) whose operation can be
   performed in the corresponding vector mode.  Note the case fallthroughs
   below are deliberate: after the switch, the common tail validates
   XEXP (src, 0).  */

static bool
general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
{
  rtx def_set = pseudo_reg_set (insn);

  if (!def_set)
    return false;

  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  if (GET_CODE (src) == COMPARE)
    return convertible_comparison_p (insn, mode);

  /* We are interested in "mode" only.  */
  if ((GET_MODE (src) != mode
       && !CONST_INT_P (src))
      || GET_MODE (dst) != mode)
    return false;

  if (!REG_P (dst) && !MEM_P (dst))
    return false;

  switch (GET_CODE (src))
    {
    case ASHIFT:
    case LSHIFTRT:
    case ASHIFTRT:
    case ROTATE:
    case ROTATERT:
      /* Only constant in-range shift counts are handled.  */
      if (!CONST_INT_P (XEXP (src, 1))
          || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
        return false;

      /* Check for extend highpart case.  For DImode ASHIFTRT of an
         ASHIFT, descend into the inner shift so the tail checks its
         operand instead.  */
      if (mode != DImode
          || GET_CODE (src) != ASHIFTRT
          || GET_CODE (XEXP (src, 0)) != ASHIFT)
        break;

      src = XEXP (src, 0);
      break;

    case SMAX:
    case SMIN:
    case UMAX:
    case UMIN:
      /* Vector min/max in these element widths require AVX512VL (DI)
         or SSE4.1 (SI).  */
      if ((mode == DImode && !TARGET_AVX512VL)
          || (mode == SImode && !TARGET_SSE4_1))
        return false;
      /* Fallthru.  */

    case AND:
    case IOR:
    case XOR:
    case PLUS:
    case MINUS:
      if (!REG_P (XEXP (src, 1))
          && !MEM_P (XEXP (src, 1))
          && !CONST_INT_P (XEXP (src, 1)))
        return false;

      if (GET_MODE (XEXP (src, 1)) != mode
          && !CONST_INT_P (XEXP (src, 1)))
        return false;

      /* Check for andnot case.  For AND of a NOT, descend into the NOT
         so the tail checks its operand.  */
      if (GET_CODE (src) != AND
          || GET_CODE (XEXP (src, 0)) != NOT)
        break;

      src = XEXP (src, 0);
      /* FALLTHRU */

    case NOT:
      break;

    case NEG:
      /* Check for nabs case.  */
      if (GET_CODE (XEXP (src, 0)) != ABS)
        break;

      src = XEXP (src, 0);
      /* FALLTHRU */

    case ABS:
      if ((mode == DImode && !TARGET_AVX512VL)
          || (mode == SImode && !TARGET_SSSE3))
        return false;
      break;

    case REG:
      return true;

    case MEM:
    case CONST_INT:
      /* Plain loads/constants are candidates only into a register.  */
      return REG_P (dst);

    case VEC_SELECT:
      /* Excluding MEM_P (dst) avoids interfering with vpextr[dq].  */
      return REG_P (dst)
             && REG_P (XEXP (src, 0))
             && GET_MODE (XEXP (src, 0)) == (mode == DImode ? V2DImode
                                                            : V4SImode)
             && GET_CODE (XEXP (src, 1)) == PARALLEL
             && XVECLEN (XEXP (src, 1), 0) == 1
             && CONST_INT_P (XVECEXP (XEXP (src, 1), 0, 0));

    default:
      return false;
    }

  /* Common tail: validate the (possibly replaced) first operand.  */
  if (!REG_P (XEXP (src, 0))
      && !MEM_P (XEXP (src, 0))
      && !CONST_INT_P (XEXP (src, 0)))
    return false;

  if (GET_MODE (XEXP (src, 0)) != mode
      && !CONST_INT_P (XEXP (src, 0)))
    return false;

  return true;
}
2438 :
2439 : /* Check for a suitable TImode memory operand. */
2440 :
2441 : static bool
2442 1566 : timode_mem_p (rtx x)
2443 : {
2444 1566 : return MEM_P (x)
2445 1566 : && (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
2446 0 : || !misaligned_operand (x, TImode));
2447 : }
2448 :
/* The TImode version of scalar_to_vector_candidate_p.  Returns true if
   INSN is a single_set on TImode values whose operation can be carried
   out in V1TImode.  */

static bool
timode_scalar_to_vector_candidate_p (rtx_insn *insn)
{
  rtx def_set = pseudo_reg_set (insn);

  if (!def_set)
    return false;

  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);

  if (GET_CODE (src) == COMPARE)
    return convertible_comparison_p (insn, TImode);

  if (GET_MODE (dst) != TImode
      || (GET_MODE (src) != TImode
          && !CONST_SCALAR_INT_P (src)))
    return false;

  if (!REG_P (dst) && !MEM_P (dst))
    return false;

  /* Misaligned stores are only worthwhile when the target handles
     unaligned 16-byte stores well.  */
  if (MEM_P (dst)
      && misaligned_operand (dst, TImode)
      && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
    return false;

  /* A multiply-defined destination may not be convertible in every
     chain that defines it.  */
  if (REG_P (dst) && !single_def_chain_p (dst))
    return false;

  switch (GET_CODE (src))
    {
    case REG:
      return single_def_chain_p (src);

    case CONST_WIDE_INT:
      return true;

    case CONST_INT:
      /* ??? Verify performance impact before enabling CONST_INT for
         __int128 store.  */
      return standard_sse_constant_p (src, TImode);

    case MEM:
      /* Memory must be aligned or unaligned load is optimal.  */
      return (REG_P (dst)
              && (!misaligned_operand (src, TImode)
                  || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));

    case AND:
      /* andnot form (AND of a NOT) is only handled for register
         destinations; see convert_insn.  */
      if (!MEM_P (dst)
          && GET_CODE (XEXP (src, 0)) == NOT
          && REG_P (XEXP (XEXP (src, 0), 0))
          && (REG_P (XEXP (src, 1))
              || CONST_SCALAR_INT_P (XEXP (src, 1))
              || timode_mem_p (XEXP (src, 1))))
        return true;
      return (REG_P (XEXP (src, 0))
              || timode_mem_p (XEXP (src, 0)))
             && (REG_P (XEXP (src, 1))
                 || CONST_SCALAR_INT_P (XEXP (src, 1))
                 || timode_mem_p (XEXP (src, 1)));

    case IOR:
    case XOR:
      /* A concatenation of two DImode halves is always convertible
         (becomes vec_concatv2di).  */
      if (timode_concatdi_p (src))
        return true;
      return (REG_P (XEXP (src, 0))
              || timode_mem_p (XEXP (src, 0)))
             && (REG_P (XEXP (src, 1))
                 || CONST_SCALAR_INT_P (XEXP (src, 1))
                 || timode_mem_p (XEXP (src, 1)));

    case NOT:
      return REG_P (XEXP (src, 0)) || timode_mem_p (XEXP (src, 0));

    case ASHIFT:
    case LSHIFTRT:
    case ASHIFTRT:
    case ROTATERT:
    case ROTATE:
      /* Handle shifts/rotates by integer constants between 0 and 127.  */
      return REG_P (XEXP (src, 0))
             && CONST_INT_P (XEXP (src, 1))
             && (INTVAL (XEXP (src, 1)) & ~0x7f) == 0;

    case PLUS:
      /* PLUS is only convertible in its DImode-concatenation form.  */
      return timode_concatdi_p (src);

    case ZERO_EXTEND:
      return REG_P (XEXP (src, 0))
             && GET_MODE (XEXP (src, 0)) == DImode;

    default:
      return false;
    }
}
2548 :
/* For a register REGNO, scan instructions for its defs and uses.
   Put REGNO in REGS if a def or use isn't in CANDIDATES.  */

static void
timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
                                   unsigned int regno)
{
  /* Do nothing if REGNO is already in REGS or is a hard reg.  */
  if (bitmap_bit_p (regs, regno)
      || HARD_REGISTER_NUM_P (regno))
    return;

  /* Any definition outside the candidate set taints the register.  */
  for (df_ref def = DF_REG_DEF_CHAIN (regno);
       def;
       def = DF_REF_NEXT_REG (def))
    {
      if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
        {
          if (dump_file)
            fprintf (dump_file,
                     "r%d has non convertible def in insn %d\n",
                     regno, DF_REF_INSN_UID (def));

          bitmap_set_bit (regs, regno);
          break;
        }
    }

  /* Likewise for any non-debug use outside the candidate set.  */
  for (df_ref ref = DF_REG_USE_CHAIN (regno);
       ref;
       ref = DF_REF_NEXT_REG (ref))
    {
      /* Debug instructions are skipped.  */
      if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
          && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
        {
          if (dump_file)
            fprintf (dump_file,
                     "r%d has non convertible use in insn %d\n",
                     regno, DF_REF_INSN_UID (ref));

          bitmap_set_bit (regs, regno);
          break;
        }
    }
}
2595 :
/* For a given bitmap of insn UIDs scans all instructions and
   remove insn from CANDIDATES in case it has both convertible
   and not convertible definitions.

   All insns in a bitmap are conversion candidates according to
   scalar_to_vector_candidate_p.  Currently it implies all insns
   are single_set.

   Iterates to a fixed point: removing one insn can make more
   registers non-convertible, which can disqualify further insns.  */

static void
timode_remove_non_convertible_regs (bitmap candidates)
{
  bitmap_iterator bi;
  unsigned id;
  bitmap regs = BITMAP_ALLOC (NULL);
  bool changed;

  do {
    changed = false;
    /* Collect, in REGS, every TImode register that has a def or use
       outside the current candidate set.  */
    EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
      {
        rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
        df_ref ref;

        FOR_EACH_INSN_DEF (ref, insn)
          if (!DF_REF_REG_MEM_P (ref)
              && GET_MODE (DF_REF_REG (ref)) == TImode)
            timode_check_non_convertible_regs (candidates, regs,
                                               DF_REF_REGNO (ref));

        FOR_EACH_INSN_USE (ref, insn)
          if (!DF_REF_REG_MEM_P (ref)
              && GET_MODE (DF_REF_REG (ref)) == TImode)
            timode_check_non_convertible_regs (candidates, regs,
                                               DF_REF_REGNO (ref));
      }

    /* Drop every candidate insn that defines or uses a tainted
       register.  */
    EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
      {
        for (df_ref def = DF_REG_DEF_CHAIN (id);
             def;
             def = DF_REF_NEXT_REG (def))
          if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
            {
              if (dump_file)
                fprintf (dump_file, "Removing insn %d from candidates list\n",
                         DF_REF_INSN_UID (def));

              bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
              changed = true;
            }

        for (df_ref ref = DF_REG_USE_CHAIN (id);
             ref;
             ref = DF_REF_NEXT_REG (ref))
          if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
            {
              if (dump_file)
                fprintf (dump_file, "Removing insn %d from candidates list\n",
                         DF_REF_INSN_UID (ref));

              bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
              changed = true;
            }
      }
  } while (changed);

  BITMAP_FREE (regs);
}
2664 :
/* Main STV pass function.  Find and convert scalar
   instructions into vector mode when profitable.

   TIMODE_P selects which run this is: true converts TImode chains
   (64-bit only), false converts SImode/DImode chains.  Returns 0
   (the pass has no TODO flags to report dynamically).  */

static unsigned int
convert_scalars_to_vector (bool timode_p)
{
  basic_block bb;
  int converted_insns = 0;
  auto_vec<rtx_insn *> control_flow_insns;

  bitmap_obstack_initialize (NULL);
  const machine_mode cand_mode[3] = { SImode, DImode, TImode };
  const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
  bitmap_head candidates[3]; /* { SImode, DImode, TImode } */
  for (unsigned i = 0; i < 3; ++i)
    bitmap_initialize (&candidates[i], &bitmap_default_obstack);

  /* Chain building below relies on def-use/use-def chains.  */
  calculate_dominance_info (CDI_DOMINATORS);
  df_set_flags (DF_DEFER_INSN_RESCAN | DF_RD_PRUNE_DEAD_DEFS);
  df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
  df_analyze ();

  /* Find all instructions we want to convert into vector mode.  */
  if (dump_file)
    fprintf (dump_file, "Searching for mode conversion candidates...\n");

  FOR_EACH_BB_FN (bb, cfun)
    {
      rtx_insn *insn;
      FOR_BB_INSNS (bb, insn)
        if (timode_p
            && timode_scalar_to_vector_candidate_p (insn))
          {
            if (dump_file)
              fprintf (dump_file, "  insn %d is marked as a TImode candidate\n",
                       INSN_UID (insn));

            bitmap_set_bit (&candidates[2], INSN_UID (insn));
          }
        else if (!timode_p)
          {
            /* Check {SI,DI}mode.  An insn is recorded for at most one
               of the two modes.  */
            for (unsigned i = 0; i <= 1; ++i)
              if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
                {
                  if (dump_file)
                    fprintf (dump_file, "  insn %d is marked as a %s candidate\n",
                             INSN_UID (insn), i == 0 ? "SImode" : "DImode");

                  bitmap_set_bit (&candidates[i], INSN_UID (insn));
                  break;
                }
          }
    }

  if (timode_p)
    timode_remove_non_convertible_regs (&candidates[2]);

  /* Only dump diagnostics; the loop exits early when any candidate
     set is non-empty.  */
  for (unsigned i = 0; i <= 2; ++i)
    if (!bitmap_empty_p (&candidates[i]))
      break;
    else if (i == 2 && dump_file)
      fprintf (dump_file, "There are no candidates for optimization.\n");

  /* Greedily build a chain from each remaining candidate; chain->build
     consumes the insns it claims from the candidate bitmap.  */
  for (unsigned i = 0; i <= 2; ++i)
    {
      auto_bitmap disallowed;
      bitmap_tree_view (&candidates[i]);
      while (!bitmap_empty_p (&candidates[i]))
        {
          unsigned uid = bitmap_first_set_bit (&candidates[i]);
          scalar_chain *chain;

          if (cand_mode[i] == TImode)
            chain = new timode_scalar_chain;
          else
            chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);

          /* Find instructions chain we want to convert to vector mode.
             Check all uses and definitions to estimate all required
             conversions.  */
          if (chain->build (&candidates[i], uid, disallowed))
            {
              if (chain->compute_convert_gain ())
                converted_insns += chain->convert ();
              else if (dump_file)
                fprintf (dump_file, "Chain #%d conversion is not profitable\n",
                         chain->chain_id);
            }

          /* Remember insns that may throw / end a BB so the CFG can be
             repaired after all conversions are done.  */
          rtx_insn* iter_insn;
          unsigned int ii;
          FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
            control_flow_insns.safe_push (iter_insn);

          delete chain;
        }
    }

  if (dump_file)
    fprintf (dump_file, "Total insns converted: %d\n", converted_insns);

  for (unsigned i = 0; i <= 2; ++i)
    bitmap_release (&candidates[i]);
  bitmap_obstack_release (NULL);
  df_process_deferred_rescans ();

  /* Conversion means we may have 128bit register spills/fills
     which require aligned stack.  */
  if (converted_insns)
    {
      if (crtl->stack_alignment_needed < 128)
        crtl->stack_alignment_needed = 128;
      if (crtl->stack_alignment_estimated < 128)
        crtl->stack_alignment_estimated = 128;

      crtl->stack_realign_needed
        = INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
      crtl->stack_realign_tried = crtl->stack_realign_needed;

      crtl->stack_realign_processed = true;

      if (!crtl->drap_reg)
        {
          rtx drap_rtx = targetm.calls.get_drap_rtx ();

          /* stack_realign_drap and drap_rtx must match.  */
          gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));

          /* Do nothing if NULL is returned,
             which means DRAP is not needed.  */
          if (drap_rtx != NULL)
            {
              crtl->args.internal_arg_pointer = drap_rtx;

              /* Call fixup_tail_calls to clean up
                 REG_EQUIV note if DRAP is needed.  */
              fixup_tail_calls ();
            }
        }

      /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments.  Converted
         TImode argument registers now carry V1TImode; rewrap them in a
         TImode subreg so debug/argument info stays in scalar mode.  */
      if (TARGET_64BIT)
        for (tree parm = DECL_ARGUMENTS (current_function_decl);
             parm; parm = DECL_CHAIN (parm))
          {
            if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
              continue;
            if (DECL_RTL_SET_P (parm)
                && GET_MODE (DECL_RTL (parm)) == V1TImode)
              {
                rtx r = DECL_RTL (parm);
                if (REG_P (r))
                  SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
              }
            if (DECL_INCOMING_RTL (parm)
                && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
              {
                rtx r = DECL_INCOMING_RTL (parm);
                if (REG_P (r))
                  DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
              }
          }

      if (!control_flow_insns.is_empty ())
        {
          free_dominance_info (CDI_DOMINATORS);

          unsigned int i;
          rtx_insn* insn;
          FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
            if (control_flow_insn_p (insn))
              {
                /* Split the block after insn.  There will be a fallthru
                   edge, which is OK so we keep it.  We have to create
                   the exception edges ourselves.  */
                bb = BLOCK_FOR_INSN (insn);
                split_block (bb, insn);
                rtl_make_eh_edge (NULL, bb, BB_END (bb));
              }
        }
    }

  return 0;
}
2850 :
2851 : static unsigned int
2852 74342 : rest_of_handle_insert_vzeroupper (void)
2853 : {
2854 : /* vzeroupper instructions are inserted immediately after reload and
2855 : postreload_cse to clean up after it a little bit to account for possible
2856 : spills from 256bit or 512bit registers. The pass reuses mode switching
2857 : infrastructure by re-running mode insertion pass, so disable entities
2858 : that have already been processed. */
2859 520394 : for (int i = 0; i < MAX_386_ENTITIES; i++)
2860 446052 : ix86_optimize_mode_switching[i] = 0;
2861 :
2862 74342 : ix86_optimize_mode_switching[AVX_U128] = 1;
2863 :
2864 : /* Call optimize_mode_switching. */
2865 74342 : g->get_passes ()->execute_pass_mode_switching ();
2866 :
2867 : /* LRA removes all REG_DEAD/REG_UNUSED notes and normally they
2868 : reappear in the IL only at the start of pass_rtl_dse2, which does
2869 : df_note_add_problem (); df_analyze ();
2870 : The vzeroupper is scheduled after postreload_cse pass and mode
2871 : switching computes the notes as well, the problem is that e.g.
2872 : pass_gcse2 doesn't maintain the notes, see PR113059 and
2873 : PR112760. Remove the notes now to restore status quo ante
2874 : until we figure out how to maintain the notes or what else
2875 : to do. */
2876 74342 : basic_block bb;
2877 74342 : rtx_insn *insn;
2878 409112 : FOR_EACH_BB_FN (bb, cfun)
2879 4318273 : FOR_BB_INSNS (bb, insn)
2880 3983503 : if (NONDEBUG_INSN_P (insn))
2881 : {
2882 2121088 : rtx *pnote = ®_NOTES (insn);
2883 3933459 : while (*pnote != 0)
2884 : {
2885 1812371 : if (REG_NOTE_KIND (*pnote) == REG_DEAD
2886 829627 : || REG_NOTE_KIND (*pnote) == REG_UNUSED)
2887 1299935 : *pnote = XEXP (*pnote, 1);
2888 : else
2889 512436 : pnote = &XEXP (*pnote, 1);
2890 : }
2891 : }
2892 :
2893 74342 : df_remove_problem (df_note);
2894 74342 : df_analyze ();
2895 74342 : return 0;
2896 : }
2897 :
2898 : namespace {
2899 :
2900 : const pass_data pass_data_insert_vzeroupper =
2901 : {
2902 : RTL_PASS, /* type */
2903 : "vzeroupper", /* name */
2904 : OPTGROUP_NONE, /* optinfo_flags */
2905 : TV_MACH_DEP, /* tv_id */
2906 : 0, /* properties_required */
2907 : 0, /* properties_provided */
2908 : 0, /* properties_destroyed */
2909 : 0, /* todo_flags_start */
2910 : TODO_df_finish, /* todo_flags_finish */
2911 : };
2912 :
2913 : class pass_insert_vzeroupper : public rtl_opt_pass
2914 : {
2915 : public:
2916 287872 : pass_insert_vzeroupper(gcc::context *ctxt)
2917 575744 : : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
2918 : {}
2919 :
2920 : /* opt_pass methods: */
2921 1480955 : bool gate (function *) final override
2922 : {
2923 1480955 : return TARGET_AVX && TARGET_VZEROUPPER;
2924 : }
2925 :
2926 74342 : unsigned int execute (function *) final override
2927 : {
2928 74342 : return rest_of_handle_insert_vzeroupper ();
2929 : }
2930 :
2931 : }; // class pass_insert_vzeroupper
2932 :
/* Pass descriptor for the scalar-to-vector (STV) pass: an RTL pass timed
   under TV_MACH_DEP that requests df finalization when done.  */
const pass_data pass_data_stv =
{
  RTL_PASS, /* type */
  "stv", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
};
2945 :
/* The scalar-to-vector (STV) RTL pass.  The pass manager clones it and
   configures each clone via set_pass_param: one instance handles
   SImode/DImode chains, the other (64-bit only) TImode chains.  */
class pass_stv : public rtl_opt_pass
{
public:
  pass_stv (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_stv, ctxt),
      timode_p (false)
  {}

  /* opt_pass methods: */
  /* TImode conversion requires 64-bit; both variants need -mstv,
     SSE2 and -O2 or higher.  */
  bool gate (function *) final override
  {
    return ((!timode_p || TARGET_64BIT)
            && TARGET_STV && TARGET_SSE2 && optimize > 1);
  }

  unsigned int execute (function *) final override
  {
    return convert_scalars_to_vector (timode_p);
  }

  opt_pass *clone () final override
  {
    return new pass_stv (m_ctxt);
  }

  /* The single pass parameter (n == 0) selects the TImode variant.  */
  void set_pass_param (unsigned int n, bool param) final override
  {
    gcc_assert (n == 0);
    timode_p = param;
  }

private:
  /* True when this instance converts TImode chains.  */
  bool timode_p;
}; // class pass_stv
2980 :
2981 : } // anon namespace
2982 :
2983 : rtl_opt_pass *
2984 287872 : make_pass_insert_vzeroupper (gcc::context *ctxt)
2985 : {
2986 287872 : return new pass_insert_vzeroupper (ctxt);
2987 : }
2988 :
2989 : rtl_opt_pass *
2990 287872 : make_pass_stv (gcc::context *ctxt)
2991 : {
2992 287872 : return new pass_stv (ctxt);
2993 : }
2994 :
2995 : /* Inserting ENDBR and pseudo patchable-area instructions. */
2996 :
/* Worker for pass_insert_endbr_and_patchable_area.  NEED_ENDBR says
   whether ENDBR insns are wanted at all; PATCHABLE_AREA_SIZE is the
   number of patchable-area bytes to insert at the function entry
   (zero for none).  */
static void
rest_of_insert_endbr_and_patchable_area (bool need_endbr,
					 unsigned int patchable_area_size)
{
  rtx endbr;
  rtx_insn *insn;
  /* The ENDBR emitted at the function entry, if any; the entry
     patchable area is placed right after it.  */
  rtx_insn *endbr_insn = NULL;
  basic_block bb;

  if (need_endbr)
    {
      /* Currently emit EB if it's a tracking function, i.e. 'nocf_check'
	 is absent among function attributes.  Later an optimization will
	 be introduced to make analysis if an address of a static function
	 is taken.  A static function whose address is not taken will get
	 a nocf_check attribute.  This will allow to reduce the number of
	 EB.  */
      if (!lookup_attribute ("nocf_check",
			     TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
	  && (!flag_manual_endbr
	      || lookup_attribute ("cf_check",
				   DECL_ATTRIBUTES (cfun->decl)))
	  && (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
	      || ix86_cmodel == CM_LARGE
	      || ix86_cmodel == CM_LARGE_PIC
	      || flag_force_indirect_call
	      || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
		  && DECL_DLLIMPORT_P (cfun->decl))))
	{
	  if (crtl->profile && flag_fentry)
	    {
	      /* Queue ENDBR insertion to x86_function_profiler.
		 NB: Any patchable-area insn will be inserted after
		 ENDBR.  */
	      cfun->machine->insn_queued_at_entrance = TYPE_ENDBR;
	    }
	  else
	    {
	      /* Insert ENDBR before the first insn of the first real
		 basic block.  */
	      endbr = gen_nop_endbr ();
	      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
	      rtx_insn *insn = BB_HEAD (bb);
	      endbr_insn = emit_insn_before (endbr, insn);
	    }
	}
    }

  if (patchable_area_size)
    {
      if (crtl->profile && flag_fentry)
	{
	  /* Queue patchable-area insertion to x86_function_profiler.
	     NB: If there is a queued ENDBR, x86_function_profiler
	     will also handle patchable-area.  */
	  if (!cfun->machine->insn_queued_at_entrance)
	    cfun->machine->insn_queued_at_entrance = TYPE_PATCHABLE_AREA;
	}
      else
	{
	  rtx patchable_area
	    = gen_patchable_area (GEN_INT (patchable_area_size),
				  GEN_INT (crtl->patch_area_entry == 0));
	  if (endbr_insn)
	    emit_insn_after (patchable_area, endbr_insn);
	  else
	    {
	      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
	      insn = BB_HEAD (bb);
	      emit_insn_before (patchable_area, insn);
	    }
	}
    }

  if (!need_endbr)
    return;

  /* Scan the whole function for further places which need a
     following ENDBR: setjmp-like calls, calls to functions marked
     'indirect_return', switch-table targets under flag_cet_switch,
     and preserved labels.  */
  bb = 0;
  FOR_EACH_BB_FN (bb, cfun)
    {
      for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
	   insn = NEXT_INSN (insn))
	{
	  if (CALL_P (insn))
	    {
	      /* NEED_ENDBR is reused here to track whether this
		 particular call needs a following ENDBR.  */
	      need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
	      if (!need_endbr && !SIBLING_CALL_P (insn))
		{
		  rtx call = get_call_rtx_from (insn);
		  rtx fnaddr = XEXP (call, 0);
		  tree fndecl = NULL_TREE;

		  /* Also generate ENDBRANCH for non-tail call which
		     may return via indirect branch.  */
		  if (SYMBOL_REF_P (XEXP (fnaddr, 0)))
		    fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
		  if (fndecl == NULL_TREE)
		    fndecl = MEM_EXPR (fnaddr);
		  if (fndecl
		      && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
		      && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
		    fndecl = NULL_TREE;
		  if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
		    {
		      tree fntype = TREE_TYPE (fndecl);
		      if (lookup_attribute ("indirect_return",
					    TYPE_ATTRIBUTES (fntype)))
			need_endbr = true;
		    }
		}
	      if (!need_endbr)
		continue;
	      /* Generate ENDBRANCH after CALL, which can return more than
		 twice, setjmp-like functions.  */

	      endbr = gen_nop_endbr ();
	      emit_insn_after_setloc (endbr, insn, INSN_LOCATION (insn));
	      continue;
	    }

	  if (JUMP_P (insn) && flag_cet_switch)
	    {
	      rtx target = JUMP_LABEL (insn);
	      if (target == NULL_RTX || ANY_RETURN_P (target))
		continue;

	      /* Check the jump is a switch table.  */
	      rtx_insn *label = as_a<rtx_insn *> (target);
	      rtx_insn *table = next_insn (label);
	      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
		continue;

	      /* For the indirect jump find out all places it jumps and insert
		 ENDBRANCH there.  It should be done under a special flag to
		 control ENDBRANCH generation for switch stmts.  */
	      edge_iterator ei;
	      edge e;
	      basic_block dest_blk;

	      FOR_EACH_EDGE (e, ei, bb->succs)
		{
		  rtx_insn *insn;

		  /* Every successor starts with a label; put the
		     ENDBR right after it.  */
		  dest_blk = e->dest;
		  insn = BB_HEAD (dest_blk);
		  gcc_assert (LABEL_P (insn));
		  endbr = gen_nop_endbr ();
		  emit_insn_after (endbr, insn);
		}
	      continue;
	    }

	  if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
	    {
	      /* A preserved label may be reached via an indirect
		 branch, so it also gets an ENDBR.  */
	      endbr = gen_nop_endbr ();
	      emit_insn_after (endbr, insn);
	      continue;
	    }
	}
    }

  return;
}
3158 :
3159 : namespace {
3160 :
/* Pass descriptor for the ENDBR/patchable-area insertion pass
   below.  */
const pass_data pass_data_insert_endbr_and_patchable_area =
{
  RTL_PASS, /* type. */
  "endbr_and_patchable_area", /* name. */
  OPTGROUP_NONE, /* optinfo_flags. */
  TV_MACH_DEP, /* tv_id. */
  0, /* properties_required. */
  0, /* properties_provided. */
  0, /* properties_destroyed. */
  0, /* todo_flags_start. */
  0, /* todo_flags_finish. */
};
3173 :
/* RTL pass inserting ENDBR insns and patchable areas
   (see rest_of_insert_endbr_and_patchable_area).  */
class pass_insert_endbr_and_patchable_area : public rtl_opt_pass
{
public:
  pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_insert_endbr_and_patchable_area, ctxt)
  {}

  /* opt_pass methods: */
  /* Run when CF branch protection is enabled or a patchable area is
     pending.  NB: the gate also computes need_endbr and
     patchable_area_size for execute below.  */
  bool gate (function *) final override
  {
    need_endbr = (flag_cf_protection & CF_BRANCH) != 0;
    patchable_area_size = crtl->patch_area_size - crtl->patch_area_entry;
    return need_endbr || patchable_area_size;
  }

  unsigned int execute (function *) final override
  {
    timevar_push (TV_MACH_DEP);
    rest_of_insert_endbr_and_patchable_area (need_endbr,
					     patchable_area_size);
    timevar_pop (TV_MACH_DEP);
    return 0;
  }

private:
  /* Computed by gate, consumed by execute.  */
  bool need_endbr;
  unsigned int patchable_area_size;
}; // class pass_insert_endbr_and_patchable_area
3202 :
3203 : } // anon namespace
3204 :
3205 : rtl_opt_pass *
3206 287872 : make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
3207 : {
3208 287872 : return new pass_insert_endbr_and_patchable_area (ctxt);
3209 : }
3210 :
3211 : bool
3212 6148285 : ix86_rpad_gate ()
3213 : {
3214 6148285 : return (TARGET_AVX
3215 403835 : && TARGET_SSE_PARTIAL_REG_DEPENDENCY
3216 308940 : && TARGET_SSE_MATH
3217 308710 : && optimize
3218 6451827 : && optimize_function_for_speed_p (cfun));
3219 : }
3220 :
/* Kinds of redundant load patterns tracked by the machine-dependent
   CSE code below (see struct redundant_pattern).  */
enum x86_cse_kind
{
  /* Vector of all zero bits.  */
  X86_CSE_CONST0_VECTOR,
  /* Vector of all one bits.  */
  X86_CSE_CONSTM1_VECTOR,
  /* Broadcast of a scalar (vec_duplicate).  */
  X86_CSE_VEC_DUP,
  /* TLS global-dynamic access.  */
  X86_CSE_TLS_GD,
  /* TLS local-dynamic base access.  */
  X86_CSE_TLS_LD_BASE,
  /* TLS descriptor (UNSPEC_TLSDESC) access.  */
  X86_CSE_TLSDESC
};
3230 :
/* Description of one group of equivalent redundant instructions
   (vector constants, broadcasts or TLS accesses) which may be
   replaced by a single dominating instruction.  */
struct redundant_pattern
{
  /* Bitmap of basic blocks with broadcast instructions. */
  auto_bitmap bbs;
  /* Bitmap of broadcast instructions. */
  auto_bitmap insns;
  /* The broadcast inner scalar. */
  rtx val;
  /* The actual redundant source value for UNSPEC_TLSDESC. */
  rtx tlsdesc_val;
  /* The inner scalar mode. */
  machine_mode mode;
  /* The instruction which sets the inner scalar.  Nullptr if the inner
     scalar is applied to the whole function, instead of within the same
     block. */
  rtx_insn *def_insn;
  /* The widest broadcast source. */
  rtx broadcast_source;
  /* The widest broadcast register. */
  rtx broadcast_reg;
  /* The basic block of the broadcast instruction. */
  basic_block bb;
  /* The number of broadcast instructions with the same inner scalar. */
  unsigned HOST_WIDE_INT count;
  /* The threshold of broadcast instructions with the same inner
     scalar. */
  unsigned int threshold;
  /* The widest broadcast size in bytes. */
  unsigned int size;
  /* Load kind. */
  x86_cse_kind kind;
};
3263 :
3264 : /* Generate a vector set, DEST = SRC, at entry of the nearest dominator
3265 : for basic block map BBS, which is in the fake loop that contains the
3266 : whole function, so that there is only a single vector set in the
3267 : whole function. If not nullptr, LOAD is a pointer to the load. */
3268 :
static void
ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
			      redundant_pattern *load = nullptr)
{
  /* Start from the nearest common dominator of all blocks which use
     the value.  */
  basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
  /* For X86_CSE_VEC_DUP, don't place the vector set outside of the loop
     to avoid extra spills. */
  if (!load || load->kind != X86_CSE_VEC_DUP)
    {
      /* Walk out of enclosing loops, moving to the dominator of each
	 loop header, until reaching the fake outermost loop whose
	 latch is the EXIT block.  */
      while (bb->loop_father->latch
	     != EXIT_BLOCK_PTR_FOR_FN (cfun))
	bb = get_immediate_dominator (CDI_DOMINATORS,
				      bb->loop_father->header);
    }

  rtx set = gen_rtx_SET (dest, src);

  /* Find the first real (non-debug) insn in BB; INSN becomes NULL if
     the block contains none.  */
  rtx_insn *insn = BB_HEAD (bb);
  while (insn && !NONDEBUG_INSN_P (insn))
    {
      if (insn == BB_END (bb))
	{
	  insn = NULL;
	  break;
	}
      insn = NEXT_INSN (insn);
    }

  rtx_insn *set_insn;
  if (insn == BB_HEAD (bb))
    {
      set_insn = emit_insn_before (set, insn);
      if (dump_file)
	{
	  fprintf (dump_file, "\nPlace:\n\n");
	  print_rtl_single (dump_file, set_insn);
	  fprintf (dump_file, "\nbefore:\n\n");
	  print_rtl_single (dump_file, insn);
	  fprintf (dump_file, "\n");
	}
    }
  else
    {
      /* Emit after the insn preceding the first real insn, or at the
	 end of the block when it has no real insn at all.  */
      rtx_insn *after = insn ? PREV_INSN (insn) : BB_END (bb);
      set_insn = emit_insn_after (set, after);
      if (dump_file)
	{
	  fprintf (dump_file, "\nPlace:\n\n");
	  print_rtl_single (dump_file, set_insn);
	  fprintf (dump_file, "\nafter:\n\n");
	  print_rtl_single (dump_file, after);
	  fprintf (dump_file, "\n");
	}
    }

  if (load && load->kind == X86_CSE_VEC_DUP)
    {
      /* Get the source from LOAD as (reg:SI 99) in

	 (vec_duplicate:V4SI (reg:SI 99))

       */
      rtx inner_scalar = load->val;
      /* Set the source in (vec_duplicate:V4SI (reg:SI 99)). */
      rtx reg = XEXP (src, 0);
      machine_mode reg_mode = GET_MODE (reg);
      if (reg_mode != GET_MODE (inner_scalar))
	{
	  if (REG_P (inner_scalar) || MEM_P (inner_scalar))
	    inner_scalar = gen_rtx_SUBREG (reg_mode, inner_scalar, 0);
	  else if (!SCALAR_INT_MODE_P (reg_mode))
	    {
	      /* For non-int load with integer constant, generate

		 (set (subreg:SI (reg/v:SF 105 [ f ]) 0)
		      (const_int 1313486336 [0x4e4a3600]))

	       */
	      gcc_assert (CONST_INT_P (inner_scalar));
	      unsigned int bits = GET_MODE_BITSIZE (reg_mode);
	      machine_mode mode = int_mode_for_size (bits, 0).require ();
	      reg = gen_rtx_SUBREG (mode, reg, 0);
	    }
	}
      /* Materialize the scalar just before the vector set which
	 broadcasts it.  */
      rtx set = gen_rtx_SET (reg, inner_scalar);
      insn = emit_insn_before (set, set_insn);
      if (dump_file)
	{
	  fprintf (dump_file, "\nAdd:\n\n");
	  print_rtl_single (dump_file, insn);
	  fprintf (dump_file, "\nbefore:\n\n");
	  print_rtl_single (dump_file, set_insn);
	  fprintf (dump_file, "\n");
	}
    }
}
3365 :
3366 : /* At entry of the nearest common dominator for basic blocks with
3367 : conversions/rcp/sqrt/rsqrt/round, generate a single
3368 : vxorps %xmmN, %xmmN, %xmmN
3369 : for all
3370 : vcvtss2sd op, %xmmN, %xmmX
3371 : vcvtsd2ss op, %xmmN, %xmmX
3372 : vcvtsi2ss op, %xmmN, %xmmX
3373 : vcvtsi2sd op, %xmmN, %xmmX
3374 :
3375 : NB: We want to generate only a single vxorps to cover the whole
3376 : function. The LCM algorithm isn't appropriate here since it may
3377 : place a vxorps inside the loop. */
3378 :
static unsigned int
remove_partial_avx_dependency (void)
{
  timevar_push (TV_MACH_DEP);

  bitmap_obstack_initialize (NULL);
  /* Blocks which contain a converted insn; drives the placement of
     the single vxorps at the end.  */
  bitmap convert_bbs = BITMAP_ALLOC (NULL);

  basic_block bb;
  rtx_insn *insn, *set_insn;
  rtx set;
  /* Pseudo holding the all-zero V4SF value; allocated lazily when
     the first candidate insn is found.  */
  rtx v4sf_const0 = NULL_RTX;

  auto_vec<rtx_insn *> control_flow_insns;

  /* We create invalid RTL initially so defer rescans. */
  df_set_flags (DF_DEFER_INSN_RESCAN);

  FOR_EACH_BB_FN (bb, cfun)
    {
      FOR_BB_INSNS (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  set = single_set (insn);
	  if (!set)
	    continue;

	  if (get_attr_avx_partial_xmm_update (insn)
	      != AVX_PARTIAL_XMM_UPDATE_TRUE)
	    continue;

	  /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
	     SI -> SF, SI -> DF, DI -> SF, DI -> DF, sqrt, rsqrt, rcp,
	     round, to vec_dup and vec_merge with subreg. */
	  rtx src = SET_SRC (set);
	  rtx dest = SET_DEST (set);
	  machine_mode dest_mode = GET_MODE (dest);
	  bool convert_p = false;
	  switch (GET_CODE (src))
	    {
	    case FLOAT:
	    case FLOAT_EXTEND:
	    case FLOAT_TRUNCATE:
	    case UNSIGNED_FLOAT:
	      convert_p = true;
	      break;
	    default:
	      break;
	    }

	  /* Only handle conversion here. */
	  machine_mode src_mode
	    = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
	  switch (src_mode)
	    {
	    case E_SFmode:
	    case E_DFmode:
	      if (TARGET_USE_VECTOR_FP_CONVERTS
		  || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
		continue;
	      break;
	    case E_SImode:
	    case E_DImode:
	      if (TARGET_USE_VECTOR_CONVERTS
		  || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
		continue;
	      break;
	    case E_VOIDmode:
	      /* Non-conversion insns (sqrt/rsqrt/rcp/round) always
		 qualify.  */
	      gcc_assert (!convert_p);
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  if (!v4sf_const0)
	    v4sf_const0 = gen_reg_rtx (V4SFmode);

	  /* Pick the vector mode matching the destination scalar mode
	     and the corresponding view of the zero register.  */
	  rtx zero;
	  machine_mode dest_vecmode;
	  switch (dest_mode)
	    {
	    case E_HFmode:
	      dest_vecmode = V8HFmode;
	      zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0);
	      break;
	    case E_SFmode:
	      dest_vecmode = V4SFmode;
	      zero = v4sf_const0;
	      break;
	    case E_DFmode:
	      dest_vecmode = V2DFmode;
	      zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  /* Change source to vector mode. */
	  src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
	  src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
				   GEN_INT (HOST_WIDE_INT_1U));
	  /* Change destination to vector mode. */
	  rtx vec = gen_reg_rtx (dest_vecmode);
	  /* Generate an XMM vector SET. */
	  set = gen_rtx_SET (vec, src);
	  set_insn = emit_insn_before (set, insn);

	  if (cfun->can_throw_non_call_exceptions)
	    {
	      /* Handle REG_EH_REGION note. */
	      rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
	      if (note)
		{
		  control_flow_insns.safe_push (set_insn);
		  add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
		}
	    }

	  /* The original insn now just moves the scalar subreg of the
	     vector result into the old destination.  */
	  src = gen_rtx_SUBREG (dest_mode, vec, 0);
	  set = gen_rtx_SET (dest, src);

	  /* Drop possible dead definitions. */
	  PATTERN (insn) = set;

	  INSN_CODE (insn) = -1;
	  recog_memoized (insn);
	  df_insn_rescan (insn);
	  bitmap_set_bit (convert_bbs, bb->index);
	}
    }

  if (v4sf_const0)
    {
      /* (Re-)discover loops so that bb->loop_father can be used in the
	 analysis below. */
      calculate_dominance_info (CDI_DOMINATORS);
      loop_optimizer_init (AVOID_CFG_MODIFICATIONS);

      /* Emit the single vxorps dominating every converted block.  */
      ix86_place_single_vector_set (v4sf_const0,
				    CONST0_RTX (V4SFmode),
				    convert_bbs);

      loop_optimizer_finalize ();

      if (!control_flow_insns.is_empty ())
	{
	  free_dominance_info (CDI_DOMINATORS);

	  unsigned int i;
	  FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
	    if (control_flow_insn_p (insn))
	      {
		/* Split the block after insn.  There will be a fallthru
		   edge, which is OK so we keep it.  We have to create
		   the exception edges ourselves. */
		bb = BLOCK_FOR_INSN (insn);
		split_block (bb, insn);
		rtl_make_eh_edge (NULL, bb, BB_END (bb));
	      }
	}
    }

  df_process_deferred_rescans ();
  df_clear_flags (DF_DEFER_INSN_RESCAN);
  bitmap_obstack_release (NULL);
  BITMAP_FREE (convert_bbs);

  timevar_pop (TV_MACH_DEP);
  return 0;
}
3551 :
3552 : namespace {
3553 :
/* Pass descriptor for the remove-partial-AVX-dependency ("rpad")
   pass below.  */
const pass_data pass_data_remove_partial_avx_dependency =
{
  RTL_PASS, /* type */
  "rpad", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
3566 :
/* RTL pass wrapping remove_partial_avx_dependency.  */
class pass_remove_partial_avx_dependency : public rtl_opt_pass
{
public:
  pass_remove_partial_avx_dependency (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
  {}

  /* opt_pass methods: */
  bool gate (function *) final override
  {
    return ix86_rpad_gate ();
  }

  unsigned int execute (function *) final override
  {
    return remove_partial_avx_dependency ();
  }
}; // class pass_remove_partial_avx_dependency
3585 :
3586 : } // anon namespace
3587 :
3588 : rtl_opt_pass *
3589 287872 : make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
3590 : {
3591 287872 : return new pass_remove_partial_avx_dependency (ctxt);
3592 : }
3593 :
3594 : /* Return a machine mode suitable for vector SIZE with SMODE inner
3595 : mode. */
3596 :
3597 : static machine_mode
3598 32537 : ix86_get_vector_cse_mode (unsigned int size, machine_mode smode)
3599 : {
3600 : /* Use the inner scalar mode of vector broadcast source in:
3601 :
3602 : (set (reg:V8DF 394)
3603 : (vec_duplicate:V8DF (reg:V2DF 190 [ alpha ])))
3604 :
3605 : to compute the vector mode for broadcast from vector source.
3606 : */
3607 32537 : if (VECTOR_MODE_P (smode))
3608 1 : smode = GET_MODE_INNER (smode);
3609 32537 : scalar_mode s_mode = as_a <scalar_mode> (smode);
3610 65074 : poly_uint64 nunits = size / GET_MODE_SIZE (smode);
3611 32537 : machine_mode mode = mode_for_vector (s_mode, nunits).require ();
3612 32537 : return mode;
3613 : }
3614 :
3615 : /* Replace the source operand of instructions in VECTOR_INSNS with
3616 : VECTOR_CONST in VECTOR_MODE. */
3617 :
static void
replace_vector_const (machine_mode vector_mode, rtx vector_const,
		      auto_bitmap &vector_insns,
		      machine_mode scalar_mode)
{
  bitmap_iterator bi;
  unsigned int id;

  /* Walk every recorded insn (by insn UID) and rewrite its source to
     use VECTOR_CONST.  */
  EXECUTE_IF_SET_IN_BITMAP (vector_insns, 0, id, bi)
    {
      rtx_insn *insn = DF_INSN_UID_GET (id)->insn;

      /* Get the single SET instruction. */
      rtx set = single_set (insn);
      rtx src = SET_SRC (set);
      rtx dest = SET_DEST (set);
      machine_mode mode = GET_MODE (dest);

      rtx replace;
      /* Replace the source operand with VECTOR_CONST. */
      if (SUBREG_P (src) || mode == vector_mode)
	replace = vector_const;
      else
	{
	  unsigned int size = GET_MODE_SIZE (mode);
	  if (size < ix86_regmode_natural_size (mode))
	    {
	      /* If the mode size is smaller than its natural size,
		 first insert an extra move with a QI vector SUBREG
		 of the same size to avoid validate_subreg failure. */
	      machine_mode vmode
		= ix86_get_vector_cse_mode (size, scalar_mode);
	      rtx vreg;
	      if (mode == vmode)
		vreg = vector_const;
	      else
		{
		  vreg = gen_reg_rtx (vmode);
		  rtx vsubreg = gen_rtx_SUBREG (vmode, vector_const, 0);
		  rtx pat = gen_rtx_SET (vreg, vsubreg);
		  rtx_insn *vinsn = emit_insn_before (pat, insn);
		  if (dump_file)
		    {
		      fprintf (dump_file, "\nInsert an extra move:\n\n");
		      print_rtl_single (dump_file, vinsn);
		      fprintf (dump_file, "\nbefore:\n\n");
		      print_rtl_single (dump_file, insn);
		      fprintf (dump_file, "\n");
		    }
		}
	      replace = gen_rtx_SUBREG (mode, vreg, 0);
	    }
	  else
	    replace = gen_rtx_SUBREG (mode, vector_const, 0);
	}

      if (dump_file)
	{
	  fprintf (dump_file, "\nReplace:\n\n");
	  print_rtl_single (dump_file, insn);
	}
      SET_SRC (set) = replace;
      /* Drop possible dead definitions. */
      PATTERN (insn) = set;
      INSN_CODE (insn) = -1;
      recog_memoized (insn);
      if (dump_file)
	{
	  fprintf (dump_file, "\nwith:\n\n");
	  print_rtl_single (dump_file, insn);
	  fprintf (dump_file, "\n");
	}
      df_insn_rescan (insn);
    }
}
3693 :
3694 : /* Return the inner scalar if OP is a broadcast, else return nullptr. */
3695 :
/* On success also set *SCALAR_MODE_P to the inner scalar mode,
   *KIND_P to the load kind, and *INSN_P to the defining insn of the
   inner scalar (nullptr if there is none to track).  */
static rtx
ix86_broadcast_inner (rtx op, machine_mode mode,
		      machine_mode *scalar_mode_p,
		      x86_cse_kind *kind_p, rtx_insn **insn_p)
{
  /* Standard SSE constants are treated as broadcasts of QImode
     0/-1.  */
  switch (standard_sse_constant_p (op, mode))
    {
    case 1:
      *scalar_mode_p = QImode;
      *kind_p = X86_CSE_CONST0_VECTOR;
      *insn_p = nullptr;
      return const0_rtx;
    case 2:
      *scalar_mode_p = QImode;
      *kind_p = X86_CSE_CONSTM1_VECTOR;
      *insn_p = nullptr;
      return constm1_rtx;
    default:
      break;
    }

  mode = GET_MODE (op);
  int nunits = GET_MODE_NUNITS (mode);
  if (nunits < 2)
    return nullptr;

  *kind_p = X86_CSE_VEC_DUP;

  rtx reg;
  if (GET_CODE (op) == VEC_DUPLICATE)
    {
      /* Only
	 (vec_duplicate:V4SI (reg:SI 99))
	 (vec_duplicate:V2DF (mem/u/c:DF (symbol_ref/u:DI ("*.LC1") [flags 0x2]) [0 S8 A64]))
	 are supported.  Set OP to the broadcast source by default. */
      op = XEXP (op, 0);
      reg = op;
      /* Look through a lowpart, non-paradoxical SUBREG of the
	 broadcast source.  */
      if (SUBREG_P (op)
	  && SUBREG_BYTE (op) == 0
	  && !paradoxical_subreg_p (op))
	reg = SUBREG_REG (op);
      if (!REG_P (reg))
	{
	  if (MEM_P (op)
	      && SYMBOL_REF_P (XEXP (op, 0))
	      && CONSTANT_POOL_ADDRESS_P (XEXP (op, 0)))
	    {
	      /* Handle constant broadcast from memory. */
	      *scalar_mode_p = GET_MODE_INNER (mode);
	      *insn_p = nullptr;
	      return op;
	    }
	  return nullptr;
	}
    }
  else if (CONST_VECTOR_P (op))
    {
      /* A CONST_VECTOR is a broadcast only if all its elements are
	 identical.  */
      rtx first = XVECEXP (op, 0, 0);
      for (int i = 1; i < nunits; ++i)
	{
	  rtx tmp = XVECEXP (op, 0, i);
	  /* Vector duplicate value. */
	  if (!rtx_equal_p (tmp, first))
	    return nullptr;
	}
      *scalar_mode_p = GET_MODE (first);
      *insn_p = nullptr;
      return first;
    }
  else
    return nullptr;

  mode = GET_MODE (op);

  /* Only single def chain is supported. */
  df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
  if (!ref
      || DF_REF_IS_ARTIFICIAL (ref)
      || DF_REF_NEXT_REG (ref) != nullptr)
    return nullptr;

  rtx_insn *insn = DF_REF_INSN (ref);
  rtx set = single_set (insn);
  if (!set)
    return nullptr;

  rtx src = SET_SRC (set);

  if (CONST_INT_P (src))
    {
      /* Handle sequences like

	 (set (subreg:SI (reg/v:SF 105 [ f ]) 0)
	      (const_int 0 [0]))
	 (set (reg:V4SF 110)
	      (vec_duplicate:V4SF (reg/v:SF 105 [ f ])))

	 and

	 (set (reg:SI 99)
	      (const_int 34 [0x22]))
	 (set (reg:V4SI 98)
	      (vec_duplicate:V4SI (reg:SI 99)))

	 Set *INSN_P to nullptr and return SET_SRC if SET_SRC is an
	 integer constant. */
      op = src;
      if (SCALAR_INT_MODE_P (mode))
	{
	  /* Renormalize the constant to the broadcast's scalar mode
	     if it differs from the defining register's mode.  */
	  if (mode != GET_MODE (reg))
	    op = gen_int_mode (INTVAL (src), mode);
	}
      else if (op == const0_rtx)
	op = CONST0_RTX (mode);
      *insn_p = nullptr;
    }
  else
    {
      /* Handle sequences like

	 (set (reg:QI 105 [ c ])
	      (reg:QI 5 di [ c ]))
	 (set (reg:V64QI 102 [ _1 ])
	      (vec_duplicate:V64QI (reg:QI 105 [ c ])))

	 (set (reg/v:SI 116 [ argc ])
	      (mem/c:SI (reg:SI 135) [2 argc+0 S4 A32]))
	 (set (reg:V4SI 119 [ _45 ])
	      (vec_duplicate:V4SI (reg/v:SI 116 [ argc ])))

	 (set (reg:SI 98 [ _1 ])
	      (sign_extend:SI (reg:QI 106 [ c ])))
	 (set (reg:V16SI 103 [ _2 ])
	      (vec_duplicate:V16SI (reg:SI 98 [ _1 ])))

	 (set (reg:SI 102 [ cost ])
	      (mem/c:SI (symbol_ref:DI ("cost") [flags 0x40])))
	 (set (reg:V4HI 103 [ _16 ])
	      (vec_duplicate:V4HI (subreg:HI (reg:SI 102 [ cost ]) 0)))

	 (set (subreg:SI (reg/v:HI 107 [ cr_val ]) 0)
	      (ashift:SI (reg:SI 158)
			 (subreg:QI (reg:SI 156 [ _2 ]) 0)))
	 (set (reg:V16HI 183 [ _61 ])
	      (vec_duplicate:V16HI (reg/v:HI 107 [ cr_val ])))

	 Set *INSN_P to INSN and return the broadcast source otherwise. */
      *insn_p = insn;
    }

  *scalar_mode_p = mode;
  return op;
}
3849 :
3850 : /* Replace CALL instruction in TLS_CALL_INSNS with SET from SRC and
3851 : put the updated instruction in UPDATED_TLS_INSNS. */
3852 :
static void
replace_tls_call (rtx src, auto_bitmap &tls_call_insns,
		  auto_bitmap &updated_tls_insns)
{
  bitmap_iterator bi;
  unsigned int id;

  EXECUTE_IF_SET_IN_BITMAP (tls_call_insns, 0, id, bi)
    {
      rtx_insn *insn = DF_INSN_UID_GET (id)->insn;

      /* If this isn't a CALL, only GNU2 TLS implicit CALL patterns are
	 allowed. */
      if (!CALL_P (insn))
	{
	  attr_tls64 tls64 = get_attr_tls64 (insn);
	  if (tls64 != TLS64_CALL && tls64 != TLS64_COMBINE)
	    gcc_unreachable ();
	}

      /* The pattern is a PARALLEL whose first element is the SET of
	 the result register.  */
      rtx pat = PATTERN (insn);
      gcc_assert (GET_CODE (pat) == PARALLEL);
      rtx set = XVECEXP (pat, 0, 0);
      gcc_assert (GET_CODE (set) == SET);
      rtx dest = SET_DEST (set);

      /* Set the destination directly from SRC instead of making the
	 call.  */
      set = gen_rtx_SET (dest, src);
      rtx_insn *set_insn = emit_insn_after (set, insn);
      if (recog_memoized (set_insn) < 0)
	gcc_unreachable ();

      /* Put SET_INSN in UPDATED_TLS_INSNS. */
      bitmap_set_bit (updated_tls_insns, INSN_UID (set_insn));

      if (dump_file)
	{
	  fprintf (dump_file, "\nReplace:\n\n");
	  print_rtl_single (dump_file, insn);
	  fprintf (dump_file, "\nwith:\n\n");
	  print_rtl_single (dump_file, set_insn);
	  fprintf (dump_file, "\n");
	}

      /* Delete the CALL insn. */
      delete_insn (insn);

      df_insn_rescan (set_insn);
    }
}
3902 :
3903 : /* Return the basic block which dominates all basic blocks which set
3904 : hard register REGNO used in basic block BB. */
3905 :
static basic_block
ix86_get_dominator_for_reg (unsigned int regno, basic_block bb)
{
  basic_block set_bb;
  auto_bitmap set_bbs;

  /* Get all BBs which set REGNO and dominate the current BB from all
     DEFs of REGNO. */
  for (df_ref def = DF_REG_DEF_CHAIN (regno);
       def;
       def = DF_REF_NEXT_REG (def))
    /* Ignore artificial defs and clobber-only defs; only real sets
       of REGNO matter here.  */
    if (!DF_REF_IS_ARTIFICIAL (def)
	&& !DF_REF_FLAGS_IS_SET (def, DF_REF_MAY_CLOBBER)
	&& !DF_REF_FLAGS_IS_SET (def, DF_REF_MUST_CLOBBER))
      {
	set_bb = DF_REF_BB (def);
	if (dominated_by_p (CDI_DOMINATORS, bb, set_bb))
	  bitmap_set_bit (set_bbs, set_bb->index);
      }

  /* The nearest common dominator of the collected blocks dominates
     every dominating def of REGNO.  */
  bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
  return bb;
}
3929 :
3930 : /* Mark FLAGS register as live in DATA, a bitmap of live caller-saved
3931 : registers, if DEST is FLAGS register. */
3932 :
3933 : static void
3934 381 : ix86_check_flags_reg (rtx dest, const_rtx x, void *data)
3935 : {
3936 381 : if (GET_CODE (x) == CLOBBER)
3937 : return;
3938 :
3939 374 : auto_bitmap *live_caller_saved_regs = (auto_bitmap *) data;
3940 374 : if (REG_P (dest) && REGNO (dest) == FLAGS_REG)
3941 0 : bitmap_set_bit (*live_caller_saved_regs, FLAGS_REG);
3942 : }
3943 :
/* Emit a TLS_SET instruction of KIND in basic block BB.  Store the
   insertion point in *BEFORE_P for emit_insn_before or in *AFTER_P
   for emit_insn_after.  UPDATED_GNU_TLS_INSNS contains instructions
   which replace the GNU TLS instructions.  UPDATED_GNU2_TLS_INSNS
   contains instructions which replace the GNU2 TLS instructions.
   Return the emitted TLS call instruction.  The loop below may move
   BB up the dominator tree and retry until a safe insertion point is
   found, so exactly one of *BEFORE_P/*AFTER_P is set on return.  */

static rtx_insn *
ix86_emit_tls_call (rtx tls_set, x86_cse_kind kind, basic_block bb,
		    rtx_insn **before_p, rtx_insn **after_p,
		    auto_bitmap &updated_gnu_tls_insns,
		    auto_bitmap &updated_gnu2_tls_insns)
{
  rtx_insn *tls_insn;

  do
    {
      /* Find the first real (non-debug) insn in BB; INSN becomes NULL
	 if the block contains no such insn.  */
      rtx_insn *insn = BB_HEAD (bb);
      while (insn && !NONDEBUG_INSN_P (insn))
	{
	  if (insn == BB_END (bb))
	    {
	      /* This must be the beginning basic block:

		 (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
		 (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)

		 or a basic block with only a label:

		 (code_label 78 11 77 3 14 (nil) [1 uses])
		 (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)

		 or a basic block with only a debug marker:

		 (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
		 (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
		 (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))

		 or a basic block with only deleted instructions:

		 (code_label 348 23 349 45 3 (nil) [0 uses])
		 (note 349 348 436 45 [bb 45] NOTE_INSN_BASIC_BLOCK)
		 (note 436 349 362 45 NOTE_INSN_DELETED)

	       */
	      gcc_assert (DEBUG_INSN_P (insn)
			  || (NOTE_P (insn)
			      && ((NOTE_KIND (insn)
				   == NOTE_INSN_FUNCTION_BEG)
				  || (NOTE_KIND (insn)
				      == NOTE_INSN_DELETED)
				  || (NOTE_KIND (insn)
				      == NOTE_INSN_BASIC_BLOCK))));
	      insn = NULL;
	      break;
	    }
	  insn = NEXT_INSN (insn);
	}

      /* TLS_GD and TLS_LD_BASE instructions are normal functions which
	 clobber caller-saved registers.  TLSDESC instructions only
	 clobber FLAGS.  If any registers clobbered by TLS instructions
	 are live in this basic block, we must insert TLS instructions
	 after all live registers clobbered are dead.  */

      auto_bitmap live_caller_saved_regs;
      /* Use the DF_LIVE problem when available, else fall back to
	 plain live registers (DF_LR).  */
      bitmap in = df_live ? DF_LIVE_IN (bb) : DF_LR_IN (bb);

      if (bitmap_bit_p (in, FLAGS_REG))
	bitmap_set_bit (live_caller_saved_regs, FLAGS_REG);

      unsigned int i;

      /* Get all live caller-saved registers for TLS_GD and TLS_LD_BASE
	 instructions.  */
      if (kind != X86_CSE_TLSDESC)
	for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
	  if (call_used_regs[i]
	      && !fixed_regs[i]
	      && bitmap_bit_p (in, i))
	    bitmap_set_bit (live_caller_saved_regs, i);

      /* Nothing clobbered by the TLS call is live at BB entry: the
	 call can be placed at the top of the block.  */
      if (bitmap_empty_p (live_caller_saved_regs))
	{
	  if (insn == BB_HEAD (bb))
	    {
	      *before_p = insn;
	      tls_insn = emit_insn_before (tls_set, insn);
	    }
	  else
	    {
	      /* Emit the TLS call after NOTE_INSN_FUNCTION_BEG in the
		 beginning basic block:

		 (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
		 (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)

		 or after NOTE_INSN_BASIC_BLOCK in a basic block with
		 only a label:

		 (code_label 78 11 77 3 14 (nil) [1 uses])
		 (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)

		 or after debug marker in a basic block with only a
		 debug marker:

		 (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
		 (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
		 (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))

	       */
	      insn = insn ? PREV_INSN (insn) : BB_END (bb);
	      *after_p = insn;
	      tls_insn = emit_insn_after (tls_set, insn);
	    }
	  return tls_insn;
	}

      bool repeat = false;

      /* Search for REG_DEAD notes in this basic block.  */
      FOR_BB_INSNS (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  /* NB: Conditional jump is the only instruction which reads
	     flags register and changes control flow.  We can never
	     place the TLS call after unconditional jump.  */
	  if (JUMP_P (insn))
	    {
	      /* This must be a conditional jump.  */
	      rtx label = JUMP_LABEL (insn);
	      if (label == nullptr
		  || ANY_RETURN_P (label)
		  || !(LABEL_P (label) || SYMBOL_REF_P (label)))
		gcc_unreachable ();

	      /* Place the call before all FLAGS_REG setting BBs since
		 we can't place a call before nor after a conditional
		 jump.  */
	      bb = ix86_get_dominator_for_reg (FLAGS_REG, bb);

	      /* Start over again.  */
	      repeat = true;
	      break;
	    }

	  if (bitmap_bit_p (updated_gnu_tls_insns, INSN_UID (insn)))
	    {
	      /* Insert the __tls_get_addr call before INSN which
		 replaces a __tls_get_addr call.  */
	      *before_p = insn;
	      tls_insn = emit_insn_before (tls_set, insn);
	      return tls_insn;
	    }

	  if (bitmap_bit_p (updated_gnu2_tls_insns, INSN_UID (insn)))
	    {
	      /* Mark FLAGS register as dead since FLAGS register
		 would be clobbered by the GNU2 TLS instruction.  */
	      bitmap_clear_bit (live_caller_saved_regs, FLAGS_REG);
	      continue;
	    }

	  /* Check if FLAGS register is live.  */
	  note_stores (insn, ix86_check_flags_reg,
		       &live_caller_saved_regs);

	  /* NOTE(review): REGNO is applied to the note operand before
	     the trailing REG_P test below; this relies on REG_DEAD /
	     REG_UNUSED notes always carrying a REG operand — TODO
	     confirm against rtl.def note documentation.  */
	  rtx link;
	  for (link = REG_NOTES (insn); link; link = XEXP (link, 1))
	    if ((REG_NOTE_KIND (link) == REG_DEAD
		 || (REG_NOTE_KIND (link) == REG_UNUSED
		     && REGNO (XEXP (link, 0)) == FLAGS_REG))
		&& REG_P (XEXP (link, 0)))
	      {
		/* Mark the live caller-saved register as dead.  */
		for (i = REGNO (XEXP (link, 0));
		     i < END_REGNO (XEXP (link, 0));
		     i++)
		  if (i < FIRST_PSEUDO_REGISTER)
		    bitmap_clear_bit (live_caller_saved_regs, i);

		/* Once every clobbered live register has died, the
		   TLS call can be inserted right after INSN.  */
		if (bitmap_empty_p (live_caller_saved_regs))
		  {
		    *after_p = insn;
		    tls_insn = emit_insn_after (tls_set, insn);
		    return tls_insn;
		  }
	      }
	}

      /* NB: Start over again for conditional jump.  */
      if (repeat)
	continue;

      gcc_assert (!bitmap_empty_p (live_caller_saved_regs));

      /* If any live caller-saved registers aren't dead at the end of
	 this basic block, get the basic block which dominates all
	 basic blocks which set the remaining live registers.  */
      auto_bitmap set_bbs;
      bitmap_iterator bi;
      unsigned int id;
      EXECUTE_IF_SET_IN_BITMAP (live_caller_saved_regs, 0, id, bi)
	{
	  basic_block set_bb = ix86_get_dominator_for_reg (id, bb);
	  bitmap_set_bit (set_bbs, set_bb->index);
	}
      bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
    }
  while (true);
}
4156 :
/* Generate a TLS call of KIND with VAL and copy the call result to DEST,
   at entry of the nearest dominator for basic block map BBS, which is in
   the fake loop that contains the whole function, so that there is only
   a single TLS CALL of KIND with VAL in the whole function.
   UPDATED_GNU_TLS_INSNS contains instructions which replace the GNU TLS
   instructions.  UPDATED_GNU2_TLS_INSNS contains instructions which
   replace the GNU2 TLS instructions.  If TLSDESC_SET isn't nullptr,
   insert it before the TLS call.  */

static void
ix86_place_single_tls_call (rtx dest, rtx val, x86_cse_kind kind,
			    auto_bitmap &bbs,
			    auto_bitmap &updated_gnu_tls_insns,
			    auto_bitmap &updated_gnu2_tls_insns,
			    rtx tlsdesc_set = nullptr)
{
  /* Hoist the insertion block out of any real loop: walk up the
     dominator tree until the containing loop is the fake whole-function
     loop (whose latch is the exit block).  */
  basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
  while (bb->loop_father->latch
	 != EXIT_BLOCK_PTR_FOR_FN (cfun))
    bb = get_immediate_dominator (CDI_DOMINATORS,
				  bb->loop_father->header);

  rtx rax = nullptr, rdi;
  rtx eqv = nullptr;
  rtx caddr;
  rtx set;
  rtx clob;
  rtx symbol;
  rtx tls;

  /* Build the TLS pattern to emit; for the GNU model kinds also pick
     the REG_EQUAL value (EQV) attached to the result copy below.  */
  switch (kind)
    {
    case X86_CSE_TLS_GD:
      rax = gen_rtx_REG (Pmode, AX_REG);
      rdi = gen_rtx_REG (Pmode, DI_REG);
      caddr = ix86_tls_get_addr ();

      symbol = XVECEXP (val, 0, 0);
      tls = gen_tls_global_dynamic_64 (Pmode, rax, symbol, caddr, rdi);

      if (GET_MODE (symbol) != Pmode)
	symbol = gen_rtx_ZERO_EXTEND (Pmode, symbol);
      eqv = symbol;
      break;

    case X86_CSE_TLS_LD_BASE:
      rax = gen_rtx_REG (Pmode, AX_REG);
      rdi = gen_rtx_REG (Pmode, DI_REG);
      caddr = ix86_tls_get_addr ();

      tls = gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi);

      /* Attach a unique REG_EQUAL to DEST, to allow the RTL optimizers
	 to share the LD_BASE result with other LD model accesses.  */
      eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
			    UNSPEC_TLS_LD_BASE);

      break;

    case X86_CSE_TLSDESC:
      /* TLSDESC sets DEST directly and only clobbers FLAGS; no result
	 copy is emitted for this kind.  */
      set = gen_rtx_SET (dest, val);
      clob = gen_rtx_CLOBBER (VOIDmode,
			      gen_rtx_REG (CCmode, FLAGS_REG));
      tls = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clob));
      break;

    default:
      gcc_unreachable ();
    }

  /* Emit the TLS CALL insn.  */
  rtx_insn *before = nullptr;
  rtx_insn *after = nullptr;
  rtx_insn *tls_insn = ix86_emit_tls_call (tls, kind, bb, &before,
					   &after,
					   updated_gnu_tls_insns,
					   updated_gnu2_tls_insns);

  /* Optionally emit a fresh copy of TLSDESC_SET just before the TLS
     call (the RTXes are copied so the original insn is unaffected).  */
  rtx_insn *tlsdesc_insn = nullptr;
  if (tlsdesc_set)
    {
      rtx dest = copy_rtx (SET_DEST (tlsdesc_set));
      rtx src = copy_rtx (SET_SRC (tlsdesc_set));
      tlsdesc_set = gen_rtx_SET (dest, src);
      tlsdesc_insn = emit_insn_before (tlsdesc_set, tls_insn);
    }

  if (kind != X86_CSE_TLSDESC)
    {
      /* __tls_get_addr is a const call: same result for same args.  */
      RTL_CONST_CALL_P (tls_insn) = 1;

      /* Indicate that this function can't jump to non-local gotos.  */
      make_reg_eh_region_note_nothrow_nononlocal (tls_insn);
    }

  /* The generated pattern must match an existing insn pattern.  */
  if (recog_memoized (tls_insn) < 0)
    gcc_unreachable ();

  if (dump_file)
    {
      if (after)
	{
	  fprintf (dump_file, "\nPlace:\n\n");
	  if (tlsdesc_insn)
	    print_rtl_single (dump_file, tlsdesc_insn);
	  print_rtl_single (dump_file, tls_insn);
	  fprintf (dump_file, "\nafter:\n\n");
	  print_rtl_single (dump_file, after);
	  fprintf (dump_file, "\n");
	}
      else
	{
	  fprintf (dump_file, "\nPlace:\n\n");
	  if (tlsdesc_insn)
	    print_rtl_single (dump_file, tlsdesc_insn);
	  print_rtl_single (dump_file, tls_insn);
	  fprintf (dump_file, "\nbefore:\n\n");
	  print_rtl_single (dump_file, before);
	  fprintf (dump_file, "\n");
	}
    }

  if (kind != X86_CSE_TLSDESC)
    {
      /* Copy RAX to DEST.  */
      set = gen_rtx_SET (dest, rax);
      rtx_insn *set_insn = emit_insn_after (set, tls_insn);
      set_dst_reg_note (set_insn, REG_EQUAL, copy_rtx (eqv), dest);
      if (dump_file)
	{
	  fprintf (dump_file, "\nPlace:\n\n");
	  print_rtl_single (dump_file, set_insn);
	  fprintf (dump_file, "\nafter:\n\n");
	  print_rtl_single (dump_file, tls_insn);
	  fprintf (dump_file, "\n");
	}
    }
}
4295 :
4296 : namespace {
4297 :
/* Pass descriptor for the x86_cse RTL pass: timed under TV_MACH_DEP,
   with no required/provided properties and no TODO flags.  */
const pass_data pass_data_x86_cse =
{
  RTL_PASS, /* type */
  "x86_cse", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
4310 :
4311 : class pass_x86_cse : public rtl_opt_pass
4312 : {
4313 : public:
4314 287872 : pass_x86_cse (gcc::context *ctxt)
4315 575744 : : rtl_opt_pass (pass_data_x86_cse, ctxt)
4316 : {}
4317 :
4318 : /* opt_pass methods: */
4319 1480955 : bool gate (function *fun) final override
4320 : {
4321 1480955 : return (TARGET_SSE2
4322 1476734 : && optimize
4323 2518835 : && optimize_function_for_speed_p (fun));
4324 : }
4325 :
4326 973304 : unsigned int execute (function *) final override
4327 : {
4328 973304 : return x86_cse ();
4329 : }
4330 :
4331 : private:
4332 : /* The redundant source value. */
4333 : rtx val;
4334 : /* The actual redundant source value for UNSPEC_TLSDESC. */
4335 : rtx tlsdesc_val;
4336 : /* The instruction which defines the redundant value. */
4337 : rtx_insn *def_insn;
4338 : /* Mode of the destination of the candidate redundant instruction. */
4339 : machine_mode mode;
4340 : /* Mode of the source of the candidate redundant instruction. */
4341 : machine_mode scalar_mode;
4342 : /* The classification of the candidate redundant instruction. */
4343 : x86_cse_kind kind;
4344 :
4345 : unsigned int x86_cse (void);
4346 : bool candidate_gnu_tls_p (rtx_insn *, attr_tls64);
4347 : bool candidate_gnu2_tls_p (rtx, attr_tls64);
4348 : bool candidate_vector_p (rtx);
4349 : rtx_insn *tls_set_insn_from_symbol (const_rtx, const_rtx);
4350 : }; // class pass_x86_cse
4351 :
4352 : /* Return the instruction which sets REG from TLS_SYMBOL. */
4353 :
4354 : rtx_insn *
4355 38 : pass_x86_cse::tls_set_insn_from_symbol (const_rtx reg,
4356 : const_rtx tls_symbol)
4357 : {
4358 38 : rtx_insn *set_insn = nullptr;
4359 38 : for (df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
4360 103 : ref;
4361 65 : ref = DF_REF_NEXT_REG (ref))
4362 : {
4363 65 : if (DF_REF_IS_ARTIFICIAL (ref))
4364 : return nullptr;
4365 :
4366 65 : set_insn = DF_REF_INSN (ref);
4367 65 : if (get_attr_tls64 (set_insn) != TLS64_LEA)
4368 : return nullptr;
4369 :
4370 65 : rtx tls_set = PATTERN (set_insn);
4371 65 : rtx tls_src = XVECEXP (SET_SRC (tls_set), 0, 0);
4372 65 : if (!rtx_equal_p (tls_symbol, tls_src))
4373 : return nullptr;
4374 : }
4375 :
4376 : return set_insn;
4377 : }
4378 :
4379 : /* Return true and output def_insn, val, mode, scalar_mode and kind if
4380 : INSN is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE. */
4381 :
4382 : bool
4383 2185 : pass_x86_cse::candidate_gnu_tls_p (rtx_insn *insn, attr_tls64 tls64)
4384 : {
4385 2185 : if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
4386 : return false;
4387 :
4388 : /* Record the redundant TLS CALLs for 64-bit:
4389 :
4390 : (parallel [
4391 : (set (reg:DI 0 ax)
4392 : (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
4393 : (const_int 0 [0])))
4394 : (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
4395 : (reg/f:DI 7 sp)] UNSPEC_TLS_GD)
4396 : (clobber (reg:DI 5 di))])
4397 :
4398 :
4399 : and
4400 :
4401 : (parallel [
4402 : (set (reg:DI 0 ax)
4403 : (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
4404 : (const_int 0 [0])))
4405 : (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)])
4406 :
4407 : */
4408 :
4409 2022 : rtx pat = PATTERN (insn);
4410 2022 : rtx set = XVECEXP (pat, 0, 0);
4411 2022 : gcc_assert (GET_CODE (set) == SET);
4412 2022 : rtx dest = SET_DEST (set);
4413 2022 : scalar_mode = mode = GET_MODE (dest);
4414 2022 : val = XVECEXP (pat, 0, 1);
4415 2022 : gcc_assert (GET_CODE (val) == UNSPEC);
4416 :
4417 2022 : if (tls64 == TLS64_GD)
4418 1921 : kind = X86_CSE_TLS_GD;
4419 : else
4420 101 : kind = X86_CSE_TLS_LD_BASE;
4421 :
4422 2022 : def_insn = nullptr;
4423 2022 : return true;
4424 : }
4425 :
/* Return true and output def_insn, val, mode, scalar_mode and kind if
   SET is UNSPEC_TLSDESC.  On success kind is X86_CSE_TLSDESC,
   tlsdesc_val holds the original source and val holds the expression
   used to compare two candidates for equality.  */

bool
pass_x86_cse::candidate_gnu2_tls_p (rtx set, attr_tls64 tls64)
{
  if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
    return false;

  rtx tls_symbol;
  rtx_insn *set_insn;
  rtx src = SET_SRC (set);
  val = src;
  tlsdesc_val = src;
  kind = X86_CSE_TLSDESC;

  if (tls64 == TLS64_COMBINE)
    {
      /* Record 64-bit TLS64_COMBINE:

	 (set (reg/f:DI 104)
	      (plus:DI (unspec:DI [
			 (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
			 (reg:DI 114)
			 (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
		       (const:DI (unspec:DI [
			 (symbol_ref:DI ("e") [flags 0x1a])
			 ] UNSPEC_DTPOFF))))

	 (set (reg/f:DI 104)
	      (plus:DI (unspec:DI [
			 (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
			 (unspec:DI [
			   (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
			 ] UNSPEC_TLSDESC)
			 (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
		       (const:DI (unspec:DI [
			 (symbol_ref:DI ("e") [flags 0x1a])
			 ] UNSPEC_DTPOFF))))
       */

      scalar_mode = mode = GET_MODE (src);

      /* Since the first operand of PLUS in the source TLS_COMBINE
	 pattern is unused, use the second operand of PLUS:

	 (const:DI (unspec:DI [
	   (symbol_ref:DI ("e") [flags 0x1a])
	   ] UNSPEC_DTPOFF))

	 as VAL to check if 2 TLS_COMBINE patterns have the same
	 source.  */
      val = XEXP (src, 1);
      gcc_assert (GET_CODE (val) == CONST
		  && GET_CODE (XEXP (val, 0)) == UNSPEC
		  && XINT (XEXP (val, 0), 1) == UNSPEC_DTPOFF
		  && SYMBOL_REF_P (XVECEXP (XEXP (val, 0), 0, 0)));
      def_insn = nullptr;
      return true;
    }

  /* Record 64-bit TLS_CALL:

     (set (reg:DI 101)
	  (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
		      (reg:DI 112)
		      (reg/f:DI 7 sp)] UNSPEC_TLSDESC))

   */

  gcc_assert (GET_CODE (src) == UNSPEC);
  tls_symbol = XVECEXP (src, 0, 0);
  src = XVECEXP (src, 0, 1);
  scalar_mode = mode = GET_MODE (src);
  gcc_assert (REG_P (src));

  /* All definitions of reg:DI 129 in

     (set (reg:DI 110)
	  (unspec:DI [(symbol_ref:DI ("foo"))
		      (reg:DI 129)
		      (reg/f:DI 7 sp)] UNSPEC_TLSDESC))

     should have the same source as in

     (set (reg:DI 129)
	  (unspec:DI [(symbol_ref:DI ("foo"))] UNSPEC_TLSDESC))

   */

  set_insn = tls_set_insn_from_symbol (src, tls_symbol);
  if (!set_insn)
    return false;

  /* Use TLS_SYMBOL as VAL to check if 2 patterns have the same source.  */
  val = tls_symbol;
  def_insn = set_insn;
  return true;
}
4525 :
4526 : /* Return true and output def_insn, val, mode, scalar_mode and kind if
4527 : INSN is a vector broadcast instruction. */
4528 :
4529 : bool
4530 50151996 : pass_x86_cse::candidate_vector_p (rtx set)
4531 : {
4532 50151996 : rtx src = SET_SRC (set);
4533 50151996 : rtx dest = SET_DEST (set);
4534 50151996 : mode = GET_MODE (dest);
4535 : /* Skip non-vector instruction. */
4536 50151996 : if (!VECTOR_MODE_P (mode))
4537 : return false;
4538 :
4539 : /* Skip non-vector load instruction. */
4540 3707448 : if (!REG_P (dest) && !SUBREG_P (dest))
4541 : return false;
4542 :
4543 2200031 : val = ix86_broadcast_inner (src, mode, &scalar_mode, &kind,
4544 : &def_insn);
4545 2200031 : return val ? true : false;
4546 : }
4547 :
/* At entry of the nearest common dominator for basic blocks with

   1. Vector CONST0_RTX patterns.
   2. Vector CONSTM1_RTX patterns.
   3. Vector broadcast patterns.
   4. UNSPEC_TLS_GD patterns.
   5. UNSPEC_TLS_LD_BASE patterns.
   6. UNSPEC_TLSDESC patterns.

   generate a single pattern whose destination is used to replace the
   source in all identical patterns.

   NB: We want to generate a pattern, which is executed only once, to
   cover the whole function.  The LCM algorithm isn't appropriate here
   since it may place a pattern inside the loop.  */

unsigned int
pass_x86_cse::x86_cse (void)
{
  timevar_push (TV_MACH_DEP);

  auto_vec<redundant_pattern *> loads;
  redundant_pattern *load;
  basic_block bb;
  rtx_insn *insn;
  unsigned int i;
  auto_bitmap updated_gnu_tls_insns;
  auto_bitmap updated_gnu2_tls_insns;

  df_set_flags (DF_DEFER_INSN_RESCAN);

  bool recursive_call_p = cfun->machine->recursive_function;

  /* Phase 1: scan every insn and group identical candidates into
     LOADS, recording their insns, basic blocks and counts.  */
  FOR_EACH_BB_FN (bb, cfun)
    {
      FOR_BB_INSNS (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  bool matched = false;
	  /* Remove redundant patterns if there are more than 2 of
	     them.  */
	  unsigned int threshold = 2;

	  rtx set = single_set (insn);
	  if (!set && !CALL_P (insn))
	    continue;

	  tlsdesc_val = nullptr;

	  /* Classify the insn; each candidate_* helper fills in the
	     val/mode/scalar_mode/kind/def_insn members on success.  */
	  attr_tls64 tls64 = get_attr_tls64 (insn);
	  switch (tls64)
	    {
	    case TLS64_GD:
	    case TLS64_LD_BASE:
	      /* Verify UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE.  */
	      if (candidate_gnu_tls_p (insn, tls64))
		break;
	      continue;

	    case TLS64_CALL:
	    case TLS64_COMBINE:
	      /* Verify UNSPEC_TLSDESC.  */
	      if (candidate_gnu2_tls_p (set, tls64))
		break;
	      continue;

	    case TLS64_LEA:
	      /* Skip TLS64_LEA.  */
	      continue;

	    case TLS64_NONE:
	      if (!set)
		continue;

	      /* Check for vector broadcast.  */
	      if (candidate_vector_p (set))
		break;
	      continue;
	    }

	  /* Check if there is a matching redundant load.  */
	  FOR_EACH_VEC_ELT (loads, i, load)
	    if (load->val
		&& load->kind == kind
		&& load->mode == scalar_mode
		&& (load->bb == bb
		    || kind != X86_CSE_VEC_DUP
		    /* Non all 0s/1s vector load must be in the same
		       basic block if it is in a recursive call.  */
		    || !recursive_call_p)
		&& rtx_equal_p (load->val, val))
	      {
		/* Record instruction.  */
		bitmap_set_bit (load->insns, INSN_UID (insn));

		/* Record the maximum vector size.  */
		if (kind <= X86_CSE_VEC_DUP
		    && load->size < GET_MODE_SIZE (mode))
		  load->size = GET_MODE_SIZE (mode);

		/* Record the basic block.  */
		bitmap_set_bit (load->bbs, bb->index);

		/* Increment the count.  */
		load->count++;

		matched = true;
		break;
	      }

	  if (matched)
	    continue;

	  /* We see this instruction the first time.  Record the
	     redundant source value, its mode, the destination size,
	     instruction which defines the redundant source value,
	     instruction basic block and the instruction kind.  */
	  load = new redundant_pattern;

	  load->val = copy_rtx (val);
	  if (tlsdesc_val)
	    load->tlsdesc_val = copy_rtx (tlsdesc_val);
	  else
	    load->tlsdesc_val = nullptr;
	  load->mode = scalar_mode;
	  load->size = GET_MODE_SIZE (mode);
	  load->def_insn = def_insn;
	  load->count = 1;
	  load->threshold = threshold;
	  load->bb = BLOCK_FOR_INSN (insn);
	  load->kind = kind;

	  bitmap_set_bit (load->insns, INSN_UID (insn));
	  bitmap_set_bit (load->bbs, bb->index);

	  loads.safe_push (load);
	}
    }

  /* Phase 2: for each group seen often enough, allocate the shared
     destination register and rewrite the recorded insns to use it.  */
  bool replaced = false;
  FOR_EACH_VEC_ELT (loads, i, load)
    if (load->count >= load->threshold)
      {
	machine_mode mode;
	rtx reg, broadcast_source, broadcast_reg;
	replaced = true;
	switch (load->kind)
	  {
	  case X86_CSE_TLS_GD:
	  case X86_CSE_TLS_LD_BASE:
	  case X86_CSE_TLSDESC:
	    broadcast_reg = gen_reg_rtx (load->mode);
	    replace_tls_call (broadcast_reg, load->insns,
			      (load->kind == X86_CSE_TLSDESC
			       ? updated_gnu2_tls_insns
			       : updated_gnu_tls_insns));
	    load->broadcast_reg = broadcast_reg;
	    break;

	  case X86_CSE_CONST0_VECTOR:
	  case X86_CSE_CONSTM1_VECTOR:
	  case X86_CSE_VEC_DUP:
	    mode = ix86_get_vector_cse_mode (load->size, load->mode);
	    broadcast_reg = gen_reg_rtx (mode);
	    if (load->def_insn)
	      {
		/* Replace redundant vector loads with a single vector
		   load in the same basic block.  */
		reg = load->val;
		if (load->mode != GET_MODE (reg))
		  reg = gen_rtx_SUBREG (load->mode, reg, 0);
		broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
	      }
	    else
	      /* This is a constant integer/double vector.  If the
		 inner scalar is 0 or -1, set vector to CONST0_RTX
		 or CONSTM1_RTX directly.  */
	      switch (load->kind)
		{
		case X86_CSE_CONST0_VECTOR:
		  broadcast_source = CONST0_RTX (mode);
		  break;
		case X86_CSE_CONSTM1_VECTOR:
		  broadcast_source = CONSTM1_RTX (mode);
		  break;
		case X86_CSE_VEC_DUP:
		  reg = gen_reg_rtx (load->mode);
		  broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
		  break;
		default:
		  gcc_unreachable ();
		}
	    replace_vector_const (mode, broadcast_reg, load->insns,
				  load->mode);
	    load->broadcast_source = broadcast_source;
	    load->broadcast_reg = broadcast_reg;
	    break;
	  }
      }

  /* Phase 3: place the single defining instruction for each group at
     the nearest common dominator of its uses.  */
  if (replaced)
    {
      auto_vec<rtx_insn *> control_flow_insns;

      /* (Re-)discover loops so that bb->loop_father can be used in the
	 analysis below.  */
      calculate_dominance_info (CDI_DOMINATORS);
      loop_optimizer_init (AVOID_CFG_MODIFICATIONS);

      FOR_EACH_VEC_ELT (loads, i, load)
	if (load->count >= load->threshold)
	  {
	    rtx set;
	    if (load->def_insn)
	      switch (load->kind)
		{
		case X86_CSE_TLSDESC:
		  ix86_place_single_tls_call (load->broadcast_reg,
					      load->tlsdesc_val,
					      load->kind,
					      load->bbs,
					      updated_gnu_tls_insns,
					      updated_gnu2_tls_insns,
					      PATTERN (load->def_insn));
		  break;
		case X86_CSE_VEC_DUP:
		  /* Insert a broadcast after the original scalar
		     definition.  */
		  set = gen_rtx_SET (load->broadcast_reg,
				     load->broadcast_source);
		  insn = emit_insn_after (set, load->def_insn);

		  if (cfun->can_throw_non_call_exceptions)
		    {
		      /* Handle REG_EH_REGION note in DEF_INSN.  */
		      rtx note = find_reg_note (load->def_insn,
						REG_EH_REGION, nullptr);
		      if (note)
			{
			  control_flow_insns.safe_push (load->def_insn);
			  add_reg_note (insn, REG_EH_REGION,
					XEXP (note, 0));
			}
		    }

		  if (dump_file)
		    {
		      fprintf (dump_file, "\nAdd:\n\n");
		      print_rtl_single (dump_file, insn);
		      fprintf (dump_file, "\nafter:\n\n");
		      print_rtl_single (dump_file, load->def_insn);
		      fprintf (dump_file, "\n");
		    }
		  break;
		default:
		  gcc_unreachable ();
		}
	    else
	      switch (load->kind)
		{
		case X86_CSE_TLS_GD:
		case X86_CSE_TLS_LD_BASE:
		case X86_CSE_TLSDESC:
		  ix86_place_single_tls_call (load->broadcast_reg,
					      (load->kind == X86_CSE_TLSDESC
					       ? load->tlsdesc_val
					       : load->val),
					      load->kind,
					      load->bbs,
					      updated_gnu_tls_insns,
					      updated_gnu2_tls_insns);
		  break;
		case X86_CSE_CONST0_VECTOR:
		case X86_CSE_CONSTM1_VECTOR:
		case X86_CSE_VEC_DUP:
		  ix86_place_single_vector_set (load->broadcast_reg,
						load->broadcast_source,
						load->bbs,
						load);
		  break;
		}
	  }

      loop_optimizer_finalize ();

      if (!control_flow_insns.is_empty ())
	{
	  free_dominance_info (CDI_DOMINATORS);

	  FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
	    if (control_flow_insn_p (insn))
	      {
		/* Split the block after insn.  There will be a fallthru
		   edge, which is OK so we keep it.  We have to create
		   the exception edges ourselves.  */
		bb = BLOCK_FOR_INSN (insn);
		split_block (bb, insn);
		rtl_make_eh_edge (NULL, bb, BB_END (bb));
	      }
	}

      df_process_deferred_rescans ();
    }

  FOR_EACH_VEC_ELT (loads, i, load)
    delete load;

  df_clear_flags (DF_DEFER_INSN_RESCAN);

  timevar_pop (TV_MACH_DEP);
  return 0;
}
4862 :
4863 : } // anon namespace
4864 :
/* Entry point used by the pass manager to create an instance of the
   x86_cse pass.  */

rtl_opt_pass *
make_pass_x86_cse (gcc::context *ctxt)
{
  return new pass_x86_cse (ctxt);
}
4870 :
/* Convert legacy instructions that clobbers EFLAGS to APX_NF
   instructions when there are no flag set between a flag
   producer and user.  Returns 0 (no extra TODO flags).  */

static unsigned int
ix86_apx_nf_convert (void)
{
  timevar_push (TV_MACH_DEP);

  basic_block bb;
  rtx_insn *insn;
  /* Maps each candidate insn to its original PARALLEL pattern.  */
  hash_map <rtx_insn *, rtx> converting_map;
  /* Candidates collected since the last cstore, so they can all be
     revoked if a non-convertible flag clobber is found.  */
  auto_vec <rtx_insn *> current_convert_list;

  bool converting_seq = false;
  rtx cc = gen_rtx_REG (CCmode, FLAGS_REG);

  /* Pass 1: walk each basic block collecting convertible insns.  */
  FOR_EACH_BB_FN (bb, cfun)
    {
      /* Reset conversion for each bb.  */
      converting_seq = false;
      FOR_BB_INSNS (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  if (recog_memoized (insn) < 0)
	    continue;

	  /* Convert candidate insns after cstore, which should
	     satisfy the two conditions:
	     1. Is not flag user or producer, only clobbers
		FLAGS_REG.
	     2. Have corresponding nf pattern.  */

	  rtx pat = PATTERN (insn);

	  /* Starting conversion at first cstorecc.  */
	  rtx set = NULL_RTX;
	  if (!converting_seq
	      && (set = single_set (insn))
	      && ix86_comparison_operator (SET_SRC (set), VOIDmode)
	      && reg_overlap_mentioned_p (cc, SET_SRC (set))
	      && !reg_overlap_mentioned_p (cc, SET_DEST (set)))
	    {
	      converting_seq = true;
	      current_convert_list.truncate (0);
	    }
	  /* Terminate at the next explicit flag set.  */
	  else if (reg_set_p (cc, pat)
		   && GET_CODE (set_of (cc, pat)) != CLOBBER)
	    converting_seq = false;

	  if (!converting_seq)
	    continue;

	  if (get_attr_has_nf (insn)
	      && GET_CODE (pat) == PARALLEL)
	    {
	      /* Record the insn to candidate map.  */
	      current_convert_list.safe_push (insn);
	      converting_map.put (insn, pat);
	    }
	  /* If the insn clobbers flags but has no nf_attr,
	     revoke all previous candidates.  */
	  else if (!get_attr_has_nf (insn)
		   && reg_set_p (cc, pat)
		   && GET_CODE (set_of (cc, pat)) == CLOBBER)
	    {
	      for (auto item : current_convert_list)
		converting_map.remove (item);
	      converting_seq = false;
	    }
	}
    }

  /* Pass 2: rewrite each surviving candidate by dropping the FLAGS
     clobber from its PARALLEL and re-recognizing the insn.  */
  if (!converting_map.is_empty ())
    {
      for (auto iter = converting_map.begin ();
	   iter != converting_map.end (); ++iter)
	{
	  rtx_insn *replace = (*iter).first;
	  rtx pat = (*iter).second;
	  int i, n = 0, len = XVECLEN (pat, 0);
	  rtx *new_elems = XALLOCAVEC (rtx, len);
	  rtx new_pat;
	  /* Keep every element except the FLAGS clobber.  */
	  for (i = 0; i < len; i++)
	    {
	      rtx temp = XVECEXP (pat, 0, i);
	      if (! (GET_CODE (temp) == CLOBBER
		     && reg_overlap_mentioned_p (cc,
						 XEXP (temp, 0))))
		{
		  new_elems[n] = temp;
		  n++;
		}
	    }

	  /* A single remaining element needs no PARALLEL wrapper.  */
	  if (n == 1)
	    new_pat = new_elems[0];
	  else
	    new_pat =
	      gen_rtx_PARALLEL (VOIDmode,
				gen_rtvec_v (n,
					     new_elems));

	  PATTERN (replace) = new_pat;
	  INSN_CODE (replace) = -1;
	  recog_memoized (replace);
	  df_insn_rescan (replace);
	}
    }

  timevar_pop (TV_MACH_DEP);
  return 0;
}
4987 :
4988 :
namespace {

/* Pass descriptor for the APX no-flags (NF) conversion pass.  Time is
   accounted under TV_MACH_DEP; no IL properties are required, provided
   or destroyed, and no extra TODOs are requested.  */
const pass_data pass_data_apx_nf_convert =
{
  RTL_PASS, /* type */
  "apx_nfcvt", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

/* RTL pass wrapper that runs ix86_apx_nf_convert, which rewrites
   eligible flag-clobbering insns into their APX NF forms.  */
class pass_apx_nf_convert : public rtl_opt_pass
{
public:
  pass_apx_nf_convert (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_apx_nf_convert, ctxt)
  {}

  /* opt_pass methods: */
  /* Run only when APX NF is enabled, we are optimizing, and the current
     function is being optimized for speed.  */
  bool gate (function *) final override
  {
    return (TARGET_APX_NF
	    && optimize
	    && optimize_function_for_speed_p (cfun));
  }

  unsigned int execute (function *) final override
  {
    return ix86_apx_nf_convert ();
  }
}; // class pass_apx_nf_convert

} // anon namespace
5026 :
5027 : rtl_opt_pass *
5028 287872 : make_pass_apx_nf_convert (gcc::context *ctxt)
5029 : {
5030 287872 : return new pass_apx_nf_convert (ctxt);
5031 : }
5032 :
/* When a hot loop can fit into one cacheline,
   force align the loop without considering the max skip.  */
static void
ix86_align_loops ()
{
  basic_block bb;

  /* Don't do this when we don't know cache line size.  */
  if (ix86_cost->prefetch_block == 0)
    return;

  loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
  /* Blocks colder than count_max / param_align_threshold are not worth
     padding for.  */
  profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
  FOR_EACH_BB_FN (bb, cfun)
    {
      rtx_insn *label = BB_HEAD (bb);
      bool has_fallthru = 0;
      edge e;
      edge_iterator ei;

      /* Only blocks starting with a label can be branch targets and
	 hence candidates for alignment.  */
      if (!LABEL_P (label))
	continue;

      /* Accumulate incoming profile weight, split between the (at most
	 one) fallthrough edge and all branch edges.  */
      profile_count fallthru_count = profile_count::zero ();
      profile_count branch_count = profile_count::zero ();

      FOR_EACH_EDGE (e, ei, bb->preds)
	{
	  if (e->flags & EDGE_FALLTHRU)
	    has_fallthru = 1, fallthru_count += e->count ();
	  else
	    branch_count += e->count ();
	}

      if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
	continue;

      /* Consider only blocks inside a real loop (latch is not the exit
	 block) that are hot enough: with a fallthrough predecessor the
	 inserted nops would be executed, so require the block to be
	 branch-dominated and speed-optimized; without one, nops are
	 skipped, so a plain hotness test suffices.  */
      if (bb->loop_father
	  && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
	  && (has_fallthru
	      ? (!(single_succ_p (bb)
		   && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
		 && optimize_bb_for_speed_p (bb)
		 && branch_count + fallthru_count > count_threshold
		 && (branch_count > fallthru_count * param_align_loop_iterations))
	      /* In case there's no fallthru for the loop.
		 Nops inserted won't be executed.  */
	      : (branch_count > count_threshold
		 || (bb->count > bb->prev_bb->count * 10
		     && (bb->prev_bb->count
			 <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
	{
	  rtx_insn* insn, *end_insn;
	  HOST_WIDE_INT size = 0;
	  bool padding_p = true;
	  basic_block tbb = bb;
	  unsigned cond_branch_num = 0;
	  bool detect_tight_loop_p = false;

	  /* Walk the following blocks of the same loop in layout order,
	     summing their encoded size, until we either come back to BB
	     (a tight loop) or find a reason not to pad.  */
	  for (unsigned int i = 0; i != bb->loop_father->num_nodes;
	       i++, tbb = tbb->next_bb)
	    {
	      /* Only handle continuous cfg layout. */
	      if (bb->loop_father != tbb->loop_father)
		{
		  padding_p = false;
		  break;
		}

	      FOR_BB_INSNS (tbb, insn)
		{
		  if (!NONDEBUG_INSN_P (insn))
		    continue;
		  size += ix86_min_insn_size (insn);

		  /* We don't know size of inline asm.
		     Don't align loop for call.  */
		  if (asm_noperands (PATTERN (insn)) >= 0
		      || CALL_P (insn))
		    {
		      /* -1 marks the size as unknown.  */
		      size = -1;
		      break;
		    }
		}

	      /* Give up once the loop no longer fits in one cache line.  */
	      if (size == -1 || size > ix86_cost->prefetch_block)
		{
		  padding_p = false;
		  break;
		}

	      FOR_EACH_EDGE (e, ei, tbb->succs)
		{
		  /* It could be part of the loop.  */
		  if (e->dest == bb)
		    {
		      detect_tight_loop_p = true;
		      break;
		    }
		}

	      if (detect_tight_loop_p)
		break;

	      end_insn = BB_END (tbb);
	      if (JUMP_P (end_insn))
		{
		  /* For decoded icache:
		     1. Up to two branches are allowed per Way.
		     2. A non-conditional branch is the last micro-op in a Way.
		  */
		  if (onlyjump_p (end_insn)
		      && (any_uncondjump_p (end_insn)
			  || single_succ_p (tbb)))
		    {
		      padding_p = false;
		      break;
		    }
		  else if (++cond_branch_num >= 2)
		    {
		      padding_p = false;
		      break;
		    }
		}

	    }

	  if (padding_p && detect_tight_loop_p)
	    {
	      /* Align to the smallest power of two covering SIZE bytes;
		 max-skip of 0 forces the alignment unconditionally.  */
	      emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
						    GEN_INT (0)), label);
	      /* End of function.  */
	      if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
		break;
	      /* Skip bb which already fits into one cacheline.  */
	      bb = tbb;
	    }
	}
    }

  loop_optimizer_finalize ();
  free_dominance_info (CDI_DOMINATORS);
}
5176 :
namespace {

/* Pass descriptor for the tight-loop alignment pass.  Time is accounted
   under TV_MACH_DEP; no IL properties or TODO flags are involved.  */
const pass_data pass_data_align_tight_loops =
{
  RTL_PASS, /* type */
  "align_tight_loops", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};

/* RTL pass wrapper that runs ix86_align_loops to force-align hot tight
   loops that fit in one cache line.  */
class pass_align_tight_loops : public rtl_opt_pass
{
public:
  pass_align_tight_loops (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_align_tight_loops, ctxt)
  {}

  /* opt_pass methods: */
  /* Run only when the target wants tight-loop alignment, we are
     optimizing, and the function is optimized for speed.  */
  bool gate (function *) final override
  {
    return TARGET_ALIGN_TIGHT_LOOPS
	   && optimize
	   && optimize_function_for_speed_p (cfun);
  }

  /* Only effective when the assembler supports max-skip alignment
     directives; otherwise just account the (empty) work.  */
  unsigned int execute (function *) final override
  {
    timevar_push (TV_MACH_DEP);
#ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
    ix86_align_loops ();
#endif
    timevar_pop (TV_MACH_DEP);
    return 0;
  }
}; // class pass_align_tight_loops

} // anon namespace
5219 :
5220 : rtl_opt_pass *
5221 287872 : make_pass_align_tight_loops (gcc::context *ctxt)
5222 : {
5223 287872 : return new pass_align_tight_loops (ctxt);
5224 : }
5225 :
5226 : /* This compares the priority of target features in function DECL1
5227 : and DECL2. It returns positive value if DECL1 is higher priority,
5228 : negative value if DECL2 is higher priority and 0 if they are the
5229 : same. */
5230 :
5231 : int
5232 5739 : ix86_compare_version_priority (tree decl1, tree decl2)
5233 : {
5234 5739 : unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
5235 5739 : unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
5236 :
5237 5739 : return (int)priority1 - (int)priority2;
5238 : }
5239 :
5240 : /* V1 and V2 point to function versions with different priorities
5241 : based on the target ISA. This function compares their priorities. */
5242 :
5243 : static int
5244 6830 : feature_compare (const void *v1, const void *v2)
5245 : {
5246 6830 : typedef struct _function_version_info
5247 : {
5248 : tree version_decl;
5249 : tree predicate_chain;
5250 : unsigned int dispatch_priority;
5251 : } function_version_info;
5252 :
5253 6830 : const function_version_info c1 = *(const function_version_info *)v1;
5254 6830 : const function_version_info c2 = *(const function_version_info *)v2;
5255 6830 : return (c2.dispatch_priority - c1.dispatch_priority);
5256 : }
5257 :
5258 : /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
5259 : to return a pointer to VERSION_DECL if the outcome of the expression
5260 : formed by PREDICATE_CHAIN is true. This function will be called during
5261 : version dispatch to decide which function version to execute. It returns
5262 : the basic block at the end, to which more conditions can be added. */
5263 :
static basic_block
add_condition_to_bb (tree function_decl, tree version_decl,
		     tree predicate_chain, basic_block new_bb)
{
  gimple *return_stmt;
  tree convert_expr, result_var;
  gimple *convert_stmt;
  gimple *call_cond_stmt;
  gimple *if_else_stmt;

  basic_block bb1, bb2, bb3;
  edge e12, e23;

  tree cond_var, and_expr_var = NULL_TREE;
  gimple_seq gseq;

  tree predicate_decl, predicate_arg;

  push_cfun (DECL_STRUCT_FUNCTION (function_decl));

  gcc_assert (new_bb != NULL);
  gseq = bb_seq (new_bb);


  /* Build "result_var = (void *) &version_decl; return result_var;" —
     the value the resolver hands back when this version is chosen.  */
  convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
	     		 build_fold_addr_expr (version_decl));
  result_var = create_tmp_var (ptr_type_node);
  convert_stmt = gimple_build_assign (result_var, convert_expr);
  return_stmt = gimple_build_return (result_var);

  /* No predicate: this is the unconditional (default) case.  Emit the
     return straight into NEW_BB with no control flow added.  */
  if (predicate_chain == NULL_TREE)
    {
      gimple_seq_add_stmt (&gseq, convert_stmt);
      gimple_seq_add_stmt (&gseq, return_stmt);
      set_bb_seq (new_bb, gseq);
      gimple_set_bb (convert_stmt, new_bb);
      gimple_set_bb (return_stmt, new_bb);
      pop_cfun ();
      return new_bb;
    }

  /* Emit one predicate call per chain entry and fold the integer
     results together; the version is chosen only if all are nonzero.  */
  while (predicate_chain != NULL)
    {
      cond_var = create_tmp_var (integer_type_node);
      predicate_decl = TREE_PURPOSE (predicate_chain);
      predicate_arg = TREE_VALUE (predicate_chain);
      call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
      gimple_call_set_lhs (call_cond_stmt, cond_var);

      gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
      gimple_set_bb (call_cond_stmt, new_bb);
      gimple_seq_add_stmt (&gseq, call_cond_stmt);

      predicate_chain = TREE_CHAIN (predicate_chain);

      if (and_expr_var == NULL)
	and_expr_var = cond_var;
      else
	{
	  gimple *assign_stmt;
	  /* Use MIN_EXPR to check if any integer is zero?.
	     and_expr_var = min_expr <cond_var, and_expr_var>  */
	  assign_stmt = gimple_build_assign (and_expr_var,
					     build2 (MIN_EXPR, integer_type_node,
						     cond_var, and_expr_var));

	  gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
	  gimple_set_bb (assign_stmt, new_bb);
	  gimple_seq_add_stmt (&gseq, assign_stmt);
	}
    }

  /* if (and_expr_var > 0) { return this version; }  */
  if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
	  		            integer_zero_node,
				    NULL_TREE, NULL_TREE);
  gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
  gimple_set_bb (if_else_stmt, new_bb);
  gimple_seq_add_stmt (&gseq, if_else_stmt);

  gimple_seq_add_stmt (&gseq, convert_stmt);
  gimple_seq_add_stmt (&gseq, return_stmt);
  set_bb_seq (new_bb, gseq);

  /* Split NEW_BB into: bb1 ending in the condition, bb2 holding the
     return, and bb3 where the next condition will be appended.  */
  bb1 = new_bb;
  e12 = split_block (bb1, if_else_stmt);
  bb2 = e12->dest;
  e12->flags &= ~EDGE_FALLTHRU;
  e12->flags |= EDGE_TRUE_VALUE;

  e23 = split_block (bb2, return_stmt);

  gimple_set_bb (convert_stmt, bb2);
  gimple_set_bb (return_stmt, bb2);

  /* Condition false: fall through to bb3 to test the next version.  */
  bb3 = e23->dest;
  make_edge (bb1, bb3, EDGE_FALSE_VALUE);

  /* bb2 returns, so its only successor is the exit block.  */
  remove_edge (e23);
  make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);

  pop_cfun ();

  return bb3;
}
5368 :
5369 : /* This function generates the dispatch function for
5370 : multi-versioned functions. DISPATCH_DECL is the function which will
5371 : contain the dispatch logic. FNDECLS are the function choices for
5372 : dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
5373 : in DISPATCH_DECL in which the dispatch code is generated. */
5374 :
static int
dispatch_function_versions (tree dispatch_decl,
			    void *fndecls_p,
			    basic_block *empty_bb)
{
  tree default_decl;
  gimple *ifunc_cpu_init_stmt;
  gimple_seq gseq;
  int ix;
  tree ele;
  vec<tree> *fndecls;
  unsigned int num_versions = 0;
  unsigned int actual_versions = 0;
  unsigned int i;

  /* Must stay layout-compatible with the struct in feature_compare.  */
  struct _function_version_info
    {
      tree version_decl;
      tree predicate_chain;
      unsigned int dispatch_priority;
    }*function_version_info;

  gcc_assert (dispatch_decl != NULL
	      && fndecls_p != NULL
	      && empty_bb != NULL);

  /*fndecls_p is actually a vector. */
  fndecls = static_cast<vec<tree> *> (fndecls_p);

  /* At least one more version other than the default. */
  num_versions = fndecls->length ();
  gcc_assert (num_versions >= 2);

  function_version_info = (struct _function_version_info *)
    XNEWVEC (struct _function_version_info, (num_versions - 1));

  /* The first version in the vector is the default decl. */
  default_decl = (*fndecls)[0];

  push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));

  gseq = bb_seq (*empty_bb);
  /* Function version dispatch is via IFUNC.  IFUNC resolvers fire before
     constructors, so explicity call __builtin_cpu_init here.  */
  ifunc_cpu_init_stmt
    = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
  gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
  gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
  set_bb_seq (*empty_bb, gseq);

  pop_cfun ();


  /* Collect the non-default versions together with their predicate
     chains and dispatch priorities.  */
  for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
    {
      tree version_decl = ele;
      tree predicate_chain = NULL_TREE;
      unsigned int priority;
      /* Get attribute string, parse it and find the right predicate decl.
	 The predicate function could be a lengthy combination of many
	 features, like arch-type and various isa-variants.  */
      priority = get_builtin_code_for_version (version_decl,
	 			               &predicate_chain);

      /* Versions without a predicate chain cannot be selected by a
	 feature test and get no condition of their own.  */
      if (predicate_chain == NULL_TREE)
	continue;

      function_version_info [actual_versions].version_decl = version_decl;
      function_version_info [actual_versions].predicate_chain
	 = predicate_chain;
      function_version_info [actual_versions].dispatch_priority = priority;
      actual_versions++;
    }

  /* Sort the versions according to descending order of dispatch priority.  The
     priority is based on the ISA.  This is not a perfect solution.  There
     could still be ambiguity.  If more than one function version is suitable
     to execute, which one should be dispatched?  In future, allow the user
     to specify a dispatch priority next to the version.  */
  qsort (function_version_info, actual_versions,
	 sizeof (struct _function_version_info), feature_compare);

  /* Emit one "if (predicates) return &version;" block per version,
     highest priority first.  */
  for  (i = 0; i < actual_versions; ++i)
    *empty_bb = add_condition_to_bb (dispatch_decl,
				     function_version_info[i].version_decl,
				     function_version_info[i].predicate_chain,
				     *empty_bb);

  /* dispatch default version at the end. */
  *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
				   NULL, *empty_bb);

  free (function_version_info);
  return 0;
}
5470 :
5471 : /* This function changes the assembler name for functions that are
5472 : versions. If DECL is a function version and has a "target"
5473 : attribute, it appends the attribute string to its assembler name. */
5474 :
static tree
ix86_mangle_function_version_assembler_name (tree decl, tree id)
{
  tree version_attr;
  char *attr_str;

  /* gnu_inline functions may never get a body emitted, which is
     incompatible with versioning; diagnose it up front.  */
  if (DECL_DECLARED_INLINE_P (decl)
      && lookup_attribute ("gnu_inline",
			   DECL_ATTRIBUTES (decl)))
    error_at (DECL_SOURCE_LOCATION (decl),
	      "function versions cannot be marked as %<gnu_inline%>,"
	      " bodies have to be generated");

  if (DECL_VIRTUAL_P (decl)
      || DECL_VINDEX (decl))
    sorry ("virtual function multiversioning not supported");

  version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));

  /* target attribute string cannot be NULL.  */
  gcc_assert (version_attr != NULL_TREE);

  /* Canonicalize the attribute list into a sorted string so that
     equivalent attribute spellings mangle identically.  */
  attr_str = sorted_attr_string (TREE_VALUE (version_attr));

  /* Allow assembler name to be modified if already set.  */
  if (DECL_ASSEMBLER_NAME_SET_P (decl))
    SET_DECL_RTL (decl, NULL);

  /* Append the sorted attribute string as a suffix of ID.  */
  tree ret = clone_identifier (id, attr_str, true);

  XDELETEVEC (attr_str);

  return ret;
}
5509 :
tree
ix86_mangle_decl_assembler_name (tree decl, tree id)
{
  /* For function version, add the target suffix to the assembler name.  */
  if (TREE_CODE (decl) == FUNCTION_DECL)
    {
      cgraph_node *node = cgraph_node::get (decl);
      /* Mangle all versions when annotated with target_clones, but only
	 non-default versions when annotated with target attributes.  */
      /* NOTE(review): NODE is dereferenced here without a null check,
	 while the arms below guard with "node &&" — presumably a
	 versioned decl always has a cgraph node; verify.  */
      if (DECL_FUNCTION_VERSIONED (decl)
	  && (node->is_target_clone
	      || !is_function_default_version (node->decl)))
	id = ix86_mangle_function_version_assembler_name (decl, id);
      /* Mangle the dispatched symbol but only in the case of target clones.  */
      else if (node && node->dispatcher_function && !node->is_target_clone)
	id = clone_identifier (id, "ifunc");
      else if (node && node->dispatcher_resolver_function)
	id = clone_identifier (id, "resolver");
    }
#ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
  /* Give the subtarget a final chance to adjust the name.  */
  id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
#endif

  return id;
}
5535 :
5536 : /* Make a dispatcher declaration for the multi-versioned function DECL.
5537 : Calls to DECL function will be replaced with calls to the dispatcher
5538 : by the front-end. Returns the decl of the dispatcher function. */
5539 :
tree
ix86_get_function_versions_dispatcher (void *decl)
{
  tree fn = (tree) decl;
  struct cgraph_node *node = NULL;
  struct cgraph_node *default_node = NULL;
  struct cgraph_function_version_info *node_v = NULL;

  tree dispatch_decl = NULL;

  struct cgraph_function_version_info *default_version_info = NULL;

  gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));

  node = cgraph_node::get (fn);
  gcc_assert (node != NULL);

  node_v = node->function_version ();
  gcc_assert (node_v != NULL);

  /* Memoized: a dispatcher was already created for this version set.  */
  if (node_v->dispatcher_resolver != NULL)
    return node_v->dispatcher_resolver;

  /* The default node is always the beginning of the chain.  */
  default_version_info = node_v;
  while (default_version_info->prev != NULL)
    default_version_info = default_version_info->prev;
  default_node = default_version_info->this_node;

  /* If there is no default node, just return NULL.  */
  if (!is_function_default_version (default_node->decl))
    return NULL;

#if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
  if (targetm.has_ifunc_p ())
    {
      struct cgraph_function_version_info *it_v = NULL;

      /* Right now, the dispatching is done via ifunc.  */
      dispatch_decl = make_dispatcher_decl (default_node->decl);

      /* Set the dispatcher for all the versions.  */
      it_v = default_version_info;
      while (it_v != NULL)
	{
	  it_v->dispatcher_resolver = dispatch_decl;
	  it_v = it_v->next;
	}
    }
  else
#endif
    {
      /* ifunc-based dispatch is the only mechanism implemented; without
	 it multiversioning cannot work.  */
      error_at (DECL_SOURCE_LOCATION (default_node->decl),
		"multiversioning needs %<ifunc%> which is not supported "
		"on this target");
    }

  return dispatch_decl;
}
5599 :
5600 : /* Make the resolver function decl to dispatch the versions of
5601 : a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
5602 : ifunc alias that will point to the created resolver. Create an
5603 : empty basic block in the resolver and store the pointer in
5604 : EMPTY_BB. Return the decl of the resolver function. */
5605 :
static tree
make_resolver_func (const tree default_decl,
		    const tree ifunc_alias_decl,
		    basic_block *empty_bb)
{
  tree decl, type, t;

  /* The resolver function should return a (void *). */
  type = build_function_type_list (ptr_type_node, NULL_TREE);

  cgraph_node *node = cgraph_node::get (default_decl);
  gcc_assert (node && node->function_version ());

  decl = build_fn_decl (IDENTIFIER_POINTER (DECL_NAME (default_decl)), type);

  /* Set the assembler name to prevent cgraph_node attempting to mangle.  */
  SET_DECL_ASSEMBLER_NAME (decl, DECL_ASSEMBLER_NAME (default_decl));

  cgraph_node *resolver_node = cgraph_node::get_create (decl);
  resolver_node->dispatcher_resolver_function = true;

  /* Propagate the target_clones origin so mangling stays consistent.  */
  if (node->is_target_clone)
    resolver_node->is_target_clone = true;

  /* Now mangle for real, appending the "resolver" suffix.  */
  tree id = ix86_mangle_decl_assembler_name
    (decl, node->function_version ()->assembler_name);
  symtab->change_decl_assembler_name (decl, id);

  DECL_NAME (decl) = DECL_NAME (default_decl);
  TREE_USED (decl) = 1;
  DECL_ARTIFICIAL (decl) = 1;
  DECL_IGNORED_P (decl) = 1;
  TREE_PUBLIC (decl) = 0;
  DECL_UNINLINABLE (decl) = 1;

  /* Resolver is not external, body is generated.  */
  DECL_EXTERNAL (decl) = 0;
  DECL_EXTERNAL (ifunc_alias_decl) = 0;

  DECL_CONTEXT (decl) = NULL_TREE;
  DECL_INITIAL (decl) = make_node (BLOCK);
  DECL_STATIC_CONSTRUCTOR (decl) = 0;

  if (DECL_COMDAT_GROUP (default_decl)
      || TREE_PUBLIC (default_decl))
    {
      /* In this case, each translation unit with a call to this
	 versioned function will put out a resolver.  Ensure it
	 is comdat to keep just one copy.  */
      DECL_COMDAT (decl) = 1;
      make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
    }
  else
    TREE_PUBLIC (ifunc_alias_decl) = 0;

  /* Build result decl and add to function_decl. */
  t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
  DECL_CONTEXT (t) = decl;
  DECL_ARTIFICIAL (t) = 1;
  DECL_IGNORED_P (t) = 1;
  DECL_RESULT (decl) = t;

  /* Give the resolver an empty lowered body; the caller fills in the
     dispatch logic starting at *EMPTY_BB.  */
  gimplify_function_tree (decl);
  push_cfun (DECL_STRUCT_FUNCTION (decl));
  *empty_bb = init_lowered_empty_function (decl, false,
					   profile_count::uninitialized ());

  cgraph_node::add_new_function (decl, true);
  symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));

  pop_cfun ();

  gcc_assert (ifunc_alias_decl != NULL);
  /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name.  */
  DECL_ATTRIBUTES (ifunc_alias_decl)
    = make_attribute ("ifunc", IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)),
		      DECL_ATTRIBUTES (ifunc_alias_decl));

  /* Create the alias for dispatch to resolver here.  */
  cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
  return decl;
}
5688 :
5689 : /* Generate the dispatching code body to dispatch multi-versioned function
5690 : DECL. The target hook is called to process the "target" attributes and
5691 : provide the code to dispatch the right function at run-time. NODE points
5692 : to the dispatcher decl whose body will be created. */
5693 :
tree
ix86_generate_version_dispatcher_body (void *node_p)
{
  tree resolver_decl;
  basic_block empty_bb;
  tree default_ver_decl;
  struct cgraph_node *versn;
  struct cgraph_node *node;

  struct cgraph_function_version_info *node_version_info = NULL;
  struct cgraph_function_version_info *versn_info = NULL;

  node = (cgraph_node *)node_p;

  node_version_info = node->function_version ();
  gcc_assert (node->dispatcher_function
	      && node_version_info != NULL);

  /* Memoized: the resolver body was already generated.  */
  if (node_version_info->dispatcher_resolver)
    return node_version_info->dispatcher_resolver;

  /* The first version in the chain corresponds to the default version.  */
  default_ver_decl = node_version_info->next->this_node->decl;

  /* node is going to be an alias, so remove the finalized bit.  */
  node->definition = false;

  resolver_decl = make_resolver_func (default_ver_decl,
				      node->decl, &empty_bb);

  node_version_info->dispatcher_resolver = resolver_decl;

  push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));

  /* Collect all version decls reachable from the dispatcher node.  */
  auto_vec<tree, 2> fn_ver_vec;

  for (versn_info = node_version_info->next; versn_info;
       versn_info = versn_info->next)
    {
      versn = versn_info->this_node;
      /* Check for virtual functions here again, as by this time it should
	 have been determined if this function needs a vtable index or
	 not.  This happens for methods in derived classes that override
	 virtual methods in base classes but are not explicitly marked as
	 virtual.  */
      if (DECL_VIRTUAL_P (versn->decl))
	sorry ("virtual function multiversioning not supported");

      fn_ver_vec.safe_push (versn->decl);
    }

  /* Fill the resolver body with the dispatch conditions and rebuild the
     call graph edges the new GIMPLE introduced.  */
  dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
  cgraph_edge::rebuild_edges ();
  pop_cfun ();
  return resolver_decl;
}
5750 :
5751 :
|