Line data Source code
1 : /* Copyright (C) 1988-2026 Free Software Foundation, Inc.
2 :
3 : This file is part of GCC.
4 :
5 : GCC is free software; you can redistribute it and/or modify
6 : it under the terms of the GNU General Public License as published by
7 : the Free Software Foundation; either version 3, or (at your option)
8 : any later version.
9 :
10 : GCC is distributed in the hope that it will be useful,
11 : but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : GNU General Public License for more details.
14 :
15 : You should have received a copy of the GNU General Public License
16 : along with GCC; see the file COPYING3. If not see
17 : <http://www.gnu.org/licenses/>. */
18 :
19 : #define IN_TARGET_CODE 1
20 :
21 : #include "config.h"
22 : #include "system.h"
23 : #include "coretypes.h"
24 : #include "backend.h"
25 : #include "rtl.h"
26 : #include "tree.h"
27 : #include "memmodel.h"
28 : #include "gimple.h"
29 : #include "cfghooks.h"
30 : #include "cfgloop.h"
31 : #include "df.h"
32 : #include "tm_p.h"
33 : #include "stringpool.h"
34 : #include "expmed.h"
35 : #include "optabs.h"
36 : #include "regs.h"
37 : #include "emit-rtl.h"
38 : #include "recog.h"
39 : #include "cgraph.h"
40 : #include "diagnostic.h"
41 : #include "cfgbuild.h"
42 : #include "alias.h"
43 : #include "fold-const.h"
44 : #include "attribs.h"
45 : #include "calls.h"
46 : #include "stor-layout.h"
47 : #include "varasm.h"
48 : #include "output.h"
49 : #include "insn-attr.h"
50 : #include "flags.h"
51 : #include "except.h"
52 : #include "explow.h"
53 : #include "expr.h"
54 : #include "cfgrtl.h"
55 : #include "common/common-target.h"
56 : #include "langhooks.h"
57 : #include "reload.h"
58 : #include "gimplify.h"
59 : #include "dwarf2.h"
60 : #include "tm-constrs.h"
61 : #include "cselib.h"
62 : #include "sched-int.h"
63 : #include "opts.h"
64 : #include "tree-pass.h"
65 : #include "context.h"
66 : #include "pass_manager.h"
67 : #include "target-globals.h"
68 : #include "gimple-iterator.h"
69 : #include "shrink-wrap.h"
70 : #include "builtins.h"
71 : #include "rtl-iter.h"
72 : #include "tree-iterator.h"
73 : #include "dbgcnt.h"
74 : #include "case-cfn-macros.h"
75 : #include "dojump.h"
76 : #include "fold-const-call.h"
77 : #include "tree-vrp.h"
78 : #include "tree-ssanames.h"
79 : #include "selftest.h"
80 : #include "selftest-rtl.h"
81 : #include "print-rtl.h"
82 : #include "intl.h"
83 : #include "ifcvt.h"
84 : #include "symbol-summary.h"
85 : #include "sreal.h"
86 : #include "ipa-cp.h"
87 : #include "ipa-prop.h"
88 : #include "ipa-fnsummary.h"
89 : #include "wide-int-bitmask.h"
90 : #include "tree-vector-builder.h"
91 : #include "debug.h"
92 : #include "dwarf2out.h"
93 : #include "i386-builtins.h"
94 : #include "i386-features.h"
95 : #include "i386-expand.h"
96 :
/* Base names of the out-of-line prologue/epilogue stubs, indexed by
   enum xlogue_stub.  The full linker-visible name is composed lazily by
   get_stub_name as "__<sse|avx>_<base>_<nregs>".  */
const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
  "savms64",
  "resms64",
  "resms64x",
  "savms64f",
  "resms64f",
  "resms64fx"
};
105 :
/* Order in which the stubs save/restore registers; the constructor walks
   this table to assign each managed register its stack offset.  */
const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
/* The below offset values are where each register is stored for the layout
   relative to incoming stack pointer.  The value of each m_regs[].offset will
   be relative to the incoming base pointer (rax or rsi) used by the stub.

    s_instances:   0		1		2		3
    Offset:					realigned or	aligned + 8
    Register	   aligned	aligned + 8	aligned w/HFP	w/HFP	*/
    XMM15_REG,	/* 0x10	0x18		0x10		0x18	*/
    XMM14_REG,	/* 0x20	0x28		0x20		0x28	*/
    XMM13_REG,	/* 0x30	0x38		0x30		0x38	*/
    XMM12_REG,	/* 0x40	0x48		0x40		0x48	*/
    XMM11_REG,	/* 0x50	0x58		0x50		0x58	*/
    XMM10_REG,	/* 0x60	0x68		0x60		0x68	*/
    XMM9_REG,	/* 0x70	0x78		0x70		0x78	*/
    XMM8_REG,	/* 0x80	0x88		0x80		0x88	*/
    XMM7_REG,	/* 0x90	0x98		0x90		0x98	*/
    XMM6_REG,	/* 0xa0	0xa8		0xa0		0xa8	*/
    SI_REG,	/* 0xa8	0xb0		0xa8		0xb0	*/
    DI_REG,	/* 0xb0	0xb8		0xb0		0xb8	*/
    BX_REG,	/* 0xb8	0xc0		0xb8		0xc0	*/
    /* BP_REG is skipped by the stubs when a hard frame pointer is used.  */
    BP_REG,	/* 0xc0	0xc8		N/A		N/A	*/
    R12_REG,	/* 0xc8	0xd0		0xc0		0xc8	*/
    R13_REG,	/* 0xd0	0xd8		0xc8		0xd0	*/
    R14_REG,	/* 0xd8	0xe0		0xd0		0xd8	*/
    R15_REG,	/* 0xe0	0xe8		0xd8		0xe0	*/
};
133 :
/* Instantiate static const values.  Out-of-class definitions are needed
   for ODR-uses of the in-class initialized constants.  */
const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
const unsigned xlogue_layout::MIN_REGS;
const unsigned xlogue_layout::MAX_REGS;
const unsigned xlogue_layout::MAX_EXTRA_REGS;
const unsigned xlogue_layout::VARIANT_COUNT;
const unsigned xlogue_layout::STUB_NAME_MAX_LEN;

/* Initialize xlogue_layout::s_stub_names to zero.  Filled in lazily by
   get_stub_name; first index selects SSE (0) vs. AVX (1) stub names.  */
char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
				  [STUB_NAME_MAX_LEN];
145 :
/* Instantiates all xlogue_layout instances.  Indexed by
   enum xlogue_stub_sets; the four layouts cover the combinations of
   incoming stack pad (0 or 8 bytes) and hard-frame-pointer use
   selected by get_instance.  */
const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
  xlogue_layout (0, false),
  xlogue_layout (8, false),
  xlogue_layout (0, true),
  xlogue_layout (8, true)
};
153 :
154 : /* Return an appropriate const instance of xlogue_layout based upon values
155 : in cfun->machine and crtl. */
156 : const class xlogue_layout &
157 49891 : xlogue_layout::get_instance ()
158 : {
159 49891 : enum xlogue_stub_sets stub_set;
160 49891 : bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
161 :
162 49891 : if (stack_realign_fp)
163 : stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
164 40910 : else if (frame_pointer_needed)
165 25246 : stub_set = aligned_plus_8
166 31552 : ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
167 : : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
168 : else
169 9358 : stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
170 :
171 49891 : return s_instances[stub_set];
172 : }
173 :
174 : /* Determine how many clobbered registers can be saved by the stub.
175 : Returns the count of registers the stub will save and restore. */
176 : unsigned
177 35225 : xlogue_layout::count_stub_managed_regs ()
178 : {
179 35225 : bool hfp = frame_pointer_needed || stack_realign_fp;
180 35225 : unsigned i, count;
181 35225 : unsigned regno;
182 :
183 94890 : for (count = i = MIN_REGS; i < MAX_REGS; ++i)
184 : {
185 93670 : regno = REG_ORDER[i];
186 93670 : if (regno == BP_REG && hfp)
187 18200 : continue;
188 75470 : if (!ix86_save_reg (regno, false, false))
189 : break;
190 41465 : ++count;
191 : }
192 35225 : return count;
193 : }
194 :
195 : /* Determine if register REGNO is a stub managed register given the
196 : total COUNT of stub managed registers. */
197 : bool
198 2650688 : xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
199 : {
200 2650688 : bool hfp = frame_pointer_needed || stack_realign_fp;
201 2650688 : unsigned i;
202 :
203 34587805 : for (i = 0; i < count; ++i)
204 : {
205 32436986 : gcc_assert (i < MAX_REGS);
206 32436986 : if (REG_ORDER[i] == BP_REG && hfp)
207 522627 : ++count;
208 31914359 : else if (REG_ORDER[i] == regno)
209 : return true;
210 : }
211 : return false;
212 : }
213 :
/* Constructor for xlogue_layout.  STACK_ALIGN_OFF_IN is the extra incoming
   stack offset (0 or 8, matching s_instances) and HFP says whether a hard
   frame pointer is used, in which case BP_REG is excluded and only 17
   registers are managed.  Computes each managed register's offset
   relative to the stub's base pointer.  */
xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
  : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
    m_stack_align_off_in (stack_align_off_in)
{
  HOST_WIDE_INT offset = stack_align_off_in;
  unsigned i, j;

  for (i = j = 0; i < MAX_REGS; ++i)
    {
      unsigned regno = REG_ORDER[i];

      /* With a hard frame pointer, BP_REG is not managed by the stub.  */
      if (regno == BP_REG && hfp)
	continue;
      if (SSE_REGNO_P (regno))
	{
	  /* SSE registers occupy a 16-byte slot.  */
	  offset += 16;
	  /* Verify that SSE regs are always aligned.  */
	  gcc_assert (!((stack_align_off_in + offset) & 15));
	}
      else
	/* GP registers occupy an 8-byte slot.  */
	offset += 8;

      m_regs[j].regno = regno;
      m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
    }
  gcc_assert (j == m_nregs);
}
242 :
/* Return the name of stub STUB handling MIN_REGS + N_EXTRA_REGS registers.
   Names are composed once into the static s_stub_names cache as
   "__<sse|avx>_<base>_<count>" and reused on later calls.  */
const char *
xlogue_layout::get_stub_name (enum xlogue_stub stub,
			      unsigned n_extra_regs)
{
  const int have_avx = TARGET_AVX;
  char *name = s_stub_names[!!have_avx][stub][n_extra_regs];

  /* Lazy init: an empty string marks a not-yet-composed name.  */
  if (!*name)
    {
      int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
			  (have_avx ? "avx" : "sse"),
			  STUB_BASE_NAMES[stub],
			  MIN_REGS + n_extra_regs);
      gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
    }

  return name;
}
262 :
/* Return rtx of a symbol ref for the entry point (based upon
   cfun->machine->call_ms2sysv_extra_regs) of the specified stub.  */
rtx
xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
{
  const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
  gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
  gcc_assert (stub < XLOGUE_STUB_COUNT);
  /* Stack realignment must be settled before the layout is usable.  */
  gcc_assert (crtl->stack_realign_finalized);

  return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
}
275 :
/* Monotonically increasing id; each new chain gets ++max_id as chain_id.  */
unsigned scalar_chain::max_id = 0;
277 :
278 : namespace {
279 :
/* Initialize new chain.  SMODE_ is the scalar mode being replaced,
   VMODE_ the vector mode it is converted to.  */

scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
{
  smode = smode_;
  vmode = vmode_;

  chain_id = ++max_id;

  if (dump_file)
    fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);

  bitmap_obstack_initialize (NULL);
  insns = BITMAP_ALLOC (NULL);
  defs = BITMAP_ALLOC (NULL);
  defs_conv = BITMAP_ALLOC (NULL);
  insns_conv = BITMAP_ALLOC (NULL);
  /* The work queue is allocated lazily in build ().  */
  queue = NULL;

  /* Accumulated inter-unit (SSE<->integer) move costs; the weighted
     variant is scaled by basic-block frequency in mark_dual_mode_def.  */
  cost_sse_integer = 0;
  weighted_cost_sse_integer = 0 ;
  /* Budget bounding the DF-chain walk; see analyze_register_chain.  */
  max_visits = x86_stv_max_visits;
}
303 :
/* Free chain's data.  Releases the four bitmaps allocated by the
   constructor (queue is freed at the end of build ()).  */

scalar_chain::~scalar_chain ()
{
  BITMAP_FREE (insns);
  BITMAP_FREE (defs);
  BITMAP_FREE (defs_conv);
  BITMAP_FREE (insns_conv);
  bitmap_obstack_release (NULL);
}
314 :
315 : /* Add instruction into chains' queue. */
316 :
317 : void
318 8176123 : scalar_chain::add_to_queue (unsigned insn_uid)
319 : {
320 8176123 : if (!bitmap_set_bit (queue, insn_uid))
321 : return;
322 :
323 6151556 : if (dump_file)
324 141 : fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
325 : insn_uid, chain_id);
326 : }
327 :
/* For DImode conversion, mark register defined by DEF as requiring
   conversion.  Accumulates the estimated cost of the inter-unit move
   into cost_sse_integer / weighted_cost_sse_integer.  */

void
scalar_chain::mark_dual_mode_def (df_ref def)
{
  gcc_assert (DF_REF_REG_DEF_P (def));

  /* Record the def/insn pair so we can later efficiently iterate over
     the defs to convert on insns not in the chain.  */
  bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
  basic_block bb = BLOCK_FOR_INSN (DF_REF_INSN (def));
  profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
  bool speed_p = optimize_bb_for_speed_p (bb);
  int cost = 0;

  if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
    {
      /* Def is outside the chain: its value must be moved into the
	 vector unit.  Bail out if both insn and reg were seen before.  */
      if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
	  && !reg_new)
	return;

      /* Cost integer to sse moves.  */
      if (speed_p)
	cost = COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2;
      else if (TARGET_64BIT || smode == SImode)
	cost = COSTS_N_BYTES (4);
      /* vmovd (4 bytes) + vpinsrd (6 bytes).  */
      else if (TARGET_SSE4_1)
	cost = COSTS_N_BYTES (10);
      /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes).  */
      else
	cost = COSTS_N_BYTES (12);
    }
  else
    {
      /* Def is inside the chain: scalar uses outside need an
	 SSE-to-integer move; cost it only once per register.  */
      if (!reg_new)
	return;

      /* Cost sse to integer moves.  */
      if (speed_p)
	cost = COSTS_N_INSNS (ix86_cost->sse_to_integer) / 2;
      else if (TARGET_64BIT || smode == SImode)
	cost = COSTS_N_BYTES (4);
      /* vmovd (4 bytes) + vpextrd (6 bytes).  */
      else if (TARGET_SSE4_1)
	cost = COSTS_N_BYTES (10);
      /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes).  */
      else
	cost = COSTS_N_BYTES (13);
    }

  /* Weight the cost by the block's execution frequency relative to the
     function entry when optimizing for speed.  */
  if (speed_p)
    weighted_cost_sse_integer += bb->count.to_sreal_scale (entry_count) * cost;

  cost_sse_integer += cost;

  if (dump_file)
    fprintf (dump_file,
	     " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
	     DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
}
390 :
/* Check REF's chain to add new insns into a queue
   and find registers requiring conversion.  Return true if OK, false
   if the analysis was aborted (visit budget exhausted or a previously
   disallowed insn was reached).  */

bool
scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref,
				      bitmap disallowed)
{
  df_link *chain;
  bool mark_def = false;

  /* REF must belong to an insn already in this chain.  */
  gcc_checking_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)));

  for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
    {
      unsigned uid = DF_REF_INSN_UID (chain->ref);

      /* Debug insns never influence conversion.  */
      if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
	continue;

      /* Bound the total amount of DF-chain walking per chain.  */
      if (--max_visits == 0)
	return false;

      if (!DF_REF_REG_MEM_P (chain->ref))
	{
	  /* Already a member of this chain.  */
	  if (bitmap_bit_p (insns, uid))
	    continue;

	  /* A convertible candidate joins the chain via the queue.  */
	  if (bitmap_bit_p (candidates, uid))
	    {
	      add_to_queue (uid);
	      continue;
	    }

	  /* If we run into parts of an aborted chain discovery abort.  */
	  if (bitmap_bit_p (disallowed, uid))
	    return false;
	}

      /* The linked ref is not convertible; its register will need to
	 live in both modes.  */
      if (DF_REF_REG_DEF_P (chain->ref))
	{
	  if (dump_file)
	    fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  mark_dual_mode_def (chain->ref);
	}
      else
	{
	  if (dump_file)
	    fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
		     DF_REF_REGNO (chain->ref), uid);
	  mark_def = true;
	}
    }

  /* A non-convertible use was found, so REF itself needs both modes.  */
  if (mark_def)
    mark_dual_mode_def (ref);

  return true;
}
451 :
452 : /* Check whether X is a convertible *concatditi_? variant. X is known
453 : to be any_or_plus:TI, i.e. PLUS:TI, IOR:TI or XOR:TI. */
454 :
455 : static bool
456 26830 : timode_concatdi_p (rtx x)
457 : {
458 26830 : rtx op0 = XEXP (x, 0);
459 26830 : rtx op1 = XEXP (x, 1);
460 :
461 26830 : if (GET_CODE (op1) == ASHIFT)
462 946 : std::swap (op0, op1);
463 :
464 26830 : return GET_CODE (op0) == ASHIFT
465 18161 : && GET_CODE (XEXP (op0, 0)) == ZERO_EXTEND
466 18161 : && GET_MODE (XEXP (XEXP (op0, 0), 0)) == DImode
467 18161 : && REG_P (XEXP (XEXP (op0, 0), 0))
468 18026 : && CONST_INT_P (XEXP (op0, 1))
469 18026 : && INTVAL (XEXP (op0, 1)) == 64
470 18026 : && GET_CODE (op1) == ZERO_EXTEND
471 17080 : && GET_MODE (XEXP (op1, 0)) == DImode
472 43910 : && REG_P (XEXP (op1, 0));
473 : }
474 :
475 :
/* Add instruction into a chain.  Return true if OK, false if the search
   was aborted.  */

bool
scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid,
			bitmap disallowed)
{
  /* Nothing to do if the insn was already added.  */
  if (!bitmap_set_bit (insns, insn_uid))
    return true;

  if (dump_file)
    fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);

  /* Record pseudo destinations of single-set insns as chain defs.  */
  rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
  rtx def_set = single_set (insn);
  if (def_set && REG_P (SET_DEST (def_set))
      && !HARD_REGISTER_P (SET_DEST (def_set)))
    bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));

  /* ??? The following is quadratic since analyze_register_chain
     iterates over all refs to look for dual-mode regs.  Instead this
     should be done separately for all regs mentioned in the chain once.  */
  df_ref ref;
  for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!HARD_REGISTER_P (DF_REF_REG (ref)))
      if (!analyze_register_chain (candidates, ref, disallowed))
	return false;

  /* The operand(s) of VEC_SELECT, ZERO_EXTEND and similar ops don't need
     to be converted/convertible.  */
  if (def_set)
    switch (GET_CODE (SET_SRC (def_set)))
      {
      case VEC_SELECT:
	return true;
      case ZERO_EXTEND:
	if (GET_MODE (XEXP (SET_SRC (def_set), 0)) == DImode)
	  return true;
	break;
      case PLUS:
      case IOR:
      case XOR:
	/* DI->TI concatenations keep their DImode operands scalar.  */
	if (smode == TImode && timode_concatdi_p (SET_SRC (def_set)))
	  return true;
	break;
      default:
	break;
      }

  /* Analyze the register uses as well (memory refs stay as they are).  */
  for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!DF_REF_REG_MEM_P (ref))
      if (!analyze_register_chain (candidates, ref, disallowed))
	return false;

  return true;
}
532 :
/* Build new chain starting from insn INSN_UID recursively
   adding all dependent uses and definitions.  Return true if OK, false
   if the chain discovery was aborted.  */

bool
scalar_chain::build (bitmap candidates, unsigned insn_uid, bitmap disallowed)
{
  queue = BITMAP_ALLOC (NULL);
  bitmap_set_bit (queue, insn_uid);

  if (dump_file)
    fprintf (dump_file, "Building chain #%d...\n", chain_id);

  /* Worklist loop: each queued insn is removed from the candidate set
     and added to the chain, which may in turn queue more insns.  */
  while (!bitmap_empty_p (queue))
    {
      insn_uid = bitmap_first_set_bit (queue);
      bitmap_clear_bit (queue, insn_uid);
      bitmap_clear_bit (candidates, insn_uid);
      if (!add_insn (candidates, insn_uid, disallowed))
	{
	  /* If we aborted the search put sofar found insn on the set of
	     disallowed insns so that further searches reaching them also
	     abort and thus we abort the whole but yet undiscovered chain.  */
	  bitmap_ior_into (disallowed, insns);
	  if (dump_file)
	    fprintf (dump_file, "Aborted chain #%d discovery\n", chain_id);
	  BITMAP_FREE (queue);
	  return false;
	}
    }

  if (dump_file)
    {
      fprintf (dump_file, "Collected chain #%d...\n", chain_id);
      fprintf (dump_file, " insns: ");
      dump_bitmap (dump_file, insns);
      if (!bitmap_empty_p (defs_conv))
	{
	  bitmap_iterator bi;
	  unsigned id;
	  const char *comma = "";
	  fprintf (dump_file, " defs to convert: ");
	  EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
	    {
	      fprintf (dump_file, "%sr%d", comma, id);
	      comma = ", ";
	    }
	  fprintf (dump_file, "\n");
	}
    }

  BITMAP_FREE (queue);

  return true;
}
588 :
/* Return a cost of building a vector constant
   instead of using a scalar one.  EXP must be a CONST_INT; BB selects
   size vs. speed costing.  */

int
general_scalar_chain::vector_const_cost (rtx exp, basic_block bb)
{
  gcc_assert (CONST_INT_P (exp));

  /* Constants the hardware materializes directly (e.g. 0, all-ones)
     cost a single SSE op.  */
  if (standard_sse_constant_p (exp, vmode))
    return ix86_cost->sse_op;
  if (optimize_bb_for_size_p (bb))
    return COSTS_N_BYTES (8);
  /* We have separate costs for SImode and DImode, use SImode costs
     for smaller modes.  */
  return COSTS_N_INSNS (ix86_cost->sse_load[smode == DImode ? 1 : 0]) / 2;
}
605 :
606 : /* Return true if it's cost profitable for chain conversion. */
607 :
608 : bool
609 5842535 : general_scalar_chain::compute_convert_gain ()
610 : {
611 5842535 : bitmap_iterator bi;
612 5842535 : unsigned insn_uid;
613 5842535 : int gain = 0;
614 5842535 : sreal weighted_gain = 0;
615 :
616 5842535 : if (dump_file)
617 136 : fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
618 :
619 : /* SSE costs distinguish between SImode and DImode loads/stores, for
620 : int costs factor in the number of GPRs involved. When supporting
621 : smaller modes than SImode the int load/store costs need to be
622 : adjusted as well. */
623 5842535 : unsigned sse_cost_idx = smode == DImode ? 1 : 0;
624 5842535 : int m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
625 :
626 17360138 : EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
627 : {
628 11517603 : rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
629 11517603 : rtx def_set = single_set (insn);
630 11517603 : rtx src = SET_SRC (def_set);
631 11517603 : rtx dst = SET_DEST (def_set);
632 11517603 : basic_block bb = BLOCK_FOR_INSN (insn);
633 11517603 : int igain = 0;
634 11517603 : profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
635 11517603 : bool speed_p = optimize_bb_for_speed_p (bb);
636 11517603 : sreal bb_freq = bb->count.to_sreal_scale (entry_count);
637 :
638 11517603 : if (REG_P (src) && REG_P (dst))
639 : {
640 922764 : if (!speed_p)
641 : /* reg-reg move is 2 bytes, while SSE 3. */
642 187039 : igain += COSTS_N_BYTES (2 * m - 3);
643 : else
644 : /* Move costs are normalized to reg-reg move having cost 2. */
645 735725 : igain += COSTS_N_INSNS (2 * m - ix86_cost->xmm_move) / 2;
646 : }
647 10594839 : else if (REG_P (src) && MEM_P (dst))
648 : {
649 2301079 : if (!speed_p)
650 : /* Integer load/store is 3+ bytes and SSE 4+. */
651 194174 : igain += COSTS_N_BYTES (3 * m - 4);
652 : else
653 2106905 : igain
654 2106905 : += COSTS_N_INSNS (m * ix86_cost->int_store[2]
655 : - ix86_cost->sse_store[sse_cost_idx]) / 2;
656 : }
657 8293760 : else if (MEM_P (src) && REG_P (dst))
658 : {
659 3746136 : if (!speed_p)
660 361157 : igain += COSTS_N_BYTES (3 * m - 4);
661 : else
662 3384979 : igain += COSTS_N_INSNS (m * ix86_cost->int_load[2]
663 : - ix86_cost->sse_load[sse_cost_idx]) / 2;
664 : }
665 : else
666 : {
667 : /* For operations on memory operands, include the overhead
668 : of explicit load and store instructions. */
669 4547624 : if (MEM_P (dst))
670 : {
671 66315 : if (!speed_p)
672 : /* ??? This probably should account size difference
673 : of SSE and integer load rather than full SSE load. */
674 : igain -= COSTS_N_BYTES (8);
675 : else
676 : {
677 57071 : int cost = (m * (ix86_cost->int_load[2]
678 57071 : + ix86_cost->int_store[2])
679 57071 : - (ix86_cost->sse_load[sse_cost_idx] +
680 57071 : ix86_cost->sse_store[sse_cost_idx]));
681 57071 : igain += COSTS_N_INSNS (cost) / 2;
682 : }
683 : }
684 :
685 4547624 : switch (GET_CODE (src))
686 : {
687 469336 : case ASHIFT:
688 469336 : case ASHIFTRT:
689 469336 : case LSHIFTRT:
690 469336 : if (m == 2)
691 : {
692 17068 : if (INTVAL (XEXP (src, 1)) >= 32)
693 11526 : igain += ix86_cost->add;
694 : /* Gain for extend highpart case. */
695 5542 : else if (GET_CODE (XEXP (src, 0)) == ASHIFT)
696 0 : igain += ix86_cost->shift_const - ix86_cost->sse_op;
697 : else
698 5542 : igain += ix86_cost->shift_const;
699 : }
700 :
701 469336 : igain += ix86_cost->shift_const - ix86_cost->sse_op;
702 :
703 469336 : if (CONST_INT_P (XEXP (src, 0)))
704 0 : igain -= vector_const_cost (XEXP (src, 0), bb);
705 : break;
706 :
707 3817 : case ROTATE:
708 3817 : case ROTATERT:
709 3817 : igain += m * ix86_cost->shift_const;
710 3817 : if (TARGET_AVX512VL)
711 204 : igain -= ix86_cost->sse_op;
712 3613 : else if (smode == DImode)
713 : {
714 612 : int bits = INTVAL (XEXP (src, 1));
715 612 : if ((bits & 0x0f) == 0)
716 128 : igain -= ix86_cost->sse_op;
717 484 : else if ((bits & 0x07) == 0)
718 27 : igain -= 2 * ix86_cost->sse_op;
719 : else
720 457 : igain -= 3 * ix86_cost->sse_op;
721 : }
722 3001 : else if (INTVAL (XEXP (src, 1)) == 16)
723 240 : igain -= ix86_cost->sse_op;
724 : else
725 2761 : igain -= 2 * ix86_cost->sse_op;
726 : break;
727 :
728 2801816 : case AND:
729 2801816 : case IOR:
730 2801816 : case XOR:
731 2801816 : case PLUS:
732 2801816 : case MINUS:
733 2801816 : igain += m * ix86_cost->add - ix86_cost->sse_op;
734 : /* Additional gain for andnot for targets without BMI. */
735 2801816 : if (GET_CODE (XEXP (src, 0)) == NOT
736 3599 : && !TARGET_BMI)
737 3590 : igain += m * ix86_cost->add;
738 :
739 2801816 : if (CONST_INT_P (XEXP (src, 0)))
740 0 : igain -= vector_const_cost (XEXP (src, 0), bb);
741 2801816 : if (CONST_INT_P (XEXP (src, 1)))
742 1674978 : igain -= vector_const_cost (XEXP (src, 1), bb);
743 2801816 : if (MEM_P (XEXP (src, 1)))
744 : {
745 84881 : if (!speed_p)
746 20485 : igain -= COSTS_N_BYTES (m == 2 ? 3 : 5);
747 : else
748 74634 : igain += COSTS_N_INSNS
749 : (m * ix86_cost->int_load[2]
750 : - ix86_cost->sse_load[sse_cost_idx]) / 2;
751 : }
752 : break;
753 :
754 49966 : case NEG:
755 49966 : case NOT:
756 49966 : igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);
757 :
758 49966 : if (GET_CODE (XEXP (src, 0)) != ABS)
759 : {
760 49966 : igain += m * ix86_cost->add;
761 49966 : break;
762 : }
763 : /* FALLTHRU */
764 :
765 1006 : case ABS:
766 1006 : case SMAX:
767 1006 : case SMIN:
768 1006 : case UMAX:
769 1006 : case UMIN:
770 : /* We do not have any conditional move cost, estimate it as a
771 : reg-reg move. Comparisons are costed as adds. */
772 1006 : igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
773 : /* Integer SSE ops are all costed the same. */
774 1006 : igain -= ix86_cost->sse_op;
775 1006 : break;
776 :
777 0 : case COMPARE:
778 0 : if (XEXP (src, 1) != const0_rtx)
779 : {
780 : /* cmp vs. pxor;pshufd;ptest. */
781 0 : igain += COSTS_N_INSNS (m - 3);
782 : }
783 0 : else if (GET_CODE (XEXP (src, 0)) != AND)
784 : {
785 : /* test vs. pshufd;ptest. */
786 0 : igain += COSTS_N_INSNS (m - 2);
787 : }
788 0 : else if (GET_CODE (XEXP (XEXP (src, 0), 0)) != NOT)
789 : {
790 : /* and;test vs. pshufd;ptest. */
791 0 : igain += COSTS_N_INSNS (2 * m - 2);
792 : }
793 0 : else if (TARGET_BMI)
794 : {
795 : /* andn;test vs. pandn;pshufd;ptest. */
796 0 : igain += COSTS_N_INSNS (2 * m - 3);
797 : }
798 : else
799 : {
800 : /* not;and;test vs. pandn;pshufd;ptest. */
801 0 : igain += COSTS_N_INSNS (3 * m - 3);
802 : }
803 : break;
804 :
805 1185549 : case CONST_INT:
806 1185549 : if (REG_P (dst))
807 : {
808 1185549 : if (!speed_p)
809 : {
810 : /* xor (2 bytes) vs. xorps (3 bytes). */
811 228973 : if (src == const0_rtx)
812 120066 : igain -= COSTS_N_BYTES (1);
813 : /* movdi_internal vs. movv2di_internal. */
814 : /* => mov (5 bytes) vs. movaps (7 bytes). */
815 108907 : else if (x86_64_immediate_operand (src, SImode))
816 95987 : igain -= COSTS_N_BYTES (2);
817 : else
818 : /* ??? Larger immediate constants are placed in the
819 : constant pool, where the size benefit/impact of
820 : STV conversion is affected by whether and how
821 : often each constant pool entry is shared/reused.
822 : The value below is empirically derived from the
823 : CSiBE benchmark (and the optimal value may drift
824 : over time). */
825 : igain += COSTS_N_BYTES (0);
826 : }
827 : else
828 : {
829 : /* DImode can be immediate for TARGET_64BIT
830 : and SImode always. */
831 956576 : igain += m * COSTS_N_INSNS (1);
832 956576 : igain -= vector_const_cost (src, bb);
833 : }
834 : }
835 0 : else if (MEM_P (dst))
836 : {
837 0 : igain += (m * ix86_cost->int_store[2]
838 0 : - ix86_cost->sse_store[sse_cost_idx]);
839 0 : igain -= vector_const_cost (src, bb);
840 : }
841 : break;
842 :
843 36134 : case VEC_SELECT:
844 36134 : if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
845 : {
846 : // movd (4 bytes) replaced with movdqa (4 bytes).
847 26667 : if (!!speed_p)
848 24910 : igain += COSTS_N_INSNS (ix86_cost->sse_to_integer
849 : - ix86_cost->xmm_move) / 2;
850 : }
851 : else
852 : {
853 : // pshufd; movd replaced with pshufd.
854 9467 : if (!speed_p)
855 648 : igain += COSTS_N_BYTES (4);
856 : else
857 8819 : igain += ix86_cost->sse_to_integer;
858 : }
859 : break;
860 :
861 0 : default:
862 0 : gcc_unreachable ();
863 : }
864 : }
865 :
866 11515846 : if (speed_p)
867 10249267 : weighted_gain += bb_freq * igain;
868 11517603 : gain += igain;
869 :
870 11517603 : if (igain != 0 && dump_file)
871 : {
872 93 : fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for",
873 : igain, bb_freq.to_double ());
874 93 : dump_insn_slim (dump_file, insn);
875 : }
876 : }
877 :
878 5842535 : if (dump_file)
879 : {
880 136 : fprintf (dump_file, " Instruction conversion gain: %d, \n",
881 : gain);
882 136 : fprintf (dump_file, " Registers conversion cost: %d\n",
883 : cost_sse_integer);
884 136 : fprintf (dump_file, " Weighted instruction conversion gain: %.2f, \n",
885 : weighted_gain.to_double ());
886 136 : fprintf (dump_file, " Weighted registers conversion cost: %.2f\n",
887 : weighted_cost_sse_integer.to_double ());
888 : }
889 :
890 5842535 : if (weighted_gain != weighted_cost_sse_integer)
891 4706867 : return weighted_gain > weighted_cost_sse_integer;
892 : else
893 1135668 : return gain > cost_sse_integer;;
894 : }
895 :
896 : /* Insert generated conversion instruction sequence INSNS
897 : after instruction AFTER. New BB may be required in case
898 : instruction has EH region attached. */
899 :
900 : void
901 31171 : scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
902 : {
903 31171 : if (!control_flow_insn_p (after))
904 : {
905 30958 : emit_insn_after (insns, after);
906 30958 : return;
907 : }
908 :
909 213 : basic_block bb = BLOCK_FOR_INSN (after);
910 213 : edge e = find_fallthru_edge (bb->succs);
911 213 : gcc_assert (e);
912 :
913 213 : basic_block new_bb = split_edge (e);
914 213 : emit_insn_after (insns, BB_HEAD (new_bb));
915 : }
916 :
917 : } // anon namespace
918 :
/* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
   zeroing the upper parts.  The form depends on the number of vector
   elements: a plain subreg for single-element modes, a concat with zero
   for two elements, and a merge of a duplicate with zero otherwise.  */

static rtx
gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
{
  switch (GET_MODE_NUNITS (vmode))
    {
    case 1:
      return gen_rtx_SUBREG (vmode, gpr, 0);
    case 2:
      return gen_rtx_VEC_CONCAT (vmode, gpr,
				 CONST0_RTX (GET_MODE_INNER (vmode)));
    default:
      /* Mask of 1 keeps only element zero from the duplicate.  */
      return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
				CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U));
    }
}
937 :
/* Make vector copies for all register REGNO definitions
   and replace its uses in a chain.  The vector pseudo comes from the
   defs_map established earlier; the copy sequence is emitted after
   INSN via emit_conversion_insns.  */

void
scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
{
  rtx vreg = *defs_map.get (reg);

  start_sequence ();
  if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
    {
      /* No direct GPR->XMM moves: bounce through a stack slot.  */
      rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
      if (smode == DImode && !TARGET_64BIT)
	{
	  /* 32-bit DImode lives in two GPRs; store both halves.  */
	  emit_move_insn (adjust_address (tmp, SImode, 0),
			  gen_rtx_SUBREG (SImode, reg, 0));
	  emit_move_insn (adjust_address (tmp, SImode, 4),
			  gen_rtx_SUBREG (SImode, reg, 4));
	}
      else
	emit_move_insn (copy_rtx (tmp), reg);
      emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
			      gen_gpr_to_xmm_move_src (vmode, tmp)));
    }
  else if (!TARGET_64BIT && smode == DImode)
    {
      if (TARGET_SSE4_1)
	{
	  /* Load low word with movd, insert high word with pinsrd.  */
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 0)));
	  emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
					gen_rtx_SUBREG (V4SImode, vreg, 0),
					gen_rtx_SUBREG (SImode, reg, 4),
					GEN_INT (2)));
	}
      else
	{
	  /* Without SSE4.1: two movd loads combined with unpcklps.  */
	  rtx tmp = gen_reg_rtx (DImode);
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 0)));
	  emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
				      CONST0_RTX (V4SImode),
				      gen_rtx_SUBREG (SImode, reg, 4)));
	  emit_insn (gen_vec_interleave_lowv4si
		     (gen_rtx_SUBREG (V4SImode, vreg, 0),
		      gen_rtx_SUBREG (V4SImode, vreg, 0),
		      gen_rtx_SUBREG (V4SImode, tmp, 0)));
	}
    }
  else
    /* A single direct inter-unit move suffices.  */
    emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
			    gen_gpr_to_xmm_move_src (vmode, reg)));
  rtx_insn *seq = end_sequence ();
  emit_conversion_insns (seq, insn);

  if (dump_file)
    fprintf (dump_file,
	     " Copied r%d to a vector register r%d for insn %d\n",
	     REGNO (reg), REGNO (vreg), INSN_UID (insn));
}
1000 :
/* Copy the definition SRC of INSN inside the chain to DST for
   scalar uses outside of the chain.

   SRC is the vector-register definition produced inside the chain; DST
   is the original scalar register still referenced by insns outside the
   chain.  The copy sequence is inserted via emit_conversion_insns.  */

void
scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src)
{
  start_sequence ();
  if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
    {
      /* Direct XMM->GPR moves are disabled on this tuning: bounce the
	 value through a dedicated stack slot.  */
      rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
      emit_move_insn (tmp, src);
      if (!TARGET_64BIT && smode == DImode)
	{
	  /* 32-bit target: reload the DImode value as two SImode
	     halves.  */
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
			  adjust_address (tmp, SImode, 0));
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
			  adjust_address (tmp, SImode, 4));
	}
      else
	emit_move_insn (dst, copy_rtx (tmp));
    }
  else if (!TARGET_64BIT && smode == DImode)
    {
      if (TARGET_SSE4_1)
	{
	  /* Extract each 32-bit half with a one-element VEC_SELECT
	     (pextrd-style): element 0 into the low half, element 1 into
	     the high half.  */
	  rtx tmp = gen_rtx_PARALLEL (VOIDmode,
				      gen_rtvec (1, const0_rtx));
	  emit_insn
	    (gen_rtx_SET
	       (gen_rtx_SUBREG (SImode, dst, 0),
		gen_rtx_VEC_SELECT (SImode,
				    gen_rtx_SUBREG (V4SImode, src, 0),
				    tmp)));

	  tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
	  emit_insn
	    (gen_rtx_SET
	       (gen_rtx_SUBREG (SImode, dst, 4),
		gen_rtx_VEC_SELECT (SImode,
				    gen_rtx_SUBREG (V4SImode, src, 0),
				    tmp)));
	}
      else
	{
	  /* Pre-SSE4.1: copy to a scratch vector, movd the low dword,
	     shift the 64-bit lane right by 32, movd again for the high
	     dword.  */
	  rtx vcopy = gen_reg_rtx (V2DImode);
	  emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0));
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
			  gen_rtx_SUBREG (SImode, vcopy, 0));
	  emit_move_insn (vcopy,
			  gen_rtx_LSHIFTRT (V2DImode,
					    vcopy, GEN_INT (32)));
	  emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
			  gen_rtx_SUBREG (SImode, vcopy, 0));
	}
    }
  else
    /* Straightforward single inter-unit move.  */
    emit_move_insn (dst, src);

  rtx_insn *seq = end_sequence ();
  emit_conversion_insns (seq, insn);

  if (dump_file)
    fprintf (dump_file,
	     " Copied r%d to a scalar register r%d for insn %d\n",
	     REGNO (src), REGNO (dst), INSN_UID (insn));
}
1067 :
1068 : /* Helper function to convert immediate constant X to vmode. */
1069 : static rtx
1070 36524 : smode_convert_cst (rtx x, enum machine_mode vmode)
1071 : {
1072 : /* Prefer all ones vector in case of -1. */
1073 36524 : if (constm1_operand (x, GET_MODE (x)))
1074 623 : return CONSTM1_RTX (vmode);
1075 :
1076 35901 : unsigned n = GET_MODE_NUNITS (vmode);
1077 35901 : rtx *v = XALLOCAVEC (rtx, n);
1078 35901 : v[0] = x;
1079 41641 : for (unsigned i = 1; i < n; ++i)
1080 5740 : v[i] = const0_rtx;
1081 35901 : return gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
1082 : }
1083 :
/* Convert operand OP in INSN.  We should handle
   memory operands and uninitialized registers.
   All other register uses are converted during
   registers conversion.

   OP is rewritten in place to an equivalent vmode expression; any
   auxiliary loads are emitted before INSN.  */

void
scalar_chain::convert_op (rtx *op, rtx_insn *insn)
{
  rtx tmp;

  /* Already in vector form — nothing to do.  */
  if (GET_MODE (*op) == V1TImode)
    return;

  /* Unshare before mutating in place.  */
  *op = copy_rtx_if_shared (*op);

  if (GET_CODE (*op) == NOT
      || GET_CODE (*op) == ASHIFT)
    {
      /* Recurse into the inner operand, then retag the operation.  */
      convert_op (&XEXP (*op, 0), insn);
      PUT_MODE (*op, vmode);
    }
  else if (MEM_P (*op))
    {
      rtx_insn *movabs = NULL;

      /* Emit MOVABS to load from a 64-bit absolute address to a GPR.  */
      if (!memory_operand (*op, GET_MODE (*op)))
	{
	  tmp = gen_reg_rtx (GET_MODE (*op));
	  movabs = emit_insn_before (gen_rtx_SET (tmp, *op), insn);

	  *op = tmp;
	}

      /* Preload the memory operand into a fresh vector register.  */
      tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (GET_MODE (*op)), 0);

      rtx_insn *eh_insn
	= emit_insn_before (gen_rtx_SET (copy_rtx (tmp),
					 gen_gpr_to_xmm_move_src (vmode, *op)),
			    insn);

      if (cfun->can_throw_non_call_exceptions)
	{
	  /* Handle REG_EH_REGION note: the load we emitted (or the
	     movabs, if one was needed) now carries the trapping memory
	     access, so it inherits INSN's EH region.  */
	  rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
	  if (note)
	    {
	      if (movabs)
		eh_insn = movabs;
	      control_flow_insns.safe_push (eh_insn);
	      add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
	    }
	}

      *op = tmp;

      if (dump_file)
	fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
		 INSN_UID (insn), reg_or_subregno (tmp));
    }
  else if (REG_P (*op))
    /* An uninitialized register use: just view it in vector mode.  */
    *op = gen_rtx_SUBREG (vmode, *op, 0);
  else if (CONST_SCALAR_INT_P (*op))
    {
      /* Materialize the immediate as a vector constant; non-standard
	 SSE constants must be loaded from the constant pool.  */
      rtx vec_cst = smode_convert_cst (*op, vmode);

      if (!standard_sse_constant_p (vec_cst, vmode))
	{
	  start_sequence ();
	  vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
	  rtx_insn *seq = end_sequence ();
	  emit_insn_before (seq, insn);
	}

      tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);

      emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
      *op = tmp;
    }
  else
    {
      /* Anything else must already be a vmode SUBREG produced by an
	 earlier conversion.  */
      gcc_assert (SUBREG_P (*op));
      gcc_assert (GET_MODE (*op) == vmode);
    }
}
1169 :
/* Convert CCZmode COMPARE to vector mode.

   OP1 and OP2 are the operands of the COMPARE in INSN.  Returns the
   replacement SET_SRC: a ptest-style UNSPEC in CCZmode.  Helper insns
   are emitted before INSN.  */

rtx
scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
{
  rtx src, tmp;

  /* Handle any REG_EQUAL notes: rewrite a matching CCZmode COMPARE
     note's constant operand into vector form; otherwise drop the
     note.  */
  tmp = find_reg_equal_equiv_note (insn);
  if (tmp)
    {
      if (GET_CODE (XEXP (tmp, 0)) == COMPARE
	  && GET_MODE (XEXP (tmp, 0)) == CCZmode
	  && REG_P (XEXP (XEXP (tmp, 0), 0)))
	{
	  rtx *op = &XEXP (XEXP (tmp, 0), 1);
	  if (CONST_SCALAR_INT_P (*op))
	    {
	      if (constm1_operand (*op, GET_MODE (*op)))
		*op = CONSTM1_RTX (vmode);
	      else
		{
		  /* Element 0 is the constant, upper elements zero —
		     same layout as smode_convert_cst.  */
		  unsigned n = GET_MODE_NUNITS (vmode);
		  rtx *v = XALLOCAVEC (rtx, n);
		  v[0] = *op;
		  for (unsigned i = 1; i < n; ++i)
		    v[i] = const0_rtx;
		  *op = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
		}
	      tmp = NULL_RTX;
	    }
	  else if (REG_P (*op))
	    tmp = NULL_RTX;
	}

      /* TMP still set means the note could not be converted.  */
      if (tmp)
	remove_note (insn, tmp);
    }

  /* Comparison against anything other than zero, requires an XOR.  */
  if (op2 != const0_rtx)
    {
      convert_op (&op1, insn);
      convert_op (&op2, insn);
      /* If both operands are MEMs, explicitly load the OP1 into TMP.  */
      if (MEM_P (op1) && MEM_P (op2))
	{
	  tmp = gen_reg_rtx (vmode);
	  emit_insn_before (gen_rtx_SET (tmp, op1), insn);
	  src = tmp;
	}
      else
	src = op1;
      src = gen_rtx_XOR (vmode, src, op2);
    }
  else if (GET_CODE (op1) == AND
	   && GET_CODE (XEXP (op1, 0)) == NOT)
    {
      /* (~a & b) == 0 — keep the ANDN form and ptest it directly.  */
      rtx op11 = XEXP (XEXP (op1, 0), 0);
      rtx op12 = XEXP (op1, 1);
      convert_op (&op11, insn);
      convert_op (&op12, insn);
      if (!REG_P (op11))
	{
	  tmp = gen_reg_rtx (vmode);
	  emit_insn_before (gen_rtx_SET (tmp, op11), insn);
	  op11 = tmp;
	}
      src = gen_rtx_AND (vmode, gen_rtx_NOT (vmode, op11), op12);
    }
  else if (GET_CODE (op1) == AND)
    {
      /* (a & b) == 0 maps directly onto a two-operand ptest.  */
      rtx op11 = XEXP (op1, 0);
      rtx op12 = XEXP (op1, 1);
      convert_op (&op11, insn);
      convert_op (&op12, insn);
      if (!REG_P (op11))
	{
	  tmp = gen_reg_rtx (vmode);
	  emit_insn_before (gen_rtx_SET (tmp, op11), insn);
	  op11 = tmp;
	}
      return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, op11, op12),
			     UNSPEC_PTEST);
    }
  else
    {
      /* Plain a == 0.  */
      convert_op (&op1, insn);
      src = op1;
    }

  /* ptest wants a register operand.  */
  if (!REG_P (src))
    {
      tmp = gen_reg_rtx (vmode);
      emit_insn_before (gen_rtx_SET (tmp, src), insn);
      src = tmp;
    }

  /* For partial-width chains, replicate the payload across the vector
     before testing — presumably so the unused upper lanes cannot leak
     into the ptest result; TODO confirm against chain construction.  */
  if (vmode == V2DImode)
    {
      tmp = gen_reg_rtx (vmode);
      emit_insn_before (gen_vec_interleave_lowv2di (tmp, src, src), insn);
      src = tmp;
    }
  else if (vmode == V4SImode)
    {
      tmp = gen_reg_rtx (vmode);
      emit_insn_before (gen_sse2_pshufd (tmp, src, const0_rtx), insn);
      src = tmp;
    }

  return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, src, src), UNSPEC_PTEST);
}
1283 :
/* Helper function for converting INSN to vector mode.

   Handles the bookkeeping shared by all chain kinds: emitting scalar
   copies of this insn's definitions for uses outside the chain,
   fixing (or resetting) debug insns that reference them, and
   redirecting in-chain register uses to their vector twins.  */

void
scalar_chain::convert_insn_common (rtx_insn *insn)
{
  /* Generate copies for out-of-chain uses of defs and adjust debug uses.  */
  for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref))
    if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
      {
	/* Scan the use chain for a non-debug use that escapes the
	   chain (a memory use, or an insn not in this chain).  */
	df_link *use;
	for (use = DF_REF_CHAIN (ref); use; use = use->next)
	  if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref))
	      && (DF_REF_REG_MEM_P (use->ref)
		  || !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))))
	    break;
	if (use)
	  /* Escaping use exists: copy the vector def back to the
	     scalar register.  */
	  convert_reg (insn, DF_REF_REG (ref),
		       *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]));
	else if (MAY_HAVE_DEBUG_BIND_INSNS)
	  {
	    /* If we generated a scalar copy we can leave debug-insns
	       as-is, if not, we have to adjust them.  */
	    auto_vec<rtx_insn *, 5> to_reset_debug_insns;
	    for (use = DF_REF_CHAIN (ref); use; use = use->next)
	      if (DEBUG_INSN_P (DF_REF_INSN (use->ref)))
		{
		  rtx_insn *debug_insn = DF_REF_INSN (use->ref);
		  /* If there's a reaching definition outside of the
		     chain we have to reset.  */
		  df_link *def;
		  for (def = DF_REF_CHAIN (use->ref); def; def = def->next)
		    if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref)))
		      break;
		  if (def)
		    to_reset_debug_insns.safe_push (debug_insn);
		  else
		    {
		      /* All reaching defs are in the chain: point the
			 debug use at the vector twin.  */
		      *DF_REF_REAL_LOC (use->ref)
			= *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]);
		      df_insn_rescan (debug_insn);
		    }
		}
	    /* Have to do the reset outside of the DF_CHAIN walk to not
	       disrupt it.  */
	    while (!to_reset_debug_insns.is_empty ())
	      {
		rtx_insn *debug_insn = to_reset_debug_insns.pop ();
		INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC ();
		df_insn_rescan_debug_internal (debug_insn);
	      }
	  }
      }

  /* Replace uses in this insn with the defs we use in the chain.  */
  for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!DF_REF_REG_MEM_P (ref))
      if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]))
	{
	  /* Also update a corresponding REG_DEAD note.  */
	  rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref));
	  if (note)
	    XEXP (note, 0) = *vreg;
	  *DF_REF_REAL_LOC (ref) = *vreg;
	}
}
1349 :
/* Convert INSN which is an SImode or DImode rotation by a constant
   to vector mode.  CODE is either ROTATE or ROTATERT with operands
   OP0 and OP1.  Returns the SET_SRC of the last instruction in the
   resulting sequence, which is emitted before INSN.

   Left rotates are first normalized into right rotates (bits = width
   - bits) so every branch below only synthesizes a right rotate.  The
   pshufd/pshuflw immediates below encode element permutations — see
   the SSE shuffle encodings; not re-derived here.  */

rtx
general_scalar_chain::convert_rotate (enum rtx_code code, rtx op0, rtx op1,
				      rtx_insn *insn)
{
  int bits = INTVAL (op1);
  rtx pat, result;

  convert_op (&op0, insn);
  /* Rotating by zero is the identity.  */
  if (bits == 0)
    return op0;

  if (smode == DImode)
    {
      if (code == ROTATE)
	bits = 64 - bits;
      if (bits == 32)
	{
	  /* Half-width rotate: a single dword shuffle suffices.  */
	  rtx tmp1 = gen_reg_rtx (V4SImode);
	  pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
				 GEN_INT (225));
	  emit_insn_before (pat, insn);
	  result = gen_lowpart (V2DImode, tmp1);
	}
      else if (TARGET_AVX512VL)
	/* AVX512VL has native vector rotates (vprolq/vprorq).  */
	result = simplify_gen_binary (code, V2DImode, op0, op1);
      else if (bits == 16 || bits == 48)
	{
	  /* Word-granular rotate via a low-word shuffle.  */
	  rtx tmp1 = gen_reg_rtx (V8HImode);
	  pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0),
				  GEN_INT (bits == 16 ? 57 : 147));
	  emit_insn_before (pat, insn);
	  result = gen_lowpart (V2DImode, tmp1);
	}
      else if ((bits & 0x07) == 0)
	{
	  /* Byte-granular rotate: duplicate the qword, then shift the
	     128-bit lane right by BITS.  */
	  rtx tmp1 = gen_reg_rtx (V4SImode);
	  pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
				 GEN_INT (68));
	  emit_insn_before (pat, insn);
	  rtx tmp2 = gen_reg_rtx (V1TImode);
	  pat = gen_sse2_lshrv1ti3 (tmp2, gen_lowpart (V1TImode, tmp1),
				    GEN_INT (bits));
	  emit_insn_before (pat, insn);
	  result = gen_lowpart (V2DImode, tmp2);
	}
      else
	{
	  /* General case: spread the halves, shift each by BITS mod 32,
	     then pick the dwords that recombine into the rotate.  */
	  rtx tmp1 = gen_reg_rtx (V4SImode);
	  pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
				 GEN_INT (20));
	  emit_insn_before (pat, insn);
	  rtx tmp2 = gen_reg_rtx (V2DImode);
	  pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
			       GEN_INT (bits & 31));
	  emit_insn_before (pat, insn);
	  rtx tmp3 = gen_reg_rtx (V4SImode);
	  pat = gen_sse2_pshufd (tmp3, gen_lowpart (V4SImode, tmp2),
				 GEN_INT (bits > 32 ? 34 : 136));
	  emit_insn_before (pat, insn);
	  result = gen_lowpart (V2DImode, tmp3);
	}
    }
  else if (bits == 16)
    {
      /* SImode rotate by 16: swap the two words with pshuflw.  */
      rtx tmp1 = gen_reg_rtx (V8HImode);
      pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0), GEN_INT (225));
      emit_insn_before (pat, insn);
      result = gen_lowpart (V4SImode, tmp1);
    }
  else if (TARGET_AVX512VL)
    /* Native vector rotate (vprold/vprord).  */
    result = simplify_gen_binary (code, V4SImode, op0, op1);
  else
    {
      if (code == ROTATE)
	bits = 32 - bits;

      /* SImode general case: duplicate the dword into a qword lane and
	 shift it right so the wrapped bits land in the low dword.  */
      rtx tmp1 = gen_reg_rtx (V4SImode);
      emit_insn_before (gen_sse2_pshufd (tmp1, op0, GEN_INT (224)), insn);
      rtx tmp2 = gen_reg_rtx (V2DImode);
      pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
			   GEN_INT (bits));
      emit_insn_before (pat, insn);
      result = gen_lowpart (V4SImode, tmp2);
    }

  return result;
}
1442 :
/* Convert INSN to vector mode.

   INSN must be a single_set.  The destination and source are rewritten
   in place to vmode equivalents, the pattern is re-recognized, and df
   information is rescanned.  */

void
general_scalar_chain::convert_insn (rtx_insn *insn)
{
  rtx def_set = single_set (insn);
  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);
  rtx subreg;

  if (MEM_P (dst) && !REG_P (src))
    {
      /* There are no scalar integer instructions and therefore
	 temporary register usage is required.  */
      rtx tmp = gen_reg_rtx (smode);
      emit_conversion_insns (gen_move_insn (dst, tmp), insn);
      dst = gen_rtx_SUBREG (vmode, tmp, 0);
    }
  else if (REG_P (dst) && GET_MODE (dst) == smode)
    {
      /* Replace the definition with a SUBREG to the definition we
	 use inside the chain.  */
      rtx *vdef = defs_map.get (dst);
      if (vdef)
	dst = *vdef;
      dst = gen_rtx_SUBREG (vmode, dst, 0);
      /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
	 is a non-REG_P.  So kill those off.  */
      rtx note = find_reg_equal_equiv_note (insn);
      if (note)
	remove_note (insn, note);
    }

  switch (GET_CODE (src))
    {
    /* Binary operations: convert both operands, then retag the
       operation itself to vmode.  */
    case PLUS:
    case MINUS:
    case IOR:
    case XOR:
    case AND:
    case SMAX:
    case SMIN:
    case UMAX:
    case UMIN:
      convert_op (&XEXP (src, 1), insn);
      /* FALLTHRU */

    /* Unary ops and shifts only need operand 0 converted.  */
    case ABS:
    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      convert_op (&XEXP (src, 0), insn);
      PUT_MODE (src, vmode);
      break;

    case ROTATE:
    case ROTATERT:
      /* Constant rotates need a multi-insn shuffle sequence.  */
      src = convert_rotate (GET_CODE (src), XEXP (src, 0), XEXP (src, 1),
			    insn);
      break;

    case NEG:
      src = XEXP (src, 0);

      /* Combined (neg (abs ...)): materialize the ABS first.  */
      if (GET_CODE (src) == ABS)
	{
	  src = XEXP (src, 0);
	  convert_op (&src, insn);
	  subreg = gen_reg_rtx (vmode);
	  emit_insn_before (gen_rtx_SET (subreg,
					 gen_rtx_ABS (vmode, src)), insn);
	  src = subreg;
	}
      else
	convert_op (&src, insn);

      /* Negate as 0 - x, since there is no vector negate insn.  */
      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
      src = gen_rtx_MINUS (vmode, subreg, src);
      break;

    case NOT:
      /* Complement as x ^ -1, since there is no vector one's
	 complement insn.  */
      src = XEXP (src, 0);
      convert_op (&src, insn);
      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
      src = gen_rtx_XOR (vmode, src, subreg);
      break;

    case MEM:
      if (!REG_P (dst))
	convert_op (&src, insn);
      break;

    case REG:
      if (!MEM_P (dst))
	convert_op (&src, insn);
      break;

    case SUBREG:
      gcc_assert (GET_MODE (src) == vmode);
      break;

    case COMPARE:
      /* Flag-setting compare becomes a ptest UNSPEC against the
	 CCZmode flags register.  */
      dst = gen_rtx_REG (CCZmode, FLAGS_REG);
      src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
      break;

    case CONST_INT:
      convert_op (&src, insn);
      break;

    case VEC_SELECT:
      if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
	/* Extracting element 0 is free — just use the vector itself.  */
	src = XEXP (src, 0);
      else if (smode == DImode)
	{
	  /* Extract the high qword by shifting the 128-bit lane.  */
	  rtx tmp = gen_lowpart (V1TImode, XEXP (src, 0));
	  dst = gen_lowpart (V1TImode, dst);
	  src = gen_rtx_LSHIFTRT (V1TImode, tmp, GEN_INT (64));
	}
      else
	{
	  /* Broadcast-select the requested dword into all elements.  */
	  rtx tmp = XVECEXP (XEXP (src, 1), 0, 0);
	  rtvec vec = gen_rtvec (4, tmp, tmp, tmp, tmp);
	  rtx par = gen_rtx_PARALLEL (VOIDmode, vec);
	  src = gen_rtx_VEC_SELECT (vmode, XEXP (src, 0), par);
	}
      break;

    default:
      gcc_unreachable ();
    }

  SET_SRC (def_set) = src;
  SET_DEST (def_set) = dst;

  /* Drop possible dead definitions.  */
  PATTERN (insn) = def_set;

  INSN_CODE (insn) = -1;
  int patt = recog_memoized (insn);
  if (patt == -1)
    fatal_insn_not_found (insn);
  df_insn_rescan (insn);
}
1589 :
1590 : /* Helper function to compute gain for loading an immediate constant.
1591 : Typically, two movabsq for TImode vs. vmovdqa for V1TImode, but
1592 : with numerous special cases. */
1593 :
1594 : static int
1595 8 : timode_immed_const_gain (rtx cst, basic_block bb)
1596 : {
1597 : /* movabsq vs. movabsq+vmovq+vunpacklqdq. */
1598 8 : if (CONST_WIDE_INT_P (cst)
1599 5 : && CONST_WIDE_INT_NUNITS (cst) == 2
1600 13 : && CONST_WIDE_INT_ELT (cst, 0) == CONST_WIDE_INT_ELT (cst, 1))
1601 0 : return optimize_bb_for_size_p (bb) ? -COSTS_N_BYTES (9)
1602 : : -COSTS_N_INSNS (2);
1603 : /* 2x movabsq ~ vmovdqa. */
1604 : return 0;
1605 : }
1606 :
/* Return true if it's cost profitable for chain conversion.

   Walks every insn in the chain, estimating per-insn gain (positive
   means the V1TImode form is cheaper) either in insn counts (speed)
   or bytes (size), and accumulates both a raw and a BB-frequency
   weighted total.  */

bool
timode_scalar_chain::compute_convert_gain ()
{
  /* Assume that if we have to move TImode values between units,
     then transforming this chain isn't worth it.  */
  if (cost_sse_integer)
    return false;

  bitmap_iterator bi;
  unsigned insn_uid;

  /* Split ties to prefer V1TImode when not optimizing for size.  */
  int gain = optimize_size ? 0 : 1;
  sreal weighted_gain = 0;

  if (dump_file)
    fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);

  EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
    {
      rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
      rtx def_set = single_set (insn);
      rtx src = SET_SRC (def_set);
      rtx dst = SET_DEST (def_set);
      HOST_WIDE_INT op1val;
      basic_block bb = BLOCK_FOR_INSN (insn);
      int scost, vcost;	/* Scalar vs. vector cost of this insn.  */
      int igain = 0;
      profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
      bool speed_p = optimize_bb_for_speed_p (bb);
      sreal bb_freq = bb->count.to_sreal_scale (entry_count);

      switch (GET_CODE (src))
	{
	case REG:
	  /* Register (or register-to-memory) move: one vector insn
	     replaces a pair of scalar moves.  */
	  if (!speed_p)
	    igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3);
	  else
	    igain = COSTS_N_INSNS (1);
	  break;

	case MEM:
	  /* Load: one vector load vs. two scalar loads.  */
	  igain = !speed_p ? COSTS_N_BYTES (7) : COSTS_N_INSNS (1);
	  break;

	case CONST_INT:
	  /* Storing a standard SSE constant to memory is cheaper in
	     vector form.  */
	  if (MEM_P (dst)
	      && standard_sse_constant_p (src, V1TImode))
	    igain = !speed_p ? COSTS_N_BYTES (11) : 1;
	  break;

	case CONST_WIDE_INT:
	  /* 2 x mov vs. vmovdqa.  */
	  if (MEM_P (dst))
	    igain = !speed_p ? COSTS_N_BYTES (3) : COSTS_N_INSNS (1);
	  break;

	case NOT:
	  /* NOT to memory needs an extra vector insn (no memory-dest
	     vector NOT).  */
	  if (MEM_P (dst))
	    igain = -COSTS_N_INSNS (1);
	  break;

	case AND:
	  if (!MEM_P (dst))
	    igain = COSTS_N_INSNS (1);
	  if (CONST_SCALAR_INT_P (XEXP (src, 1)))
	    igain += timode_immed_const_gain (XEXP (src, 1), bb);
	  break;

	case XOR:
	case IOR:
	  if (timode_concatdi_p (src))
	    {
	      /* vmovq;vpinsrq (11 bytes).  */
	      igain = speed_p ? -2 * ix86_cost->sse_to_integer
			      : -COSTS_N_BYTES (11);
	      break;
	    }
	  if (!MEM_P (dst))
	    igain = COSTS_N_INSNS (1);
	  if (CONST_SCALAR_INT_P (XEXP (src, 1)))
	    igain += timode_immed_const_gain (XEXP (src, 1), bb);
	  break;

	case PLUS:
	  if (timode_concatdi_p (src))
	    /* vmovq;vpinsrq (11 bytes).  */
	    igain = speed_p ? -2 * ix86_cost->sse_to_integer
			    : -COSTS_N_BYTES (11);
	  break;

	case ASHIFT:
	case LSHIFTRT:
	  /* See ix86_expand_v1ti_shift.  */
	  op1val = INTVAL (XEXP (src, 1));
	  if (!speed_p)
	    {
	      if (op1val == 64 || op1val == 65)
		scost = COSTS_N_BYTES (5);
	      else if (op1val >= 66)
		scost = COSTS_N_BYTES (6);
	      else if (op1val == 1)
		scost = COSTS_N_BYTES (8);
	      else
		scost = COSTS_N_BYTES (9);

	      if ((op1val & 7) == 0)
		vcost = COSTS_N_BYTES (5);
	      else if (op1val > 64)
		vcost = COSTS_N_BYTES (10);
	      else
		vcost = TARGET_AVX ? COSTS_N_BYTES (19) : COSTS_N_BYTES (23);
	    }
	  else
	    {
	      scost = COSTS_N_INSNS (2);
	      if ((op1val & 7) == 0)
		vcost = COSTS_N_INSNS (1);
	      else if (op1val > 64)
		vcost = COSTS_N_INSNS (2);
	      else
		vcost = TARGET_AVX ? COSTS_N_INSNS (4) : COSTS_N_INSNS (5);
	    }
	  igain = scost - vcost;
	  break;

	case ASHIFTRT:
	  /* See ix86_expand_v1ti_ashiftrt.  */
	  op1val = INTVAL (XEXP (src, 1));
	  if (!speed_p)
	    {
	      if (op1val == 64 || op1val == 127)
		scost = COSTS_N_BYTES (7);
	      else if (op1val == 1)
		scost = COSTS_N_BYTES (8);
	      else if (op1val == 65)
		scost = COSTS_N_BYTES (10);
	      else if (op1val >= 66)
		scost = COSTS_N_BYTES (11);
	      else
		scost = COSTS_N_BYTES (9);

	      if (op1val == 127)
		vcost = COSTS_N_BYTES (10);
	      else if (op1val == 64)
		vcost = COSTS_N_BYTES (14);
	      else if (op1val == 96)
		vcost = COSTS_N_BYTES (18);
	      else if (op1val >= 111)
		vcost = COSTS_N_BYTES (15);
	      else if (TARGET_AVX2 && op1val == 32)
		vcost = COSTS_N_BYTES (16);
	      else if (TARGET_SSE4_1 && op1val == 32)
		vcost = COSTS_N_BYTES (20);
	      else if (op1val >= 96)
		vcost = COSTS_N_BYTES (23);
	      else if ((op1val & 7) == 0)
		vcost = COSTS_N_BYTES (28);
	      else if (TARGET_AVX2 && op1val < 32)
		vcost = COSTS_N_BYTES (30);
	      else if (op1val == 1 || op1val >= 64)
		vcost = COSTS_N_BYTES (42);
	      else
		vcost = COSTS_N_BYTES (47);
	    }
	  else
	    {
	      if (op1val >= 65 && op1val <= 126)
		scost = COSTS_N_INSNS (3);
	      else
		scost = COSTS_N_INSNS (2);

	      if (op1val == 127)
		vcost = COSTS_N_INSNS (2);
	      else if (op1val == 64)
		vcost = COSTS_N_INSNS (3);
	      else if (op1val == 96)
		vcost = COSTS_N_INSNS (3);
	      else if (op1val >= 111)
		vcost = COSTS_N_INSNS (3);
	      else if (TARGET_SSE4_1 && op1val == 32)
		vcost = COSTS_N_INSNS (3);
	      else if (TARGET_SSE4_1
		       && (op1val == 8 || op1val == 16 || op1val == 24))
		vcost = COSTS_N_INSNS (3);
	      else if (op1val >= 96)
		vcost = COSTS_N_INSNS (4);
	      else if (TARGET_SSE4_1 && (op1val == 28 || op1val == 80))
		vcost = COSTS_N_INSNS (4);
	      else if ((op1val & 7) == 0)
		vcost = COSTS_N_INSNS (5);
	      else if (TARGET_AVX2 && op1val < 32)
		vcost = COSTS_N_INSNS (6);
	      else if (TARGET_SSE4_1 && op1val < 15)
		vcost = COSTS_N_INSNS (6);
	      else if (op1val == 1 || op1val >= 64)
		vcost = COSTS_N_INSNS (8);
	      else
		vcost = COSTS_N_INSNS (9);
	    }
	  igain = scost - vcost;
	  break;

	case ROTATE:
	case ROTATERT:
	  /* See ix86_expand_v1ti_rotate.  */
	  op1val = INTVAL (XEXP (src, 1));
	  if (!speed_p)
	    {
	      scost = COSTS_N_BYTES (13);
	      if ((op1val & 31) == 0)
		vcost = COSTS_N_BYTES (5);
	      else if ((op1val & 7) == 0)
		vcost = TARGET_AVX ? COSTS_N_BYTES (13) : COSTS_N_BYTES (18);
	      else if (op1val > 32 && op1val < 96)
		vcost = COSTS_N_BYTES (24);
	      else
		vcost = COSTS_N_BYTES (19);
	    }
	  else
	    {
	      scost = COSTS_N_INSNS (3);
	      if ((op1val & 31) == 0)
		vcost = COSTS_N_INSNS (1);
	      else if ((op1val & 7) == 0)
		vcost = TARGET_AVX ? COSTS_N_INSNS (3) : COSTS_N_INSNS (4);
	      else if (op1val > 32 && op1val < 96)
		vcost = COSTS_N_INSNS (5);
	      else
		vcost = COSTS_N_INSNS (1);
	    }
	  igain = scost - vcost;
	  break;

	case COMPARE:
	  if (XEXP (src, 1) == const0_rtx)
	    {
	      if (GET_CODE (XEXP (src, 0)) == AND)
		/* and;and;or (9 bytes) vs. ptest (5 bytes).  */
		igain = !speed_p ? COSTS_N_BYTES (4) : COSTS_N_INSNS (2);
	      /* or (3 bytes) vs. ptest (5 bytes).  */
	      else if (!speed_p)
		igain = -COSTS_N_BYTES (2);
	    }
	  else if (XEXP (src, 1) == const1_rtx)
	    /* and;cmp -1 (7 bytes) vs. pcmpeqd;pxor;ptest (13 bytes).  */
	    igain = !speed_p ? -COSTS_N_BYTES (6) : -COSTS_N_INSNS (1);
	  break;

	case ZERO_EXTEND:
	  if (GET_MODE (XEXP (src, 0)) == DImode)
	    /* xor (2 bytes) vs. vmovq (5 bytes).  */
	    igain = speed_p ? COSTS_N_INSNS (1) - ix86_cost->sse_to_integer
			    : -COSTS_N_BYTES (3);
	  break;

	default:
	  break;
	}

      gain += igain;
      /* Weight speed-optimized blocks by their execution frequency.  */
      if (speed_p)
	weighted_gain += bb_freq * igain;

      if (igain != 0 && dump_file)
	{
	  fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for ",
		   igain, bb_freq.to_double ());
	  dump_insn_slim (dump_file, insn);
	}
    }

  if (dump_file)
    fprintf (dump_file, " Total gain: %d, weighted gain %.2f\n",
	     gain, weighted_gain.to_double ());

  /* Prefer the frequency-weighted verdict; fall back to the raw total
     when no weighted information accumulated in its favor.  */
  if (weighted_gain > (sreal) 0)
    return true;
  else
    return gain > 0;
}
1891 : /* Fix uses of converted REG in debug insns. */
1892 :
1893 : void
1894 422977 : timode_scalar_chain::fix_debug_reg_uses (rtx reg)
1895 : {
1896 422977 : if (!flag_var_tracking)
1897 : return;
1898 :
1899 371904 : df_ref ref, next;
1900 761648 : for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
1901 : {
1902 389744 : rtx_insn *insn = DF_REF_INSN (ref);
1903 : /* Make sure the next ref is for a different instruction,
1904 : so that we're not affected by the rescan. */
1905 389744 : next = DF_REF_NEXT_REG (ref);
1906 389744 : while (next && DF_REF_INSN (next) == insn)
1907 0 : next = DF_REF_NEXT_REG (next);
1908 :
1909 389744 : if (DEBUG_INSN_P (insn))
1910 : {
1911 : /* It may be a debug insn with a TImode variable in
1912 : register. */
1913 : bool changed = false;
1914 176 : for (; ref != next; ref = DF_REF_NEXT_REG (ref))
1915 : {
1916 88 : rtx *loc = DF_REF_LOC (ref);
1917 88 : if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
1918 : {
1919 84 : *loc = gen_rtx_SUBREG (TImode, *loc, 0);
1920 84 : changed = true;
1921 : }
1922 : }
1923 88 : if (changed)
1924 84 : df_insn_rescan (insn);
1925 : }
1926 : }
1927 : }
1928 :
1929 : /* Convert SRC, a *concatditi3 pattern, into a vec_concatv2di instruction.
1930 : Insert this before INSN, and return the result as a V1TImode subreg. */
1931 :
1932 : static rtx
1933 253 : timode_convert_concatdi (rtx src, rtx_insn *insn)
1934 : {
1935 253 : rtx hi, lo;
1936 253 : rtx tmp = gen_reg_rtx (V2DImode);
1937 253 : if (GET_CODE (XEXP (src, 0)) == ASHIFT)
1938 : {
1939 253 : hi = XEXP (XEXP (XEXP (src, 0), 0), 0);
1940 253 : lo = XEXP (XEXP (src, 1), 0);
1941 : }
1942 : else
1943 : {
1944 0 : hi = XEXP (XEXP (XEXP (src, 1), 0), 0);
1945 0 : lo = XEXP (XEXP (src, 0), 0);
1946 : }
1947 253 : emit_insn_before (gen_vec_concatv2di (tmp, lo, hi), insn);
1948 253 : return gen_rtx_SUBREG (V1TImode, tmp, 0);
1949 : }
1950 :
1951 : /* Convert INSN from TImode to V1T1mode. */
1952 :
/* Convert INSN from TImode to V1T1mode.  INSN is a single_set whose
   destination is a TImode register or memory; both the destination and
   the source are rewritten in place to their V1TImode equivalents,
   emitting any auxiliary instructions before INSN.  */

void
timode_scalar_chain::convert_insn (rtx_insn *insn)
{
  rtx def_set = single_set (insn);
  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);
  rtx tmp;

  /* First retype the destination.  */
  switch (GET_CODE (dst))
    {
    case REG:
      if (GET_MODE (dst) == TImode)
	{
	  PUT_MODE (dst, V1TImode);
	  fix_debug_reg_uses (dst);
	}
      if (GET_MODE (dst) == V1TImode)
	{
	  /* It might potentially be helpful to convert REG_EQUAL notes,
	     but for now we just remove them.  */
	  rtx note = find_reg_equal_equiv_note (insn);
	  if (note)
	    remove_note (insn, note);
	}
      break;
    case MEM:
      PUT_MODE (dst, V1TImode);
      break;

    default:
      gcc_unreachable ();
    }

  /* Now rewrite the source to match.  */
  switch (GET_CODE (src))
    {
    case REG:
      if (GET_MODE (src) == TImode)
	{
	  PUT_MODE (src, V1TImode);
	  fix_debug_reg_uses (src);
	}
      break;

    case MEM:
      PUT_MODE (src, V1TImode);
      break;

    case CONST_WIDE_INT:
      if (NONDEBUG_INSN_P (insn))
	{
	  /* Since there are no instructions to store 128-bit constant,
	     temporary register usage is required.  */
	  bool use_move;
	  start_sequence ();
	  /* Prefer a broadcast if the constant allows one; otherwise
	     materialize the constant from the constant pool.  */
	  tmp = ix86_convert_const_wide_int_to_broadcast (TImode, src);
	  if (tmp)
	    {
	      src = lowpart_subreg (V1TImode, tmp, TImode);
	      use_move = true;
	    }
	  else
	    {
	      src = smode_convert_cst (src, V1TImode);
	      src = validize_mem (force_const_mem (V1TImode, src));
	      /* A store needs an intermediate register; a load can use
		 the pool MEM directly.  */
	      use_move = MEM_P (dst);
	    }
	  rtx_insn *seq = end_sequence ();
	  if (seq)
	    emit_insn_before (seq, insn);
	  if (use_move)
	    {
	      tmp = gen_reg_rtx (V1TImode);
	      emit_insn_before (gen_rtx_SET (tmp, src), insn);
	      src = tmp;
	    }
	}
      break;

    case CONST_INT:
      /* Only all-zeros/all-ones constants are candidates (checked by
	 timode_scalar_to_vector_candidate_p via standard_sse_constant_p).  */
      switch (standard_sse_constant_p (src, TImode))
	{
	case 1:
	  src = CONST0_RTX (GET_MODE (dst));
	  break;
	case 2:
	  src = CONSTM1_RTX (GET_MODE (dst));
	  break;
	default:
	  gcc_unreachable ();
	}
      if (MEM_P (dst))
	{
	  /* Stores of constants go through a register.  */
	  tmp = gen_reg_rtx (V1TImode);
	  emit_insn_before (gen_rtx_SET (tmp, src), insn);
	  src = tmp;
	}
      break;

    case AND:
      if (GET_CODE (XEXP (src, 0)) == NOT)
	{
	  /* andnot form: (and (not x) y).  */
	  convert_op (&XEXP (XEXP (src, 0), 0), insn);
	  convert_op (&XEXP (src, 1), insn);
	  PUT_MODE (XEXP (src, 0), V1TImode);
	  PUT_MODE (src, V1TImode);
	  break;
	}
      convert_op (&XEXP (src, 0), insn);
      convert_op (&XEXP (src, 1), insn);
      PUT_MODE (src, V1TImode);
      if (MEM_P (dst))
	{
	  tmp = gen_reg_rtx (V1TImode);
	  emit_insn_before (gen_rtx_SET (tmp, src), insn);
	  src = tmp;
	}
      break;

    case XOR:
    case IOR:
      /* A DImode-pair concatenation becomes a vec_concat.  */
      if (timode_concatdi_p (src))
	{
	  src = timode_convert_concatdi (src, insn);
	  break;
	}
      convert_op (&XEXP (src, 0), insn);
      convert_op (&XEXP (src, 1), insn);
      PUT_MODE (src, V1TImode);
      if (MEM_P (dst))
	{
	  tmp = gen_reg_rtx (V1TImode);
	  emit_insn_before (gen_rtx_SET (tmp, src), insn);
	  src = tmp;
	}
      break;

    case NOT:
      /* There is no vector one's-complement; implement as XOR with -1.  */
      src = XEXP (src, 0);
      convert_op (&src, insn);
      tmp = gen_reg_rtx (V1TImode);
      emit_insn_before (gen_move_insn (tmp, CONSTM1_RTX (V1TImode)), insn);
      src = gen_rtx_XOR (V1TImode, src, tmp);
      if (MEM_P (dst))
	{
	  tmp = gen_reg_rtx (V1TImode);
	  emit_insn_before (gen_rtx_SET (tmp, src), insn);
	  src = tmp;
	}
      break;

    case COMPARE:
      /* Equality comparisons become a vector ptest-style sequence
	 setting CCZ.  */
      dst = gen_rtx_REG (CCZmode, FLAGS_REG);
      src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
      break;

    case ASHIFT:
    case LSHIFTRT:
    case ASHIFTRT:
    case ROTATERT:
    case ROTATE:
      /* Shift count is a CONST_INT (checked by the candidate test);
	 only the shifted operand needs conversion.  */
      convert_op (&XEXP (src, 0), insn);
      PUT_MODE (src, V1TImode);
      break;

    case ZERO_EXTEND:
      if (GET_MODE (XEXP (src, 0)) == DImode)
	{
	  /* Convert to *vec_concatv2di_0.  */
	  rtx tmp = gen_reg_rtx (V2DImode);
	  rtx pat = gen_rtx_VEC_CONCAT (V2DImode, XEXP (src, 0), const0_rtx);
	  emit_insn_before (gen_move_insn (tmp, pat), insn);
	  src = gen_rtx_SUBREG (vmode, tmp, 0);
	}
      else
	gcc_unreachable ();
      break;

    case PLUS:
      /* PLUS is only a candidate as a DImode-pair concatenation.  */
      if (timode_concatdi_p (src))
	src = timode_convert_concatdi (src, insn);
      else
	gcc_unreachable ();
      break;

    default:
      gcc_unreachable ();
    }

  SET_SRC (def_set) = src;
  SET_DEST (def_set) = dst;

  /* Drop possible dead definitions.  */
  PATTERN (insn) = def_set;

  INSN_CODE (insn) = -1;
  recog_memoized (insn);
  df_insn_rescan (insn);
}
2151 :
2152 : /* Generate copies from defs used by the chain but not defined therein.
2153 : Also populates defs_map which is used later by convert_insn. */
2154 :
2155 : void
2156 636221 : scalar_chain::convert_registers ()
2157 : {
2158 636221 : bitmap_iterator bi;
2159 636221 : unsigned id;
2160 662357 : EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
2161 : {
2162 26136 : rtx chain_reg = gen_reg_rtx (smode);
2163 26136 : defs_map.put (regno_reg_rtx[id], chain_reg);
2164 : }
2165 644636 : EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
2166 21074 : for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
2167 12659 : if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
2168 8415 : make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
2169 636221 : }
2170 :
2171 : /* Convert whole chain creating required register
2172 : conversions and copies. */
2173 :
2174 : int
2175 636221 : scalar_chain::convert ()
2176 : {
2177 636221 : bitmap_iterator bi;
2178 636221 : unsigned id;
2179 636221 : int converted_insns = 0;
2180 :
2181 636221 : if (!dbg_cnt (stv_conversion))
2182 : return 0;
2183 :
2184 636221 : if (dump_file)
2185 0 : fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2186 :
2187 636221 : convert_registers ();
2188 :
2189 1962196 : EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2190 : {
2191 1325975 : rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
2192 1325975 : convert_insn_common (insn);
2193 1325975 : convert_insn (insn);
2194 1325975 : converted_insns++;
2195 : }
2196 :
2197 : return converted_insns;
2198 : }
2199 :
2200 : /* Return the SET expression if INSN doesn't reference hard register.
2201 : Return NULL if INSN uses or defines a hard register, excluding
2202 : pseudo register pushes, hard register uses in a memory address,
2203 : clobbers and flags definitions. */
2204 :
2205 : static rtx
2206 335086571 : pseudo_reg_set (rtx_insn *insn)
2207 : {
2208 335086571 : rtx set = single_set (insn);
2209 335086571 : if (!set)
2210 : return NULL;
2211 :
2212 : /* Check pseudo register push first. */
2213 134375852 : machine_mode mode = TARGET_64BIT ? TImode : DImode;
2214 134375852 : if (REG_P (SET_SRC (set))
2215 38048947 : && !HARD_REGISTER_P (SET_SRC (set))
2216 164067858 : && push_operand (SET_DEST (set), mode))
2217 : return set;
2218 :
2219 134122697 : df_ref ref;
2220 216937303 : FOR_EACH_INSN_DEF (ref, insn)
2221 119434042 : if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2222 64242673 : && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2223 169417210 : && DF_REF_REGNO (ref) != FLAGS_REG)
2224 : return NULL;
2225 :
2226 186944983 : FOR_EACH_INSN_USE (ref, insn)
2227 114655617 : if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2228 : return NULL;
2229 :
2230 : return set;
2231 : }
2232 :
2233 : /* Return true if the register REG is defined in a single DEF chain.
2234 : If it is defined in more than one DEF chains, we may not be able
2235 : to convert it in all chains. */
2236 :
2237 : static bool
2238 1147323 : single_def_chain_p (rtx reg)
2239 : {
2240 1147323 : df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
2241 1147323 : if (!ref)
2242 : return false;
2243 1147307 : return DF_REF_NEXT_REG (ref) == nullptr;
2244 : }
2245 :
2246 : /* Check if comparison INSN may be transformed into vector comparison.
2247 : Currently we transform equality/inequality checks which look like:
2248 : (set (reg:CCZ 17 flags) (compare:CCZ (reg:TI x) (reg:TI y))) */
2249 :
2250 : static bool
2251 12750555 : convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
2252 : {
2253 14159771 : if (mode != (TARGET_64BIT ? TImode : DImode))
2254 : return false;
2255 :
2256 4659163 : if (!TARGET_SSE4_1)
2257 : return false;
2258 :
2259 166622 : rtx def_set = single_set (insn);
2260 :
2261 166622 : gcc_assert (def_set);
2262 :
2263 166622 : rtx src = SET_SRC (def_set);
2264 166622 : rtx dst = SET_DEST (def_set);
2265 :
2266 166622 : gcc_assert (GET_CODE (src) == COMPARE);
2267 :
2268 166622 : if (!REG_P (dst)
2269 166622 : || REGNO (dst) != FLAGS_REG
2270 333244 : || GET_MODE (dst) != CCZmode)
2271 : return false;
2272 :
2273 116416 : rtx op1 = XEXP (src, 0);
2274 116416 : rtx op2 = XEXP (src, 1);
2275 :
2276 : /* *cmp<dwi>_doubleword. */
2277 116416 : if ((CONST_SCALAR_INT_P (op1)
2278 116416 : || ((REG_P (op1) || MEM_P (op1))
2279 114737 : && GET_MODE (op1) == mode))
2280 60 : && (CONST_SCALAR_INT_P (op2)
2281 12 : || ((REG_P (op2) || MEM_P (op2))
2282 10 : && GET_MODE (op2) == mode)))
2283 : return true;
2284 :
2285 : /* *testti_doubleword. */
2286 116358 : if (op2 == const0_rtx
2287 38416 : && GET_CODE (op1) == AND
2288 142 : && REG_P (XEXP (op1, 0)))
2289 : {
2290 142 : rtx op12 = XEXP (op1, 1);
2291 142 : return GET_MODE (XEXP (op1, 0)) == TImode
2292 142 : && (CONST_SCALAR_INT_P (op12)
2293 0 : || ((REG_P (op12) || MEM_P (op12))
2294 0 : && GET_MODE (op12) == TImode));
2295 : }
2296 :
2297 : /* *test<dwi>_not_doubleword. */
2298 116216 : if (op2 == const0_rtx
2299 38274 : && GET_CODE (op1) == AND
2300 0 : && GET_CODE (XEXP (op1, 0)) == NOT)
2301 : {
2302 0 : rtx op11 = XEXP (XEXP (op1, 0), 0);
2303 0 : rtx op12 = XEXP (op1, 1);
2304 0 : return (REG_P (op11) || MEM_P (op11))
2305 0 : && (REG_P (op12) || MEM_P (op12))
2306 0 : && GET_MODE (op11) == mode
2307 0 : && GET_MODE (op12) == mode;
2308 : }
2309 :
2310 : return false;
2311 : }
2312 :
2313 : /* The general version of scalar_to_vector_candidate_p. */
2314 :
2315 : static bool
2316 234484123 : general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
2317 : {
2318 234484123 : rtx def_set = pseudo_reg_set (insn);
2319 :
2320 234484123 : if (!def_set)
2321 : return false;
2322 :
2323 49064443 : rtx src = SET_SRC (def_set);
2324 49064443 : rtx dst = SET_DEST (def_set);
2325 :
2326 49064443 : if (GET_CODE (src) == COMPARE)
2327 8796000 : return convertible_comparison_p (insn, mode);
2328 :
2329 : /* We are interested in "mode" only. */
2330 40268443 : if ((GET_MODE (src) != mode
2331 27544208 : && !CONST_INT_P (src))
2332 17816088 : || GET_MODE (dst) != mode)
2333 : return false;
2334 :
2335 14983411 : if (!REG_P (dst) && !MEM_P (dst))
2336 : return false;
2337 :
2338 14727299 : switch (GET_CODE (src))
2339 : {
2340 520923 : case ASHIFT:
2341 520923 : case LSHIFTRT:
2342 520923 : case ASHIFTRT:
2343 520923 : case ROTATE:
2344 520923 : case ROTATERT:
2345 520923 : if (!CONST_INT_P (XEXP (src, 1))
2346 1005848 : || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
2347 : return false;
2348 :
2349 : /* Check for extend highpart case. */
2350 484921 : if (mode != DImode
2351 350124 : || GET_CODE (src) != ASHIFTRT
2352 76128 : || GET_CODE (XEXP (src, 0)) != ASHIFT)
2353 : break;
2354 :
2355 3640642 : src = XEXP (src, 0);
2356 : break;
2357 :
2358 78418 : case SMAX:
2359 78418 : case SMIN:
2360 78418 : case UMAX:
2361 78418 : case UMIN:
2362 78418 : if ((mode == DImode && !TARGET_AVX512VL)
2363 17615 : || (mode == SImode && !TARGET_SSE4_1))
2364 : return false;
2365 : /* Fallthru. */
2366 :
2367 3194873 : case AND:
2368 3194873 : case IOR:
2369 3194873 : case XOR:
2370 3194873 : case PLUS:
2371 3194873 : case MINUS:
2372 3194873 : if (!REG_P (XEXP (src, 1))
2373 : && !MEM_P (XEXP (src, 1))
2374 : && !CONST_INT_P (XEXP (src, 1)))
2375 : return false;
2376 :
2377 3103953 : if (GET_MODE (XEXP (src, 1)) != mode
2378 1817212 : && !CONST_INT_P (XEXP (src, 1)))
2379 : return false;
2380 :
2381 : /* Check for andnot case. */
2382 3103953 : if (GET_CODE (src) != AND
2383 177049 : || GET_CODE (XEXP (src, 0)) != NOT)
2384 : break;
2385 :
2386 3640642 : src = XEXP (src, 0);
2387 : /* FALLTHRU */
2388 :
2389 : case NOT:
2390 : break;
2391 :
2392 24421 : case NEG:
2393 : /* Check for nabs case. */
2394 24421 : if (GET_CODE (XEXP (src, 0)) != ABS)
2395 : break;
2396 :
2397 : src = XEXP (src, 0);
2398 : /* FALLTHRU */
2399 :
2400 2793 : case ABS:
2401 2793 : if ((mode == DImode && !TARGET_AVX512VL)
2402 1385 : || (mode == SImode && !TARGET_SSSE3))
2403 : return false;
2404 : break;
2405 :
2406 : case REG:
2407 : return true;
2408 :
2409 5935729 : case MEM:
2410 5935729 : case CONST_INT:
2411 5935729 : return REG_P (dst);
2412 :
2413 56175 : case VEC_SELECT:
2414 : /* Excluding MEM_P (dst) avoids intefering with vpextr[dq]. */
2415 56175 : return REG_P (dst)
2416 45733 : && REG_P (XEXP (src, 0))
2417 52628 : && GET_MODE (XEXP (src, 0)) == (mode == DImode ? V2DImode
2418 : : V4SImode)
2419 36134 : && GET_CODE (XEXP (src, 1)) == PARALLEL
2420 36134 : && XVECLEN (XEXP (src, 1), 0) == 1
2421 92309 : && CONST_INT_P (XVECEXP (XEXP (src, 1), 0, 0));
2422 :
2423 : default:
2424 : return false;
2425 : }
2426 :
2427 3640642 : if (!REG_P (XEXP (src, 0))
2428 : && !MEM_P (XEXP (src, 0))
2429 : && !CONST_INT_P (XEXP (src, 0)))
2430 : return false;
2431 :
2432 3335469 : if (GET_MODE (XEXP (src, 0)) != mode
2433 0 : && !CONST_INT_P (XEXP (src, 0)))
2434 : return false;
2435 :
2436 : return true;
2437 : }
2438 :
2439 : /* Check for a suitable TImode memory operand. */
2440 :
2441 : static bool
2442 1561 : timode_mem_p (rtx x)
2443 : {
2444 1561 : return MEM_P (x)
2445 1561 : && (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
2446 0 : || !misaligned_operand (x, TImode));
2447 : }
2448 :
2449 : /* The TImode version of scalar_to_vector_candidate_p. */
2450 :
2451 : static bool
2452 100602448 : timode_scalar_to_vector_candidate_p (rtx_insn *insn)
2453 : {
2454 100602448 : rtx def_set = pseudo_reg_set (insn);
2455 :
2456 100602448 : if (!def_set)
2457 : return false;
2458 :
2459 23478078 : rtx src = SET_SRC (def_set);
2460 23478078 : rtx dst = SET_DEST (def_set);
2461 :
2462 23478078 : if (GET_CODE (src) == COMPARE)
2463 3954555 : return convertible_comparison_p (insn, TImode);
2464 :
2465 19523523 : if (GET_MODE (dst) != TImode
2466 1190975 : || (GET_MODE (src) != TImode
2467 59632 : && !CONST_SCALAR_INT_P (src)))
2468 : return false;
2469 :
2470 1190975 : if (!REG_P (dst) && !MEM_P (dst))
2471 : return false;
2472 :
2473 1189522 : if (MEM_P (dst)
2474 529647 : && misaligned_operand (dst, TImode)
2475 1502968 : && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
2476 : return false;
2477 :
2478 1189517 : if (REG_P (dst) && !single_def_chain_p (dst))
2479 : return false;
2480 :
2481 1039438 : switch (GET_CODE (src))
2482 : {
2483 487448 : case REG:
2484 487448 : return single_def_chain_p (src);
2485 :
2486 : case CONST_WIDE_INT:
2487 : return true;
2488 :
2489 12567 : case CONST_INT:
2490 : /* ??? Verify performance impact before enabling CONST_INT for
2491 : __int128 store. */
2492 12567 : return standard_sse_constant_p (src, TImode);
2493 :
2494 444089 : case MEM:
2495 : /* Memory must be aligned or unaligned load is optimal. */
2496 444089 : return (REG_P (dst)
2497 444089 : && (!misaligned_operand (src, TImode)
2498 149142 : || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
2499 :
2500 3102 : case AND:
2501 3102 : if (!MEM_P (dst)
2502 3061 : && GET_CODE (XEXP (src, 0)) == NOT
2503 0 : && REG_P (XEXP (XEXP (src, 0), 0))
2504 3102 : && (REG_P (XEXP (src, 1))
2505 0 : || CONST_SCALAR_INT_P (XEXP (src, 1))
2506 0 : || timode_mem_p (XEXP (src, 1))))
2507 0 : return true;
2508 3102 : return (REG_P (XEXP (src, 0))
2509 46 : || timode_mem_p (XEXP (src, 0)))
2510 3148 : && (REG_P (XEXP (src, 1))
2511 1280 : || CONST_SCALAR_INT_P (XEXP (src, 1))
2512 35 : || timode_mem_p (XEXP (src, 1)));
2513 :
2514 13987 : case IOR:
2515 13987 : case XOR:
2516 13987 : if (timode_concatdi_p (src))
2517 : return true;
2518 2673 : return (REG_P (XEXP (src, 0))
2519 1433 : || timode_mem_p (XEXP (src, 0)))
2520 2690 : && (REG_P (XEXP (src, 1))
2521 267 : || CONST_SCALAR_INT_P (XEXP (src, 1))
2522 31 : || timode_mem_p (XEXP (src, 1)));
2523 :
2524 504 : case NOT:
2525 504 : return REG_P (XEXP (src, 0)) || timode_mem_p (XEXP (src, 0));
2526 :
2527 11647 : case ASHIFT:
2528 11647 : case LSHIFTRT:
2529 11647 : case ASHIFTRT:
2530 11647 : case ROTATERT:
2531 11647 : case ROTATE:
2532 : /* Handle shifts/rotates by integer constants between 0 and 127. */
2533 11647 : return REG_P (XEXP (src, 0))
2534 11615 : && CONST_INT_P (XEXP (src, 1))
2535 22921 : && (INTVAL (XEXP (src, 1)) & ~0x7f) == 0;
2536 :
2537 7031 : case PLUS:
2538 7031 : return timode_concatdi_p (src);
2539 :
2540 3730 : case ZERO_EXTEND:
2541 3730 : return REG_P (XEXP (src, 0))
2542 3730 : && GET_MODE (XEXP (src, 0)) == DImode;
2543 :
2544 : default:
2545 : return false;
2546 : }
2547 : }
2548 :
2549 : /* For a register REGNO, scan instructions for its defs and uses.
2550 : Put REGNO in REGS if a def or use isn't in CANDIDATES. */
2551 :
2552 : static void
2553 1273546 : timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
2554 : unsigned int regno)
2555 : {
2556 : /* Do nothing if REGNO is already in REGS or is a hard reg. */
2557 1273546 : if (bitmap_bit_p (regs, regno)
2558 1273546 : || HARD_REGISTER_NUM_P (regno))
2559 : return;
2560 :
2561 1261873 : for (df_ref def = DF_REG_DEF_CHAIN (regno);
2562 2499494 : def;
2563 1237621 : def = DF_REF_NEXT_REG (def))
2564 : {
2565 1261853 : if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2566 : {
2567 24232 : if (dump_file)
2568 0 : fprintf (dump_file,
2569 : "r%d has non convertible def in insn %d\n",
2570 0 : regno, DF_REF_INSN_UID (def));
2571 :
2572 24232 : bitmap_set_bit (regs, regno);
2573 24232 : break;
2574 : }
2575 : }
2576 :
2577 1261873 : for (df_ref ref = DF_REG_USE_CHAIN (regno);
2578 2778641 : ref;
2579 1516768 : ref = DF_REF_NEXT_REG (ref))
2580 : {
2581 : /* Debug instructions are skipped. */
2582 1579553 : if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
2583 1579553 : && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
2584 : {
2585 62785 : if (dump_file)
2586 0 : fprintf (dump_file,
2587 : "r%d has non convertible use in insn %d\n",
2588 0 : regno, DF_REF_INSN_UID (ref));
2589 :
2590 62785 : bitmap_set_bit (regs, regno);
2591 62785 : break;
2592 : }
2593 : }
2594 : }
2595 :
2596 : /* For a given bitmap of insn UIDs scans all instructions and
2597 : remove insn from CANDIDATES in case it has both convertible
2598 : and not convertible definitions.
2599 :
2600 : All insns in a bitmap are conversion candidates according to
2601 : scalar_to_vector_candidate_p. Currently it implies all insns
2602 : are single_set. */
2603 :
2604 : static void
2605 829665 : timode_remove_non_convertible_regs (bitmap candidates)
2606 : {
2607 829665 : bitmap_iterator bi;
2608 829665 : unsigned id;
2609 829665 : bitmap regs = BITMAP_ALLOC (NULL);
2610 856651 : bool changed;
2611 :
2612 856651 : do {
2613 856651 : changed = false;
2614 2156228 : EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
2615 : {
2616 1299577 : rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
2617 1299577 : df_ref ref;
2618 :
2619 1949783 : FOR_EACH_INSN_DEF (ref, insn)
2620 650206 : if (!DF_REF_REG_MEM_P (ref)
2621 650206 : && GET_MODE (DF_REF_REG (ref)) == TImode)
2622 628917 : timode_check_non_convertible_regs (candidates, regs,
2623 : DF_REF_REGNO (ref));
2624 :
2625 3206651 : FOR_EACH_INSN_USE (ref, insn)
2626 1907074 : if (!DF_REF_REG_MEM_P (ref)
2627 674908 : && GET_MODE (DF_REF_REG (ref)) == TImode)
2628 644629 : timode_check_non_convertible_regs (candidates, regs,
2629 : DF_REF_REGNO (ref));
2630 : }
2631 :
2632 1044144 : EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi)
2633 : {
2634 187493 : for (df_ref def = DF_REG_DEF_CHAIN (id);
2635 380765 : def;
2636 193272 : def = DF_REF_NEXT_REG (def))
2637 193272 : if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2638 : {
2639 48825 : if (dump_file)
2640 0 : fprintf (dump_file, "Removing insn %d from candidates list\n",
2641 0 : DF_REF_INSN_UID (def));
2642 :
2643 48825 : bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
2644 48825 : changed = true;
2645 : }
2646 :
2647 187493 : for (df_ref ref = DF_REG_USE_CHAIN (id);
2648 495859 : ref;
2649 308366 : ref = DF_REF_NEXT_REG (ref))
2650 308366 : if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
2651 : {
2652 34159 : if (dump_file)
2653 0 : fprintf (dump_file, "Removing insn %d from candidates list\n",
2654 0 : DF_REF_INSN_UID (ref));
2655 :
2656 34159 : bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
2657 34159 : changed = true;
2658 : }
2659 : }
2660 : } while (changed);
2661 :
2662 829665 : BITMAP_FREE (regs);
2663 829665 : }
2664 :
2665 : /* Main STV pass function. Find and convert scalar
2666 : instructions into vector mode when profitable. */
2667 :
2668 : static unsigned int
2669 1785493 : convert_scalars_to_vector (bool timode_p)
2670 : {
2671 1785493 : basic_block bb;
2672 1785493 : int converted_insns = 0;
2673 1785493 : auto_vec<rtx_insn *> control_flow_insns;
2674 :
2675 1785493 : bitmap_obstack_initialize (NULL);
2676 1785493 : const machine_mode cand_mode[3] = { SImode, DImode, TImode };
2677 1785493 : const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
2678 5356479 : bitmap_head candidates[3]; /* { SImode, DImode, TImode } */
2679 7141972 : for (unsigned i = 0; i < 3; ++i)
2680 5356479 : bitmap_initialize (&candidates[i], &bitmap_default_obstack);
2681 :
2682 1785493 : calculate_dominance_info (CDI_DOMINATORS);
2683 1785493 : df_set_flags (DF_DEFER_INSN_RESCAN | DF_RD_PRUNE_DEAD_DEFS);
2684 1785493 : df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2685 1785493 : df_analyze ();
2686 :
2687 : /* Find all instructions we want to convert into vector mode. */
2688 1785493 : if (dump_file)
2689 44 : fprintf (dump_file, "Searching for mode conversion candidates...\n");
2690 :
2691 19627540 : FOR_EACH_BB_FN (bb, cfun)
2692 : {
2693 17842047 : rtx_insn *insn;
2694 237447419 : FOR_BB_INSNS (bb, insn)
2695 219605372 : if (timode_p
2696 219605372 : && timode_scalar_to_vector_candidate_p (insn))
2697 : {
2698 1003509 : if (dump_file)
2699 0 : fprintf (dump_file, " insn %d is marked as a TImode candidate\n",
2700 0 : INSN_UID (insn));
2701 :
2702 1003509 : bitmap_set_bit (&candidates[2], INSN_UID (insn));
2703 : }
2704 218601863 : else if (!timode_p)
2705 : {
2706 : /* Check {SI,DI}mode. */
2707 341948340 : for (unsigned i = 0; i <= 1; ++i)
2708 234484123 : if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
2709 : {
2710 11538707 : if (dump_file)
2711 554 : fprintf (dump_file, " insn %d is marked as a %s candidate\n",
2712 277 : INSN_UID (insn), i == 0 ? "SImode" : "DImode");
2713 :
2714 11538707 : bitmap_set_bit (&candidates[i], INSN_UID (insn));
2715 11538707 : break;
2716 : }
2717 : }
2718 : }
2719 :
2720 1785493 : if (timode_p)
2721 829665 : timode_remove_non_convertible_regs (&candidates[2]);
2722 :
2723 5666550 : for (unsigned i = 0; i <= 2; ++i)
2724 4508137 : if (!bitmap_empty_p (&candidates[i]))
2725 : break;
2726 3881057 : else if (i == 2 && dump_file)
2727 23 : fprintf (dump_file, "There are no candidates for optimization.\n");
2728 :
2729 7141972 : for (unsigned i = 0; i <= 2; ++i)
2730 : {
2731 5356479 : auto_bitmap disallowed;
2732 5356479 : bitmap_tree_view (&candidates[i]);
2733 17026775 : while (!bitmap_empty_p (&candidates[i]))
2734 : {
2735 6313817 : unsigned uid = bitmap_first_set_bit (&candidates[i]);
2736 6313817 : scalar_chain *chain;
2737 :
2738 6313817 : if (cand_mode[i] == TImode)
2739 465867 : chain = new timode_scalar_chain;
2740 : else
2741 5847950 : chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);
2742 :
2743 : /* Find instructions chain we want to convert to vector mode.
2744 : Check all uses and definitions to estimate all required
2745 : conversions. */
2746 6313817 : if (chain->build (&candidates[i], uid, disallowed))
2747 : {
2748 6308402 : if (chain->compute_convert_gain ())
2749 636221 : converted_insns += chain->convert ();
2750 5672181 : else if (dump_file)
2751 136 : fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2752 : chain->chain_id);
2753 : }
2754 :
2755 6313817 : rtx_insn* iter_insn;
2756 6313817 : unsigned int ii;
2757 6317405 : FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn)
2758 3588 : control_flow_insns.safe_push (iter_insn);
2759 :
2760 6313817 : delete chain;
2761 : }
2762 5356479 : }
2763 :
2764 1785493 : if (dump_file)
2765 44 : fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2766 :
2767 7141972 : for (unsigned i = 0; i <= 2; ++i)
2768 5356479 : bitmap_release (&candidates[i]);
2769 1785493 : bitmap_obstack_release (NULL);
2770 1785493 : df_process_deferred_rescans ();
2771 :
2772 : /* Conversion means we may have 128bit register spills/fills
2773 : which require aligned stack. */
2774 1785493 : if (converted_insns)
2775 : {
2776 103401 : if (crtl->stack_alignment_needed < 128)
2777 2395 : crtl->stack_alignment_needed = 128;
2778 103401 : if (crtl->stack_alignment_estimated < 128)
2779 219 : crtl->stack_alignment_estimated = 128;
2780 :
2781 103401 : crtl->stack_realign_needed
2782 103401 : = INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
2783 103401 : crtl->stack_realign_tried = crtl->stack_realign_needed;
2784 :
2785 103401 : crtl->stack_realign_processed = true;
2786 :
2787 103401 : if (!crtl->drap_reg)
2788 : {
2789 103224 : rtx drap_rtx = targetm.calls.get_drap_rtx ();
2790 :
2791 : /* stack_realign_drap and drap_rtx must match. */
2792 103224 : gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));
2793 :
2794 : /* Do nothing if NULL is returned,
2795 : which means DRAP is not needed. */
2796 103224 : if (drap_rtx != NULL)
2797 : {
2798 0 : crtl->args.internal_arg_pointer = drap_rtx;
2799 :
2800 : /* Call fixup_tail_calls to clean up
2801 : REG_EQUIV note if DRAP is needed. */
2802 0 : fixup_tail_calls ();
2803 : }
2804 : }
2805 :
2806 : /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2807 103401 : if (TARGET_64BIT)
2808 64906 : for (tree parm = DECL_ARGUMENTS (current_function_decl);
2809 177143 : parm; parm = DECL_CHAIN (parm))
2810 : {
2811 112237 : if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2812 96566 : continue;
2813 15671 : if (DECL_RTL_SET_P (parm)
2814 31342 : && GET_MODE (DECL_RTL (parm)) == V1TImode)
2815 : {
2816 522 : rtx r = DECL_RTL (parm);
2817 522 : if (REG_P (r))
2818 522 : SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2819 : }
2820 15671 : if (DECL_INCOMING_RTL (parm)
2821 15671 : && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2822 : {
2823 0 : rtx r = DECL_INCOMING_RTL (parm);
2824 0 : if (REG_P (r))
2825 0 : DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2826 : }
2827 : }
2828 :
2829 103401 : if (!control_flow_insns.is_empty ())
2830 : {
2831 1130 : free_dominance_info (CDI_DOMINATORS);
2832 :
2833 1130 : unsigned int i;
2834 1130 : rtx_insn* insn;
2835 5848 : FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
2836 3588 : if (control_flow_insn_p (insn))
2837 : {
2838 : /* Split the block after insn. There will be a fallthru
2839 : edge, which is OK so we keep it. We have to create
2840 : the exception edges ourselves. */
2841 3588 : bb = BLOCK_FOR_INSN (insn);
2842 3588 : split_block (bb, insn);
2843 3588 : rtl_make_eh_edge (NULL, bb, BB_END (bb));
2844 : }
2845 : }
2846 : }
2847 :
2848 1785493 : return 0;
2849 1785493 : }
2850 :
2851 : static unsigned int
2852 74536 : rest_of_handle_insert_vzeroupper (void)
2853 : {
2854 : /* vzeroupper instructions are inserted immediately after reload and
2855 : postreload_cse to clean up after it a little bit to account for possible
2856 : spills from 256bit or 512bit registers. The pass reuses mode switching
2857 : infrastructure by re-running mode insertion pass, so disable entities
2858 : that have already been processed. */
2859 521752 : for (int i = 0; i < MAX_386_ENTITIES; i++)
2860 447216 : ix86_optimize_mode_switching[i] = 0;
2861 :
2862 74536 : ix86_optimize_mode_switching[AVX_U128] = 1;
2863 :
2864 : /* Call optimize_mode_switching. */
2865 74536 : g->get_passes ()->execute_pass_mode_switching ();
2866 :
2867 : /* LRA removes all REG_DEAD/REG_UNUSED notes and normally they
2868 : reappear in the IL only at the start of pass_rtl_dse2, which does
2869 : df_note_add_problem (); df_analyze ();
2870 : The vzeroupper is scheduled after postreload_cse pass and mode
2871 : switching computes the notes as well, the problem is that e.g.
2872 : pass_gcse2 doesn't maintain the notes, see PR113059 and
2873 : PR112760. Remove the notes now to restore status quo ante
2874 : until we figure out how to maintain the notes or what else
2875 : to do. */
2876 74536 : basic_block bb;
2877 74536 : rtx_insn *insn;
2878 412477 : FOR_EACH_BB_FN (bb, cfun)
2879 4294160 : FOR_BB_INSNS (bb, insn)
2880 3956219 : if (NONDEBUG_INSN_P (insn))
2881 : {
2882 2106982 : rtx *pnote = ®_NOTES (insn);
2883 3908360 : while (*pnote != 0)
2884 : {
2885 1801378 : if (REG_NOTE_KIND (*pnote) == REG_DEAD
2886 822487 : || REG_NOTE_KIND (*pnote) == REG_UNUSED)
2887 1292531 : *pnote = XEXP (*pnote, 1);
2888 : else
2889 508847 : pnote = &XEXP (*pnote, 1);
2890 : }
2891 : }
2892 :
2893 74536 : df_remove_problem (df_note);
2894 74536 : df_analyze ();
2895 74536 : return 0;
2896 : }
2897 :
2898 : namespace {
2899 :
/* Pass descriptor for the vzeroupper insertion pass.  */
const pass_data pass_data_insert_vzeroupper =
{
  RTL_PASS, /* type */
  "vzeroupper", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
};
2912 :
/* RTL pass wrapper around rest_of_handle_insert_vzeroupper; only runs
   when both AVX and -mvzeroupper are enabled.  */
class pass_insert_vzeroupper : public rtl_opt_pass
{
public:
  pass_insert_vzeroupper(gcc::context *ctxt)
    : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
  {}

  /* opt_pass methods: */
  bool gate (function *) final override
  {
    return TARGET_AVX && TARGET_VZEROUPPER;
  }

  unsigned int execute (function *) final override
  {
    return rest_of_handle_insert_vzeroupper ();
  }

}; // class pass_insert_vzeroupper
2932 :
/* Pass descriptor for the scalar-to-vector (STV) pass.  */
const pass_data pass_data_stv =
{
  RTL_PASS, /* type */
  "stv", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  TODO_df_finish, /* todo_flags_finish */
};
2945 :
/* RTL pass wrapper around convert_scalars_to_vector.  The pass is
   instantiated twice via clone(); set_pass_param selects whether an
   instance handles TImode (true) or SImode/DImode (false) chains.  */
class pass_stv : public rtl_opt_pass
{
public:
  pass_stv (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_stv, ctxt),
      timode_p (false)
  {}

  /* opt_pass methods: */
  bool gate (function *) final override
  {
    /* TImode chains require 64-bit; both flavors need -mstv, SSE2 and
       at least -O2.  */
    return ((!timode_p || TARGET_64BIT)
	    && TARGET_STV && TARGET_SSE2 && optimize > 1);
  }

  unsigned int execute (function *) final override
  {
    return convert_scalars_to_vector (timode_p);
  }

  opt_pass *clone () final override
  {
    return new pass_stv (m_ctxt);
  }

  void set_pass_param (unsigned int n, bool param) final override
  {
    gcc_assert (n == 0);
    timode_p = param;
  }

private:
  /* True for the TImode instance of the pass.  */
  bool timode_p;
}; // class pass_stv
2980 :
2981 : } // anon namespace
2982 :
/* Factory for the vzeroupper insertion pass, called by the pass
   manager.  */
rtl_opt_pass *
make_pass_insert_vzeroupper (gcc::context *ctxt)
{
  return new pass_insert_vzeroupper (ctxt);
}
2988 :
/* Factory for the scalar-to-vector (STV) pass, called by the pass
   manager; the returned pass is later cloned and parameterized via
   set_pass_param.  */
rtl_opt_pass *
make_pass_stv (gcc::context *ctxt)
{
  return new pass_stv (ctxt);
}
2994 :
2995 : /* Inserting ENDBR and pseudo patchable-area instructions. */
2996 :
/* Worker for the endbr_and_patchable_area pass.  If NEED_ENDBR, emit
   ENDBR at the function entrance (unless the function can never be
   reached indirectly), after calls that may return via an indirect
   branch (setjmp-like or "indirect_return" callees), at targets of
   switch jump tables when -mcet-switch, and after preserved labels
   whose address may be taken.  If PATCHABLE_AREA_SIZE is nonzero,
   also emit the pseudo patchable-area insn at the entrance.  When
   profiling with -mfentry, entrance insns are queued for
   x86_function_profiler instead of being emitted here.  */
static void
rest_of_insert_endbr_and_patchable_area (bool need_endbr,
					 unsigned int patchable_area_size)
{
  rtx endbr;
  rtx_insn *insn;
  rtx_insn *endbr_insn = NULL;
  basic_block bb;

  if (need_endbr)
    {
      /* Currently emit EB if it's a tracking function, i.e. 'nocf_check'
	 is absent among function attributes.  Later an optimization will
	 be introduced to make analysis if an address of a static function
	 is taken.  A static function whose address is not taken will get
	 a nocf_check attribute.  This will allow to reduce the number of
	 EB.  */
      if (!lookup_attribute ("nocf_check",
			     TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
	  && (!flag_manual_endbr
	      || lookup_attribute ("cf_check",
				   DECL_ATTRIBUTES (cfun->decl)))
	  && (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
	      || ix86_cmodel == CM_LARGE
	      || ix86_cmodel == CM_LARGE_PIC
	      || flag_force_indirect_call
	      || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
		  && DECL_DLLIMPORT_P (cfun->decl))))
	{
	  if (crtl->profile && flag_fentry)
	    {
	      /* Queue ENDBR insertion to x86_function_profiler.
		 NB: Any patchable-area insn will be inserted after
		 ENDBR.  */
	      cfun->machine->insn_queued_at_entrance = TYPE_ENDBR;
	    }
	  else
	    {
	      /* Emit ENDBR before the first insn of the first real
		 basic block.  */
	      endbr = gen_nop_endbr ();
	      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
	      rtx_insn *insn = BB_HEAD (bb);
	      endbr_insn = emit_insn_before (endbr, insn);
	    }
	}
    }

  if (patchable_area_size)
    {
      if (crtl->profile && flag_fentry)
	{
	  /* Queue patchable-area insertion to x86_function_profiler.
	     NB: If there is a queued ENDBR, x86_function_profiler
	     will also handle patchable-area.  */
	  if (!cfun->machine->insn_queued_at_entrance)
	    cfun->machine->insn_queued_at_entrance = TYPE_PATCHABLE_AREA;
	}
      else
	{
	  /* The second operand records whether the patchable area sits
	     at the very function entry (no pre-entry area).  */
	  rtx patchable_area
	    = gen_patchable_area (GEN_INT (patchable_area_size),
				  GEN_INT (crtl->patch_area_entry == 0));
	  /* The patchable area goes after ENDBR if one was emitted,
	     otherwise at the head of the first basic block.  */
	  if (endbr_insn)
	    emit_insn_after (patchable_area, endbr_insn);
	  else
	    {
	      bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
	      insn = BB_HEAD (bb);
	      emit_insn_before (patchable_area, insn);
	    }
	}
    }

  if (!need_endbr)
    return;

  /* Scan the whole function for other places that need ENDBR.  */
  bb = 0;
  FOR_EACH_BB_FN (bb, cfun)
    {
      for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
	   insn = NEXT_INSN (insn))
	{
	  if (CALL_P (insn))
	    {
	      /* NEED_ENDBR is reused here: true when this call needs a
		 trailing ENDBR (setjmp-like calls first).  */
	      need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL;
	      if (!need_endbr && !SIBLING_CALL_P (insn))
		{
		  rtx call = get_call_rtx_from (insn);
		  rtx fnaddr = XEXP (call, 0);
		  tree fndecl = NULL_TREE;

		  /* Also generate ENDBRANCH for non-tail call which
		     may return via indirect branch.  */
		  if (SYMBOL_REF_P (XEXP (fnaddr, 0)))
		    fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
		  if (fndecl == NULL_TREE)
		    fndecl = MEM_EXPR (fnaddr);
		  /* Only function/method types can carry the attribute
		     we look for below.  */
		  if (fndecl
		      && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
		      && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
		    fndecl = NULL_TREE;
		  if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
		    {
		      tree fntype = TREE_TYPE (fndecl);
		      if (lookup_attribute ("indirect_return",
					    TYPE_ATTRIBUTES (fntype)))
			need_endbr = true;
		    }
		}
	      if (!need_endbr)
		continue;
	      /* Generate ENDBRANCH after CALL, which can return more than
		 twice, setjmp-like functions.  */

	      endbr = gen_nop_endbr ();
	      emit_insn_after_setloc (endbr, insn, INSN_LOCATION (insn));
	      continue;
	    }

	  if (JUMP_P (insn) && flag_cet_switch)
	    {
	      rtx target = JUMP_LABEL (insn);
	      if (target == NULL_RTX || ANY_RETURN_P (target))
		continue;

	      /* Check the jump is a switch table.  */
	      rtx_insn *label = as_a<rtx_insn *> (target);
	      rtx_insn *table = next_insn (label);
	      if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
		continue;

	      /* For the indirect jump find out all places it jumps and insert
		 ENDBRANCH there.  It should be done under a special flag to
		 control ENDBRANCH generation for switch stmts.  */
	      edge_iterator ei;
	      edge e;
	      basic_block dest_blk;

	      FOR_EACH_EDGE (e, ei, bb->succs)
		{
		  rtx_insn *insn;

		  /* Every successor block of the table jump starts with
		     a label; put ENDBR right after it.  */
		  dest_blk = e->dest;
		  insn = BB_HEAD (dest_blk);
		  gcc_assert (LABEL_P (insn));
		  endbr = gen_nop_endbr ();
		  emit_insn_after (endbr, insn);
		}
	      continue;
	    }

	  /* Preserved labels may be reached indirectly (e.g. their
	     address is taken), so they need ENDBR too.  */
	  if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
	    {
	      endbr = gen_nop_endbr ();
	      emit_insn_after (endbr, insn);
	      continue;
	    }
	}
    }

  return;
}
3158 :
3159 : namespace {
3160 :
/* Pass metadata for the ENDBR / patchable-area insertion pass.  */
const pass_data pass_data_insert_endbr_and_patchable_area =
{
  RTL_PASS, /* type.  */
  "endbr_and_patchable_area", /* name.  */
  OPTGROUP_NONE, /* optinfo_flags.  */
  TV_MACH_DEP, /* tv_id.  */
  0, /* properties_required.  */
  0, /* properties_provided.  */
  0, /* properties_destroyed.  */
  0, /* todo_flags_start.  */
  0, /* todo_flags_finish.  */
};
3173 :
/* RTL pass inserting ENDBR and pseudo patchable-area instructions at
   function entrances and other required places.  */
class pass_insert_endbr_and_patchable_area : public rtl_opt_pass
{
public:
  pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_insert_endbr_and_patchable_area, ctxt)
  {}

  /* opt_pass methods: */
  bool gate (function *) final override
  {
    /* NB: gate deliberately has side effects: it computes and caches
       need_endbr and patchable_area_size, which execute below passes
       to the worker.  */
    need_endbr = (flag_cf_protection & CF_BRANCH) != 0;
    patchable_area_size = crtl->patch_area_size - crtl->patch_area_entry;
    return need_endbr || patchable_area_size;
  }

  unsigned int execute (function *) final override
  {
    timevar_push (TV_MACH_DEP);
    rest_of_insert_endbr_and_patchable_area (need_endbr,
					     patchable_area_size);
    timevar_pop (TV_MACH_DEP);
    return 0;
  }

private:
  /* True when -fcf-protection=branch is enabled.  Set by gate.  */
  bool need_endbr;
  /* Size of the patchable area after the function entry point, in
     bytes.  Set by gate.  */
  unsigned int patchable_area_size;
}; // class pass_insert_endbr_and_patchable_area
3202 :
3203 : } // anon namespace
3204 :
/* Factory for the ENDBR / patchable-area insertion pass, called by
   the pass manager.  */
rtl_opt_pass *
make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
{
  return new pass_insert_endbr_and_patchable_area (ctxt);
}
3210 :
3211 : bool
3212 6104462 : ix86_rpad_gate ()
3213 : {
3214 6104462 : return (TARGET_AVX
3215 387221 : && TARGET_SSE_PARTIAL_REG_DEPENDENCY
3216 292326 : && TARGET_SSE_MATH
3217 292102 : && optimize
3218 6391349 : && optimize_function_for_speed_p (cfun));
3219 : }
3220 :
/* Kinds of redundant patterns recognized and deduplicated by the x86
   CSE machinery below.  */
enum x86_cse_kind
{
  X86_CSE_CONST0_VECTOR,	/* All-zero vector constant.  */
  X86_CSE_CONSTM1_VECTOR,	/* All-ones vector constant.  */
  X86_CSE_CONST_VECTOR,		/* Other CONST_VECTOR load.  */
  X86_CSE_VEC_DUP,		/* Broadcast (vec_duplicate) of a scalar.  */
  X86_CSE_TLS_GD,		/* TLS global-dynamic access (presumably
				   __tls_get_addr style -- see users).  */
  X86_CSE_TLS_LD_BASE,		/* TLS local-dynamic base access.  */
  X86_CSE_TLSDESC		/* UNSPEC_TLSDESC access.  */
};
3231 :
/* Description of a set of redundant instructions (broadcasts,
   constant vector loads or TLS accesses) which may be replaced by a
   single instruction placed at a dominating point.  */
struct redundant_pattern
{
  /* Bitmap of basic blocks with broadcast instructions.  */
  auto_bitmap bbs;
  /* Bitmap of broadcast instructions.  */
  auto_bitmap insns;
  /* The broadcast inner scalar.  */
  rtx val;
  /* The actual redundant source value for UNSPEC_TLSDESC.  */
  rtx tlsdesc_val;
  /* The inner scalar mode.  */
  machine_mode mode;
  /* The destination mode which can be changed to the integer mode of
     the same size.  */
  machine_mode dest_mode;
  /* The instruction which sets the inner scalar.  Nullptr if the inner
     scalar is applied to the whole function, instead of within the same
     block.  */
  rtx_insn *def_insn;
  /* The widest broadcast source.  */
  rtx broadcast_source;
  /* The widest broadcast register.  */
  rtx broadcast_reg;
  /* The basic block of the broadcast instruction.  */
  basic_block bb;
  /* The number of broadcast instructions with the same inner scalar.  */
  unsigned HOST_WIDE_INT count;
  /* The threshold of broadcast instructions with the same inner
     scalar.  */
  unsigned int threshold;
  /* The widest broadcast size in bytes.  */
  unsigned int size;
  /* Load kind.  */
  x86_cse_kind kind;
};
3267 :
3268 : /* Generate a vector set, DEST = SRC, at entry of the nearest dominator
3269 : for basic block map BBS, which is in the fake loop that contains the
3270 : whole function, so that there is only a single vector set in the
3271 : whole function. If not nullptr, LOAD is a pointer to the load. */
3272 :
/* See the comment above: emit a single SET of DEST from SRC at the
   entry of the nearest common dominator of the blocks in BBS.  LOAD,
   when non-null, describes the redundant pattern being replaced.  */
static void
ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
			      redundant_pattern *load = nullptr)
{
  basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
  /* For X86_CSE_VEC_DUP and X86_CSE_CONST_VECTOR, don't place the vector
     set outside of the loop to avoid extra spills.  */
  if (!load
      || (load->kind != X86_CSE_VEC_DUP
	  && load->kind != X86_CSE_CONST_VECTOR))
    {
      /* Hoist out of any enclosing loop: walk up to the dominator of
	 the loop header until we reach the fake whole-function loop
	 (whose latch is the exit block).  */
      while (bb->loop_father->latch
	     != EXIT_BLOCK_PTR_FOR_FN (cfun))
	bb = get_immediate_dominator (CDI_DOMINATORS,
				      bb->loop_father->header);
    }

  /* An integer-constant SRC is stored through an integer-mode SUBREG
     of the vector DEST (LOAD must be non-null here -- callers pass a
     CONST_INT only together with a pattern).  */
  if (CONST_INT_P (src))
    dest = gen_rtx_SUBREG (load->dest_mode, dest, 0);
  else if (CONST_VECTOR_P (src))
    {
      /* The only possible CONST_VECTORs of SRC are CONST0_RTX and
	 CONSTM1_RTX.  Otherwise,

	   rtx set = gen_rtx_SET (dest, src);

	 won't be a valid instruction.  CONST0_RTX always works.  It
	 can come from:

	 1. remove_partial_avx_dependency with LOAD == NULL.
	 2. X86_CSE_VEC_DUP with

	 (insn 48 58 16 3 (set (reg:V4HI 123)
	     (const_vector:V4HI [
		 (const_int 0 [0]) repeated x4
	       ])) 2065 {*movv4hi_internal} (nil))

	 3. X86_CSE_CONST0_VECTOR.
       */
      machine_mode mode = GET_MODE (dest);
      if (!(src == CONST0_RTX (mode)
	    || (src == CONSTM1_RTX (mode)
		&& load->kind == X86_CSE_CONSTM1_VECTOR)))
	gcc_unreachable ();
    }
  rtx set = gen_rtx_SET (dest, src);

  /* Find the first non-debug insn of BB; INSN becomes NULL when the
     block has none.  */
  rtx_insn *insn = BB_HEAD (bb);
  while (insn && !NONDEBUG_INSN_P (insn))
    {
      if (insn == BB_END (bb))
	{
	  insn = NULL;
	  break;
	}
      insn = NEXT_INSN (insn);
    }

  rtx_insn *set_insn;
  if (insn == BB_HEAD (bb))
    {
      set_insn = emit_insn_before (set, insn);
      if (dump_file)
	{
	  fprintf (dump_file, "\nPlace:\n\n");
	  print_rtl_single (dump_file, set_insn);
	  fprintf (dump_file, "\nbefore:\n\n");
	  print_rtl_single (dump_file, insn);
	  fprintf (dump_file, "\n");
	}
    }
  else
    {
      /* Emit after the insn preceding the first non-debug insn, or at
	 the block end when no non-debug insn was found.  */
      rtx_insn *after = insn ? PREV_INSN (insn) : BB_END (bb);
      set_insn = emit_insn_after (set, after);
      if (dump_file)
	{
	  fprintf (dump_file, "\nPlace:\n\n");
	  print_rtl_single (dump_file, set_insn);
	  fprintf (dump_file, "\nafter:\n\n");
	  print_rtl_single (dump_file, after);
	  fprintf (dump_file, "\n");
	}
    }

  if (load && load->kind == X86_CSE_VEC_DUP)
    {
      /* Get the source from LOAD as (reg:SI 99) in

	   (vec_duplicate:V4SI (reg:SI 99))

       */
      rtx inner_scalar = load->val;
      /* Set the source in (vec_duplicate:V4SI (reg:SI 99)).  */
      rtx reg = XEXP (src, 0);
      machine_mode reg_mode = GET_MODE (reg);
      if (reg_mode != GET_MODE (inner_scalar))
	{
	  if (REG_P (inner_scalar) || MEM_P (inner_scalar))
	    inner_scalar = gen_rtx_SUBREG (reg_mode, inner_scalar, 0);
	  else if (!SCALAR_INT_MODE_P (reg_mode))
	    {
	      /* For non-int load with integer constant, generate

		   (set (subreg:SI (reg/v:SF 105 [ f ]) 0)
			(const_int 1313486336 [0x4e4a3600]))

	       */
	      gcc_assert (CONST_INT_P (inner_scalar));
	      unsigned int bits = GET_MODE_BITSIZE (reg_mode);
	      machine_mode mode = int_mode_for_size (bits, 0).require ();
	      reg = gen_rtx_SUBREG (mode, reg, 0);
	    }
	}
      /* Materialize the broadcast source just before the vector
	 set.  */
      rtx set = gen_rtx_SET (reg, inner_scalar);
      insn = emit_insn_before (set, set_insn);
      if (dump_file)
	{
	  fprintf (dump_file, "\nAdd:\n\n");
	  print_rtl_single (dump_file, insn);
	  fprintf (dump_file, "\nbefore:\n\n");
	  print_rtl_single (dump_file, set_insn);
	  fprintf (dump_file, "\n");
	}
    }
}
3399 :
3400 : /* At entry of the nearest common dominator for basic blocks with
3401 : conversions/rcp/sqrt/rsqrt/round, generate a single
3402 : vxorps %xmmN, %xmmN, %xmmN
3403 : for all
3404 : vcvtss2sd op, %xmmN, %xmmX
3405 : vcvtsd2ss op, %xmmN, %xmmX
3406 : vcvtsi2ss op, %xmmN, %xmmX
3407 : vcvtsi2sd op, %xmmN, %xmmX
3408 :
3409 : NB: We want to generate only a single vxorps to cover the whole
3410 : function. The LCM algorithm isn't appropriate here since it may
3411 : place a vxorps inside the loop. */
3412 :
/* Worker for the rpad pass (see the comment above): rewrite insns
   flagged AVX_PARTIAL_XMM_UPDATE_TRUE to operate on a full XMM vector
   merged with a shared zeroed register, then place a single xor to
   materialize that zero at a dominating point.  Returns 0 (no extra
   TODO flags).  */
static unsigned int
remove_partial_avx_dependency (void)
{
  timevar_push (TV_MACH_DEP);

  bitmap_obstack_initialize (NULL);
  bitmap convert_bbs = BITMAP_ALLOC (NULL);

  basic_block bb;
  rtx_insn *insn, *set_insn;
  rtx set;
  /* The shared all-zero V4SF register, allocated lazily on the first
     conversion.  */
  rtx v4sf_const0 = NULL_RTX;

  auto_vec<rtx_insn *> control_flow_insns;

  /* We create invalid RTL initially so defer rescans.  */
  df_set_flags (DF_DEFER_INSN_RESCAN);

  FOR_EACH_BB_FN (bb, cfun)
    {
      FOR_BB_INSNS (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  set = single_set (insn);
	  if (!set)
	    continue;

	  /* Only insns whose insn attribute marks them as partial XMM
	     updaters are rewritten.  */
	  if (get_attr_avx_partial_xmm_update (insn)
	      != AVX_PARTIAL_XMM_UPDATE_TRUE)
	    continue;

	  /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
	     SI -> SF, SI -> DF, DI -> SF, DI -> DF, sqrt, rsqrt, rcp,
	     round, to vec_dup and vec_merge with subreg.  */
	  rtx src = SET_SRC (set);
	  rtx dest = SET_DEST (set);
	  machine_mode dest_mode = GET_MODE (dest);
	  bool convert_p = false;
	  switch (GET_CODE (src))
	    {
	    case FLOAT:
	    case FLOAT_EXTEND:
	    case FLOAT_TRUNCATE:
	    case UNSIGNED_FLOAT:
	      convert_p = true;
	      break;
	    default:
	      break;
	    }

	  /* Only handle conversion here.  */
	  machine_mode src_mode
	    = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
	  switch (src_mode)
	    {
	    case E_SFmode:
	    case E_DFmode:
	      /* Skip when the tuning either prefers vector FP converts
		 or has no partial-register FP convert dependency.  */
	      if (TARGET_USE_VECTOR_FP_CONVERTS
		  || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
		continue;
	      break;
	    case E_SImode:
	    case E_DImode:
	      if (TARGET_USE_VECTOR_CONVERTS
		  || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
		continue;
	      break;
	    case E_VOIDmode:
	      /* Non-conversion insns (sqrt/rsqrt/rcp/round) fall here.  */
	      gcc_assert (!convert_p);
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  if (!v4sf_const0)
	    v4sf_const0 = gen_reg_rtx (V4SFmode);

	  /* View the shared zero register in the vector mode matching
	     the destination's scalar mode.  */
	  rtx zero;
	  machine_mode dest_vecmode;
	  switch (dest_mode)
	    {
	    case E_HFmode:
	      dest_vecmode = V8HFmode;
	      zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0);
	      break;
	    case E_SFmode:
	      dest_vecmode = V4SFmode;
	      zero = v4sf_const0;
	      break;
	    case E_DFmode:
	      dest_vecmode = V2DFmode;
	      zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
	      break;
	    default:
	      gcc_unreachable ();
	    }

	  /* Change source to vector mode.  */
	  src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
	  src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
				   GEN_INT (HOST_WIDE_INT_1U));
	  /* Change destination to vector mode.  */
	  rtx vec = gen_reg_rtx (dest_vecmode);
	  /* Generate an XMM vector SET.  */
	  set = gen_rtx_SET (vec, src);
	  set_insn = emit_insn_before (set, insn);

	  if (cfun->can_throw_non_call_exceptions)
	    {
	      /* Handle REG_EH_REGION note.  */
	      rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
	      if (note)
		{
		  control_flow_insns.safe_push (set_insn);
		  add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
		}
	    }

	  /* The original insn now just extracts the scalar result from
	     the new vector register.  */
	  src = gen_rtx_SUBREG (dest_mode, vec, 0);
	  set = gen_rtx_SET (dest, src);

	  /* Drop possible dead definitions.  */
	  PATTERN (insn) = set;

	  INSN_CODE (insn) = -1;
	  recog_memoized (insn);
	  df_insn_rescan (insn);
	  bitmap_set_bit (convert_bbs, bb->index);
	}
    }

  if (v4sf_const0)
    {
      /* (Re-)discover loops so that bb->loop_father can be used in the
	 analysis below.  */
      calculate_dominance_info (CDI_DOMINATORS);
      loop_optimizer_init (AVOID_CFG_MODIFICATIONS);

      /* Emit one vxorps for the whole function at a point dominating
	 every converted block.  */
      ix86_place_single_vector_set (v4sf_const0,
				    CONST0_RTX (V4SFmode),
				    convert_bbs);

      loop_optimizer_finalize ();

      if (!control_flow_insns.is_empty ())
	{
	  free_dominance_info (CDI_DOMINATORS);

	  unsigned int i;
	  FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
	    if (control_flow_insn_p (insn))
	      {
		/* Split the block after insn.  There will be a fallthru
		   edge, which is OK so we keep it.  We have to create
		   the exception edges ourselves.  */
		bb = BLOCK_FOR_INSN (insn);
		split_block (bb, insn);
		rtl_make_eh_edge (NULL, bb, BB_END (bb));
	      }
	}
    }

  df_process_deferred_rescans ();
  df_clear_flags (DF_DEFER_INSN_RESCAN);
  bitmap_obstack_release (NULL);
  BITMAP_FREE (convert_bbs);

  timevar_pop (TV_MACH_DEP);
  return 0;
}
3585 :
3586 : namespace {
3587 :
/* Pass metadata for the remove-partial-avx-dependency (rpad) pass.  */
const pass_data pass_data_remove_partial_avx_dependency =
{
  RTL_PASS, /* type */
  "rpad", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
3600 :
3601 : class pass_remove_partial_avx_dependency : public rtl_opt_pass
3602 : {
3603 : public:
3604 288047 : pass_remove_partial_avx_dependency (gcc::context *ctxt)
3605 576094 : : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
3606 : {}
3607 :
3608 : /* opt_pass methods: */
3609 1474422 : bool gate (function *) final override
3610 : {
3611 1474422 : return ix86_rpad_gate ();
3612 : }
3613 :
3614 33373 : unsigned int execute (function *) final override
3615 : {
3616 33373 : return remove_partial_avx_dependency ();
3617 : }
3618 : }; // class pass_rpad
3619 :
3620 : } // anon namespace
3621 :
/* Factory for the remove-partial-avx-dependency (rpad) pass, called
   by the pass manager.  */
rtl_opt_pass *
make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
{
  return new pass_remove_partial_avx_dependency (ctxt);
}
3627 :
3628 : /* Return a machine mode suitable for vector SIZE with SMODE inner
3629 : mode. */
3630 :
3631 : static machine_mode
3632 63539 : ix86_get_vector_cse_mode (unsigned int size, machine_mode smode)
3633 : {
3634 : /* Use the inner scalar mode of vector broadcast source in:
3635 :
3636 : (set (reg:V8DF 394)
3637 : (vec_duplicate:V8DF (reg:V2DF 190 [ alpha ])))
3638 :
3639 : to compute the vector mode for broadcast from vector source.
3640 : */
3641 63539 : if (VECTOR_MODE_P (smode))
3642 30941 : smode = GET_MODE_INNER (smode);
3643 63539 : scalar_mode s_mode = as_a <scalar_mode> (smode);
3644 127078 : poly_uint64 nunits = size / GET_MODE_SIZE (smode);
3645 63539 : machine_mode mode = mode_for_vector (s_mode, nunits).require ();
3646 63539 : return mode;
3647 : }
3648 :
3649 : /* Replace the source operand of instructions in VECTOR_INSNS with
3650 : VECTOR_CONST in VECTOR_MODE. */
3651 :
/* See the comment above: rewrite every insn whose UID is set in
   VECTOR_INSNS so that its source becomes VECTOR_CONST (a register in
   VECTOR_MODE, or a CONST_INT), inserting SUBREGs or an extra move as
   needed.  SCALAR_MODE is the inner scalar mode of the pattern.  */
static void
replace_vector_const (machine_mode vector_mode, rtx vector_const,
		      auto_bitmap &vector_insns,
		      machine_mode scalar_mode)
{
  bitmap_iterator bi;
  unsigned int id;

  EXECUTE_IF_SET_IN_BITMAP (vector_insns, 0, id, bi)
    {
      rtx_insn *insn = DF_INSN_UID_GET (id)->insn;

      /* Get the single SET instruction.  */
      rtx set = single_set (insn);
      rtx src = SET_SRC (set);
      rtx dest = SET_DEST (set);
      machine_mode mode = GET_MODE (dest);

      rtx replace;
      /* Replace the source operand with VECTOR_CONST.  */
      if (SUBREG_P (src)
	  || mode == vector_mode
	  || CONST_INT_P (vector_const))
	replace = vector_const;
      else
	{
	  /* The destination mode differs from VECTOR_MODE; access
	     VECTOR_CONST through a SUBREG of the right size.  */
	  unsigned int size = GET_MODE_SIZE (mode);
	  if (size < ix86_regmode_natural_size (mode))
	    {
	      /* If the mode size is smaller than its natural size,
		 first insert an extra move with a QI vector SUBREG
		 of the same size to avoid validate_subreg failure.  */
	      machine_mode vmode
		= ix86_get_vector_cse_mode (size, scalar_mode);
	      rtx vreg;
	      if (mode == vmode)
		vreg = vector_const;
	      else
		{
		  vreg = gen_reg_rtx (vmode);
		  rtx vsubreg = gen_rtx_SUBREG (vmode, vector_const, 0);
		  rtx pat = gen_rtx_SET (vreg, vsubreg);
		  rtx_insn *vinsn = emit_insn_before (pat, insn);
		  if (dump_file)
		    {
		      fprintf (dump_file, "\nInsert an extra move:\n\n");
		      print_rtl_single (dump_file, vinsn);
		      fprintf (dump_file, "\nbefore:\n\n");
		      print_rtl_single (dump_file, insn);
		      fprintf (dump_file, "\n");
		    }
		}
	      replace = gen_rtx_SUBREG (mode, vreg, 0);
	    }
	  else
	    replace = gen_rtx_SUBREG (mode, vector_const, 0);
	}

      if (dump_file)
	{
	  fprintf (dump_file, "\nReplace:\n\n");
	  print_rtl_single (dump_file, insn);
	}
      SET_SRC (set) = replace;
      /* A CONST_INT source is stored through an integer SUBREG of the
	 vector destination.  */
      if (CONST_INT_P (replace))
	{
	  dest = gen_rtx_SUBREG (scalar_mode, dest, 0);
	  SET_DEST (set) = dest;
	}
      /* Drop possible dead definitions.  */
      PATTERN (insn) = set;
      INSN_CODE (insn) = -1;
      recog_memoized (insn);
      if (dump_file)
	{
	  fprintf (dump_file, "\nwith:\n\n");
	  print_rtl_single (dump_file, insn);
	  fprintf (dump_file, "\n");
	}
      df_insn_rescan (insn);
    }
}
3734 :
3735 : /* Return the inner scalar if OP is a broadcast, else return nullptr. */
3736 :
3737 : static rtx
3738 2203660 : ix86_broadcast_inner (rtx op, machine_mode mode,
3739 : machine_mode *scalar_mode_p,
3740 : x86_cse_kind *kind_p, rtx_insn **insn_p)
3741 : {
3742 2203660 : switch (standard_sse_constant_p (op, mode))
3743 : {
3744 113918 : case 1:
3745 113918 : *scalar_mode_p = QImode;
3746 113918 : *kind_p = X86_CSE_CONST0_VECTOR;
3747 113918 : *insn_p = nullptr;
3748 113918 : return const0_rtx;
3749 11355 : case 2:
3750 11355 : *scalar_mode_p = QImode;
3751 11355 : *kind_p = X86_CSE_CONSTM1_VECTOR;
3752 11355 : *insn_p = nullptr;
3753 11355 : return constm1_rtx;
3754 2078387 : default:
3755 2078387 : break;
3756 : }
3757 :
3758 2078387 : mode = GET_MODE (op);
3759 2078387 : int nunits = GET_MODE_NUNITS (mode);
3760 2078387 : if (nunits < 2)
3761 : return nullptr;
3762 :
3763 1606927 : bool const_vector_p = CONST_VECTOR_P (op);
3764 1606927 : bool duplicated = GET_CODE (op) == VEC_DUPLICATE;
3765 1606927 : rtx orig_op = op;
3766 1606927 : if (!const_vector_p)
3767 : {
3768 : /* Check CONST_VECTOR in REG_EQUAL note. */
3769 1606907 : rtx equal = find_reg_equal_equiv_note (*insn_p);
3770 1606907 : if (equal)
3771 : {
3772 346491 : equal = XEXP (equal, 0);
3773 346491 : const_vector_p = CONST_VECTOR_P (equal);
3774 : /* Use CONST_VECTOR in REG_EQUAL note. */
3775 346491 : if (const_vector_p)
3776 : {
3777 : /* Handle REG_EQUAL note in:
3778 :
3779 : (insn 7 5 12 2 (set (subreg:V8SI (reg:V4DI 100) 0)
3780 : (vec_duplicate:V8SI (reg:SI 102)))
3781 : (expr_list:REG_DEAD (reg:SI 102)
3782 : (expr_list:REG_EQUAL (const_vector:V4DI [
3783 : (const_int -1 [0xffffffffffffffff]) repeated x4]) (nil))))
3784 :
3785 : NB: Don't treat it as CONST_VECTOR since EQUAL isn't
3786 : supported by ISAs as in gcc.target/i386/pr40957.c. */
3787 262594 : if (GET_MODE (equal) != mode)
3788 : const_vector_p = false;
3789 : else
3790 1606927 : op = equal;
3791 : }
3792 : }
3793 : }
3794 :
3795 1606927 : machine_mode inner_mode = GET_MODE_INNER (mode);
3796 :
3797 1606927 : if (const_vector_p)
3798 : {
3799 525160 : bool int_load_p = GET_MODE_SIZE (mode) <= UNITS_PER_WORD;
3800 262580 : *kind_p = X86_CSE_CONST_VECTOR;
3801 262580 : if (int_load_p)
3802 : {
3803 : /* This CONST_VECTOR load can be converted to constant
3804 : integer load. */
3805 34808 : *scalar_mode_p = mode;
3806 34808 : *insn_p = nullptr;
3807 34808 : return op;
3808 : }
3809 :
3810 : /* This CONST_VECTOR is wider than the integer register. */
3811 227772 : rtx first = XVECEXP (op, 0, 0);
3812 :
3813 227772 : if (duplicated)
3814 : {
3815 : /* Check if CONST_VECTOR in REG_EQUAL note is duplicated in
3816 :
3817 : (insn 10 7 12 2 (set (reg:V8SI 128)
3818 : (vec_duplicate:V8SI (vec_select:V2SI (reg:V4SI 180)
3819 : (parallel [(const_int 0 [0])
3820 : (const_int 1 [0x1])]))))
3821 : (expr_list:REG_EQUAL (const_vector:V8SI [
3822 : (const_int 0 [0])
3823 : (const_int 34 [0x22])
3824 : (const_int 0 [0])
3825 : (const_int 34 [0x22])
3826 : (const_int 0 [0])
3827 : (const_int 34 [0x22])
3828 : (const_int 0 [0])
3829 : (const_int 34 [0x22])])(nil)))
3830 :
3831 : */
3832 :
3833 210802 : bool duplicated_const_vector = true;
3834 210802 : for (int i = 1; i < nunits; ++i)
3835 : {
3836 137963 : rtx tmp = XVECEXP (op, 0, i);
3837 137963 : if (!rtx_equal_p (tmp, first))
3838 : {
3839 : duplicated_const_vector = false;
3840 : break;
3841 : }
3842 : }
3843 :
3844 72855 : if (duplicated_const_vector)
3845 : {
3846 72839 : bool const_double_p = CONST_DOUBLE_P (first);
3847 : /* Force the floating point constant to memory. */
3848 72839 : if (const_double_p)
3849 5491 : first = validize_mem (force_const_mem (inner_mode, first));
3850 :
3851 72839 : if (const_double_p || CONST_INT_P (first))
3852 : {
3853 : /* Handle
3854 :
3855 : (insn 7 6 8 2 (set (reg:V4SF 99)
3856 : (vec_duplicate:V4SF (mem/u/c:SF (symbol_ref/u:DI ("*.LC2") [flags 0x2]) [0 S4 A32])))
3857 : (expr_list:REG_EQUAL (const_vector:V4SF [
3858 : (const_double:SF 3.4e+1 [0x0.88p+6]) repeated x4]) (nil)))
3859 :
3860 : and
3861 :
3862 : (insn 14 15 16 3 (set (reg:V4SI 116)
3863 : (vec_duplicate:V4SI (reg:SI 117)))
3864 : (expr_list:REG_EQUAL (const_vector:V4SI [
3865 : (const_int 34 [0x22]) repeated x4]) (nil)))
3866 :
3867 : */
3868 72839 : *kind_p = X86_CSE_VEC_DUP;
3869 72839 : *insn_p = nullptr;
3870 72839 : *scalar_mode_p = inner_mode;
3871 72839 : return first;
3872 : }
3873 : }
3874 :
3875 : op = orig_op;
3876 : }
3877 : else
3878 : {
3879 : /* Only native CONST_VECTOR is allowed. */
3880 154917 : if (orig_op != op)
3881 : return nullptr;
3882 :
3883 : /* Check if VEC_DUPLICATE can be used. */
3884 48 : for (int i = 1; i < nunits; ++i)
3885 : {
3886 48 : rtx tmp = XVECEXP (op, 0, i);
3887 : /* Vector duplicate value. */
3888 48 : if (!rtx_equal_p (tmp, first))
3889 : return nullptr;
3890 : }
3891 :
3892 : /* Use the inner mode to handle
3893 : (const_vector:V2QI [(const_int 0 [0]) repeated x2])
3894 : */
3895 0 : *scalar_mode_p = inner_mode;
3896 0 : *insn_p = nullptr;
3897 0 : return first;
3898 : }
3899 : }
3900 :
3901 1344363 : if (!duplicated)
3902 : return nullptr;
3903 :
3904 22503 : *kind_p = X86_CSE_VEC_DUP;
3905 :
3906 : /* Only
3907 :
3908 : (vec_duplicate:V4SI (reg:SI 99))
3909 : (vec_duplicate:V2DF (mem/u/c:DF (symbol_ref/u:DI ("*.LC1") [flags 0x2]) [0 S8 A64]))
3910 :
3911 : are supported. Set OP to the broadcast source by default. */
3912 22503 : op = XEXP (op, 0);
3913 22503 : rtx reg = op;
3914 22503 : if (SUBREG_P (op)
3915 401 : && SUBREG_BYTE (op) == 0
3916 22904 : && !paradoxical_subreg_p (op))
3917 401 : reg = SUBREG_REG (op);
3918 22503 : if (!REG_P (reg))
3919 : {
3920 2290 : if (MEM_P (op)
3921 2035 : && SYMBOL_REF_P (XEXP (op, 0))
3922 2524 : && CONSTANT_POOL_ADDRESS_P (XEXP (op, 0)))
3923 : {
3924 : /* Handle constant broadcast from memory. */
3925 11 : *scalar_mode_p = inner_mode;
3926 11 : *insn_p = nullptr;
3927 11 : return op;
3928 : }
3929 : return nullptr;
3930 : }
3931 :
3932 20213 : machine_mode orig_mode = mode;
3933 20213 : mode = GET_MODE (op);
3934 :
3935 : /* Only single def chain is supported. */
3936 20213 : df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
3937 20213 : if (!ref
3938 20212 : || DF_REF_IS_ARTIFICIAL (ref)
3939 20212 : || DF_REF_NEXT_REG (ref) != nullptr)
3940 : return nullptr;
3941 :
3942 14750 : rtx_insn *insn = DF_REF_INSN (ref);
3943 14750 : rtx set = single_set (insn);
3944 14750 : if (!set)
3945 : return nullptr;
3946 :
3947 14711 : rtx src = SET_SRC (set);
3948 :
3949 14711 : if (CONST_INT_P (src))
3950 : {
3951 : /* Handle sequences like
3952 :
3953 : (set (subreg:SI (reg/v:SF 105 [ f ]) 0)
3954 : (const_int 0 [0]))
3955 : (set (reg:V4SF 110)
3956 : (vec_duplicate:V4SF (reg/v:SF 105 [ f ])))
3957 :
3958 : and
3959 :
3960 : (set (reg:SI 99)
3961 : (const_int 34 [0x22]))
3962 : (set (reg:V4SI 98)
3963 : (vec_duplicate:V4SI (reg:SI 99)))
3964 :
3965 : Set *INSN_P to nullptr and return SET_SRC if SET_SRC is an
3966 : integer constant. */
3967 235 : op = src;
3968 235 : if (SCALAR_INT_MODE_P (mode) && mode != GET_MODE (reg))
3969 0 : op = gen_int_mode (INTVAL (src), mode);
3970 235 : if (op == const0_rtx)
3971 : {
3972 6 : if (standard_sse_constant_p (CONST0_RTX (orig_mode),
3973 : orig_mode) == 1)
3974 : {
3975 6 : *scalar_mode_p = QImode;
3976 6 : *kind_p = X86_CSE_CONST0_VECTOR;
3977 6 : *insn_p = nullptr;
3978 6 : return const0_rtx;
3979 : }
3980 0 : op = CONST0_RTX (mode);
3981 : }
3982 229 : else if (op == constm1_rtx
3983 229 : && standard_sse_constant_p (CONSTM1_RTX (orig_mode),
3984 : orig_mode) == 2)
3985 : {
3986 0 : *scalar_mode_p = QImode;
3987 0 : *kind_p = X86_CSE_CONSTM1_VECTOR;
3988 0 : *insn_p = nullptr;
3989 0 : return constm1_rtx;
3990 : }
3991 :
3992 : /* Check if we can convert:
3993 :
3994 : (insn 14 465 412 3 (set (reg:SI 507 [ j_lsm.26 ])
3995 : (const_int 2 [0x2])) "foo.c":10:12 discrim 2 100 {*movsi_internal} (nil))
3996 : ...
3997 : (insn 518 507 434 16 (set (reg:V2SI 493)
3998 : (vec_duplicate:V2SI (reg:SI 507 [ j_lsm.26 ]))) 2395 {*vec_dupv2si} (nil))
3999 :
4000 : to constant integer load:
4001 :
4002 : (insn 566 55 56 6 (set (subreg:DI (reg:V2SI 517) 0)
4003 : (const_int 8589934594 [0x200000002])) -1 (nil))
4004 : ...
4005 : (insn 518 507 434 16 (set (reg:V2SI 493)
4006 : (reg:V2SI 517)) 2066 {*movv2si_internal} (nil))
4007 :
4008 : */
4009 458 : if (GET_MODE_SIZE (orig_mode) <= UNITS_PER_WORD)
4010 6 : *kind_p = X86_CSE_CONST_VECTOR;
4011 :
4012 229 : *insn_p = nullptr;
4013 : }
4014 : else
4015 : {
4016 : /* Handle sequences like
4017 :
4018 : (set (reg:QI 105 [ c ])
4019 : (reg:QI 5 di [ c ]))
4020 : (set (reg:V64QI 102 [ _1 ])
4021 : (vec_duplicate:V64QI (reg:QI 105 [ c ])))
4022 :
4023 : (set (reg/v:SI 116 [ argc ])
4024 : (mem/c:SI (reg:SI 135) [2 argc+0 S4 A32]))
4025 : (set (reg:V4SI 119 [ _45 ])
4026 : (vec_duplicate:V4SI (reg/v:SI 116 [ argc ])))
4027 :
4028 : (set (reg:SI 98 [ _1 ])
4029 : (sign_extend:SI (reg:QI 106 [ c ])))
4030 : (set (reg:V16SI 103 [ _2 ])
4031 : (vec_duplicate:V16SI (reg:SI 98 [ _1 ])))
4032 :
4033 : (set (reg:SI 102 [ cost ])
4034 : (mem/c:SI (symbol_ref:DI ("cost") [flags 0x40])))
4035 : (set (reg:V4HI 103 [ _16 ])
4036 : (vec_duplicate:V4HI (subreg:HI (reg:SI 102 [ cost ]) 0)))
4037 :
4038 : (set (subreg:SI (reg/v:HI 107 [ cr_val ]) 0)
4039 : (ashift:SI (reg:SI 158)
4040 : (subreg:QI (reg:SI 156 [ _2 ]) 0)))
4041 : (set (reg:V16HI 183 [ _61 ])
4042 : (vec_duplicate:V16HI (reg/v:HI 107 [ cr_val ])))
4043 :
4044 : Set *INSN_P to INSN and return the broadcast source otherwise. */
4045 14476 : *insn_p = insn;
4046 : }
4047 :
4048 14705 : *scalar_mode_p = mode;
4049 14705 : return op;
4050 : }
4051 :
4052 : /* Replace CALL instruction in TLS_CALL_INSNS with SET from SRC and
4053 : put the updated instruction in UPDATED_TLS_INSNS. */
4054 :
4055 : static void
4056 313 : replace_tls_call (rtx src, auto_bitmap &tls_call_insns,
4057 : auto_bitmap &updated_tls_insns)
4058 : {
4059 313 : bitmap_iterator bi;
4060 313 : unsigned int id;
4061 :
4062 1739 : EXECUTE_IF_SET_IN_BITMAP (tls_call_insns, 0, id, bi)
4063 : {
4064 1426 : rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
4065 :
4066 : /* If this isn't a CALL, only GNU2 TLS implicit CALL patterns are
4067 : allowed. */
4068 1426 : if (!CALL_P (insn))
4069 : {
4070 47 : attr_tls64 tls64 = get_attr_tls64 (insn);
4071 47 : if (tls64 != TLS64_CALL && tls64 != TLS64_COMBINE)
4072 0 : gcc_unreachable ();
4073 : }
4074 :
4075 1426 : rtx pat = PATTERN (insn);
4076 1426 : gcc_assert (GET_CODE (pat) == PARALLEL);
4077 1426 : rtx set = XVECEXP (pat, 0, 0);
4078 1426 : gcc_assert (GET_CODE (set) == SET);
4079 1426 : rtx dest = SET_DEST (set);
4080 :
4081 1426 : set = gen_rtx_SET (dest, src);
4082 1426 : rtx_insn *set_insn = emit_insn_after (set, insn);
4083 1426 : if (recog_memoized (set_insn) < 0)
4084 0 : gcc_unreachable ();
4085 :
4086 : /* Put SET_INSN in UPDATED_TLS_INSNS. */
4087 1426 : bitmap_set_bit (updated_tls_insns, INSN_UID (set_insn));
4088 :
4089 1426 : if (dump_file)
4090 : {
4091 0 : fprintf (dump_file, "\nReplace:\n\n");
4092 0 : print_rtl_single (dump_file, insn);
4093 0 : fprintf (dump_file, "\nwith:\n\n");
4094 0 : print_rtl_single (dump_file, set_insn);
4095 0 : fprintf (dump_file, "\n");
4096 : }
4097 :
4098 : /* Delete the CALL insn. */
4099 1426 : delete_insn (insn);
4100 :
4101 1426 : df_insn_rescan (set_insn);
4102 : }
4103 313 : }
4104 :
4105 : /* Return the basic block which dominates all basic blocks which set
4106 : hard register REGNO used in basic block BB. */
4107 :
4108 : static basic_block
4109 2 : ix86_get_dominator_for_reg (unsigned int regno, basic_block bb)
4110 : {
4111 2 : basic_block set_bb;
4112 2 : auto_bitmap set_bbs;
4113 :
4114 : /* Get all BBs which set REGNO and dominate the current BB from all
4115 : DEFs of REGNO. */
4116 2 : for (df_ref def = DF_REG_DEF_CHAIN (regno);
4117 18 : def;
4118 16 : def = DF_REF_NEXT_REG (def))
4119 16 : if (!DF_REF_IS_ARTIFICIAL (def)
4120 16 : && !DF_REF_FLAGS_IS_SET (def, DF_REF_MAY_CLOBBER)
4121 6 : && !DF_REF_FLAGS_IS_SET (def, DF_REF_MUST_CLOBBER))
4122 : {
4123 4 : set_bb = DF_REF_BB (def);
4124 4 : if (dominated_by_p (CDI_DOMINATORS, bb, set_bb))
4125 2 : bitmap_set_bit (set_bbs, set_bb->index);
4126 : }
4127 :
4128 2 : bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
4129 2 : return bb;
4130 2 : }
4131 :
4132 : /* Mark FLAGS register as live in DATA, a bitmap of live caller-saved
4133 : registers, if DEST is FLAGS register. */
4134 :
4135 : static void
4136 381 : ix86_check_flags_reg (rtx dest, const_rtx x, void *data)
4137 : {
4138 381 : if (GET_CODE (x) == CLOBBER)
4139 : return;
4140 :
4141 374 : auto_bitmap *live_caller_saved_regs = (auto_bitmap *) data;
4142 374 : if (REG_P (dest) && REGNO (dest) == FLAGS_REG)
4143 0 : bitmap_set_bit (*live_caller_saved_regs, FLAGS_REG);
4144 : }
4145 :
/* Emit a TLS_SET instruction of KIND in basic block BB and return the
   emitted insn.  Store the insertion point in *BEFORE_P for
   emit_insn_before or in *AFTER_P for emit_insn_after.
   UPDATED_GNU_TLS_INSNS contains instructions which replace the GNU
   TLS instructions.  UPDATED_GNU2_TLS_INSNS contains instructions
   which replace the GNU2 TLS instructions.  If registers clobbered by
   the TLS call are live in BB, the insertion point is moved past the
   point where they all die, possibly restarting the search in a
   dominating basic block.  */

static rtx_insn *
ix86_emit_tls_call (rtx tls_set, x86_cse_kind kind, basic_block bb,
		    rtx_insn **before_p, rtx_insn **after_p,
		    auto_bitmap &updated_gnu_tls_insns,
		    auto_bitmap &updated_gnu2_tls_insns)
{
  rtx_insn *tls_insn;

  do
    {
      /* Find the first non-debug insn in BB, if any.  */
      rtx_insn *insn = BB_HEAD (bb);
      while (insn && !NONDEBUG_INSN_P (insn))
	{
	  if (insn == BB_END (bb))
	    {
	      /* This must be the beginning basic block:

		 (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
		 (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)

		 or a basic block with only a label:

		 (code_label 78 11 77 3 14 (nil) [1 uses])
		 (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)

		 or a basic block with only a debug marker:

		 (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
		 (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
		 (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))

		 or a basic block with only deleted instructions:

		 (code_label 348 23 349 45 3 (nil) [0 uses])
		 (note 349 348 436 45 [bb 45] NOTE_INSN_BASIC_BLOCK)
		 (note 436 349 362 45 NOTE_INSN_DELETED)

	       */
	      gcc_assert (DEBUG_INSN_P (insn)
			  || (NOTE_P (insn)
			      && ((NOTE_KIND (insn)
				   == NOTE_INSN_FUNCTION_BEG)
				  || (NOTE_KIND (insn)
				      == NOTE_INSN_DELETED)
				  || (NOTE_KIND (insn)
				      == NOTE_INSN_BASIC_BLOCK))));
	      insn = NULL;
	      break;
	    }
	  insn = NEXT_INSN (insn);
	}

      /* TLS_GD and TLS_LD_BASE instructions are normal functions which
	 clobber caller-saved registers.  TLSDESC instructions only
	 clobber FLAGS.  If any registers clobbered by TLS instructions
	 are live in this basic block, we must insert TLS instructions
	 after all live registers clobbered are dead.  */

      auto_bitmap live_caller_saved_regs;
      bitmap in = df_live ? DF_LIVE_IN (bb) : DF_LR_IN (bb);

      /* FLAGS is clobbered by every TLS call kind.  */
      if (bitmap_bit_p (in, FLAGS_REG))
	bitmap_set_bit (live_caller_saved_regs, FLAGS_REG);

      unsigned int i;

      /* Get all live caller-saved registers for TLS_GD and TLS_LD_BASE
	 instructions.  */
      if (kind != X86_CSE_TLSDESC)
	for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
	  if (call_used_regs[i]
	      && !fixed_regs[i]
	      && bitmap_bit_p (in, i))
	    bitmap_set_bit (live_caller_saved_regs, i);

      /* If nothing the call clobbers is live on entry, insert at the
	 start of BB.  */
      if (bitmap_empty_p (live_caller_saved_regs))
	{
	  if (insn == BB_HEAD (bb))
	    {
	      *before_p = insn;
	      tls_insn = emit_insn_before (tls_set, insn);
	    }
	  else
	    {
	      /* Emit the TLS call after NOTE_INSN_FUNCTION_BEG in the
		 beginning basic block:

		 (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
		 (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)

		 or after NOTE_INSN_BASIC_BLOCK in a basic block with
		 only a label:

		 (code_label 78 11 77 3 14 (nil) [1 uses])
		 (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)

		 or after debug marker in a basic block with only a
		 debug marker:

		 (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
		 (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
		 (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))

	       */
	      insn = insn ? PREV_INSN (insn) : BB_END (bb);
	      *after_p = insn;
	      tls_insn = emit_insn_after (tls_set, insn);
	    }
	  return tls_insn;
	}

      bool repeat = false;

      /* Search for REG_DEAD notes in this basic block.  */
      FOR_BB_INSNS (bb, insn)
	{
	  if (!NONDEBUG_INSN_P (insn))
	    continue;

	  /* NB: Conditional jump is the only instruction which reads
	     flags register and changes control flow.  We can never
	     place the TLS call after unconditional jump.  */
	  if (JUMP_P (insn))
	    {
	      /* This must be a conditional jump.  */
	      rtx label = JUMP_LABEL (insn);
	      if (label == nullptr
		  || ANY_RETURN_P (label)
		  || !(LABEL_P (label) || SYMBOL_REF_P (label)))
		gcc_unreachable ();

	      /* Place the call before all FLAGS_REG setting BBs since
		 we can't place a call before nor after a conditional
		 jump.  */
	      bb = ix86_get_dominator_for_reg (FLAGS_REG, bb);

	      /* Start over again.  */
	      repeat = true;
	      break;
	    }

	  if (bitmap_bit_p (updated_gnu_tls_insns, INSN_UID (insn)))
	    {
	      /* Insert the __tls_get_addr call before INSN which
		 replaces a __tls_get_addr call.  */
	      *before_p = insn;
	      tls_insn = emit_insn_before (tls_set, insn);
	      return tls_insn;
	    }

	  if (bitmap_bit_p (updated_gnu2_tls_insns, INSN_UID (insn)))
	    {
	      /* Mark FLAGS register as dead since FLAGS register
		 would be clobbered by the GNU2 TLS instruction.  */
	      bitmap_clear_bit (live_caller_saved_regs, FLAGS_REG);
	      continue;
	    }

	  /* Check if FLAGS register is live.  */
	  note_stores (insn, ix86_check_flags_reg,
		       &live_caller_saved_regs);

	  /* Clear registers which die at this insn from the live set;
	     once the live set is empty the call can go right after
	     INSN.  */
	  rtx link;
	  for (link = REG_NOTES (insn); link; link = XEXP (link, 1))
	    if ((REG_NOTE_KIND (link) == REG_DEAD
		 || (REG_NOTE_KIND (link) == REG_UNUSED
		     && REGNO (XEXP (link, 0)) == FLAGS_REG))
		&& REG_P (XEXP (link, 0)))
	      {
		/* Mark the live caller-saved register as dead.  */
		for (i = REGNO (XEXP (link, 0));
		     i < END_REGNO (XEXP (link, 0));
		     i++)
		  if (i < FIRST_PSEUDO_REGISTER)
		    bitmap_clear_bit (live_caller_saved_regs, i);

		if (bitmap_empty_p (live_caller_saved_regs))
		  {
		    *after_p = insn;
		    tls_insn = emit_insn_after (tls_set, insn);
		    return tls_insn;
		  }
	      }
	}

      /* NB: Start over again for conditional jump.  */
      if (repeat)
	continue;

      gcc_assert (!bitmap_empty_p (live_caller_saved_regs));

      /* If any live caller-saved registers aren't dead at the end of
	 this basic block, get the basic block which dominates all
	 basic blocks which set the remaining live registers.  */
      auto_bitmap set_bbs;
      bitmap_iterator bi;
      unsigned int id;
      EXECUTE_IF_SET_IN_BITMAP (live_caller_saved_regs, 0, id, bi)
	{
	  basic_block set_bb = ix86_get_dominator_for_reg (id, bb);
	  bitmap_set_bit (set_bbs, set_bb->index);
	}
      bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
    }
  while (true);
}
4358 :
/* Generate a TLS call of KIND with VAL and copy the call result to DEST,
   at entry of the nearest dominator for basic block map BBS, which is in
   the fake loop that contains the whole function, so that there is only
   a single TLS CALL of KIND with VAL in the whole function.
   UPDATED_GNU_TLS_INSNS contains instructions which replace the GNU TLS
   instructions.  UPDATED_GNU2_TLS_INSNS contains instructions which
   replace the GNU2 TLS instructions.  If TLSDESC_SET isn't nullptr,
   insert it before the TLS call.  */

static void
ix86_place_single_tls_call (rtx dest, rtx val, x86_cse_kind kind,
			    auto_bitmap &bbs,
			    auto_bitmap &updated_gnu_tls_insns,
			    auto_bitmap &updated_gnu2_tls_insns,
			    rtx tlsdesc_set = nullptr)
{
  /* Hoist the insertion point out of any real loop: walk up the
     dominator tree until the containing loop is the fake outermost
     loop (whose latch is the exit block).  */
  basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
  while (bb->loop_father->latch
	 != EXIT_BLOCK_PTR_FOR_FN (cfun))
    bb = get_immediate_dominator (CDI_DOMINATORS,
				  bb->loop_father->header);

  rtx rax = nullptr, rdi;
  rtx eqv = nullptr;
  rtx caddr;
  rtx set;
  rtx clob;
  rtx symbol;
  rtx tls;

  /* Build the single TLS pattern TLS to emit, and for the GNU TLS
     kinds an equivalence EQV for a REG_EQUAL note on the copy of the
     result.  */
  switch (kind)
    {
    case X86_CSE_TLS_GD:
      rax = gen_rtx_REG (Pmode, AX_REG);
      rdi = gen_rtx_REG (Pmode, DI_REG);
      caddr = ix86_tls_get_addr ();

      symbol = XVECEXP (val, 0, 0);
      tls = gen_tls_global_dynamic_64 (Pmode, rax, symbol, caddr, rdi);

      if (GET_MODE (symbol) != Pmode)
	symbol = gen_rtx_ZERO_EXTEND (Pmode, symbol);
      eqv = symbol;
      break;

    case X86_CSE_TLS_LD_BASE:
      rax = gen_rtx_REG (Pmode, AX_REG);
      rdi = gen_rtx_REG (Pmode, DI_REG);
      caddr = ix86_tls_get_addr ();

      tls = gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi);

      /* Attach a unique REG_EQUAL to DEST, to allow the RTL optimizers
	 to share the LD_BASE result with other LD model accesses.  */
      eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
			    UNSPEC_TLS_LD_BASE);

      break;

    case X86_CSE_TLSDESC:
      /* TLSDESC sets DEST directly and only clobbers FLAGS.  */
      set = gen_rtx_SET (dest, val);
      clob = gen_rtx_CLOBBER (VOIDmode,
			      gen_rtx_REG (CCmode, FLAGS_REG));
      tls = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clob));
      break;

    default:
      gcc_unreachable ();
    }

  /* Emit the TLS CALL insn.  */
  rtx_insn *before = nullptr;
  rtx_insn *after = nullptr;
  rtx_insn *tls_insn = ix86_emit_tls_call (tls, kind, bb, &before,
					   &after,
					   updated_gnu_tls_insns,
					   updated_gnu2_tls_insns);

  /* If requested, emit a fresh copy of TLSDESC_SET just before the
     TLS call.  */
  rtx_insn *tlsdesc_insn = nullptr;
  if (tlsdesc_set)
    {
      rtx dest = copy_rtx (SET_DEST (tlsdesc_set));
      rtx src = copy_rtx (SET_SRC (tlsdesc_set));
      tlsdesc_set = gen_rtx_SET (dest, src);
      tlsdesc_insn = emit_insn_before (tlsdesc_set, tls_insn);
    }

  if (kind != X86_CSE_TLSDESC)
    {
      /* __tls_get_addr is a const call which cannot throw.  */
      RTL_CONST_CALL_P (tls_insn) = 1;

      /* Indicate that this function can't jump to non-local gotos.  */
      make_reg_eh_region_note_nothrow_nononlocal (tls_insn);
    }

  if (recog_memoized (tls_insn) < 0)
    gcc_unreachable ();

  if (dump_file)
    {
      if (after)
	{
	  fprintf (dump_file, "\nPlace:\n\n");
	  if (tlsdesc_insn)
	    print_rtl_single (dump_file, tlsdesc_insn);
	  print_rtl_single (dump_file, tls_insn);
	  fprintf (dump_file, "\nafter:\n\n");
	  print_rtl_single (dump_file, after);
	  fprintf (dump_file, "\n");
	}
      else
	{
	  fprintf (dump_file, "\nPlace:\n\n");
	  if (tlsdesc_insn)
	    print_rtl_single (dump_file, tlsdesc_insn);
	  print_rtl_single (dump_file, tls_insn);
	  fprintf (dump_file, "\nbefore:\n\n");
	  print_rtl_single (dump_file, before);
	  fprintf (dump_file, "\n");
	}
    }

  if (kind != X86_CSE_TLSDESC)
    {
      /* Copy RAX to DEST.  */
      set = gen_rtx_SET (dest, rax);
      rtx_insn *set_insn = emit_insn_after (set, tls_insn);
      set_dst_reg_note (set_insn, REG_EQUAL, copy_rtx (eqv), dest);
      if (dump_file)
	{
	  fprintf (dump_file, "\nPlace:\n\n");
	  print_rtl_single (dump_file, set_insn);
	  fprintf (dump_file, "\nafter:\n\n");
	  print_rtl_single (dump_file, tls_insn);
	  fprintf (dump_file, "\n");
	}
    }
}
4497 :
4498 : namespace {
4499 :
/* Metadata for the x86 machine-dependent CSE pass run late in the RTL
   pipeline.  */
const pass_data pass_data_x86_cse =
{
  RTL_PASS, /* type */
  "x86_cse", /* name */
  OPTGROUP_NONE, /* optinfo_flags */
  TV_MACH_DEP, /* tv_id */
  0, /* properties_required */
  0, /* properties_provided */
  0, /* properties_destroyed */
  0, /* todo_flags_start */
  0, /* todo_flags_finish */
};
4512 :
/* RTL pass which removes redundant vector broadcasts and TLS calls by
   replacing identical patterns with a single instance whose result is
   reused; see pass_x86_cse::x86_cse.  */
class pass_x86_cse : public rtl_opt_pass
{
public:
  pass_x86_cse (gcc::context *ctxt)
    : rtl_opt_pass (pass_data_x86_cse, ctxt)
  {}

  /* opt_pass methods: */
  /* Run only when optimizing the function for speed.  */
  bool gate (function *fun) final override
  {
    return optimize && optimize_function_for_speed_p (fun);
  }

  unsigned int execute (function *) final override
  {
    return x86_cse ();
  }

private:
  /* State shared between x86_cse and the candidate_* helpers,
     describing the most recently matched candidate pattern.  */
  /* The redundant source value.  */
  rtx val;
  /* The actual redundant source value for UNSPEC_TLSDESC.  */
  rtx tlsdesc_val;
  /* The instruction which defines the redundant value.  */
  rtx_insn *def_insn;
  /* Mode of the destination of the candidate redundant instruction.  */
  machine_mode mode;
  /* Mode of the source of the candidate redundant instruction.  */
  machine_mode scalar_mode;
  /* The classification of the candidate redundant instruction.  */
  x86_cse_kind kind;

  unsigned int x86_cse (void);
  bool candidate_gnu_tls_p (rtx_insn *, attr_tls64);
  bool candidate_gnu2_tls_p (rtx, attr_tls64);
  bool candidate_vector_p (rtx, rtx_insn *);
  rtx_insn *tls_set_insn_from_symbol (const_rtx, const_rtx);
}; // class pass_x86_cse
4551 :
4552 : /* Return the instruction which sets REG from TLS_SYMBOL. */
4553 :
4554 : rtx_insn *
4555 42 : pass_x86_cse::tls_set_insn_from_symbol (const_rtx reg,
4556 : const_rtx tls_symbol)
4557 : {
4558 42 : rtx_insn *set_insn = nullptr;
4559 42 : for (df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
4560 111 : ref;
4561 69 : ref = DF_REF_NEXT_REG (ref))
4562 : {
4563 69 : if (DF_REF_IS_ARTIFICIAL (ref))
4564 : return nullptr;
4565 :
4566 69 : set_insn = DF_REF_INSN (ref);
4567 69 : if (get_attr_tls64 (set_insn) != TLS64_LEA)
4568 : return nullptr;
4569 :
4570 69 : rtx tls_set = PATTERN (set_insn);
4571 69 : rtx tls_src = XVECEXP (SET_SRC (tls_set), 0, 0);
4572 69 : if (!rtx_equal_p (tls_symbol, tls_src))
4573 : return nullptr;
4574 : }
4575 :
4576 : return set_insn;
4577 : }
4578 :
4579 : /* Return true and output def_insn, val, mode, scalar_mode and kind if
4580 : INSN is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE. */
4581 :
4582 : bool
4583 2185 : pass_x86_cse::candidate_gnu_tls_p (rtx_insn *insn, attr_tls64 tls64)
4584 : {
4585 2185 : if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
4586 : return false;
4587 :
4588 : /* Record the redundant TLS CALLs for 64-bit:
4589 :
4590 : (parallel [
4591 : (set (reg:DI 0 ax)
4592 : (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
4593 : (const_int 0 [0])))
4594 : (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
4595 : (reg/f:DI 7 sp)] UNSPEC_TLS_GD)
4596 : (clobber (reg:DI 5 di))])
4597 :
4598 :
4599 : and
4600 :
4601 : (parallel [
4602 : (set (reg:DI 0 ax)
4603 : (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
4604 : (const_int 0 [0])))
4605 : (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)])
4606 :
4607 : */
4608 :
4609 2022 : rtx pat = PATTERN (insn);
4610 2022 : rtx set = XVECEXP (pat, 0, 0);
4611 2022 : gcc_assert (GET_CODE (set) == SET);
4612 2022 : rtx dest = SET_DEST (set);
4613 2022 : scalar_mode = mode = GET_MODE (dest);
4614 2022 : val = XVECEXP (pat, 0, 1);
4615 2022 : gcc_assert (GET_CODE (val) == UNSPEC);
4616 :
4617 2022 : if (tls64 == TLS64_GD)
4618 1921 : kind = X86_CSE_TLS_GD;
4619 : else
4620 101 : kind = X86_CSE_TLS_LD_BASE;
4621 :
4622 2022 : def_insn = nullptr;
4623 2022 : return true;
4624 : }
4625 :
/* Return true and output def_insn, val, mode, scalar_mode and kind if
   SET is UNSPEC_TLSDESC (TLS64 is TLS64_CALL or TLS64_COMBINE).  */

bool
pass_x86_cse::candidate_gnu2_tls_p (rtx set, attr_tls64 tls64)
{
  if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
    return false;

  rtx tls_symbol;
  rtx_insn *set_insn;
  rtx src = SET_SRC (set);
  val = src;
  tlsdesc_val = src;
  kind = X86_CSE_TLSDESC;

  if (tls64 == TLS64_COMBINE)
    {
      /* Record 64-bit TLS64_COMBINE:

	 (set (reg/f:DI 104)
	      (plus:DI (unspec:DI [
			  (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
			  (reg:DI 114)
			  (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
		       (const:DI (unspec:DI [
			  (symbol_ref:DI ("e") [flags 0x1a])
			] UNSPEC_DTPOFF))))

	 (set (reg/f:DI 104)
	      (plus:DI (unspec:DI [
			  (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
			  (unspec:DI [
			    (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
			  ] UNSPEC_TLSDESC)
			  (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
		       (const:DI (unspec:DI [
			  (symbol_ref:DI ("e") [flags 0x1a])
			] UNSPEC_DTPOFF))))
       */

      scalar_mode = mode = GET_MODE (src);

      /* Since the first operand of PLUS in the source TLS_COMBINE
	 pattern is unused, use the second operand of PLUS:

	 (const:DI (unspec:DI [
	   (symbol_ref:DI ("e") [flags 0x1a])
	 ] UNSPEC_DTPOFF))

	 as VAL to check if 2 TLS_COMBINE patterns have the same
	 source.  */
      val = XEXP (src, 1);
      gcc_assert (GET_CODE (val) == CONST
		  && GET_CODE (XEXP (val, 0)) == UNSPEC
		  && XINT (XEXP (val, 0), 1) == UNSPEC_DTPOFF
		  && SYMBOL_REF_P (XVECEXP (XEXP (val, 0), 0, 0)));
      /* A TLS_COMBINE pattern has no separate defining insn.  */
      def_insn = nullptr;
      return true;
    }

  /* Record 64-bit TLS_CALL:

     (set (reg:DI 101)
	  (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
		      (reg:DI 112)
		      (reg/f:DI 7 sp)] UNSPEC_TLSDESC))

   */

  /* Extract the TLS symbol and the register holding the TLS
     descriptor address from the UNSPEC operands.  */
  gcc_assert (GET_CODE (src) == UNSPEC);
  tls_symbol = XVECEXP (src, 0, 0);
  src = XVECEXP (src, 0, 1);
  scalar_mode = mode = GET_MODE (src);
  gcc_assert (REG_P (src));

  /* All definitions of reg:DI 129 in

     (set (reg:DI 110)
	  (unspec:DI [(symbol_ref:DI ("foo"))
		      (reg:DI 129)
		      (reg/f:DI 7 sp)] UNSPEC_TLSDESC))

     should have the same source as in

     (set (reg:DI 129)
	  (unspec:DI [(symbol_ref:DI ("foo"))] UNSPEC_TLSDESC))

   */

  set_insn = tls_set_insn_from_symbol (src, tls_symbol);
  if (!set_insn)
    return false;

  /* Use TLS_SYMBOL as VAL to check if 2 patterns have the same source.  */
  val = tls_symbol;
  def_insn = set_insn;
  return true;
}
4726 : /* Return true and output def_insn, val, mode, scalar_mode and kind if
4727 : INSN is a vector broadcast instruction. */
4728 :
4729 : bool
4730 49796522 : pass_x86_cse::candidate_vector_p (rtx set, rtx_insn *insn)
4731 : {
4732 49796522 : rtx src = SET_SRC (set);
4733 49796522 : rtx dest = SET_DEST (set);
4734 49796522 : mode = GET_MODE (dest);
4735 : /* Skip non-vector instruction. */
4736 49796522 : if (!VECTOR_MODE_P (mode))
4737 : return false;
4738 :
4739 : /* Skip non-vector load instruction. */
4740 3715758 : if (!REG_P (dest) && !SUBREG_P (dest))
4741 : return false;
4742 :
4743 2203660 : def_insn = insn;
4744 2203660 : val = ix86_broadcast_inner (src, mode, &scalar_mode, &kind,
4745 : &def_insn);
4746 2203660 : return val ? true : false;
4747 : }
4748 :
4749 : /* At entry of the nearest common dominator for basic blocks with
4750 :
4751 : 1. Vector CONST0_RTX patterns.
4752 : 2. Vector CONSTM1_RTX patterns.
4753 : 3. Vector broadcast patterns.
4754 : 4. UNSPEC_TLS_GD patterns.
4755 : 5. UNSPEC_TLS_LD_BASE patterns.
4756 : 6. UNSPEC_TLSDESC patterns.
4757 :
4758 : generate a single pattern whose destination is used to replace the
4759 : source in all identical patterns.
4760 :
4761 : NB: We want to generate a pattern, which is executed only once, to
4762 : cover the whole function. The LCM algorithm isn't appropriate here
4763 : since it may place a pattern inside the loop. */
4764 :
4765 : unsigned int
4766 976721 : pass_x86_cse::x86_cse (void)
4767 : {
4768 976721 : timevar_push (TV_MACH_DEP);
4769 :
4770 976721 : auto_vec<redundant_pattern *> loads;
4771 976721 : redundant_pattern *load;
4772 976721 : basic_block bb;
4773 976721 : rtx_insn *insn;
4774 976721 : unsigned int i;
4775 976721 : auto_bitmap updated_gnu_tls_insns;
4776 976721 : auto_bitmap updated_gnu2_tls_insns;
4777 976721 : auto_bitmap call_bbs;
4778 :
4779 976721 : df_set_flags (DF_DEFER_INSN_RESCAN);
4780 :
4781 976721 : bool recursive_call_p = cfun->machine->recursive_function;
4782 :
4783 10901283 : FOR_EACH_BB_FN (bb, cfun)
4784 : {
4785 130959437 : FOR_BB_INSNS (bb, insn)
4786 : {
4787 121034875 : if (!NONDEBUG_INSN_P (insn))
4788 67586072 : continue;
4789 :
4790 53448803 : bool matched = false;
4791 : /* Remove redundant pattens if there are more than 2 of
4792 : them. */
4793 53448803 : unsigned int threshold = 2;
4794 :
4795 53448803 : bool call_p = CALL_P (insn);
4796 53448803 : rtx set = single_set (insn);
4797 53448803 : if (!set && !call_p)
4798 1101789 : continue;
4799 :
4800 52347014 : tlsdesc_val = nullptr;
4801 :
4802 52347014 : attr_tls64 tls64 = get_attr_tls64 (insn);
4803 :
4804 : /* NB: TLS calls preserve all registers. */
4805 52347014 : if (call_p && tls64 == TLS64_NONE)
4806 4414050 : bitmap_set_bit (call_bbs, BLOCK_FOR_INSN (insn)->index);
4807 :
4808 52347014 : switch (tls64)
4809 : {
4810 2185 : case TLS64_GD:
4811 2185 : case TLS64_LD_BASE:
4812 : /* Verify UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE. */
4813 2185 : if (candidate_gnu_tls_p (insn, tls64))
4814 : break;
4815 163 : continue;
4816 :
4817 56 : case TLS64_CALL:
4818 56 : case TLS64_COMBINE:
4819 : /* Verify UNSPEC_TLSDESC. */
4820 56 : if (candidate_gnu2_tls_p (set, tls64))
4821 : break;
4822 2 : continue;
4823 :
4824 38 : case TLS64_LEA:
4825 : /* Skip TLS64_LEA. */
4826 38 : continue;
4827 :
4828 52344735 : case TLS64_NONE:
4829 52344735 : if (!set)
4830 2548213 : continue;
4831 :
4832 : /* Check for vector broadcast. */
4833 49796522 : if (candidate_vector_p (set, insn))
4834 : break;
4835 49548880 : continue;
4836 : }
4837 :
4838 : /* Check if there is a matching redundant load. */
4839 590397 : FOR_EACH_VEC_ELT (loads, i, load)
4840 436507 : if (load->val
4841 436507 : && load->kind == kind
4842 291507 : && load->mode == scalar_mode
4843 255907 : && (load->bb == bb
4844 196767 : || (kind != X86_CSE_VEC_DUP
4845 196767 : && kind != X86_CSE_CONST_VECTOR)
4846 : /* Non all 0s/1s vector load must be in the same
4847 : basic block if it is in a recursive call. */
4848 137647 : || !recursive_call_p)
4849 690305 : && rtx_equal_p (load->val, val))
4850 : {
4851 : /* Record instruction. */
4852 95828 : bitmap_set_bit (load->insns, INSN_UID (insn));
4853 :
4854 : /* Record the maximum vector size. */
4855 95828 : if (kind <= X86_CSE_VEC_DUP
4856 190543 : && load->size < GET_MODE_SIZE (mode))
4857 1016 : load->size = GET_MODE_SIZE (mode);
4858 :
4859 : /* Record the basic block. */
4860 95828 : bitmap_set_bit (load->bbs, bb->index);
4861 :
4862 : /* Increment the count. */
4863 95828 : load->count++;
4864 :
4865 95828 : matched = true;
4866 95828 : break;
4867 : }
4868 :
4869 249718 : if (matched)
4870 95828 : continue;
4871 :
4872 : /* We see this instruction the first time. Record the
4873 : redundant source value, its mode, the destination size,
4874 : instruction which defines the redundant source value,
4875 : instruction basic block and the instruction kind. */
4876 153890 : load = new redundant_pattern;
4877 :
4878 : /* Convert CONST_VECTOR load no larger than integer register
4879 : to constant integer load even if there is no redundant
4880 : CONST_VECTOR load. */
4881 153890 : if (CONST_VECTOR_P (val))
4882 30940 : threshold = 1;
4883 :
4884 153890 : load->val = copy_rtx (val);
4885 153890 : if (tlsdesc_val)
4886 28 : load->tlsdesc_val = copy_rtx (tlsdesc_val);
4887 : else
4888 153862 : load->tlsdesc_val = nullptr;
4889 153890 : load->mode = scalar_mode;
4890 153890 : load->dest_mode = mode;
4891 153890 : load->size = GET_MODE_SIZE (mode);
4892 153890 : load->def_insn = def_insn;
4893 153890 : load->count = 1;
4894 153890 : load->threshold = threshold;
4895 153890 : load->bb = BLOCK_FOR_INSN (insn);
4896 153890 : load->kind = kind;
4897 :
4898 153890 : bitmap_set_bit (load->insns, INSN_UID (insn));
4899 153890 : bitmap_set_bit (load->bbs, bb->index);
4900 :
4901 153890 : loads.safe_push (load);
4902 : }
4903 : }
4904 :
4905 : bool replaced = false;
4906 1130611 : FOR_EACH_VEC_ELT (loads, i, load)
4907 153890 : if (load->count >= load->threshold)
4908 : {
4909 63377 : machine_mode mode;
4910 63377 : rtx reg, broadcast_reg;
4911 63377 : rtx broadcast_source = nullptr;
4912 63377 : replaced = true;
4913 63377 : switch (load->kind)
4914 : {
4915 313 : case X86_CSE_TLS_GD:
4916 313 : case X86_CSE_TLS_LD_BASE:
4917 313 : case X86_CSE_TLSDESC:
4918 313 : broadcast_reg = gen_reg_rtx (load->mode);
4919 313 : replace_tls_call (broadcast_reg, load->insns,
4920 313 : (load->kind == X86_CSE_TLSDESC
4921 : ? updated_gnu2_tls_insns
4922 : : updated_gnu_tls_insns));
4923 313 : load->broadcast_reg = broadcast_reg;
4924 313 : break;
4925 :
4926 11119 : case X86_CSE_VEC_DUP:
4927 11119 : if (CONST_INT_P (load->val)
4928 10027 : && (load->val == CONST0_RTX (load->mode)
4929 10051 : || load->size <= UNITS_PER_WORD))
4930 : {
4931 : /* Generate CONST_VECTOR load. */
4932 30941 : case X86_CSE_CONST_VECTOR:
4933 30941 : mode = ix86_get_vector_cse_mode (load->size,
4934 : load->mode);
4935 :
4936 30941 : if (CONST_VECTOR_P (load->val))
4937 : broadcast_source = load->val;
4938 1 : else if (load->val == CONST0_RTX (load->mode))
4939 0 : broadcast_source = CONST0_RTX (mode);
4940 1 : else if (load->val == CONSTM1_RTX (load->mode))
4941 0 : broadcast_source = CONSTM1_RTX (mode);
4942 : else
4943 : {
4944 1 : int nunits = GET_MODE_NUNITS (mode);
4945 1 : rtvec v = rtvec_alloc (nunits);
4946 3 : for (int j = 0; j < nunits ; j++)
4947 2 : RTVEC_ELT (v, j) = load->val;
4948 1 : broadcast_source = gen_rtx_CONST_VECTOR (mode, v);
4949 : }
4950 :
4951 : /* NB: Zero CONST_VECTOR load works for MMX and XMM
4952 : registers. */
4953 32352 : if (load->size <= UNITS_PER_WORD)
4954 : {
4955 : /* Convert CONST_VECTOR load no larger than integer
4956 : register:
4957 :
4958 : (set (reg:V2SI 106)
4959 : (const_vector:V2SI [(const_int 1 [1]) repeated x2]))
4960 :
4961 : to constant integer load:
4962 :
4963 : (set (subreg:DI (reg:V2SI 106 [ _20 ]) 0)
4964 : (const_int 4294967297 [0x100000001]))
4965 : */
4966 30941 : machine_mode int_mode
4967 30941 : = int_mode_for_mode (mode).require ();
4968 30941 : load->dest_mode = int_mode;
4969 30941 : broadcast_source = simplify_subreg (int_mode,
4970 : broadcast_source,
4971 : mode, 0);
4972 30941 : gcc_assert (broadcast_source != nullptr);
4973 :
4974 30941 : bool keep_const_int_load = false;
4975 30941 : if (!bitmap_empty_p (call_bbs))
4976 : {
4977 27734 : bitmap_iterator bi;
4978 27734 : unsigned int id;
4979 36148 : EXECUTE_IF_SET_IN_BITMAP (load->bbs, 0, id, bi)
4980 28903 : if (bitmap_bit_p (call_bbs, id))
4981 : {
4982 : /* NB: Constant integer load is faster
4983 : than save and restore an integer
4984 : register when crossing a function call.
4985 : */
4986 : keep_const_int_load = true;
4987 : break;
4988 : }
4989 : }
4990 :
4991 27734 : if (keep_const_int_load)
4992 : {
4993 : /* Keep constant integer load. */
4994 20489 : replace_vector_const (mode, broadcast_source,
4995 20489 : load->insns, int_mode);
4996 20489 : load->broadcast_source = nullptr;
4997 20489 : load->broadcast_reg = nullptr;
4998 : }
4999 : else
5000 : {
5001 10452 : broadcast_reg = gen_reg_rtx (mode);
5002 10452 : reg = gen_reg_rtx (load->mode);
5003 10452 : replace_vector_const (mode, broadcast_reg,
5004 10452 : load->insns, load->mode);
5005 10452 : load->broadcast_source = broadcast_source;
5006 10452 : load->broadcast_reg = broadcast_reg;
5007 : }
5008 : break;
5009 : }
5010 : }
5011 : /* FALLTHRU */
5012 :
5013 32123 : case X86_CSE_CONST0_VECTOR:
5014 32123 : case X86_CSE_CONSTM1_VECTOR:
5015 32123 : mode = ix86_get_vector_cse_mode (load->size, load->mode);
5016 32123 : broadcast_reg = gen_reg_rtx (mode);
5017 32123 : if (load->def_insn)
5018 : {
5019 : /* Replace redundant vector loads with a single vector
5020 : load in the same basic block. */
5021 813 : reg = load->val;
5022 813 : if (load->mode != GET_MODE (reg))
5023 0 : reg = gen_rtx_SUBREG (load->mode, reg, 0);
5024 813 : broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
5025 : }
5026 : else
5027 : /* This is a constant integer/double vector. If the
5028 : inner scalar is 0 or -1, set vector to CONST0_RTX
5029 : or CONSTM1_RTX directly. */
5030 31310 : switch (load->kind)
5031 : {
5032 19769 : case X86_CSE_CONST0_VECTOR:
5033 19769 : broadcast_source = CONST0_RTX (mode);
5034 19769 : break;
5035 1235 : case X86_CSE_CONSTM1_VECTOR:
5036 1235 : broadcast_source = CONSTM1_RTX (mode);
5037 1235 : break;
5038 10306 : case X86_CSE_CONST_VECTOR:
5039 10306 : case X86_CSE_VEC_DUP:
5040 10306 : if (!broadcast_source)
5041 : {
5042 10306 : reg = gen_reg_rtx (load->mode);
5043 10306 : broadcast_source = gen_rtx_VEC_DUPLICATE (mode,
5044 : reg);
5045 : }
5046 : break;
5047 0 : default:
5048 0 : gcc_unreachable ();
5049 : }
5050 32123 : replace_vector_const (mode, broadcast_reg, load->insns,
5051 : load->mode);
5052 32123 : load->broadcast_source = broadcast_source;
5053 32123 : load->broadcast_reg = broadcast_reg;
5054 32123 : break;
5055 : }
5056 : }
5057 :
5058 976721 : if (replaced)
5059 : {
5060 41352 : auto_vec<rtx_insn *> control_flow_insns;
5061 :
5062 : /* (Re-)discover loops so that bb->loop_father can be used in the
5063 : analysis below. */
5064 41352 : calculate_dominance_info (CDI_DOMINATORS);
5065 41352 : loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
5066 :
5067 125313 : FOR_EACH_VEC_ELT (loads, i, load)
5068 83961 : if (load->count >= load->threshold)
5069 : {
5070 63377 : rtx set;
5071 63377 : if (load->def_insn)
5072 829 : switch (load->kind)
5073 : {
5074 16 : case X86_CSE_TLSDESC:
5075 16 : ix86_place_single_tls_call (load->broadcast_reg,
5076 : load->tlsdesc_val,
5077 : load->kind,
5078 16 : load->bbs,
5079 : updated_gnu_tls_insns,
5080 : updated_gnu2_tls_insns,
5081 16 : PATTERN (load->def_insn));
5082 16 : break;
5083 813 : case X86_CSE_VEC_DUP:
5084 : /* Insert a broadcast after the original scalar
5085 : definition. */
5086 813 : set = gen_rtx_SET (load->broadcast_reg,
5087 : load->broadcast_source);
5088 813 : insn = emit_insn_after (set, load->def_insn);
5089 :
5090 813 : if (cfun->can_throw_non_call_exceptions)
5091 : {
5092 : /* Handle REG_EH_REGION note in DEF_INSN. */
5093 4 : rtx note = find_reg_note (load->def_insn,
5094 : REG_EH_REGION, nullptr);
5095 4 : if (note)
5096 : {
5097 1 : control_flow_insns.safe_push (load->def_insn);
5098 1 : add_reg_note (insn, REG_EH_REGION,
5099 : XEXP (note, 0));
5100 : }
5101 : }
5102 :
5103 813 : if (dump_file)
5104 : {
5105 0 : fprintf (dump_file, "\nAdd:\n\n");
5106 0 : print_rtl_single (dump_file, insn);
5107 0 : fprintf (dump_file, "\nafter:\n\n");
5108 0 : print_rtl_single (dump_file, load->def_insn);
5109 0 : fprintf (dump_file, "\n");
5110 : }
5111 : break;
5112 0 : default:
5113 0 : gcc_unreachable ();
5114 : }
5115 : else
5116 62548 : switch (load->kind)
5117 : {
5118 297 : case X86_CSE_TLS_GD:
5119 297 : case X86_CSE_TLS_LD_BASE:
5120 297 : case X86_CSE_TLSDESC:
5121 297 : ix86_place_single_tls_call (load->broadcast_reg,
5122 : (load->kind == X86_CSE_TLSDESC
5123 : ? load->tlsdesc_val
5124 : : load->val),
5125 : load->kind,
5126 297 : load->bbs,
5127 : updated_gnu_tls_insns,
5128 : updated_gnu2_tls_insns);
5129 297 : break;
5130 41247 : case X86_CSE_CONST_VECTOR:
5131 41247 : case X86_CSE_VEC_DUP:
5132 : /* Keep redundant constant integer load. */
5133 41247 : if (!load->broadcast_reg)
5134 : break;
5135 : /* FALLTHRU */
5136 41762 : case X86_CSE_CONST0_VECTOR:
5137 41762 : case X86_CSE_CONSTM1_VECTOR:
5138 41762 : ix86_place_single_vector_set (load->broadcast_reg,
5139 : load->broadcast_source,
5140 : load->bbs,
5141 : load);
5142 41762 : break;
5143 : }
5144 : }
5145 :
5146 41352 : loop_optimizer_finalize ();
5147 :
5148 41352 : if (!control_flow_insns.is_empty ())
5149 : {
5150 1 : free_dominance_info (CDI_DOMINATORS);
5151 :
5152 3 : FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
5153 1 : if (control_flow_insn_p (insn))
5154 : {
5155 : /* Split the block after insn. There will be a fallthru
5156 : edge, which is OK so we keep it. We have to create
5157 : the exception edges ourselves. */
5158 1 : bb = BLOCK_FOR_INSN (insn);
5159 1 : split_block (bb, insn);
5160 1 : rtl_make_eh_edge (NULL, bb, BB_END (bb));
5161 : }
5162 : }
5163 :
5164 41352 : df_process_deferred_rescans ();
5165 41352 : }
5166 :
5167 1130611 : FOR_EACH_VEC_ELT (loads, i, load)
5168 307780 : delete load;
5169 :
5170 976721 : df_clear_flags (DF_DEFER_INSN_RESCAN);
5171 :
5172 976721 : timevar_pop (TV_MACH_DEP);
5173 976721 : return 0;
5174 976721 : }
5175 :
5176 : } // anon namespace
5177 :
     : /* Factory entry point used by the pass manager (see passes.def):
     :    allocate the x86 CSE RTL pass.  */
5178 : rtl_opt_pass *
5179 288047 : make_pass_x86_cse (gcc::context *ctxt)
5180 : {
5181 288047 : return new pass_x86_cse (ctxt);
5182 : }
5183 :
5184 : /* Convert legacy instructions that clobber EFLAGS to APX_NF
5185 : instructions when no flags are set between a flag
5186 : producer and its user. */
5187 :
5188 : static unsigned int
5189 369 : ix86_apx_nf_convert (void)
5190 : {
5191 369 : timevar_push (TV_MACH_DEP);
5192 :
5193 369 : basic_block bb;
5194 369 : rtx_insn *insn;
     : /* Map from candidate insn to its (PARALLEL) pattern; entries that
     :    survive the scan below get rewritten without the flags clobber.  */
5195 369 : hash_map <rtx_insn *, rtx> converting_map;
     : /* Candidates collected since the last cstorecc, so they can all be
     :    revoked together if the sequence turns out to be unconvertible.  */
5196 369 : auto_vec <rtx_insn *> current_convert_list;
5197 :
5198 369 : bool converting_seq = false;
5199 369 : rtx cc = gen_rtx_REG (CCmode, FLAGS_REG);
5200 :
5201 790 : FOR_EACH_BB_FN (bb, cfun)
5202 : {
5203 : /* Reset conversion for each bb. */
5204 421 : converting_seq = false;
5205 5049 : FOR_BB_INSNS (bb, insn)
5206 : {
5207 4628 : if (!NONDEBUG_INSN_P (insn))
5208 4965 : continue;
5209 :
5210 3688 : if (recog_memoized (insn) < 0)
5211 336 : continue;
5212 :
5213 : /* Convert candidate insns after cstore, which should
5214 : satisfy the two conditions:
5215 : 1. Is not flag user or producer, only clobbers
5216 : FLAGS_REG.
5217 : 2. Have corresponding nf pattern. */
5218 :
5219 3352 : rtx pat = PATTERN (insn);
5220 :
5221 : /* Start conversion at the first cstorecc. */
5222 3352 : rtx set = NULL_RTX;
5223 3352 : if (!converting_seq
5224 2771 : && (set = single_set (insn))
5225 2695 : && ix86_comparison_operator (SET_SRC (set), VOIDmode)
5226 126 : && reg_overlap_mentioned_p (cc, SET_SRC (set))
5227 3475 : && !reg_overlap_mentioned_p (cc, SET_DEST (set)))
5228 : {
5229 123 : converting_seq = true;
5230 123 : current_convert_list.truncate (0);
5231 : }
5232 : /* Terminate at the next explicit flag set. */
5233 3229 : else if (reg_set_p (cc, pat)
5234 3229 : && GET_CODE (set_of (cc, pat)) != CLOBBER)
5235 : converting_seq = false;
5236 :
5237 3132 : if (!converting_seq)
5238 2749 : continue;
5239 :
5240 603 : if (get_attr_has_nf (insn)
5241 603 : && GET_CODE (pat) == PARALLEL)
5242 : {
5243 : /* Record the insn to candidate map. */
5244 72 : current_convert_list.safe_push (insn);
5245 72 : converting_map.put (insn, pat);
5246 : }
5247 : /* If the insn clobbers flags but has no nf_attr,
5248 : revoke all previous candidates. */
5249 531 : else if (!get_attr_has_nf (insn)
5250 530 : && reg_set_p (cc, pat)
5251 534 : && GET_CODE (set_of (cc, pat)) == CLOBBER)
5252 : {
5253 3 : for (auto item : current_convert_list)
5254 0 : converting_map.remove (item);
5255 3 : converting_seq = false;
5256 : }
5257 : }
5258 : }
5259 :
5260 369 : if (!converting_map.is_empty ())
5261 : {
5262 85 : for (auto iter = converting_map.begin ();
5263 170 : iter != converting_map.end (); ++iter)
5264 : {
5265 72 : rtx_insn *replace = (*iter).first;
5266 72 : rtx pat = (*iter).second;
5267 72 : int i, n = 0, len = XVECLEN (pat, 0);
5268 72 : rtx *new_elems = XALLOCAVEC (rtx, len);
5269 72 : rtx new_pat;
     : /* Copy every element of the PARALLEL except the FLAGS_REG
     :    clobber.  */
5270 216 : for (i = 0; i < len; i++)
5271 : {
5272 144 : rtx temp = XVECEXP (pat, 0, i);
5273 216 : if (! (GET_CODE (temp) == CLOBBER
5274 72 : && reg_overlap_mentioned_p (cc,
5275 72 : XEXP (temp, 0))))
5276 : {
5277 72 : new_elems[n] = temp;
5278 72 : n++;
5279 : }
5280 : }
5281 :
5282 72 : if (n == 1)
5283 72 : new_pat = new_elems[0];
5284 : else
5285 0 : new_pat =
5286 0 : gen_rtx_PARALLEL (VOIDmode,
5287 : gen_rtvec_v (n,
5288 : new_elems));
5289 :
     : /* Install the clobber-free pattern and force re-recognition
     :    so the nf alternative is selected.  */
5290 72 : PATTERN (replace) = new_pat;
5291 72 : INSN_CODE (replace) = -1;
5292 72 : recog_memoized (replace);
5293 72 : df_insn_rescan (replace);
5294 : }
5295 : }
5296 :
5297 369 : timevar_pop (TV_MACH_DEP);
5298 369 : return 0;
5299 369 : }
5300 :
5301 :
5302 : namespace {
5303 :
5304 : const pass_data pass_data_apx_nf_convert =
5305 : {
5306 : RTL_PASS, /* type */
5307 : "apx_nfcvt", /* name */
5308 : OPTGROUP_NONE, /* optinfo_flags */
5309 : TV_MACH_DEP, /* tv_id */
5310 : 0, /* properties_required */
5311 : 0, /* properties_provided */
5312 : 0, /* properties_destroyed */
5313 : 0, /* todo_flags_start */
5314 : 0, /* todo_flags_finish */
5315 : };
5316 :
     : /* RTL pass wrapper around ix86_apx_nf_convert; gated on APX_NF
     :    being enabled and the function being optimized for speed.  */
5317 : class pass_apx_nf_convert : public rtl_opt_pass
5318 : {
5319 : public:
5320 288047 : pass_apx_nf_convert (gcc::context *ctxt)
5321 576094 : : rtl_opt_pass (pass_data_apx_nf_convert, ctxt)
5322 : {}
5323 :
5324 : /* opt_pass methods: */
5325 1474422 : bool gate (function *) final override
5326 : {
5327 1474422 : return (TARGET_APX_NF
5328 461 : && optimize
5329 1474875 : && optimize_function_for_speed_p (cfun));
5330 : }
5331 :
5332 369 : unsigned int execute (function *) final override
5333 : {
5334 369 : return ix86_apx_nf_convert ();
5335 : }
5336 : }; // class pass_apx_nf_convert
5337 :
5338 : } // anon namespace
5339 :
     : /* Factory entry point used by the pass manager (see passes.def):
     :    allocate the APX no-flags conversion RTL pass.  */
5340 : rtl_opt_pass *
5341 288047 : make_pass_apx_nf_convert (gcc::context *ctxt)
5342 : {
5343 288047 : return new pass_apx_nf_convert (ctxt);
5344 : }
5345 :
5346 : /* When a hot loop can fit into one cache line,
5347 : force align the loop without considering the max skip. */
5348 : static void
5349 976242 : ix86_align_loops ()
5350 : {
5351 976242 : basic_block bb;
5352 :
5353 : /* Don't do this when we don't know cache line size. */
5354 976242 : if (ix86_cost->prefetch_block == 0)
5355 9 : return;
5356 :
5357 976233 : loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
5358 976233 : profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
5359 11361065 : FOR_EACH_BB_FN (bb, cfun)
5360 : {
5361 10384832 : rtx_insn *label = BB_HEAD (bb);
5362 10384832 : bool has_fallthru = 0;
5363 10384832 : edge e;
5364 10384832 : edge_iterator ei;
5365 :
     : /* Only blocks starting with a label can be align targets.  */
5366 10384832 : if (!LABEL_P (label))
5367 5286369 : continue;
5368 :
5369 5103274 : profile_count fallthru_count = profile_count::zero ();
5370 5103274 : profile_count branch_count = profile_count::zero ();
5371 :
     : /* Split incoming execution counts into fallthru vs. branch.  */
5372 14837079 : FOR_EACH_EDGE (e, ei, bb->preds)
5373 : {
5374 9733805 : if (e->flags & EDGE_FALLTHRU)
5375 2482408 : has_fallthru = 1, fallthru_count += e->count ();
5376 : else
5377 7251397 : branch_count += e->count ();
5378 : }
5379 :
5380 5103274 : if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
5381 4811 : continue;
5382 :
5383 5098463 : if (bb->loop_father
5384 5098463 : && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
5385 6438084 : && (has_fallthru
5386 1339621 : ? (!(single_succ_p (bb)
5387 146732 : && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
5388 932723 : && optimize_bb_for_speed_p (bb)
5389 852243 : && branch_count + fallthru_count > count_threshold
5390 728300 : && (branch_count > fallthru_count * param_align_loop_iterations))
5391 : /* In case there's no fallthru for the loop,
5392 : nops inserted won't be executed. */
5393 406898 : : (branch_count > count_threshold
5394 137369 : || (bb->count > bb->prev_bb->count * 10
5395 12434 : && (bb->prev_bb->count
5396 4567705 : <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
5397 : {
5398 543192 : rtx_insn* insn, *end_insn;
5399 543192 : HOST_WIDE_INT size = 0;
5400 543192 : bool padding_p = true;
5401 543192 : basic_block tbb = bb;
5402 543192 : unsigned cond_branch_num = 0;
5403 543192 : bool detect_tight_loop_p = false;
5404 :
     : /* Walk the blocks of the loop in layout order, summing a
     :    lower bound of the encoded size, until we either come back
     :    to BB (a tight loop) or find a reason not to pad.  */
5405 857766 : for (unsigned int i = 0; i != bb->loop_father->num_nodes;
5406 314574 : i++, tbb = tbb->next_bb)
5407 : {
5408 : /* Only handle continuous cfg layout. */
5409 857766 : if (bb->loop_father != tbb->loop_father)
5410 : {
5411 : padding_p = false;
5412 : break;
5413 : }
5414 :
5415 10082020 : FOR_BB_INSNS (tbb, insn)
5416 : {
5417 9421869 : if (!NONDEBUG_INSN_P (insn))
5418 5398086 : continue;
5419 4023783 : size += ix86_min_insn_size (insn);
5420 :
5421 : /* We don't know size of inline asm.
5422 : Don't align loop for call. */
5423 4023783 : if (asm_noperands (PATTERN (insn)) >= 0
5424 4023783 : || CALL_P (insn))
5425 : {
5426 : size = -1;
5427 : break;
5428 : }
5429 : }
5430 :
     : /* Give up once the loop cannot fit in one cache line.  */
5431 817235 : if (size == -1 || size > ix86_cost->prefetch_block)
5432 : {
5433 : padding_p = false;
5434 : break;
5435 : }
5436 :
5437 1454095 : FOR_EACH_EDGE (e, ei, tbb->succs)
5438 : {
5439 : /* It could be part of the loop. */
5440 1002877 : if (e->dest == bb)
5441 : {
5442 : detect_tight_loop_p = true;
5443 : break;
5444 : }
5445 : }
5446 :
5447 634828 : if (detect_tight_loop_p)
5448 : break;
5449 :
5450 451218 : end_insn = BB_END (tbb);
5451 451218 : if (JUMP_P (end_insn))
5452 : {
5453 : /* For decoded icache:
5454 : 1. Up to two branches are allowed per Way.
5455 : 2. A non-conditional branch is the last micro-op in a Way.
5456 : */
5457 363540 : if (onlyjump_p (end_insn)
5458 363540 : && (any_uncondjump_p (end_insn)
5459 307820 : || single_succ_p (tbb)))
5460 : {
5461 : padding_p = false;
5462 : break;
5463 : }
5464 307820 : else if (++cond_branch_num >= 2)
5465 : {
5466 : padding_p = false;
5467 : break;
5468 : }
5469 : }
5470 :
5471 : }
5472 :
5473 543192 : if (padding_p && detect_tight_loop_p)
5474 : {
     : /* Align to the smallest power of two that covers the loop
     :    size, with no max-skip limit.  */
5475 367220 : emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
5476 : GEN_INT (0)), label);
5477 : /* End of function. */
5478 183610 : if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
5479 : break;
5480 : /* Skip bb which already fits into one cacheline. */
5481 : bb = tbb;
5482 : }
5483 : }
5484 : }
5485 :
5486 976233 : loop_optimizer_finalize ();
5487 976233 : free_dominance_info (CDI_DOMINATORS);
5488 : }
5489 :
5490 : namespace {
5491 :
5492 : const pass_data pass_data_align_tight_loops =
5493 : {
5494 : RTL_PASS, /* type */
5495 : "align_tight_loops", /* name */
5496 : OPTGROUP_NONE, /* optinfo_flags */
5497 : TV_MACH_DEP, /* tv_id */
5498 : 0, /* properties_required */
5499 : 0, /* properties_provided */
5500 : 0, /* properties_destroyed */
5501 : 0, /* todo_flags_start */
5502 : 0, /* todo_flags_finish */
5503 : };
5504 :
     : /* RTL pass wrapper around ix86_align_loops; gated on the tuning
     :    flag and speed optimization, and only effective when the target
     :    provides ASM_OUTPUT_MAX_SKIP_ALIGN.  */
5505 : class pass_align_tight_loops : public rtl_opt_pass
5506 : {
5507 : public:
5508 288047 : pass_align_tight_loops (gcc::context *ctxt)
5509 576094 : : rtl_opt_pass (pass_data_align_tight_loops, ctxt)
5510 : {}
5511 :
5512 : /* opt_pass methods: */
5513 1474422 : bool gate (function *) final override
5514 : {
5515 1474422 : return TARGET_ALIGN_TIGHT_LOOPS
5516 1473936 : && optimize
5517 2515432 : && optimize_function_for_speed_p (cfun);
5518 : }
5519 :
5520 976242 : unsigned int execute (function *) final override
5521 : {
5522 976242 : timevar_push (TV_MACH_DEP);
5523 : #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
5524 976242 : ix86_align_loops ();
5525 : #endif
5526 976242 : timevar_pop (TV_MACH_DEP);
5527 976242 : return 0;
5528 : }
5529 : }; // class pass_align_tight_loops
5530 :
5531 : } // anon namespace
5532 :
     : /* Factory entry point used by the pass manager (see passes.def):
     :    allocate the tight-loop alignment RTL pass.  */
5533 : rtl_opt_pass *
5534 288047 : make_pass_align_tight_loops (gcc::context *ctxt)
5535 : {
5536 288047 : return new pass_align_tight_loops (ctxt);
5537 : }
5538 :
5539 : /* This compares the priority of target features in function DECL1
5540 : and DECL2. It returns a positive value if DECL1 has higher priority,
5541 : a negative value if DECL2 has higher priority and 0 if they are the
5542 : same. */
5543 :
5544 : int
5545 5772 : ix86_compare_version_priority (tree decl1, tree decl2)
5546 : {
     : /* The priority is the builtin code computed from each decl's
     :    target attribute; no predicate chain is needed here.  */
5547 5772 : unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
5548 5772 : unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
5549 :
5550 5772 : return (int)priority1 - (int)priority2;
5551 : }
5552 :
5553 : /* V1 and V2 point to function versions with different priorities
5554 : based on the target ISA. This function compares their priorities.
     : qsort comparator: sorts in descending order of dispatch_priority. */
5555 :
5556 : static int
5557 6860 : feature_compare (const void *v1, const void *v2)
5558 : {
5559 6860 : typedef struct _function_version_info
5560 : {
5561 : tree version_decl;
5562 : tree predicate_chain;
5563 : unsigned int dispatch_priority;
5564 : } function_version_info;
5565 :
5566 6860 : const function_version_info c1 = *(const function_version_info *)v1;
5567 6860 : const function_version_info c2 = *(const function_version_info *)v2;
     : /* NOTE(review): unsigned subtraction converted to int — fine while
     :    priorities stay well below INT_MAX, which appears to hold for the
     :    builtin priority codes; confirm if the range ever grows.  */
5568 6860 : return (c2.dispatch_priority - c1.dispatch_priority);
5569 : }
5570 :
5571 : /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
5572 : to return a pointer to VERSION_DECL if the outcome of the expression
5573 : formed by PREDICATE_CHAIN is true. This function will be called during
5574 : version dispatch to decide which function version to execute. It returns
5575 : the basic block at the end, to which more conditions can be added. */
5576 :
5577 : static basic_block
5578 834 : add_condition_to_bb (tree function_decl, tree version_decl,
5579 : tree predicate_chain, basic_block new_bb)
5580 : {
5581 834 : gimple *return_stmt;
5582 834 : tree convert_expr, result_var;
5583 834 : gimple *convert_stmt;
5584 834 : gimple *call_cond_stmt;
5585 834 : gimple *if_else_stmt;
5586 :
5587 834 : basic_block bb1, bb2, bb3;
5588 834 : edge e12, e23;
5589 :
5590 834 : tree cond_var, and_expr_var = NULL_TREE;
5591 834 : gimple_seq gseq;
5592 :
5593 834 : tree predicate_decl, predicate_arg;
5594 :
5595 834 : push_cfun (DECL_STRUCT_FUNCTION (function_decl));
5596 :
5597 834 : gcc_assert (new_bb != NULL);
5598 834 : gseq = bb_seq (new_bb);
5599 :
5600 :
     : /* Build "return (void *) &version_decl;" statements up front; they
     :    are emitted either unconditionally (default version) or under the
     :    predicate condition built below.  */
5601 834 : convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
5602 : build_fold_addr_expr (version_decl));
5603 834 : result_var = create_tmp_var (ptr_type_node);
5604 834 : convert_stmt = gimple_build_assign (result_var, convert_expr);
5605 834 : return_stmt = gimple_build_return (result_var);
5606 :
     : /* No predicate: this is the default version, returned directly.  */
5607 834 : if (predicate_chain == NULL_TREE)
5608 : {
5609 200 : gimple_seq_add_stmt (&gseq, convert_stmt);
5610 200 : gimple_seq_add_stmt (&gseq, return_stmt);
5611 200 : set_bb_seq (new_bb, gseq);
5612 200 : gimple_set_bb (convert_stmt, new_bb);
5613 200 : gimple_set_bb (return_stmt, new_bb);
5614 200 : pop_cfun ();
5615 200 : return new_bb;
5616 : }
5617 :
     : /* Call each predicate in the chain and AND the results together
     :    (via MIN_EXPR) into and_expr_var.  */
5618 1307 : while (predicate_chain != NULL)
5619 : {
5620 673 : cond_var = create_tmp_var (integer_type_node);
5621 673 : predicate_decl = TREE_PURPOSE (predicate_chain);
5622 673 : predicate_arg = TREE_VALUE (predicate_chain);
5623 673 : call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
5624 673 : gimple_call_set_lhs (call_cond_stmt, cond_var);
5625 :
5626 673 : gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
5627 673 : gimple_set_bb (call_cond_stmt, new_bb);
5628 673 : gimple_seq_add_stmt (&gseq, call_cond_stmt);
5629 :
5630 673 : predicate_chain = TREE_CHAIN (predicate_chain);
5631 :
5632 673 : if (and_expr_var == NULL)
5633 : and_expr_var = cond_var;
5634 : else
5635 : {
5636 39 : gimple *assign_stmt;
5637 : /* Use MIN_EXPR to check if any integer is zero.
5638 : and_expr_var = min_expr <cond_var, and_expr_var> */
5639 39 : assign_stmt = gimple_build_assign (and_expr_var,
5640 : build2 (MIN_EXPR, integer_type_node,
5641 : cond_var, and_expr_var));
5642 :
5643 39 : gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
5644 39 : gimple_set_bb (assign_stmt, new_bb);
5645 39 : gimple_seq_add_stmt (&gseq, assign_stmt);
5646 : }
5647 : }
5648 :
     : /* if (and_expr_var > 0) return &version_decl;  */
5649 634 : if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
5650 : integer_zero_node,
5651 : NULL_TREE, NULL_TREE);
5652 634 : gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
5653 634 : gimple_set_bb (if_else_stmt, new_bb);
5654 634 : gimple_seq_add_stmt (&gseq, if_else_stmt);
5655 :
5656 634 : gimple_seq_add_stmt (&gseq, convert_stmt);
5657 634 : gimple_seq_add_stmt (&gseq, return_stmt);
5658 634 : set_bb_seq (new_bb, gseq);
5659 :
     : /* Split into: bb1 (condition) -> bb2 (return) -> bb3 (next test);
     :    the false edge of bb1 goes straight to bb3.  */
5660 634 : bb1 = new_bb;
5661 634 : e12 = split_block (bb1, if_else_stmt);
5662 634 : bb2 = e12->dest;
5663 634 : e12->flags &= ~EDGE_FALLTHRU;
5664 634 : e12->flags |= EDGE_TRUE_VALUE;
5665 :
5666 634 : e23 = split_block (bb2, return_stmt);
5667 :
5668 634 : gimple_set_bb (convert_stmt, bb2);
5669 634 : gimple_set_bb (return_stmt, bb2);
5670 :
5671 634 : bb3 = e23->dest;
5672 634 : make_edge (bb1, bb3, EDGE_FALSE_VALUE);
5673 :
5674 634 : remove_edge (e23);
5675 634 : make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
5676 :
5677 634 : pop_cfun ();
5678 :
5679 634 : return bb3;
5680 : }
5681 :
5682 : /* This function generates the dispatch function for
5683 : multi-versioned functions. DISPATCH_DECL is the function which will
5684 : contain the dispatch logic. FNDECLS are the function choices for
5685 : dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
5686 : in DISPATCH_DECL in which the dispatch code is generated. */
5687 :
5688 : static int
5689 200 : dispatch_function_versions (tree dispatch_decl,
5690 : void *fndecls_p,
5691 : basic_block *empty_bb)
5692 : {
5693 200 : tree default_decl;
5694 200 : gimple *ifunc_cpu_init_stmt;
5695 200 : gimple_seq gseq;
5696 200 : int ix;
5697 200 : tree ele;
5698 200 : vec<tree> *fndecls;
5699 200 : unsigned int num_versions = 0;
5700 200 : unsigned int actual_versions = 0;
5701 200 : unsigned int i;
5702 :
5703 200 : struct _function_version_info
5704 : {
5705 : tree version_decl;
5706 : tree predicate_chain;
5707 : unsigned int dispatch_priority;
5708 : }*function_version_info;
5709 :
5710 200 : gcc_assert (dispatch_decl != NULL
5711 : && fndecls_p != NULL
5712 : && empty_bb != NULL);
5713 :
5714 : /* fndecls_p is actually a vector. */
5715 200 : fndecls = static_cast<vec<tree> *> (fndecls_p);
5716 :
5717 : /* At least one more version other than the default. */
5718 200 : num_versions = fndecls->length ();
5719 200 : gcc_assert (num_versions >= 2);
5720 :
5721 200 : function_version_info = (struct _function_version_info *)
5722 200 : XNEWVEC (struct _function_version_info, (num_versions - 1));
5723 :
5724 : /* The first version in the vector is the default decl. */
5725 200 : default_decl = (*fndecls)[0];
5726 :
5727 200 : push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
5728 :
5729 200 : gseq = bb_seq (*empty_bb);
5730 : /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
5731 : constructors, so explicitly call __builtin_cpu_init here. */
5732 200 : ifunc_cpu_init_stmt
5733 200 : = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
5734 200 : gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
5735 200 : gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
5736 200 : set_bb_seq (*empty_bb, gseq);
5737 :
5738 200 : pop_cfun ();
5739 :
5740 :
     : /* Collect every non-default version together with its priority and
     :    predicate chain; versions without a predicate are skipped.  */
5741 991 : for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
5742 : {
5743 791 : tree version_decl = ele;
5744 791 : tree predicate_chain = NULL_TREE;
5745 791 : unsigned int priority;
5746 : /* Get attribute string, parse it and find the right predicate decl.
5747 : The predicate function could be a lengthy combination of many
5748 : features, like arch-type and various isa-variants. */
5749 791 : priority = get_builtin_code_for_version (version_decl,
5750 : &predicate_chain);
5751 :
5752 791 : if (predicate_chain == NULL_TREE)
5753 157 : continue;
5754 :
5755 634 : function_version_info [actual_versions].version_decl = version_decl;
5756 634 : function_version_info [actual_versions].predicate_chain
5757 634 : = predicate_chain;
5758 634 : function_version_info [actual_versions].dispatch_priority = priority;
5759 634 : actual_versions++;
5760 : }
5761 :
5762 : /* Sort the versions according to descending order of dispatch priority. The
5763 : priority is based on the ISA. This is not a perfect solution. There
5764 : could still be ambiguity. If more than one function version is suitable
5765 : to execute, which one should be dispatched? In future, allow the user
5766 : to specify a dispatch priority next to the version. */
5767 200 : qsort (function_version_info, actual_versions,
5768 : sizeof (struct _function_version_info), feature_compare);
5769 :
5770 1034 : for (i = 0; i < actual_versions; ++i)
5771 634 : *empty_bb = add_condition_to_bb (dispatch_decl,
5772 : function_version_info[i].version_decl,
5773 634 : function_version_info[i].predicate_chain,
5774 : *empty_bb);
5775 :
5776 : /* Dispatch the default version at the end. */
5777 200 : *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
5778 : NULL, *empty_bb);
5779 :
5780 200 : free (function_version_info);
5781 200 : return 0;
5782 : }
5783 :
5784 : /* This function changes the assembler name for functions that are
5785 : versions. If DECL is a function version and has a "target"
5786 : attribute, it appends the attribute string to its assembler name. */
5787 :
5788 : static tree
5789 1113 : ix86_mangle_function_version_assembler_name (tree decl, tree id)
5790 : {
5791 1113 : tree version_attr;
5792 1113 : char *attr_str;
5793 :
     : /* gnu_inline versions never get an out-of-line body, so they cannot
     :    be dispatched; reject them.  */
5794 1113 : if (DECL_DECLARED_INLINE_P (decl)
5795 1162 : && lookup_attribute ("gnu_inline",
5796 49 : DECL_ATTRIBUTES (decl)))
5797 0 : error_at (DECL_SOURCE_LOCATION (decl),
5798 : "function versions cannot be marked as %<gnu_inline%>,"
5799 : " bodies have to be generated");
5800 :
5801 1113 : if (DECL_VIRTUAL_P (decl)
5802 2226 : || DECL_VINDEX (decl))
5803 0 : sorry ("virtual function multiversioning not supported");
5804 :
5805 1113 : version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
5806 :
5807 : /* target attribute string cannot be NULL. */
5808 1113 : gcc_assert (version_attr != NULL_TREE);
5809 :
5810 1113 : attr_str = sorted_attr_string (TREE_VALUE (version_attr));
5811 :
5812 : /* Allow assembler name to be modified if already set. */
5813 1113 : if (DECL_ASSEMBLER_NAME_SET_P (decl))
5814 1098 : SET_DECL_RTL (decl, NULL);
5815 :
     : /* Append the sorted attribute string as a clone suffix.  */
5816 1113 : tree ret = clone_identifier (id, attr_str, true);
5817 :
5818 1113 : XDELETEVEC (attr_str);
5819 :
5820 1113 : return ret;
5821 : }
5822 :
     : /* Target hook: mangle the assembler name ID of DECL.  Adds the
     :    multiversioning suffixes (target string, "ifunc", "resolver")
     :    where appropriate, then applies any subtarget mangling.  */
5823 : tree
5824 482940034 : ix86_mangle_decl_assembler_name (tree decl, tree id)
5825 : {
5826 : /* For function version, add the target suffix to the assembler name. */
5827 482940034 : if (TREE_CODE (decl) == FUNCTION_DECL)
5828 : {
5829 448611536 : cgraph_node *node = cgraph_node::get (decl);
5830 : /* Mangle all versions when annotated with target_clones, but only
5831 : non-default versions when annotated with target attributes. */
     : /* NOTE(review): the first branch dereferences NODE without a null
     :    check while the later branches test "node &&" — presumably a
     :    versioned decl always has a cgraph node here; confirm.  */
5832 448611536 : if (DECL_FUNCTION_VERSIONED (decl)
5833 448611536 : && (node->is_target_clone
5834 1089 : || !is_function_default_version (node->decl)))
5835 1113 : id = ix86_mangle_function_version_assembler_name (decl, id);
5836 : /* Mangle the dispatched symbol but only in the case of target clones. */
5837 448610423 : else if (node && node->dispatcher_function && !node->is_target_clone)
5838 117 : id = clone_identifier (id, "ifunc");
5839 63723989 : else if (node && node->dispatcher_resolver_function)
5840 200 : id = clone_identifier (id, "resolver");
5841 : }
5842 : #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
5843 : id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
5844 : #endif
5845 :
5846 482940034 : return id;
5847 : }
5848 :
5849 : /* Make a dispatcher declaration for the multi-versioned function DECL.
5850 : Calls to DECL function will be replaced with calls to the dispatcher
5851 : by the front-end. Returns the decl of the dispatcher function. */
5852 :
5853 : tree
5854 326 : ix86_get_function_versions_dispatcher (void *decl)
5855 : {
5856 326 : tree fn = (tree) decl;
5857 326 : struct cgraph_node *node = NULL;
5858 326 : struct cgraph_node *default_node = NULL;
5859 326 : struct cgraph_function_version_info *node_v = NULL;
5860 :
5861 326 : tree dispatch_decl = NULL;
5862 :
5863 326 : struct cgraph_function_version_info *default_version_info = NULL;
5864 :
5865 652 : gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
5866 :
5867 326 : node = cgraph_node::get (fn);
5868 326 : gcc_assert (node != NULL);
5869 :
5870 326 : node_v = node->function_version ();
5871 326 : gcc_assert (node_v != NULL);
5872 :
     : /* Reuse an already-created dispatcher.  */
5873 326 : if (node_v->dispatcher_resolver != NULL)
5874 : return node_v->dispatcher_resolver;
5875 :
5876 : /* The default node is always the beginning of the chain. */
5877 : default_version_info = node_v;
5878 674 : while (default_version_info->prev != NULL)
5879 : default_version_info = default_version_info->prev;
5880 212 : default_node = default_version_info->this_node;
5881 :
5882 : /* If there is no default node, just return NULL. */
5883 212 : if (!is_function_default_version (default_node->decl))
5884 : return NULL;
5885 :
5886 : #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
5887 203 : if (targetm.has_ifunc_p ())
5888 : {
5889 203 : struct cgraph_function_version_info *it_v = NULL;
5890 :
5891 : /* Right now, the dispatching is done via ifunc. */
5892 203 : dispatch_decl = make_dispatcher_decl (default_node->decl);
5893 :
5894 : /* Set the dispatcher for all the versions. */
5895 203 : it_v = default_version_info;
5896 1403 : while (it_v != NULL)
5897 : {
5898 997 : it_v->dispatcher_resolver = dispatch_decl;
5899 997 : it_v = it_v->next;
5900 : }
5901 : }
5902 : else
5903 : #endif
5904 : {
     : /* No ifunc support: multiversioning cannot be dispatched and
     :    dispatch_decl stays NULL.  */
5905 0 : error_at (DECL_SOURCE_LOCATION (default_node->decl),
5906 : "multiversioning needs %<ifunc%> which is not supported "
5907 : "on this target");
5908 : }
5909 :
5910 : return dispatch_decl;
5911 : }
5912 :
/* Make the resolver function decl to dispatch the versions of
   a multi-versioned function, DEFAULT_DECL.  IFUNC_ALIAS_DECL is
   ifunc alias that will point to the created resolver.  Create an
   empty basic block in the resolver and store the pointer in
   EMPTY_BB.  Return the decl of the resolver function.  */

static tree
make_resolver_func (const tree default_decl,
		    const tree ifunc_alias_decl,
		    basic_block *empty_bb)
{
  tree decl, type, t;

  /* The resolver function should return a (void *).  */
  type = build_function_type_list (ptr_type_node, NULL_TREE);

  cgraph_node *node = cgraph_node::get (default_decl);
  gcc_assert (node && node->function_version ());

  /* Start from the default version's source name; the mangled resolver
     name is installed below.  */
  decl = build_fn_decl (IDENTIFIER_POINTER (DECL_NAME (default_decl)), type);

  /* Set the assembler name to prevent cgraph_node attempting to mangle.  */
  SET_DECL_ASSEMBLER_NAME (decl, DECL_ASSEMBLER_NAME (default_decl));

  cgraph_node *resolver_node = cgraph_node::get_create (decl);
  resolver_node->dispatcher_resolver_function = true;

  /* Propagate the target_clones origin from the default version to the
     resolver node.  */
  if (node->is_target_clone)
    resolver_node->is_target_clone = true;

  /* Now install the real resolver assembler name, derived from the
     version info's recorded assembler name.  */
  tree id = ix86_mangle_decl_assembler_name
    (decl, node->function_version ()->assembler_name);
  symtab->change_decl_assembler_name (decl, id);

  /* The resolver is compiler-generated: keep it artificial, hidden from
     debug output, local to this TU, and never inlined.  */
  DECL_NAME (decl) = DECL_NAME (default_decl);
  TREE_USED (decl) = 1;
  DECL_ARTIFICIAL (decl) = 1;
  DECL_IGNORED_P (decl) = 1;
  TREE_PUBLIC (decl) = 0;
  DECL_UNINLINABLE (decl) = 1;

  /* Resolver is not external, body is generated.  */
  DECL_EXTERNAL (decl) = 0;
  DECL_EXTERNAL (ifunc_alias_decl) = 0;

  DECL_CONTEXT (decl) = NULL_TREE;
  DECL_INITIAL (decl) = make_node (BLOCK);
  DECL_STATIC_CONSTRUCTOR (decl) = 0;

  if (DECL_COMDAT_GROUP (default_decl)
      || TREE_PUBLIC (default_decl))
    {
      /* In this case, each translation unit with a call to this
	 versioned function will put out a resolver.  Ensure it
	 is comdat to keep just one copy.  */
      DECL_COMDAT (decl) = 1;
      make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
    }
  else
    /* A purely local versioned function gets a purely local ifunc.  */
    TREE_PUBLIC (ifunc_alias_decl) = 0;

  /* Build result decl and add to function_decl.  */
  t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
  DECL_CONTEXT (t) = decl;
  DECL_ARTIFICIAL (t) = 1;
  DECL_IGNORED_P (t) = 1;
  DECL_RESULT (decl) = t;

  /* Materialize an empty lowered body for the resolver; the caller
     fills in the dispatch code via EMPTY_BB.  */
  gimplify_function_tree (decl);
  push_cfun (DECL_STRUCT_FUNCTION (decl));
  *empty_bb = init_lowered_empty_function (decl, false,
					   profile_count::uninitialized ());

  /* Register the new function with the callgraph and run any insertion
     hooks (e.g. for IPA passes already past this point).  */
  cgraph_node::add_new_function (decl, true);
  symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));

  pop_cfun ();

  gcc_assert (ifunc_alias_decl != NULL);
  /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name.  */
  DECL_ATTRIBUTES (ifunc_alias_decl)
    = make_attribute ("ifunc", IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)),
		      DECL_ATTRIBUTES (ifunc_alias_decl));

  /* Create the alias for dispatch to resolver here.  */
  cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
  return decl;
}
6001 :
6002 : /* Generate the dispatching code body to dispatch multi-versioned function
6003 : DECL. The target hook is called to process the "target" attributes and
6004 : provide the code to dispatch the right function at run-time. NODE points
6005 : to the dispatcher decl whose body will be created. */
6006 :
6007 : tree
6008 200 : ix86_generate_version_dispatcher_body (void *node_p)
6009 : {
6010 200 : tree resolver_decl;
6011 200 : basic_block empty_bb;
6012 200 : tree default_ver_decl;
6013 200 : struct cgraph_node *versn;
6014 200 : struct cgraph_node *node;
6015 :
6016 200 : struct cgraph_function_version_info *node_version_info = NULL;
6017 200 : struct cgraph_function_version_info *versn_info = NULL;
6018 :
6019 200 : node = (cgraph_node *)node_p;
6020 :
6021 200 : node_version_info = node->function_version ();
6022 200 : gcc_assert (node->dispatcher_function
6023 : && node_version_info != NULL);
6024 :
6025 200 : if (node_version_info->dispatcher_resolver)
6026 : return node_version_info->dispatcher_resolver;
6027 :
6028 : /* The first version in the chain corresponds to the default version. */
6029 200 : default_ver_decl = node_version_info->next->this_node->decl;
6030 :
6031 : /* node is going to be an alias, so remove the finalized bit. */
6032 200 : node->definition = false;
6033 :
6034 200 : resolver_decl = make_resolver_func (default_ver_decl,
6035 : node->decl, &empty_bb);
6036 :
6037 200 : node_version_info->dispatcher_resolver = resolver_decl;
6038 :
6039 200 : push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
6040 :
6041 200 : auto_vec<tree, 2> fn_ver_vec;
6042 :
6043 1191 : for (versn_info = node_version_info->next; versn_info;
6044 991 : versn_info = versn_info->next)
6045 : {
6046 991 : versn = versn_info->this_node;
6047 : /* Check for virtual functions here again, as by this time it should
6048 : have been determined if this function needs a vtable index or
6049 : not. This happens for methods in derived classes that override
6050 : virtual methods in base classes but are not explicitly marked as
6051 : virtual. */
6052 991 : if (DECL_VIRTUAL_P (versn->decl))
6053 0 : sorry ("virtual function multiversioning not supported");
6054 :
6055 991 : fn_ver_vec.safe_push (versn->decl);
6056 : }
6057 :
6058 200 : dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
6059 200 : cgraph_edge::rebuild_edges ();
6060 200 : pop_cfun ();
6061 200 : return resolver_decl;
6062 200 : }
6063 :
6064 :
|