Line data Source code
1 : /* Copyright (C) 1988-2026 Free Software Foundation, Inc.
2 :
3 : This file is part of GCC.
4 :
5 : GCC is free software; you can redistribute it and/or modify
6 : it under the terms of the GNU General Public License as published by
7 : the Free Software Foundation; either version 3, or (at your option)
8 : any later version.
9 :
10 : GCC is distributed in the hope that it will be useful,
11 : but WITHOUT ANY WARRANTY; without even the implied warranty of
12 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 : GNU General Public License for more details.
14 :
15 : You should have received a copy of the GNU General Public License
16 : along with GCC; see the file COPYING3. If not see
17 : <http://www.gnu.org/licenses/>. */
18 :
19 : #define IN_TARGET_CODE 1
20 :
21 : #include "config.h"
22 : #include "system.h"
23 : #include "coretypes.h"
24 : #include "backend.h"
25 : #include "rtl.h"
26 : #include "tree.h"
27 : #include "memmodel.h"
28 : #include "gimple.h"
29 : #include "cfghooks.h"
30 : #include "cfgloop.h"
31 : #include "df.h"
32 : #include "tm_p.h"
33 : #include "stringpool.h"
34 : #include "expmed.h"
35 : #include "optabs.h"
36 : #include "regs.h"
37 : #include "emit-rtl.h"
38 : #include "recog.h"
39 : #include "cgraph.h"
40 : #include "diagnostic.h"
41 : #include "cfgbuild.h"
42 : #include "alias.h"
43 : #include "fold-const.h"
44 : #include "attribs.h"
45 : #include "calls.h"
46 : #include "stor-layout.h"
47 : #include "varasm.h"
48 : #include "output.h"
49 : #include "insn-attr.h"
50 : #include "flags.h"
51 : #include "except.h"
52 : #include "explow.h"
53 : #include "expr.h"
54 : #include "cfgrtl.h"
55 : #include "common/common-target.h"
56 : #include "langhooks.h"
57 : #include "reload.h"
58 : #include "gimplify.h"
59 : #include "dwarf2.h"
60 : #include "tm-constrs.h"
61 : #include "cselib.h"
62 : #include "sched-int.h"
63 : #include "opts.h"
64 : #include "tree-pass.h"
65 : #include "context.h"
66 : #include "pass_manager.h"
67 : #include "target-globals.h"
68 : #include "gimple-iterator.h"
69 : #include "shrink-wrap.h"
70 : #include "builtins.h"
71 : #include "rtl-iter.h"
72 : #include "tree-iterator.h"
73 : #include "dbgcnt.h"
74 : #include "case-cfn-macros.h"
75 : #include "dojump.h"
76 : #include "fold-const-call.h"
77 : #include "tree-vrp.h"
78 : #include "tree-ssanames.h"
79 : #include "selftest.h"
80 : #include "selftest-rtl.h"
81 : #include "print-rtl.h"
82 : #include "intl.h"
83 : #include "ifcvt.h"
84 : #include "symbol-summary.h"
85 : #include "sreal.h"
86 : #include "ipa-cp.h"
87 : #include "ipa-prop.h"
88 : #include "ipa-fnsummary.h"
89 : #include "wide-int-bitmask.h"
90 : #include "tree-vector-builder.h"
91 : #include "debug.h"
92 : #include "dwarf2out.h"
93 : #include "i386-builtins.h"
94 : #include "i386-features.h"
95 : #include "i386-expand.h"
96 :
/* Base names of the out-of-line save/restore stubs.  get_stub_name indexes
   this table with an enum xlogue_stub value and formats the entry as
   "__<sse|avx>_<base name>_<register count>".  */
97 : const char * const xlogue_layout::STUB_BASE_NAMES[XLOGUE_STUB_COUNT] = {
98 : "savms64",
99 : "resms64",
100 : "resms64x",
101 : "savms64f",
102 : "resms64f",
103 : "resms64fx"
104 : };
105 :
/* Order in which the stub-managed registers are laid out.  The
   xlogue_layout constructor walks this array front to back, advancing the
   running offset by 16 for each SSE register and by 8 for every other
   register, and skips BP_REG for the layouts built with hfp == true.  */
106 : const unsigned xlogue_layout::REG_ORDER[xlogue_layout::MAX_REGS] = {
107 : /* The below offset values are where each register is stored for the layout
108 : relative to incoming stack pointer. The value of each m_regs[].offset will
109 : be relative to the incoming base pointer (rax or rsi) used by the stub.
110 :
111 : s_instances: 0 1 2 3
112 : Offset: realigned or aligned + 8
113 : Register aligned aligned + 8 aligned w/HFP w/HFP */
114 : XMM15_REG, /* 0x10 0x18 0x10 0x18 */
115 : XMM14_REG, /* 0x20 0x28 0x20 0x28 */
116 : XMM13_REG, /* 0x30 0x38 0x30 0x38 */
117 : XMM12_REG, /* 0x40 0x48 0x40 0x48 */
118 : XMM11_REG, /* 0x50 0x58 0x50 0x58 */
119 : XMM10_REG, /* 0x60 0x68 0x60 0x68 */
120 : XMM9_REG, /* 0x70 0x78 0x70 0x78 */
121 : XMM8_REG, /* 0x80 0x88 0x80 0x88 */
122 : XMM7_REG, /* 0x90 0x98 0x90 0x98 */
123 : XMM6_REG, /* 0xa0 0xa8 0xa0 0xa8 */
124 : SI_REG, /* 0xa8 0xb0 0xa8 0xb0 */
125 : DI_REG, /* 0xb0 0xb8 0xb0 0xb8 */
126 : BX_REG, /* 0xb8 0xc0 0xb8 0xc0 */
127 : BP_REG, /* 0xc0 0xc8 N/A N/A */
128 : R12_REG, /* 0xc8 0xd0 0xc0 0xc8 */
129 : R13_REG, /* 0xd0 0xd8 0xc8 0xd0 */
130 : R14_REG, /* 0xd8 0xe0 0xd0 0xd8 */
131 : R15_REG, /* 0xe0 0xe8 0xd8 0xe0 */
132 : };
133 :
/* Out-of-class definitions for the static const data members, none of
   which carries an initializer here.
   NOTE(review): presumably the values are given in the class declaration
   in the header -- confirm.  */
134 : /* Instantiate static const values. */
135 : const HOST_WIDE_INT xlogue_layout::STUB_INDEX_OFFSET;
136 : const unsigned xlogue_layout::MIN_REGS;
137 : const unsigned xlogue_layout::MAX_REGS;
138 : const unsigned xlogue_layout::MAX_EXTRA_REGS;
139 : const unsigned xlogue_layout::VARIANT_COUNT;
140 : const unsigned xlogue_layout::STUB_NAME_MAX_LEN;
141 :
/* Cache of formatted stub names, indexed [avx?][stub][n_extra_regs].
   get_stub_name treats an entry whose first byte is NUL as "not yet
   formatted" and fills it in on first use.  */
142 : /* Initialize xlogue_layout::s_stub_names to zero. */
143 : char xlogue_layout::s_stub_names[2][XLOGUE_STUB_COUNT][VARIANT_COUNT]
144 : [STUB_NAME_MAX_LEN];
145 :
/* One layout per enum xlogue_stub_sets value; get_instance returns
   s_instances[stub_set].  The constructor arguments are the incoming
   stack alignment offset (0 or 8) and whether the layout is for a hard
   frame pointer.  */
146 : /* Instantiates all xlogue_layout instances. */
147 : const xlogue_layout xlogue_layout::s_instances[XLOGUE_SET_COUNT] = {
148 : xlogue_layout (0, false),
149 : xlogue_layout (8, false),
150 : xlogue_layout (0, true),
151 : xlogue_layout (8, true)
152 : };
153 :
154 : /* Return an appropriate const instance of xlogue_layout based upon values
155 : in cfun->machine and crtl. */
156 : const class xlogue_layout &
157 49891 : xlogue_layout::get_instance ()
158 : {
159 49891 : enum xlogue_stub_sets stub_set;
160 49891 : bool aligned_plus_8 = cfun->machine->call_ms2sysv_pad_in;
161 :
162 49891 : if (stack_realign_fp)
163 : stub_set = XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
164 40910 : else if (frame_pointer_needed)
165 25246 : stub_set = aligned_plus_8
166 31552 : ? XLOGUE_SET_HFP_ALIGNED_PLUS_8
167 : : XLOGUE_SET_HFP_ALIGNED_OR_REALIGN;
168 : else
169 9358 : stub_set = aligned_plus_8 ? XLOGUE_SET_ALIGNED_PLUS_8 : XLOGUE_SET_ALIGNED;
170 :
171 49891 : return s_instances[stub_set];
172 : }
173 :
174 : /* Determine how many clobbered registers can be saved by the stub.
175 : Returns the count of registers the stub will save and restore. */
176 : unsigned
177 35225 : xlogue_layout::count_stub_managed_regs ()
178 : {
179 35225 : bool hfp = frame_pointer_needed || stack_realign_fp;
180 35225 : unsigned i, count;
181 35225 : unsigned regno;
182 :
183 94890 : for (count = i = MIN_REGS; i < MAX_REGS; ++i)
184 : {
185 93670 : regno = REG_ORDER[i];
186 93670 : if (regno == BP_REG && hfp)
187 18200 : continue;
188 75470 : if (!ix86_save_reg (regno, false, false))
189 : break;
190 41465 : ++count;
191 : }
192 35225 : return count;
193 : }
194 :
195 : /* Determine if register REGNO is a stub managed register given the
196 : total COUNT of stub managed registers. */
197 : bool
198 2641728 : xlogue_layout::is_stub_managed_reg (unsigned regno, unsigned count)
199 : {
200 2641728 : bool hfp = frame_pointer_needed || stack_realign_fp;
201 2641728 : unsigned i;
202 :
203 34456982 : for (i = 0; i < count; ++i)
204 : {
205 32315123 : gcc_assert (i < MAX_REGS);
206 32315123 : if (REG_ORDER[i] == BP_REG && hfp)
207 519694 : ++count;
208 31795429 : else if (REG_ORDER[i] == regno)
209 : return true;
210 : }
211 : return false;
212 : }
213 :
214 : /* Constructor for xlogue_layout. */
215 1190632 : xlogue_layout::xlogue_layout (HOST_WIDE_INT stack_align_off_in, bool hfp)
216 1190632 : : m_hfp (hfp) , m_nregs (hfp ? 17 : 18),
217 1190632 : m_stack_align_off_in (stack_align_off_in)
218 : {
219 1190632 : HOST_WIDE_INT offset = stack_align_off_in;
220 1190632 : unsigned i, j;
221 :
222 22622008 : for (i = j = 0; i < MAX_REGS; ++i)
223 : {
224 21431376 : unsigned regno = REG_ORDER[i];
225 :
226 21431376 : if (regno == BP_REG && hfp)
227 595316 : continue;
228 20836060 : if (SSE_REGNO_P (regno))
229 : {
230 11906320 : offset += 16;
231 : /* Verify that SSE regs are always aligned. */
232 11906320 : gcc_assert (!((stack_align_off_in + offset) & 15));
233 : }
234 : else
235 8929740 : offset += 8;
236 :
237 20836060 : m_regs[j].regno = regno;
238 20836060 : m_regs[j++].offset = offset - STUB_INDEX_OFFSET;
239 : }
240 1190632 : gcc_assert (j == m_nregs);
241 1190632 : }
242 :
243 : const char *
244 14666 : xlogue_layout::get_stub_name (enum xlogue_stub stub,
245 : unsigned n_extra_regs)
246 : {
247 14666 : const int have_avx = TARGET_AVX;
248 14666 : char *name = s_stub_names[!!have_avx][stub][n_extra_regs];
249 :
250 : /* Lazy init */
251 14666 : if (!*name)
252 : {
253 362 : int res = snprintf (name, STUB_NAME_MAX_LEN, "__%s_%s_%u",
254 : (have_avx ? "avx" : "sse"),
255 181 : STUB_BASE_NAMES[stub],
256 : MIN_REGS + n_extra_regs);
257 181 : gcc_checking_assert (res < (int)STUB_NAME_MAX_LEN);
258 : }
259 :
260 14666 : return name;
261 : }
262 :
263 : /* Return rtx of a symbol ref for the entry point (based upon
264 : cfun->machine->call_ms2sysv_extra_regs) of the specified stub. */
265 : rtx
266 14666 : xlogue_layout::get_stub_rtx (enum xlogue_stub stub)
267 : {
268 14666 : const unsigned n_extra_regs = cfun->machine->call_ms2sysv_extra_regs;
269 14666 : gcc_checking_assert (n_extra_regs <= MAX_EXTRA_REGS);
270 14666 : gcc_assert (stub < XLOGUE_STUB_COUNT);
271 14666 : gcc_assert (crtl->stack_realign_finalized);
272 :
273 14666 : return gen_rtx_SYMBOL_REF (Pmode, get_stub_name (stub, n_extra_regs));
274 : }
275 :
/* Source of chain ids: the scalar_chain constructor pre-increments this
   counter and stores the result in chain_id.  */
276 : unsigned scalar_chain::max_id = 0;
277 :
278 : namespace {
279 :
280 : /* Initialize new chain. */
281 :
282 6314062 : scalar_chain::scalar_chain (enum machine_mode smode_, enum machine_mode vmode_)
283 : {
284 6314062 : smode = smode_;
285 6314062 : vmode = vmode_;
286 :
287 6314062 : chain_id = ++max_id;
288 :
289 6314062 : if (dump_file)
290 136 : fprintf (dump_file, "Created a new instruction chain #%d\n", chain_id);
291 :
292 6314062 : bitmap_obstack_initialize (NULL);
293 6314062 : insns = BITMAP_ALLOC (NULL);
294 6314062 : defs = BITMAP_ALLOC (NULL);
295 6314062 : defs_conv = BITMAP_ALLOC (NULL);
296 6314062 : insns_conv = BITMAP_ALLOC (NULL);
297 6314062 : queue = NULL;
298 :
299 6314062 : cost_sse_integer = 0;
300 6314062 : weighted_cost_sse_integer = 0 ;
301 6314062 : max_visits = x86_stv_max_visits;
302 6314062 : }
303 :
304 : /* Free chain's data. */
305 :
306 6314062 : scalar_chain::~scalar_chain ()
307 : {
308 6314062 : BITMAP_FREE (insns);
309 6314062 : BITMAP_FREE (defs);
310 6314062 : BITMAP_FREE (defs_conv);
311 6314062 : BITMAP_FREE (insns_conv);
312 6314062 : bitmap_obstack_release (NULL);
313 6314062 : }
314 :
315 : /* Add instruction into chains' queue. */
316 :
317 : void
318 8142055 : scalar_chain::add_to_queue (unsigned insn_uid)
319 : {
320 8142055 : if (!bitmap_set_bit (queue, insn_uid))
321 : return;
322 :
323 6155053 : if (dump_file)
324 141 : fprintf (dump_file, " Adding insn %d into chain's #%d queue\n",
325 : insn_uid, chain_id);
326 : }
327 :
328 : /* For DImode conversion, mark register defined by DEF as requiring
329 : conversion. */
/* Besides setting the register's bit in defs_conv, this charges the cost
   of the extra move to cost_sse_integer and, for blocks optimized for
   speed, to weighted_cost_sse_integer scaled by the block's count
   relative to the function entry count.  Which move is charged depends
   on whether DEF's insn is part of the chain (sse to integer) or outside
   of it (integer to sse).  */
330 :
331 : void
332 9191797 : scalar_chain::mark_dual_mode_def (df_ref def)
333 : {
334 9191797 : gcc_assert (DF_REF_REG_DEF_P (def));
335 :
336 : /* Record the def/insn pair so we can later efficiently iterate over
337 : the defs to convert on insns not in the chain. */
/* reg_new is true only the first time this register is marked.  */
338 9191797 : bool reg_new = bitmap_set_bit (defs_conv, DF_REF_REGNO (def));
339 9191797 : basic_block bb = BLOCK_FOR_INSN (DF_REF_INSN (def));
340 9191797 : profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
341 9191797 : bool speed_p = optimize_bb_for_speed_p (bb);
342 9191797 : int cost = 0;
343 :
/* The defining insn is outside the chain.  */
344 9191797 : if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def)))
345 : {
/* Nothing new to charge when both the insn and the register were
   already recorded.  The insn is added to insns_conv either way.  */
346 2678878 : if (!bitmap_set_bit (insns_conv, DF_REF_INSN_UID (def))
347 2678878 : && !reg_new)
348 1360489 : return;
349 :
350 : /* Cost integer to sse moves. */
351 2442098 : if (speed_p)
352 2165721 : cost = COSTS_N_INSNS (ix86_cost->integer_to_sse) / 2;
353 276377 : else if (TARGET_64BIT || smode == SImode)
354 : cost = COSTS_N_BYTES (4);
355 : /* vmovd (4 bytes) + vpinsrd (6 bytes). */
356 18654 : else if (TARGET_SSE4_1)
357 : cost = COSTS_N_BYTES (10);
358 : /* movd (4 bytes) + movd (4 bytes) + unpckldq (4 bytes). */
359 : else
360 7831308 : cost = COSTS_N_BYTES (12);
361 : }
/* The defining insn is part of the chain.  */
362 : else
363 : {
/* The register was already charged by an earlier call.  */
364 6512919 : if (!reg_new)
365 : return;
366 :
367 : /* Cost sse to integer moves. */
368 5389210 : if (speed_p)
369 4838286 : cost = COSTS_N_INSNS (ix86_cost->sse_to_integer) / 2;
370 550924 : else if (TARGET_64BIT || smode == SImode)
371 : cost = COSTS_N_BYTES (4);
372 : /* vmovd (4 bytes) + vpextrd (6 bytes). */
373 2999 : else if (TARGET_SSE4_1)
374 : cost = COSTS_N_BYTES (10);
375 : /* movd (4 bytes) + psrlq (5 bytes) + movd (4 bytes). */
376 : else
377 7831308 : cost = COSTS_N_BYTES (13);
378 : }
379 :
/* The frequency-weighted total is only updated for blocks optimized for
   speed; the unweighted total is always updated.  */
380 7831308 : if (speed_p)
381 7004007 : weighted_cost_sse_integer += bb->count.to_sreal_scale (entry_count) * cost;
382 :
383 7831308 : cost_sse_integer += cost;
384 :
385 7831308 : if (dump_file)
386 240 : fprintf (dump_file,
387 : " Mark r%d def in insn %d as requiring both modes in chain #%d\n",
388 240 : DF_REF_REGNO (def), DF_REF_INSN_UID (def), chain_id);
389 : }
390 :
391 : /* Check REF's chain to add new insns into a queue
392 : and find registers requiring conversion. Return true if OK, false
393 : if the analysis was aborted. */
/* CANDIDATES holds the uids of insns that may still join the chain and
   DISALLOWED the uids that force this analysis to give up.  The analysis
   also gives up once the max_visits budget is exhausted.  REF's insn
   must already be in the chain.  */
394 :
395 : bool
396 17574674 : scalar_chain::analyze_register_chain (bitmap candidates, df_ref ref,
397 : bitmap disallowed)
398 : {
399 17574674 : df_link *chain;
400 17574674 : bool mark_def = false;
401 :
402 17574674 : gcc_checking_assert (bitmap_bit_p (insns, DF_REF_INSN_UID (ref)));
403 :
404 60602178 : for (chain = DF_REF_CHAIN (ref); chain; chain = chain->next)
405 : {
406 43031373 : unsigned uid = DF_REF_INSN_UID (chain->ref);
407 :
/* Insns that are not NONDEBUG_INSN_P are skipped and do not count
   against the visit budget.  */
408 43031373 : if (!NONDEBUG_INSN_P (DF_REF_INSN (chain->ref)))
409 7576709 : continue;
410 :
/* The budget is a member, so it is shared by every call made while
   this chain is being built.  */
411 35454664 : if (--max_visits == 0)
412 : return false;
413 :
/* Refs for which DF_REF_REG_MEM_P holds skip the membership tests and
   go straight to the def/use handling below.  */
414 35454104 : if (!DF_REF_REG_MEM_P (chain->ref))
415 : {
/* Already part of this chain.  */
416 29589528 : if (bitmap_bit_p (insns, uid))
417 9438995 : continue;
418 :
/* Still a candidate: queue it so that it is added to the chain later.  */
419 20150533 : if (bitmap_bit_p (candidates, uid))
420 : {
421 8142055 : add_to_queue (uid);
422 8142055 : continue;
423 : }
424 :
425 : /* If we run into parts of an aborted chain discovery abort. */
426 12008478 : if (bitmap_bit_p (disallowed, uid))
427 : return false;
428 : }
429 :
/* The linked ref cannot be converted.  A non-convertible def is marked
   right away; a non-convertible use means REF itself needs both modes,
   which is recorded once after the loop.  */
430 17869745 : if (DF_REF_REG_DEF_P (chain->ref))
431 : {
432 2678878 : if (dump_file)
433 125 : fprintf (dump_file, " r%d def in insn %d isn't convertible\n",
434 : DF_REF_REGNO (chain->ref), uid);
435 2678878 : mark_dual_mode_def (chain->ref);
436 : }
437 : else
438 : {
439 15190867 : if (dump_file)
440 524 : fprintf (dump_file, " r%d use in insn %d isn't convertible\n",
441 : DF_REF_REGNO (chain->ref), uid);
442 : mark_def = true;
443 : }
444 : }
445 :
446 17570805 : if (mark_def)
447 6512919 : mark_dual_mode_def (ref);
448 :
449 : return true;
450 : }
451 :
452 : /* Check whether X is a convertible *concatditi_? variant. X is known
453 : to be any_or_plus:TI, i.e. PLUS:TI, IOR:TI or XOR:TI. */
454 :
455 : static bool
456 30008 : timode_concatdi_p (rtx x)
457 : {
458 30008 : rtx op0 = XEXP (x, 0);
459 30008 : rtx op1 = XEXP (x, 1);
460 :
461 30008 : if (GET_CODE (op1) == ASHIFT)
462 952 : std::swap (op0, op1);
463 :
464 30008 : return GET_CODE (op0) == ASHIFT
465 21083 : && GET_CODE (XEXP (op0, 0)) == ZERO_EXTEND
466 21083 : && GET_MODE (XEXP (XEXP (op0, 0), 0)) == DImode
467 21083 : && REG_P (XEXP (XEXP (op0, 0), 0))
468 20952 : && CONST_INT_P (XEXP (op0, 1))
469 20952 : && INTVAL (XEXP (op0, 1)) == 64
470 20952 : && GET_CODE (op1) == ZERO_EXTEND
471 20000 : && GET_MODE (XEXP (op1, 0)) == DImode
472 50008 : && REG_P (XEXP (op1, 0));
473 : }
474 :
475 :
476 : /* Add instruction into a chain. Return true if OK, false if the search
477 : was aborted. */
/* CANDIDATES and DISALLOWED are handed through to analyze_register_chain.
   An insn that is already in the chain is accepted without further
   work.  */
478 :
479 : bool
480 12464983 : scalar_chain::add_insn (bitmap candidates, unsigned int insn_uid,
481 : bitmap disallowed)
482 : {
483 12464983 : if (!bitmap_set_bit (insns, insn_uid))
484 : return true;
485 :
486 12464983 : if (dump_file)
487 277 : fprintf (dump_file, " Adding insn %d to chain #%d\n", insn_uid, chain_id);
488 :
489 12464983 : rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
490 12464983 : rtx def_set = single_set (insn);
/* Record the destination in defs when it is a register that is not a
   hard register.  */
491 12464983 : if (def_set && REG_P (SET_DEST (def_set))
492 22054876 : && !HARD_REGISTER_P (SET_DEST (def_set)))
493 9564994 : bitmap_set_bit (defs, REGNO (SET_DEST (def_set)));
494 :
495 : /* ??? The following is quadratic since analyze_register_chain
496 : iterates over all refs to look for dual-mode regs. Instead this
497 : should be done separately for all regs mentioned in the chain once. */
498 12464983 : df_ref ref;
/* Follow every def of the insn that is not a hard register.  */
499 25458130 : for (ref = DF_INSN_UID_DEFS (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
500 12994544 : if (!HARD_REGISTER_P (DF_REF_REG (ref)))
501 9564994 : if (!analyze_register_chain (candidates, ref, disallowed))
502 : return false;
503 :
504 : /* The operand(s) of VEC_SELECT, ZERO_EXTEND and similar ops don't need
505 : to be converted/convertible. */
/* Each early return below accepts the insn without analyzing its uses.  */
506 12463586 : if (def_set)
507 12463586 : switch (GET_CODE (SET_SRC (def_set)))
508 : {
509 3721207 : case REG:
510 3721207 : if (HARD_REGISTER_P (SET_SRC (def_set)))
511 : return true;
512 : break;
513 : case VEC_SELECT:
514 : return true;
515 264 : case ZERO_EXTEND:
516 264 : if (GET_MODE (XEXP (SET_SRC (def_set), 0)) == DImode)
517 : return true;
518 : break;
519 2334392 : case PLUS:
520 2334392 : case IOR:
521 2334392 : case XOR:
522 2334392 : if (smode == TImode && timode_concatdi_p (SET_SRC (def_set)))
523 : return true;
524 : break;
525 : default:
526 : break;
527 : }
528 :
/* Follow the register uses of the insn, leaving out SUBREG references
   and refs whose type is not DF_REF_REG_USE.  */
529 27277480 : for (ref = DF_INSN_UID_USES (insn_uid); ref; ref = DF_REF_NEXT_LOC (ref))
530 14882853 : if (DF_REF_TYPE (ref) == DF_REF_REG_USE
531 8009683 : && !SUBREG_P (DF_REF_REG (ref)))
532 8009680 : if (!analyze_register_chain (candidates, ref, disallowed))
533 : return false;
534 :
535 : return true;
536 : }
537 :
538 : /* Build new chain starting from insn INSN_UID recursively
539 : adding all dependent uses and definitions. Return true if OK, false
540 : if the chain discovery was aborted. */
541 :
542 : bool
543 6314062 : scalar_chain::build (bitmap candidates, unsigned insn_uid, bitmap disallowed)
544 : {
545 6314062 : queue = BITMAP_ALLOC (NULL);
546 6314062 : bitmap_set_bit (queue, insn_uid);
547 :
548 6314062 : if (dump_file)
549 136 : fprintf (dump_file, "Building chain #%d...\n", chain_id);
550 :
551 18775176 : while (!bitmap_empty_p (queue))
552 : {
553 12464983 : insn_uid = bitmap_first_set_bit (queue);
554 12464983 : bitmap_clear_bit (queue, insn_uid);
555 12464983 : bitmap_clear_bit (candidates, insn_uid);
556 12464983 : if (!add_insn (candidates, insn_uid, disallowed))
557 : {
558 : /* If we aborted the search put sofar found insn on the set of
559 : disallowed insns so that further searches reaching them also
560 : abort and thus we abort the whole but yet undiscovered chain. */
561 3869 : bitmap_ior_into (disallowed, insns);
562 3869 : if (dump_file)
563 0 : fprintf (dump_file, "Aborted chain #%d discovery\n", chain_id);
564 3869 : BITMAP_FREE (queue);
565 3869 : return false;
566 : }
567 : }
568 :
569 6310193 : if (dump_file)
570 : {
571 136 : fprintf (dump_file, "Collected chain #%d...\n", chain_id);
572 136 : fprintf (dump_file, " insns: ");
573 136 : dump_bitmap (dump_file, insns);
574 136 : if (!bitmap_empty_p (defs_conv))
575 : {
576 136 : bitmap_iterator bi;
577 136 : unsigned id;
578 136 : const char *comma = "";
579 136 : fprintf (dump_file, " defs to convert: ");
580 366 : EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
581 : {
582 230 : fprintf (dump_file, "%sr%d", comma, id);
583 230 : comma = ", ";
584 : }
585 136 : fprintf (dump_file, "\n");
586 : }
587 : }
588 :
589 6310193 : BITMAP_FREE (queue);
590 :
591 6310193 : return true;
592 : }
593 :
594 : /* Return a cost of building a vector constant
595 : instead of using a scalar one. */
596 :
597 : int
598 2601966 : general_scalar_chain::vector_const_cost (rtx exp, basic_block bb)
599 : {
600 2601966 : gcc_assert (CONST_INT_P (exp));
601 :
602 2601966 : if (standard_sse_constant_p (exp, vmode))
603 607468 : return ix86_cost->sse_op;
604 1994498 : if (optimize_bb_for_size_p (bb))
605 : return COSTS_N_BYTES (8);
606 : /* We have separate costs for SImode and DImode, use SImode costs
607 : for smaller modes. */
608 2372852 : return COSTS_N_INSNS (ix86_cost->sse_load[smode == DImode ? 1 : 0]) / 2;
609 : }
610 :
611 : /* Return true if it's cost profitable for chain conversion. */
/* Every insn in the chain contributes a per-insn gain (igain): positive
   when the vector form is estimated to be cheaper than the scalar form,
   negative otherwise.  igain is added to `gain' for every insn and, for
   insns in blocks optimized for speed, to `weighted_gain' scaled by the
   block's count relative to the function entry count.  The totals are
   then compared with the register conversion costs accumulated in
   cost_sse_integer and weighted_cost_sse_integer.  */
612 :
613 : bool
614 5810710 : general_scalar_chain::compute_convert_gain ()
615 : {
616 5810710 : bitmap_iterator bi;
617 5810710 : unsigned insn_uid;
618 5810710 : int gain = 0;
619 5810710 : sreal weighted_gain = 0;
620 :
621 5810710 : if (dump_file)
622 136 : fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
623 :
624 : /* SSE costs distinguish between SImode and DImode loads/stores, for
625 : int costs factor in the number of GPRs involved. When supporting
626 : smaller modes than SImode the int load/store costs need to be
627 : adjusted as well. */
/* m is 2 only for DImode chains when !TARGET_64BIT, and 1 otherwise.  */
628 5810710 : unsigned sse_cost_idx = smode == DImode ? 1 : 0;
629 5810710 : int m = smode == DImode ? (TARGET_64BIT ? 1 : 2) : 1;
630 :
631 17266671 : EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
632 : {
633 11455961 : rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
/* NOTE(review): def_set is dereferenced without a null check, so every
   chain insn is presumably guaranteed to be a single set -- confirm
   against the candidate selection code.  */
634 11455961 : rtx def_set = single_set (insn);
635 11455961 : rtx src = SET_SRC (def_set);
636 11455961 : rtx dst = SET_DEST (def_set);
637 11455961 : basic_block bb = BLOCK_FOR_INSN (insn);
638 11455961 : int igain = 0;
639 11455961 : profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
640 11455961 : bool speed_p = optimize_bb_for_speed_p (bb);
641 11455961 : sreal bb_freq = bb->count.to_sreal_scale (entry_count);
642 :
/* Plain moves are costed first: reg-reg, store, then load.  */
643 11455961 : if (REG_P (src) && REG_P (dst))
644 : {
645 900055 : if (!speed_p)
646 : /* reg-reg move is 2 bytes, while SSE 3. */
647 182066 : igain += COSTS_N_BYTES (2 * m - 3);
648 : else
649 : /* Move costs are normalized to reg-reg move having cost 2. */
650 717989 : igain += COSTS_N_INSNS (2 * m - ix86_cost->xmm_move) / 2;
651 : }
652 10555906 : else if (REG_P (src) && MEM_P (dst))
653 : {
654 2298053 : if (!speed_p)
655 : /* Integer load/store is 3+ bytes and SSE 4+. */
656 188831 : igain += COSTS_N_BYTES (3 * m - 4);
657 : else
658 2109222 : igain
659 2109222 : += COSTS_N_INSNS (m * ix86_cost->int_store[2]
660 : - ix86_cost->sse_store[sse_cost_idx]) / 2;
661 : }
662 8257853 : else if (MEM_P (src) && REG_P (dst))
663 : {
664 3709762 : if (!speed_p)
665 350314 : igain += COSTS_N_BYTES (3 * m - 4);
666 : else
667 3359448 : igain += COSTS_N_INSNS (m * ix86_cost->int_load[2]
668 : - ix86_cost->sse_load[sse_cost_idx]) / 2;
669 : }
/* Everything else is costed by the rtx code of the source.  */
670 : else
671 : {
672 : /* For operations on memory operands, include the overhead
673 : of explicit load and store instructions. */
674 4548091 : if (MEM_P (dst))
675 : {
676 66364 : if (!speed_p)
677 : /* ??? This probably should account size difference
678 : of SSE and integer load rather than full SSE load. */
679 : igain -= COSTS_N_BYTES (8);
680 : else
681 : {
682 57071 : int cost = (m * (ix86_cost->int_load[2]
683 57071 : + ix86_cost->int_store[2])
684 57071 : - (ix86_cost->sse_load[sse_cost_idx] +
685 57071 : ix86_cost->sse_store[sse_cost_idx]));
686 57071 : igain += COSTS_N_INSNS (cost) / 2;
687 : }
688 : }
689 :
690 4548091 : switch (GET_CODE (src))
691 : {
692 481035 : case ASHIFT:
693 481035 : case ASHIFTRT:
694 481035 : case LSHIFTRT:
/* Extra scalar cost when the shift spans two GPRs (m == 2).  The shift
   count is read with INTVAL without a CONST_INT_P check.  */
695 481035 : if (m == 2)
696 : {
697 16981 : if (INTVAL (XEXP (src, 1)) >= 32)
698 11523 : igain += ix86_cost->add;
699 : /* Gain for extend highpart case. */
700 5458 : else if (GET_CODE (XEXP (src, 0)) == ASHIFT)
701 0 : igain += ix86_cost->shift_const - ix86_cost->sse_op;
702 : else
703 5458 : igain += ix86_cost->shift_const;
704 : }
705 :
706 481035 : igain += ix86_cost->shift_const - ix86_cost->sse_op;
707 :
708 481035 : if (CONST_INT_P (XEXP (src, 0)))
709 0 : igain -= vector_const_cost (XEXP (src, 0), bb);
710 : break;
711 :
712 3646 : case ROTATE:
713 3646 : case ROTATERT:
/* The scalar rotate is costed as m constant shifts.  The vector side is
   charged one to three SSE operations depending on the target, the mode
   and the rotate count.  */
714 3646 : igain += m * ix86_cost->shift_const;
715 3646 : if (TARGET_AVX512VL)
716 204 : igain -= ix86_cost->sse_op;
717 3442 : else if (smode == DImode)
718 : {
719 590 : int bits = INTVAL (XEXP (src, 1));
720 590 : if ((bits & 0x0f) == 0)
721 106 : igain -= ix86_cost->sse_op;
722 484 : else if ((bits & 0x07) == 0)
723 27 : igain -= 2 * ix86_cost->sse_op;
724 : else
725 457 : igain -= 3 * ix86_cost->sse_op;
726 : }
727 2852 : else if (INTVAL (XEXP (src, 1)) == 16)
728 139 : igain -= ix86_cost->sse_op;
729 : else
730 2713 : igain -= 2 * ix86_cost->sse_op;
731 : break;
732 :
733 2807499 : case AND:
734 2807499 : case IOR:
735 2807499 : case XOR:
736 2807499 : case PLUS:
737 2807499 : case MINUS:
738 2807499 : igain += m * ix86_cost->add - ix86_cost->sse_op;
739 : /* Additional gain for andnot for targets without BMI. */
740 2807499 : if (GET_CODE (XEXP (src, 0)) == NOT
741 3598 : && !TARGET_BMI)
742 3589 : igain += m * ix86_cost->add;
743 :
/* Constant operands have to be built as vector constants, and a memory
   second operand is costed as a load.  */
744 2807499 : if (CONST_INT_P (XEXP (src, 0)))
745 0 : igain -= vector_const_cost (XEXP (src, 0), bb);
746 2807499 : if (CONST_INT_P (XEXP (src, 1)))
747 1662264 : igain -= vector_const_cost (XEXP (src, 1), bb);
748 2807499 : if (MEM_P (XEXP (src, 1)))
749 : {
750 87980 : if (!speed_p)
751 20721 : igain -= COSTS_N_BYTES (m == 2 ? 3 : 5);
752 : else
753 77615 : igain += COSTS_N_INSNS
754 : (m * ix86_cost->int_load[2]
755 : - ix86_cost->sse_load[sse_cost_idx]) / 2;
756 : }
757 : break;
758 :
759 50940 : case NEG:
760 50940 : case NOT:
761 50940 : igain -= ix86_cost->sse_op + COSTS_N_INSNS (1);
762 :
/* A NEG or NOT whose operand is an ABS falls through so that the ABS is
   costed as well.  */
763 50940 : if (GET_CODE (XEXP (src, 0)) != ABS)
764 : {
765 50940 : igain += m * ix86_cost->add;
766 50940 : break;
767 : }
768 : /* FALLTHRU */
769 :
770 1002 : case ABS:
771 1002 : case SMAX:
772 1002 : case SMIN:
773 1002 : case UMAX:
774 1002 : case UMIN:
775 : /* We do not have any conditional move cost, estimate it as a
776 : reg-reg move. Comparisons are costed as adds. */
777 1002 : igain += m * (COSTS_N_INSNS (2) + ix86_cost->add);
778 : /* Integer SSE ops are all costed the same. */
779 1002 : igain -= ix86_cost->sse_op;
780 1002 : break;
781 :
782 0 : case COMPARE:
783 0 : if (XEXP (src, 1) != const0_rtx)
784 : {
785 : /* cmp vs. pxor;pshufd;ptest. */
786 0 : igain += COSTS_N_INSNS (m - 3);
787 : }
788 0 : else if (GET_CODE (XEXP (src, 0)) != AND)
789 : {
790 : /* test vs. pshufd;ptest. */
791 0 : igain += COSTS_N_INSNS (m - 2);
792 : }
793 0 : else if (GET_CODE (XEXP (XEXP (src, 0), 0)) != NOT)
794 : {
795 : /* and;test vs. pshufd;ptest. */
796 0 : igain += COSTS_N_INSNS (2 * m - 2);
797 : }
798 0 : else if (TARGET_BMI)
799 : {
800 : /* andn;test vs. pandn;pshufd;ptest. */
801 0 : igain += COSTS_N_INSNS (2 * m - 3);
802 : }
803 : else
804 : {
805 : /* not;and;test vs. pandn;pshufd;ptest. */
806 0 : igain += COSTS_N_INSNS (3 * m - 3);
807 : }
808 : break;
809 :
810 1166877 : case CONST_INT:
811 1166877 : if (REG_P (dst))
812 : {
813 1166877 : if (!speed_p)
814 : {
815 : /* xor (2 bytes) vs. xorps (3 bytes). */
816 227175 : if (src == const0_rtx)
817 119839 : igain -= COSTS_N_BYTES (1);
818 : /* movdi_internal vs. movv2di_internal. */
819 : /* => mov (5 bytes) vs. movaps (7 bytes). */
820 107336 : else if (x86_64_immediate_operand (src, SImode))
821 95423 : igain -= COSTS_N_BYTES (2);
822 : else
823 : /* ??? Larger immediate constants are placed in the
824 : constant pool, where the size benefit/impact of
825 : STV conversion is affected by whether and how
826 : often each constant pool entry is shared/reused.
827 : The value below is empirically derived from the
828 : CSiBE benchmark (and the optimal value may drift
829 : over time). */
830 : igain += COSTS_N_BYTES (0);
831 : }
832 : else
833 : {
834 : /* DImode can be immediate for TARGET_64BIT
835 : and SImode always. */
836 939702 : igain += m * COSTS_N_INSNS (1);
837 939702 : igain -= vector_const_cost (src, bb);
838 : }
839 : }
840 0 : else if (MEM_P (dst))
841 : {
842 0 : igain += (m * ix86_cost->int_store[2]
843 0 : - ix86_cost->sse_store[sse_cost_idx]);
844 0 : igain -= vector_const_cost (src, bb);
845 : }
846 : break;
847 :
848 37092 : case VEC_SELECT:
849 37092 : if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
850 : {
851 : // movd (4 bytes) replaced with movdqa (4 bytes).
/* Both forms are 4 bytes, so only the speed case has a gain.  */
852 27101 : if (!!speed_p)
853 25280 : igain += COSTS_N_INSNS (ix86_cost->sse_to_integer
854 : - ix86_cost->xmm_move) / 2;
855 : }
856 : else
857 : {
858 : // pshufd; movd replaced with pshufd.
859 9991 : if (!speed_p)
860 666 : igain += COSTS_N_BYTES (4);
861 : else
862 9325 : igain += ix86_cost->sse_to_integer;
863 : }
864 : break;
865 :
/* Any other source code reaches gcc_unreachable.  */
866 0 : default:
867 0 : gcc_unreachable ();
868 : }
869 : }
870 :
871 11454140 : if (speed_p)
872 10213707 : weighted_gain += bb_freq * igain;
873 11455961 : gain += igain;
874 :
875 11455961 : if (igain != 0 && dump_file)
876 : {
877 93 : fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for",
878 : igain, bb_freq.to_double ());
879 93 : dump_insn_slim (dump_file, insn);
880 : }
881 : }
882 :
883 5810710 : if (dump_file)
884 : {
885 136 : fprintf (dump_file, " Instruction conversion gain: %d, \n",
886 : gain);
887 136 : fprintf (dump_file, " Registers conversion cost: %d\n",
888 : cost_sse_integer);
889 136 : fprintf (dump_file, " Weighted instruction conversion gain: %.2f, \n",
890 : weighted_gain.to_double ());
891 136 : fprintf (dump_file, " Weighted registers conversion cost: %.2f\n",
892 : weighted_cost_sse_integer.to_double ());
893 : }
894 :
/* The frequency-weighted comparison decides unless both weighted values
   are equal; the unweighted totals then act as the tie-break.  */
895 5810710 : if (weighted_gain != weighted_cost_sse_integer)
896 4685730 : return weighted_gain > weighted_cost_sse_integer;
897 : else
898 1124980 : return gain > cost_sse_integer;;
899 : }
900 :
901 : /* Insert generated conversion instruction sequence INSNS
902 : after instruction AFTER. New BB may be required in case
903 : instruction has EH region attached. */
904 :
905 : void
906 31197 : scalar_chain::emit_conversion_insns (rtx insns, rtx_insn *after)
907 : {
908 31197 : if (!control_flow_insn_p (after))
909 : {
910 30984 : emit_insn_after (insns, after);
911 30984 : return;
912 : }
913 :
914 213 : basic_block bb = BLOCK_FOR_INSN (after);
915 213 : edge e = find_fallthru_edge (bb->succs);
916 213 : gcc_assert (e);
917 :
918 213 : basic_block new_bb = split_edge (e);
919 213 : emit_insn_after (insns, BB_HEAD (new_bb));
920 : }
921 :
922 : } // anon namespace
923 :
924 : /* Generate the canonical SET_SRC to move GPR to a VMODE vector register,
925 : zeroing the upper parts. */
926 :
927 : static rtx
928 173061 : gen_gpr_to_xmm_move_src (enum machine_mode vmode, rtx gpr)
929 : {
930 346122 : switch (GET_MODE_NUNITS (vmode))
931 : {
932 45 : case 1:
933 45 : return gen_rtx_SUBREG (vmode, gpr, 0);
934 172454 : case 2:
935 344908 : return gen_rtx_VEC_CONCAT (vmode, gpr,
936 : CONST0_RTX (GET_MODE_INNER (vmode)));
937 562 : default:
938 562 : return gen_rtx_VEC_MERGE (vmode, gen_rtx_VEC_DUPLICATE (vmode, gpr),
939 : CONST0_RTX (vmode), GEN_INT (HOST_WIDE_INT_1U));
940 : }
941 : }
942 :
943 : /* Make vector copies for all register REGNO definitions
944 : and replace its uses in a chain. */
/* As written, the function takes the scalar register REG and the
   instruction INSN, looks up the vector register recorded for REG in
   defs_map, builds a sequence that copies REG into it, and emits that
   sequence after INSN through emit_conversion_insns.
   NOTE(review): the comment above mentions REGNO and replacing uses,
   neither of which appears in this body -- it looks stale; confirm.  */
945 :
946 : void
947 8398 : scalar_chain::make_vector_copies (rtx_insn *insn, rtx reg)
948 : {
/* The result of defs_map.get is dereferenced without a check, so REG
   must already have an entry.  */
949 8398 : rtx vreg = *defs_map.get (reg);
950 :
951 8398 : start_sequence ();
/* Without TARGET_INTER_UNIT_MOVES_TO_VEC the value is first stored to a
   stack temporary and the vector register is then loaded from it.  */
952 8398 : if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
953 : {
954 0 : rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
955 0 : if (smode == DImode && !TARGET_64BIT)
956 : {
/* Store the two SImode halves separately.  */
957 0 : emit_move_insn (adjust_address (tmp, SImode, 0),
958 : gen_rtx_SUBREG (SImode, reg, 0));
959 0 : emit_move_insn (adjust_address (tmp, SImode, 4),
960 : gen_rtx_SUBREG (SImode, reg, 4));
961 : }
962 : else
963 0 : emit_move_insn (copy_rtx (tmp), reg);
964 0 : emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
965 : gen_gpr_to_xmm_move_src (vmode, tmp)));
966 : }
/* DImode when !TARGET_64BIT: the value is assembled in the vector
   register from its two SImode halves.  */
967 8398 : else if (!TARGET_64BIT && smode == DImode)
968 : {
969 8262 : if (TARGET_SSE4_1)
970 : {
/* Load the low half, then insert the high half with pinsrd.  */
971 356 : emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
972 : CONST0_RTX (V4SImode),
973 : gen_rtx_SUBREG (SImode, reg, 0)));
974 356 : emit_insn (gen_sse4_1_pinsrd (gen_rtx_SUBREG (V4SImode, vreg, 0),
975 : gen_rtx_SUBREG (V4SImode, vreg, 0),
976 : gen_rtx_SUBREG (SImode, reg, 4),
977 : GEN_INT (2)));
978 : }
979 : else
980 : {
/* Load each half into its own register and interleave the low elements.  */
981 7906 : rtx tmp = gen_reg_rtx (DImode);
982 7906 : emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, vreg, 0),
983 : CONST0_RTX (V4SImode),
984 : gen_rtx_SUBREG (SImode, reg, 0)));
985 7906 : emit_insn (gen_sse2_loadld (gen_rtx_SUBREG (V4SImode, tmp, 0),
986 : CONST0_RTX (V4SImode),
987 : gen_rtx_SUBREG (SImode, reg, 4)));
988 7906 : emit_insn (gen_vec_interleave_lowv4si
989 : (gen_rtx_SUBREG (V4SImode, vreg, 0),
990 : gen_rtx_SUBREG (V4SImode, vreg, 0),
991 : gen_rtx_SUBREG (V4SImode, tmp, 0)));
992 : }
993 : }
/* Otherwise a single SET moves the register into the vector register.  */
994 : else
995 136 : emit_insn (gen_rtx_SET (gen_rtx_SUBREG (vmode, vreg, 0),
996 : gen_gpr_to_xmm_move_src (vmode, reg)));
997 8398 : rtx_insn *seq = end_sequence ();
998 8398 : emit_conversion_insns (seq, insn);
999 :
1000 8398 : if (dump_file)
1001 0 : fprintf (dump_file,
1002 : " Copied r%d to a vector register r%d for insn %d\n",
1003 0 : REGNO (reg), REGNO (vreg), INSN_UID (insn));
1004 8398 : }
1005 :
1006 : /* Copy the definition SRC of INSN inside the chain to DST for
1007 : scalar uses outside of the chain. */
1008 :
1009 : void
1010 22041 : scalar_chain::convert_reg (rtx_insn *insn, rtx dst, rtx src)
1011 : {
1012 22041 : start_sequence ();
1013 22041 : if (!TARGET_INTER_UNIT_MOVES_FROM_VEC)
1014 : {
1015 0 : rtx tmp = assign_386_stack_local (smode, SLOT_STV_TEMP);
1016 0 : emit_move_insn (tmp, src);
1017 0 : if (!TARGET_64BIT && smode == DImode)
1018 : {
1019 0 : emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
1020 : adjust_address (tmp, SImode, 0));
1021 0 : emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
1022 : adjust_address (tmp, SImode, 4));
1023 : }
1024 : else
1025 0 : emit_move_insn (dst, copy_rtx (tmp));
1026 : }
1027 22041 : else if (!TARGET_64BIT && smode == DImode)
1028 : {
1029 21117 : if (TARGET_SSE4_1)
1030 : {
1031 0 : rtx tmp = gen_rtx_PARALLEL (VOIDmode,
1032 : gen_rtvec (1, const0_rtx));
1033 0 : emit_insn
1034 0 : (gen_rtx_SET
1035 : (gen_rtx_SUBREG (SImode, dst, 0),
1036 : gen_rtx_VEC_SELECT (SImode,
1037 : gen_rtx_SUBREG (V4SImode, src, 0),
1038 : tmp)));
1039 :
1040 0 : tmp = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (1, const1_rtx));
1041 0 : emit_insn
1042 0 : (gen_rtx_SET
1043 : (gen_rtx_SUBREG (SImode, dst, 4),
1044 : gen_rtx_VEC_SELECT (SImode,
1045 : gen_rtx_SUBREG (V4SImode, src, 0),
1046 : tmp)));
1047 : }
1048 : else
1049 : {
1050 21117 : rtx vcopy = gen_reg_rtx (V2DImode);
1051 21117 : emit_move_insn (vcopy, gen_rtx_SUBREG (V2DImode, src, 0));
1052 21117 : emit_move_insn (gen_rtx_SUBREG (SImode, dst, 0),
1053 : gen_rtx_SUBREG (SImode, vcopy, 0));
1054 21117 : emit_move_insn (vcopy,
1055 : gen_rtx_LSHIFTRT (V2DImode,
1056 : vcopy, GEN_INT (32)));
1057 21117 : emit_move_insn (gen_rtx_SUBREG (SImode, dst, 4),
1058 : gen_rtx_SUBREG (SImode, vcopy, 0));
1059 : }
1060 : }
1061 : else
1062 924 : emit_move_insn (dst, src);
1063 :
1064 22041 : rtx_insn *seq = end_sequence ();
1065 22041 : emit_conversion_insns (seq, insn);
1066 :
1067 22041 : if (dump_file)
1068 0 : fprintf (dump_file,
1069 : " Copied r%d to a scalar register r%d for insn %d\n",
1070 0 : REGNO (src), REGNO (dst), INSN_UID (insn));
1071 22041 : }
1072 :
1073 : /* Helper function to convert immediate constant X to vmode. */
1074 : static rtx
1075 35847 : smode_convert_cst (rtx x, enum machine_mode vmode)
1076 : {
1077 : /* Prefer all ones vector in case of -1. */
1078 35847 : if (constm1_operand (x, GET_MODE (x)))
1079 625 : return CONSTM1_RTX (vmode);
1080 :
1081 35222 : unsigned n = GET_MODE_NUNITS (vmode);
1082 35222 : rtx *v = XALLOCAVEC (rtx, n);
1083 35222 : v[0] = x;
1084 41004 : for (unsigned i = 1; i < n; ++i)
1085 5782 : v[i] = const0_rtx;
1086 35222 : return gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
1087 : }
1088 :
/* Convert operand *OP of INSN to vector mode, rewriting *OP in place.
   Any helper insns are emitted before INSN.  Handled forms:
     - a V1TImode operand is left untouched;
     - NOT and ASHIFT have their first operand converted recursively and
       are then re-moded to vmode;
     - a MEM is preloaded into a fresh pseudo;
     - a REG is wrapped in a vmode SUBREG (all other register handling
       happens during register conversion);
     - a scalar integer constant is materialized in a fresh pseudo;
     - anything else must already be a SUBREG.  */

void
scalar_chain::convert_op (rtx *op, rtx_insn *insn)
{
  rtx tmp;

  /* Nothing to do for operands that are already V1TImode.  */
  if (GET_MODE (*op) == V1TImode)
    return;

  /* The operand is modified below, so make sure we own it.  */
  *op = copy_rtx_if_shared (*op);

  if (GET_CODE (*op) == NOT
      || GET_CODE (*op) == ASHIFT)
    {
      convert_op (&XEXP (*op, 0), insn);
      PUT_MODE (*op, vmode);
    }
  else if (MEM_P (*op))
    {
      /* Set when an extra scalar load had to be emitted first.  */
      rtx_insn *movabs = NULL;

      /* Emit MOVABS to load from a 64-bit absolute address to a GPR.  */
      if (!memory_operand (*op, GET_MODE (*op)))
        {
          tmp = gen_reg_rtx (GET_MODE (*op));
          movabs = emit_insn_before (gen_rtx_SET (tmp, *op), insn);

          *op = tmp;
        }

      /* Fresh scalar-mode pseudo, viewed in vmode, that receives the
         preloaded value.  */
      tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (GET_MODE (*op)), 0);

      rtx_insn *eh_insn
        = emit_insn_before (gen_rtx_SET (copy_rtx (tmp),
                                         gen_gpr_to_xmm_move_src (vmode, *op)),
                            insn);

      if (cfun->can_throw_non_call_exceptions)
        {
          /* Handle REG_EH_REGION note: copy it onto the insn that now
             performs the memory access (the MOVABS load if there is one,
             the preload otherwise) and record that insn in
             control_flow_insns.  */
          rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
          if (note)
            {
              if (movabs)
                eh_insn = movabs;
              control_flow_insns.safe_push (eh_insn);
              add_reg_note (eh_insn, REG_EH_REGION, XEXP (note, 0));
            }
        }

      *op = tmp;

      if (dump_file)
        fprintf (dump_file, " Preloading operand for insn %d into r%d\n",
                 INSN_UID (insn), reg_or_subregno (tmp));
    }
  else if (REG_P (*op))
    *op = gen_rtx_SUBREG (vmode, *op, 0);
  else if (CONST_SCALAR_INT_P (*op))
    {
      rtx vec_cst = smode_convert_cst (*op, vmode);

      /* Constants that are not standard SSE constants are placed in the
         constant pool; any insns needed to form a valid address are
         emitted before INSN.  */
      if (!standard_sse_constant_p (vec_cst, vmode))
        {
          start_sequence ();
          vec_cst = validize_mem (force_const_mem (vmode, vec_cst));
          rtx_insn *seq = end_sequence ();
          emit_insn_before (seq, insn);
        }

      tmp = gen_rtx_SUBREG (vmode, gen_reg_rtx (smode), 0);

      emit_insn_before (gen_move_insn (copy_rtx (tmp), vec_cst), insn);
      *op = tmp;
    }
  else
    {
      gcc_assert (SUBREG_P (*op));
      if (GET_MODE (*op) != vmode)
        {
          rtx inner = SUBREG_REG (*op);
          poly_uint64 byte = SUBREG_BYTE (*op);
          /* Re-view the SUBREG directly in vmode when the modes are
             tieable and the resulting SUBREG would be valid; otherwise
             copy it to a pseudo first and view that pseudo in vmode.  */
          if (targetm.modes_tieable_p (vmode, GET_MODE (inner))
              && validate_subreg (vmode, GET_MODE (inner), inner, byte))
            *op = gen_lowpart (vmode, *op);
          else
            {
              tmp = gen_reg_rtx (GET_MODE (*op));
              emit_insn_before (gen_rtx_SET (tmp, *op), insn);
              *op = gen_rtx_SUBREG (vmode, tmp, 0);
            }
        }
    }
}
1187 :
1188 : /* Convert CCZmode COMPARE to vector mode. */
1189 :
1190 : rtx
1191 12 : scalar_chain::convert_compare (rtx op1, rtx op2, rtx_insn *insn)
1192 : {
1193 12 : rtx src, tmp;
1194 :
1195 : /* Handle any REG_EQUAL notes. */
1196 12 : tmp = find_reg_equal_equiv_note (insn);
1197 12 : if (tmp)
1198 : {
1199 1 : if (GET_CODE (XEXP (tmp, 0)) == COMPARE
1200 1 : && GET_MODE (XEXP (tmp, 0)) == CCZmode
1201 1 : && REG_P (XEXP (XEXP (tmp, 0), 0)))
1202 : {
1203 1 : rtx *op = &XEXP (XEXP (tmp, 0), 1);
1204 1 : if (CONST_SCALAR_INT_P (*op))
1205 : {
1206 1 : if (constm1_operand (*op, GET_MODE (*op)))
1207 0 : *op = CONSTM1_RTX (vmode);
1208 : else
1209 : {
1210 1 : unsigned n = GET_MODE_NUNITS (vmode);
1211 1 : rtx *v = XALLOCAVEC (rtx, n);
1212 1 : v[0] = *op;
1213 1 : for (unsigned i = 1; i < n; ++i)
1214 0 : v[i] = const0_rtx;
1215 1 : *op = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (n, v));
1216 : }
1217 : tmp = NULL_RTX;
1218 : }
1219 0 : else if (REG_P (*op))
1220 : tmp = NULL_RTX;
1221 : }
1222 :
1223 : if (tmp)
1224 0 : remove_note (insn, tmp);
1225 : }
1226 :
1227 : /* Comparison against anything other than zero, requires an XOR. */
1228 12 : if (op2 != const0_rtx)
1229 : {
1230 6 : convert_op (&op1, insn);
1231 6 : convert_op (&op2, insn);
1232 : /* If both operands are MEMs, explicitly load the OP1 into TMP. */
1233 6 : if (MEM_P (op1) && MEM_P (op2))
1234 : {
1235 0 : tmp = gen_reg_rtx (vmode);
1236 0 : emit_insn_before (gen_rtx_SET (tmp, op1), insn);
1237 0 : src = tmp;
1238 : }
1239 : else
1240 : src = op1;
1241 6 : src = gen_rtx_XOR (vmode, src, op2);
1242 : }
1243 6 : else if (GET_CODE (op1) == AND
1244 0 : && GET_CODE (XEXP (op1, 0)) == NOT)
1245 : {
1246 0 : rtx op11 = XEXP (XEXP (op1, 0), 0);
1247 0 : rtx op12 = XEXP (op1, 1);
1248 0 : convert_op (&op11, insn);
1249 0 : convert_op (&op12, insn);
1250 0 : if (!REG_P (op11))
1251 : {
1252 0 : tmp = gen_reg_rtx (vmode);
1253 0 : emit_insn_before (gen_rtx_SET (tmp, op11), insn);
1254 0 : op11 = tmp;
1255 : }
1256 0 : src = gen_rtx_AND (vmode, gen_rtx_NOT (vmode, op11), op12);
1257 0 : }
1258 6 : else if (GET_CODE (op1) == AND)
1259 : {
1260 0 : rtx op11 = XEXP (op1, 0);
1261 0 : rtx op12 = XEXP (op1, 1);
1262 0 : convert_op (&op11, insn);
1263 0 : convert_op (&op12, insn);
1264 0 : if (!REG_P (op11))
1265 : {
1266 0 : tmp = gen_reg_rtx (vmode);
1267 0 : emit_insn_before (gen_rtx_SET (tmp, op11), insn);
1268 0 : op11 = tmp;
1269 : }
1270 0 : return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, op11, op12),
1271 : UNSPEC_PTEST);
1272 : }
1273 : else
1274 : {
1275 6 : convert_op (&op1, insn);
1276 6 : src = op1;
1277 : }
1278 :
1279 12 : if (!REG_P (src))
1280 : {
1281 8 : tmp = gen_reg_rtx (vmode);
1282 8 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
1283 8 : src = tmp;
1284 : }
1285 :
1286 12 : if (vmode == V2DImode)
1287 : {
1288 0 : tmp = gen_reg_rtx (vmode);
1289 0 : emit_insn_before (gen_vec_interleave_lowv2di (tmp, src, src), insn);
1290 0 : src = tmp;
1291 : }
1292 12 : else if (vmode == V4SImode)
1293 : {
1294 0 : tmp = gen_reg_rtx (vmode);
1295 0 : emit_insn_before (gen_sse2_pshufd (tmp, src, const0_rtx), insn);
1296 0 : src = tmp;
1297 : }
1298 :
1299 12 : return gen_rtx_UNSPEC (CCZmode, gen_rtvec (2, src, src), UNSPEC_PTEST);
1300 : }
1301 :
/* Helper function for converting INSN to vector mode.  It does the
   register bookkeeping around INSN: for every register defined by INSN
   that is listed in defs_conv it either emits a scalar copy (when some
   use needs the scalar value) or fixes up debug insns, and it then
   replaces the registers used by INSN with their defs_map
   counterparts.  */

void
scalar_chain::convert_insn_common (rtx_insn *insn)
{
  /* Generate copies for out-of-chain uses of defs and adjust debug uses.  */
  for (df_ref ref = DF_INSN_DEFS (insn); ref; ref = DF_REF_NEXT_LOC (ref))
    if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
      {
        /* Look for a non-debug use that is either inside a memory
           address or in an insn that is not part of this chain.  USE is
           left non-null iff such a use exists.  */
        df_link *use;
        for (use = DF_REF_CHAIN (ref); use; use = use->next)
          if (NONDEBUG_INSN_P (DF_REF_INSN (use->ref))
              && (DF_REF_REG_MEM_P (use->ref)
                  || !bitmap_bit_p (insns, DF_REF_INSN_UID (use->ref))))
            break;
        if (use)
          convert_reg (insn, DF_REF_REG (ref),
                       *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]));
        else if (MAY_HAVE_DEBUG_BIND_INSNS)
          {
            /* If we generated a scalar copy we can leave debug-insns
               as-is, if not, we have to adjust them.  */
            auto_vec<rtx_insn *, 5> to_reset_debug_insns;
            for (use = DF_REF_CHAIN (ref); use; use = use->next)
              if (DEBUG_INSN_P (DF_REF_INSN (use->ref)))
                {
                  rtx_insn *debug_insn = DF_REF_INSN (use->ref);
                  /* If there's a reaching definition outside of the
                     chain we have to reset.  */
                  df_link *def;
                  for (def = DF_REF_CHAIN (use->ref); def; def = def->next)
                    if (!bitmap_bit_p (insns, DF_REF_INSN_UID (def->ref)))
                      break;
                  if (def)
                    to_reset_debug_insns.safe_push (debug_insn);
                  else
                    {
                      /* All reaching definitions are in the chain: point
                         the debug use at the register from defs_map.  */
                      *DF_REF_REAL_LOC (use->ref)
                        = *defs_map.get (regno_reg_rtx [DF_REF_REGNO (ref)]);
                      df_insn_rescan (debug_insn);
                    }
                }
            /* Have to do the reset outside of the DF_CHAIN walk to not
               disrupt it.  */
            while (!to_reset_debug_insns.is_empty ())
              {
                rtx_insn *debug_insn = to_reset_debug_insns.pop ();
                INSN_VAR_LOCATION_LOC (debug_insn) = gen_rtx_UNKNOWN_VAR_LOC ();
                df_insn_rescan_debug_internal (debug_insn);
              }
          }
      }

  /* Replace uses in this insn with the defs we use in the chain.  Uses
     inside memory addresses are left alone.  */
  for (df_ref ref = DF_INSN_USES (insn); ref; ref = DF_REF_NEXT_LOC (ref))
    if (!DF_REF_REG_MEM_P (ref))
      if (rtx *vreg = defs_map.get (regno_reg_rtx[DF_REF_REGNO (ref)]))
        {
          /* Also update a corresponding REG_DEAD note.  */
          rtx note = find_reg_note (insn, REG_DEAD, DF_REF_REG (ref));
          if (note)
            XEXP (note, 0) = *vreg;
          *DF_REF_REAL_LOC (ref) = *vreg;
        }
}
1367 :
1368 : /* Convert INSN which is an SImode or DImode rotation by a constant
1369 : to vector mode. CODE is either ROTATE or ROTATERT with operands
1370 : OP0 and OP1. Returns the SET_SRC of the last instruction in the
1371 : resulting sequence, which is emitted before INSN. */
1372 :
1373 : rtx
1374 92 : general_scalar_chain::convert_rotate (enum rtx_code code, rtx op0, rtx op1,
1375 : rtx_insn *insn)
1376 : {
1377 92 : int bits = INTVAL (op1);
1378 92 : rtx pat, result;
1379 :
1380 92 : convert_op (&op0, insn);
1381 92 : if (bits == 0)
1382 0 : return op0;
1383 :
1384 92 : if (smode == DImode)
1385 : {
1386 92 : if (code == ROTATE)
1387 45 : bits = 64 - bits;
1388 92 : if (bits == 32)
1389 : {
1390 0 : rtx tmp1 = gen_reg_rtx (V4SImode);
1391 0 : pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
1392 : GEN_INT (225));
1393 0 : emit_insn_before (pat, insn);
1394 0 : result = gen_lowpart (V2DImode, tmp1);
1395 : }
1396 92 : else if (TARGET_AVX512VL)
1397 0 : result = simplify_gen_binary (code, V2DImode, op0, op1);
1398 92 : else if (bits == 16 || bits == 48)
1399 : {
1400 0 : rtx tmp1 = gen_reg_rtx (V8HImode);
1401 0 : pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0),
1402 : GEN_INT (bits == 16 ? 57 : 147));
1403 0 : emit_insn_before (pat, insn);
1404 0 : result = gen_lowpart (V2DImode, tmp1);
1405 : }
1406 92 : else if ((bits & 0x07) == 0)
1407 : {
1408 0 : rtx tmp1 = gen_reg_rtx (V4SImode);
1409 0 : pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
1410 : GEN_INT (68));
1411 0 : emit_insn_before (pat, insn);
1412 0 : rtx tmp2 = gen_reg_rtx (V1TImode);
1413 0 : pat = gen_sse2_lshrv1ti3 (tmp2, gen_lowpart (V1TImode, tmp1),
1414 : GEN_INT (bits));
1415 0 : emit_insn_before (pat, insn);
1416 0 : result = gen_lowpart (V2DImode, tmp2);
1417 : }
1418 : else
1419 : {
1420 92 : rtx tmp1 = gen_reg_rtx (V4SImode);
1421 92 : pat = gen_sse2_pshufd (tmp1, gen_lowpart (V4SImode, op0),
1422 : GEN_INT (20));
1423 92 : emit_insn_before (pat, insn);
1424 92 : rtx tmp2 = gen_reg_rtx (V2DImode);
1425 92 : pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
1426 : GEN_INT (bits & 31));
1427 92 : emit_insn_before (pat, insn);
1428 92 : rtx tmp3 = gen_reg_rtx (V4SImode);
1429 139 : pat = gen_sse2_pshufd (tmp3, gen_lowpart (V4SImode, tmp2),
1430 : GEN_INT (bits > 32 ? 34 : 136));
1431 92 : emit_insn_before (pat, insn);
1432 92 : result = gen_lowpart (V2DImode, tmp3);
1433 : }
1434 : }
1435 0 : else if (bits == 16)
1436 : {
1437 0 : rtx tmp1 = gen_reg_rtx (V8HImode);
1438 0 : pat = gen_sse2_pshuflw (tmp1, gen_lowpart (V8HImode, op0), GEN_INT (225));
1439 0 : emit_insn_before (pat, insn);
1440 0 : result = gen_lowpart (V4SImode, tmp1);
1441 : }
1442 0 : else if (TARGET_AVX512VL)
1443 0 : result = simplify_gen_binary (code, V4SImode, op0, op1);
1444 : else
1445 : {
1446 0 : if (code == ROTATE)
1447 0 : bits = 32 - bits;
1448 :
1449 0 : rtx tmp1 = gen_reg_rtx (V4SImode);
1450 0 : emit_insn_before (gen_sse2_pshufd (tmp1, op0, GEN_INT (224)), insn);
1451 0 : rtx tmp2 = gen_reg_rtx (V2DImode);
1452 0 : pat = gen_lshrv2di3 (tmp2, gen_lowpart (V2DImode, tmp1),
1453 : GEN_INT (bits));
1454 0 : emit_insn_before (pat, insn);
1455 0 : result = gen_lowpart (V4SImode, tmp2);
1456 : }
1457 :
1458 : return result;
1459 : }
1460 :
/* Convert INSN to vector mode.  INSN's single_set is rewritten in place:
   the destination becomes a vmode SUBREG (or the flags register for a
   COMPARE), the source is converted according to its RTX code, and the
   insn's pattern is replaced by that single SET.  The insn is then
   re-recognized and rescanned.  Helper insns may be emitted around
   INSN.  */

void
general_scalar_chain::convert_insn (rtx_insn *insn)
{
  rtx def_set = single_set (insn);
  rtx src = SET_SRC (def_set);
  rtx dst = SET_DEST (def_set);
  rtx subreg;

  if (MEM_P (dst) && !REG_P (src))
    {
      /* There are no scalar integer instructions and therefore
         temporary register usage is required.  The result is computed
         into TMP (viewed in vmode) and the move from TMP to the original
         memory destination is emitted through emit_conversion_insns.  */
      rtx tmp = gen_reg_rtx (smode);
      emit_conversion_insns (gen_move_insn (dst, tmp), insn);
      dst = gen_rtx_SUBREG (vmode, tmp, 0);
    }
  else if (REG_P (dst) && GET_MODE (dst) == smode)
    {
      /* Replace the definition with a SUBREG to the definition we
         use inside the chain.  */
      rtx *vdef = defs_map.get (dst);
      if (vdef)
        dst = *vdef;
      dst = gen_rtx_SUBREG (vmode, dst, 0);
      /* IRA doesn't like to have REG_EQUAL/EQUIV notes when the SET_DEST
         is a non-REG_P.  So kill those off.  */
      rtx note = find_reg_equal_equiv_note (insn);
      if (note)
        remove_note (insn, note);
    }

  switch (GET_CODE (src))
    {
    /* Binary operations: convert the second operand here, then fall
       through to convert the first operand and re-mode the expression.  */
    case PLUS:
    case MINUS:
    case IOR:
    case XOR:
    case AND:
    case SMAX:
    case SMIN:
    case UMAX:
    case UMIN:
      convert_op (&XEXP (src, 1), insn);
      /* FALLTHRU */

    /* For these codes only the first operand is converted; a shift count
       in operand 1 is left as it is.  */
    case ABS:
    case ASHIFT:
    case ASHIFTRT:
    case LSHIFTRT:
      convert_op (&XEXP (src, 0), insn);
      PUT_MODE (src, vmode);
      break;

    case ROTATE:
    case ROTATERT:
      src = convert_rotate (GET_CODE (src), XEXP (src, 0), XEXP (src, 1),
                            insn);
      break;

    case NEG:
      src = XEXP (src, 0);

      if (GET_CODE (src) == ABS)
        {
          /* Negated absolute value: compute the ABS into a fresh vmode
             register first.  */
          src = XEXP (src, 0);
          convert_op (&src, insn);
          subreg = gen_reg_rtx (vmode);
          emit_insn_before (gen_rtx_SET (subreg,
                                         gen_rtx_ABS (vmode, src)), insn);
          src = subreg;
        }
      else
        convert_op (&src, insn);

      /* Negate as (0 - SRC), with the zero loaded into a fresh
         register.  */
      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONST0_RTX (vmode)), insn);
      src = gen_rtx_MINUS (vmode, subreg, src);
      break;

    case NOT:
      /* Complement as (SRC ^ all-ones), with the all-ones vector loaded
         into a fresh register.  */
      src = XEXP (src, 0);
      convert_op (&src, insn);
      subreg = gen_reg_rtx (vmode);
      emit_insn_before (gen_move_insn (subreg, CONSTM1_RTX (vmode)), insn);
      src = gen_rtx_XOR (vmode, src, subreg);
      break;

    case MEM:
      /* The memory source is converted only when DST is not a plain
         register.  */
      if (!REG_P (dst))
        convert_op (&src, insn);
      break;

    case REG:
      /* The register source is converted only when DST is not a
         memory.  */
      if (!MEM_P (dst))
        convert_op (&src, insn);
      break;

    case SUBREG:
      gcc_assert (GET_MODE (src) == vmode);
      break;

    case COMPARE:
      /* The converted compare sets the flags register in CCZmode.  */
      dst = gen_rtx_REG (CCZmode, FLAGS_REG);
      src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
      break;

    case CONST_INT:
      convert_op (&src, insn);
      break;

    case VEC_SELECT:
      /* Selecting element 0 just uses the vector operand itself.  */
      if (XVECEXP (XEXP (src, 1), 0, 0) == const0_rtx)
        src = XEXP (src, 0);
      else if (smode == DImode)
        {
          /* Otherwise, for DImode, shift the V1TImode view of the vector
             right by 64 bits.  */
          rtx tmp = gen_lowpart (V1TImode, XEXP (src, 0));
          dst = gen_lowpart (V1TImode, dst);
          src = gen_rtx_LSHIFTRT (V1TImode, tmp, GEN_INT (64));
        }
      else
        {
          /* For the remaining case, build a four-element VEC_SELECT that
             repeats the selected index in every position.  */
          rtx tmp = XVECEXP (XEXP (src, 1), 0, 0);
          rtvec vec = gen_rtvec (4, tmp, tmp, tmp, tmp);
          rtx par = gen_rtx_PARALLEL (VOIDmode, vec);
          src = gen_rtx_VEC_SELECT (vmode, XEXP (src, 0), par);
        }
      break;

    default:
      gcc_unreachable ();
    }

  SET_SRC (def_set) = src;
  SET_DEST (def_set) = dst;

  /* Drop possible dead definitions.  */
  PATTERN (insn) = def_set;

  /* Force re-recognition of the rewritten pattern; failing to match is a
     fatal internal error.  */
  INSN_CODE (insn) = -1;
  int patt = recog_memoized (insn);
  if (patt == -1)
    fatal_insn_not_found (insn);
  df_insn_rescan (insn);
}
1607 :
1608 : /* Helper function to compute gain for loading an immediate constant.
1609 : Typically, two movabsq for TImode vs. vmovdqa for V1TImode, but
1610 : with numerous special cases. */
1611 :
1612 : static int
1613 19 : timode_immed_const_gain (rtx cst, basic_block bb)
1614 : {
1615 : /* movabsq vs. movabsq+vmovq+vunpacklqdq. */
1616 19 : if (CONST_WIDE_INT_P (cst)
1617 7 : && CONST_WIDE_INT_NUNITS (cst) == 2
1618 26 : && CONST_WIDE_INT_ELT (cst, 0) == CONST_WIDE_INT_ELT (cst, 1))
1619 0 : return optimize_bb_for_size_p (bb) ? -COSTS_N_BYTES (9)
1620 : : -COSTS_N_INSNS (2);
1621 : /* 2x movabsq ~ vmovdqa. */
1622 : return 0;
1623 : }
1624 :
1625 : /* Return true it's cost profitable for for chain conversion. */
1626 :
1627 : bool
1628 499483 : timode_scalar_chain::compute_convert_gain ()
1629 : {
1630 : /* Assume that if we have to move TImode values between units,
1631 : then transforming this chain isn't worth it. */
1632 499483 : if (cost_sse_integer)
1633 : return false;
1634 :
1635 499483 : bitmap_iterator bi;
1636 499483 : unsigned insn_uid;
1637 :
1638 : /* Split ties to prefer V1TImode when not optimizing for size. */
1639 499483 : int gain = optimize_size ? 0 : 1;
1640 499483 : sreal weighted_gain = 0;
1641 :
1642 499483 : if (dump_file)
1643 0 : fprintf (dump_file, "Computing gain for chain #%d...\n", chain_id);
1644 :
1645 1494615 : EXECUTE_IF_SET_IN_BITMAP (insns, 0, insn_uid, bi)
1646 : {
1647 995132 : rtx_insn *insn = DF_INSN_UID_GET (insn_uid)->insn;
1648 995132 : rtx def_set = single_set (insn);
1649 995132 : rtx src = SET_SRC (def_set);
1650 995132 : rtx dst = SET_DEST (def_set);
1651 995132 : HOST_WIDE_INT op1val;
1652 995132 : basic_block bb = BLOCK_FOR_INSN (insn);
1653 995132 : int scost, vcost;
1654 995132 : int igain = 0;
1655 995132 : profile_count entry_count = ENTRY_BLOCK_PTR_FOR_FN (cfun)->count;
1656 995132 : bool speed_p = optimize_bb_for_speed_p (bb);
1657 995132 : sreal bb_freq = bb->count.to_sreal_scale (entry_count);
1658 :
1659 995132 : switch (GET_CODE (src))
1660 : {
1661 519970 : case REG:
1662 519970 : if (GENERAL_REGNO_P (REGNO (src)))
1663 : {
1664 24980 : if (TARGET_AVX)
1665 : /* vmovq + vpinsrq */
1666 26 : igain = speed_p ? -ix86_cost->integer_to_sse
1667 : - COSTS_N_INSNS (1)
1668 : : -COSTS_N_BYTES (11);
1669 : else
1670 : /* movq + movq + punpcklqdq */
1671 24954 : igain = speed_p ? -ix86_cost->integer_to_sse
1672 : - COSTS_N_INSNS (2)
1673 : : -COSTS_N_BYTES (14);
1674 : }
1675 494990 : else if (GENERAL_REG_P (dst))
1676 : {
1677 24506 : if (TARGET_AVX)
1678 : /* vpextrq + vmovq */
1679 26 : igain = speed_p ? -ix86_cost->sse_to_integer
1680 : - COSTS_N_INSNS (1)
1681 : : -COSTS_N_BYTES (11);
1682 : else
1683 : /* movhlps + movq + movq */
1684 24480 : igain = speed_p ? -ix86_cost->sse_to_integer
1685 : - COSTS_N_INSNS (2)
1686 : : -COSTS_N_BYTES (13);
1687 : }
1688 470484 : else if (!speed_p)
1689 14486 : igain = MEM_P (dst) ? COSTS_N_BYTES (6) : COSTS_N_BYTES (3);
1690 : else
1691 : igain = COSTS_N_INSNS (1);
1692 : break;
1693 :
1694 429462 : case MEM:
1695 429462 : igain = !speed_p ? COSTS_N_BYTES (7) : COSTS_N_INSNS (1);
1696 : break;
1697 :
1698 10619 : case CONST_INT:
1699 10619 : if (MEM_P (dst)
1700 10619 : && standard_sse_constant_p (src, V1TImode))
1701 10086 : igain = !speed_p ? COSTS_N_BYTES (11) : 1;
1702 : break;
1703 :
1704 30073 : case CONST_WIDE_INT:
1705 : /* 2 x mov vs. vmovdqa. */
1706 30073 : if (MEM_P (dst))
1707 29529 : igain = !speed_p ? COSTS_N_BYTES (3) : COSTS_N_INSNS (1);
1708 : break;
1709 :
1710 78 : case NOT:
1711 78 : if (MEM_P (dst))
1712 66417 : igain = -COSTS_N_INSNS (1);
1713 : break;
1714 :
1715 39 : case AND:
1716 39 : if (!MEM_P (dst))
1717 28 : igain = COSTS_N_INSNS (1);
1718 39 : if (CONST_SCALAR_INT_P (XEXP (src, 1)))
1719 10 : igain += timode_immed_const_gain (XEXP (src, 1), bb);
1720 : break;
1721 :
1722 4273 : case XOR:
1723 4273 : case IOR:
1724 4273 : if (timode_concatdi_p (src))
1725 : {
1726 : /* vmovq;vpinsrq (11 bytes). */
1727 4145 : igain = speed_p ? -ix86_cost->integer_to_sse - COSTS_N_INSNS (1)
1728 : : -COSTS_N_BYTES (11);
1729 : break;
1730 : }
1731 128 : if (!MEM_P (dst))
1732 120 : igain = COSTS_N_INSNS (1);
1733 128 : if (CONST_SCALAR_INT_P (XEXP (src, 1)))
1734 9 : igain += timode_immed_const_gain (XEXP (src, 1), bb);
1735 : break;
1736 :
1737 0 : case PLUS:
1738 0 : if (timode_concatdi_p (src))
1739 : /* vmovq;vpinsrq (11 bytes). */
1740 0 : igain = speed_p ? -ix86_cost->integer_to_sse - COSTS_N_INSNS (1)
1741 : : -COSTS_N_BYTES (11);
1742 : break;
1743 :
1744 206 : case ASHIFT:
1745 206 : case LSHIFTRT:
1746 : /* See ix86_expand_v1ti_shift. */
1747 206 : op1val = INTVAL (XEXP (src, 1));
1748 206 : if (!speed_p)
1749 : {
1750 19 : if (op1val == 64 || op1val == 65)
1751 : scost = COSTS_N_BYTES (5);
1752 13 : else if (op1val >= 66)
1753 : scost = COSTS_N_BYTES (6);
1754 13 : else if (op1val == 1)
1755 : scost = COSTS_N_BYTES (8);
1756 : else
1757 : scost = COSTS_N_BYTES (9);
1758 :
1759 17 : if ((op1val & 7) == 0)
1760 : vcost = COSTS_N_BYTES (5);
1761 13 : else if (op1val > 64)
1762 : vcost = COSTS_N_BYTES (10);
1763 : else
1764 13 : vcost = TARGET_AVX ? COSTS_N_BYTES (19) : COSTS_N_BYTES (23);
1765 : }
1766 : else
1767 : {
1768 187 : scost = COSTS_N_INSNS (2);
1769 187 : if ((op1val & 7) == 0)
1770 : vcost = COSTS_N_INSNS (1);
1771 133 : else if (op1val > 64)
1772 : vcost = COSTS_N_INSNS (2);
1773 : else
1774 133 : vcost = TARGET_AVX ? COSTS_N_INSNS (4) : COSTS_N_INSNS (5);
1775 : }
1776 206 : igain = scost - vcost;
1777 206 : break;
1778 :
1779 123 : case ASHIFTRT:
1780 : /* See ix86_expand_v1ti_ashiftrt. */
1781 123 : op1val = INTVAL (XEXP (src, 1));
1782 123 : if (!speed_p)
1783 : {
1784 9 : if (op1val == 64 || op1val == 127)
1785 : scost = COSTS_N_BYTES (7);
1786 9 : else if (op1val == 1)
1787 : scost = COSTS_N_BYTES (8);
1788 8 : else if (op1val == 65)
1789 : scost = COSTS_N_BYTES (10);
1790 8 : else if (op1val >= 66)
1791 : scost = COSTS_N_BYTES (11);
1792 : else
1793 : scost = COSTS_N_BYTES (9);
1794 :
1795 0 : if (op1val == 127)
1796 : vcost = COSTS_N_BYTES (10);
1797 9 : else if (op1val == 64)
1798 : vcost = COSTS_N_BYTES (14);
1799 9 : else if (op1val == 96)
1800 : vcost = COSTS_N_BYTES (18);
1801 9 : else if (op1val >= 111)
1802 : vcost = COSTS_N_BYTES (15);
1803 9 : else if (TARGET_AVX2 && op1val == 32)
1804 : vcost = COSTS_N_BYTES (16);
1805 9 : else if (TARGET_SSE4_1 && op1val == 32)
1806 : vcost = COSTS_N_BYTES (20);
1807 9 : else if (op1val >= 96)
1808 : vcost = COSTS_N_BYTES (23);
1809 9 : else if ((op1val & 7) == 0)
1810 : vcost = COSTS_N_BYTES (28);
1811 9 : else if (TARGET_AVX2 && op1val < 32)
1812 : vcost = COSTS_N_BYTES (30);
1813 9 : else if (op1val == 1 || op1val >= 64)
1814 : vcost = COSTS_N_BYTES (42);
1815 : else
1816 8 : vcost = COSTS_N_BYTES (47);
1817 : }
1818 : else
1819 : {
1820 114 : if (op1val >= 65 && op1val <= 126)
1821 : scost = COSTS_N_INSNS (3);
1822 : else
1823 114 : scost = COSTS_N_INSNS (2);
1824 :
1825 114 : if (op1val == 127)
1826 : vcost = COSTS_N_INSNS (2);
1827 113 : else if (op1val == 64)
1828 : vcost = COSTS_N_INSNS (3);
1829 113 : else if (op1val == 96)
1830 : vcost = COSTS_N_INSNS (3);
1831 113 : else if (op1val >= 111)
1832 : vcost = COSTS_N_INSNS (3);
1833 113 : else if (TARGET_SSE4_1 && op1val == 32)
1834 : vcost = COSTS_N_INSNS (3);
1835 113 : else if (TARGET_SSE4_1
1836 0 : && (op1val == 8 || op1val == 16 || op1val == 24))
1837 : vcost = COSTS_N_INSNS (3);
1838 113 : else if (op1val >= 96)
1839 : vcost = COSTS_N_INSNS (4);
1840 113 : else if (TARGET_SSE4_1 && (op1val == 28 || op1val == 80))
1841 : vcost = COSTS_N_INSNS (4);
1842 113 : else if ((op1val & 7) == 0)
1843 : vcost = COSTS_N_INSNS (5);
1844 113 : else if (TARGET_AVX2 && op1val < 32)
1845 : vcost = COSTS_N_INSNS (6);
1846 113 : else if (TARGET_SSE4_1 && op1val < 15)
1847 : vcost = COSTS_N_INSNS (6);
1848 113 : else if (op1val == 1 || op1val >= 64)
1849 : vcost = COSTS_N_INSNS (8);
1850 : else
1851 16 : vcost = COSTS_N_INSNS (9);
1852 : }
1853 123 : igain = scost - vcost;
1854 123 : break;
1855 :
1856 6 : case ROTATE:
1857 6 : case ROTATERT:
1858 : /* See ix86_expand_v1ti_rotate. */
1859 6 : op1val = INTVAL (XEXP (src, 1));
1860 6 : if (!speed_p)
1861 : {
1862 0 : scost = COSTS_N_BYTES (13);
1863 0 : if ((op1val & 31) == 0)
1864 : vcost = COSTS_N_BYTES (5);
1865 0 : else if ((op1val & 7) == 0)
1866 0 : vcost = TARGET_AVX ? COSTS_N_BYTES (13) : COSTS_N_BYTES (18);
1867 0 : else if (op1val > 32 && op1val < 96)
1868 : vcost = COSTS_N_BYTES (24);
1869 : else
1870 0 : vcost = COSTS_N_BYTES (19);
1871 : }
1872 : else
1873 : {
1874 6 : scost = COSTS_N_INSNS (3);
1875 6 : if ((op1val & 31) == 0)
1876 : vcost = COSTS_N_INSNS (1);
1877 4 : else if ((op1val & 7) == 0)
1878 1 : vcost = TARGET_AVX ? COSTS_N_INSNS (3) : COSTS_N_INSNS (4);
1879 3 : else if (op1val > 32 && op1val < 96)
1880 : vcost = COSTS_N_INSNS (5);
1881 : else
1882 3 : vcost = COSTS_N_INSNS (4);
1883 : }
1884 6 : igain = scost - vcost;
1885 6 : break;
1886 :
1887 19 : case COMPARE:
1888 19 : if (XEXP (src, 1) == const0_rtx)
1889 : {
1890 8 : if (GET_CODE (XEXP (src, 0)) == AND)
1891 : /* and;and;or (9 bytes) vs. ptest (5 bytes). */
1892 : igain = !speed_p ? COSTS_N_BYTES (4) : COSTS_N_INSNS (2);
1893 : /* or (3 bytes) vs. ptest (5 bytes). */
1894 8 : else if (!speed_p)
1895 0 : igain = -COSTS_N_BYTES (2);
1896 : }
1897 11 : else if (XEXP (src, 1) == const1_rtx)
1898 : /* and;cmp -1 (7 bytes) vs. pcmpeqd;pxor;ptest (13 bytes). */
1899 0 : igain = !speed_p ? -COSTS_N_BYTES (6) : -COSTS_N_INSNS (1);
1900 : break;
1901 :
1902 264 : case ZERO_EXTEND:
1903 264 : if (GET_MODE (XEXP (src, 0)) == DImode)
1904 : /* xor (2 bytes) vs. vmovq (5 bytes). */
1905 264 : igain = speed_p ? COSTS_N_INSNS (1) - ix86_cost->sse_to_integer
1906 : : -COSTS_N_BYTES (3);
1907 : break;
1908 :
1909 : default:
1910 : break;
1911 : }
1912 :
1913 1958888 : gain += igain;
1914 995124 : if (speed_p)
1915 963764 : weighted_gain += bb_freq * igain;
1916 :
1917 995132 : if (igain != 0 && dump_file)
1918 : {
1919 0 : fprintf (dump_file, " Instruction gain %d with bb_freq %.2f for ",
1920 : igain, bb_freq.to_double ());
1921 0 : dump_insn_slim (dump_file, insn);
1922 : }
1923 : }
1924 :
1925 499483 : if (dump_file)
1926 0 : fprintf (dump_file, " Total gain: %d, weighted gain %.2f\n",
1927 : gain, weighted_gain.to_double ());
1928 :
1929 499483 : if (weighted_gain > (sreal) 0)
1930 : return true;
1931 : else
1932 54248 : return gain > 0;
1933 : }
1934 :
1935 : /* Fix uses of converted REG in debug insns.

   Does nothing unless flag_var_tracking is set.  Otherwise every entry
   of REG's use chain is visited; in a debug insn, each use location
   that holds a V1TImode REG is wrapped in a TImode SUBREG at offset 0,
   and the insn is rescanned once if any location was rewritten.  */
1936 :
1937 : void
1938 419337 : timode_scalar_chain::fix_debug_reg_uses (rtx reg)
1939 : {
1940 419337 : if (!flag_var_tracking)
1941 : return;
1942 :
1943 370532 : df_ref ref, next;
1944 759833 : for (ref = DF_REG_USE_CHAIN (REGNO (reg)); ref; ref = next)
1945 : {
1946 389301 : rtx_insn *insn = DF_REF_INSN (ref);
1947 : /* Make sure the next ref is for a different instruction,
1948 : so that we're not affected by the rescan. */
1949 389301 : next = DF_REF_NEXT_REG (ref);
1950 389301 : while (next && DF_REF_INSN (next) == insn)
1951 0 : next = DF_REF_NEXT_REG (next);
1952 :
1953 389301 : if (DEBUG_INSN_P (insn))
1954 : {
1955 : /* It may be a debug insn with a TImode variable in
1956 : register. */
1957 : bool changed = false;
1958 228 : for (; ref != next; ref = DF_REF_NEXT_REG (ref)) /* All refs belonging to INSN. */
1959 : {
1960 114 : rtx *loc = DF_REF_LOC (ref);
1961 114 : if (REG_P (*loc) && GET_MODE (*loc) == V1TImode)
1962 : {
1963 105 : *loc = gen_rtx_SUBREG (TImode, *loc, 0);
1964 105 : changed = true;
1965 : }
1966 : }
1967 114 : if (changed)
1968 105 : df_insn_rescan (insn);
1969 : }
1970 : }
1971 : }
1972 :
1973 : /* Convert SRC, a *concatditi3 pattern, into a vec_concatv2di instruction.
1974 : Insert this before INSN, and return the result as a V1TImode subreg.

   SRC is read as a two-operand rtx.  If operand 0 is an ASHIFT, the
   high half is the innermost operand found three XEXP levels down on
   that side and the low half is the operand wrapped by operand 1;
   otherwise operand 1 is taken to be the shifted side and the roles
   are swapped (this is not re-checked here).  The concatenation is
   emitted into a fresh V2DImode pseudo.  */
1975 :
1976 : static rtx
1977 266 : timode_convert_concatdi (rtx src, rtx_insn *insn)
1978 : {
1979 266 : rtx hi, lo;
1980 266 : rtx tmp = gen_reg_rtx (V2DImode);
1981 266 : if (GET_CODE (XEXP (src, 0)) == ASHIFT)
1982 : {
1983 266 : hi = XEXP (XEXP (XEXP (src, 0), 0), 0);
1984 266 : lo = XEXP (XEXP (src, 1), 0);
1985 : }
1986 : else
1987 : {
1988 0 : hi = XEXP (XEXP (XEXP (src, 1), 0), 0);
1989 0 : lo = XEXP (XEXP (src, 0), 0);
1990 : }
1991 266 : emit_insn_before (gen_vec_concatv2di (tmp, lo, hi), insn); /* Low half first, then high half. */
1992 266 : return gen_rtx_SUBREG (V1TImode, tmp, 0);
1993 : }
1994 :
1995 : /* Convert INSN from TImode to V1TImode.

   Assumes single_set (INSN) is non-null; its result is used unchecked.
   The destination is handled first (REG or MEM, anything else is
   unreachable), then the source is rewritten according to its rtx
   code; unsupported source codes hit gcc_unreachable.  Helper insns
   needed by the rewrite are emitted before INSN.  Finally the possibly
   replaced SRC and DST are stored back into the SET, the insn pattern
   is replaced by that SET, and the insn is re-recognized and
   rescanned.  */
1996 :
1997 : void
1998 906660 : timode_scalar_chain::convert_insn (rtx_insn *insn)
1999 : {
2000 906660 : rtx def_set = single_set (insn);
2001 906660 : rtx src = SET_SRC (def_set);
2002 906660 : rtx dst = SET_DEST (def_set);
2003 906660 : rtx tmp;
2004 :
2005 906660 : switch (GET_CODE (dst))
2006 : {
2007 419854 : case REG:
2008 419854 : if (GET_MODE (dst) == TImode)
2009 : {
2010 419223 : if (!HARD_REGISTER_NUM_P (REGNO (dst)))
2011 : {
2012 418718 : PUT_MODE (dst, V1TImode);
2013 418718 : fix_debug_reg_uses (dst);
2014 : }
2015 505 : else if (!GENERAL_REGNO_P (REGNO (dst)))
2016 359 : dst = gen_raw_REG (V1TImode, REGNO (dst)); /* Hard, non-general reg: fresh V1TImode REG with the same regno. */
2017 : }
2018 419854 : if (GET_MODE (dst) == V1TImode)
2019 : {
2020 : /* It might potentially be helpful to convert REG_EQUAL notes,
2021 : but for now we just remove them. */
2022 419696 : rtx note = find_reg_equal_equiv_note (insn);
2023 419696 : if (note)
2024 470 : remove_note (insn, note);
2025 : }
2026 : break;
2027 486806 : case MEM:
2028 486806 : PUT_MODE (dst, V1TImode);
2029 486806 : break;
2030 :
2031 0 : default:
2032 0 : gcc_unreachable ();
2033 : }
2034 :
2035 906660 : switch (GET_CODE (src))
2036 : {
2037 448009 : case REG:
2038 448009 : if (GET_MODE (src) == TImode)
2039 : {
2040 825 : if (GENERAL_REGNO_P (REGNO (src)))
2041 : { /* Copy both DImode halves to pseudos and concatenate them into a V2DImode pseudo. */
2042 201 : rtx lo = gen_reg_rtx (DImode);
2043 201 : rtx hi = gen_reg_rtx (DImode);
2044 201 : emit_insn_before (gen_rtx_SET (lo, gen_lowpart (DImode, src)),
2045 : insn);
2046 201 : emit_insn_before (gen_rtx_SET (hi, gen_highpart (DImode, src)),
2047 : insn);
2048 201 : src = gen_reg_rtx (V2DImode);
2049 201 : emit_insn_before (gen_vec_concatv2di (src, lo, hi), insn);
2050 201 : src = gen_lowpart (V1TImode, src);
2051 : }
2052 624 : else if (!HARD_REGISTER_NUM_P (REGNO (src)))
2053 : {
2054 619 : PUT_MODE (src, V1TImode);
2055 619 : fix_debug_reg_uses (src);
2056 : }
2057 : else
2058 5 : src = gen_raw_REG (V1TImode, REGNO (src));
2059 : }
2060 448009 : if (GENERAL_REG_P (dst)) /* DST stays in general regs: split the vector back into two DImode halves. */
2061 : {
2062 146 : rtx tmp = gen_reg_rtx (V2DImode);
2063 146 : src = gen_lowpart (V2DImode, src);
2064 146 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2065 : /* Extracting hi before lo helps register allocation. */
2066 146 : rtx hi = gen_reg_rtx (DImode);
2067 146 : rtx lo = gen_reg_rtx (DImode);
2068 146 : emit_insn_before (gen_vec_extractv2didi (hi, tmp, const1_rtx), insn);
2069 146 : emit_insn_before (gen_vec_extractv2didi (lo, tmp, const0_rtx), insn);
2070 :
2071 : /* Construct *concatditi3 pattern from lo and hi. */
2072 146 : hi = gen_rtx_ZERO_EXTEND (TImode, hi);
2073 146 : hi = gen_rtx_ASHIFT (TImode, hi, GEN_INT (64));
2074 146 : lo = gen_rtx_ZERO_EXTEND (TImode, lo);
2075 146 : src = gen_rtx_PLUS (TImode, hi, lo);
2076 : }
2077 : break;
2078 :
2079 417613 : case MEM:
2080 417613 : PUT_MODE (src, V1TImode);
2081 417613 : break;
2082 :
2083 29902 : case CONST_WIDE_INT:
2084 29902 : if (NONDEBUG_INSN_P (insn))
2085 : {
2086 : /* Since there are no instructions to store 128-bit constant,
2087 : temporary register usage is required. */
2088 29902 : bool use_move;
2089 29902 : start_sequence ();
2090 29902 : tmp = ix86_convert_const_wide_int_to_broadcast (TImode, src);
2091 29902 : if (tmp)
2092 : {
2093 194 : src = lowpart_subreg (V1TImode, tmp, TImode);
2094 194 : use_move = true;
2095 : }
2096 : else
2097 : {
2098 29708 : src = smode_convert_cst (src, V1TImode);
2099 29708 : src = validize_mem (force_const_mem (V1TImode, src));
2100 29708 : use_move = MEM_P (dst); /* Constant-pool load only needs a temporary when storing to memory. */
2101 : }
2102 29902 : rtx_insn *seq = end_sequence ();
2103 29902 : if (seq)
2104 195 : emit_insn_before (seq, insn);
2105 29902 : if (use_move)
2106 : {
2107 29530 : tmp = gen_reg_rtx (V1TImode);
2108 29530 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2109 29530 : src = tmp;
2110 : }
2111 : }
2112 : break;
2113 :
2114 10619 : case CONST_INT:
2115 10619 : switch (standard_sse_constant_p (src, TImode)) /* Only results 1 and 2 are expected here. */
2116 : {
2117 10396 : case 1:
2118 10396 : src = CONST0_RTX (GET_MODE (dst));
2119 10396 : break;
2120 223 : case 2:
2121 223 : src = CONSTM1_RTX (GET_MODE (dst));
2122 223 : break;
2123 0 : default:
2124 0 : gcc_unreachable ();
2125 : }
2126 10619 : if (MEM_P (dst))
2127 : {
2128 10086 : tmp = gen_reg_rtx (V1TImode);
2129 10086 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2130 10086 : src = tmp;
2131 : }
2132 : break;
2133 :
2134 13 : case AND:
2135 13 : if (GET_CODE (XEXP (src, 0)) == NOT) /* (and (not X) Y): convert X and Y, retag both rtxes as V1TImode. */
2136 : {
2137 0 : convert_op (&XEXP (XEXP (src, 0), 0), insn);
2138 0 : convert_op (&XEXP (src, 1), insn);
2139 0 : PUT_MODE (XEXP (src, 0), V1TImode);
2140 0 : PUT_MODE (src, V1TImode);
2141 0 : break;
2142 : }
2143 13 : convert_op (&XEXP (src, 0), insn);
2144 13 : convert_op (&XEXP (src, 1), insn);
2145 13 : PUT_MODE (src, V1TImode);
2146 13 : if (MEM_P (dst)) /* Compute into a temporary register, then store it. */
2147 : {
2148 10 : tmp = gen_reg_rtx (V1TImode);
2149 10 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2150 10 : src = tmp;
2151 : }
2152 : break;
2153 :
2154 343 : case XOR:
2155 343 : case IOR:
2156 343 : if (timode_concatdi_p (src))
2157 : {
2158 266 : src = timode_convert_concatdi (src, insn);
2159 266 : break;
2160 : }
2161 77 : convert_op (&XEXP (src, 0), insn);
2162 77 : convert_op (&XEXP (src, 1), insn);
2163 77 : PUT_MODE (src, V1TImode);
2164 77 : if (MEM_P (dst))
2165 : {
2166 8 : tmp = gen_reg_rtx (V1TImode);
2167 8 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2168 8 : src = tmp;
2169 : }
2170 : break;
2171 :
2172 3 : case NOT: /* Rewritten as XOR with an all-ones V1TImode register. */
2173 3 : src = XEXP (src, 0);
2174 3 : convert_op (&src, insn);
2175 3 : tmp = gen_reg_rtx (V1TImode);
2176 3 : emit_insn_before (gen_move_insn (tmp, CONSTM1_RTX (V1TImode)), insn);
2177 3 : src = gen_rtx_XOR (V1TImode, src, tmp);
2178 3 : if (MEM_P (dst))
2179 : {
2180 0 : tmp = gen_reg_rtx (V1TImode);
2181 0 : emit_insn_before (gen_rtx_SET (tmp, src), insn);
2182 0 : src = tmp;
2183 : }
2184 : break;
2185 :
2186 12 : case COMPARE: /* Destination becomes the flags register in CCZmode. */
2187 12 : dst = gen_rtx_REG (CCZmode, FLAGS_REG);
2188 12 : src = convert_compare (XEXP (src, 0), XEXP (src, 1), insn);
2189 12 : break;
2190 :
2191 43 : case ASHIFT:
2192 43 : case LSHIFTRT:
2193 43 : case ASHIFTRT:
2194 43 : case ROTATERT:
2195 43 : case ROTATE:
2196 43 : convert_op (&XEXP (src, 0), insn); /* Only the shifted operand is converted; the count is left as is. */
2197 43 : PUT_MODE (src, V1TImode);
2198 43 : break;
2199 :
2200 103 : case ZERO_EXTEND:
2201 103 : if (GET_MODE (XEXP (src, 0)) == DImode)
2202 : {
2203 : /* Convert to *vec_concatv2di_0. */
2204 103 : rtx tmp = gen_reg_rtx (V2DImode);
2205 103 : rtx pat = gen_rtx_VEC_CONCAT (V2DImode, XEXP (src, 0), const0_rtx);
2206 103 : emit_insn_before (gen_move_insn (tmp, pat), insn);
2207 103 : src = gen_rtx_SUBREG (vmode, tmp, 0);
2208 : }
2209 : else
2210 0 : gcc_unreachable ();
2211 103 : break;
2212 :
2213 0 : case PLUS:
2214 0 : if (timode_concatdi_p (src))
2215 0 : src = timode_convert_concatdi (src, insn);
2216 : else
2217 0 : gcc_unreachable ();
2218 0 : break;
2219 :
2220 0 : default:
2221 0 : gcc_unreachable ();
2222 : }
2223 :
2224 906660 : SET_SRC (def_set) = src;
2225 906660 : SET_DEST (def_set) = dst;
2226 :
2227 : /* Drop possible dead definitions. */
2228 906660 : PATTERN (insn) = def_set;
2229 :
2230 906660 : INSN_CODE (insn) = -1; /* Forget the old insn code so recog_memoized recognizes the new pattern. */
2231 906660 : recog_memoized (insn);
2232 906660 : df_insn_rescan (insn);
2233 906660 : }
2234 :
2235 : /* Generate copies from defs used by the chain but not defined therein.
2236 : Also populates defs_map which is used later by convert_insn.

   First, every register number set in defs_conv gets a fresh SMODE
   pseudo, recorded in defs_map under the original register rtx.  Then,
   for each insn UID in insns_conv, every def whose register number is
   in defs_conv is handed to make_vector_copies.  */
2237 :
2238 : void
2239 631483 : scalar_chain::convert_registers ()
2240 : {
2241 631483 : bitmap_iterator bi;
2242 631483 : unsigned id;
2243 657652 : EXECUTE_IF_SET_IN_BITMAP (defs_conv, 0, id, bi)
2244 : {
2245 26169 : rtx chain_reg = gen_reg_rtx (smode);
2246 26169 : defs_map.put (regno_reg_rtx[id], chain_reg);
2247 : }
2248 639881 : EXECUTE_IF_SET_IN_BITMAP (insns_conv, 0, id, bi)
2249 21038 : for (df_ref ref = DF_INSN_UID_DEFS (id); ref; ref = DF_REF_NEXT_LOC (ref))
2250 12640 : if (bitmap_bit_p (defs_conv, DF_REF_REGNO (ref)))
2251 8398 : make_vector_copies (DF_REF_INSN (ref), DF_REF_REAL_REG (ref));
2252 631483 : }
2253 :
2254 : /* Convert whole chain creating required register
2255 : conversions and copies.

   Returns the number of insns converted, or 0 without touching anything
   when the stv_conversion debug counter rejects the conversion.  Each
   insn UID in INSNS goes through convert_insn_common and then
   convert_insn, after convert_registers has run once for the chain.  */
2256 :
2257 : int
2258 631483 : scalar_chain::convert ()
2259 : {
2260 631483 : bitmap_iterator bi;
2261 631483 : unsigned id;
2262 631483 : int converted_insns = 0;
2263 :
2264 631483 : if (!dbg_cnt (stv_conversion))
2265 : return 0;
2266 :
2267 631483 : if (dump_file)
2268 0 : fprintf (dump_file, "Converting chain #%d...\n", chain_id);
2269 :
2270 631483 : convert_registers ();
2271 :
2272 1949965 : EXECUTE_IF_SET_IN_BITMAP (insns, 0, id, bi)
2273 : {
2274 1318482 : rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
2275 1318482 : convert_insn_common (insn);
2276 1318482 : convert_insn (insn);
2277 1318482 : converted_insns++;
2278 : }
2279 :
2280 : return converted_insns;
2281 : }
2282 :
2283 : /* Return the SET expression if INSN doesn't reference hard register.
2284 : Return NULL if INSN uses or defines a hard register, excluding
2285 : pseudo register pushes, hard register uses in a memory address,
2286 : clobbers and flags definitions.

   NULL is also returned when INSN is not a single_set.  The push test
   uses TImode when TARGET_64BIT and DImode otherwise.  */
2287 :
2288 : static rtx
2289 331858289 : pseudo_reg_set (rtx_insn *insn)
2290 : {
2291 331858289 : rtx set = single_set (insn);
2292 331858289 : if (!set)
2293 : return NULL;
2294 :
2295 : /* Check pseudo register push first. */
2296 133735835 : machine_mode mode = TARGET_64BIT ? TImode : DImode;
2297 133735835 : if (REG_P (SET_SRC (set))
2298 37795028 : && !HARD_REGISTER_P (SET_SRC (set))
2299 163213290 : && push_operand (SET_DEST (set), mode))
2300 : return set;
2301 :
2302 133482756 : df_ref ref;
2303 216015776 : FOR_EACH_INSN_DEF (ref, insn) /* Reject hard-reg defs other than must-clobbers and FLAGS_REG. */
2304 119099428 : if (HARD_REGISTER_P (DF_REF_REAL_REG (ref))
2305 64147785 : && !DF_REF_FLAGS_IS_SET (ref, DF_REF_MUST_CLOBBER)
2306 168924637 : && DF_REF_REGNO (ref) != FLAGS_REG)
2307 : return NULL;
2308 :
2309 185926403 : FOR_EACH_INSN_USE (ref, insn) /* Reject hard-reg uses that are not inside a memory reference. */
2310 114087623 : if (!DF_REF_REG_MEM_P (ref) && HARD_REGISTER_P (DF_REF_REAL_REG (ref)))
2311 : return NULL;
2312 :
2313 : return set;
2314 : }
2315 :
2316 : /* Return true if the register REG is defined in a single DEF chain.
2317 : If it is defined in more than one DEF chains, we may not be able
2318 : to convert it in all chains.

   Concretely: true iff REG's def chain has exactly one entry; a
   register with no def at all yields false.  */
2319 :
2320 : static bool
2321 1240414 : single_def_chain_p (rtx reg)
2322 : {
2323 1240414 : df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
2324 1240414 : if (!ref)
2325 : return false;
2326 1240394 : return DF_REF_NEXT_REG (ref) == nullptr;
2327 : }
2328 :
2329 : /* Check if comparison INSN may be transformed into vector comparison.
2330 : Currently we transform equality/inequality checks which look like:
2331 : (set (reg:CCZ 17 flags) (compare:CCZ (reg:TI x) (reg:TI y)))

   MODE must be the double-word mode (TImode if TARGET_64BIT, DImode
   otherwise) and TARGET_SSE4_1 must be enabled, else false is returned.
   INSN is asserted to be a single_set whose source is a COMPARE, and
   its destination must be FLAGS_REG in CCZmode.  Three operand shapes
   are accepted, each labelled below with the pattern name it targets.
   NOTE(review): the *testti_doubleword branch tests operand modes
   against TImode rather than MODE -- confirm this is intended for the
   DImode (32-bit) case.  */
2332 :
2333 : static bool
2334 12644761 : convertible_comparison_p (rtx_insn *insn, enum machine_mode mode)
2335 : {
2336 14053755 : if (mode != (TARGET_64BIT ? TImode : DImode))
2337 : return false;
2338 :
2339 4625264 : if (!TARGET_SSE4_1)
2340 : return false;
2341 :
2342 162788 : rtx def_set = single_set (insn);
2343 :
2344 162788 : gcc_assert (def_set);
2345 :
2346 162788 : rtx src = SET_SRC (def_set);
2347 162788 : rtx dst = SET_DEST (def_set);
2348 :
2349 162788 : gcc_assert (GET_CODE (src) == COMPARE);
2350 :
2351 162788 : if (!REG_P (dst)
2352 162788 : || REGNO (dst) != FLAGS_REG
2353 325576 : || GET_MODE (dst) != CCZmode)
2354 : return false;
2355 :
2356 114198 : rtx op1 = XEXP (src, 0);
2357 114198 : rtx op2 = XEXP (src, 1);
2358 :
2359 : /* *cmp<dwi>_doubleword. */
2360 114198 : if (general_operand (op1, mode)
2361 114198 : && general_operand (op2, mode))
2362 : return true;
2363 :
2364 : /* *testti_doubleword. */
2365 114142 : if (op2 == const0_rtx
2366 38110 : && GET_CODE (op1) == AND
2367 142 : && REG_P (XEXP (op1, 0)))
2368 : {
2369 142 : rtx op12 = XEXP (op1, 1);
2370 142 : return GET_MODE (XEXP (op1, 0)) == TImode
2371 142 : && (CONST_SCALAR_INT_P (op12)
2372 0 : || ((REG_P (op12) || MEM_P (op12))
2373 0 : && GET_MODE (op12) == TImode));
2374 : }
2375 :
2376 : /* *test<dwi>_not_doubleword. */
2377 114000 : if (op2 == const0_rtx
2378 37968 : && GET_CODE (op1) == AND
2379 0 : && GET_CODE (XEXP (op1, 0)) == NOT)
2380 : {
2381 0 : rtx op11 = XEXP (XEXP (op1, 0), 0);
2382 0 : rtx op12 = XEXP (op1, 1);
2383 0 : return (REG_P (op11) || MEM_P (op11))
2384 0 : && (REG_P (op12) || MEM_P (op12))
2385 0 : && GET_MODE (op11) == mode
2386 0 : && GET_MODE (op12) == mode;
2387 : }
2388 :
2389 : return false;
2390 : }
2391 :
2392 : /* The general version of scalar_to_vector_candidate_p.

   Return true if INSN qualifies as a conversion candidate for MODE.
   INSN must pass pseudo_reg_set.  A COMPARE source is delegated to
   convertible_comparison_p.  Otherwise the destination must be a REG
   or MEM of MODE and the source must have MODE (or be a CONST_INT);
   the remaining checks depend on the source rtx code.  Cases that
   `break' out of the switch fall through to the common operand-0 test
   at the end, which requires a REG, MEM or CONST_INT of MODE.  */
2393 :
2394 : static bool
2395 232331737 : general_scalar_to_vector_candidate_p (rtx_insn *insn, enum machine_mode mode)
2396 : {
2397 232331737 : rtx def_set = pseudo_reg_set (insn);
2398 :
2399 232331737 : if (!def_set)
2400 : return false;
2401 :
2402 48777665 : rtx src = SET_SRC (def_set);
2403 48777665 : rtx dst = SET_DEST (def_set);
2404 :
2405 48777665 : if (GET_CODE (src) == COMPARE)
2406 8723994 : return convertible_comparison_p (insn, mode);
2407 :
2408 : /* We are interested in "mode" only. */
2409 40053671 : if ((GET_MODE (src) != mode
2410 27382226 : && !CONST_INT_P (src))
2411 17723813 : || GET_MODE (dst) != mode)
2412 : return false;
2413 :
2414 14909467 : if (!REG_P (dst) && !MEM_P (dst))
2415 : return false;
2416 :
2417 14652393 : switch (GET_CODE (src))
2418 : {
2419 531384 : case ASHIFT:
2420 531384 : case LSHIFTRT:
2421 531384 : case ASHIFTRT:
2422 531384 : case ROTATE:
2423 531384 : case ROTATERT:
2424 531384 : if (!CONST_INT_P (XEXP (src, 1))
2425 1026763 : || !IN_RANGE (INTVAL (XEXP (src, 1)), 0, GET_MODE_BITSIZE (mode)-1))
2426 : return false;
2427 :
2428 : /* Check for extend highpart case. */
2429 495375 : if (mode != DImode
2430 361223 : || GET_CODE (src) != ASHIFTRT
2431 81083 : || GET_CODE (XEXP (src, 0)) != ASHIFT)
2432 : break;
2433 :
2434 3651331 : src = XEXP (src, 0); /* Step into the inner ASHIFT so the final test sees its operand. */
2435 : break;
2436 :
2437 86344 : case SMAX:
2438 86344 : case SMIN:
2439 86344 : case UMAX:
2440 86344 : case UMIN:
2441 86344 : if ((mode == DImode && !TARGET_AVX512VL)
2442 17992 : || (mode == SImode && !TARGET_SSE4_1))
2443 : return false;
2444 : /* Fallthru. */
2445 :
2446 3194121 : case AND:
2447 3194121 : case IOR:
2448 3194121 : case XOR:
2449 3194121 : case PLUS:
2450 3194121 : case MINUS:
2451 3194121 : if (!REG_P (XEXP (src, 1))
2452 : && !MEM_P (XEXP (src, 1))
2453 : && !CONST_INT_P (XEXP (src, 1)))
2454 : return false;
2455 :
2456 3103201 : if (GET_MODE (XEXP (src, 1)) != mode
2457 1798098 : && !CONST_INT_P (XEXP (src, 1)))
2458 : return false;
2459 :
2460 : /* Check for andnot case. */
2461 3103201 : if (GET_CODE (src) != AND
2462 177563 : || GET_CODE (XEXP (src, 0)) != NOT)
2463 : break;
2464 :
2465 3651331 : src = XEXP (src, 0); /* Step into the NOT so the final test sees its operand. */
2466 : /* FALLTHRU */
2467 :
2468 : case NOT:
2469 : break;
2470 :
2471 24730 : case NEG:
2472 : /* Check for nabs case. */
2473 24730 : if (GET_CODE (XEXP (src, 0)) != ABS)
2474 : break;
2475 :
2476 : src = XEXP (src, 0);
2477 : /* FALLTHRU */
2478 :
2479 3798 : case ABS:
2480 3798 : if ((mode == DImode && !TARGET_AVX512VL)
2481 1446 : || (mode == SImode && !TARGET_SSSE3))
2482 : return false;
2483 : break;
2484 :
2485 : case REG:
2486 : return true;
2487 :
2488 5876928 : case MEM:
2489 5876928 : case CONST_INT:
2490 5876928 : return REG_P (dst);
2491 :
2492 56967 : case VEC_SELECT:
2493 : /* Excluding MEM_P (dst) avoids interfering with vpextr[dq]. */
2494 56967 : return REG_P (dst)
2495 46802 : && REG_P (XEXP (src, 0))
2496 53850 : && GET_MODE (XEXP (src, 0)) == (mode == DImode ? V2DImode
2497 : : V4SImode)
2498 37092 : && GET_CODE (XEXP (src, 1)) == PARALLEL
2499 37092 : && XVECLEN (XEXP (src, 1), 0) == 1
2500 94059 : && CONST_INT_P (XVECEXP (XEXP (src, 1), 0, 0));
2501 :
2502 : default:
2503 : return false;
2504 : }
2505 :
2506 3651331 : if (!REG_P (XEXP (src, 0))
2507 : && !MEM_P (XEXP (src, 0))
2508 : && !CONST_INT_P (XEXP (src, 0)))
2509 : return false;
2510 :
2511 3349889 : if (GET_MODE (XEXP (src, 0)) != mode
2512 0 : && !CONST_INT_P (XEXP (src, 0)))
2513 : return false;
2514 :
2515 : return true;
2516 : }
2517 :
2518 : /* Check for a suitable TImode memory operand.

   X is suitable if it is a MEM and either
   TARGET_SSE_UNALIGNED_LOAD_OPTIMAL holds or X is not a misaligned
   TImode operand.  */
2519 :
2520 : static bool
2521 1586 : timode_mem_p (rtx x)
2522 : {
2523 1586 : return MEM_P (x)
2524 1586 : && (TARGET_SSE_UNALIGNED_LOAD_OPTIMAL
2525 0 : || !misaligned_operand (x, TImode));
2526 : }
2527 :
2528 : /* The TImode version of scalar_to_vector_candidate_p.

   Return true if INSN qualifies as a TImode conversion candidate.
   Insns rejected by pseudo_reg_set are still accepted when they are a
   TImode REG-to-REG single_set between one hard and one pseudo
   register and the pseudo has a single def.  Otherwise a COMPARE
   source is delegated to convertible_comparison_p; for everything else
   the destination must be a TImode REG (with a single def) or a MEM
   that is aligned unless TARGET_SSE_UNALIGNED_STORE_OPTIMAL, the
   source must be TImode or a constant scalar integer, and the final
   answer depends on the source rtx code.  */
2529 :
2530 : static bool
2531 99526552 : timode_scalar_to_vector_candidate_p (rtx_insn *insn)
2532 : {
2533 99526552 : rtx def_set = pseudo_reg_set (insn);
2534 :
2535 : /* We allow two exceptions to the pseudo registers only rule.
2536 : Setting a hard register from a pseudo, and setting a pseudo
2537 : from a hard register. */
2538 99526552 : if (!def_set)
2539 : {
2540 76212358 : def_set = single_set (insn);
2541 76212358 : if (def_set)
2542 : {
2543 17698628 : rtx src = SET_SRC (def_set);
2544 17698628 : rtx dst = SET_DEST (def_set);
2545 17698628 : if (GET_MODE (dst) == TImode
2546 220921 : && REG_P (src) && REG_P (dst))
2547 : {
2548 101574 : if (HARD_REGISTER_P (dst)
2549 52212 : && !HARD_REGISTER_P (src)
2550 153786 : && single_def_chain_p (src))
2551 : return true;
2552 72934 : if (HARD_REGISTER_P (src)
2553 49362 : && !HARD_REGISTER_P (dst)
2554 122296 : && single_def_chain_p (dst))
2555 : return true;
2556 : }
2557 : }
2558 : return false;
2559 : }
2560 :
2561 23314194 : rtx src = SET_SRC (def_set);
2562 23314194 : rtx dst = SET_DEST (def_set);
2563 :
2564 23314194 : if (GET_CODE (src) == COMPARE)
2565 3920767 : return convertible_comparison_p (insn, TImode);
2566 :
2567 19393427 : if (GET_MODE (dst) != TImode
2568 1181543 : || (GET_MODE (src) != TImode
2569 58772 : && !CONST_SCALAR_INT_P (src)))
2570 : return false;
2571 :
2572 1181543 : if (!REG_P (dst) && !MEM_P (dst))
2573 : return false;
2574 :
2575 1180090 : if (MEM_P (dst)
2576 523220 : && misaligned_operand (dst, TImode)
2577 1486795 : && !TARGET_SSE_UNALIGNED_STORE_OPTIMAL)
2578 : return false;
2579 :
2580 1180085 : if (REG_P (dst) && !single_def_chain_p (dst))
2581 : return false;
2582 :
2583 1028842 : switch (GET_CODE (src))
2584 : {
2585 481970 : case REG:
2586 481970 : return single_def_chain_p (src);
2587 :
2588 : case CONST_WIDE_INT:
2589 : return true;
2590 :
2591 12471 : case CONST_INT:
2592 : /* ??? Verify performance impact before enabling CONST_INT for
2593 : __int128 store. */
2594 12471 : return standard_sse_constant_p (src, TImode); /* Any non-zero result counts as true. */
2595 :
2596 439644 : case MEM:
2597 : /* Memory must be aligned or unaligned load is optimal. */
2598 439644 : return (REG_P (dst)
2599 439644 : && (!misaligned_operand (src, TImode)
2600 141390 : || TARGET_SSE_UNALIGNED_LOAD_OPTIMAL));
2601 :
2602 3151 : case AND:
2603 3151 : if (!MEM_P (dst) /* andnot form: (and (not REG) op), register destination only. */
2604 3110 : && GET_CODE (XEXP (src, 0)) == NOT
2605 0 : && REG_P (XEXP (XEXP (src, 0), 0))
2606 3151 : && (REG_P (XEXP (src, 1))
2607 0 : || CONST_SCALAR_INT_P (XEXP (src, 1))
2608 0 : || timode_mem_p (XEXP (src, 1))))
2609 0 : return true;
2610 3151 : return (REG_P (XEXP (src, 0))
2611 46 : || timode_mem_p (XEXP (src, 0)))
2612 3197 : && (REG_P (XEXP (src, 1))
2613 1282 : || CONST_SCALAR_INT_P (XEXP (src, 1))
2614 35 : || timode_mem_p (XEXP (src, 1)));
2615 :
2616 14103 : case IOR:
2617 14103 : case XOR:
2618 14103 : if (timode_concatdi_p (src))
2619 : return true;
2620 2766 : return (REG_P (XEXP (src, 0))
2621 1435 : || timode_mem_p (XEXP (src, 0)))
2622 2783 : && (REG_P (XEXP (src, 1))
2623 290 : || CONST_SCALAR_INT_P (XEXP (src, 1))
2624 54 : || timode_mem_p (XEXP (src, 1)));
2625 :
2626 509 : case NOT:
2627 509 : return REG_P (XEXP (src, 0)) || timode_mem_p (XEXP (src, 0));
2628 :
2629 11541 : case ASHIFT:
2630 11541 : case LSHIFTRT:
2631 11541 : case ASHIFTRT:
2632 11541 : case ROTATERT:
2633 11541 : case ROTATE:
2634 : /* Handle shifts/rotates by integer constants between 0 and 127. */
2635 11541 : return REG_P (XEXP (src, 0))
2636 11509 : && CONST_INT_P (XEXP (src, 1))
2637 22690 : && (INTVAL (XEXP (src, 1)) & ~0x7f) == 0;
2638 :
2639 7016 : case PLUS:
2640 7016 : return timode_concatdi_p (src);
2641 :
2642 3754 : case ZERO_EXTEND:
2643 3754 : return REG_P (XEXP (src, 0))
2644 3754 : && GET_MODE (XEXP (src, 0)) == DImode;
2645 :
2646 : default:
2647 : return false;
2648 : }
2649 : }
2650 :
2651 : /* For a register REGNO, scan instructions for its defs and uses.
2652 : Put REGNO in REGS if a def or use isn't in CANDIDATES.

   CANDIDATES is a bitmap of insn UIDs and REGS a bitmap of register
   numbers.  Each scan stops at the first offending reference; uses in
   debug insns are ignored.  */
2653 :
2654 : static void
2655 1222085 : timode_check_non_convertible_regs (bitmap candidates, bitmap regs,
2656 : unsigned int regno)
2657 : {
2658 : /* Do nothing if REGNO is already in REGS or is a hard reg. */
2659 1222085 : if (bitmap_bit_p (regs, regno)
2660 1222085 : || HARD_REGISTER_NUM_P (regno))
2661 : return;
2662 :
2663 1214151 : for (df_ref def = DF_REG_DEF_CHAIN (regno);
2664 2417983 : def;
2665 1203832 : def = DF_REF_NEXT_REG (def))
2666 : {
2667 1214131 : if (!bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2668 : {
2669 10299 : if (dump_file)
2670 0 : fprintf (dump_file,
2671 : "r%d has non convertible def in insn %d\n",
2672 0 : regno, DF_REF_INSN_UID (def));
2673 :
2674 10299 : bitmap_set_bit (regs, regno);
2675 10299 : break;
2676 : }
2677 : }
2678 :
2679 1214151 : for (df_ref ref = DF_REG_USE_CHAIN (regno); /* The use scan runs even if a def already flagged REGNO. */
2680 2690097 : ref;
2681 1475946 : ref = DF_REF_NEXT_REG (ref))
2682 : {
2683 : /* Debug instructions are skipped. */
2684 1545455 : if (NONDEBUG_INSN_P (DF_REF_INSN (ref))
2685 1545455 : && !bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
2686 : {
2687 69509 : if (dump_file)
2688 0 : fprintf (dump_file,
2689 : "r%d has non convertible use in insn %d\n",
2690 0 : regno, DF_REF_INSN_UID (ref));
2691 :
2692 69509 : bitmap_set_bit (regs, regno);
2693 69509 : break;
2694 : }
2695 : }
2696 : }
2697 :
2698 : /* For a given bitmap of insn UIDs scans all instructions and
2699 : remove insn from CANDIDATES in case it has both convertible
2700 : and not convertible definitions.
2701 :
2702 : All insns in a bitmap are conversion candidates according to
2703 : scalar_to_vector_candidate_p. Currently it implies all insns
2704 : are single_set.

   The function iterates to a fixed point: each round first collects,
   via timode_check_non_convertible_regs, the TImode registers defined
   or used by candidate insns that also have a def or use outside
   CANDIDATES, then clears from CANDIDATES every insn that defines or
   uses one of the collected registers.  It stops after a round that
   removes nothing.  The REGS bitmap accumulates across rounds.  */
2705 :
2706 : static void
2707 829339 : timode_remove_non_convertible_regs (bitmap candidates)
2708 : {
2709 829339 : bitmap_iterator bi;
2710 829339 : unsigned id;
2711 829339 : bitmap regs = BITMAP_ALLOC (NULL);
2712 850472 : bool changed;
2713 :
2714 850472 : do {
2715 850472 : changed = false;
2716 2094538 : EXECUTE_IF_SET_IN_BITMAP (candidates, 0, id, bi)
2717 : {
2718 1244066 : rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
2719 1244066 : df_ref ref;
2720 :
2721 1946221 : FOR_EACH_INSN_DEF (ref, insn)
2722 702155 : if (!DF_REF_REG_MEM_P (ref)
2723 702155 : && GET_MODE (DF_REF_REG (ref)) == TImode)
2724 614168 : timode_check_non_convertible_regs (candidates, regs,
2725 : DF_REF_REGNO (ref));
2726 :
2727 3068378 : FOR_EACH_INSN_USE (ref, insn)
2728 1824312 : if (DF_REF_TYPE (ref) == DF_REF_REG_USE
2729 744088 : && GET_MODE (DF_REF_REG (ref)) == TImode
2730 607922 : && !SUBREG_P (DF_REF_REG (ref)))
2731 607917 : timode_check_non_convertible_regs (candidates, regs,
2732 : DF_REF_REGNO (ref));
2733 : }
2734 :
2735 1026060 : EXECUTE_IF_SET_IN_BITMAP (regs, 0, id, bi) /* Here ID is a register number, not an insn UID. */
2736 : {
2737 175588 : for (df_ref def = DF_REG_DEF_CHAIN (id);
2738 357268 : def;
2739 181680 : def = DF_REF_NEXT_REG (def))
2740 181680 : if (bitmap_bit_p (candidates, DF_REF_INSN_UID (def)))
2741 : {
2742 56217 : if (dump_file)
2743 0 : fprintf (dump_file, "Removing insn %d from candidates list\n",
2744 0 : DF_REF_INSN_UID (def));
2745 :
2746 56217 : bitmap_clear_bit (candidates, DF_REF_INSN_UID (def));
2747 56217 : changed = true;
2748 : }
2749 :
2750 175588 : for (df_ref ref = DF_REG_USE_CHAIN (id);
2751 525544 : ref;
2752 349956 : ref = DF_REF_NEXT_REG (ref))
2753 349956 : if (bitmap_bit_p (candidates, DF_REF_INSN_UID (ref)))
2754 : {
2755 16000 : if (dump_file)
2756 0 : fprintf (dump_file, "Removing insn %d from candidates list\n",
2757 0 : DF_REF_INSN_UID (ref));
2758 :
2759 16000 : bitmap_clear_bit (candidates, DF_REF_INSN_UID (ref));
2760 16000 : changed = true;
2761 : }
2762 : }
2763 : } while (changed);
2764 :
2765 829339 : BITMAP_FREE (regs);
2766 829339 : }
2767 :
2768 : /* Main STV pass function. Find and convert scalar
2769 : instructions into vector mode when profitable.

   When TIMODE_P, only TImode candidates are collected (and then pruned
   by timode_remove_non_convertible_regs); otherwise each insn is tried
   as an SImode and then as a DImode candidate, stopping at the first
   match.  Chains are built from the candidate sets and converted when
   compute_convert_gain approves.  If at least one insn was converted,
   the recorded stack alignment is raised to at least 128, TImode
   parameters whose DECL_RTL/DECL_INCOMING_RTL became a V1TImode REG
   are wrapped in a TImode SUBREG (64-bit targets only), and blocks are
   split after the control-flow insns collected from the chains.
   Always returns 0.  */
2770 :
2771 : static unsigned int
2772 1784924 : convert_scalars_to_vector (bool timode_p)
2773 : {
2774 1784924 : basic_block bb;
2775 1784924 : int converted_insns = 0;
2776 1784924 : auto_vec<rtx_insn *> control_flow_insns;
2777 :
2778 1784924 : bitmap_obstack_initialize (NULL);
2779 1784924 : const machine_mode cand_mode[3] = { SImode, DImode, TImode };
2780 1784924 : const machine_mode cand_vmode[3] = { V4SImode, V2DImode, V1TImode };
2781 5354772 : bitmap_head candidates[3]; /* { SImode, DImode, TImode } */
2782 7139696 : for (unsigned i = 0; i < 3; ++i)
2783 5354772 : bitmap_initialize (&candidates[i], &bitmap_default_obstack);
2784 :
2785 1784924 : calculate_dominance_info (CDI_DOMINATORS);
2786 1784924 : df_set_flags (DF_DEFER_INSN_RESCAN | DF_RD_PRUNE_DEAD_DEFS);
2787 1784924 : df_chain_add_problem (DF_DU_CHAIN | DF_UD_CHAIN);
2788 1784924 : df_analyze ();
2789 :
2790 : /* Find all instructions we want to convert into vector mode. */
2791 1784924 : if (dump_file)
2792 44 : fprintf (dump_file, "Searching for mode conversion candidates...\n");
2793 :
2794 19473538 : FOR_EACH_BB_FN (bb, cfun)
2795 : {
2796 17688614 : rtx_insn *insn;
2797 235136556 : FOR_BB_INSNS (bb, insn)
2798 217447942 : if (timode_p
2799 217447942 : && timode_scalar_to_vector_candidate_p (insn))
2800 : {
2801 1067349 : if (dump_file)
2802 0 : fprintf (dump_file, " insn %d is marked as a TImode candidate\n",
2803 0 : INSN_UID (insn));
2804 :
2805 1067349 : bitmap_set_bit (&candidates[2], INSN_UID (insn));
2806 : }
2807 216380593 : else if (!timode_p)
2808 : {
2809 : /* Check {SI,DI}mode. */
2810 338783276 : for (unsigned i = 0; i <= 1; ++i)
2811 232331737 : if (general_scalar_to_vector_candidate_p (insn, cand_mode[i]))
2812 : {
2813 11469851 : if (dump_file)
2814 554 : fprintf (dump_file, " insn %d is marked as a %s candidate\n",
2815 277 : INSN_UID (insn), i == 0 ? "SImode" : "DImode");
2816 :
2817 11469851 : bitmap_set_bit (&candidates[i], INSN_UID (insn));
2818 11469851 : break;
2819 : }
2820 : }
2821 : }
2822 :
2823 1784924 : if (timode_p)
2824 829339 : timode_remove_non_convertible_regs (&candidates[2]);
2825 :
2826 5654139 : for (unsigned i = 0; i <= 2; ++i) /* Only reports (to the dump file) when all three sets are empty. */
2827 4505860 : if (!bitmap_empty_p (&candidates[i]))
2828 : break;
2829 3869215 : else if (i == 2 && dump_file)
2830 23 : fprintf (dump_file, "There are no candidates for optimization.\n");
2831 :
2832 7139696 : for (unsigned i = 0; i <= 2; ++i)
2833 : {
2834 5354772 : auto_bitmap disallowed;
2835 5354772 : bitmap_tree_view (&candidates[i]);
2836 17023606 : while (!bitmap_empty_p (&candidates[i]))
2837 : {
2838 6314062 : unsigned uid = bitmap_first_set_bit (&candidates[i]);
2839 6314062 : scalar_chain *chain;
2840 :
2841 6314062 : if (cand_mode[i] == TImode)
2842 499483 : chain = new timode_scalar_chain;
2843 : else
2844 5814579 : chain = new general_scalar_chain (cand_mode[i], cand_vmode[i]);
2845 :
2846 : /* Find instructions chain we want to convert to vector mode.
2847 : Check all uses and definitions to estimate all required
2848 : conversions. */
2849 6314062 : if (chain->build (&candidates[i], uid, disallowed))
2850 : {
2851 6310193 : if (chain->compute_convert_gain ())
2852 631483 : converted_insns += chain->convert ();
2853 5678710 : else if (dump_file)
2854 136 : fprintf (dump_file, "Chain #%d conversion is not profitable\n",
2855 : chain->chain_id);
2856 : }
2857 :
2858 6314062 : rtx_insn* iter_insn;
2859 6314062 : unsigned int ii;
2860 6317650 : FOR_EACH_VEC_ELT (chain->control_flow_insns, ii, iter_insn) /* Save these before the chain is deleted. */
2861 3588 : control_flow_insns.safe_push (iter_insn);
2862 :
2863 6314062 : delete chain;
2864 : }
2865 5354772 : }
2866 :
2867 1784924 : if (dump_file)
2868 44 : fprintf (dump_file, "Total insns converted: %d\n", converted_insns);
2869 :
2870 7139696 : for (unsigned i = 0; i <= 2; ++i)
2871 5354772 : bitmap_release (&candidates[i]);
2872 1784924 : bitmap_obstack_release (NULL);
2873 1784924 : df_process_deferred_rescans ();
2874 :
2875 : /* Conversion means we may have 128bit register spills/fills
2876 : which require aligned stack. */
2877 1784924 : if (converted_insns)
2878 : {
2879 104041 : if (crtl->stack_alignment_needed < 128)
2880 2290 : crtl->stack_alignment_needed = 128;
2881 104041 : if (crtl->stack_alignment_estimated < 128)
2882 220 : crtl->stack_alignment_estimated = 128;
2883 :
2884 104041 : crtl->stack_realign_needed
2885 104041 : = INCOMING_STACK_BOUNDARY < crtl->stack_alignment_estimated;
2886 104041 : crtl->stack_realign_tried = crtl->stack_realign_needed;
2887 :
2888 104041 : crtl->stack_realign_processed = true;
2889 :
2890 104041 : if (!crtl->drap_reg)
2891 : {
2892 103864 : rtx drap_rtx = targetm.calls.get_drap_rtx ();
2893 :
2894 : /* stack_realign_drap and drap_rtx must match. */
2895 103864 : gcc_assert ((stack_realign_drap != 0) == (drap_rtx != NULL));
2896 :
2897 : /* Do nothing if NULL is returned,
2898 : which means DRAP is not needed. */
2899 103864 : if (drap_rtx != NULL)
2900 : {
2901 0 : crtl->args.internal_arg_pointer = drap_rtx;
2902 :
2903 : /* Call fixup_tail_calls to clean up
2904 : REG_EQUIV note if DRAP is needed. */
2905 0 : fixup_tail_calls ();
2906 : }
2907 : }
2908 :
2909 : /* Fix up DECL_RTL/DECL_INCOMING_RTL of arguments. */
2910 104041 : if (TARGET_64BIT)
2911 65546 : for (tree parm = DECL_ARGUMENTS (current_function_decl);
2912 178960 : parm; parm = DECL_CHAIN (parm))
2913 : {
2914 113414 : if (TYPE_MODE (TREE_TYPE (parm)) != TImode)
2915 97689 : continue;
2916 15725 : if (DECL_RTL_SET_P (parm)
2917 31450 : && GET_MODE (DECL_RTL (parm)) == V1TImode)
2918 : {
2919 611 : rtx r = DECL_RTL (parm);
2920 611 : if (REG_P (r))
2921 611 : SET_DECL_RTL (parm, gen_rtx_SUBREG (TImode, r, 0));
2922 : }
2923 15725 : if (DECL_INCOMING_RTL (parm)
2924 15725 : && GET_MODE (DECL_INCOMING_RTL (parm)) == V1TImode)
2925 : {
2926 0 : rtx r = DECL_INCOMING_RTL (parm);
2927 0 : if (REG_P (r))
2928 0 : DECL_INCOMING_RTL (parm) = gen_rtx_SUBREG (TImode, r, 0);
2929 : }
2930 : }
2931 :
2932 104041 : if (!control_flow_insns.is_empty ())
2933 : {
2934 1130 : free_dominance_info (CDI_DOMINATORS);
2935 :
2936 1130 : unsigned int i;
2937 1130 : rtx_insn* insn;
2938 5848 : FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
2939 3588 : if (control_flow_insn_p (insn))
2940 : {
2941 : /* Split the block after insn. There will be a fallthru
2942 : edge, which is OK so we keep it. We have to create
2943 : the exception edges ourselves. */
2944 3588 : bb = BLOCK_FOR_INSN (insn);
2945 3588 : split_block (bb, insn);
2946 3588 : rtl_make_eh_edge (NULL, bb, BB_END (bb));
2947 : }
2948 : }
2949 : }
2950 :
2951 1784924 : return 0;
2952 1784924 : }
2953 : /* Body of the vzeroupper insertion pass.  Disables every
   ix86_optimize_mode_switching entity except AVX_U128, re-runs the mode
   switching pass, then unlinks all REG_DEAD and REG_UNUSED notes from
   non-debug insns, removes the df_note problem and re-runs df_analyze.
   Always returns 0.  */
2954 : static unsigned int
2955 75059 : rest_of_handle_insert_vzeroupper (void)
2956 : {
2957 : /* vzeroupper instructions are inserted immediately after reload and
2958 : postreload_cse to clean up after it a little bit to account for possible
2959 : spills from 256bit or 512bit registers. The pass reuses mode switching
2960 : infrastructure by re-running mode insertion pass, so disable entities
2961 : that have already been processed. */
2962 525413 : for (int i = 0; i < MAX_386_ENTITIES; i++)
2963 450354 : ix86_optimize_mode_switching[i] = 0;
2964 :
2965 75059 : ix86_optimize_mode_switching[AVX_U128] = 1;
2966 :
2967 : /* Call optimize_mode_switching. */
2968 75059 : g->get_passes ()->execute_pass_mode_switching ();
2969 :
2970 : /* LRA removes all REG_DEAD/REG_UNUSED notes and normally they
2971 : reappear in the IL only at the start of pass_rtl_dse2, which does
2972 : df_note_add_problem (); df_analyze ();
2973 : The vzeroupper is scheduled after postreload_cse pass and mode
2974 : switching computes the notes as well, the problem is that e.g.
2975 : pass_gcse2 doesn't maintain the notes, see PR113059 and
2976 : PR112760. Remove the notes now to restore status quo ante
2977 : until we figure out how to maintain the notes or what else
2978 : to do. */
2979 75059 : basic_block bb;
2980 75059 : rtx_insn *insn;
2981 405839 : FOR_EACH_BB_FN (bb, cfun)
2982 4245661 : FOR_BB_INSNS (bb, insn)
2983 3914881 : if (NONDEBUG_INSN_P (insn))
2984 : {
2985 2095565 : rtx *pnote = &REG_NOTES (insn); /* Walk the note list through a pointer to the link being examined. */
2986 3891768 : while (*pnote != 0)
2987 : {
2988 1796203 : if (REG_NOTE_KIND (*pnote) == REG_DEAD
2989 822929 : || REG_NOTE_KIND (*pnote) == REG_UNUSED)
2990 1289585 : *pnote = XEXP (*pnote, 1); /* Unlink this note; PNOTE now designates its successor. */
2991 : else
2992 506618 : pnote = &XEXP (*pnote, 1);
2993 : }
2994 : }
2995 :
2996 75059 : df_remove_problem (df_note);
2997 75059 : df_analyze ();
2998 75059 : return 0;
2999 : }
3000 :
3001 : namespace {
3002 : /* Static descriptor of the "vzeroupper" RTL pass. */
3003 : const pass_data pass_data_insert_vzeroupper =
3004 : {
3005 : RTL_PASS, /* type */
3006 : "vzeroupper", /* name */
3007 : OPTGROUP_NONE, /* optinfo_flags */
3008 : TV_MACH_DEP, /* tv_id */
3009 : 0, /* properties_required */
3010 : 0, /* properties_provided */
3011 : 0, /* properties_destroyed */
3012 : 0, /* todo_flags_start */
3013 : TODO_df_finish, /* todo_flags_finish */
3014 : };
3015 : /* Pass object that runs rest_of_handle_insert_vzeroupper. */
3016 : class pass_insert_vzeroupper : public rtl_opt_pass
3017 : {
3018 : public:
3019 298828 : pass_insert_vzeroupper(gcc::context *ctxt)
3020 597656 : : rtl_opt_pass(pass_data_insert_vzeroupper, ctxt)
3021 : {}
3022 :
3023 : /* opt_pass methods: */
3024 1488378 : bool gate (function *) final override /* Enabled only when both TARGET_AVX and TARGET_VZEROUPPER hold. */
3025 : {
3026 1488378 : return TARGET_AVX && TARGET_VZEROUPPER;
3027 : }
3028 :
3029 75059 : unsigned int execute (function *) final override /* Forwards to rest_of_handle_insert_vzeroupper and returns its result. */
3030 : {
3031 75059 : return rest_of_handle_insert_vzeroupper ();
3032 : }
3033 :
3034 : }; // class pass_insert_vzeroupper
3035 : /* Static descriptor of the "stv" RTL pass. */
3036 : const pass_data pass_data_stv =
3037 : {
3038 : RTL_PASS, /* type */
3039 : "stv", /* name */
3040 : OPTGROUP_NONE, /* optinfo_flags */
3041 : TV_MACH_DEP, /* tv_id */
3042 : 0, /* properties_required */
3043 : 0, /* properties_provided */
3044 : 0, /* properties_destroyed */
3045 : 0, /* todo_flags_start */
3046 : TODO_df_finish, /* todo_flags_finish */
3047 : };
3048 : /* Pass object that runs convert_scalars_to_vector; the TIMODE_P flag is set through set_pass_param. */
3049 : class pass_stv : public rtl_opt_pass
3050 : {
3051 : public:
3052 597656 : pass_stv (gcc::context *ctxt)
3053 597656 : : rtl_opt_pass (pass_data_stv, ctxt),
3054 1195312 : timode_p (false)
3055 : {}
3056 :
3057 : /* opt_pass methods: */
3058 2976756 : bool gate (function *) final override /* An instance with TIMODE_P set additionally requires TARGET_64BIT. */
3059 : {
3060 1488378 : return ((!timode_p || TARGET_64BIT)
3061 4338567 : && TARGET_STV && TARGET_SSE2 && optimize > 1);
3062 : }
3063 :
3064 1784924 : unsigned int execute (function *) final override /* Passes TIMODE_P on to convert_scalars_to_vector. */
3065 : {
3066 1784924 : return convert_scalars_to_vector (timode_p);
3067 : }
3068 :
3069 298828 : opt_pass *clone () final override /* The clone starts with TIMODE_P false (constructor default), not this object's value. */
3070 : {
3071 298828 : return new pass_stv (m_ctxt);
3072 : }
3073 :
3074 597656 : void set_pass_param (unsigned int n, bool param) final override /* Only parameter index 0 is accepted; it sets TIMODE_P. */
3075 : {
3076 597656 : gcc_assert (n == 0);
3077 597656 : timode_p = param;
3078 597656 : }
3079 :
3080 : private:
3081 : bool timode_p; /* Forwarded to convert_scalars_to_vector; false until set_pass_param is called. */
3082 : }; // class pass_stv
3083 :
3084 : } // anon namespace
3085 :
3086 : rtl_opt_pass * /* Allocate a new pass_insert_vzeroupper for CTXT with operator new and return it. */
3087 298828 : make_pass_insert_vzeroupper (gcc::context *ctxt)
3088 : {
3089 298828 : return new pass_insert_vzeroupper (ctxt);
3090 : }
3091 :
3092 : rtl_opt_pass * /* Allocate a new pass_stv for CTXT with operator new and return it; its timode_p starts false. */
3093 298828 : make_pass_stv (gcc::context *ctxt)
3094 : {
3095 298828 : return new pass_stv (ctxt);
3096 : }
3097 :
3098 : /* Inserting ENDBR and pseudo patchable-area instructions. */
3099 :
3100 : static void /* NEED_ENDBR requests ENDBR insertion; a nonzero PATCHABLE_AREA_SIZE requests a patchable-area insn at function entry. */
3101 190323 : rest_of_insert_endbr_and_patchable_area (bool need_endbr,
3102 : unsigned int patchable_area_size)
3103 : {
3104 190323 : rtx endbr;
3105 190323 : rtx_insn *insn;
3106 190323 : rtx_insn *endbr_insn = NULL; /* Entry ENDBR emitted here, if any; the patchable area is placed after it. */
3107 190323 : basic_block bb;
3108 :
3109 190323 : if (need_endbr)
3110 : {
3111 : /* Currently emit EB if it's a tracking function, i.e. 'nocf_check'
3112 : is absent among function attributes. Later an optimization will
3113 : be introduced to make analysis if an address of a static function
3114 : is taken. A static function whose address is not taken will get
3115 : a nocf_check attribute. This will allow to reduce the number of
3116 : EB. */
3117 190278 : if (!lookup_attribute ("nocf_check",
3118 190278 : TYPE_ATTRIBUTES (TREE_TYPE (cfun->decl)))
3119 190260 : && (!flag_manual_endbr
3120 8 : || lookup_attribute ("cf_check",
3121 8 : DECL_ATTRIBUTES (cfun->decl)))
3122 380537 : && (!cgraph_node::get (cfun->decl)->only_called_directly_p ()
3123 27283 : || ix86_cmodel == CM_LARGE
3124 27282 : || ix86_cmodel == CM_LARGE_PIC
3125 27281 : || flag_force_indirect_call
3126 27281 : || (TARGET_DLLIMPORT_DECL_ATTRIBUTES
3127 : && DECL_DLLIMPORT_P (cfun->decl))))
3128 : {
3129 162979 : if (crtl->profile && flag_fentry)
3130 : {
3131 : /* Queue ENDBR insertion to x86_function_profiler.
3132 : NB: Any patchable-area insn will be inserted after
3133 : ENDBR. */
3134 6 : cfun->machine->insn_queued_at_entrance = TYPE_ENDBR;
3135 : }
3136 : else
3137 : {
3138 162973 : endbr = gen_nop_endbr ();
3139 162973 : bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
3140 162973 : rtx_insn *insn = BB_HEAD (bb); /* Local INSN shadows the function-level one. */
3141 162973 : endbr_insn = emit_insn_before (endbr, insn);
3142 : }
3143 : }
3144 : }
3145 : /* Patchable area: queued for x86_function_profiler or emitted at function entry. */
3146 190323 : if (patchable_area_size)
3147 : {
3148 51 : if (crtl->profile && flag_fentry)
3149 : {
3150 : /* Queue patchable-area insertion to x86_function_profiler.
3151 : NB: If there is a queued ENDBR, x86_function_profiler
3152 : will also handle patchable-area. */
3153 2 : if (!cfun->machine->insn_queued_at_entrance)
3154 1 : cfun->machine->insn_queued_at_entrance = TYPE_PATCHABLE_AREA;
3155 : }
3156 : else
3157 : {
3158 49 : rtx patchable_area
3159 49 : = gen_patchable_area (GEN_INT (patchable_area_size),
3160 49 : GEN_INT (crtl->patch_area_entry == 0)); /* Second operand is 1 exactly when patch_area_entry is 0. */
3161 49 : if (endbr_insn)
3162 3 : emit_insn_after (patchable_area, endbr_insn);
3163 : else
3164 : {
3165 46 : bb = ENTRY_BLOCK_PTR_FOR_FN (cfun)->next_bb;
3166 46 : insn = BB_HEAD (bb);
3167 46 : emit_insn_before (patchable_area, insn);
3168 : }
3169 : }
3170 : }
3171 : /* Everything below only inserts ENDBR inside the function body. */
3172 190323 : if (!need_endbr)
3173 : return;
3174 : /* Scan every insn of every block for calls, switch-table jumps and preserved labels. */
3175 190278 : bb = 0;
3176 3897299 : FOR_EACH_BB_FN (bb, cfun)
3177 : {
3178 71470982 : for (insn = BB_HEAD (bb); insn != NEXT_INSN (BB_END (bb));
3179 67763961 : insn = NEXT_INSN (insn))
3180 : {
3181 67763961 : if (CALL_P (insn))
3182 : {
3183 1336996 : need_endbr = find_reg_note (insn, REG_SETJMP, NULL) != NULL; /* NEED_ENDBR is reused from here on as a per-call flag. */
3184 1336996 : if (!need_endbr && !SIBLING_CALL_P (insn))
3185 : {
3186 1289568 : rtx call = get_call_rtx_from (insn);
3187 1289568 : rtx fnaddr = XEXP (call, 0);
3188 1289568 : tree fndecl = NULL_TREE;
3189 :
3190 : /* Also generate ENDBRANCH for non-tail call which
3191 : may return via indirect branch. */
3192 1289568 : if (SYMBOL_REF_P (XEXP (fnaddr, 0)))
3193 1233227 : fndecl = SYMBOL_REF_DECL (XEXP (fnaddr, 0));
3194 1233227 : if (fndecl == NULL_TREE)
3195 56709 : fndecl = MEM_EXPR (fnaddr); /* Fall back to the MEM's expression when no decl came from a SYMBOL_REF. */
3196 56709 : if (fndecl
3197 1287327 : && TREE_CODE (TREE_TYPE (fndecl)) != FUNCTION_TYPE
3198 543006 : && TREE_CODE (TREE_TYPE (fndecl)) != METHOD_TYPE)
3199 : fndecl = NULL_TREE; /* Discard anything whose type is neither a function nor a method type. */
3200 1289568 : if (fndecl && TYPE_ARG_TYPES (TREE_TYPE (fndecl)))
3201 : {
3202 1248979 : tree fntype = TREE_TYPE (fndecl);
3203 1248979 : if (lookup_attribute ("indirect_return",
3204 1248979 : TYPE_ATTRIBUTES (fntype)))
3205 : need_endbr = true;
3206 : }
3207 : }
3208 1336984 : if (!need_endbr)
3209 1336976 : continue;
3210 : /* Generate ENDBRANCH after CALL, which can return more than
3211 : twice, setjmp-like functions. */
3212 :
3213 20 : endbr = gen_nop_endbr ();
3214 20 : emit_insn_after_setloc (endbr, insn, INSN_LOCATION (insn)); /* The new ENDBR takes the call's location. */
3215 20 : continue;
3216 20 : }
3217 : /* Jumps are only examined when flag_cet_switch is set. */
3218 66426965 : if (JUMP_P (insn) && flag_cet_switch)
3219 : {
3220 9 : rtx target = JUMP_LABEL (insn);
3221 9 : if (target == NULL_RTX || ANY_RETURN_P (target))
3222 5 : continue;
3223 :
3224 : /* Check the jump is a switch table. */
3225 4 : rtx_insn *label = as_a<rtx_insn *> (target);
3226 4 : rtx_insn *table = next_insn (label);
3227 4 : if (table == NULL_RTX || !JUMP_TABLE_DATA_P (table))
3228 2 : continue;
3229 :
3230 : /* For the indirect jump find out all places it jumps and insert
3231 : ENDBRANCH there. It should be done under a special flag to
3232 : control ENDBRANCH generation for switch stmts. */
3233 2 : edge_iterator ei;
3234 2 : edge e;
3235 2 : basic_block dest_blk;
3236 :
3237 24 : FOR_EACH_EDGE (e, ei, bb->succs)
3238 : {
3239 22 : rtx_insn *insn; /* Shadows the loop variable INSN; the outer iteration is unaffected. */
3240 :
3241 22 : dest_blk = e->dest;
3242 22 : insn = BB_HEAD (dest_blk);
3243 22 : gcc_assert (LABEL_P (insn)); /* Every successor block is required to begin with a label. */
3244 22 : endbr = gen_nop_endbr ();
3245 22 : emit_insn_after (endbr, insn);
3246 : }
3247 2 : continue;
3248 2 : }
3249 : /* A label with LABEL_PRESERVE_P set gets an ENDBR right after it. */
3250 66426956 : if (LABEL_P (insn) && LABEL_PRESERVE_P (insn))
3251 : {
3252 135411 : endbr = gen_nop_endbr ();
3253 135411 : emit_insn_after (endbr, insn);
3254 135411 : continue;
3255 : }
3256 : }
3257 : }
3258 :
3259 : return;
3260 : }
3261 :
3262 : namespace {
3263 : /* Static descriptor of the "endbr_and_patchable_area" RTL pass. */
3264 : const pass_data pass_data_insert_endbr_and_patchable_area =
3265 : {
3266 : RTL_PASS, /* type. */
3267 : "endbr_and_patchable_area", /* name. */
3268 : OPTGROUP_NONE, /* optinfo_flags. */
3269 : TV_MACH_DEP, /* tv_id. */
3270 : 0, /* properties_required. */
3271 : 0, /* properties_provided. */
3272 : 0, /* properties_destroyed. */
3273 : 0, /* todo_flags_start. */
3274 : 0, /* todo_flags_finish. */
3275 : };
3276 : /* Pass object that runs rest_of_insert_endbr_and_patchable_area with the values computed by gate. */
3277 : class pass_insert_endbr_and_patchable_area : public rtl_opt_pass
3278 : {
3279 : public:
3280 298828 : pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
3281 597656 : : rtl_opt_pass (pass_data_insert_endbr_and_patchable_area, ctxt)
3282 : {}
3283 :
3284 : /* opt_pass methods: */
3285 1488378 : bool gate (function *) final override /* Side effect: stores NEED_ENDBR and PATCHABLE_AREA_SIZE for execute to use. */
3286 : {
3287 1488378 : need_endbr = (flag_cf_protection & CF_BRANCH) != 0;
3288 1488378 : patchable_area_size = crtl->patch_area_size - crtl->patch_area_entry;
3289 1488378 : return need_endbr || patchable_area_size;
3290 : }
3291 :
3292 190323 : unsigned int execute (function *) final override /* Runs the worker under the TV_MACH_DEP timer; always returns 0. */
3293 : {
3294 190323 : timevar_push (TV_MACH_DEP);
3295 190323 : rest_of_insert_endbr_and_patchable_area (need_endbr,
3296 : patchable_area_size);
3297 190323 : timevar_pop (TV_MACH_DEP);
3298 190323 : return 0;
3299 : }
3300 :
3301 : private:
3302 : bool need_endbr; /* Not set by the constructor; assigned in gate. */
3303 : unsigned int patchable_area_size; /* Not set by the constructor; assigned in gate. */
3304 : }; // class pass_insert_endbr_and_patchable_area
3305 :
3306 : } // anon namespace
3307 :
3308 : rtl_opt_pass * /* Allocate a new pass_insert_endbr_and_patchable_area for CTXT with operator new and return it. */
3309 298828 : make_pass_insert_endbr_and_patchable_area (gcc::context *ctxt)
3310 : {
3311 298828 : return new pass_insert_endbr_and_patchable_area (ctxt);
3312 : }
3313 :
3314 : bool /* Gate predicate used by the "rpad" pass: true only when every condition below holds for cfun. */
3315 6036338 : ix86_rpad_gate ()
3316 : {
3317 6036338 : return (TARGET_AVX
3318 392529 : && TARGET_SSE_PARTIAL_REG_DEPENDENCY
3319 297634 : && TARGET_SSE_MATH
3320 297320 : && optimize
3321 6328305 : && optimize_function_for_speed_p (cfun));
3322 : }
3323 :
3324 : enum x86_cse_kind /* Classifies a redundant_pattern; stored in its KIND field. */
3325 : {
3326 : X86_CSE_CONST0_VECTOR, /* Reported by ix86_broadcast_inner when standard_sse_constant_p returns 1. */
3327 : X86_CSE_CONSTM1_VECTOR, /* Reported by ix86_broadcast_inner when standard_sse_constant_p returns 2. */
3328 : X86_CSE_CONST_VECTOR,
3329 : X86_CSE_VEC_DUP,
3330 : X86_CSE_TLS_GD,
3331 : X86_CSE_TLS_LD_BASE,
3332 : X86_CSE_TLSDESC
3333 : };
3334 :
3335 154760 : struct redundant_pattern /* Record describing one redundant value and the instructions that use it. */
3336 : {
3337 : /* Bitmap of basic blocks with broadcast instructions. */
3338 : auto_bitmap bbs;
3339 : /* Bitmap of broadcast instructions. */
3340 : auto_bitmap insns;
3341 : /* The broadcast inner scalar. */
3342 : rtx val;
3343 : /* The actual redundant source value for UNSPEC_TLSDESC. */
3344 : rtx tlsdesc_val;
3345 : /* The inner scalar mode. */
3346 : machine_mode mode;
3347 : /* The destination mode which can be changed to the integer mode of
3348 : the same time. NOTE(review): "same time" presumably means
      "same size" -- confirm. */
3349 : machine_mode dest_mode;
3350 : /* The instruction which sets the inner scalar. Nullptr if the inner
3351 : scalar is applied to the whole function, instead of within the same
3352 : block. */
3353 : rtx_insn *def_insn;
3354 : /* The widest broadcast source. */
3355 : rtx broadcast_source;
3356 : /* The widest broadcast register. */
3357 : rtx broadcast_reg;
3358 : /* The basic block of the broadcast instruction. */
3359 : basic_block bb;
3360 : /* The number of broadcast instructions with the same inner scalar. */
3361 : unsigned HOST_WIDE_INT count;
3362 : /* The threshold of broadcast instructions with the same inner
3363 : scalar. */
3364 : unsigned int threshold;
3365 : /* The widest broadcast size in bytes. */
3366 : unsigned int size;
3367 : /* Load kind. */
3368 : x86_cse_kind kind;
3369 : };
3370 :
3371 : /* Generate a vector set, DEST = SRC, at entry of the nearest dominator
3372 : for basic block map BBS, which is in the fake loop that contains the
3373 : whole function, so that there is only a single vector set in the
3374 : whole function. If not nullptr, LOAD is a pointer to the load. */
3375 :
3376 : static void /* Emits DEST = SRC in a block dominating all of BBS; for X86_CSE_VEC_DUP it also emits the scalar set feeding SRC. */
3377 43402 : ix86_place_single_vector_set (rtx dest, rtx src, bitmap bbs,
3378 : redundant_pattern *load = nullptr)
3379 : {
3380 43402 : basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
3381 : /* For X86_CSE_VEC_DUP and X86_CSE_CONST_VECTOR, don't place the vector
3382 : set outside of the loop to avoid extra spills. */
3383 43402 : if (!load
3384 42380 : || (load->kind != X86_CSE_VEC_DUP
3385 42380 : && load->kind != X86_CSE_CONST_VECTOR))
3386 : {
3387 23839 : while (bb->loop_father->latch /* Walk up until BB's loop has the exit block as latch, i.e. the fake whole-function loop. */
3388 23839 : != EXIT_BLOCK_PTR_FOR_FN (cfun))
3389 1411 : bb = get_immediate_dominator (CDI_DOMINATORS,
3390 : bb->loop_father->header);
3391 : }
3392 : /* A CONST_INT source is stored through a SUBREG of DEST in LOAD->dest_mode; LOAD is dereferenced unchecked here. */
3393 43402 : if (CONST_INT_P (src))
3394 10644 : dest = gen_rtx_SUBREG (load->dest_mode, dest, 0);
3395 32758 : else if (CONST_VECTOR_P (src))
3396 : {
3397 : /* The only possible CONST_VECTORs of SRC are CONST0_RTX and
3398 : CONSTM1_RTX. Otherwise,
3399 :
3400 : rtx set = gen_rtx_SET (dest, src);
3401 :
3402 : won't be a valid instruction. CONST0_RTX always works. It
3403 : can come from:
3404 :
3405 : 1. remove_partial_avx_dependency with LOAD == NULL.
3406 : 2. X86_CSE_VEC_DUP with
3407 :
3408 : (insn 48 58 16 3 (set (reg:V4HI 123)
3409 : (const_vector:V4HI [
3410 : (const_int 0 [0]) repeated x4
3411 : ])) 2065 {*movv4hi_internal} (nil))
3412 :
3413 : 3. X86_CSE_CONST0_VECTOR.
3414 : */
3415 22428 : machine_mode mode = GET_MODE (dest);
3416 22428 : if (!(src == CONST0_RTX (mode)
3417 1578 : || (src == CONSTM1_RTX (mode)
3418 1578 : && load->kind == X86_CSE_CONSTM1_VECTOR)))
3419 0 : gcc_unreachable ();
3420 : }
3421 43402 : rtx set = gen_rtx_SET (dest, src);
3422 : /* Find the first non-debug insn of BB; INSN becomes NULL when the block has none. */
3423 43402 : rtx_insn *insn = BB_HEAD (bb);
3424 170720 : while (insn && !NONDEBUG_INSN_P (insn))
3425 : {
3426 127322 : if (insn == BB_END (bb))
3427 : {
3428 : insn = NULL;
3429 : break;
3430 : }
3431 127318 : insn = NEXT_INSN (insn);
3432 : }
3433 : /* Emit before the block head if that is the first real insn; otherwise after the insn preceding it, or after BB_END. */
3434 43402 : rtx_insn *set_insn;
3435 43402 : if (insn == BB_HEAD (bb))
3436 : {
3437 0 : set_insn = emit_insn_before (set, insn);
3438 0 : if (dump_file)
3439 : {
3440 0 : fprintf (dump_file, "\nPlace:\n\n");
3441 0 : print_rtl_single (dump_file, set_insn);
3442 0 : fprintf (dump_file, "\nbefore:\n\n");
3443 0 : print_rtl_single (dump_file, insn);
3444 0 : fprintf (dump_file, "\n");
3445 : }
3446 : }
3447 : else
3448 : {
3449 43402 : rtx_insn *after = insn ? PREV_INSN (insn) : BB_END (bb);
3450 43402 : set_insn = emit_insn_after (set, after);
3451 43402 : if (dump_file)
3452 : {
3453 2 : fprintf (dump_file, "\nPlace:\n\n");
3454 2 : print_rtl_single (dump_file, set_insn);
3455 2 : fprintf (dump_file, "\nafter:\n\n");
3456 2 : print_rtl_single (dump_file, after);
3457 2 : fprintf (dump_file, "\n");
3458 : }
3459 : }
3460 : /* For a broadcast, also emit the set of the scalar register just before SET_INSN. */
3461 43402 : if (load && load->kind == X86_CSE_VEC_DUP)
3462 : {
3463 : /* Get the source from LOAD as (reg:SI 99) in
3464 :
3465 : (vec_duplicate:V4SI (reg:SI 99))
3466 :
3467 : */
3468 10330 : rtx inner_scalar = load->val;
3469 : /* Set the source in (vec_duplicate:V4SI (reg:SI 99)). */
3470 10330 : rtx reg = XEXP (src, 0);
3471 10330 : machine_mode reg_mode = GET_MODE (reg);
3472 10330 : if (reg_mode != GET_MODE (inner_scalar))
3473 : {
3474 10048 : if (REG_P (inner_scalar) || MEM_P (inner_scalar))
3475 0 : inner_scalar = gen_rtx_SUBREG (reg_mode, inner_scalar, 0); /* Mode mismatch on a REG/MEM: view the scalar in REG's mode. */
3476 10048 : else if (!SCALAR_INT_MODE_P (reg_mode))
3477 : {
3478 : /* For non-int load with integer constant, generate
3479 :
3480 : (set (subreg:SI (reg/v:SF 105 [ f ]) 0)
3481 : (const_int 1313486336 [0x4e4a3600]))
3482 :
3483 : */
3484 1 : gcc_assert (CONST_INT_P (inner_scalar));
3485 1 : unsigned int bits = GET_MODE_BITSIZE (reg_mode);
3486 1 : machine_mode mode = int_mode_for_size (bits, 0).require ();
3487 1 : reg = gen_rtx_SUBREG (mode, reg, 0);
3488 : }
3489 : }
3490 10330 : rtx set = gen_rtx_SET (reg, inner_scalar); /* Shadows the outer SET, which has already been emitted. */
3491 10330 : insn = emit_insn_before (set, set_insn);
3492 10330 : if (dump_file)
3493 : {
3494 0 : fprintf (dump_file, "\nAdd:\n\n");
3495 0 : print_rtl_single (dump_file, insn);
3496 0 : fprintf (dump_file, "\nbefore:\n\n");
3497 0 : print_rtl_single (dump_file, set_insn);
3498 0 : fprintf (dump_file, "\n");
3499 : }
3500 : }
3501 43402 : }
3502 :
3503 : /* At entry of the nearest common dominator for basic blocks with
3504 : conversions/rcp/sqrt/rsqrt/round, generate a single
3505 : vxorps %xmmN, %xmmN, %xmmN
3506 : for all
3507 : vcvtss2sd op, %xmmN, %xmmX
3508 : vcvtsd2ss op, %xmmN, %xmmX
3509 : vcvtsi2ss op, %xmmN, %xmmX
3510 : vcvtsi2sd op, %xmmN, %xmmX
3511 :
3512 : NB: We want to generate only a single vxorps to cover the whole
3513 : function. The LCM algorithm isn't appropriate here since it may
3514 : place a vxorps inside the loop. */
3515 :
3516 : static unsigned int /* Execute body of the "rpad" pass; always returns 0. */
3517 33896 : remove_partial_avx_dependency (void)
3518 : {
3519 33896 : timevar_push (TV_MACH_DEP);
3520 :
3521 33896 : bitmap_obstack_initialize (NULL);
3522 33896 : bitmap convert_bbs = BITMAP_ALLOC (NULL); /* Indices of the blocks in which an insn was rewritten. */
3523 :
3524 33896 : basic_block bb;
3525 33896 : rtx_insn *insn, *set_insn;
3526 33896 : rtx set;
3527 33896 : rtx v4sf_const0 = NULL_RTX; /* Shared zero pseudo, created on the first rewritten insn. */
3528 :
3529 33896 : auto_vec<rtx_insn *> control_flow_insns; /* New insns that received a REG_EH_REGION note; revisited after the scan. */
3530 :
3531 : /* We create invalid RTL initially so defer rescans. */
3532 33896 : df_set_flags (DF_DEFER_INSN_RESCAN);
3533 :
3534 311131 : FOR_EACH_BB_FN (bb, cfun)
3535 : {
3536 3474682 : FOR_BB_INSNS (bb, insn)
3537 : {
3538 3197447 : if (!NONDEBUG_INSN_P (insn))
3539 1417202 : continue;
3540 :
3541 1780245 : set = single_set (insn);
3542 1780245 : if (!set)
3543 71411 : continue;
3544 :
3545 1708834 : if (get_attr_avx_partial_xmm_update (insn)
3546 : != AVX_PARTIAL_XMM_UPDATE_TRUE)
3547 1705661 : continue;
3548 :
3549 : /* Convert PARTIAL_XMM_UPDATE_TRUE insns, DF -> SF, SF -> DF,
3550 : SI -> SF, SI -> DF, DI -> SF, DI -> DF, sqrt, rsqrt, rcp,
3551 : round, to vec_dup and vec_merge with subreg. */
3552 3173 : rtx src = SET_SRC (set);
3553 3173 : rtx dest = SET_DEST (set);
3554 3173 : machine_mode dest_mode = GET_MODE (dest);
3555 3173 : bool convert_p = false;
3556 3173 : switch (GET_CODE (src))
3557 : {
3558 3108 : case FLOAT:
3559 3108 : case FLOAT_EXTEND:
3560 3108 : case FLOAT_TRUNCATE:
3561 3108 : case UNSIGNED_FLOAT:
3562 3108 : convert_p = true;
3563 3108 : break;
3564 : default:
3565 : break;
3566 : }
3567 :
3568 : /* Only handle conversion here. */
3569 3108 : machine_mode src_mode
3570 3108 : = convert_p ? GET_MODE (XEXP (src, 0)) : VOIDmode;
3571 3108 : switch (src_mode)
3572 : {
3573 153 : case E_SFmode:
3574 153 : case E_DFmode:
3575 153 : if (TARGET_USE_VECTOR_FP_CONVERTS
3576 147 : || !TARGET_SSE_PARTIAL_REG_FP_CONVERTS_DEPENDENCY)
3577 8 : continue;
3578 : break;
3579 2955 : case E_SImode:
3580 2955 : case E_DImode:
3581 2955 : if (TARGET_USE_VECTOR_CONVERTS
3582 2943 : || !TARGET_SSE_PARTIAL_REG_CONVERTS_DEPENDENCY)
3583 14 : continue;
3584 : break;
3585 65 : case E_VOIDmode:
3586 65 : gcc_assert (!convert_p); /* VOIDmode is only expected for the non-conversion insns. */
3587 : break;
3588 0 : default:
3589 0 : gcc_unreachable ();
3590 : }
3591 :
3592 3151 : if (!v4sf_const0)
3593 1022 : v4sf_const0 = gen_reg_rtx (V4SFmode);
3594 : /* Pick the vector mode for DEST_MODE and a view of the shared zero register in that mode. */
3595 3151 : rtx zero;
3596 3151 : machine_mode dest_vecmode;
3597 3151 : switch (dest_mode)
3598 : {
3599 50 : case E_HFmode:
3600 50 : dest_vecmode = V8HFmode;
3601 50 : zero = gen_rtx_SUBREG (V8HFmode, v4sf_const0, 0);
3602 50 : break;
3603 : case E_SFmode:
3604 : dest_vecmode = V4SFmode;
3605 : zero = v4sf_const0;
3606 : break;
3607 1167 : case E_DFmode:
3608 1167 : dest_vecmode = V2DFmode;
3609 1167 : zero = gen_rtx_SUBREG (V2DFmode, v4sf_const0, 0);
3610 1167 : break;
3611 0 : default:
3612 0 : gcc_unreachable ();
3613 : }
3614 :
3615 : /* Change source to vector mode. */
3616 3151 : src = gen_rtx_VEC_DUPLICATE (dest_vecmode, src);
3617 3151 : src = gen_rtx_VEC_MERGE (dest_vecmode, src, zero,
3618 : GEN_INT (HOST_WIDE_INT_1U));
3619 : /* Change destination to vector mode. */
3620 3151 : rtx vec = gen_reg_rtx (dest_vecmode);
3621 : /* Generate an XMM vector SET. */
3622 3151 : set = gen_rtx_SET (vec, src);
3623 3151 : set_insn = emit_insn_before (set, insn);
3624 :
3625 3151 : if (cfun->can_throw_non_call_exceptions)
3626 : {
3627 : /* Handle REG_EH_REGION note. */
3628 0 : rtx note = find_reg_note (insn, REG_EH_REGION, NULL_RTX);
3629 0 : if (note)
3630 : {
3631 0 : control_flow_insns.safe_push (set_insn);
3632 0 : add_reg_note (set_insn, REG_EH_REGION, XEXP (note, 0));
3633 : }
3634 : }
3635 : /* The original insn becomes a copy from the low part of the new vector register. */
3636 3151 : src = gen_rtx_SUBREG (dest_mode, vec, 0);
3637 3151 : set = gen_rtx_SET (dest, src);
3638 :
3639 : /* Drop possible dead definitions. */
3640 3151 : PATTERN (insn) = set;
3641 :
3642 3151 : INSN_CODE (insn) = -1; /* Force re-recognition of the rewritten pattern. */
3643 3151 : recog_memoized (insn);
3644 3151 : df_insn_rescan (insn);
3645 3151 : bitmap_set_bit (convert_bbs, bb->index);
3646 : }
3647 : }
3648 : /* V4SF_CONST0 is non-null only if at least one insn was rewritten. */
3649 33896 : if (v4sf_const0)
3650 : {
3651 : /* (Re-)discover loops so that bb->loop_father can be used in the
3652 : analysis below. */
3653 1022 : calculate_dominance_info (CDI_DOMINATORS);
3654 1022 : loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
3655 :
3656 1022 : ix86_place_single_vector_set (v4sf_const0,
3657 : CONST0_RTX (V4SFmode),
3658 : convert_bbs);
3659 :
3660 1022 : loop_optimizer_finalize ();
3661 :
3662 1022 : if (!control_flow_insns.is_empty ())
3663 : {
3664 0 : free_dominance_info (CDI_DOMINATORS);
3665 :
3666 0 : unsigned int i;
3667 0 : FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
3668 0 : if (control_flow_insn_p (insn))
3669 : {
3670 : /* Split the block after insn. There will be a fallthru
3671 : edge, which is OK so we keep it. We have to create
3672 : the exception edges ourselves. */
3673 0 : bb = BLOCK_FOR_INSN (insn);
3674 0 : split_block (bb, insn);
3675 0 : rtl_make_eh_edge (NULL, bb, BB_END (bb));
3676 : }
3677 : }
3678 : }
3679 : /* NOTE(review): CONVERT_BBS is freed after the NULL obstack it was allocated from is released -- confirm this order is intended. */
3680 33896 : df_process_deferred_rescans ();
3681 33896 : df_clear_flags (DF_DEFER_INSN_RESCAN);
3682 33896 : bitmap_obstack_release (NULL);
3683 33896 : BITMAP_FREE (convert_bbs);
3684 :
3685 33896 : timevar_pop (TV_MACH_DEP);
3686 33896 : return 0;
3687 33896 : }
3688 :
3689 : namespace {
3690 : /* Static descriptor of the "rpad" RTL pass. */
3691 : const pass_data pass_data_remove_partial_avx_dependency =
3692 : {
3693 : RTL_PASS, /* type */
3694 : "rpad", /* name */
3695 : OPTGROUP_NONE, /* optinfo_flags */
3696 : TV_MACH_DEP, /* tv_id */
3697 : 0, /* properties_required */
3698 : 0, /* properties_provided */
3699 : 0, /* properties_destroyed */
3700 : 0, /* todo_flags_start */
3701 : 0, /* todo_flags_finish */
3702 : };
3703 : /* Pass object that runs remove_partial_avx_dependency when ix86_rpad_gate allows it. */
3704 : class pass_remove_partial_avx_dependency : public rtl_opt_pass
3705 : {
3706 : public:
3707 298828 : pass_remove_partial_avx_dependency (gcc::context *ctxt)
3708 597656 : : rtl_opt_pass (pass_data_remove_partial_avx_dependency, ctxt)
3709 : {}
3710 :
3711 : /* opt_pass methods: */
3712 1488378 : bool gate (function *) final override /* Delegates to ix86_rpad_gate. */
3713 : {
3714 1488378 : return ix86_rpad_gate ();
3715 : }
3716 :
3717 33896 : unsigned int execute (function *) final override /* Delegates to remove_partial_avx_dependency and returns its result. */
3718 : {
3719 33896 : return remove_partial_avx_dependency ();
3720 : }
3721 : }; // class pass_remove_partial_avx_dependency
3722 :
3723 : } // anon namespace
3724 :
3725 : rtl_opt_pass * /* Allocate a new pass_remove_partial_avx_dependency for CTXT with operator new and return it. */
3726 298828 : make_pass_remove_partial_avx_dependency (gcc::context *ctxt)
3727 : {
3728 298828 : return new pass_remove_partial_avx_dependency (ctxt);
3729 : }
3730 :
3731 : /* Return a machine mode suitable for vector SIZE with SMODE inner
3732 : mode. */
3733 :
3734 : static machine_mode /* SIZE is divided by the element size below, so it is a byte count. */
3735 63797 : ix86_get_vector_cse_mode (unsigned int size, machine_mode smode)
3736 : {
3737 : /* Use the inner scalar mode of vector broadcast source in:
3738 :
3739 : (set (reg:V8DF 394)
3740 : (vec_duplicate:V8DF (reg:V2DF 190 [ alpha ])))
3741 :
3742 : to compute the vector mode for broadcast from vector source.
3743 : */
3744 63797 : if (VECTOR_MODE_P (smode))
3745 30749 : smode = GET_MODE_INNER (smode);
3746 63797 : scalar_mode s_mode = as_a <scalar_mode> (smode);
3747 127594 : poly_uint64 nunits = size / GET_MODE_SIZE (smode); /* Element count = SIZE / element size. */
3748 63797 : machine_mode mode = mode_for_vector (s_mode, nunits).require (); /* NOTE(review): require () presumably aborts when no such vector mode exists -- confirm. */
3749 63797 : return mode;
3750 : }
3751 :
3752 : /* Replace the source operand of instructions in VECTOR_INSNS with
3753 : VECTOR_CONST in VECTOR_MODE. */
3754 :
3755 : static void /* SCALAR_MODE is used for the small-mode intermediate vector and for narrowing DEST when the replacement is a CONST_INT. */
3756 63326 : replace_vector_const (machine_mode vector_mode, rtx vector_const,
3757 : auto_bitmap &vector_insns,
3758 : machine_mode scalar_mode)
3759 : {
3760 63326 : bitmap_iterator bi;
3761 63326 : unsigned int id;
3762 :
3763 222040 : EXECUTE_IF_SET_IN_BITMAP (vector_insns, 0, id, bi)
3764 : {
3765 158714 : rtx_insn *insn = DF_INSN_UID_GET (id)->insn; /* Bits of VECTOR_INSNS are insn UIDs. */
3766 :
3767 : /* Get the single SET instruction. */
3768 158714 : rtx set = single_set (insn); /* Dereferenced below without a null check. */
3769 158714 : rtx src = SET_SRC (set);
3770 158714 : rtx dest = SET_DEST (set);
3771 158714 : machine_mode mode = GET_MODE (dest);
3772 :
3773 158714 : rtx replace;
3774 : /* Replace the source operand with VECTOR_CONST. */
3775 158714 : if (SUBREG_P (src)
3776 158714 : || mode == vector_mode
3777 60235 : || CONST_INT_P (vector_const))
3778 : replace = vector_const;
3779 : else
3780 : {
3781 60235 : unsigned int size = GET_MODE_SIZE (mode);
3782 60235 : if (size < ix86_regmode_natural_size (mode))
3783 : {
3784 : /* If the mode size is smaller than its natural size,
3785 : first insert an extra move with a QI vector SUBREG
3786 : of the same size to avoid validate_subreg failure.
      NOTE(review): the intermediate mode is built from
      SCALAR_MODE, not always QI -- confirm wording. */
3787 471 : machine_mode vmode
3788 471 : = ix86_get_vector_cse_mode (size, scalar_mode);
3789 471 : rtx vreg;
3790 471 : if (mode == vmode)
3791 : vreg = vector_const;
3792 : else
3793 : {
3794 59 : vreg = gen_reg_rtx (vmode);
3795 59 : rtx vsubreg = gen_rtx_SUBREG (vmode, vector_const, 0);
3796 59 : rtx pat = gen_rtx_SET (vreg, vsubreg);
3797 59 : rtx_insn *vinsn = emit_insn_before (pat, insn);
3798 59 : if (dump_file)
3799 : {
3800 0 : fprintf (dump_file, "\nInsert an extra move:\n\n");
3801 0 : print_rtl_single (dump_file, vinsn);
3802 0 : fprintf (dump_file, "\nbefore:\n\n");
3803 0 : print_rtl_single (dump_file, insn);
3804 0 : fprintf (dump_file, "\n");
3805 : }
3806 : }
3807 471 : replace = gen_rtx_SUBREG (mode, vreg, 0);
3808 : }
3809 : else
3810 59764 : replace = gen_rtx_SUBREG (mode, vector_const, 0);
3811 : }
3812 :
3813 158714 : if (dump_file)
3814 : {
3815 3 : fprintf (dump_file, "\nReplace:\n\n");
3816 3 : print_rtl_single (dump_file, insn);
3817 : }
3818 158714 : SET_SRC (set) = replace;
3819 158714 : if (CONST_INT_P (replace))
3820 : {
3821 23098 : dest = gen_lowpart (scalar_mode, dest); /* An integer constant is stored through the SCALAR_MODE low part of DEST. */
3822 23098 : SET_DEST (set) = dest;
3823 : }
3824 : /* Drop possible dead definitions. */
3825 158714 : PATTERN (insn) = set;
3826 158714 : INSN_CODE (insn) = -1; /* Force re-recognition of the rewritten pattern. */
3827 158714 : recog_memoized (insn);
3828 158714 : if (dump_file)
3829 : {
3830 3 : fprintf (dump_file, "\nwith:\n\n");
3831 3 : print_rtl_single (dump_file, insn);
3832 3 : fprintf (dump_file, "\n");
3833 : }
3834 158714 : df_insn_rescan (insn);
3835 : }
3836 63326 : }
3837 :
3838 : /* Return the inner scalar if OP is a broadcast, else return nullptr. */
3839 :
3840 : static rtx
3841 2196863 : ix86_broadcast_inner (rtx op, machine_mode mode,
3842 : machine_mode *scalar_mode_p,
3843 : x86_cse_kind *kind_p, rtx_insn **insn_p)
3844 : {
3845 2196863 : switch (standard_sse_constant_p (op, mode))
3846 : {
3847 114776 : case 1:
3848 114776 : *scalar_mode_p = QImode;
3849 114776 : *kind_p = X86_CSE_CONST0_VECTOR;
3850 114776 : *insn_p = nullptr;
3851 114776 : return const0_rtx;
3852 12163 : case 2:
3853 12163 : *scalar_mode_p = QImode;
3854 12163 : *kind_p = X86_CSE_CONSTM1_VECTOR;
3855 12163 : *insn_p = nullptr;
3856 12163 : return constm1_rtx;
3857 2069924 : default:
3858 2069924 : break;
3859 : }
3860 :
3861 2069924 : mode = GET_MODE (op);
3862 2069924 : int nunits = GET_MODE_NUNITS (mode);
3863 2069924 : if (nunits < 2)
3864 : return nullptr;
3865 :
3866 1595135 : bool const_vector_p = CONST_VECTOR_P (op);
3867 1595135 : bool duplicated = GET_CODE (op) == VEC_DUPLICATE;
3868 1595135 : rtx orig_op = op;
3869 1595135 : if (!const_vector_p)
3870 : {
3871 : /* Check CONST_VECTOR in REG_EQUAL note. */
3872 1595115 : rtx equal = find_reg_equal_equiv_note (*insn_p);
3873 1595115 : if (equal)
3874 : {
3875 373876 : equal = XEXP (equal, 0);
3876 373876 : const_vector_p = CONST_VECTOR_P (equal);
3877 : /* Use CONST_VECTOR in REG_EQUAL note. */
3878 373876 : if (const_vector_p)
3879 : {
3880 : /* Handle REG_EQUAL note in:
3881 :
3882 : (insn 7 5 12 2 (set (subreg:V8SI (reg:V4DI 100) 0)
3883 : (vec_duplicate:V8SI (reg:SI 102)))
3884 : (expr_list:REG_DEAD (reg:SI 102)
3885 : (expr_list:REG_EQUAL (const_vector:V4DI [
3886 : (const_int -1 [0xffffffffffffffff]) repeated x4]) (nil))))
3887 :
3888 : NB: Don't treat it as CONST_VECTOR since EQUAL isn't
3889 : supported by ISAs as in gcc.target/i386/pr40957.c. */
3890 262242 : if (GET_MODE (equal) != mode)
3891 : const_vector_p = false;
3892 : else
3893 1595135 : op = equal;
3894 : }
3895 : }
3896 : }
3897 :
3898 1595135 : machine_mode inner_mode = GET_MODE_INNER (mode);
3899 :
3900 1595135 : if (const_vector_p)
3901 : {
3902 524456 : bool int_load_p = GET_MODE_SIZE (mode) <= UNITS_PER_WORD;
3903 262228 : *kind_p = X86_CSE_CONST_VECTOR;
3904 262228 : if (int_load_p)
3905 : {
3906 : /* This CONST_VECTOR load can be converted to constant
3907 : integer load. */
3908 34371 : *scalar_mode_p = mode;
3909 34371 : *insn_p = nullptr;
3910 34371 : return op;
3911 : }
3912 :
3913 : /* This CONST_VECTOR is wider than the integer register. */
3914 227857 : rtx first = XVECEXP (op, 0, 0);
3915 :
3916 227857 : if (duplicated)
3917 : {
3918 : /* Check if CONST_VECTOR in REG_EQUAL note is duplicated in
3919 :
3920 : (insn 10 7 12 2 (set (reg:V8SI 128)
3921 : (vec_duplicate:V8SI (vec_select:V2SI (reg:V4SI 180)
3922 : (parallel [(const_int 0 [0])
3923 : (const_int 1 [0x1])]))))
3924 : (expr_list:REG_EQUAL (const_vector:V8SI [
3925 : (const_int 0 [0])
3926 : (const_int 34 [0x22])
3927 : (const_int 0 [0])
3928 : (const_int 34 [0x22])
3929 : (const_int 0 [0])
3930 : (const_int 34 [0x22])
3931 : (const_int 0 [0])
3932 : (const_int 34 [0x22])])(nil)))
3933 :
3934 : */
3935 :
3936 211672 : bool duplicated_const_vector = true;
3937 211672 : for (int i = 1; i < nunits; ++i)
3938 : {
3939 138641 : rtx tmp = XVECEXP (op, 0, i);
3940 138641 : if (!rtx_equal_p (tmp, first))
3941 : {
3942 : duplicated_const_vector = false;
3943 : break;
3944 : }
3945 : }
3946 :
3947 73047 : if (duplicated_const_vector)
3948 : {
3949 73031 : bool const_double_p = CONST_DOUBLE_P (first);
3950 : /* Force the floating point constant to memory. */
3951 73031 : if (const_double_p)
3952 5538 : first = validize_mem (force_const_mem (inner_mode, first));
3953 :
3954 73031 : if (const_double_p || CONST_INT_P (first))
3955 : {
3956 : /* Handle
3957 :
3958 : (insn 7 6 8 2 (set (reg:V4SF 99)
3959 : (vec_duplicate:V4SF (mem/u/c:SF (symbol_ref/u:DI ("*.LC2") [flags 0x2]) [0 S4 A32])))
3960 : (expr_list:REG_EQUAL (const_vector:V4SF [
3961 : (const_double:SF 3.4e+1 [0x0.88p+6]) repeated x4]) (nil)))
3962 :
3963 : and
3964 :
3965 : (insn 14 15 16 3 (set (reg:V4SI 116)
3966 : (vec_duplicate:V4SI (reg:SI 117)))
3967 : (expr_list:REG_EQUAL (const_vector:V4SI [
3968 : (const_int 34 [0x22]) repeated x4]) (nil)))
3969 :
3970 : */
3971 73031 : *kind_p = X86_CSE_VEC_DUP;
3972 73031 : *insn_p = nullptr;
3973 73031 : *scalar_mode_p = inner_mode;
3974 73031 : return first;
3975 : }
3976 : }
3977 :
3978 : op = orig_op;
3979 : }
3980 : else
3981 : {
3982 : /* Only native CONST_VECTOR is allowed. */
3983 154810 : if (orig_op != op)
3984 : return nullptr;
3985 :
3986 : /* Check if VEC_DUPLICATE can be used. */
3987 48 : for (int i = 1; i < nunits; ++i)
3988 : {
3989 48 : rtx tmp = XVECEXP (op, 0, i);
3990 : /* Vector duplicate value. */
3991 48 : if (!rtx_equal_p (tmp, first))
3992 : return nullptr;
3993 : }
3994 :
3995 : /* Use the inner mode to handle
3996 : (const_vector:V2QI [(const_int 0 [0]) repeated x2])
3997 : */
3998 0 : *scalar_mode_p = inner_mode;
3999 0 : *insn_p = nullptr;
4000 0 : return first;
4001 : }
4002 : }
4003 :
4004 1332923 : if (!duplicated)
4005 : return nullptr;
4006 :
4007 22671 : *kind_p = X86_CSE_VEC_DUP;
4008 :
4009 : /* Only
4010 :
4011 : (vec_duplicate:V4SI (reg:SI 99))
4012 : (vec_duplicate:V2DF (mem/u/c:DF (symbol_ref/u:DI ("*.LC1") [flags 0x2]) [0 S8 A64]))
4013 :
4014 : are supported. Set OP to the broadcast source by default. */
4015 22671 : op = XEXP (op, 0);
4016 22671 : rtx reg = op;
4017 22671 : if (SUBREG_P (op)
4018 403 : && SUBREG_BYTE (op) == 0
4019 23074 : && !paradoxical_subreg_p (op))
4020 403 : reg = SUBREG_REG (op);
4021 22671 : if (!REG_P (reg))
4022 : {
4023 2305 : if (MEM_P (op)
4024 2041 : && SYMBOL_REF_P (XEXP (op, 0))
4025 2554 : && CONSTANT_POOL_ADDRESS_P (XEXP (op, 0)))
4026 : {
4027 : /* Handle constant broadcast from memory. */
4028 11 : *scalar_mode_p = inner_mode;
4029 11 : *insn_p = nullptr;
4030 11 : return op;
4031 : }
4032 : return nullptr;
4033 : }
4034 :
4035 20366 : machine_mode orig_mode = mode;
4036 20366 : mode = GET_MODE (op);
4037 :
4038 : /* Only single def chain is supported. */
4039 20366 : df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
4040 20366 : if (!ref
4041 20365 : || DF_REF_IS_ARTIFICIAL (ref)
4042 20365 : || DF_REF_NEXT_REG (ref) != nullptr)
4043 : return nullptr;
4044 :
4045 14872 : rtx_insn *insn = DF_REF_INSN (ref);
4046 14872 : rtx set = single_set (insn);
4047 14872 : if (!set)
4048 : return nullptr;
4049 :
4050 14833 : rtx src = SET_SRC (set);
4051 :
4052 14833 : if (CONST_INT_P (src))
4053 : {
4054 : /* Handle sequences like
4055 :
4056 : (set (subreg:SI (reg/v:SF 105 [ f ]) 0)
4057 : (const_int 0 [0]))
4058 : (set (reg:V4SF 110)
4059 : (vec_duplicate:V4SF (reg/v:SF 105 [ f ])))
4060 :
4061 : and
4062 :
4063 : (set (reg:SI 99)
4064 : (const_int 34 [0x22]))
4065 : (set (reg:V4SI 98)
4066 : (vec_duplicate:V4SI (reg:SI 99)))
4067 :
4068 : Set *INSN_P to nullptr and return SET_SRC if SET_SRC is an
4069 : integer constant. */
4070 233 : op = src;
4071 233 : if (SCALAR_INT_MODE_P (mode) && mode != GET_MODE (reg))
4072 0 : op = gen_int_mode (INTVAL (src), mode);
4073 233 : if (op == const0_rtx)
4074 : {
4075 6 : if (standard_sse_constant_p (CONST0_RTX (orig_mode),
4076 : orig_mode) == 1)
4077 : {
4078 6 : *scalar_mode_p = QImode;
4079 6 : *kind_p = X86_CSE_CONST0_VECTOR;
4080 6 : *insn_p = nullptr;
4081 6 : return const0_rtx;
4082 : }
4083 0 : op = CONST0_RTX (mode);
4084 : }
4085 227 : else if (op == constm1_rtx
4086 227 : && standard_sse_constant_p (CONSTM1_RTX (orig_mode),
4087 : orig_mode) == 2)
4088 : {
4089 0 : *scalar_mode_p = QImode;
4090 0 : *kind_p = X86_CSE_CONSTM1_VECTOR;
4091 0 : *insn_p = nullptr;
4092 0 : return constm1_rtx;
4093 : }
4094 :
4095 : /* Check if we can convert:
4096 :
4097 : (insn 14 465 412 3 (set (reg:SI 507 [ j_lsm.26 ])
4098 : (const_int 2 [0x2])) "foo.c":10:12 discrim 2 100 {*movsi_internal} (nil))
4099 : ...
4100 : (insn 518 507 434 16 (set (reg:V2SI 493)
4101 : (vec_duplicate:V2SI (reg:SI 507 [ j_lsm.26 ]))) 2395 {*vec_dupv2si} (nil))
4102 :
4103 : to constant integer load:
4104 :
4105 : (insn 566 55 56 6 (set (subreg:DI (reg:V2SI 517) 0)
4106 : (const_int 8589934594 [0x200000002])) -1 (nil))
4107 : ...
4108 : (insn 518 507 434 16 (set (reg:V2SI 493)
4109 : (reg:V2SI 517)) 2066 {*movv2si_internal} (nil))
4110 :
4111 : */
4112 454 : if (GET_MODE_SIZE (orig_mode) <= UNITS_PER_WORD)
4113 6 : *kind_p = X86_CSE_CONST_VECTOR;
4114 :
4115 227 : *insn_p = nullptr;
4116 : }
4117 : else
4118 : {
4119 : /* Handle sequences like
4120 :
4121 : (set (reg:QI 105 [ c ])
4122 : (reg:QI 5 di [ c ]))
4123 : (set (reg:V64QI 102 [ _1 ])
4124 : (vec_duplicate:V64QI (reg:QI 105 [ c ])))
4125 :
4126 : (set (reg/v:SI 116 [ argc ])
4127 : (mem/c:SI (reg:SI 135) [2 argc+0 S4 A32]))
4128 : (set (reg:V4SI 119 [ _45 ])
4129 : (vec_duplicate:V4SI (reg/v:SI 116 [ argc ])))
4130 :
4131 : (set (reg:SI 98 [ _1 ])
4132 : (sign_extend:SI (reg:QI 106 [ c ])))
4133 : (set (reg:V16SI 103 [ _2 ])
4134 : (vec_duplicate:V16SI (reg:SI 98 [ _1 ])))
4135 :
4136 : (set (reg:SI 102 [ cost ])
4137 : (mem/c:SI (symbol_ref:DI ("cost") [flags 0x40])))
4138 : (set (reg:V4HI 103 [ _16 ])
4139 : (vec_duplicate:V4HI (subreg:HI (reg:SI 102 [ cost ]) 0)))
4140 :
4141 : (set (subreg:SI (reg/v:HI 107 [ cr_val ]) 0)
4142 : (ashift:SI (reg:SI 158)
4143 : (subreg:QI (reg:SI 156 [ _2 ]) 0)))
4144 : (set (reg:V16HI 183 [ _61 ])
4145 : (vec_duplicate:V16HI (reg/v:HI 107 [ cr_val ])))
4146 :
4147 : Set *INSN_P to INSN and return the broadcast source otherwise. */
4148 14600 : *insn_p = insn;
4149 : }
4150 :
4151 14827 : *scalar_mode_p = mode;
4152 14827 : return op;
4153 : }
4154 :
4155 : /* Replace CALL instruction in TLS_CALL_INSNS with SET from SRC and
4156 : put the updated instruction in UPDATED_TLS_INSNS. */
4157 :
4158 : static void
4159 313 : replace_tls_call (rtx src, auto_bitmap &tls_call_insns,
4160 : auto_bitmap &updated_tls_insns)
4161 : {
4162 313 : bitmap_iterator bi;
4163 313 : unsigned int id;
4164 :
4165 1739 : EXECUTE_IF_SET_IN_BITMAP (tls_call_insns, 0, id, bi)
4166 : {
4167 1426 : rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
4168 :
4169 : /* If this isn't a CALL, only GNU2 TLS implicit CALL patterns are
4170 : allowed. */
4171 1426 : if (!CALL_P (insn))
4172 : {
4173 47 : attr_tls64 tls64 = get_attr_tls64 (insn);
4174 47 : if (tls64 != TLS64_CALL && tls64 != TLS64_COMBINE)
4175 0 : gcc_unreachable ();
4176 : }
4177 :
4178 1426 : rtx pat = PATTERN (insn);
4179 1426 : gcc_assert (GET_CODE (pat) == PARALLEL);
4180 1426 : rtx set = XVECEXP (pat, 0, 0);
4181 1426 : gcc_assert (GET_CODE (set) == SET);
4182 1426 : rtx dest = SET_DEST (set);
4183 :
4184 1426 : set = gen_rtx_SET (dest, src);
4185 1426 : rtx_insn *set_insn = emit_insn_after (set, insn);
4186 1426 : if (recog_memoized (set_insn) < 0)
4187 0 : gcc_unreachable ();
4188 :
4189 : /* Put SET_INSN in UPDATED_TLS_INSNS. */
4190 1426 : bitmap_set_bit (updated_tls_insns, INSN_UID (set_insn));
4191 :
4192 1426 : if (dump_file)
4193 : {
4194 0 : fprintf (dump_file, "\nReplace:\n\n");
4195 0 : print_rtl_single (dump_file, insn);
4196 0 : fprintf (dump_file, "\nwith:\n\n");
4197 0 : print_rtl_single (dump_file, set_insn);
4198 0 : fprintf (dump_file, "\n");
4199 : }
4200 :
4201 : /* Delete the CALL insn. */
4202 1426 : delete_insn (insn);
4203 :
4204 1426 : df_insn_rescan (set_insn);
4205 : }
4206 313 : }
4207 :
4208 : /* Return the basic block which dominates all basic blocks which set
4209 : hard register REGNO used in basic block BB. */
4210 :
4211 : static basic_block
4212 2 : ix86_get_dominator_for_reg (unsigned int regno, basic_block bb)
4213 : {
4214 2 : basic_block set_bb;
4215 2 : auto_bitmap set_bbs;
4216 :
4217 : /* Get all BBs which set REGNO and dominate the current BB from all
4218 : DEFs of REGNO. */
4219 2 : for (df_ref def = DF_REG_DEF_CHAIN (regno);
4220 18 : def;
4221 16 : def = DF_REF_NEXT_REG (def))
4222 16 : if (!DF_REF_IS_ARTIFICIAL (def)
4223 16 : && !DF_REF_FLAGS_IS_SET (def, DF_REF_MAY_CLOBBER)
4224 6 : && !DF_REF_FLAGS_IS_SET (def, DF_REF_MUST_CLOBBER))
4225 : {
4226 4 : set_bb = DF_REF_BB (def);
4227 4 : if (dominated_by_p (CDI_DOMINATORS, bb, set_bb))
4228 2 : bitmap_set_bit (set_bbs, set_bb->index);
4229 : }
4230 :
4231 2 : bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
4232 2 : return bb;
4233 2 : }
4234 :
4235 : /* Mark FLAGS register as live in DATA, a bitmap of live caller-saved
4236 : registers, if DEST is FLAGS register. */
4237 :
4238 : static void
4239 381 : ix86_check_flags_reg (rtx dest, const_rtx x, void *data)
4240 : {
4241 381 : if (GET_CODE (x) == CLOBBER)
4242 : return;
4243 :
4244 374 : auto_bitmap *live_caller_saved_regs = (auto_bitmap *) data;
4245 374 : if (REG_P (dest) && REGNO (dest) == FLAGS_REG)
4246 0 : bitmap_set_bit (*live_caller_saved_regs, FLAGS_REG);
4247 : }
4248 :
4249 : /* Emit a TLS_SET instruction of KIND in basic block BB. Store the
4250 : insertion point in *BEFORE_P for emit_insn_before or in *AFTER_P
4251 : for emit_insn_after. UPDATED_GNU_TLS_INSNS contains instructions
4252 : which replace the GNU TLS instructions. UPDATED_GNU2_TLS_INSNS
4253 : contains instructions which replace the GNU2 TLS instructions. */
4254 :
4255 : static rtx_insn *
4256 313 : ix86_emit_tls_call (rtx tls_set, x86_cse_kind kind, basic_block bb,
4257 : rtx_insn **before_p, rtx_insn **after_p,
4258 : auto_bitmap &updated_gnu_tls_insns,
4259 : auto_bitmap &updated_gnu2_tls_insns)
4260 : {
4261 315 : rtx_insn *tls_insn;
4262 :
4263 315 : do
4264 : {
4265 315 : rtx_insn *insn = BB_HEAD (bb);
4266 1297 : while (insn && !NONDEBUG_INSN_P (insn))
4267 : {
4268 986 : if (insn == BB_END (bb))
4269 : {
4270 : /* This must be the beginning basic block:
4271 :
4272 : (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
4273 : (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)
4274 :
4275 : or a basic block with only a label:
4276 :
4277 : (code_label 78 11 77 3 14 (nil) [1 uses])
4278 : (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
4279 :
4280 : or a basic block with only a debug marker:
4281 :
4282 : (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
4283 : (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
4284 : (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))
4285 :
4286 : or a basic block with only deleted instructions:
4287 :
4288 : (code_label 348 23 349 45 3 (nil) [0 uses])
4289 : (note 349 348 436 45 [bb 45] NOTE_INSN_BASIC_BLOCK)
4290 : (note 436 349 362 45 NOTE_INSN_DELETED)
4291 :
4292 : */
4293 4 : gcc_assert (DEBUG_INSN_P (insn)
4294 : || (NOTE_P (insn)
4295 : && ((NOTE_KIND (insn)
4296 : == NOTE_INSN_FUNCTION_BEG)
4297 : || (NOTE_KIND (insn)
4298 : == NOTE_INSN_DELETED)
4299 : || (NOTE_KIND (insn)
4300 : == NOTE_INSN_BASIC_BLOCK))));
4301 : insn = NULL;
4302 : break;
4303 : }
4304 982 : insn = NEXT_INSN (insn);
4305 : }
4306 :
4307 : /* TLS_GD and TLS_LD_BASE instructions are normal functions which
4308 : clobber caller-saved registers. TLSDESC instructions only
4309 : clobber FLAGS. If any registers clobbered by TLS instructions
4310 : are live in this basic block, we must insert TLS instructions
4311 : after all live registers clobbered are dead. */
4312 :
4313 315 : auto_bitmap live_caller_saved_regs;
4314 630 : bitmap in = df_live ? DF_LIVE_IN (bb) : DF_LR_IN (bb);
4315 :
4316 315 : if (bitmap_bit_p (in, FLAGS_REG))
4317 4 : bitmap_set_bit (live_caller_saved_regs, FLAGS_REG);
4318 :
4319 315 : unsigned int i;
4320 :
4321 : /* Get all live caller-saved registers for TLS_GD and TLS_LD_BASE
4322 : instructions. */
4323 315 : if (kind != X86_CSE_TLSDESC)
4324 27249 : for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
4325 26956 : if (call_used_regs[i]
4326 25198 : && !fixed_regs[i]
4327 38993 : && bitmap_bit_p (in, i))
4328 344 : bitmap_set_bit (live_caller_saved_regs, i);
4329 :
4330 315 : if (bitmap_empty_p (live_caller_saved_regs))
4331 : {
4332 82 : if (insn == BB_HEAD (bb))
4333 : {
4334 0 : *before_p = insn;
4335 0 : tls_insn = emit_insn_before (tls_set, insn);
4336 : }
4337 : else
4338 : {
4339 : /* Emit the TLS call after NOTE_INSN_FUNCTION_BEG in the
4340 : beginning basic block:
4341 :
4342 : (note 4 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
4343 : (note 2 4 26 2 NOTE_INSN_FUNCTION_BEG)
4344 :
4345 : or after NOTE_INSN_BASIC_BLOCK in a basic block with
4346 : only a label:
4347 :
4348 : (code_label 78 11 77 3 14 (nil) [1 uses])
4349 : (note 77 78 54 3 [bb 3] NOTE_INSN_BASIC_BLOCK)
4350 :
4351 : or after debug marker in a basic block with only a
4352 : debug marker:
4353 :
4354 : (note 3 0 2 2 [bb 2] NOTE_INSN_BASIC_BLOCK)
4355 : (note 2 3 5 2 NOTE_INSN_FUNCTION_BEG)
4356 : (debug_insn 5 2 16 2 (debug_marker) "x.c":6:3 -1 (nil))
4357 :
4358 : */
4359 82 : insn = insn ? PREV_INSN (insn) : BB_END (bb);
4360 82 : *after_p = insn;
4361 82 : tls_insn = emit_insn_after (tls_set, insn);
4362 : }
4363 82 : return tls_insn;
4364 : }
4365 :
4366 233 : bool repeat = false;
4367 :
4368 : /* Search for REG_DEAD notes in this basic block. */
4369 661 : FOR_BB_INSNS (bb, insn)
4370 : {
4371 661 : if (!NONDEBUG_INSN_P (insn))
4372 283 : continue;
4373 :
4374 : /* NB: Conditional jump is the only instruction which reads
4375 : flags register and changes control flow. We can never
4376 : place the TLS call after unconditional jump. */
4377 378 : if (JUMP_P (insn))
4378 : {
4379 : /* This must be a conditional jump. */
4380 2 : rtx label = JUMP_LABEL (insn);
4381 2 : if (label == nullptr
4382 2 : || ANY_RETURN_P (label)
4383 2 : || !(LABEL_P (label) || SYMBOL_REF_P (label)))
4384 0 : gcc_unreachable ();
4385 :
4386 : /* Place the call before all FLAGS_REG setting BBs since
4387 : we can't place a call before nor after a conditional
4388 : jump. */
4389 2 : bb = ix86_get_dominator_for_reg (FLAGS_REG, bb);
4390 :
4391 : /* Start over again. */
4392 2 : repeat = true;
4393 2 : break;
4394 : }
4395 :
4396 376 : if (bitmap_bit_p (updated_gnu_tls_insns, INSN_UID (insn)))
4397 : {
4398 : /* Insert the __tls_get_addr call before INSN which
4399 : replaces a __tls_get_addr call. */
4400 1 : *before_p = insn;
4401 1 : tls_insn = emit_insn_before (tls_set, insn);
4402 1 : return tls_insn;
4403 : }
4404 :
4405 375 : if (bitmap_bit_p (updated_gnu2_tls_insns, INSN_UID (insn)))
4406 : {
4407 : /* Mark FLAGS register as dead since FLAGS register
4408 : would be clobbered by the GNU2 TLS instruction. */
4409 1 : bitmap_clear_bit (live_caller_saved_regs, FLAGS_REG);
4410 1 : continue;
4411 : }
4412 :
4413 : /* Check if FLAGS register is live. */
4414 374 : note_stores (insn, ix86_check_flags_reg,
4415 : &live_caller_saved_regs);
4416 :
4417 374 : rtx link;
4418 515 : for (link = REG_NOTES (insn); link; link = XEXP (link, 1))
4419 371 : if ((REG_NOTE_KIND (link) == REG_DEAD
4420 9 : || (REG_NOTE_KIND (link) == REG_UNUSED
4421 7 : && REGNO (XEXP (link, 0)) == FLAGS_REG))
4422 378 : && REG_P (XEXP (link, 0)))
4423 : {
4424 : /* Mark the live caller-saved register as dead. */
4425 743 : for (i = REGNO (XEXP (link, 0));
4426 743 : i < END_REGNO (XEXP (link, 0));
4427 : i++)
4428 374 : if (i < FIRST_PSEUDO_REGISTER)
4429 351 : bitmap_clear_bit (live_caller_saved_regs, i);
4430 :
4431 369 : if (bitmap_empty_p (live_caller_saved_regs))
4432 : {
4433 230 : *after_p = insn;
4434 230 : tls_insn = emit_insn_after (tls_set, insn);
4435 230 : return tls_insn;
4436 : }
4437 : }
4438 : }
4439 :
4440 : /* NB: Start over again for conditional jump. */
4441 2 : if (repeat)
4442 2 : continue;
4443 :
4444 0 : gcc_assert (!bitmap_empty_p (live_caller_saved_regs));
4445 :
4446 : /* If any live caller-saved registers aren't dead at the end of
4447 : this basic block, get the basic block which dominates all
4448 : basic blocks which set the remaining live registers. */
4449 0 : auto_bitmap set_bbs;
4450 0 : bitmap_iterator bi;
4451 0 : unsigned int id;
4452 0 : EXECUTE_IF_SET_IN_BITMAP (live_caller_saved_regs, 0, id, bi)
4453 : {
4454 0 : basic_block set_bb = ix86_get_dominator_for_reg (id, bb);
4455 0 : bitmap_set_bit (set_bbs, set_bb->index);
4456 : }
4457 0 : bb = nearest_common_dominator_for_set (CDI_DOMINATORS, set_bbs);
4458 2 : }
4459 : while (true);
4460 : }
4461 :
4462 : /* Generate a TLS call of KIND with VAL and copy the call result to DEST,
4463 : at entry of the nearest dominator for basic block map BBS, which is in
4464 : the fake loop that contains the whole function, so that there is only
4465 : a single TLS CALL of KIND with VAL in the whole function.
4466 : UPDATED_GNU_TLS_INSNS contains instructions which replace the GNU TLS
4467 : instructions. UPDATED_GNU2_TLS_INSNS contains instructions which
4468 : replace the GNU2 TLS instructions. If TLSDESC_SET isn't nullptr,
4469 : insert it before the TLS call. */
4470 :
4471 : static void
4472 313 : ix86_place_single_tls_call (rtx dest, rtx val, x86_cse_kind kind,
4473 : auto_bitmap &bbs,
4474 : auto_bitmap &updated_gnu_tls_insns,
4475 : auto_bitmap &updated_gnu2_tls_insns,
4476 : rtx tlsdesc_set = nullptr)
4477 : {
4478 313 : basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
4479 313 : while (bb->loop_father->latch
4480 322 : != EXIT_BLOCK_PTR_FOR_FN (cfun))
4481 9 : bb = get_immediate_dominator (CDI_DOMINATORS,
4482 : bb->loop_father->header);
4483 :
4484 313 : rtx rax = nullptr, rdi;
4485 313 : rtx eqv = nullptr;
4486 313 : rtx caddr;
4487 313 : rtx set;
4488 313 : rtx clob;
4489 313 : rtx symbol;
4490 313 : rtx tls;
4491 :
4492 313 : switch (kind)
4493 : {
4494 262 : case X86_CSE_TLS_GD:
4495 262 : rax = gen_rtx_REG (Pmode, AX_REG);
4496 262 : rdi = gen_rtx_REG (Pmode, DI_REG);
4497 262 : caddr = ix86_tls_get_addr ();
4498 :
4499 262 : symbol = XVECEXP (val, 0, 0);
4500 262 : tls = gen_tls_global_dynamic_64 (Pmode, rax, symbol, caddr, rdi);
4501 :
4502 262 : if (GET_MODE (symbol) != Pmode)
4503 0 : symbol = gen_rtx_ZERO_EXTEND (Pmode, symbol);
4504 : eqv = symbol;
4505 : break;
4506 :
4507 30 : case X86_CSE_TLS_LD_BASE:
4508 30 : rax = gen_rtx_REG (Pmode, AX_REG);
4509 30 : rdi = gen_rtx_REG (Pmode, DI_REG);
4510 30 : caddr = ix86_tls_get_addr ();
4511 :
4512 30 : tls = gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi);
4513 :
4514 : /* Attach a unique REG_EQUAL to DEST, to allow the RTL optimizers
4515 : to share the LD_BASE result with other LD model accesses. */
4516 30 : eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
4517 : UNSPEC_TLS_LD_BASE);
4518 :
4519 30 : break;
4520 :
4521 21 : case X86_CSE_TLSDESC:
4522 21 : set = gen_rtx_SET (dest, val);
4523 21 : clob = gen_rtx_CLOBBER (VOIDmode,
4524 : gen_rtx_REG (CCmode, FLAGS_REG));
4525 21 : tls = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clob));
4526 21 : break;
4527 :
4528 0 : default:
4529 0 : gcc_unreachable ();
4530 : }
4531 :
4532 : /* Emit the TLS CALL insn. */
4533 313 : rtx_insn *before = nullptr;
4534 313 : rtx_insn *after = nullptr;
4535 313 : rtx_insn *tls_insn = ix86_emit_tls_call (tls, kind, bb, &before,
4536 : &after,
4537 : updated_gnu_tls_insns,
4538 : updated_gnu2_tls_insns);
4539 :
4540 313 : rtx_insn *tlsdesc_insn = nullptr;
4541 313 : if (tlsdesc_set)
4542 : {
4543 16 : rtx dest = copy_rtx (SET_DEST (tlsdesc_set));
4544 16 : rtx src = copy_rtx (SET_SRC (tlsdesc_set));
4545 16 : tlsdesc_set = gen_rtx_SET (dest, src);
4546 16 : tlsdesc_insn = emit_insn_before (tlsdesc_set, tls_insn);
4547 : }
4548 :
4549 313 : if (kind != X86_CSE_TLSDESC)
4550 : {
4551 292 : RTL_CONST_CALL_P (tls_insn) = 1;
4552 :
4553 : /* Indicate that this function can't jump to non-local gotos. */
4554 292 : make_reg_eh_region_note_nothrow_nononlocal (tls_insn);
4555 : }
4556 :
4557 313 : if (recog_memoized (tls_insn) < 0)
4558 0 : gcc_unreachable ();
4559 :
4560 313 : if (dump_file)
4561 : {
4562 0 : if (after)
4563 : {
4564 0 : fprintf (dump_file, "\nPlace:\n\n");
4565 0 : if (tlsdesc_insn)
4566 0 : print_rtl_single (dump_file, tlsdesc_insn);
4567 0 : print_rtl_single (dump_file, tls_insn);
4568 0 : fprintf (dump_file, "\nafter:\n\n");
4569 0 : print_rtl_single (dump_file, after);
4570 0 : fprintf (dump_file, "\n");
4571 : }
4572 : else
4573 : {
4574 0 : fprintf (dump_file, "\nPlace:\n\n");
4575 0 : if (tlsdesc_insn)
4576 0 : print_rtl_single (dump_file, tlsdesc_insn);
4577 0 : print_rtl_single (dump_file, tls_insn);
4578 0 : fprintf (dump_file, "\nbefore:\n\n");
4579 0 : print_rtl_single (dump_file, before);
4580 0 : fprintf (dump_file, "\n");
4581 : }
4582 : }
4583 :
4584 313 : if (kind != X86_CSE_TLSDESC)
4585 : {
4586 : /* Copy RAX to DEST. */
4587 292 : set = gen_rtx_SET (dest, rax);
4588 292 : rtx_insn *set_insn = emit_insn_after (set, tls_insn);
4589 292 : set_dst_reg_note (set_insn, REG_EQUAL, copy_rtx (eqv), dest);
4590 292 : if (dump_file)
4591 : {
4592 0 : fprintf (dump_file, "\nPlace:\n\n");
4593 0 : print_rtl_single (dump_file, set_insn);
4594 0 : fprintf (dump_file, "\nafter:\n\n");
4595 0 : print_rtl_single (dump_file, tls_insn);
4596 0 : fprintf (dump_file, "\n");
4597 : }
4598 : }
4599 313 : }
4600 :
4601 : namespace {
4602 :
4603 : const pass_data pass_data_x86_cse =
4604 : {
4605 : RTL_PASS, /* type */
4606 : "x86_cse", /* name */
4607 : OPTGROUP_NONE, /* optinfo_flags */
4608 : TV_MACH_DEP, /* tv_id */
4609 : 0, /* properties_required */
4610 : 0, /* properties_provided */
4611 : 0, /* properties_destroyed */
4612 : 0, /* todo_flags_start */
4613 : 0, /* todo_flags_finish */
4614 : };
4615 :
4616 : class pass_x86_cse : public rtl_opt_pass
4617 : {
4618 : public:
4619 298828 : pass_x86_cse (gcc::context *ctxt)
4620 597656 : : rtl_opt_pass (pass_data_x86_cse, ctxt)
4621 : {}
4622 :
4623 : /* opt_pass methods: */
4624 1488378 : bool gate (function *fun) final override
4625 : {
4626 1488378 : return optimize && optimize_function_for_speed_p (fun);
4627 : }
4628 :
4629 976653 : unsigned int execute (function *) final override
4630 : {
4631 976653 : return x86_cse ();
4632 : }
4633 :
4634 : private:
4635 : /* The redundant source value. */
4636 : rtx val;
4637 : /* The actual redundant source value for UNSPEC_TLSDESC. */
4638 : rtx tlsdesc_val;
4639 : /* The instruction which defines the redundant value. */
4640 : rtx_insn *def_insn;
4641 : /* Mode of the destination of the candidate redundant instruction. */
4642 : machine_mode mode;
4643 : /* Mode of the source of the candidate redundant instruction. */
4644 : machine_mode scalar_mode;
4645 : /* The classification of the candidate redundant instruction. */
4646 : x86_cse_kind kind;
4647 :
4648 : unsigned int x86_cse (void);
4649 : bool candidate_gnu_tls_p (rtx_insn *, attr_tls64);
4650 : bool candidate_gnu2_tls_p (rtx, attr_tls64);
4651 : bool candidate_vector_p (rtx, rtx_insn *);
4652 : rtx_insn *tls_set_insn_from_symbol (const_rtx, const_rtx);
4653 : }; // class pass_x86_cse
4654 :
4655 : /* Return the instruction which sets REG from TLS_SYMBOL. */
4656 :
4657 : rtx_insn *
4658 42 : pass_x86_cse::tls_set_insn_from_symbol (const_rtx reg,
4659 : const_rtx tls_symbol)
4660 : {
4661 42 : rtx_insn *set_insn = nullptr;
4662 42 : for (df_ref ref = DF_REG_DEF_CHAIN (REGNO (reg));
4663 111 : ref;
4664 69 : ref = DF_REF_NEXT_REG (ref))
4665 : {
4666 69 : if (DF_REF_IS_ARTIFICIAL (ref))
4667 : return nullptr;
4668 :
4669 69 : set_insn = DF_REF_INSN (ref);
4670 69 : if (get_attr_tls64 (set_insn) != TLS64_LEA)
4671 : return nullptr;
4672 :
4673 69 : rtx tls_set = PATTERN (set_insn);
4674 69 : rtx tls_src = XVECEXP (SET_SRC (tls_set), 0, 0);
4675 69 : if (!rtx_equal_p (tls_symbol, tls_src))
4676 : return nullptr;
4677 : }
4678 :
4679 : return set_insn;
4680 : }
4681 :
4682 : /* Return true and output def_insn, val, mode, scalar_mode and kind if
4683 : INSN is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE. */
4684 :
4685 : bool
4686 2190 : pass_x86_cse::candidate_gnu_tls_p (rtx_insn *insn, attr_tls64 tls64)
4687 : {
4688 2190 : if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
4689 : return false;
4690 :
4691 : /* Record the redundant TLS CALLs for 64-bit:
4692 :
4693 : (parallel [
4694 : (set (reg:DI 0 ax)
4695 : (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
4696 : (const_int 0 [0])))
4697 : (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
4698 : (reg/f:DI 7 sp)] UNSPEC_TLS_GD)
4699 : (clobber (reg:DI 5 di))])
4700 :
4701 :
4702 : and
4703 :
4704 : (parallel [
4705 : (set (reg:DI 0 ax)
4706 : (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
4707 : (const_int 0 [0])))
4708 : (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)])
4709 :
4710 : */
4711 :
4712 2022 : rtx pat = PATTERN (insn);
4713 2022 : rtx set = XVECEXP (pat, 0, 0);
4714 2022 : gcc_assert (GET_CODE (set) == SET);
4715 2022 : rtx dest = SET_DEST (set);
4716 2022 : scalar_mode = mode = GET_MODE (dest);
4717 2022 : val = XVECEXP (pat, 0, 1);
4718 2022 : gcc_assert (GET_CODE (val) == UNSPEC);
4719 :
4720 2022 : if (tls64 == TLS64_GD)
4721 1921 : kind = X86_CSE_TLS_GD;
4722 : else
4723 101 : kind = X86_CSE_TLS_LD_BASE;
4724 :
4725 2022 : def_insn = nullptr;
4726 2022 : return true;
4727 : }
4728 :
4729 : /* Return true and output def_insn, val, mode, scalar_mode and kind if
4730 : SET is UNSPEC_TLSDESC. */
4731 :
4732 : bool
4733 56 : pass_x86_cse::candidate_gnu2_tls_p (rtx set, attr_tls64 tls64)
4734 : {
4735 56 : if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
4736 : return false;
4737 :
4738 54 : rtx tls_symbol;
4739 54 : rtx_insn *set_insn;
4740 54 : rtx src = SET_SRC (set);
4741 54 : val = src;
4742 54 : tlsdesc_val = src;
4743 54 : kind = X86_CSE_TLSDESC;
4744 :
4745 54 : if (tls64 == TLS64_COMBINE)
4746 : {
4747 : /* Record 64-bit TLS64_COMBINE:
4748 :
4749 : (set (reg/f:DI 104)
4750 : (plus:DI (unspec:DI [
4751 : (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
4752 : (reg:DI 114)
4753 : (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
4754 : (const:DI (unspec:DI [
4755 : (symbol_ref:DI ("e") [flags 0x1a])
4756 : ] UNSPEC_DTPOFF))))
4757 :
4758 : (set (reg/f:DI 104)
4759 : (plus:DI (unspec:DI [
4760 : (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
4761 : (unspec:DI [
4762 : (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
4763 : ] UNSPEC_TLSDESC)
4764 : (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
4765 : (const:DI (unspec:DI [
4766 : (symbol_ref:DI ("e") [flags 0x1a])
4767 : ] UNSPEC_DTPOFF))))
4768 : */
4769 :
4770 12 : scalar_mode = mode = GET_MODE (src);
4771 :
4772 : /* Since the first operand of PLUS in the source TLS_COMBINE
4773 : pattern is unused, use the second operand of PLUS:
4774 :
4775 : (const:DI (unspec:DI [
4776 : (symbol_ref:DI ("e") [flags 0x1a])
4777 : ] UNSPEC_DTPOFF))
4778 :
4779 : as VAL to check if 2 TLS_COMBINE patterns have the same
4780 : source. */
4781 12 : val = XEXP (src, 1);
4782 12 : gcc_assert (GET_CODE (val) == CONST
4783 : && GET_CODE (XEXP (val, 0)) == UNSPEC
4784 : && XINT (XEXP (val, 0), 1) == UNSPEC_DTPOFF
4785 : && SYMBOL_REF_P (XVECEXP (XEXP (val, 0), 0, 0)));
4786 12 : def_insn = nullptr;
4787 12 : return true;
4788 : }
4789 :
4790 : /* Record 64-bit TLS_CALL:
4791 :
4792 : (set (reg:DI 101)
4793 : (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
4794 : (reg:DI 112)
4795 : (reg/f:DI 7 sp)] UNSPEC_TLSDESC))
4796 :
4797 : */
4798 :
4799 42 : gcc_assert (GET_CODE (src) == UNSPEC);
4800 42 : tls_symbol = XVECEXP (src, 0, 0);
4801 42 : src = XVECEXP (src, 0, 1);
4802 42 : scalar_mode = mode = GET_MODE (src);
4803 42 : gcc_assert (REG_P (src));
4804 :
4805 : /* All definitions of reg:DI 129 in
4806 :
4807 : (set (reg:DI 110)
4808 : (unspec:DI [(symbol_ref:DI ("foo"))
4809 : (reg:DI 129)
4810 : (reg/f:DI 7 sp)] UNSPEC_TLSDESC))
4811 :
4812 : should have the same source as in
4813 :
4814 : (set (reg:DI 129)
4815 : (unspec:DI [(symbol_ref:DI ("foo"))] UNSPEC_TLSDESC))
4816 :
4817 : */
4818 :
4819 42 : set_insn = tls_set_insn_from_symbol (src, tls_symbol);
4820 42 : if (!set_insn)
4821 : return false;
4822 :
4823 : /* Use TLS_SYMBOL as VAL to check if 2 patterns have the same source. */
4824 42 : val = tls_symbol;
4825 42 : def_insn = set_insn;
4826 42 : return true;
4827 : }
4828 :
4829 : /* Return true and output def_insn, val, mode, scalar_mode and kind if
4830 : INSN is a vector broadcast instruction. */
4831 :
4832 : bool
4833 49619974 : pass_x86_cse::candidate_vector_p (rtx set, rtx_insn *insn)
4834 : {
4835 49619974 : rtx src = SET_SRC (set);
4836 49619974 : rtx dest = SET_DEST (set);
4837 49619974 : mode = GET_MODE (dest);
4838 : /* Skip non-vector instruction. */
4839 49619974 : if (!VECTOR_MODE_P (mode))
4840 : return false;
4841 :
4842 : /* Skip non-vector load instruction. */
4843 3697385 : if (!REG_P (dest) && !SUBREG_P (dest))
4844 : return false;
4845 :
4846 2196863 : def_insn = insn;
4847 2196863 : val = ix86_broadcast_inner (src, mode, &scalar_mode, &kind,
4848 : &def_insn);
4849 2196863 : return val ? true : false;
4850 : }
4851 :
4852 : /* At entry of the nearest common dominator for basic blocks with
4853 :
4854 : 1. Vector CONST0_RTX patterns.
4855 : 2. Vector CONSTM1_RTX patterns.
4856 : 3. Vector broadcast patterns.
4857 : 4. UNSPEC_TLS_GD patterns.
4858 : 5. UNSPEC_TLS_LD_BASE patterns.
4859 : 6. UNSPEC_TLSDESC patterns.
4860 :
4861 : generate a single pattern whose destination is used to replace the
4862 : source in all identical patterns.
4863 :
4864 : NB: We want to generate a pattern, which is executed only once, to
4865 : cover the whole function. The LCM algorithm isn't appropriate here
4866 : since it may place a pattern inside the loop. */
4867 :
4868 : unsigned int
4869 976653 : pass_x86_cse::x86_cse (void)
4870 : {
/* Three phases, all visible below:
   1. Walk every non-debug insn and collect candidates into LOADS,
      merging an insn into an existing entry when kind, scalar mode and
      value match.
   2. For each entry whose count reached its threshold, choose the
      replacement register and source, and rewrite the recorded insns
      through replace_tls_call or replace_vector_const.
   3. If anything was replaced, emit the single defining instruction
      for each such entry and create EH edges where needed.
   Always returns 0.  */
4871 976653 : timevar_push (TV_MACH_DEP);
4872 :
4873 976653 : auto_vec<redundant_pattern *> loads;
4874 976653 : redundant_pattern *load;
4875 976653 : basic_block bb;
4876 976653 : rtx_insn *insn;
4877 976653 : unsigned int i;
4878 976653 : auto_bitmap updated_gnu_tls_insns;
4879 976653 : auto_bitmap updated_gnu2_tls_insns;
/* Indices of the basic blocks that contain a non-TLS call; filled in
   by the scan loop and consulted when deciding whether to keep a
   constant integer load.  */
4880 976653 : auto_bitmap call_bbs;
4881 :
/* Defer insn rescans.  They are processed by
   df_process_deferred_rescans below when something was replaced, and
   the flag is cleared again before returning.  */
4882 976653 : df_set_flags (DF_DEFER_INSN_RESCAN);
4883 :
4884 976653 : bool recursive_call_p = cfun->machine->recursive_function;
4885 :
4886 10831686 : FOR_EACH_BB_FN (bb, cfun)
4887 : {
4888 129855939 : FOR_BB_INSNS (bb, insn)
4889 : {
4890 120000906 : if (!NONDEBUG_INSN_P (insn))
4891 66727867 : continue;
4892 :
4893 53273039 : bool matched = false;
4894 : /* Remove redundant patterns if there are at least 2 of
4895 : them (entries are processed when count >= threshold). */
4896 53273039 : unsigned int threshold = 2;
4897 :
4898 53273039 : bool call_p = CALL_P (insn);
4899 53273039 : rtx set = single_set (insn);
4900 53273039 : if (!set && !call_p)
4901 1104511 : continue;
4902 :
/* NOTE(review): tlsdesc_val, and the names kind, val, mode,
   scalar_mode and def_insn used further down, are not declared in
   this function.  Presumably they are pass_x86_cse members that the
   candidate_*_p helpers fill in - confirm against the class
   definition.  */
4903 52168528 : tlsdesc_val = nullptr;
4904 :
4905 52168528 : attr_tls64 tls64 = get_attr_tls64 (insn);
4906 :
4907 : /* NB: TLS calls preserve all registers. */
4908 52168528 : if (call_p && tls64 == TLS64_NONE)
4909 4399353 : bitmap_set_bit (call_bbs, BLOCK_FOR_INSN (insn)->index);
4910 :
/* In this switch, break means the insn is a candidate and is recorded
   below; continue skips the insn entirely.  */
4911 52168528 : switch (tls64)
4912 : {
4913 2190 : case TLS64_GD:
4914 2190 : case TLS64_LD_BASE:
4915 : /* Verify UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE. */
4916 2190 : if (candidate_gnu_tls_p (insn, tls64))
4917 : break;
4918 168 : continue;
4919 :
4920 56 : case TLS64_CALL:
4921 56 : case TLS64_COMBINE:
4922 : /* Verify UNSPEC_TLSDESC. */
4923 56 : if (candidate_gnu2_tls_p (set, tls64))
4924 : break;
4925 2 : continue;
4926 :
4927 38 : case TLS64_LEA:
4928 : /* Skip TLS64_LEA. */
4929 38 : continue;
4930 :
4931 52166244 : case TLS64_NONE:
4932 52166244 : if (!set)
4933 2546270 : continue;
4934 :
4935 : /* Check for vector broadcast. */
4936 49619974 : if (candidate_vector_p (set, insn))
4937 : break;
4938 49370789 : continue;
4939 : }
4940 :
4941 : /* Check if there is a matching redundant load. */
4942 595578 : FOR_EACH_VEC_ELT (loads, i, load)
4943 440818 : if (load->val
4944 440818 : && load->kind == kind
4945 294295 : && load->mode == scalar_mode
4946 258645 : && (load->bb == bb
4947 197213 : || (kind != X86_CSE_VEC_DUP
4948 197213 : && kind != X86_CSE_CONST_VECTOR)
4949 : /* Non all 0s/1s vector load must be in the same
4950 : basic block if it is in a recursive call. */
4951 137425 : || !recursive_call_p)
4952 697336 : && rtx_equal_p (load->val, val))
4953 : {
4954 : /* Record instruction. */
4955 96501 : bitmap_set_bit (load->insns, INSN_UID (insn));
4956 :
4957 : /* Record the maximum vector size. */
/* NOTE(review): the <= test relies on the declaration order of the
   X86_CSE_* enumerators, which is not visible here.  */
4958 96501 : if (kind <= X86_CSE_VEC_DUP
4959 191889 : && load->size < GET_MODE_SIZE (mode))
4960 1014 : load->size = GET_MODE_SIZE (mode);
4961 :
4962 : /* Record the basic block. */
4963 96501 : bitmap_set_bit (load->bbs, bb->index);
4964 :
4965 : /* Increment the count. */
4966 96501 : load->count++;
4967 :
4968 96501 : matched = true;
4969 96501 : break;
4970 : }
4971 :
4972 251261 : if (matched)
4973 96501 : continue;
4974 :
4975 : /* We see this instruction the first time. Record the
4976 : redundant source value, its mode, the destination size,
4977 : instruction which defines the redundant source value,
4978 : instruction basic block and the instruction kind. */
/* The entry is heap-allocated here and released by the delete loop at
   the end of this function.  */
4979 154760 : load = new redundant_pattern;
4980 :
4981 : /* Convert CONST_VECTOR load no larger than integer register
4982 : to constant integer load even if there is no redundant
4983 : CONST_VECTOR load. */
4984 154760 : if (CONST_VECTOR_P (val))
4985 30748 : threshold = 1;
4986 :
4987 154760 : load->val = copy_rtx (val);
4988 154760 : if (tlsdesc_val)
4989 28 : load->tlsdesc_val = copy_rtx (tlsdesc_val);
4990 : else
4991 154732 : load->tlsdesc_val = nullptr;
4992 154760 : load->mode = scalar_mode;
4993 154760 : load->dest_mode = mode;
4994 154760 : load->size = GET_MODE_SIZE (mode);
4995 154760 : load->def_insn = def_insn;
4996 154760 : load->count = 1;
4997 154760 : load->threshold = threshold;
4998 154760 : load->bb = BLOCK_FOR_INSN (insn);
4999 154760 : load->kind = kind;
5000 :
5001 154760 : bitmap_set_bit (load->insns, INSN_UID (insn));
5002 154760 : bitmap_set_bit (load->bbs, bb->index);
5003 :
5004 154760 : loads.safe_push (load);
5005 : }
5006 : }
5007 :
/* Phase 2: rewrite the users of every entry that reached its
   threshold and remember the register and source to define later.  */
5008 : bool replaced = false;
5009 1131413 : FOR_EACH_VEC_ELT (loads, i, load)
5010 154760 : if (load->count >= load->threshold)
5011 : {
5012 63639 : machine_mode mode;
5013 63639 : rtx reg, broadcast_reg;
5014 63639 : rtx broadcast_source = nullptr;
5015 63639 : replaced = true;
5016 63639 : switch (load->kind)
5017 : {
5018 313 : case X86_CSE_TLS_GD:
5019 313 : case X86_CSE_TLS_LD_BASE:
5020 313 : case X86_CSE_TLSDESC:
5021 313 : broadcast_reg = gen_reg_rtx (load->mode);
5022 313 : replace_tls_call (broadcast_reg, load->insns,
5023 313 : (load->kind == X86_CSE_TLSDESC
5024 : ? updated_gnu2_tls_insns
5025 : : updated_gnu_tls_insns));
5026 313 : load->broadcast_reg = broadcast_reg;
5027 313 : break;
5028 :
5029 11171 : case X86_CSE_VEC_DUP:
5030 11171 : if (CONST_INT_P (load->val)
5031 10048 : && (load->val == CONST0_RTX (load->mode)
5032 10072 : || load->size <= UNITS_PER_WORD))
5033 : {
/* NB: the case label below sits inside the body of the if above.
   X86_CSE_CONST_VECTOR entries jump straight into this block, while
   X86_CSE_VEC_DUP entries reach it only when the if condition holds;
   otherwise they continue at the FALLTHRU code after the block.  */
5034 : /* Generate CONST_VECTOR load. */
5035 30749 : case X86_CSE_CONST_VECTOR:
5036 30749 : mode = ix86_get_vector_cse_mode (load->size,
5037 : load->mode);
5038 :
5039 30749 : if (CONST_VECTOR_P (load->val))
5040 : broadcast_source = load->val;
5041 1 : else if (load->val == CONST0_RTX (load->mode))
5042 0 : broadcast_source = CONST0_RTX (mode);
5043 1 : else if (load->val == CONSTM1_RTX (load->mode))
5044 0 : broadcast_source = CONSTM1_RTX (mode);
5045 : else
5046 : {
/* Build a CONST_VECTOR of MODE with every element equal to
   load->val.  */
5047 1 : int nunits = GET_MODE_NUNITS (mode);
5048 1 : rtvec v = rtvec_alloc (nunits);
5049 3 : for (int j = 0; j < nunits ; j++)
5050 2 : RTVEC_ELT (v, j) = load->val;
5051 1 : broadcast_source = gen_rtx_CONST_VECTOR (mode, v);
5052 : }
5053 :
5054 : /* NB: Zero CONST_VECTOR load works for MMX and XMM
5055 : registers. */
5056 32160 : if (load->size <= UNITS_PER_WORD)
5057 : {
5058 : /* Convert CONST_VECTOR load no larger than integer
5059 : register:
5060 :
5061 : (set (reg:V2SI 106)
5062 : (const_vector:V2SI [(const_int 1 [1]) repeated x2]))
5063 :
5064 : to constant integer load:
5065 :
5066 : (set (subreg:DI (reg:V2SI 106 [ _20 ]) 0)
5067 : (const_int 4294967297 [0x100000001]))
5068 : */
5069 30749 : machine_mode int_mode
5070 30749 : = int_mode_for_mode (mode).require ();
5071 30749 : load->dest_mode = int_mode;
5072 30749 : broadcast_source = simplify_subreg (int_mode,
5073 : broadcast_source,
5074 : mode, 0);
5075 30749 : gcc_assert (broadcast_source != nullptr);
5076 :
/* Keep the per-use constant integer load when any block using this
   value also contains a non-TLS call.  */
5077 30749 : bool keep_const_int_load = false;
5078 30749 : if (!bitmap_empty_p (call_bbs))
5079 : {
5080 27498 : bitmap_iterator bi;
5081 27498 : unsigned int id;
5082 36029 : EXECUTE_IF_SET_IN_BITMAP (load->bbs, 0, id, bi)
5083 28636 : if (bitmap_bit_p (call_bbs, id))
5084 : {
5085 : /* NB: Constant integer load is faster
5086 : than saving and restoring an integer
5087 : register when crossing a function call.
5088 : */
5089 : keep_const_int_load = true;
5090 : break;
5091 : }
5092 : }
5093 :
5094 27498 : if (keep_const_int_load)
5095 : {
5096 : /* Keep constant integer load. */
5097 20105 : replace_vector_const (mode, broadcast_source,
5098 20105 : load->insns, int_mode);
/* A null broadcast_reg tells phase 3 below not to place a defining
   instruction for this entry.  */
5099 20105 : load->broadcast_source = nullptr;
5100 20105 : load->broadcast_reg = nullptr;
5101 : }
5102 : else
5103 : {
5104 10644 : broadcast_reg = gen_reg_rtx (mode);
/* NOTE(review): the pseudo assigned to REG on the next line is not
   read again on this path (the branch ends in break).  It looks like
   a leftover - confirm before removing.  */
5105 10644 : reg = gen_reg_rtx (load->mode);
5106 10644 : replace_vector_const (mode, broadcast_reg,
5107 10644 : load->insns, load->mode);
5108 10644 : load->broadcast_source = broadcast_source;
5109 10644 : load->broadcast_reg = broadcast_reg;
5110 : }
5111 : break;
5112 : }
5113 : }
5114 : /* FALLTHRU */
5115 :
5116 32577 : case X86_CSE_CONST0_VECTOR:
5117 32577 : case X86_CSE_CONSTM1_VECTOR:
5118 32577 : mode = ix86_get_vector_cse_mode (load->size, load->mode);
5119 32577 : broadcast_reg = gen_reg_rtx (mode);
5120 32577 : if (load->def_insn)
5121 : {
5122 : /* Replace redundant vector loads with a single vector
5123 : load in the same basic block. */
5124 841 : reg = load->val;
5125 841 : if (load->mode != GET_MODE (reg))
5126 0 : reg = gen_rtx_SUBREG (load->mode, reg, 0);
5127 841 : broadcast_source = gen_rtx_VEC_DUPLICATE (mode, reg);
5128 : }
5129 : else
5130 : /* This is a constant integer/double vector. If the
5131 : inner scalar is 0 or -1, set vector to CONST0_RTX
5132 : or CONSTM1_RTX directly. */
5133 31736 : switch (load->kind)
5134 : {
5135 19828 : case X86_CSE_CONST0_VECTOR:
5136 19828 : broadcast_source = CONST0_RTX (mode);
5137 19828 : break;
5138 1578 : case X86_CSE_CONSTM1_VECTOR:
5139 1578 : broadcast_source = CONSTM1_RTX (mode);
5140 1578 : break;
5141 10330 : case X86_CSE_CONST_VECTOR:
5142 10330 : case X86_CSE_VEC_DUP:
/* broadcast_source is already set here when the CONST_VECTOR code
   above ran but the value was larger than UNITS_PER_WORD.  */
5143 10330 : if (!broadcast_source)
5144 : {
5145 10330 : reg = gen_reg_rtx (load->mode);
5146 10330 : broadcast_source = gen_rtx_VEC_DUPLICATE (mode,
5147 : reg);
5148 : }
5149 : break;
5150 0 : default:
5151 0 : gcc_unreachable ();
5152 : }
5153 32577 : replace_vector_const (mode, broadcast_reg, load->insns,
5154 : load->mode);
5155 32577 : load->broadcast_source = broadcast_source;
5156 32577 : load->broadcast_reg = broadcast_reg;
5157 32577 : break;
5158 : }
5159 : }
5160 :
/* Phase 3: emit the single defining instruction for each replaced
   entry.  */
5161 976653 : if (replaced)
5162 : {
5163 41343 : auto_vec<rtx_insn *> control_flow_insns;
5164 :
5165 : /* (Re-)discover loops so that bb->loop_father can be used in the
5166 : analysis below. */
5167 41343 : calculate_dominance_info (CDI_DOMINATORS);
5168 41343 : loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
5169 :
5170 125650 : FOR_EACH_VEC_ELT (loads, i, load)
5171 84307 : if (load->count >= load->threshold)
5172 : {
5173 63639 : rtx set;
/* Entries with a defining insn are handled relative to that insn;
   only X86_CSE_TLSDESC and X86_CSE_VEC_DUP are expected here.  */
5174 63639 : if (load->def_insn)
5175 857 : switch (load->kind)
5176 : {
5177 16 : case X86_CSE_TLSDESC:
5178 16 : ix86_place_single_tls_call (load->broadcast_reg,
5179 : load->tlsdesc_val,
5180 : load->kind,
5181 16 : load->bbs,
5182 : updated_gnu_tls_insns,
5183 : updated_gnu2_tls_insns,
5184 16 : PATTERN (load->def_insn));
5185 16 : break;
5186 841 : case X86_CSE_VEC_DUP:
5187 : /* Insert a broadcast after the original scalar
5188 : definition. */
5189 841 : set = gen_rtx_SET (load->broadcast_reg,
5190 : load->broadcast_source);
5191 841 : insn = emit_insn_after (set, load->def_insn);
5192 :
5193 841 : if (cfun->can_throw_non_call_exceptions)
5194 : {
5195 : /* Handle REG_EH_REGION note in DEF_INSN. */
5196 4 : rtx note = find_reg_note (load->def_insn,
5197 : REG_EH_REGION, nullptr);
5198 4 : if (note)
5199 : {
/* Remember DEF_INSN so its block can be split after the loop, and
   copy the EH region note to the new insn.  */
5200 1 : control_flow_insns.safe_push (load->def_insn);
5201 1 : add_reg_note (insn, REG_EH_REGION,
5202 : XEXP (note, 0));
5203 : }
5204 : }
5205 :
5206 841 : if (dump_file)
5207 : {
5208 0 : fprintf (dump_file, "\nAdd:\n\n");
5209 0 : print_rtl_single (dump_file, insn);
5210 0 : fprintf (dump_file, "\nafter:\n\n");
5211 0 : print_rtl_single (dump_file, load->def_insn);
5212 0 : fprintf (dump_file, "\n");
5213 : }
5214 : break;
5215 0 : default:
5216 0 : gcc_unreachable ();
5217 : }
5218 : else
5219 62782 : switch (load->kind)
5220 : {
5221 297 : case X86_CSE_TLS_GD:
5222 297 : case X86_CSE_TLS_LD_BASE:
5223 297 : case X86_CSE_TLSDESC:
5224 297 : ix86_place_single_tls_call (load->broadcast_reg,
5225 : (load->kind == X86_CSE_TLSDESC
5226 : ? load->tlsdesc_val
5227 : : load->val),
5228 : load->kind,
5229 297 : load->bbs,
5230 : updated_gnu_tls_insns,
5231 : updated_gnu2_tls_insns);
5232 297 : break;
5233 41079 : case X86_CSE_CONST_VECTOR:
5234 41079 : case X86_CSE_VEC_DUP:
5235 : /* Keep redundant constant integer load. */
5236 41079 : if (!load->broadcast_reg)
5237 : break;
5238 : /* FALLTHRU */
5239 42380 : case X86_CSE_CONST0_VECTOR:
5240 42380 : case X86_CSE_CONSTM1_VECTOR:
5241 42380 : ix86_place_single_vector_set (load->broadcast_reg,
5242 : load->broadcast_source,
5243 : load->bbs,
5244 : load);
5245 42380 : break;
5246 : }
5247 : }
5248 :
5249 41343 : loop_optimizer_finalize ();
5250 :
5251 41343 : if (!control_flow_insns.is_empty ())
5252 : {
5253 1 : free_dominance_info (CDI_DOMINATORS);
5254 :
5255 3 : FOR_EACH_VEC_ELT (control_flow_insns, i, insn)
5256 1 : if (control_flow_insn_p (insn))
5257 : {
5258 : /* Split the block after insn. There will be a fallthru
5259 : edge, which is OK so we keep it. We have to create
5260 : the exception edges ourselves. */
5261 1 : bb = BLOCK_FOR_INSN (insn);
5262 1 : split_block (bb, insn);
5263 1 : rtl_make_eh_edge (NULL, bb, BB_END (bb));
5264 : }
5265 : }
5266 :
5267 41343 : df_process_deferred_rescans ();
5268 41343 : }
5269 :
/* Release every entry allocated with new in the scan loop.  */
5270 1131413 : FOR_EACH_VEC_ELT (loads, i, load)
5271 309520 : delete load;
5272 :
5273 976653 : df_clear_flags (DF_DEFER_INSN_RESCAN);
5274 :
5275 976653 : timevar_pop (TV_MACH_DEP);
5276 976653 : return 0;
5277 976653 : }
5278 :
5279 : } // anon namespace
5280 :
/* Factory: allocate and return a new pass_x86_cse bound to CTXT.
   The returned object is presumably owned by the pass manager -
   confirm against the caller.  */
5281 : rtl_opt_pass *
5282 298828 : make_pass_x86_cse (gcc::context *ctxt)
5283 : {
5284 298828 : return new pass_x86_cse (ctxt);
5285 : }
5286 :
5287 : /* Convert legacy instructions that clobber EFLAGS to APX_NF
5288 : instructions when there is no flag set between a flag
5289 : producer and user. */
5290 :
5291 : static unsigned int
5292 371 : ix86_apx_nf_convert (void)
5293 : {
/* Two phases: first scan each basic block and record candidate insns
   together with their pattern in CONVERTING_MAP, then strip the
   FLAGS_REG clobber from every recorded pattern.  Always returns 0.  */
5294 371 : timevar_push (TV_MACH_DEP);
5295 :
5296 371 : basic_block bb;
5297 371 : rtx_insn *insn;
/* CONVERTING_MAP: insn -> its pattern, for every insn to convert.
   CURRENT_CONVERT_LIST: the candidates of the sequence in progress,
   kept so they can be removed from the map again if the sequence is
   revoked.  */
5298 371 : hash_map <rtx_insn *, rtx> converting_map;
5299 371 : auto_vec <rtx_insn *> current_convert_list;
5300 :
5301 371 : bool converting_seq = false;
5302 371 : rtx cc = gen_rtx_REG (CCmode, FLAGS_REG);
5303 :
5304 794 : FOR_EACH_BB_FN (bb, cfun)
5305 : {
5306 : /* Reset conversion for each bb. */
5307 423 : converting_seq = false;
5308 5079 : FOR_BB_INSNS (bb, insn)
5309 : {
5310 4656 : if (!NONDEBUG_INSN_P (insn))
5311 4995 : continue;
5312 :
/* Skip insns that are not recognized.  */
5313 3712 : if (recog_memoized (insn) < 0)
5314 337 : continue;
5315 :
5316 : /* Convert candidate insns after cstore, which should
5317 : satisfy the two conditions:
5318 : 1. Is not flag user or producer, only clobbers
5319 : FLAGS_REG.
5320 : 2. Have corresponding nf pattern. */
5321 :
5322 3375 : rtx pat = PATTERN (insn);
5323 :
5324 : /* Starting conversion at first cstorecc. */
/* That is: a single set whose source is a comparison that mentions
   FLAGS_REG and whose destination does not mention it.  The candidate
   list is emptied when a new sequence starts.  */
5325 3375 : rtx set = NULL_RTX;
5326 3375 : if (!converting_seq
5327 2793 : && (set = single_set (insn))
5328 2717 : && ix86_comparison_operator (SET_SRC (set), VOIDmode)
5329 127 : && reg_overlap_mentioned_p (cc, SET_SRC (set))
5330 3499 : && !reg_overlap_mentioned_p (cc, SET_DEST (set)))
5331 : {
5332 124 : converting_seq = true;
5333 124 : current_convert_list.truncate (0);
5334 : }
5335 : /* Terminate at the next explicit flag set. */
5336 3251 : else if (reg_set_p (cc, pat)
5337 3251 : && GET_CODE (set_of (cc, pat)) != CLOBBER)
5338 : converting_seq = false;
5339 :
5340 3154 : if (!converting_seq)
5341 2770 : continue;
5342 :
5343 605 : if (get_attr_has_nf (insn)
5344 605 : && GET_CODE (pat) == PARALLEL)
5345 : {
5346 : /* Record the insn to candidate map. */
5347 72 : current_convert_list.safe_push (insn);
5348 72 : converting_map.put (insn, pat);
5349 : }
5350 : /* If the insn clobbers flags but has no nf_attr,
5351 : revoke all previous candidates. */
5352 533 : else if (!get_attr_has_nf (insn)
5353 532 : && reg_set_p (cc, pat)
5354 536 : && GET_CODE (set_of (cc, pat)) == CLOBBER)
5355 : {
/* The list itself is not emptied here; it is truncated the next time
   a sequence starts.  */
5356 3 : for (auto item : current_convert_list)
5357 0 : converting_map.remove (item);
5358 3 : converting_seq = false;
5359 : }
5360 : }
5361 : }
5362 :
5363 371 : if (!converting_map.is_empty ())
5364 : {
5365 85 : for (auto iter = converting_map.begin ();
5366 170 : iter != converting_map.end (); ++iter)
5367 : {
5368 72 : rtx_insn *replace = (*iter).first;
5369 72 : rtx pat = (*iter).second;
5370 72 : int i, n = 0, len = XVECLEN (pat, 0);
5371 72 : rtx *new_elems = XALLOCAVEC (rtx, len);
5372 72 : rtx new_pat;
/* Copy every element of the PARALLEL except clobbers that mention
   FLAGS_REG.  */
5373 216 : for (i = 0; i < len; i++)
5374 : {
5375 144 : rtx temp = XVECEXP (pat, 0, i);
5376 216 : if (! (GET_CODE (temp) == CLOBBER
5377 72 : && reg_overlap_mentioned_p (cc,
5378 72 : XEXP (temp, 0))))
5379 : {
5380 72 : new_elems[n] = temp;
5381 72 : n++;
5382 : }
5383 : }
5384 :
/* A single remaining element becomes the pattern itself; otherwise
   the remaining elements are wrapped in a new PARALLEL.  */
5385 72 : if (n == 1)
5386 72 : new_pat = new_elems[0];
5387 : else
5388 0 : new_pat =
5389 0 : gen_rtx_PARALLEL (VOIDmode,
5390 : gen_rtvec_v (n,
5391 : new_elems));
5392 :
/* Install the new pattern and force re-recognition.
   NOTE(review): the result of recog_memoized is not checked here;
   presumably the has_nf attribute guarantees that a matching pattern
   exists - confirm.  */
5393 72 : PATTERN (replace) = new_pat;
5394 72 : INSN_CODE (replace) = -1;
5395 72 : recog_memoized (replace);
5396 72 : df_insn_rescan (replace);
5397 : }
5398 : }
5399 :
5400 371 : timevar_pop (TV_MACH_DEP);
5401 371 : return 0;
5402 371 : }
5403 :
5404 :
5405 : namespace {
5406 :
/* Static description of the apx_nfcvt RTL pass.  */
5407 : const pass_data pass_data_apx_nf_convert =
5408 : {
5409 : RTL_PASS, /* type */
5410 : "apx_nfcvt", /* name */
5411 : OPTGROUP_NONE, /* optinfo_flags */
5412 : TV_MACH_DEP, /* tv_id */
5413 : 0, /* properties_required */
5414 : 0, /* properties_provided */
5415 : 0, /* properties_destroyed */
5416 : 0, /* todo_flags_start */
5417 : 0, /* todo_flags_finish */
5418 : };
5419 :
/* RTL pass wrapper whose execute method runs ix86_apx_nf_convert.  */
5420 : class pass_apx_nf_convert : public rtl_opt_pass
5421 : {
5422 : public:
5423 298828 : pass_apx_nf_convert (gcc::context *ctxt)
5424 597656 : : rtl_opt_pass (pass_data_apx_nf_convert, ctxt)
5425 : {}
5426 :
5427 : /* opt_pass methods: */
/* Run only when TARGET_APX_NF is set, optimization is enabled and
   the current function is optimized for speed.  */
5428 1488378 : bool gate (function *) final override
5429 : {
5430 1488378 : return (TARGET_APX_NF
5431 465 : && optimize
5432 1488833 : && optimize_function_for_speed_p (cfun));
5433 : }
5434 :
5435 371 : unsigned int execute (function *) final override
5436 : {
5437 371 : return ix86_apx_nf_convert ();
5438 : }
5439 : }; // class pass_apx_nf_convert
5440 :
5441 : } // anon namespace
5442 :
/* Factory: allocate and return a new pass_apx_nf_convert bound to
   CTXT.  The returned object is presumably owned by the pass manager -
   confirm against the caller.  */
5443 : rtl_opt_pass *
5444 298828 : make_pass_apx_nf_convert (gcc::context *ctxt)
5445 : {
5446 298828 : return new pass_apx_nf_convert (ctxt);
5447 : }
5448 :
5449 : /* When a hot loop can fit into one cacheline,
5450 : force align the loop without considering the max skip. */
5451 : static void
5452 976174 : ix86_align_loops ()
5453 : {
5454 976174 : basic_block bb;
5455 :
5456 : /* Don't do this when we don't know cache line size. */
5457 976174 : if (ix86_cost->prefetch_block == 0)
5458 9 : return;
5459 :
5460 976165 : loop_optimizer_init (AVOID_CFG_MODIFICATIONS);
/* Execution-count threshold used by the hotness tests below.  */
5461 976165 : profile_count count_threshold = cfun->cfg->count_max / param_align_threshold;
5462 11299539 : FOR_EACH_BB_FN (bb, cfun)
5463 : {
5464 10323374 : rtx_insn *label = BB_HEAD (bb);
5465 10323374 : bool has_fallthru = 0;
5466 10323374 : edge e;
5467 10323374 : edge_iterator ei;
5468 :
/* Only blocks that start with a label are considered.  */
5469 10323374 : if (!LABEL_P (label))
5470 5257974 : continue;
5471 :
5472 5070212 : profile_count fallthru_count = profile_count::zero ();
5473 5070212 : profile_count branch_count = profile_count::zero ();
5474 :
/* Sum the counts of incoming fallthru edges and of all other
   incoming edges separately.  */
5475 14735754 : FOR_EACH_EDGE (e, ei, bb->preds)
5476 : {
5477 9665542 : if (e->flags & EDGE_FALLTHRU)
5478 2463054 : has_fallthru = 1, fallthru_count += e->count ();
5479 : else
5480 7202488 : branch_count += e->count ();
5481 : }
5482 :
5483 5070212 : if (!fallthru_count.initialized_p () || !branch_count.initialized_p ())
5484 4812 : continue;
5485 :
/* Decide whether BB is worth aligning.  With a fallthru predecessor
   the block must not simply lead to the exit block, must be optimized
   for speed, must be hot enough, and must be entered by branch much
   more often than by fallthru.  Without one, a hot branch count or a
   block much hotter than its predecessor in layout is enough.  */
5486 5065400 : if (bb->loop_father
5487 5065400 : && bb->loop_father->latch != EXIT_BLOCK_PTR_FOR_FN (cfun)
5488 6407118 : && (has_fallthru
5489 1341718 : ? (!(single_succ_p (bb)
5490 146632 : && single_succ (bb) == EXIT_BLOCK_PTR_FOR_FN (cfun))
5491 928658 : && optimize_bb_for_speed_p (bb)
5492 848676 : && branch_count + fallthru_count > count_threshold
5493 725939 : && (branch_count > fallthru_count * param_align_loop_iterations))
5494 : /* In case there is no fallthru for the loop, the
5495 : nops inserted won't be executed. */
5496 413060 : : (branch_count > count_threshold
5497 137263 : || (bb->count > bb->prev_bb->count * 10
5498 12583 : && (bb->prev_bb->count
5499 4526995 : <= ENTRY_BLOCK_PTR_FOR_FN (cfun)->count / 2)))))
5500 : {
5501 550988 : rtx_insn* insn, *end_insn;
5502 550988 : HOST_WIDE_INT size = 0;
5503 550988 : bool padding_p = true;
5504 550988 : basic_block tbb = bb;
5505 550988 : unsigned cond_branch_num = 0;
5506 550988 : bool detect_tight_loop_p = false;
5507 :
/* Walk at most num_nodes blocks in layout order starting at BB,
   accumulating the instruction sizes in SIZE, until a block with an
   edge back to BB is found or a disqualifying condition clears
   PADDING_P.  */
5508 869732 : for (unsigned int i = 0; i != bb->loop_father->num_nodes;
5509 318744 : i++, tbb = tbb->next_bb)
5510 : {
5511 : /* Only handle continuous cfg layout. */
5512 869732 : if (bb->loop_father != tbb->loop_father)
5513 : {
5514 : padding_p = false;
5515 : break;
5516 : }
5517 :
5518 10322981 : FOR_BB_INSNS (tbb, insn)
5519 : {
5520 9653983 : if (!NONDEBUG_INSN_P (insn))
5521 5599918 : continue;
5522 4054065 : size += ix86_min_insn_size (insn);
5523 :
5524 : /* We don't know size of inline asm.
5525 : Don't align loop for call. */
5526 4054065 : if (asm_noperands (PATTERN (insn)) >= 0
5527 4054065 : || CALL_P (insn))
5528 : {
5529 : size = -1;
5530 : break;
5531 : }
5532 : }
5533 :
/* Give up when the size is unknown or exceeds prefetch_block, which
   is used here as the cache line size.  */
5534 825253 : if (size == -1 || size > ix86_cost->prefetch_block)
5535 : {
5536 : padding_p = false;
5537 : break;
5538 : }
5539 :
5540 1483268 : FOR_EACH_EDGE (e, ei, tbb->succs)
5541 : {
5542 : /* It could be part of the loop. */
5543 1024079 : if (e->dest == bb)
5544 : {
5545 : detect_tight_loop_p = true;
5546 : break;
5547 : }
5548 : }
5549 :
5550 643566 : if (detect_tight_loop_p)
5551 : break;
5552 :
5553 459189 : end_insn = BB_END (tbb);
5554 459189 : if (JUMP_P (end_insn))
5555 : {
5556 : /* For decoded icache:
5557 : 1. Up to two branches are allowed per Way.
5558 : 2. A non-conditional branch is the last micro-op in a Way.
5559 : */
5560 370945 : if (onlyjump_p (end_insn)
5561 370945 : && (any_uncondjump_p (end_insn)
5562 312452 : || single_succ_p (tbb)))
5563 : {
5564 : padding_p = false;
5565 : break;
5566 : }
5567 312452 : else if (++cond_branch_num >= 2)
5568 : {
5569 : padding_p = false;
5570 : break;
5571 : }
5572 : }
5573 :
5574 : }
5575 :
5576 550988 : if (padding_p && detect_tight_loop_p)
5577 : {
/* Emit the alignment insn before the label of BB.  Its first operand
   is ceil_log2 of the accumulated size; the second operand is 0,
   presumably the max-skip value - confirm against the
   max_skip_align pattern.  */
5578 368754 : emit_insn_before (gen_max_skip_align (GEN_INT (ceil_log2 (size)),
5579 : GEN_INT (0)), label);
5580 : /* End of function. */
5581 184377 : if (!tbb || tbb == EXIT_BLOCK_PTR_FOR_FN (cfun))
5582 : break;
5583 : /* Skip bb which already fits into one cacheline. */
5584 : bb = tbb;
5585 : }
5586 : }
5587 : }
5588 :
5589 976165 : loop_optimizer_finalize ();
/* NOTE(review): dominance info is freed here although this function
   does not compute it itself; presumably loop_optimizer_init does -
   confirm.  */
5590 976165 : free_dominance_info (CDI_DOMINATORS);
5591 : }
5592 :
5593 : namespace {
5594 :
/* Static description of the align_tight_loops RTL pass.  */
5595 : const pass_data pass_data_align_tight_loops =
5596 : {
5597 : RTL_PASS, /* type */
5598 : "align_tight_loops", /* name */
5599 : OPTGROUP_NONE, /* optinfo_flags */
5600 : TV_MACH_DEP, /* tv_id */
5601 : 0, /* properties_required */
5602 : 0, /* properties_provided */
5603 : 0, /* properties_destroyed */
5604 : 0, /* todo_flags_start */
5605 : 0, /* todo_flags_finish */
5606 : };
5607 :
/* RTL pass wrapper whose execute method runs ix86_align_loops.  */
5608 : class pass_align_tight_loops : public rtl_opt_pass
5609 : {
5610 : public:
5611 298828 : pass_align_tight_loops (gcc::context *ctxt)
5612 597656 : : rtl_opt_pass (pass_data_align_tight_loops, ctxt)
5613 : {}
5614 :
5615 : /* opt_pass methods: */
/* Run only when TARGET_ALIGN_TIGHT_LOOPS is set, optimization is
   enabled and the current function is optimized for speed.  */
5616 1488378 : bool gate (function *) final override
5617 : {
5618 1488378 : return TARGET_ALIGN_TIGHT_LOOPS
5619 1487892 : && optimize
5620 2529666 : && optimize_function_for_speed_p (cfun);
5621 : }
5622 :
/* The timevar is pushed and popped unconditionally, but the actual
   work is only done when ASM_OUTPUT_MAX_SKIP_ALIGN is defined.
   Always returns 0.  */
5623 976174 : unsigned int execute (function *) final override
5624 : {
5625 976174 : timevar_push (TV_MACH_DEP);
5626 : #ifdef ASM_OUTPUT_MAX_SKIP_ALIGN
5627 976174 : ix86_align_loops ();
5628 : #endif
5629 976174 : timevar_pop (TV_MACH_DEP);
5630 976174 : return 0;
5631 : }
5632 : }; // class pass_align_tight_loops
5633 :
5634 : } // anon namespace
5635 :
/* Factory: allocate and return a new pass_align_tight_loops bound to
   CTXT.  The returned object is presumably owned by the pass manager -
   confirm against the caller.  */
5636 : rtl_opt_pass *
5637 298828 : make_pass_align_tight_loops (gcc::context *ctxt)
5638 : {
5639 298828 : return new pass_align_tight_loops (ctxt);
5640 : }
5641 :
5642 : /* This compares the priority of target features in function DECL1
5643 : and DECL2. It returns positive value if DECL1 is higher priority,
5644 : negative value if DECL2 is higher priority and 0 if they are the
5645 : same. */
5646 :
5647 : int
5648 5812 : ix86_compare_version_priority (tree decl1, tree decl2)
5649 : {
/* Each priority comes from get_builtin_code_for_version called with a
   null second argument, i.e. no predicate chain is requested.  */
5650 5812 : unsigned int priority1 = get_builtin_code_for_version (decl1, NULL);
5651 5812 : unsigned int priority2 = get_builtin_code_for_version (decl2, NULL);
5652 :
/* Both values are converted to int before the subtraction.  */
5653 5812 : return (int)priority1 - (int)priority2;
5654 : }
5655 :
5656 : /* V1 and V2 point to function versions with different priorities
5657 : based on the target ISA. This function compares their priorities. */
/* This is the comparator passed to qsort in
   dispatch_function_versions.  The result is positive when the
   dispatch_priority of V2 is the greater one, so the array ends up in
   descending priority order.  */
5658 :
5659 : static int
5660 6858 : feature_compare (const void *v1, const void *v2)
5661 : {
/* This local struct repeats the field list of the struct declared in
   dispatch_function_versions, whose array is what qsort hands to this
   function; the two definitions must be kept in sync.  */
5662 6858 : typedef struct _function_version_info
5663 : {
5664 : tree version_decl;
5665 : tree predicate_chain;
5666 : unsigned int dispatch_priority;
5667 : } function_version_info;
5668 :
5669 6858 : const function_version_info c1 = *(const function_version_info *)v1;
5670 6858 : const function_version_info c2 = *(const function_version_info *)v2;
/* NOTE(review): the subtraction is done in unsigned int and converted
   to int on return, so the sign is only meaningful while the two
   priorities differ by no more than INT_MAX.  Presumably that holds
   for the priority values in use - confirm.  */
5671 6858 : return (c2.dispatch_priority - c1.dispatch_priority);
5672 : }
5673 :
5674 : /* This adds a condition to the basic_block NEW_BB in function FUNCTION_DECL
5675 : to return a pointer to VERSION_DECL if the outcome of the expression
5676 : formed by PREDICATE_CHAIN is true. This function will be called during
5677 : version dispatch to decide which function version to execute. It returns
5678 : the basic block at the end, to which more conditions can be added. */
5679 :
5680 : static basic_block
5681 839 : add_condition_to_bb (tree function_decl, tree version_decl,
5682 : tree predicate_chain, basic_block new_bb)
5683 : {
5684 839 : gimple *return_stmt;
5685 839 : tree convert_expr, result_var;
5686 839 : gimple *convert_stmt;
5687 839 : gimple *call_cond_stmt;
5688 839 : gimple *if_else_stmt;
5689 :
5690 839 : basic_block bb1, bb2, bb3;
5691 839 : edge e12, e23;
5692 :
5693 839 : tree cond_var, and_expr_var = NULL_TREE;
5694 839 : gimple_seq gseq;
5695 :
5696 839 : tree predicate_decl, predicate_arg;
5697 :
/* Work inside the function that owns NEW_BB; every return path below
   calls pop_cfun.  */
5698 839 : push_cfun (DECL_STRUCT_FUNCTION (function_decl));
5699 :
5700 839 : gcc_assert (new_bb != NULL);
5701 839 : gseq = bb_seq (new_bb);
5702 :
5703 :
/* Build the two statements shared by both paths: assign the address
   of VERSION_DECL, converted to a generic pointer, to a temporary,
   and return that temporary.  */
5704 839 : convert_expr = build1 (CONVERT_EXPR, ptr_type_node,
5705 : build_fold_addr_expr (version_decl));
5706 839 : result_var = create_tmp_var (ptr_type_node);
5707 839 : convert_stmt = gimple_build_assign (result_var, convert_expr);
5708 839 : return_stmt = gimple_build_return (result_var);
5709 :
/* No predicate: return VERSION_DECL unconditionally and hand back
   NEW_BB itself.  */
5710 839 : if (predicate_chain == NULL_TREE)
5711 : {
5712 201 : gimple_seq_add_stmt (&gseq, convert_stmt);
5713 201 : gimple_seq_add_stmt (&gseq, return_stmt);
5714 201 : set_bb_seq (new_bb, gseq);
5715 201 : gimple_set_bb (convert_stmt, new_bb);
5716 201 : gimple_set_bb (return_stmt, new_bb);
5717 201 : pop_cfun ();
5718 201 : return new_bb;
5719 : }
5720 :
/* Emit one call per predicate in the chain; each result goes into a
   fresh integer temporary.  */
5721 1315 : while (predicate_chain != NULL)
5722 : {
5723 677 : cond_var = create_tmp_var (integer_type_node);
5724 677 : predicate_decl = TREE_PURPOSE (predicate_chain);
5725 677 : predicate_arg = TREE_VALUE (predicate_chain);
5726 677 : call_cond_stmt = gimple_build_call (predicate_decl, 1, predicate_arg);
5727 677 : gimple_call_set_lhs (call_cond_stmt, cond_var);
5728 :
5729 677 : gimple_set_block (call_cond_stmt, DECL_INITIAL (function_decl));
5730 677 : gimple_set_bb (call_cond_stmt, new_bb);
5731 677 : gimple_seq_add_stmt (&gseq, call_cond_stmt);
5732 :
5733 677 : predicate_chain = TREE_CHAIN (predicate_chain);
5734 :
5735 677 : if (and_expr_var == NULL)
5736 : and_expr_var = cond_var;
5737 : else
5738 : {
5739 39 : gimple *assign_stmt;
5740 : /* Use MIN_EXPR to check if any integer is zero: the
5741 : minimum is > 0 only when every predicate result is > 0.
5742 : and_expr_var = min_expr <cond_var, and_expr_var> */
5742 39 : assign_stmt = gimple_build_assign (and_expr_var,
5743 : build2 (MIN_EXPR, integer_type_node,
5744 : cond_var, and_expr_var));
5745 :
5746 39 : gimple_set_block (assign_stmt, DECL_INITIAL (function_decl));
5747 39 : gimple_set_bb (assign_stmt, new_bb);
5748 39 : gimple_seq_add_stmt (&gseq, assign_stmt);
5749 : }
5750 : }
5751 :
/* Branch on the combined result being greater than zero.  */
5752 638 : if_else_stmt = gimple_build_cond (GT_EXPR, and_expr_var,
5753 : integer_zero_node,
5754 : NULL_TREE, NULL_TREE);
5755 638 : gimple_set_block (if_else_stmt, DECL_INITIAL (function_decl));
5756 638 : gimple_set_bb (if_else_stmt, new_bb);
5757 638 : gimple_seq_add_stmt (&gseq, if_else_stmt);
5758 :
5759 638 : gimple_seq_add_stmt (&gseq, convert_stmt);
5760 638 : gimple_seq_add_stmt (&gseq, return_stmt);
5761 638 : set_bb_seq (new_bb, gseq);
5762 :
/* Split the block twice: bb1 ends with the condition, bb2 holds the
   assignment and the return, and bb3 is the continuation.  The edge
   bb1->bb2 becomes the true edge, a false edge bb1->bb3 is added, and
   bb2 is rewired to the exit block instead of bb3.  */
5763 638 : bb1 = new_bb;
5764 638 : e12 = split_block (bb1, if_else_stmt);
5765 638 : bb2 = e12->dest;
5766 638 : e12->flags &= ~EDGE_FALLTHRU;
5767 638 : e12->flags |= EDGE_TRUE_VALUE;
5768 :
5769 638 : e23 = split_block (bb2, return_stmt);
5770 :
5771 638 : gimple_set_bb (convert_stmt, bb2);
5772 638 : gimple_set_bb (return_stmt, bb2);
5773 :
5774 638 : bb3 = e23->dest;
5775 638 : make_edge (bb1, bb3, EDGE_FALSE_VALUE);
5776 :
5777 638 : remove_edge (e23);
5778 638 : make_edge (bb2, EXIT_BLOCK_PTR_FOR_FN (cfun), 0);
5779 :
5780 638 : pop_cfun ();
5781 :
/* Return the continuation block so the caller can append the next
   condition to it.  */
5782 638 : return bb3;
5783 : }
5784 :
5785 : /* This function generates the dispatch function for
5786 : multi-versioned functions. DISPATCH_DECL is the function which will
5787 : contain the dispatch logic. FNDECLS are the function choices for
5788 : dispatch, and is a tree chain. EMPTY_BB is the basic block pointer
5789 : in DISPATCH_DECL in which the dispatch code is generated. */
/* As written below, FNDECLS_P is cast to a vec<tree> pointer whose
   element 0 is the default version.  *EMPTY_BB is updated as
   conditions are appended.  Always returns 0.  */
5790 :
5791 : static int
5792 201 : dispatch_function_versions (tree dispatch_decl,
5793 : void *fndecls_p,
5794 : basic_block *empty_bb)
5795 : {
5796 201 : tree default_decl;
5797 201 : gimple *ifunc_cpu_init_stmt;
5798 201 : gimple_seq gseq;
5799 201 : int ix;
5800 201 : tree ele;
5801 201 : vec<tree> *fndecls;
5802 201 : unsigned int num_versions = 0;
5803 201 : unsigned int actual_versions = 0;
5804 201 : unsigned int i;
5805 :
/* This struct repeats the field list of the struct declared locally
   in feature_compare, which is the qsort comparator used below; the
   two definitions must be kept in sync.  */
5806 201 : struct _function_version_info
5807 : {
5808 : tree version_decl;
5809 : tree predicate_chain;
5810 : unsigned int dispatch_priority;
5811 : }*function_version_info;
5812 :
5813 201 : gcc_assert (dispatch_decl != NULL
5814 : && fndecls_p != NULL
5815 : && empty_bb != NULL);
5816 :
5817 : /* fndecls_p is actually a vector. */
5818 201 : fndecls = static_cast<vec<tree> *> (fndecls_p);
5819 :
5820 : /* At least one more version other than the default. */
5821 201 : num_versions = fndecls->length ();
5822 201 : gcc_assert (num_versions >= 2);
5823 :
/* One slot per non-default version; the default at index 0 never gets
   an entry.  The array is released with free at the end.  */
5824 201 : function_version_info = (struct _function_version_info *)
5825 201 : XNEWVEC (struct _function_version_info, (num_versions - 1));
5826 :
5827 : /* The first version in the vector is the default decl. */
5828 201 : default_decl = (*fndecls)[0];
5829 :
5830 201 : push_cfun (DECL_STRUCT_FUNCTION (dispatch_decl));
5831 :
5832 201 : gseq = bb_seq (*empty_bb);
5833 : /* Function version dispatch is via IFUNC. IFUNC resolvers fire before
5834 : constructors, so explicitly call __builtin_cpu_init here. */
5835 201 : ifunc_cpu_init_stmt
5836 201 : = gimple_build_call_vec (get_ix86_builtin (IX86_BUILTIN_CPU_INIT), vNULL);
5837 201 : gimple_seq_add_stmt (&gseq, ifunc_cpu_init_stmt);
5838 201 : gimple_set_bb (ifunc_cpu_init_stmt, *empty_bb);
5839 201 : set_bb_seq (*empty_bb, gseq);
5840 :
5841 201 : pop_cfun ();
5842 :
5843 :
/* Start at index 1 to skip the default version.  */
5844 996 : for (ix = 1; fndecls->iterate (ix, &ele); ++ix)
5845 : {
5846 795 : tree version_decl = ele;
5847 795 : tree predicate_chain = NULL_TREE;
5848 795 : unsigned int priority;
5849 : /* Get attribute string, parse it and find the right predicate decl.
5850 : The predicate function could be a lengthy combination of many
5851 : features, like arch-type and various isa-variants. */
5852 795 : priority = get_builtin_code_for_version (version_decl,
5853 : &predicate_chain);
5854 :
/* Versions for which no predicate chain was produced are left out of
   the dispatch table.  */
5855 795 : if (predicate_chain == NULL_TREE)
5856 157 : continue;
5857 :
5858 638 : function_version_info [actual_versions].version_decl = version_decl;
5859 638 : function_version_info [actual_versions].predicate_chain
5860 638 : = predicate_chain;
5861 638 : function_version_info [actual_versions].dispatch_priority = priority;
5862 638 : actual_versions++;
5863 : }
5864 :
5865 : /* Sort the versions according to descending order of dispatch priority. The
5866 : priority is based on the ISA. This is not a perfect solution. There
5867 : could still be ambiguity. If more than one function version is suitable
5868 : to execute, which one should be dispatched? In future, allow the user
5869 : to specify a dispatch priority next to the version. */
5870 201 : qsort (function_version_info, actual_versions,
5871 : sizeof (struct _function_version_info), feature_compare);
5872 :
/* Append one guarded return per version, highest priority first.
   Each call hands back the block to which the next condition is
   added.  */
5873 1040 : for (i = 0; i < actual_versions; ++i)
5874 638 : *empty_bb = add_condition_to_bb (dispatch_decl,
5875 : function_version_info[i].version_decl,
5876 638 : function_version_info[i].predicate_chain,
5877 : *empty_bb);
5878 :
5879 : /* Dispatch the default version at the end. */
5880 201 : *empty_bb = add_condition_to_bb (dispatch_decl, default_decl,
5881 : NULL, *empty_bb);
5882 :
5883 201 : free (function_version_info);
5884 201 : return 0;
5885 : }
5886 :
5887 : /* This function changes the assembler name for functions that are
5888 : versions. If DECL is a function version and has a "target"
5889 : attribute, it appends the attribute string to its assembler name. */
/* ID is the identifier to extend; the new identifier is returned.
   DECL is required to carry a target attribute (asserted below).  */
5890 :
5891 : static tree
5892 1118 : ix86_mangle_function_version_assembler_name (tree decl, tree id)
5893 : {
5894 1118 : tree version_attr;
5895 1118 : char *attr_str;
5896 :
/* The two diagnostics below are reported but do not stop the
   function; mangling continues afterwards.  */
5897 1118 : if (DECL_DECLARED_INLINE_P (decl)
5898 1167 : && lookup_attribute ("gnu_inline",
5899 49 : DECL_ATTRIBUTES (decl)))
5900 0 : error_at (DECL_SOURCE_LOCATION (decl),
5901 : "function versions cannot be marked as %<gnu_inline%>,"
5902 : " bodies have to be generated");
5903 :
5904 1118 : if (DECL_VIRTUAL_P (decl)
5905 2236 : || DECL_VINDEX (decl))
5906 0 : sorry ("virtual function multiversioning not supported");
5907 :
5908 1118 : version_attr = lookup_attribute ("target", DECL_ATTRIBUTES (decl));
5909 :
5910 : /* target attribute string cannot be NULL. */
5911 1118 : gcc_assert (version_attr != NULL_TREE);
5912 :
/* ATTR_STR is released with XDELETEVEC below; presumably
   sorted_attr_string returns a buffer owned by the caller -
   confirm.  */
5913 1118 : attr_str = sorted_attr_string (TREE_VALUE (version_attr));
5914 :
5915 : /* Allow assembler name to be modified if already set. */
5916 1118 : if (DECL_ASSEMBLER_NAME_SET_P (decl))
5917 1103 : SET_DECL_RTL (decl, NULL);
5918 :
5919 1118 : tree ret = clone_identifier (id, attr_str, true);
5920 :
5921 1118 : XDELETEVEC (attr_str);
5922 :
5923 1118 : return ret;
5924 : }
5925 :
5926 : tree
5927 493879603 : ix86_mangle_decl_assembler_name (tree decl, tree id)
5928 : {
5929 : /* For function version, add the target suffix to the assembler name. */
5930 493879603 : if (TREE_CODE (decl) == FUNCTION_DECL)
5931 : {
5932 459472001 : cgraph_node *node = cgraph_node::get (decl);
5933 : /* Mangle all versions when annotated with target_clones, but only
5934 : non-default versions when annotated with target attributes. */
5935 459472001 : if (DECL_FUNCTION_VERSIONED (decl)
5936 459472001 : && (node->is_target_clone
5937 1089 : || !is_function_default_version (node->decl)))
5938 1118 : id = ix86_mangle_function_version_assembler_name (decl, id);
5939 : /* Mangle the dispatched symbol but only in the case of target clones. */
5940 459470883 : else if (node && node->dispatcher_function && !node->is_target_clone)
5941 117 : id = clone_identifier (id, "ifunc");
5942 64165655 : else if (node && node->dispatcher_resolver_function)
5943 201 : id = clone_identifier (id, "resolver");
5944 : }
5945 : #ifdef SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME
5946 : id = SUBTARGET_MANGLE_DECL_ASSEMBLER_NAME (decl, id);
5947 : #endif
5948 :
5949 493879603 : return id;
5950 : }
5951 :
5952 : /* Make a dispatcher declaration for the multi-versioned function DECL.
5953 : Calls to DECL function will be replaced with calls to the dispatcher
5954 : by the front-end. Returns the decl of the dispatcher function. */
5955 :
5956 : tree
5957 327 : ix86_get_function_versions_dispatcher (void *decl)
5958 : {
5959 327 : tree fn = (tree) decl;
5960 327 : struct cgraph_node *node = NULL;
5961 327 : struct cgraph_node *default_node = NULL;
5962 327 : struct cgraph_function_version_info *node_v = NULL;
5963 :
5964 327 : tree dispatch_decl = NULL;
5965 :
5966 327 : struct cgraph_function_version_info *default_version_info = NULL;
5967 :
5968 654 : gcc_assert (fn != NULL && DECL_FUNCTION_VERSIONED (fn));
5969 :
5970 327 : node = cgraph_node::get (fn);
5971 327 : gcc_assert (node != NULL);
5972 :
5973 327 : node_v = node->function_version ();
5974 327 : gcc_assert (node_v != NULL);
5975 :
5976 327 : if (node_v->dispatcher_resolver != NULL)
5977 : return node_v->dispatcher_resolver;
5978 :
5979 : /* The default node is always the beginning of the chain. */
5980 : default_version_info = node_v;
5981 675 : while (default_version_info->prev != NULL)
5982 : default_version_info = default_version_info->prev;
5983 213 : default_node = default_version_info->this_node;
5984 :
5985 : /* If there is no default node, just return NULL. */
5986 213 : if (!is_function_default_version (default_node->decl))
5987 : return NULL;
5988 :
5989 : #if defined (ASM_OUTPUT_TYPE_DIRECTIVE)
5990 204 : if (targetm.has_ifunc_p ())
5991 : {
5992 204 : struct cgraph_function_version_info *it_v = NULL;
5993 :
5994 : /* Right now, the dispatching is done via ifunc. */
5995 204 : dispatch_decl = make_dispatcher_decl (default_node->decl);
5996 :
5997 : /* Set the dispatcher for all the versions. */
5998 204 : it_v = default_version_info;
5999 1410 : while (it_v != NULL)
6000 : {
6001 1002 : it_v->dispatcher_resolver = dispatch_decl;
6002 1002 : it_v = it_v->next;
6003 : }
6004 : }
6005 : else
6006 : #endif
6007 : {
6008 0 : error_at (DECL_SOURCE_LOCATION (default_node->decl),
6009 : "multiversioning needs %<ifunc%> which is not supported "
6010 : "on this target");
6011 : }
6012 :
6013 : return dispatch_decl;
6014 : }
6015 :
6016 : /* Make the resolver function decl to dispatch the versions of
6017 : a multi-versioned function, DEFAULT_DECL. IFUNC_ALIAS_DECL is
6018 : ifunc alias that will point to the created resolver. Create an
6019 : empty basic block in the resolver and store the pointer in
6020 : EMPTY_BB. Return the decl of the resolver function. */
6021 :
6022 : static tree
6023 201 : make_resolver_func (const tree default_decl,
6024 : const tree ifunc_alias_decl,
6025 : basic_block *empty_bb)
6026 : {
6027 201 : tree decl, type, t;
6028 :
6029 : /* The resolver function should return a (void *). */
6030 201 : type = build_function_type_list (ptr_type_node, NULL_TREE);
6031 :
6032 201 : cgraph_node *node = cgraph_node::get (default_decl);
6033 201 : gcc_assert (node && node->function_version ());
6034 :
6035 201 : decl = build_fn_decl (IDENTIFIER_POINTER (DECL_NAME (default_decl)), type);
6036 :
6037 : /* Set the assembler name to prevent cgraph_node attempting to mangle. */
6038 201 : SET_DECL_ASSEMBLER_NAME (decl, DECL_ASSEMBLER_NAME (default_decl));
6039 :
6040 201 : cgraph_node *resolver_node = cgraph_node::get_create (decl);
6041 201 : resolver_node->dispatcher_resolver_function = true;
6042 :
6043 201 : if (node->is_target_clone)
6044 87 : resolver_node->is_target_clone = true;
6045 :
6046 201 : tree id = ix86_mangle_decl_assembler_name
6047 201 : (decl, node->function_version ()->assembler_name);
6048 201 : symtab->change_decl_assembler_name (decl, id);
6049 :
6050 201 : DECL_NAME (decl) = DECL_NAME (default_decl);
6051 201 : TREE_USED (decl) = 1;
6052 201 : DECL_ARTIFICIAL (decl) = 1;
6053 201 : DECL_IGNORED_P (decl) = 1;
6054 201 : TREE_PUBLIC (decl) = 0;
6055 201 : DECL_UNINLINABLE (decl) = 1;
6056 :
6057 : /* Resolver is not external, body is generated. */
6058 201 : DECL_EXTERNAL (decl) = 0;
6059 201 : DECL_EXTERNAL (ifunc_alias_decl) = 0;
6060 :
6061 201 : DECL_CONTEXT (decl) = NULL_TREE;
6062 201 : DECL_INITIAL (decl) = make_node (BLOCK);
6063 201 : DECL_STATIC_CONSTRUCTOR (decl) = 0;
6064 :
6065 201 : if (DECL_COMDAT_GROUP (default_decl)
6066 201 : || TREE_PUBLIC (default_decl))
6067 : {
6068 : /* In this case, each translation unit with a call to this
6069 : versioned function will put out a resolver. Ensure it
6070 : is comdat to keep just one copy. */
6071 177 : DECL_COMDAT (decl) = 1;
6072 177 : make_decl_one_only (decl, DECL_ASSEMBLER_NAME (decl));
6073 : }
6074 : else
6075 24 : TREE_PUBLIC (ifunc_alias_decl) = 0;
6076 :
6077 : /* Build result decl and add to function_decl. */
6078 201 : t = build_decl (UNKNOWN_LOCATION, RESULT_DECL, NULL_TREE, ptr_type_node);
6079 201 : DECL_CONTEXT (t) = decl;
6080 201 : DECL_ARTIFICIAL (t) = 1;
6081 201 : DECL_IGNORED_P (t) = 1;
6082 201 : DECL_RESULT (decl) = t;
6083 :
6084 201 : gimplify_function_tree (decl);
6085 201 : push_cfun (DECL_STRUCT_FUNCTION (decl));
6086 201 : *empty_bb = init_lowered_empty_function (decl, false,
6087 : profile_count::uninitialized ());
6088 :
6089 201 : cgraph_node::add_new_function (decl, true);
6090 201 : symtab->call_cgraph_insertion_hooks (cgraph_node::get_create (decl));
6091 :
6092 201 : pop_cfun ();
6093 :
6094 201 : gcc_assert (ifunc_alias_decl != NULL);
6095 : /* Mark ifunc_alias_decl as "ifunc" with resolver as resolver_name. */
6096 201 : DECL_ATTRIBUTES (ifunc_alias_decl)
6097 201 : = make_attribute ("ifunc", IDENTIFIER_POINTER (DECL_ASSEMBLER_NAME (decl)),
6098 201 : DECL_ATTRIBUTES (ifunc_alias_decl));
6099 :
6100 : /* Create the alias for dispatch to resolver here. */
6101 201 : cgraph_node::create_same_body_alias (ifunc_alias_decl, decl);
6102 201 : return decl;
6103 : }
6104 :
6105 : /* Generate the dispatching code body to dispatch multi-versioned function
6106 : DECL. The target hook is called to process the "target" attributes and
6107 : provide the code to dispatch the right function at run-time. NODE points
6108 : to the dispatcher decl whose body will be created. */
6109 :
6110 : tree
6111 201 : ix86_generate_version_dispatcher_body (void *node_p)
6112 : {
6113 201 : tree resolver_decl;
6114 201 : basic_block empty_bb;
6115 201 : tree default_ver_decl;
6116 201 : struct cgraph_node *versn;
6117 201 : struct cgraph_node *node;
6118 :
6119 201 : struct cgraph_function_version_info *node_version_info = NULL;
6120 201 : struct cgraph_function_version_info *versn_info = NULL;
6121 :
6122 201 : node = (cgraph_node *)node_p;
6123 :
6124 201 : node_version_info = node->function_version ();
6125 201 : gcc_assert (node->dispatcher_function
6126 : && node_version_info != NULL);
6127 :
6128 201 : if (node_version_info->dispatcher_resolver)
6129 : return node_version_info->dispatcher_resolver;
6130 :
6131 : /* The first version in the chain corresponds to the default version. */
6132 201 : default_ver_decl = node_version_info->next->this_node->decl;
6133 :
6134 : /* node is going to be an alias, so remove the finalized bit. */
6135 201 : node->definition = false;
6136 :
6137 201 : resolver_decl = make_resolver_func (default_ver_decl,
6138 : node->decl, &empty_bb);
6139 :
6140 201 : node_version_info->dispatcher_resolver = resolver_decl;
6141 :
6142 201 : push_cfun (DECL_STRUCT_FUNCTION (resolver_decl));
6143 :
6144 201 : auto_vec<tree, 2> fn_ver_vec;
6145 :
6146 1197 : for (versn_info = node_version_info->next; versn_info;
6147 996 : versn_info = versn_info->next)
6148 : {
6149 996 : versn = versn_info->this_node;
6150 : /* Check for virtual functions here again, as by this time it should
6151 : have been determined if this function needs a vtable index or
6152 : not. This happens for methods in derived classes that override
6153 : virtual methods in base classes but are not explicitly marked as
6154 : virtual. */
6155 996 : if (DECL_VIRTUAL_P (versn->decl))
6156 0 : sorry ("virtual function multiversioning not supported");
6157 :
6158 996 : fn_ver_vec.safe_push (versn->decl);
6159 : }
6160 :
6161 201 : dispatch_function_versions (resolver_decl, &fn_ver_vec, &empty_bb);
6162 201 : cgraph_edge::rebuild_edges ();
6163 201 : pop_cfun ();
6164 201 : return resolver_decl;
6165 201 : }
6166 :
6167 :
|