Line data Source code
1 : /* Scheduler hooks for IA-32 which implement CPU specific logic.
2 : Copyright (C) 1988-2026 Free Software Foundation, Inc.
3 :
4 : This file is part of GCC.
5 :
6 : GCC is free software; you can redistribute it and/or modify
7 : it under the terms of the GNU General Public License as published by
8 : the Free Software Foundation; either version 3, or (at your option)
9 : any later version.
10 :
11 : GCC is distributed in the hope that it will be useful,
12 : but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : GNU General Public License for more details.
15 :
16 : You should have received a copy of the GNU General Public License
17 : along with GCC; see the file COPYING3. If not see
18 : <http://www.gnu.org/licenses/>. */
19 :
20 : #define IN_TARGET_CODE 1
21 :
22 : #include "config.h"
23 : #include "system.h"
24 : #include "coretypes.h"
25 : #include "backend.h"
26 : #include "rtl.h"
27 : #include "tree.h"
28 : #include "cfghooks.h"
29 : #include "tm_p.h"
30 : #include "target.h"
31 : #include "insn-config.h"
32 : #include "insn-attr.h"
33 : #include "insn-opinit.h"
34 : #include "recog.h"
35 : #include "tm-constrs.h"
36 :
37 : /* Return the maximum number of instructions a cpu can issue. */
38 :
39 : int
40 36648759 : ix86_issue_rate (void)
41 : {
42 36648759 : switch (ix86_tune)
43 : {
44 : case PROCESSOR_PENTIUM:
45 : case PROCESSOR_LAKEMONT:
46 : case PROCESSOR_BONNELL:
47 : case PROCESSOR_SILVERMONT:
48 : case PROCESSOR_K6:
49 : case PROCESSOR_BTVER2:
50 : case PROCESSOR_PENTIUM4:
51 : case PROCESSOR_NOCONA:
52 : return 2;
53 :
54 : case PROCESSOR_PENTIUMPRO:
55 : case PROCESSOR_ATHLON:
56 : case PROCESSOR_K8:
57 : case PROCESSOR_AMDFAM10:
58 : case PROCESSOR_BTVER1:
59 : case PROCESSOR_LUJIAZUI:
60 : return 3;
61 :
62 : case PROCESSOR_BDVER1:
63 : case PROCESSOR_BDVER2:
64 : case PROCESSOR_BDVER3:
65 : case PROCESSOR_BDVER4:
66 : case PROCESSOR_ZNVER1:
67 : case PROCESSOR_ZNVER2:
68 : case PROCESSOR_ZNVER3:
69 : case PROCESSOR_ZNVER4:
70 : case PROCESSOR_CORE2:
71 : case PROCESSOR_NEHALEM:
72 : case PROCESSOR_SANDYBRIDGE:
73 : case PROCESSOR_HASWELL:
74 : case PROCESSOR_TREMONT:
75 : case PROCESSOR_SKYLAKE:
76 : case PROCESSOR_SKYLAKE_AVX512:
77 : case PROCESSOR_CASCADELAKE:
78 : case PROCESSOR_CANNONLAKE:
79 : case PROCESSOR_ALDERLAKE:
80 : case PROCESSOR_YONGFENG:
81 : case PROCESSOR_SHIJIDADAO:
82 : case PROCESSOR_SIERRAFOREST:
83 : case PROCESSOR_INTEL:
84 : case PROCESSOR_GENERIC:
85 : /* For znver5 the decoders can handle 4 or 8 instructions per cycle,
86 : the op cache 12 instructions/cycle, dispatch 8 instructions,
87 : integer rename 8 instructions and FP 6 instructions.
88 :
89 : The scheduler, which does not model the out-of-order nature of the
90 : CPU, is not going to be able to use more than 4 instructions, since
91 : that is the limit of the decoders. */
92 : case PROCESSOR_ZNVER5:
93 : case PROCESSOR_ZNVER6:
94 : case PROCESSOR_C86_4G_M4:
95 : case PROCESSOR_C86_4G_M6:
96 : case PROCESSOR_C86_4G_M7:
97 : return 4;
98 :
99 : case PROCESSOR_ICELAKE_CLIENT:
100 : case PROCESSOR_ICELAKE_SERVER:
101 : case PROCESSOR_TIGERLAKE:
102 : case PROCESSOR_COOPERLAKE:
103 : case PROCESSOR_ROCKETLAKE:
104 : return 5;
105 :
106 : case PROCESSOR_SAPPHIRERAPIDS:
107 : case PROCESSOR_GRANITERAPIDS:
108 : case PROCESSOR_GRANITERAPIDS_D:
109 : case PROCESSOR_DIAMONDRAPIDS:
110 : case PROCESSOR_GRANDRIDGE:
111 : case PROCESSOR_CLEARWATERFOREST:
112 : case PROCESSOR_ARROWLAKE:
113 : case PROCESSOR_ARROWLAKE_S:
114 : case PROCESSOR_PANTHERLAKE:
115 : return 6;
116 :
117 : case PROCESSOR_NOVALAKE:
118 : return 8;
119 :
120 : default:
121 : return 1;
122 : }
123 : }
124 :
125 : /* Return true iff USE_INSN has a memory address with operands set by
126 : SET_INSN. */
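/* An illustrative example (hypothetical, for exposition only): with

     set_insn:  movl %ebx, %eax       ; (set (reg:SI ax) (reg:SI bx))
     use_insn:  movl 4(%eax), %ecx    ; (set (reg:SI cx) (mem:SI (plus (reg:SI ax) (const_int 4))))

   the address of USE_INSN's memory operand uses %eax, which SET_INSN
   modifies, so ix86_agi_dependent returns true.  The push/pop cases that
   address memory only through the stack pointer are filtered out below.  */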
127 :
128 : bool
129 9891802 : ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
130 : {
131 9891802 : int i;
132 9891802 : extract_insn_cached (use_insn);
133 12166799 : for (i = recog_data.n_operands - 1; i >= 0; --i)
134 11808088 : if (MEM_P (recog_data.operand[i]))
135 : {
136 9533091 : rtx addr = XEXP (recog_data.operand[i], 0);
137 9533091 : if (modified_in_p (addr, set_insn) != 0)
138 : {
139 : /* No AGI stall if SET_INSN is a push or pop and USE_INSN
140 : has SP based memory (unless index reg is modified in a pop). */
141 4314012 : rtx set = single_set (set_insn);
142 4314012 : if (set
143 4314012 : && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
144 3535837 : || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
145 : {
146 591284 : struct ix86_address parts;
147 591284 : if (ix86_decompose_address (addr, &parts)
148 591284 : && parts.base == stack_pointer_rtx
149 1182396 : && (parts.index == NULL_RTX
150 471 : || MEM_P (SET_DEST (set))
151 2 : || !modified_in_p (parts.index, set_insn)))
152 591111 : return false;
153 : }
154 3722901 : return true;
155 : }
156 : return false;
157 : }
158 : return false;
159 : }
160 :
161 : /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads the flags
162 : set by DEP_INSN and nothing else set by DEP_INSN. */
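/* An illustrative example (hypothetical, for exposition only): with

     dep_insn:  testl %eax, %eax      ; single set of the flags register
     insn:      je .L1                ; TYPE_IBR, reads only the flags

   INSN reads the flags and nothing else written by DEP_INSN, so the
   function returns true and ix86_adjust_cost can give the pair zero
   latency on Pentium.  If DEP_INSN were the two-SET PARALLEL form of an
   add that also writes %eax, and INSN used %eax as well, the function
   would return false.  */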
163 :
164 : static bool
165 0 : ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
166 : {
167 0 : rtx set, set2;
168 :
169 : /* Simplify the test for uninteresting insns. */
170 0 : if (insn_type != TYPE_SETCC
171 0 : && insn_type != TYPE_ICMOV
172 0 : && insn_type != TYPE_FCMOV
173 0 : && insn_type != TYPE_IBR)
174 : return false;
175 :
176 0 : if ((set = single_set (dep_insn)) != 0)
177 : {
178 0 : set = SET_DEST (set);
179 0 : set2 = NULL_RTX;
180 : }
181 0 : else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
182 0 : && XVECLEN (PATTERN (dep_insn), 0) == 2
183 0 : && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
184 0 : && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
185 : {
186 0 : set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
187 0 : set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 1));
188 : }
189 : else
190 : return false;
191 :
192 0 : if (!REG_P (set) || REGNO (set) != FLAGS_REG)
193 : return false;
194 :
195 : /* This test is true if the dependent insn reads the flags but
196 : not any other potentially set register. */
197 0 : if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
198 : return false;
199 :
200 0 : if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
201 : return false;
202 :
203 : return true;
204 : }
205 :
206 : /* Helper function for exact_store_load_dependency.
207 : Return true if addr is found in insn. */
208 : static bool
209 0 : exact_dependency_1 (rtx addr, rtx insn)
210 : {
211 0 : enum rtx_code code;
212 0 : const char *format_ptr;
213 0 : int i, j;
214 :
215 0 : code = GET_CODE (insn);
216 0 : switch (code)
217 : {
218 0 : case MEM:
219 0 : if (rtx_equal_p (addr, insn))
220 : return true;
221 : break;
222 : case REG:
223 : CASE_CONST_ANY:
224 : case SYMBOL_REF:
225 : case CODE_LABEL:
226 : case PC:
227 : case EXPR_LIST:
228 : return false;
229 : default:
230 : break;
231 : }
232 :
233 0 : format_ptr = GET_RTX_FORMAT (code);
234 0 : for (i = 0; i < GET_RTX_LENGTH (code); i++)
235 : {
236 0 : switch (*format_ptr++)
237 : {
238 0 : case 'e':
239 0 : if (exact_dependency_1 (addr, XEXP (insn, i)))
240 : return true;
241 : break;
242 : case 'E':
243 0 : for (j = 0; j < XVECLEN (insn, i); j++)
244 0 : if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
245 : return true;
246 : break;
247 : }
248 : }
249 : return false;
250 : }
251 :
252 : /* Return true if there exists an exact dependency between store & load,
253 : i.e. the same memory address is used in both. */
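/* An illustrative example (hypothetical, for exposition only): with

     store:  movw %ax, 8(%rsp)        ; (set (mem:HI (plus rsp 8)) (reg:HI ax))
     load:   movw 8(%rsp), %dx        ; (set (reg:HI dx) (mem:HI (plus rsp 8)))

   the two memory references are structurally identical (same mode and
   same address), so the function returns true.  A load from 10(%rsp),
   or one in a different mode, would not match.  */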
254 : static bool
255 0 : exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
256 : {
257 0 : rtx set1, set2;
258 :
259 0 : set1 = single_set (store);
260 0 : if (!set1)
261 : return false;
262 0 : if (!MEM_P (SET_DEST (set1)))
263 : return false;
264 0 : set2 = single_set (load);
265 0 : if (!set2)
266 : return false;
267 0 : if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
268 : return true;
269 : return false;
270 : }
271 :
272 :
273 : /* This function corrects the value of COST (latency) based on the relationship
274 : between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
275 : DW. It should return the new value.
276 :
277 : On x86 CPUs this is most commonly used to model the fact that values of
278 : registers used to compute the address of a memory operand need to be ready
279 : earlier than values of registers used in the actual operation. */
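/* An illustrative sketch (hypothetical, for exposition only) of the
   Core/generic case below:

     dep_insn:  imull %esi, %edi      ; produces %edi
     insn:      addl 16(%rsp), %edi   ; load whose address does not use %edi

   The dependence is only through the non-address operand, so the load
   part of INSN can start while %edi is still being computed and the
   cost is reduced by 4 (never below 0).  With "addl (%rdi), %eax" as
   INSN, the address itself depends on DEP_INSN, so the cost is left
   unchanged.  */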
280 :
281 : int
282 151178103 : ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
283 : unsigned int)
284 : {
285 151178103 : enum attr_type insn_type, dep_insn_type;
286 151178103 : enum attr_memory memory;
287 151178103 : rtx set, set2;
288 151178103 : int dep_insn_code_number;
289 :
290 : /* Anti and output dependencies have zero cost on all CPUs. */
291 151178103 : if (dep_type != 0)
292 : return 0;
293 :
294 53877513 : dep_insn_code_number = recog_memoized (dep_insn);
295 :
296 : /* If we can't recognize the insns, we can't really do anything. */
297 53877513 : if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
298 354556 : return cost;
299 :
300 53522957 : insn_type = get_attr_type (insn);
301 53522957 : dep_insn_type = get_attr_type (dep_insn);
302 :
303 53522957 : switch (ix86_tune)
304 : {
305 0 : case PROCESSOR_PENTIUM:
306 0 : case PROCESSOR_LAKEMONT:
307 : /* Address Generation Interlock adds a cycle of latency. */
308 0 : if (insn_type == TYPE_LEA)
309 : {
310 0 : rtx addr = PATTERN (insn);
311 :
312 0 : if (GET_CODE (addr) == PARALLEL)
313 0 : addr = XVECEXP (addr, 0, 0);
314 :
315 0 : gcc_assert (GET_CODE (addr) == SET);
316 :
317 0 : addr = SET_SRC (addr);
318 0 : if (modified_in_p (addr, dep_insn))
319 0 : cost += 1;
320 : }
321 0 : else if (ix86_agi_dependent (dep_insn, insn))
322 0 : cost += 1;
323 :
324 : /* ??? Compares pair with jump/setcc. */
325 0 : if (ix86_flags_dependent (insn, dep_insn, insn_type))
326 0 : cost = 0;
327 :
328 : /* Floating point stores require value to be ready one cycle earlier. */
329 0 : if (insn_type == TYPE_FMOV
330 0 : && get_attr_memory (insn) == MEMORY_STORE
331 0 : && !ix86_agi_dependent (dep_insn, insn))
332 0 : cost += 1;
333 : break;
334 :
335 0 : case PROCESSOR_PENTIUMPRO:
336 : /* INT->FP conversion is expensive. */
337 0 : if (get_attr_fp_int_src (dep_insn))
338 0 : cost += 5;
339 :
340 : /* There is one cycle extra latency between an FP op and a store. */
341 0 : if (insn_type == TYPE_FMOV
342 0 : && (set = single_set (dep_insn)) != NULL_RTX
343 0 : && (set2 = single_set (insn)) != NULL_RTX
344 0 : && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
345 0 : && MEM_P (SET_DEST (set2)))
346 0 : cost += 1;
347 :
348 0 : memory = get_attr_memory (insn);
349 :
350 : /* Show the ability of the reorder buffer to hide the latency of a load
351 : by executing it in parallel with the previous instruction, when the
352 : previous instruction is not needed to compute the address. */
353 0 : if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
354 0 : && !ix86_agi_dependent (dep_insn, insn))
355 : {
356 : /* Claim moves to take one cycle, as the core can issue one load
357 : at a time and the next load can start a cycle later. */
358 0 : if (dep_insn_type == TYPE_IMOV
359 0 : || dep_insn_type == TYPE_FMOV)
360 : cost = 1;
361 0 : else if (cost > 1)
362 0 : cost--;
363 : }
364 : break;
365 :
366 0 : case PROCESSOR_K6:
367 : /* The esp dependency is resolved before
368 : the instruction is really finished. */
369 0 : if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
370 0 : && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
371 : return 1;
372 :
373 : /* INT->FP conversion is expensive. */
374 0 : if (get_attr_fp_int_src (dep_insn))
375 0 : cost += 5;
376 :
377 0 : memory = get_attr_memory (insn);
378 :
379 : /* Show the ability of the reorder buffer to hide the latency of a load
380 : by executing it in parallel with the previous instruction, when the
381 : previous instruction is not needed to compute the address. */
382 0 : if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
383 0 : && !ix86_agi_dependent (dep_insn, insn))
384 : {
385 : /* Claim moves to take one cycle, as the core can issue one load
386 : at a time and the next load can start a cycle later. */
387 0 : if (dep_insn_type == TYPE_IMOV
388 0 : || dep_insn_type == TYPE_FMOV)
389 : cost = 1;
390 0 : else if (cost > 2)
391 0 : cost -= 2;
392 : else
393 : cost = 1;
394 : }
395 : break;
396 :
397 8961 : case PROCESSOR_AMDFAM10:
398 8961 : case PROCESSOR_BDVER1:
399 8961 : case PROCESSOR_BDVER2:
400 8961 : case PROCESSOR_BDVER3:
401 8961 : case PROCESSOR_BDVER4:
402 8961 : case PROCESSOR_BTVER1:
403 8961 : case PROCESSOR_BTVER2:
404 : /* The stack engine allows push&pop instructions to execute in parallel. */
405 8961 : if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
406 219 : && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
407 : return 0;
408 : /* FALLTHRU */
409 :
410 73826 : case PROCESSOR_ATHLON:
411 73826 : case PROCESSOR_K8:
412 73826 : memory = get_attr_memory (insn);
413 :
414 : /* Show the ability of the reorder buffer to hide the latency of a load
415 : by executing it in parallel with the previous instruction, when the
416 : previous instruction is not needed to compute the address. */
417 73826 : if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
418 73826 : && !ix86_agi_dependent (dep_insn, insn))
419 : {
420 10371 : enum attr_unit unit = get_attr_unit (insn);
421 10371 : int loadcost = 3;
422 :
423 : /* Because of the difference between the lengths of the integer and
424 : floating-point unit pipeline preparation stages, memory operands
425 : for floating point are cheaper.
426 :
427 : ??? For Athlon the difference is most probably 2. */
428 10371 : if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
429 : loadcost = 3;
430 : else
431 5071 : loadcost = TARGET_CPU_P (ATHLON) ? 2 : 0;
432 :
433 10371 : if (cost >= loadcost)
434 5841 : cost -= loadcost;
435 : else
436 : cost = 0;
437 : }
438 : break;
439 :
440 4669 : case PROCESSOR_ZNVER1:
441 4669 : case PROCESSOR_ZNVER2:
442 4669 : case PROCESSOR_ZNVER3:
443 4669 : case PROCESSOR_ZNVER4:
444 4669 : case PROCESSOR_ZNVER5:
445 4669 : case PROCESSOR_ZNVER6:
446 4669 : case PROCESSOR_C86_4G_M4:
447 4669 : case PROCESSOR_C86_4G_M6:
448 4669 : case PROCESSOR_C86_4G_M7:
449 : /* The stack engine allows push&pop instructions to execute in parallel. */
450 4669 : if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
451 559 : && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
452 : return 0;
453 :
454 4410 : memory = get_attr_memory (insn);
455 :
456 : /* Show the ability of the reorder buffer to hide the latency of a load
457 : by executing it in parallel with the previous instruction, when the
458 : previous instruction is not needed to compute the address. */
459 4410 : if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
460 4410 : && !ix86_agi_dependent (dep_insn, insn))
461 : {
462 687 : enum attr_unit unit = get_attr_unit (insn);
463 687 : int loadcost;
464 :
465 : /* TODO: On znver5 complex addressing modes have
466 : greater latency. */
467 687 : if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
468 : loadcost = 4;
469 : else
470 449 : loadcost = 7;
471 :
472 687 : if (cost >= loadcost)
473 151 : cost -= loadcost;
474 : else
475 : cost = 0;
476 : }
477 : break;
478 :
479 0 : case PROCESSOR_YONGFENG:
480 0 : case PROCESSOR_SHIJIDADAO:
481 : /* The stack engine allows push&pop instructions to execute in parallel. */
482 0 : if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
483 0 : && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
484 : return 0;
485 : /* FALLTHRU */
486 :
487 0 : case PROCESSOR_LUJIAZUI:
488 0 : memory = get_attr_memory (insn);
489 :
490 : /* Show the ability of the reorder buffer to hide the latency of a load
491 : by executing it in parallel with the previous instruction, when the
492 : previous instruction is not needed to compute the address. */
493 0 : if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
494 0 : && !ix86_agi_dependent (dep_insn, insn))
495 : {
496 0 : int loadcost = 4;
497 :
498 0 : if (cost >= loadcost)
499 0 : cost -= loadcost;
500 : else
501 : cost = 0;
502 : }
503 : break;
504 :
505 53413843 : case PROCESSOR_CORE2:
506 53413843 : case PROCESSOR_NEHALEM:
507 53413843 : case PROCESSOR_SANDYBRIDGE:
508 53413843 : case PROCESSOR_HASWELL:
509 53413843 : case PROCESSOR_TREMONT:
510 53413843 : case PROCESSOR_ALDERLAKE:
511 53413843 : case PROCESSOR_INTEL:
512 53413843 : case PROCESSOR_GENERIC:
513 : /* The stack engine allows push&pop instructions to execute in parallel. */
514 53413843 : if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
515 8709739 : && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
516 : return 0;
517 :
518 47881485 : memory = get_attr_memory (insn);
519 :
520 : /* Show the ability of the reorder buffer to hide the latency of a load
521 : by executing it in parallel with the previous instruction, when the
522 : previous instruction is not needed to compute the address. */
523 47881485 : if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
524 47881485 : && !ix86_agi_dependent (dep_insn, insn))
525 : {
526 6157772 : if (cost >= 4)
527 214782 : cost -= 4;
528 : else
529 : cost = 0;
530 : }
531 : break;
532 :
533 939 : case PROCESSOR_SILVERMONT:
534 939 : if (!reload_completed)
535 : return cost;
536 :
537 : /* Increase cost of integer loads. */
538 939 : memory = get_attr_memory (dep_insn);
539 939 : if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
540 : {
541 214 : enum attr_unit unit = get_attr_unit (dep_insn);
542 214 : if (unit == UNIT_INTEGER && cost == 1)
543 : {
544 165 : if (memory == MEMORY_LOAD)
545 : cost = 3;
546 : else
547 : {
548 : /* Increase cost of ld/st for short integer types only
549 : because of a store-forwarding issue. */
550 0 : rtx set = single_set (dep_insn);
551 0 : if (set && (GET_MODE (SET_DEST (set)) == QImode
552 0 : || GET_MODE (SET_DEST (set)) == HImode))
553 : {
554 : /* Increase cost of the store/load insn if an exact
555 : dependence exists and it is a load insn. */
556 0 : enum attr_memory insn_memory = get_attr_memory (insn);
557 0 : if (insn_memory == MEMORY_LOAD
558 0 : && exact_store_load_dependency (dep_insn, insn))
559 : cost = 3;
560 : }
561 : }
562 : }
563 : }
564 :
565 : default:
566 : break;
567 : }
568 :
569 : return cost;
570 : }
571 :
572 : /* How many alternative schedules to try. This should be as wide as the
573 : scheduling freedom in the DFA, but no wider. Making this value too
574 : large results in extra work for the scheduler. */
575 :
576 : int
577 961988 : ia32_multipass_dfa_lookahead (void)
578 : {
579 : /* Generally, we want haifa-sched:max_issue() to look ahead as far as
580 : the number of instructions that can be executed in a cycle, i.e.,
581 : issue_rate. */
582 961988 : if (reload_completed)
583 961549 : return ix86_issue_rate ();
584 : /* Don't use lookahead for pre-reload schedule to save compile time. */
585 : return 0;
586 : }
587 :
588 : /* Return true if target platform supports macro-fusion. */
589 :
590 : bool
591 107481657 : ix86_macro_fusion_p ()
592 : {
593 107481657 : return TARGET_FUSE_CMP_AND_BRANCH;
594 : }
595 :
596 : /* Check whether MOV is a reg-reg move and ALU is an
597 : ALU operation that allows macro-op fusion. */
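/* An illustrative example (hypothetical, for exposition only) of a pair
   this function accepts:

     mov:  movl %ebx, %eax            ; reg-reg move into %eax
     alu:  addl $16, %eax             ; %eax is an operand and the immediate fits

   Pairs are rejected when, for instance, the ALU operation does not use
   the moved register at all ("addl $16, %ecx") or when both ALU operands
   are the move destination ("addl %eax, %eax").  */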
598 :
599 : static bool
600 2069 : ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
601 : {
602 : /* Validate mov:
603 : - It should be a reg-reg move with opcode 0x89 or 0x8B. */
604 2069 : rtx set1 = PATTERN (mov);
605 2069 : if (GET_CODE (set1) != SET
606 1854 : || !GENERAL_REG_P (SET_SRC (set1))
607 2288 : || !GENERAL_REG_P (SET_DEST (set1)))
608 : return false;
609 91 : rtx reg = SET_DEST (set1);
610 : /* - it should have 0x89 or 0x8B opcode. */
611 91 : if (!INTEGRAL_MODE_P (GET_MODE (reg))
612 182 : || GET_MODE_SIZE (GET_MODE (reg)) < 2
613 182 : || GET_MODE_SIZE (GET_MODE (reg)) > 8)
614 : return false;
615 : /* Validate ALU. */
616 91 : if (GET_CODE (PATTERN (alu)) != PARALLEL)
617 : return false;
618 24 : rtx set2 = XVECEXP (PATTERN (alu), 0, 0);
619 24 : if (GET_CODE (set2) != SET)
620 : return false;
621 : /* If this is an instruction setting both the flags and a normal
622 : register, the first set always sets the flags, while the
623 : second set writes to the output operand. Pick
624 : the second set. */
625 24 : if (GET_CODE (SET_SRC (set2)) == COMPARE)
626 : {
627 0 : set2 = XVECEXP (PATTERN (alu), 0, 1);
628 0 : if (GET_CODE (set2) != SET)
629 : return false;
630 : }
631 : /* Match one of:
632 : ADD ADC AND XOR OR SUB SBB INC DEC NOT SAL SHL SHR SAR
633 : We may also add an insn attribute to handle some of the sporadic
634 : cases where we output those with different RTX expressions. */
635 :
636 24 : if (GET_CODE (SET_SRC (set2)) != PLUS
637 24 : && GET_CODE (SET_SRC (set2)) != MINUS
638 : && GET_CODE (SET_SRC (set2)) != XOR
639 : && GET_CODE (SET_SRC (set2)) != AND
640 : && GET_CODE (SET_SRC (set2)) != IOR
641 : && GET_CODE (SET_SRC (set2)) != NOT
642 : && GET_CODE (SET_SRC (set2)) != ASHIFT
643 : && GET_CODE (SET_SRC (set2)) != ASHIFTRT
644 : && GET_CODE (SET_SRC (set2)) != LSHIFTRT)
645 : return false;
646 24 : rtx op0 = XEXP (SET_SRC (set2), 0);
647 24 : rtx op1 = GET_CODE (SET_SRC (set2)) != NOT ? XEXP (SET_SRC (set2), 1) : NULL;
648 : /* One of operands should be register. */
649 24 : if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg)))
650 : std::swap (op0, op1);
651 24 : if (!REG_P (op0) || REGNO (op0) != REGNO (reg))
652 : return false;
653 23 : if (op1
654 23 : && !REG_P (op1)
655 39 : && !x86_64_immediate_operand (op1, VOIDmode))
656 : return false;
657 : /* Only one of the two operands may be the move destination. */
658 23 : if (op1 && REG_P (op1) && REGNO (op1) == REGNO (reg))
659 : return false;
660 : return true;
661 : }
662 :
663 : /* Check whether the current microarchitecture supports macro fusion
664 : for the insn pair "CONDGEN + CONDJMP". Refer to the
665 : "Intel Architectures Optimization Reference Manual". */
666 :
667 : bool
668 87369234 : ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
669 : {
670 87369234 : if (TARGET_FUSE_MOV_AND_ALU
671 87369234 : && ix86_fuse_mov_alu_p (condgen, condjmp))
672 : return true;
673 87369211 : rtx src, imm = NULL_RTX;
674 87369211 : enum rtx_code ccode;
675 87369211 : rtx compare_set = NULL_RTX, test_if, cond;
676 87369211 : rtx alu_set = NULL_RTX, addr = NULL_RTX;
677 87369211 : rtx alu_clobber = NULL_RTX;
678 87369211 : enum attr_type condgen_type;
679 :
680 87369211 : if (!any_condjump_p (condjmp))
681 : return false;
682 :
683 13058834 : unsigned int condreg1, condreg2;
684 13058834 : rtx cc_reg_1;
685 13058834 : targetm.fixed_condition_code_regs (&condreg1, &condreg2);
686 13058834 : cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
687 13058834 : if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
688 13058759 : || !condgen
689 26117593 : || !modified_in_p (cc_reg_1, condgen))
690 118811 : return false;
691 :
692 12940023 : condgen_type = get_attr_type (condgen);
693 12940023 : if (condgen_type == TYPE_MULTI
694 418 : && INSN_CODE (condgen) == code_for_stack_protect_test_1 (ptr_mode)
695 12940441 : && TARGET_FUSE_ALU_AND_BRANCH)
696 : {
697 : /* stack_protect_test_<mode> ends with a sub, which subtracts
698 : a non-rip special memory operand from a GPR. */
699 418 : src = NULL_RTX;
700 418 : alu_set = XVECEXP (PATTERN (condgen), 0, 1);
701 418 : goto handle_stack_protect_test;
702 : }
703 : /* ??? zen5 can fuse cmp, test, sub, add, inc, dec, or, and xor.
704 : Cores cannot fuse or and xor, which will nevertheless pass the test
705 : below since their type is ALU. */
706 12939605 : else if (condgen_type != TYPE_TEST
707 12939605 : && condgen_type != TYPE_ICMP
708 12939605 : && condgen_type != TYPE_INCDEC
709 581544 : && condgen_type != TYPE_ALU)
710 : return false;
711 :
712 12557461 : compare_set = single_set (condgen);
713 12557461 : if (compare_set == NULL_RTX && !TARGET_FUSE_ALU_AND_BRANCH)
714 : return false;
715 :
716 71851 : if (compare_set == NULL_RTX)
717 : {
718 71851 : int i;
719 71851 : rtx pat = PATTERN (condgen);
720 215553 : for (i = 0; i < XVECLEN (pat, 0); i++)
721 143702 : if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
722 : {
723 143702 : rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
724 143702 : if (GET_CODE (set_src) == COMPARE)
725 : compare_set = XVECEXP (pat, 0, i);
726 : else
727 76238 : alu_set = XVECEXP (pat, 0, i);
728 : }
729 : /* We may also have generated an ALU instruction only to set the
730 : flags. In this case there will be a clobber. */
731 0 : else if (GET_CODE (XVECEXP (pat, 0, i)) == CLOBBER
732 0 : && GENERAL_REG_P (XEXP (XVECEXP (pat, 0, i), 0)))
733 : alu_clobber = XVECEXP (pat, 0, i);
734 : }
735 71851 : if (compare_set == NULL_RTX)
736 : return false;
737 12553056 : src = SET_SRC (compare_set);
738 12553056 : if (GET_CODE (src) != COMPARE)
739 : return false;
740 :
741 : /* Check for memory operand. */
742 12544417 : if (MEM_P (XEXP (src, 0)))
743 1940464 : addr = XEXP (XEXP (src, 0), 0);
744 10603953 : else if (MEM_P (XEXP (src, 1)))
745 787430 : addr = XEXP (XEXP (src, 1), 0);
746 : /* Some CPUs, e.g. tigerlake and cooperlake, do not fuse
747 : ALU insns with a memory operand. */
748 2727894 : if (addr && !TARGET_FUSE_ALU_AND_BRANCH_MEM)
749 : return false;
750 12543821 : if (CONST_INT_P (XEXP (src, 0)))
751 : imm = XEXP (src, 0);
752 12543821 : else if (CONST_INT_P (XEXP (src, 1)))
753 : imm = XEXP (src, 1);
754 : /* Check that the instruction really has an immediate.
755 : In particular, compare with 0 is done using test, with no immediate. */
756 8258784 : if (imm && !get_attr_length_immediate (condgen))
757 : imm = NULL;
758 : /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
759 : supported. */
760 12543821 : if (addr && imm && !TARGET_FUSE_ALU_AND_BRANCH_MEM_IMM)
761 : return false;
762 :
763 : /* No fusion for RIP-relative address. */
764 12543497 : if (addr && !TARGET_FUSE_ALU_AND_BRANCH_RIP_RELATIVE)
765 : {
766 2726974 : ix86_address parts;
767 2726974 : int ok = ix86_decompose_address (addr, &parts);
768 2726974 : gcc_assert (ok);
769 :
770 2726974 : if (ix86_rip_relative_addr_p (&parts))
771 402739 : return false;
772 : }
773 : /* Znver5 supports fusion with reg/reg, reg/imm and
774 : reg/mem forms. It is also supported when the instruction has an
775 : immediate and a displacement that meet the criteria of a 4-byte displacement
776 : and a 2-byte immediate, or a 2-byte displacement and a 4-byte
777 : immediate. We do not know the displacement size, so we ignore this
778 : limitation. */
779 :
780 9816523 : handle_stack_protect_test:
781 12141176 : test_if = SET_SRC (pc_set (condjmp));
782 12141176 : cond = XEXP (test_if, 0);
783 12141176 : ccode = GET_CODE (cond);
784 : /* Check whether the conditional jump uses the Sign or Overflow flags. */
785 12141176 : if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
786 1460 : && (ccode == GE || ccode == GT || ccode == LE || ccode == LT))
787 : return false;
788 :
789 : /* Return true for TYPE_TEST and TYPE_ICMP. */
790 12141070 : if (condgen_type == TYPE_TEST || condgen_type == TYPE_ICMP)
791 : return true;
792 :
793 : /* The following handles the macro-fusion case for alu + jmp. */
794 191995 : if (!TARGET_FUSE_ALU_AND_BRANCH || (!alu_set && !alu_clobber))
795 : return false;
796 :
797 : /* No fusion for alu op with memory destination operand. */
798 67882 : if (alu_set && MEM_P (SET_DEST (alu_set)))
799 : return false;
800 :
801 :
802 : /* Macro-fusion for inc/dec + unsigned conditional jump is not
803 : supported on some CPUs but is supported on others (znver5 and core_avx512).
804 : We, however, never generate it, so we do not need a specific tune for it. */
805 64394 : gcc_checking_assert (!(condgen_type == TYPE_INCDEC
806 : && (ccode == GEU || ccode == GTU || ccode == LEU || ccode == LTU)));
807 :
808 : return true;
809 : }