Line data Source code
1 : /* Scheduler hooks for IA-32 which implement CPU specific logic.
2 : Copyright (C) 1988-2026 Free Software Foundation, Inc.
3 :
4 : This file is part of GCC.
5 :
6 : GCC is free software; you can redistribute it and/or modify
7 : it under the terms of the GNU General Public License as published by
8 : the Free Software Foundation; either version 3, or (at your option)
9 : any later version.
10 :
11 : GCC is distributed in the hope that it will be useful,
12 : but WITHOUT ANY WARRANTY; without even the implied warranty of
13 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 : GNU General Public License for more details.
15 :
16 : You should have received a copy of the GNU General Public License
17 : along with GCC; see the file COPYING3. If not see
18 : <http://www.gnu.org/licenses/>. */
19 :
20 : #define IN_TARGET_CODE 1
21 :
22 : #include "config.h"
23 : #include "system.h"
24 : #include "coretypes.h"
25 : #include "backend.h"
26 : #include "rtl.h"
27 : #include "tree.h"
28 : #include "cfghooks.h"
29 : #include "tm_p.h"
30 : #include "target.h"
31 : #include "insn-config.h"
32 : #include "insn-attr.h"
33 : #include "insn-opinit.h"
34 : #include "recog.h"
35 : #include "tm-constrs.h"
36 :
/* Return the maximum number of instructions a cpu can issue.
   This feeds the scheduler's issue_rate hook; the default of 1 is
   used for any tuning target not listed explicitly.  */

int
ix86_issue_rate (void)
{
  switch (ix86_tune)
    {
    /* Two-wide cores (in-order Atoms, K6, early NetBurst, ...).  */
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
    case PROCESSOR_BONNELL:
    case PROCESSOR_SILVERMONT:
    case PROCESSOR_K6:
    case PROCESSOR_BTVER2:
    case PROCESSOR_PENTIUM4:
    case PROCESSOR_NOCONA:
      return 2;

    /* Three-wide cores.  */
    case PROCESSOR_PENTIUMPRO:
    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BTVER1:
    case PROCESSOR_LUJIAZUI:
      return 3;

    /* Four-wide cores; also the conservative choice for generic tuning.  */
    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
    case PROCESSOR_ZNVER4:
    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_TREMONT:
    case PROCESSOR_SKYLAKE:
    case PROCESSOR_SKYLAKE_AVX512:
    case PROCESSOR_CASCADELAKE:
    case PROCESSOR_CANNONLAKE:
    case PROCESSOR_ALDERLAKE:
    case PROCESSOR_YONGFENG:
    case PROCESSOR_SHIJIDADAO:
    case PROCESSOR_SIERRAFOREST:
    case PROCESSOR_INTEL:
    case PROCESSOR_GENERIC:
    /* For znver5 decoder can handle 4 or 8 instructions per cycle,
       op cache 12 instruction/cycle, dispatch 8 instructions
       integer rename 8 instructions and Fp 6 instructions.

       The scheduler, without understanding out of order nature of the CPU
       is not going to be able to use more than 4 instructions since that
       is limits of the decoders.  */
    case PROCESSOR_ZNVER5:
    case PROCESSOR_ZNVER6:
      return 4;

    case PROCESSOR_ICELAKE_CLIENT:
    case PROCESSOR_ICELAKE_SERVER:
    case PROCESSOR_TIGERLAKE:
    case PROCESSOR_COOPERLAKE:
    case PROCESSOR_ROCKETLAKE:
      return 5;

    case PROCESSOR_SAPPHIRERAPIDS:
    case PROCESSOR_GRANITERAPIDS:
    case PROCESSOR_GRANITERAPIDS_D:
    case PROCESSOR_DIAMONDRAPIDS:
    case PROCESSOR_GRANDRIDGE:
    case PROCESSOR_CLEARWATERFOREST:
    case PROCESSOR_ARROWLAKE:
    case PROCESSOR_ARROWLAKE_S:
    case PROCESSOR_PANTHERLAKE:
      return 6;

    case PROCESSOR_NOVALAKE:
      return 8;

    default:
      return 1;
    }
}
121 :
122 : /* Return true iff USE_INSN has a memory address with operands set by
123 : SET_INSN. */
124 :
125 : bool
126 9931236 : ix86_agi_dependent (rtx_insn *set_insn, rtx_insn *use_insn)
127 : {
128 9931236 : int i;
129 9931236 : extract_insn_cached (use_insn);
130 12196440 : for (i = recog_data.n_operands - 1; i >= 0; --i)
131 11840376 : if (MEM_P (recog_data.operand[i]))
132 : {
133 9575172 : rtx addr = XEXP (recog_data.operand[i], 0);
134 9575172 : if (modified_in_p (addr, set_insn) != 0)
135 : {
136 : /* No AGI stall if SET_INSN is a push or pop and USE_INSN
137 : has SP based memory (unless index reg is modified in a pop). */
138 4319287 : rtx set = single_set (set_insn);
139 4319287 : if (set
140 4319287 : && (push_operand (SET_DEST (set), GET_MODE (SET_DEST (set)))
141 3541815 : || pop_operand (SET_SRC (set), GET_MODE (SET_SRC (set)))))
142 : {
143 590879 : struct ix86_address parts;
144 590879 : if (ix86_decompose_address (addr, &parts)
145 590879 : && parts.base == stack_pointer_rtx
146 1181582 : && (parts.index == NULL_RTX
147 481 : || MEM_P (SET_DEST (set))
148 2 : || !modified_in_p (parts.index, set_insn)))
149 590702 : return false;
150 : }
151 3728585 : return true;
152 : }
153 : return false;
154 : }
155 : return false;
156 : }
157 :
158 : /* A subroutine of ix86_adjust_cost -- return TRUE iff INSN reads flags set
159 : by DEP_INSN and nothing set by DEP_INSN. */
160 :
161 : static bool
162 0 : ix86_flags_dependent (rtx_insn *insn, rtx_insn *dep_insn, enum attr_type insn_type)
163 : {
164 0 : rtx set, set2;
165 :
166 : /* Simplify the test for uninteresting insns. */
167 0 : if (insn_type != TYPE_SETCC
168 0 : && insn_type != TYPE_ICMOV
169 0 : && insn_type != TYPE_FCMOV
170 0 : && insn_type != TYPE_IBR)
171 : return false;
172 :
173 0 : if ((set = single_set (dep_insn)) != 0)
174 : {
175 0 : set = SET_DEST (set);
176 0 : set2 = NULL_RTX;
177 : }
178 0 : else if (GET_CODE (PATTERN (dep_insn)) == PARALLEL
179 0 : && XVECLEN (PATTERN (dep_insn), 0) == 2
180 0 : && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 0)) == SET
181 0 : && GET_CODE (XVECEXP (PATTERN (dep_insn), 0, 1)) == SET)
182 : {
183 0 : set = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
184 0 : set2 = SET_DEST (XVECEXP (PATTERN (dep_insn), 0, 0));
185 : }
186 : else
187 : return false;
188 :
189 0 : if (!REG_P (set) || REGNO (set) != FLAGS_REG)
190 : return false;
191 :
192 : /* This test is true if the dependent insn reads the flags but
193 : not any other potentially set register. */
194 0 : if (!reg_overlap_mentioned_p (set, PATTERN (insn)))
195 : return false;
196 :
197 0 : if (set2 && reg_overlap_mentioned_p (set2, PATTERN (insn)))
198 : return false;
199 :
200 : return true;
201 : }
202 :
203 : /* Helper function for exact_store_load_dependency.
204 : Return true if addr is found in insn. */
205 : static bool
206 0 : exact_dependency_1 (rtx addr, rtx insn)
207 : {
208 0 : enum rtx_code code;
209 0 : const char *format_ptr;
210 0 : int i, j;
211 :
212 0 : code = GET_CODE (insn);
213 0 : switch (code)
214 : {
215 0 : case MEM:
216 0 : if (rtx_equal_p (addr, insn))
217 : return true;
218 : break;
219 : case REG:
220 : CASE_CONST_ANY:
221 : case SYMBOL_REF:
222 : case CODE_LABEL:
223 : case PC:
224 : case EXPR_LIST:
225 : return false;
226 : default:
227 : break;
228 : }
229 :
230 0 : format_ptr = GET_RTX_FORMAT (code);
231 0 : for (i = 0; i < GET_RTX_LENGTH (code); i++)
232 : {
233 0 : switch (*format_ptr++)
234 : {
235 0 : case 'e':
236 0 : if (exact_dependency_1 (addr, XEXP (insn, i)))
237 : return true;
238 : break;
239 : case 'E':
240 0 : for (j = 0; j < XVECLEN (insn, i); j++)
241 0 : if (exact_dependency_1 (addr, XVECEXP (insn, i, j)))
242 : return true;
243 : break;
244 : }
245 : }
246 : return false;
247 : }
248 :
249 : /* Return true if there exists exact dependency for store & load, i.e.
250 : the same memory address is used in them. */
251 : static bool
252 0 : exact_store_load_dependency (rtx_insn *store, rtx_insn *load)
253 : {
254 0 : rtx set1, set2;
255 :
256 0 : set1 = single_set (store);
257 0 : if (!set1)
258 : return false;
259 0 : if (!MEM_P (SET_DEST (set1)))
260 : return false;
261 0 : set2 = single_set (load);
262 0 : if (!set2)
263 : return false;
264 0 : if (exact_dependency_1 (SET_DEST (set1), SET_SRC (set2)))
265 : return true;
266 : return false;
267 : }
268 :
269 :
/* This function corrects the value of COST (latency) based on the relationship
   between INSN and DEP_INSN through a dependence of type DEP_TYPE, and strength
   DW.  It should return the new value.

   On x86 CPUs this is most commonly used to model the fact that values of
   registers used to compute address of memory operand needs to be ready
   earlier than values of registers used in the actual operation.  */

int
ix86_adjust_cost (rtx_insn *insn, int dep_type, rtx_insn *dep_insn, int cost,
		  unsigned int)
{
  enum attr_type insn_type, dep_insn_type;
  enum attr_memory memory;
  rtx set, set2;
  int dep_insn_code_number;

  /* Anti and output dependencies have zero cost on all CPUs.  */
  if (dep_type != 0)
    return 0;

  dep_insn_code_number = recog_memoized (dep_insn);

  /* If we can't recognize the insns, we can't really do anything.  */
  if (dep_insn_code_number < 0 || recog_memoized (insn) < 0)
    return cost;

  insn_type = get_attr_type (insn);
  dep_insn_type = get_attr_type (dep_insn);

  switch (ix86_tune)
    {
    case PROCESSOR_PENTIUM:
    case PROCESSOR_LAKEMONT:
      /* Address Generation Interlock adds a cycle of latency.  */
      if (insn_type == TYPE_LEA)
	{
	  /* LEA computes in the AGU, so check the address expression
	     (the SET_SRC) directly rather than the memory operands.  */
	  rtx addr = PATTERN (insn);

	  if (GET_CODE (addr) == PARALLEL)
	    addr = XVECEXP (addr, 0, 0);

	  gcc_assert (GET_CODE (addr) == SET);

	  addr = SET_SRC (addr);
	  if (modified_in_p (addr, dep_insn))
	    cost += 1;
	}
      else if (ix86_agi_dependent (dep_insn, insn))
	cost += 1;

      /* ??? Compares pair with jump/setcc.  */
      if (ix86_flags_dependent (insn, dep_insn, insn_type))
	cost = 0;

      /* Floating point stores require value to be ready one cycle earlier.  */
      if (insn_type == TYPE_FMOV
	  && get_attr_memory (insn) == MEMORY_STORE
	  && !ix86_agi_dependent (dep_insn, insn))
	cost += 1;
      break;

    case PROCESSOR_PENTIUMPRO:
      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
	cost += 5;

      /* There is one cycle extra latency between an FP op and a store.  */
      if (insn_type == TYPE_FMOV
	  && (set = single_set (dep_insn)) != NULL_RTX
	  && (set2 = single_set (insn)) != NULL_RTX
	  && rtx_equal_p (SET_DEST (set), SET_SRC (set2))
	  && MEM_P (SET_DEST (set2)))
	cost += 1;

      memory = get_attr_memory (insn);

      /* Show ability of reorder buffer to hide latency of load by executing
	 in parallel with previous instruction in case
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  /* Claim moves to take one cycle, as core can issue one load
	     at time and the next load can start cycle later.  */
	  if (dep_insn_type == TYPE_IMOV
	      || dep_insn_type == TYPE_FMOV)
	    cost = 1;
	  else if (cost > 1)
	    cost--;
	}
      break;

    case PROCESSOR_K6:
      /* The esp dependency is resolved before
	 the instruction is really finished.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 1;

      /* INT->FP conversion is expensive.  */
      if (get_attr_fp_int_src (dep_insn))
	cost += 5;

      memory = get_attr_memory (insn);

      /* Show ability of reorder buffer to hide latency of load by executing
	 in parallel with previous instruction in case
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  /* Claim moves to take one cycle, as core can issue one load
	     at time and the next load can start cycle later.  */
	  if (dep_insn_type == TYPE_IMOV
	      || dep_insn_type == TYPE_FMOV)
	    cost = 1;
	  else if (cost > 2)
	    cost -= 2;
	  else
	    cost = 1;
	}
      break;

    case PROCESSOR_AMDFAM10:
    case PROCESSOR_BDVER1:
    case PROCESSOR_BDVER2:
    case PROCESSOR_BDVER3:
    case PROCESSOR_BDVER4:
    case PROCESSOR_BTVER1:
    case PROCESSOR_BTVER2:
      /* Stack engine allows to execute push&pop instructions in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;
      /* FALLTHRU */

    case PROCESSOR_ATHLON:
    case PROCESSOR_K8:
      memory = get_attr_memory (insn);

      /* Show ability of reorder buffer to hide latency of load by executing
	 in parallel with previous instruction in case
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  enum attr_unit unit = get_attr_unit (insn);
	  int loadcost = 3;

	  /* Because of the difference between the length of integer and
	     floating unit pipeline preparation stages, the memory operands
	     for floating point are cheaper.

	     ??? For Athlon it the difference is most probably 2.  */
	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
	    loadcost = 3;
	  else
	    loadcost = TARGET_CPU_P (ATHLON) ? 2 : 0;

	  if (cost >= loadcost)
	    cost -= loadcost;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_ZNVER1:
    case PROCESSOR_ZNVER2:
    case PROCESSOR_ZNVER3:
    case PROCESSOR_ZNVER4:
    case PROCESSOR_ZNVER5:
    case PROCESSOR_ZNVER6:
      /* Stack engine allows to execute push&pop instructions in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;

      memory = get_attr_memory (insn);

      /* Show ability of reorder buffer to hide latency of load by executing
	 in parallel with previous instruction in case
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  enum attr_unit unit = get_attr_unit (insn);
	  int loadcost;

	  /* TODO: On znver5 complex addressing modes have
	     greater latency.  */
	  if (unit == UNIT_INTEGER || unit == UNIT_UNKNOWN)
	    loadcost = 4;
	  else
	    loadcost = 7;

	  if (cost >= loadcost)
	    cost -= loadcost;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_YONGFENG:
    case PROCESSOR_SHIJIDADAO:
      /* Stack engine allows to execute push&pop instructions in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;
      /* FALLTHRU */

    case PROCESSOR_LUJIAZUI:
      memory = get_attr_memory (insn);

      /* Show ability of reorder buffer to hide latency of load by executing
	 in parallel with previous instruction in case
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  int loadcost = 4;

	  if (cost >= loadcost)
	    cost -= loadcost;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_CORE2:
    case PROCESSOR_NEHALEM:
    case PROCESSOR_SANDYBRIDGE:
    case PROCESSOR_HASWELL:
    case PROCESSOR_TREMONT:
    case PROCESSOR_ALDERLAKE:
    case PROCESSOR_INTEL:
    case PROCESSOR_GENERIC:
      /* Stack engine allows to execute push&pop instructions in parallel.  */
      if ((insn_type == TYPE_PUSH || insn_type == TYPE_POP)
	  && (dep_insn_type == TYPE_PUSH || dep_insn_type == TYPE_POP))
	return 0;

      memory = get_attr_memory (insn);

      /* Show ability of reorder buffer to hide latency of load by executing
	 in parallel with previous instruction in case
	 previous instruction is not needed to compute the address.  */
      if ((memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	  && !ix86_agi_dependent (dep_insn, insn))
	{
	  if (cost >= 4)
	    cost -= 4;
	  else
	    cost = 0;
	}
      break;

    case PROCESSOR_SILVERMONT:
      if (!reload_completed)
	return cost;

      /* Increase cost of integer loads.  */
      memory = get_attr_memory (dep_insn);
      if (memory == MEMORY_LOAD || memory == MEMORY_BOTH)
	{
	  enum attr_unit unit = get_attr_unit (dep_insn);
	  if (unit == UNIT_INTEGER && cost == 1)
	    {
	      if (memory == MEMORY_LOAD)
		cost = 3;
	      else
		{
		  /* Increase cost of ld/st for short int types only
		     because of store forwarding issue.  */
		  rtx set = single_set (dep_insn);
		  if (set && (GET_MODE (SET_DEST (set)) == QImode
			      || GET_MODE (SET_DEST (set)) == HImode))
		    {
		      /* Increase cost of store/load insn if exact
			 dependence exists and it is load insn.  */
		      enum attr_memory insn_memory = get_attr_memory (insn);
		      if (insn_memory == MEMORY_LOAD
			  && exact_store_load_dependency (dep_insn, insn))
			cost = 3;
		    }
		}
	    }
	}
      /* FALLTHRU -- the default case only breaks, so falling through
	 is harmless here.  */

    default:
      break;
    }

  return cost;
}
565 :
566 : /* How many alternative schedules to try. This should be as wide as the
567 : scheduling freedom in the DFA, but no wider. Making this value too
568 : large results extra work for the scheduler. */
569 :
570 : int
571 964423 : ia32_multipass_dfa_lookahead (void)
572 : {
573 : /* Generally, we want haifa-sched:max_issue() to look ahead as far
574 : as many instructions can be executed on a cycle, i.e.,
575 : issue_rate. */
576 964423 : if (reload_completed)
577 963984 : return ix86_issue_rate ();
578 : /* Don't use lookahead for pre-reload schedule to save compile time. */
579 : return 0;
580 : }
581 :
582 : /* Return true if target platform supports macro-fusion. */
583 :
584 : bool
585 108634557 : ix86_macro_fusion_p ()
586 : {
587 108634557 : return TARGET_FUSE_CMP_AND_BRANCH;
588 : }
589 :
/* Check whether MOV is a reg-reg move and ALU is an
   ALU operation that allows macro-op fusion.  Used to model CPUs that
   can fuse a register move with a dependent ALU instruction.  */

static bool
ix86_fuse_mov_alu_p (rtx_insn *mov, rtx_insn *alu)
{
  /* Validate mov:
      - It should be reg-reg move with opcode 0x89 or 0x8B.  */
  rtx set1 = PATTERN (mov);
  if (GET_CODE (set1) != SET
      || !GENERAL_REG_P (SET_SRC (set1))
      || !GENERAL_REG_P (SET_DEST (set1)))
    return false;
  rtx reg = SET_DEST (set1);
  /*  - it should have 0x89 or 0x8B opcode (16/32/64-bit integer moves;
	byte moves use a different opcode and do not fuse).  */
  if (!INTEGRAL_MODE_P (GET_MODE (reg))
      || GET_MODE_SIZE (GET_MODE (reg)) < 2
      || GET_MODE_SIZE (GET_MODE (reg)) > 8)
    return false;
  /* Validate ALU: x86 ALU insns are PARALLELs of the operation and a
     flags clobber/set.  */
  if (GET_CODE (PATTERN (alu)) != PARALLEL)
    return false;
  rtx set2 = XVECEXP (PATTERN (alu), 0, 0);
  if (GET_CODE (set2) != SET)
    return false;
  /* If this is instruction setting both compare and normal
     register, the first set always sets flags, while
     second set writes to the output operand.  Pick
     the second set.  */
  if (GET_CODE (SET_SRC (set2)) == COMPARE)
    {
      set2 = XVECEXP (PATTERN (alu), 0, 1);
      if (GET_CODE (set2) != SET)
	return false;
    }
  /* Match one of:
     ADD ADC AND XOR OR SUB SBB INC DEC NOT SAL SHL SHR SAR
     We also may add insn attribute to handle some of sporadic
     case we output those with different RTX expressions.  */

  if (GET_CODE (SET_SRC (set2)) != PLUS
      && GET_CODE (SET_SRC (set2)) != MINUS
      && GET_CODE (SET_SRC (set2)) != XOR
      && GET_CODE (SET_SRC (set2)) != AND
      && GET_CODE (SET_SRC (set2)) != IOR
      && GET_CODE (SET_SRC (set2)) != NOT
      && GET_CODE (SET_SRC (set2)) != ASHIFT
      && GET_CODE (SET_SRC (set2)) != ASHIFTRT
      && GET_CODE (SET_SRC (set2)) != LSHIFTRT)
    return false;
  /* NOT is unary, so it has no second operand.  */
  rtx op0 = XEXP (SET_SRC (set2), 0);
  rtx op1 = GET_CODE (SET_SRC (set2)) != NOT ? XEXP (SET_SRC (set2), 1) : NULL;
  /* One of operands should be register.  */
  if (op1 && (!REG_P (op0) || REGNO (op0) != REGNO (reg)))
    std::swap (op0, op1);
  if (!REG_P (op0) || REGNO (op0) != REGNO (reg))
    return false;
  if (op1
      && !REG_P (op1)
      && !x86_64_immediate_operand (op1, VOIDmode))
    return false;
  /* Only one of two parameters must be move destination.  */
  if (op1 && REG_P (op1) && REGNO (op1) == REGNO (reg))
    return false;
  return true;
}
656 :
/* Check whether current microarchitecture support macro fusion
   for insn pair "CONDGEN + CONDJMP".  Refer to
   "Intel Architectures Optimization Reference Manual".  */

bool
ix86_macro_fusion_pair_p (rtx_insn *condgen, rtx_insn *condjmp)
{
  /* mov + ALU fusion is checked first; it is independent of the
     compare-and-branch logic below.  */
  if (TARGET_FUSE_MOV_AND_ALU
      && ix86_fuse_mov_alu_p (condgen, condjmp))
    return true;
  rtx src, imm = NULL_RTX;
  enum rtx_code ccode;
  rtx compare_set = NULL_RTX, test_if, cond;
  rtx alu_set = NULL_RTX, addr = NULL_RTX;
  rtx alu_clobber = NULL_RTX;
  enum attr_type condgen_type;

  if (!any_condjump_p (condjmp))
    return false;

  /* CONDGEN must actually produce the condition code register that
     CONDJMP consumes.  */
  unsigned int condreg1, condreg2;
  rtx cc_reg_1;
  targetm.fixed_condition_code_regs (&condreg1, &condreg2);
  cc_reg_1 = gen_rtx_REG (CCmode, condreg1);
  if (!reg_referenced_p (cc_reg_1, PATTERN (condjmp))
      || !condgen
      || !modified_in_p (cc_reg_1, condgen))
    return false;

  condgen_type = get_attr_type (condgen);
  if (condgen_type == TYPE_MULTI
      && INSN_CODE (condgen) == code_for_stack_protect_test_1 (ptr_mode)
      && TARGET_FUSE_ALU_AND_BRANCH)
    {
      /* stack_protect_test_<mode> ends with a sub, which subtracts
	 a non-rip special memory operand from a GPR.  */
      src = NULL_RTX;
      alu_set = XVECEXP (PATTERN (condgen), 0, 1);
      goto handle_stack_protect_test;
    }
  /* ??? zen5 can fuse cmp, test, sub, add, inc, dec, or, and xor.
     Cores can not fuse or and xor which will pass the test below
     since type is ALU.  */
  else if (condgen_type != TYPE_TEST
	   && condgen_type != TYPE_ICMP
	   && condgen_type != TYPE_INCDEC
	   && condgen_type != TYPE_ALU)
    return false;

  compare_set = single_set (condgen);
  if (compare_set == NULL_RTX && !TARGET_FUSE_ALU_AND_BRANCH)
    return false;

  /* CONDGEN is a PARALLEL (e.g. an ALU op that both writes a result and
     sets flags); dig out the COMPARE set and the ALU set separately.  */
  if (compare_set == NULL_RTX)
    {
      int i;
      rtx pat = PATTERN (condgen);
      for (i = 0; i < XVECLEN (pat, 0); i++)
	if (GET_CODE (XVECEXP (pat, 0, i)) == SET)
	  {
	    rtx set_src = SET_SRC (XVECEXP (pat, 0, i));
	    if (GET_CODE (set_src) == COMPARE)
	      compare_set = XVECEXP (pat, 0, i);
	    else
	      alu_set = XVECEXP (pat, 0, i);
	  }
	/* We also possibly generated ALU instruction only to set
	   flags.  In this case there will be clobber.  */
	else if (GET_CODE (XVECEXP (pat, 0, i)) == CLOBBER
		 && GENERAL_REG_P (XEXP (XVECEXP (pat, 0, i), 0)))
	  alu_clobber = XVECEXP (pat, 0, i);
    }
  if (compare_set == NULL_RTX)
    return false;
  src = SET_SRC (compare_set);
  if (GET_CODE (src) != COMPARE)
    return false;

  /* Check for memory operand.  */
  if (MEM_P (XEXP (src, 0)))
    addr = XEXP (XEXP (src, 0), 0);
  else if (MEM_P (XEXP (src, 1)))
    addr = XEXP (XEXP (src, 1), 0);
  /* Some CPUs, i.e. tigerlake and cooperlake does not fuse
     ALU with memory operand.  */
  if (addr && !TARGET_FUSE_ALU_AND_BRANCH_MEM)
    return false;
  if (CONST_INT_P (XEXP (src, 0)))
    imm = XEXP (src, 0);
  else if (CONST_INT_P (XEXP (src, 1)))
    imm = XEXP (src, 1);
  /* Check that the instruction really has immediate.
     In particular compare with 0 is done using test with no immediate.  */
  if (imm && !get_attr_length_immediate (condgen))
    imm = NULL;
  /* Macro-fusion for cmp/test MEM-IMM + conditional jmp is not
     supported.  */
  if (addr && imm && !TARGET_FUSE_ALU_AND_BRANCH_MEM_IMM)
    return false;

  /* No fusion for RIP-relative address.  */
  if (addr && !TARGET_FUSE_ALU_AND_BRANCH_RIP_RELATIVE)
    {
      ix86_address parts;
      int ok = ix86_decompose_address (addr, &parts);
      gcc_assert (ok);

      if (ix86_rip_relative_addr_p (&parts))
	return false;
    }
  /* Znver5 supports fusion with their reg/reg, reg/imm and
     reg/mem forms.  They are also supported when the instruction has an
     immediate and displacement that meets the criteria of 4 byte displacement
     and 2 byte immediate or the case of 2 byte displacement and 4 byte
     immediate.  We do not know the displacement size, so we ignore this
     limitation.  */

handle_stack_protect_test:
  test_if = SET_SRC (pc_set (condjmp));
  cond = XEXP (test_if, 0);
  ccode = GET_CODE (cond);
  /* Check whether conditional jump use Sign or Overflow Flags.  */
  if (!TARGET_FUSE_CMP_AND_BRANCH_SOFLAGS
      && (ccode == GE || ccode == GT || ccode == LE || ccode == LT))
    return false;

  /* Return true for TYPE_TEST and TYPE_ICMP.  */
  if (condgen_type == TYPE_TEST || condgen_type == TYPE_ICMP)
    return true;

  /* The following is the case that macro-fusion for alu + jmp.  */
  if (!TARGET_FUSE_ALU_AND_BRANCH || (!alu_set && !alu_clobber))
    return false;

  /* No fusion for alu op with memory destination operand.  */
  if (alu_set && MEM_P (SET_DEST (alu_set)))
    return false;


  /* Macro-fusion for inc/dec + unsigned conditional jump is not
     supported on some CPUs while supported on others (znver5 and core_avx512).
     We however never generate it, so we do not need a specific tune for it.  */
  gcc_checking_assert (!(condgen_type == TYPE_INCDEC
			 && (ccode == GEU || ccode == GTU || ccode == LEU || ccode == LTU)));

  return true;
}
|