Line data Source code
1 : /* Perform doloop optimizations
2 : Copyright (C) 2004-2026 Free Software Foundation, Inc.
3 : Based on code by Michael P. Hayes (m.hayes@elec.canterbury.ac.nz)
4 :
5 : This file is part of GCC.
6 :
7 : GCC is free software; you can redistribute it and/or modify it under
8 : the terms of the GNU General Public License as published by the Free
9 : Software Foundation; either version 3, or (at your option) any later
10 : version.
11 :
12 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
13 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
14 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
15 : for more details.
16 :
17 : You should have received a copy of the GNU General Public License
18 : along with GCC; see the file COPYING3. If not see
19 : <http://www.gnu.org/licenses/>. */
20 :
21 : #include "config.h"
22 : #include "system.h"
23 : #include "coretypes.h"
24 : #include "backend.h"
25 : #include "target.h"
26 : #include "rtl.h"
27 : #include "tree.h"
28 : #include "cfghooks.h"
29 : #include "memmodel.h"
30 : #include "emit-rtl.h"
31 : #include "dojump.h"
32 : #include "expr.h"
33 : #include "cfgloop.h"
34 : #include "cfgrtl.h"
35 : #include "dumpfile.h"
36 : #include "loop-unroll.h"
37 : #include "regs.h"
38 : #include "df.h"
39 : #include "targhooks.h"
40 :
41 : /* This module is used to modify loops with a determinable number of
42 : iterations to use special low-overhead looping instructions.
43 :
44 : It first validates whether the loop is well behaved and has a
45 : determinable number of iterations (either at compile or run-time).
46 : It then modifies the loop to use a low-overhead looping pattern as
47 : follows:
48 :
49 : 1. A pseudo register is allocated as the loop iteration counter.
50 :
51 : 2. The number of loop iterations is calculated and is stored
52 : in the loop counter.
53 :
54 : 3. At the end of the loop, the jump insn is replaced by the
55 : doloop_end pattern. The compare must remain because it might be
56 : used elsewhere. If the loop-variable or condition register are
57 : used elsewhere, they will be eliminated by flow.
58 :
59 : 4. An optional doloop_begin pattern is inserted at the top of the
60 : loop.
61 :
62 : TODO The optimization should only performed when either the biv used for exit
63 : condition is unused at all except for the exit test, or if we do not have to
64 : change its value, since otherwise we have to add a new induction variable,
65 : which usually will not pay up (unless the cost of the doloop pattern is
66 : somehow extremely lower than the cost of compare & jump, or unless the bct
67 : register cannot be used for anything else but doloop -- ??? detect these
68 : cases). */
69 :
70 : /* Return the loop termination condition for PATTERN or zero
71 : if it is not a decrement and branch jump insn. */
72 :
73 : rtx
74 0 : doloop_condition_get (rtx_insn *doloop_pat)
75 : {
76 0 : rtx cmp;
77 0 : rtx inc;
78 0 : rtx reg;
79 0 : rtx inc_src;
80 0 : rtx condition;
81 0 : rtx pattern;
82 0 : rtx cc_reg = NULL_RTX;
83 0 : rtx reg_orig = NULL_RTX;
84 :
85 : /* The canonical doloop pattern we expect has one of the following
86 : forms:
87 :
88 : 1) (parallel [(set (pc) (if_then_else (condition)
89 : (label_ref (label))
90 : (pc)))
91 : (set (reg) (plus (reg) (const_int -1)))
92 : (additional clobbers and uses)])
93 :
94 : The branch must be the first entry of the parallel (also required
95 : by jump.cc), and the second entry of the parallel must be a set of
96 : the loop counter register. Some targets (IA-64) wrap the set of
97 : the loop counter in an if_then_else too.
98 :
99 : 2) (set (reg) (plus (reg) (const_int -1))
100 : (set (pc) (if_then_else (reg != 0)
101 : (label_ref (label))
102 : (pc))).
103 :
104 : 3) Some targets (Arm) do the comparison before the branch, as in the
105 : following form:
106 :
107 : (parallel [(set (cc) (compare (plus (reg) (const_int -1)) 0))
108 : (set (reg) (plus (reg) (const_int -1)))])
109 : (set (pc) (if_then_else (cc == NE)
110 : (label_ref (label))
111 : (pc)))
112 :
113 : 4) This form supports a construct that is used to represent a vectorized
114 : do loop with predication, however we do not need to care about the
115 : details of the predication here.
116 : Arm uses this construct to support MVE tail predication.
117 :
118 : (parallel
119 : [(set (pc)
120 : (if_then_else (gtu (plus (reg) (const_int -n))
121 : (const_int n-1))
122 : (label_ref)
123 : (pc)))
124 : (set (reg) (plus (reg) (const_int -n)))
125 : (additional clobbers and uses)])
126 : */
127 0 : pattern = PATTERN (doloop_pat);
128 :
129 0 : if (GET_CODE (pattern) != PARALLEL)
130 : {
131 0 : rtx cond;
132 0 : rtx_insn *prev_insn = prev_nondebug_insn (doloop_pat);
133 0 : rtx cmp_arg1, cmp_arg2;
134 0 : rtx cmp_orig;
135 :
136 : /* In case the pattern is not PARALLEL we expect two forms
137 : of doloop which are cases 2) and 3) above: in case 2) the
138 : decrement immediately precedes the branch, while in case 3)
139 : the compare and decrement instructions immediately precede
140 : the branch. */
141 :
142 0 : if (prev_insn == NULL_RTX || !INSN_P (prev_insn))
143 : return 0;
144 :
145 0 : cmp = pattern;
146 0 : if (GET_CODE (PATTERN (prev_insn)) == PARALLEL)
147 : {
148 : /* The third case: the compare and decrement instructions
149 : immediately precede the branch. */
150 0 : cmp_orig = XVECEXP (PATTERN (prev_insn), 0, 0);
151 0 : if (GET_CODE (cmp_orig) != SET)
152 : return 0;
153 0 : if (GET_CODE (SET_SRC (cmp_orig)) != COMPARE)
154 : return 0;
155 0 : cmp_arg1 = XEXP (SET_SRC (cmp_orig), 0);
156 0 : cmp_arg2 = XEXP (SET_SRC (cmp_orig), 1);
157 0 : if (cmp_arg2 != const0_rtx
158 0 : || GET_CODE (cmp_arg1) != PLUS)
159 : return 0;
160 0 : reg_orig = XEXP (cmp_arg1, 0);
161 0 : if (XEXP (cmp_arg1, 1) != GEN_INT (-1)
162 0 : || !REG_P (reg_orig))
163 : return 0;
164 0 : cc_reg = SET_DEST (cmp_orig);
165 :
166 0 : inc = XVECEXP (PATTERN (prev_insn), 0, 1);
167 : }
168 : else
169 : inc = PATTERN (prev_insn);
170 0 : if (GET_CODE (cmp) == SET && GET_CODE (SET_SRC (cmp)) == IF_THEN_ELSE)
171 : {
172 : /* We expect the condition to be of the form (reg != 0) */
173 0 : cond = XEXP (SET_SRC (cmp), 0);
174 0 : if (GET_CODE (cond) != NE || XEXP (cond, 1) != const0_rtx)
175 : return 0;
176 : }
177 : }
178 : else
179 : {
180 0 : cmp = XVECEXP (pattern, 0, 0);
181 0 : inc = XVECEXP (pattern, 0, 1);
182 : }
183 :
184 : /* Check for (set (reg) (something)). */
185 0 : if (GET_CODE (inc) != SET)
186 : return 0;
187 0 : reg = SET_DEST (inc);
188 0 : if (! REG_P (reg))
189 : return 0;
190 :
191 : /* Check if something = (plus (reg) (const_int -n)).
192 : On IA-64, this decrement is wrapped in an if_then_else. */
193 0 : inc_src = SET_SRC (inc);
194 0 : if (GET_CODE (inc_src) == IF_THEN_ELSE)
195 0 : inc_src = XEXP (inc_src, 1);
196 0 : if (GET_CODE (inc_src) != PLUS
197 0 : || !rtx_equal_p (XEXP (inc_src, 0), reg)
198 0 : || !CONST_INT_P (XEXP (inc_src, 1))
199 0 : || INTVAL (XEXP (inc_src, 1)) >= 0)
200 0 : return 0;
201 0 : int dec_num = -INTVAL (XEXP (inc_src, 1));
202 :
203 : /* Check for (set (pc) (if_then_else (condition)
204 : (label_ref (label))
205 : (pc))). */
206 0 : if (GET_CODE (cmp) != SET
207 0 : || SET_DEST (cmp) != pc_rtx
208 0 : || GET_CODE (SET_SRC (cmp)) != IF_THEN_ELSE
209 0 : || GET_CODE (XEXP (SET_SRC (cmp), 1)) != LABEL_REF
210 0 : || XEXP (SET_SRC (cmp), 2) != pc_rtx)
211 : return 0;
212 :
213 : /* Extract loop termination condition. */
214 0 : condition = XEXP (SET_SRC (cmp), 0);
215 :
216 : /* We expect a GE or NE comparison with 0 or 1, or a GTU comparison with
217 : dec_num - 1. */
218 0 : if (!((GET_CODE (condition) == GE
219 0 : || GET_CODE (condition) == NE)
220 0 : && (XEXP (condition, 1) == const0_rtx
221 0 : || XEXP (condition, 1) == const1_rtx ))
222 0 : &&!(GET_CODE (condition) == GTU
223 0 : && ((INTVAL (XEXP (condition, 1))) == (dec_num - 1))))
224 : return 0;
225 :
226 0 : if (rtx_equal_p (XEXP (condition, 0), reg)
227 : /* For the third case: */
228 0 : || ((cc_reg != NULL_RTX)
229 0 : && (XEXP (condition, 0) == cc_reg)
230 0 : && (rtx_equal_p (reg_orig, reg)))
231 0 : || (GET_CODE (XEXP (condition, 0)) == PLUS
232 0 : && rtx_equal_p (XEXP (XEXP (condition, 0), 0), reg, NULL)))
233 : {
234 0 : if (GET_CODE (pattern) != PARALLEL)
235 : /* For the second form we expect:
236 :
237 : (set (reg) (plus (reg) (const_int -1))
238 : (set (pc) (if_then_else (reg != 0)
239 : (label_ref (label))
240 : (pc))).
241 :
242 : is equivalent to the following:
243 :
244 : (parallel [(set (pc) (if_then_else (reg != 1)
245 : (label_ref (label))
246 : (pc)))
247 : (set (reg) (plus (reg) (const_int -1)))
248 : (additional clobbers and uses)])
249 :
250 : For the third form we expect:
251 :
252 : (parallel [(set (cc) (compare ((plus (reg) (const_int -1)), 0))
253 : (set (reg) (plus (reg) (const_int -1)))])
254 : (set (pc) (if_then_else (cc == NE)
255 : (label_ref (label))
256 : (pc)))
257 :
258 : which is equivalent to the following:
259 :
260 : (parallel [(set (cc) (compare (reg, 1))
261 : (set (reg) (plus (reg) (const_int -1)))
262 : (set (pc) (if_then_else (NE == cc)
263 : (label_ref (label))
264 : (pc))))])
265 :
266 : So we return the second form instead for the two cases.
267 :
268 : */
269 0 : condition = gen_rtx_fmt_ee (NE, VOIDmode, inc_src, const1_rtx);
270 :
271 0 : return condition;
272 : }
273 :
274 : /* ??? If a machine uses a funny comparison, we could return a
275 : canonicalized form here. */
276 :
277 : return 0;
278 : }
279 :
280 : /* Return nonzero if the loop specified by LOOP is suitable for
281 : the use of special low-overhead looping instructions. DESC
282 : describes the number of iterations of the loop. */
283 :
284 : static bool
285 0 : doloop_valid_p (class loop *loop, class niter_desc *desc)
286 : {
287 0 : basic_block *body = get_loop_body (loop), bb;
288 0 : rtx_insn *insn;
289 0 : unsigned i;
290 0 : bool result = true;
291 :
292 : /* Check for loops that may not terminate under special conditions. */
293 0 : if (!desc->simple_p
294 0 : || desc->assumptions
295 0 : || desc->infinite)
296 : {
297 : /* There are some cases that would require a special attention.
298 : For example if the comparison is LEU and the comparison value
299 : is UINT_MAX then the loop will not terminate. Similarly, if the
300 : comparison code is GEU and the comparison value is 0, the
301 : loop will not terminate.
302 :
303 : If the absolute increment is not 1, the loop can be infinite
304 : even with LTU/GTU, e.g. for (i = 3; i > 0; i -= 2)
305 :
306 : ??? We could compute these conditions at run-time and have a
307 : additional jump around the loop to ensure an infinite loop.
308 : However, it is very unlikely that this is the intended
309 : behavior of the loop and checking for these rare boundary
310 : conditions would pessimize all other code.
311 :
312 : If the loop is executed only a few times an extra check to
313 : restart the loop could use up most of the benefits of using a
314 : count register loop. Note however, that normally, this
315 : restart branch would never execute, so it could be predicted
316 : well by the CPU. We should generate the pessimistic code by
317 : default, and have an option, e.g. -funsafe-loops that would
318 : enable count-register loops in this case. */
319 0 : if (dump_file)
320 0 : fprintf (dump_file, "Doloop: Possible infinite iteration case.\n");
321 0 : result = false;
322 0 : goto cleanup;
323 : }
324 :
325 0 : for (i = 0; i < loop->num_nodes; i++)
326 : {
327 0 : bb = body[i];
328 :
329 0 : for (insn = BB_HEAD (bb);
330 0 : insn != NEXT_INSN (BB_END (bb));
331 0 : insn = NEXT_INSN (insn))
332 : {
333 : /* Different targets have different necessities for low-overhead
334 : looping. Call the back end for each instruction within the loop
335 : to let it decide whether the insn prohibits a low-overhead loop.
336 : It will then return the cause for it to emit to the dump file. */
337 0 : const char * invalid = targetm.invalid_within_doloop (insn);
338 0 : if (invalid)
339 : {
340 0 : if (dump_file)
341 0 : fprintf (dump_file, "Doloop: %s\n", invalid);
342 0 : result = false;
343 0 : goto cleanup;
344 : }
345 : }
346 : }
347 : result = true;
348 :
349 0 : cleanup:
350 0 : free (body);
351 :
352 0 : return result;
353 : }
354 :
355 : /* Adds test of COND jumping to DEST on edge *E and set *E to the new fallthru
356 : edge. If the condition is always false, do not do anything. If it is always
357 : true, redirect E to DEST and return false. In all other cases, true is
358 : returned. */
359 :
360 : static bool
361 0 : add_test (rtx cond, edge *e, basic_block dest)
362 : {
363 0 : rtx_insn *seq, *jump;
364 0 : rtx_code_label *label;
365 0 : machine_mode mode;
366 0 : rtx op0 = XEXP (cond, 0), op1 = XEXP (cond, 1);
367 0 : enum rtx_code code = GET_CODE (cond);
368 0 : basic_block bb;
369 : /* The jump is supposed to handle an unlikely special case. */
370 0 : profile_probability prob = profile_probability::guessed_never ();
371 :
372 0 : mode = GET_MODE (XEXP (cond, 0));
373 0 : if (mode == VOIDmode)
374 0 : mode = GET_MODE (XEXP (cond, 1));
375 :
376 0 : start_sequence ();
377 0 : op0 = force_operand (op0, NULL_RTX);
378 0 : op1 = force_operand (op1, NULL_RTX);
379 0 : label = block_label (dest);
380 0 : do_compare_rtx_and_jump (op0, op1, code, 0, mode, NULL_RTX, NULL, label,
381 : prob);
382 :
383 0 : jump = get_last_insn ();
384 0 : if (!jump || !JUMP_P (jump))
385 : {
386 : /* The condition is always false and the jump was optimized out. */
387 0 : end_sequence ();
388 0 : return true;
389 : }
390 :
391 0 : seq = get_insns ();
392 0 : unshare_all_rtl_in_chain (seq);
393 0 : end_sequence ();
394 :
395 : /* There always is at least the jump insn in the sequence. */
396 0 : gcc_assert (seq != NULL_RTX);
397 :
398 0 : bb = split_edge_and_insert (*e, seq);
399 0 : *e = single_succ_edge (bb);
400 :
401 0 : if (any_uncondjump_p (jump) && onlyjump_p (jump))
402 : {
403 : /* The condition is always true. */
404 0 : delete_insn (jump);
405 0 : redirect_edge_and_branch_force (*e, dest);
406 0 : return false;
407 : }
408 :
409 0 : JUMP_LABEL (jump) = label;
410 :
411 0 : LABEL_NUSES (label)++;
412 :
413 0 : edge e2 = make_edge (bb, dest, (*e)->flags & ~EDGE_FALLTHRU);
414 0 : e2->probability = prob;
415 0 : (*e)->probability = prob.invert ();
416 0 : update_br_prob_note (e2->src);
417 0 : return true;
418 : }
419 :
420 : /* Fold (add -1; zero_ext; add +1) operations to zero_ext if not wrapping. i.e:
421 :
422 : 73: r145:SI=r123:DI#0-0x1
423 : 74: r144:DI=zero_extend (r145:SI)
424 : 75: r143:DI=r144:DI+0x1
425 : ...
426 : 31: r135:CC=cmp (r123:DI,0)
427 : 72: {pc={(r143:DI!=0x1)?L70:pc};r143:DI=r143:DI-0x1;...}
428 :
429 : r123:DI#0-0x1 is param count derived from loop->niter_expr equal to number of
430 : loop iterations, if loop iterations expression doesn't overflow, then
431 : (zero_extend (r123:DI#0-1))+1 can be simplified to zero_extend. */
432 :
433 : static rtx
434 0 : doloop_simplify_count (class loop *loop, scalar_int_mode mode, rtx count)
435 : {
436 0 : widest_int iterations;
437 0 : if (GET_CODE (count) == ZERO_EXTEND)
438 : {
439 0 : rtx extop0 = XEXP (count, 0);
440 0 : if (GET_CODE (extop0) == PLUS)
441 : {
442 0 : rtx addop0 = XEXP (extop0, 0);
443 0 : rtx addop1 = XEXP (extop0, 1);
444 :
445 0 : if (get_max_loop_iterations (loop, &iterations)
446 0 : && wi::ltu_p (iterations, GET_MODE_MASK (GET_MODE (addop0)))
447 0 : && addop1 == constm1_rtx)
448 0 : return simplify_gen_unary (ZERO_EXTEND, mode, addop0,
449 0 : GET_MODE (addop0));
450 : }
451 : }
452 :
453 0 : return simplify_gen_binary (PLUS, mode, count, const1_rtx);
454 0 : }
455 :
456 : /* Modify the loop to use the low-overhead looping insn where LOOP
457 : describes the loop, DESC describes the number of iterations of the
458 : loop, and DOLOOP_INSN is the low-overhead looping insn to emit at the
459 : end of the loop. CONDITION is the condition separated from the
460 : DOLOOP_SEQ. COUNT is the number of iterations of the LOOP. */
461 :
462 : static void
463 0 : doloop_modify (class loop *loop, class niter_desc *desc,
464 : rtx_insn *doloop_seq, rtx condition, rtx count)
465 : {
466 0 : rtx counter_reg;
467 0 : rtx tmp, noloop = NULL_RTX;
468 0 : rtx_insn *sequence;
469 0 : rtx_insn *jump_insn;
470 0 : rtx_code_label *jump_label;
471 0 : int nonneg = 0;
472 0 : bool increment_count;
473 0 : basic_block loop_end = desc->out_edge->src;
474 0 : scalar_int_mode mode;
475 0 : widest_int iterations;
476 :
477 0 : jump_insn = BB_END (loop_end);
478 :
479 0 : if (dump_file)
480 : {
481 0 : fprintf (dump_file, "Doloop: Inserting doloop pattern (");
482 0 : if (desc->const_iter)
483 0 : fprintf (dump_file, "%" PRId64, desc->niter);
484 : else
485 0 : fputs ("runtime", dump_file);
486 0 : fputs (" iterations).\n", dump_file);
487 : }
488 :
489 : /* Discard original jump to continue loop. The original compare
490 : result may still be live, so it cannot be discarded explicitly. */
491 0 : delete_insn (jump_insn);
492 :
493 0 : counter_reg = XEXP (condition, 0);
494 0 : if (GET_CODE (counter_reg) == PLUS)
495 0 : counter_reg = XEXP (counter_reg, 0);
496 : /* These patterns must operate on integer counters. */
497 0 : mode = as_a <scalar_int_mode> (GET_MODE (counter_reg));
498 :
499 0 : increment_count = false;
500 0 : switch (GET_CODE (condition))
501 : {
502 0 : case NE:
503 : /* Currently only NE tests against zero and one are supported. */
504 0 : noloop = XEXP (condition, 1);
505 0 : if (noloop != const0_rtx)
506 : {
507 0 : gcc_assert (noloop == const1_rtx);
508 : increment_count = true;
509 : }
510 : break;
511 :
512 0 : case GE:
513 : /* Currently only GE tests against zero are supported. */
514 0 : gcc_assert (XEXP (condition, 1) == const0_rtx);
515 :
516 0 : noloop = constm1_rtx;
517 :
518 : /* The iteration count does not need incrementing for a GE test. */
519 0 : increment_count = false;
520 :
521 : /* Determine if the iteration counter will be non-negative.
522 : Note that the maximum value loaded is iterations_max - 1. */
523 0 : if (get_max_loop_iterations (loop, &iterations)
524 0 : && wi::leu_p (iterations,
525 : wi::set_bit_in_zero <widest_int>
526 0 : (GET_MODE_PRECISION (mode) - 1)))
527 : nonneg = 1;
528 : break;
529 :
530 : case GTU:
531 : /* The iteration count does not need incrementing for a GTU test. */
532 : increment_count = false;
533 : break;
534 :
535 : /* Abort if an invalid doloop pattern has been generated. */
536 0 : default:
537 0 : gcc_unreachable ();
538 : }
539 :
540 0 : if (increment_count)
541 0 : count = doloop_simplify_count (loop, mode, count);
542 :
543 : /* Insert initialization of the count register into the loop header. */
544 0 : start_sequence ();
545 : /* count has been already copied through copy_rtx. */
546 0 : reset_used_flags (count);
547 0 : set_used_flags (condition);
548 0 : tmp = force_operand (count, counter_reg);
549 0 : convert_move (counter_reg, tmp, 1);
550 0 : sequence = get_insns ();
551 0 : unshare_all_rtl_in_chain (sequence);
552 0 : end_sequence ();
553 0 : emit_insn_after (sequence, BB_END (loop_preheader_edge (loop)->src));
554 :
555 0 : if (desc->noloop_assumptions)
556 : {
557 : /* The GTU case has only been implemented for Arm, where
558 : noloop_assumptions gets explicitly set to NULL for that case, so
559 : assert here for safety. */
560 0 : gcc_assert (GET_CODE (condition) != GTU);
561 0 : rtx ass = copy_rtx (desc->noloop_assumptions);
562 0 : basic_block preheader = loop_preheader_edge (loop)->src;
563 0 : basic_block set_zero = split_edge (loop_preheader_edge (loop));
564 0 : basic_block new_preheader = split_edge (loop_preheader_edge (loop));
565 0 : edge te;
566 :
567 : /* Expand the condition testing the assumptions and if it does not pass,
568 : reset the count register to 0. */
569 0 : redirect_edge_and_branch_force (single_succ_edge (preheader), new_preheader);
570 0 : set_immediate_dominator (CDI_DOMINATORS, new_preheader, preheader);
571 :
572 0 : set_zero->count = profile_count::uninitialized ();
573 :
574 0 : te = single_succ_edge (preheader);
575 0 : for (; ass; ass = XEXP (ass, 1))
576 0 : if (!add_test (XEXP (ass, 0), &te, set_zero))
577 : break;
578 :
579 0 : if (ass)
580 : {
581 : /* We reached a condition that is always true. This is very hard to
582 : reproduce (such a loop does not roll, and thus it would most
583 : likely get optimized out by some of the preceding optimizations).
584 : In fact, I do not have any testcase for it. However, it would
585 : also be very hard to show that it is impossible, so we must
586 : handle this case. */
587 0 : set_zero->count = preheader->count;
588 : }
589 :
590 0 : if (EDGE_COUNT (set_zero->preds) == 0)
591 : {
592 : /* All the conditions were simplified to false, remove the
593 : unreachable set_zero block. */
594 0 : delete_basic_block (set_zero);
595 : }
596 : else
597 : {
598 : /* Reset the counter to zero in the set_zero block. */
599 0 : start_sequence ();
600 0 : convert_move (counter_reg, noloop, 0);
601 0 : sequence = end_sequence ();
602 0 : emit_insn_after (sequence, BB_END (set_zero));
603 :
604 0 : set_immediate_dominator (CDI_DOMINATORS, set_zero,
605 : recompute_dominator (CDI_DOMINATORS,
606 : set_zero));
607 : }
608 :
609 0 : set_immediate_dominator (CDI_DOMINATORS, new_preheader,
610 : recompute_dominator (CDI_DOMINATORS,
611 : new_preheader));
612 : }
613 :
614 : /* Some targets (eg, C4x) need to initialize special looping
615 : registers. */
616 0 : if (targetm.have_doloop_begin ())
617 0 : if (rtx_insn *seq = targetm.gen_doloop_begin (counter_reg, doloop_seq))
618 0 : emit_insn_after (seq, BB_END (loop_preheader_edge (loop)->src));
619 :
620 : /* Insert the new low-overhead looping insn. */
621 0 : emit_jump_insn_after (doloop_seq, BB_END (loop_end));
622 0 : jump_insn = BB_END (loop_end);
623 0 : jump_label = block_label (desc->in_edge->dest);
624 0 : JUMP_LABEL (jump_insn) = jump_label;
625 0 : LABEL_NUSES (jump_label)++;
626 :
627 : /* Ensure the right fallthru edge is marked, for case we have reversed
628 : the condition. */
629 0 : desc->in_edge->flags &= ~EDGE_FALLTHRU;
630 0 : desc->out_edge->flags |= EDGE_FALLTHRU;
631 :
632 : /* Add a REG_NONNEG note if the actual or estimated maximum number
633 : of iterations is non-negative. */
634 0 : if (nonneg)
635 0 : add_reg_note (jump_insn, REG_NONNEG, NULL_RTX);
636 :
637 : /* Update the REG_BR_PROB note. */
638 0 : if (desc->in_edge->probability.initialized_p ())
639 0 : add_reg_br_prob_note (jump_insn, desc->in_edge->probability);
640 0 : }
641 :
642 : /* Called through note_stores. */
643 :
644 : static void
645 0 : record_reg_sets (rtx x, const_rtx pat ATTRIBUTE_UNUSED, void *data)
646 : {
647 0 : bitmap mod = (bitmap)data;
648 0 : if (REG_P (x))
649 : {
650 0 : unsigned int regno = REGNO (x);
651 0 : if (HARD_REGISTER_P (x))
652 : {
653 0 : unsigned int end_regno = end_hard_regno (GET_MODE (x), regno);
654 0 : do
655 0 : bitmap_set_bit (mod, regno);
656 0 : while (++regno < end_regno);
657 : }
658 : else
659 0 : bitmap_set_bit (mod, regno);
660 : }
661 0 : }
662 :
663 : /* Process loop described by LOOP validating that the loop is suitable for
664 : conversion to use a low overhead looping instruction, replacing the jump
665 : insn where suitable. Returns true if the loop was successfully
666 : modified. */
667 :
668 : static bool
669 0 : doloop_optimize (class loop *loop)
670 : {
671 0 : scalar_int_mode mode;
672 0 : rtx doloop_reg;
673 0 : rtx count = NULL_RTX;
674 0 : widest_int iterations, iterations_max;
675 0 : rtx_code_label *start_label;
676 0 : rtx condition;
677 0 : unsigned level;
678 0 : HOST_WIDE_INT est_niter;
679 0 : int max_cost;
680 0 : class niter_desc *desc;
681 0 : unsigned word_mode_size;
682 0 : unsigned HOST_WIDE_INT word_mode_max;
683 0 : int entered_at_top;
684 :
685 0 : if (dump_file)
686 0 : fprintf (dump_file, "Doloop: Processing loop %d.\n", loop->num);
687 :
688 0 : iv_analysis_loop_init (loop);
689 :
690 : /* Find the simple exit of a LOOP. */
691 0 : desc = get_simple_loop_desc (loop);
692 :
693 : /* Check that loop is a candidate for a low-overhead looping insn. */
694 0 : if (!doloop_valid_p (loop, desc))
695 : {
696 0 : if (dump_file)
697 0 : fprintf (dump_file,
698 : "Doloop: The loop is not suitable.\n");
699 0 : return false;
700 : }
701 0 : mode = desc->mode;
702 :
703 0 : est_niter = get_estimated_loop_iterations_int (loop);
704 0 : if (est_niter == -1)
705 0 : est_niter = get_likely_max_loop_iterations_int (loop);
706 :
707 0 : if (est_niter >= 0 && est_niter < 3)
708 : {
709 0 : if (dump_file)
710 0 : fprintf (dump_file,
711 : "Doloop: Too few iterations (%u) to be profitable.\n",
712 : (unsigned int)est_niter);
713 0 : return false;
714 : }
715 :
716 0 : if (desc->const_iter)
717 0 : iterations = widest_int::from (rtx_mode_t (desc->niter_expr, mode),
718 0 : UNSIGNED);
719 : else
720 0 : iterations = 0;
721 0 : if (!get_max_loop_iterations (loop, &iterations_max))
722 0 : iterations_max = 0;
723 0 : level = get_loop_level (loop) + 1;
724 0 : entered_at_top = (loop->latch == desc->in_edge->dest
725 0 : && contains_no_active_insn_p (loop->latch));
726 0 : if (!targetm.can_use_doloop_p (iterations, iterations_max, level,
727 : entered_at_top))
728 : {
729 0 : if (dump_file)
730 0 : fprintf (dump_file, "Loop rejected by can_use_doloop_p.\n");
731 0 : return false;
732 : }
733 :
734 : /* Generate looping insn. If the pattern FAILs then give up trying
735 : to modify the loop since there is some aspect the back-end does
736 : not like. If this succeeds, there is a chance that the loop
737 : desc->niter_expr has been altered by the backend, so only extract
738 : that data after the gen_doloop_end. */
739 0 : start_label = block_label (desc->in_edge->dest);
740 0 : doloop_reg = gen_reg_rtx (mode);
741 0 : rtx_insn *doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
742 :
743 0 : max_cost
744 0 : = COSTS_N_INSNS (param_max_iterations_computation_cost);
745 0 : if (set_src_cost (desc->niter_expr, mode, optimize_loop_for_speed_p (loop))
746 : > max_cost)
747 : {
748 0 : if (dump_file)
749 0 : fprintf (dump_file,
750 : "Doloop: number of iterations too costly to compute.\n");
751 0 : return false;
752 : }
753 :
754 0 : count = copy_rtx (desc->niter_expr);
755 0 : word_mode_size = GET_MODE_PRECISION (word_mode);
756 0 : word_mode_max = (HOST_WIDE_INT_1U << (word_mode_size - 1) << 1) - 1;
757 0 : if (! doloop_seq
758 0 : && mode != word_mode
759 : /* Before trying mode different from the one in that # of iterations is
760 : computed, we must be sure that the number of iterations fits into
761 : the new mode. */
762 0 : && (word_mode_size >= GET_MODE_PRECISION (mode)
763 0 : || wi::leu_p (iterations_max, word_mode_max)))
764 : {
765 0 : if (word_mode_size > GET_MODE_PRECISION (mode))
766 0 : count = simplify_gen_unary (ZERO_EXTEND, word_mode, count, mode);
767 : else
768 0 : count = lowpart_subreg (word_mode, count, mode);
769 0 : PUT_MODE (doloop_reg, word_mode);
770 0 : doloop_seq = targetm.gen_doloop_end (doloop_reg, start_label);
771 : }
772 0 : if (! doloop_seq)
773 : {
774 0 : if (dump_file)
775 0 : fprintf (dump_file,
776 : "Doloop: Target unwilling to use doloop pattern!\n");
777 0 : return false;
778 : }
779 :
780 : /* If multiple instructions were created, the last must be the
781 : jump instruction. */
782 : rtx_insn *doloop_insn = doloop_seq;
783 0 : while (NEXT_INSN (doloop_insn) != NULL_RTX)
784 : doloop_insn = NEXT_INSN (doloop_insn);
785 0 : if (!JUMP_P (doloop_insn)
786 0 : || !(condition = doloop_condition_get (doloop_insn)))
787 : {
788 0 : if (dump_file)
789 0 : fprintf (dump_file, "Doloop: Unrecognizable doloop pattern!\n");
790 0 : return false;
791 : }
792 :
793 : /* Ensure that the new sequence doesn't clobber a register that
794 : is live at the end of the block. */
795 0 : {
796 0 : bitmap modified = BITMAP_ALLOC (NULL);
797 :
798 0 : for (rtx_insn *i = doloop_seq; i != NULL; i = NEXT_INSN (i))
799 0 : note_stores (i, record_reg_sets, modified);
800 :
801 0 : basic_block loop_end = desc->out_edge->src;
802 0 : bool fail = bitmap_intersect_p (df_get_live_out (loop_end), modified);
803 : /* iv_analysis_loop_init calls df_analyze_loop, which computes just
804 : partial df for blocks of the loop only. The above will catch if
805 : any of the modified registers are use inside of the loop body, but
806 : it will most likely not have accurate info on registers used
807 : at the destination of the out_edge. We call df_analyze on the
808 : whole function at the start of the pass though and iterate only
809 : on innermost loops or from innermost loops, so
810 : live in on desc->out_edge->dest should be still unmodified from
811 : the initial df_analyze. */
812 0 : if (!fail)
813 0 : fail = bitmap_intersect_p (df_get_live_in (desc->out_edge->dest),
814 : modified);
815 0 : BITMAP_FREE (modified);
816 :
817 0 : if (fail)
818 : {
819 0 : if (dump_file)
820 0 : fprintf (dump_file, "Doloop: doloop pattern clobbers live out\n");
821 0 : return false;
822 : }
823 : }
824 :
825 0 : doloop_modify (loop, desc, doloop_seq, condition, count);
826 0 : return true;
827 0 : }
828 :
829 : /* This is the main entry point. Process all loops using doloop_optimize. */
830 :
831 : void
832 0 : doloop_optimize_loops (void)
833 : {
834 0 : if (optimize == 1)
835 : {
836 0 : df_live_add_problem ();
837 0 : df_live_set_all_dirty ();
838 : }
839 :
840 0 : df_analyze ();
841 :
842 0 : for (auto loop : loops_list (cfun,
843 0 : targetm.can_use_doloop_p
844 : == can_use_doloop_if_innermost
845 0 : ? LI_ONLY_INNERMOST : LI_FROM_INNERMOST))
846 0 : doloop_optimize (loop);
847 :
848 0 : if (optimize == 1)
849 0 : df_remove_problem (df_live);
850 :
851 0 : iv_analysis_done ();
852 :
853 0 : checking_verify_loop_structure ();
854 0 : }
|