Line data Source code
1 : /* Loop unrolling.
2 : Copyright (C) 2002-2026 Free Software Foundation, Inc.
3 :
4 : This file is part of GCC.
5 :
6 : GCC is free software; you can redistribute it and/or modify it under
7 : the terms of the GNU General Public License as published by the Free
8 : Software Foundation; either version 3, or (at your option) any later
9 : version.
10 :
11 : GCC is distributed in the hope that it will be useful, but WITHOUT ANY
12 : WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 : FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 : for more details.
15 :
16 : You should have received a copy of the GNU General Public License
17 : along with GCC; see the file COPYING3. If not see
18 : <http://www.gnu.org/licenses/>. */
19 :
20 : #include "config.h"
21 : #include "system.h"
22 : #include "coretypes.h"
23 : #include "backend.h"
24 : #include "target.h"
25 : #include "rtl.h"
26 : #include "tree.h"
27 : #include "cfghooks.h"
28 : #include "memmodel.h"
29 : #include "optabs.h"
30 : #include "emit-rtl.h"
31 : #include "recog.h"
32 : #include "profile.h"
33 : #include "cfgrtl.h"
34 : #include "cfgloop.h"
35 : #include "dojump.h"
36 : #include "expr.h"
37 : #include "dumpfile.h"
38 :
39 : /* This pass performs loop unrolling. We only perform this
40 : optimization on innermost loops (with single exception) because
41 : the impact on performance is greatest here, and we want to avoid
42 : unnecessary code size growth. The gain is caused by greater sequentiality
43 : of code, better code to optimize for further passes and in some cases
44 : by fewer testings of exit conditions. The main problem is code growth,
45 : that impacts performance negatively due to effect of caches.
46 :
47 : What we do:
48 :
49 : -- unrolling of loops that roll constant times; this is almost always
50 : win, as we get rid of exit condition tests.
51 : -- unrolling of loops that roll number of times that we can compute
52 : in runtime; we also get rid of exit condition tests here, but there
53 : is the extra expense for calculating the number of iterations
54 : -- simple unrolling of remaining loops; this is performed only if we
55 : are asked to, as the gain is questionable in this case and often
56 : it may even slow down the code
57 : For more detailed descriptions of each of those, see comments at
58 : appropriate function below.
59 :
   There are many parameters (defined and described in params.def) that
   control how much we unroll.
62 :
   ??? A major open problem is that we do not have a good way to determine
   how many times we should unroll a loop; the experiments I have made
   showed that this choice may affect performance on the order of several
   percent.  */
67 :
/* Information about induction variables to split.  One record per
   occurrence of an induction variable in an insn of the loop; the copies
   made while unrolling are rewritten to use independent registers derived
   from BASE_VAR so the copies do not serialize on a single IV register.  */

struct iv_to_split
{
  rtx_insn *insn;		/* The insn in that the induction variable occurs.  */
  rtx orig_var;			/* The variable (register) for the IV before split.  */
  rtx base_var;			/* The variable on that the values in the further
				   iterations are based.  */
  rtx step;			/* Step of the induction variable.  */
  struct iv_to_split *next;	/* Next entry in walking order.  */
};
79 :
/* Information about accumulators to expand.  An accumulator updated by an
   associative operation (add, subtract, multiply) in every iteration can be
   replaced by several independent copies, combined after the loop, to break
   the loop-carried dependence in the unrolled body.  */

struct var_to_expand
{
  rtx_insn *insn;		/* The insn in that the variable expansion occurs.  */
  rtx reg;			/* The accumulator which is expanded.  */
  vec<rtx> var_expansions;	/* The copies of the accumulator which is expanded.  */
  struct var_to_expand *next;	/* Next entry in walking order.  */
  enum rtx_code op;		/* The type of the accumulation - addition, subtraction
				   or multiplication.  */
  int expansion_count;		/* Count the number of expansions generated so far.  */
  int reuse_expansion;		/* The expansion we intend to reuse to expand
				   the accumulator.  If REUSE_EXPANSION is 0 reuse
				   the original accumulator.  Else use
				   var_expansions[REUSE_EXPANSION - 1].  */
};
96 :
/* Hashtable helper for iv_to_split: entries are keyed by the insn in
   which the induction variable occurs.  */

struct iv_split_hasher : free_ptr_hash <iv_to_split>
{
  static inline hashval_t hash (const iv_to_split *);
  static inline bool equal (const iv_to_split *, const iv_to_split *);
};
104 :
105 :
106 : /* A hash function for information about insns to split. */
107 :
108 : inline hashval_t
109 236639940 : iv_split_hasher::hash (const iv_to_split *ivts)
110 : {
111 236639940 : return (hashval_t) INSN_UID (ivts->insn);
112 : }
113 :
114 : /* An equality functions for information about insns to split. */
115 :
116 : inline bool
117 107989168 : iv_split_hasher::equal (const iv_to_split *i1, const iv_to_split *i2)
118 : {
119 107989168 : return i1->insn == i2->insn;
120 : }
121 :
/* Hashtable helper for var_to_expand: entries are keyed by the insn that
   performs the accumulation.  */

struct var_expand_hasher : free_ptr_hash <var_to_expand>
{
  static inline hashval_t hash (const var_to_expand *);
  static inline bool equal (const var_to_expand *, const var_to_expand *);
};
129 :
130 : /* Return a hash for VES. */
131 :
132 : inline hashval_t
133 353 : var_expand_hasher::hash (const var_to_expand *ves)
134 : {
135 353 : return (hashval_t) INSN_UID (ves->insn);
136 : }
137 :
138 : /* Return true if I1 and I2 refer to the same instruction. */
139 :
140 : inline bool
141 84 : var_expand_hasher::equal (const var_to_expand *i1, const var_to_expand *i2)
142 : {
143 84 : return i1->insn == i2->insn;
144 : }
145 :
/* Information about the optimizations (IV splitting and accumulator
   expansion) to be applied in the copies made while unrolling a loop.
   The hash tables allow lookup by insn; the singly linked head/tail lists
   preserve the original walking order of the loop body.  */

struct opt_info
{
  hash_table<iv_split_hasher> *insns_to_split; /* A hashtable of insns to
						  split.  */
  struct iv_to_split *iv_to_split_head; /* The first iv to split.  */
  struct iv_to_split **iv_to_split_tail; /* Pointer to the tail of the list.  */
  hash_table<var_expand_hasher> *insns_with_var_to_expand; /* A hashtable of
					insns with accumulators to expand.  */
  struct var_to_expand *var_to_expand_head; /* The first var to expand.  */
  struct var_to_expand **var_to_expand_tail; /* Pointer to the tail of the list.  */
  unsigned first_new_block;	/* The first basic block that was
				   duplicated.  */
  basic_block loop_exit;	/* The loop exit basic block.  */
  basic_block loop_preheader;	/* The loop preheader basic block.  */
};
164 :
/* Forward declarations of the local decision, transformation and
   analysis helpers defined later in this file.  */

static void decide_unroll_stupid (class loop *, int);
static void decide_unroll_constant_iterations (class loop *, int);
static void decide_unroll_runtime_iterations (class loop *, int);
static void unroll_loop_stupid (class loop *);
static void decide_unrolling (int);
static void unroll_loop_constant_iterations (class loop *);
static void unroll_loop_runtime_iterations (class loop *);
static struct opt_info *analyze_insns_in_loop (class loop *);
static void opt_info_start_duplication (struct opt_info *);
static void apply_opt_in_copies (struct opt_info *, unsigned, bool, bool);
static void free_opt_info (struct opt_info *);
static struct var_to_expand *analyze_insn_to_expand_var (class loop*, rtx_insn *);
static bool referenced_in_one_insn_in_loop_p (class loop *, rtx, int *);
static struct iv_to_split *analyze_iv_to_split_insn (rtx_insn *);
static void expand_var_during_unrolling (struct var_to_expand *, rtx_insn *);
static void insert_var_expansion_initialization (struct var_to_expand *,
						 basic_block);
static void combine_var_copies_in_loop_exit (struct var_to_expand *,
					     basic_block);
static rtx get_expansion (struct var_to_expand *);
185 :
186 : /* Emit a message summarizing the unroll that will be
187 : performed for LOOP, along with the loop's location LOCUS, if
188 : appropriate given the dump or -fopt-info settings. */
189 :
190 : static void
191 422232 : report_unroll (class loop *loop, dump_location_t locus)
192 : {
193 422232 : dump_flags_t report_flags = MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS;
194 :
195 422232 : if (loop->lpt_decision.decision == LPT_NONE)
196 422158 : return;
197 :
198 50739 : if (!dump_enabled_p ())
199 : return;
200 :
201 74 : dump_metadata_t metadata (report_flags, locus.get_impl_location ());
202 74 : dump_printf_loc (metadata, locus.get_user_location (),
203 : "loop unrolled %d times",
204 : loop->lpt_decision.times);
205 74 : if (profile_info && loop->header->count.initialized_p ())
206 1 : dump_printf (metadata,
207 : " (header execution count %d)",
208 1 : (int)loop->header->count.to_gcov_type ());
209 :
210 74 : dump_printf (metadata, "\n");
211 : }
212 :
/* Decide whether to unroll loops and how much.  FLAGS is a mask of UAP_*
   bits selecting which unrolling strategies are permitted.  Records the
   decision in each loop's lpt_decision; performs no transformation.  */
static void
decide_unrolling (int flags)
{
  /* Scan the loops, inner ones first.  */
  for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
    {
      loop->lpt_decision.decision = LPT_NONE;
      dump_user_location_t locus = get_loop_location (loop);

      if (dump_enabled_p ())
	dump_printf_loc (MSG_NOTE, locus,
			 "considering unrolling loop %d at BB %d\n",
			 loop->num, loop->header->index);

      /* unroll == 1 is an explicit user request NOT to unroll.  */
      if (loop->unroll == 1)
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not unrolling loop, user didn't want it unrolled\n");
	  continue;
	}

      /* Do not peel cold areas.  */
      if (optimize_loop_for_size_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, cold area\n");
	  continue;
	}

      /* Can the loop be manipulated?  */
      if (!can_duplicate_loop_p (loop))
	{
	  if (dump_file)
	    fprintf (dump_file,
		     ";; Not considering loop, cannot duplicate\n");
	  continue;
	}

      /* Skip non-innermost loops.  */
      if (loop->inner)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Not considering loop, is not innermost\n");
	  continue;
	}

      loop->ninsns = num_loop_insns (loop);
      loop->av_ninsns = average_num_loop_insns (loop);

      /* Try transformations one by one in decreasing order of priority.  */
      decide_unroll_constant_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_runtime_iterations (loop, flags);
      if (loop->lpt_decision.decision == LPT_NONE)
	decide_unroll_stupid (loop, flags);

      report_unroll (loop, locus);
    }
}
274 :
275 : /* Unroll LOOPS. */
276 : void
277 220038 : unroll_loops (int flags)
278 : {
279 220038 : bool changed = false;
280 :
281 : /* Now decide rest of unrolling. */
282 220038 : decide_unrolling (flags);
283 :
284 : /* Scan the loops, inner ones first. */
285 1251712 : for (auto loop : loops_list (cfun, LI_FROM_INNERMOST))
286 : {
287 : /* And perform the appropriate transformations. */
288 591598 : switch (loop->lpt_decision.decision)
289 : {
290 31478 : case LPT_UNROLL_CONSTANT:
291 31478 : unroll_loop_constant_iterations (loop);
292 31478 : changed = true;
293 31478 : break;
294 19159 : case LPT_UNROLL_RUNTIME:
295 19159 : unroll_loop_runtime_iterations (loop);
296 19159 : changed = true;
297 19159 : break;
298 102 : case LPT_UNROLL_STUPID:
299 102 : unroll_loop_stupid (loop);
300 102 : changed = true;
301 102 : break;
302 : case LPT_NONE:
303 : break;
304 0 : default:
305 0 : gcc_unreachable ();
306 : }
307 220038 : }
308 :
309 220038 : if (changed)
310 : {
311 18363 : calculate_dominance_info (CDI_DOMINATORS);
312 18363 : fix_loop_structure (NULL);
313 : }
314 :
315 220038 : iv_analysis_done ();
316 220038 : }
317 :
318 : /* Check whether exit of the LOOP is at the end of loop body. */
319 :
320 : static bool
321 247251 : loop_exit_at_end_p (class loop *loop)
322 : {
323 247251 : class niter_desc *desc = get_simple_loop_desc (loop);
324 247251 : rtx_insn *insn;
325 :
326 : /* We should never have conditional in latch block. */
327 247251 : gcc_assert (desc->in_edge->dest != loop->header);
328 :
329 247251 : if (desc->in_edge->dest != loop->latch)
330 : return false;
331 :
332 : /* Check that the latch is empty. */
333 731089 : FOR_BB_INSNS (loop->latch, insn)
334 : {
335 486888 : if (INSN_P (insn) && active_insn_p (insn))
336 : return false;
337 : }
338 :
339 : return true;
340 : }
341 :
/* Decide whether to unroll LOOP iterating constant number of times
   and how much.  FLAGS is the UAP_* mask from the caller.  On success,
   sets loop->lpt_decision to LPT_UNROLL_CONSTANT with the chosen number
   of extra copies.  */

static void
decide_unroll_constant_iterations (class loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, best_copies, best_unroll = 0, n_copies, i;
  class niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return back silently.  */
  if (!(flags & UAP_UNROLL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "considering unrolling loop with constant "
		 "number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once).
     Both the size-based and the average-executed-insns-based caps apply.  */
  nunroll = param_max_unrolled_insns / loop->ninsns;
  nunroll_by_av
    = param_max_average_unrolled_insns / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) param_max_unroll_times)
    nunroll = param_max_unroll_times;

  /* Allow the target to veto or adjust the unroll factor.  */
  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* Skip big loops, unless the user requested a specific factor.  */
  if (nunroll <= 1
      && !(loop->unroll > 1 && loop->unroll < USHRT_MAX))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check number of iterations.  */
  if (!desc->simple_p || !desc->const_iter || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the loop iterates constant times\n");
      return;
    }

  /* Check for an explicit unrolling factor (loop->unroll set by pragma
     or attribute).  */
  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    {
      /* However we cannot unroll completely at the RTL level a loop with
	 constant number of iterations; it should have been peeled instead.  */
      if (desc->niter == 0 || (unsigned) loop->unroll > desc->niter - 1)
	{
	  if (dump_file)
	    fprintf (dump_file, ";; Loop should have been peeled\n");
	}
      else
	{
	  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
	  loop->lpt_decision.times = loop->unroll - 1;
	}
      return;
    }

  /* Check whether the loop rolls enough to consider.
     Consult also loop bounds and profile; in the case the loop has more
     than one exit it may well loop less than determined maximal number
     of iterations.  */
  if (desc->niter < 2 * nunroll
      || ((get_estimated_loop_iterations (loop, &iterations)
	   || get_likely_max_loop_iterations (loop, &iterations))
	  && wi::ltu_p (iterations, 2 * nunroll)))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now compute number of iterations to unroll.  We alter
     nunroll so that as few as possible copies of loop body are
     necessary, while still not decreasing the number of unrollings
     too much (at most by 1).  */
  best_copies = 2 * nunroll + 10;

  i = 2 * nunroll + 2;
  if (i > desc->niter - 2)
    i = desc->niter - 2;

  for (; i >= nunroll - 1; i--)
    {
      unsigned exit_mod = desc->niter % (i + 1);

      /* Account for the peeled copies needed to handle the remainder
	 iterations; an exit in the middle of the body costs one extra.  */
      if (!loop_exit_at_end_p (loop))
	n_copies = exit_mod + i + 1;
      else if (exit_mod != (unsigned) i
	       || desc->noloop_assumptions != NULL_RTX)
	n_copies = exit_mod + i + 2;
      else
	n_copies = i + 1;

      if (n_copies < best_copies)
	{
	  best_copies = n_copies;
	  best_unroll = i;
	}
    }

  loop->lpt_decision.decision = LPT_UNROLL_CONSTANT;
  loop->lpt_decision.times = best_unroll;
}
459 :
/* Unroll LOOP with constant number of iterations LOOP->LPT_DECISION.TIMES times.
   The transformation does this:

   for (i = 0; i < 102; i++)
     body;

   ==>  (LOOP->LPT_DECISION.TIMES == 3)

   i = 0;
   body; i++;
   body; i++;
   while (i < 102)
     {
       body; i++;
       body; i++;
       body; i++;
       body; i++;
     }
  */
static void
unroll_loop_constant_iterations (class loop *loop)
{
  unsigned HOST_WIDE_INT niter;
  unsigned exit_mod;
  unsigned i;
  edge e;
  unsigned max_unroll = loop->lpt_decision.times;
  class niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;
  bool flat = maybe_flat_loop_profile (loop);
  profile_count orig_exit_count = desc->out_edge->count ();

  niter = desc->niter;

  /* Should not get here (such loop should be peeled instead).  */
  gcc_assert (niter > max_unroll + 1);

  /* Number of iterations that must be peeled so the remaining count is a
     multiple of the unroll factor.  */
  exit_mod = niter % (max_unroll + 1);

  auto_sbitmap wont_exit (max_unroll + 2);
  bitmap_ones (wont_exit);

  auto_vec<edge> remove_edges;
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  if (!exit_at_end)
    {
      /* The exit is not at the end of the loop; leave exit test
	 in the first copy, so that the loops that start with test
	 of exit condition have continuous body after unrolling.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at beginning of loop.\n");

      /* Peel exit_mod iterations.  */
      bitmap_clear_bit (wont_exit, 0);
      if (desc->noloop_assumptions)
	bitmap_clear_bit (wont_exit, 1);

      if (exit_mod)
	{
	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_body_to_header_edge (
	    loop, loop_preheader_edge (loop), exit_mod, wont_exit,
	    desc->out_edge, &remove_edges,
	    DLTHE_FLAG_UPDATE_FREQ
	      | (opt_info && exit_mod > 1 ? DLTHE_RECORD_COPY_NUMBER : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 1)
	    apply_opt_in_copies (opt_info, exit_mod, false, false);

	  /* The peeled iterations are now out of the loop; adjust the
	     recorded iteration counts and bounds accordingly.  */
	  desc->noloop_assumptions = NULL_RTX;
	  desc->niter -= exit_mod;
	  loop->nb_iterations_upper_bound -= exit_mod;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod;
	  else
	    loop->any_likely_upper_bound = false;
	}

      bitmap_set_bit (wont_exit, 1);
    }
  else
    {
      /* Leave exit test in last copy, for the same reason as above if
	 the loop tests the condition at the end of loop body.  */

      if (dump_file)
	fprintf (dump_file, ";; Condition at end of loop.\n");

      /* We know that niter >= max_unroll + 2; so we do not need to care of
	 case when we would exit before reaching the loop.  So just peel
	 exit_mod + 1 iterations.  */
      if (exit_mod != max_unroll
	  || desc->noloop_assumptions)
	{
	  bitmap_clear_bit (wont_exit, 0);
	  if (desc->noloop_assumptions)
	    bitmap_clear_bit (wont_exit, 1);

	  opt_info_start_duplication (opt_info);
	  ok = duplicate_loop_body_to_header_edge (
	    loop, loop_preheader_edge (loop), exit_mod + 1, wont_exit,
	    desc->out_edge, &remove_edges,
	    DLTHE_FLAG_UPDATE_FREQ
	      | (opt_info && exit_mod > 0 ? DLTHE_RECORD_COPY_NUMBER : 0));
	  gcc_assert (ok);

	  if (opt_info && exit_mod > 0)
	    apply_opt_in_copies (opt_info, exit_mod + 1, false, false);

	  /* Adjust iteration counts/bounds for the exit_mod + 1 peeled
	     iterations.  */
	  desc->niter -= exit_mod + 1;
	  loop->nb_iterations_upper_bound -= exit_mod + 1;
	  if (loop->any_estimate
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_estimate))
	    loop->nb_iterations_estimate -= exit_mod + 1;
	  else
	    loop->any_estimate = false;
	  if (loop->any_likely_upper_bound
	      && wi::leu_p (exit_mod + 1, loop->nb_iterations_likely_upper_bound))
	    loop->nb_iterations_likely_upper_bound -= exit_mod + 1;
	  else
	    loop->any_likely_upper_bound = false;
	  desc->noloop_assumptions = NULL_RTX;

	  bitmap_set_bit (wont_exit, 0);
	  bitmap_set_bit (wont_exit, 1);
	}

      bitmap_clear_bit (wont_exit, max_unroll);
    }

  /* Now unroll the loop.  */

  opt_info_start_duplication (opt_info);
  ok = duplicate_loop_body_to_header_edge (
    loop, loop_latch_edge (loop), max_unroll, wont_exit, desc->out_edge,
    &remove_edges,
    DLTHE_FLAG_UPDATE_FREQ | (opt_info ? DLTHE_RECORD_COPY_NUMBER : 0)
      | (flat ? DLTHE_FLAG_FLAT_PROFILE : 0));
  gcc_assert (ok);

  /* Rescale the exit probability now that the exit executes fewer times.  */
  edge exit = update_loop_exit_probability_scale_dom_bbs
		(loop, desc->out_edge, orig_exit_count);
  if (exit)
    update_br_prob_note (exit->src);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  /* The unrolled loop iterates (old count) / (factor) times.  */
  desc->niter /= max_unroll + 1;
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
  desc->niter_expr = gen_int_mode (desc->niter, desc->mode);

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, constant # of iterations %i insns\n",
	     max_unroll, num_loop_insns (loop));
}
661 :
/* Decide whether to unroll LOOP iterating runtime computable number of times
   and how much.  FLAGS is the UAP_* mask.  On success sets the decision to
   LPT_UNROLL_RUNTIME with a power-of-two-minus-one times value.  */
static void
decide_unroll_runtime_iterations (class loop *loop, int flags)
{
  unsigned nunroll, nunroll_by_av, i;
  class niter_desc *desc;
  widest_int iterations;

  /* If we were not asked to unroll this loop, just return back silently.  */
  if (!(flags & UAP_UNROLL) && !loop->unroll)
    return;

  if (dump_enabled_p ())
    dump_printf (MSG_NOTE,
		 "considering unrolling loop with runtime-"
		 "computable number of iterations\n");

  /* nunroll = total number of copies of the original loop body in
     unrolled loop (i.e. if it is 2, we have to duplicate loop body once.  */
  nunroll = param_max_unrolled_insns / loop->ninsns;
  nunroll_by_av = param_max_average_unrolled_insns / loop->av_ninsns;
  if (nunroll > nunroll_by_av)
    nunroll = nunroll_by_av;
  if (nunroll > (unsigned) param_max_unroll_times)
    nunroll = param_max_unroll_times;

  /* Allow the target to veto or adjust the unroll factor.  */
  if (targetm.loop_unroll_adjust)
    nunroll = targetm.loop_unroll_adjust (nunroll, loop);

  /* An explicit user-requested factor overrides the heuristics.  */
  if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
    nunroll = loop->unroll;

  /* Skip big loops.  */
  if (nunroll <= 1)
    {
      if (dump_file)
	fprintf (dump_file, ";; Not considering loop, is too big\n");
      return;
    }

  /* Check for simple loops.  */
  desc = get_simple_loop_desc (loop);

  /* Check simpleness.  */
  if (!desc->simple_p || desc->assumptions)
    {
      if (dump_file)
	fprintf (dump_file,
		 ";; Unable to prove that the number of iterations "
		 "can be counted in runtime\n");
      return;
    }

  /* Constant iteration counts are handled by
     decide_unroll_constant_iterations instead.  */
  if (desc->const_iter)
    {
      if (dump_file)
	fprintf (dump_file, ";; Loop iterates constant times\n");
      return;
    }

  /* Check whether the loop rolls.  */
  if ((get_estimated_loop_iterations (loop, &iterations)
       || get_likely_max_loop_iterations (loop, &iterations))
      && wi::ltu_p (iterations, 2 * nunroll))
    {
      if (dump_file)
	fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
      return;
    }

  /* Success; now force nunroll to be power of 2, as code-gen
     requires it, we are unable to cope with overflows in
     computation of number of iterations.  */
  for (i = 1; 2 * i <= nunroll; i *= 2)
    continue;

  loop->lpt_decision.decision = LPT_UNROLL_RUNTIME;
  loop->lpt_decision.times = i - 1;
}
742 :
743 : /* Splits edge E and inserts the sequence of instructions INSNS on it, and
744 : returns the newly created block. If INSNS is NULL_RTX, nothing is changed
745 : and NULL is returned instead. */
746 :
747 : basic_block
748 76618 : split_edge_and_insert (edge e, rtx_insn *insns)
749 : {
750 76618 : basic_block bb;
751 :
752 76618 : if (!insns)
753 : return NULL;
754 76618 : bb = split_edge (e);
755 76618 : emit_insn_after (insns, BB_END (bb));
756 :
757 : /* ??? We used to assume that INSNS can contain control flow insns, and
758 : that we had to try to find sub basic blocks in BB to maintain a valid
759 : CFG. For this purpose we used to set the BB_SUPERBLOCK flag on BB
760 : and call break_superblocks when going out of cfglayout mode. But it
761 : turns out that this never happens; and that if it does ever happen,
762 : the verify_flow_info at the end of the RTL loop passes would fail.
763 :
764 : There are two reasons why we expected we could have control flow insns
765 : in INSNS. The first is when a comparison has to be done in parts, and
766 : the second is when the number of iterations is computed for loops with
767 : the number of iterations known at runtime. In both cases, test cases
768 : to get control flow in INSNS appear to be impossible to construct:
769 :
770 : * If do_compare_rtx_and_jump needs several branches to do comparison
771 : in a mode that needs comparison by parts, we cannot analyze the
772 : number of iterations of the loop, and we never get to unrolling it.
773 :
774 : * The code in expand_divmod that was suspected to cause creation of
775 : branching code seems to be only accessed for signed division. The
776 : divisions used by # of iterations analysis are always unsigned.
777 : Problems might arise on architectures that emits branching code
778 : for some operations that may appear in the unroller (especially
779 : for division), but we have no such architectures.
780 :
781 : Considering all this, it was decided that we should for now assume
782 : that INSNS can in theory contain control flow insns, but in practice
783 : it never does. So we don't handle the theoretical case, and should
784 : a real failure ever show up, we have a pretty good clue for how to
785 : fix it. */
786 :
787 76618 : return bb;
788 : }
789 :
/* Prepare a sequence comparing OP0 with OP1 using COMP and jumping to LABEL if
   true, with probability PROB.  If CINSN is not NULL, it is the insn to copy
   in order to create a jump.  Returns the emitted insn sequence.  */

static rtx_insn *
compare_and_jump_seq (rtx op0, rtx op1, enum rtx_code comp,
		      rtx_code_label *label, profile_probability prob,
		      rtx_insn *cinsn)
{
  rtx_insn *seq;
  rtx_jump_insn *jump;
  rtx cond;
  machine_mode mode;

  /* Take the mode from whichever operand has one.  */
  mode = GET_MODE (op0);
  if (mode == VOIDmode)
    mode = GET_MODE (op1);

  start_sequence ();
  if (GET_MODE_CLASS (mode) == MODE_CC)
    {
      /* A hack -- there seems to be no easy generic way how to make a
	 conditional jump from a ccmode comparison.  Copy CINSN's pattern
	 (after verifying it matches the requested comparison) and
	 redirect the copy to LABEL.  */
      gcc_assert (cinsn);
      cond = XEXP (SET_SRC (pc_set (cinsn)), 0);
      gcc_assert (GET_CODE (cond) == comp);
      gcc_assert (rtx_equal_p (op0, XEXP (cond, 0)));
      gcc_assert (rtx_equal_p (op1, XEXP (cond, 1)));
      emit_jump_insn (copy_insn (PATTERN (cinsn)));
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      JUMP_LABEL (jump) = JUMP_LABEL (cinsn);
      LABEL_NUSES (JUMP_LABEL (jump))++;
      redirect_jump (jump, label, 0);
    }
  else
    {
      gcc_assert (!cinsn);

      /* Emit a fresh comparison and conditional jump to LABEL.  */
      op0 = force_operand (op0, NULL_RTX);
      op1 = force_operand (op1, NULL_RTX);
      do_compare_rtx_and_jump (op0, op1, comp, 0,
			       mode, NULL_RTX, NULL, label,
			       profile_probability::uninitialized ());
      jump = as_a <rtx_jump_insn *> (get_last_insn ());
      jump->set_jump_target (label);
      LABEL_NUSES (label)++;
    }
  /* Attach the branch probability note when PROB is known.  */
  if (prob.initialized_p ())
    add_reg_br_prob_note (jump, prob);

  seq = end_sequence ();

  return seq;
}
844 :
845 : /* Unroll LOOP for which we are able to count number of iterations in
846 : runtime LOOP->LPT_DECISION.TIMES times. The times value must be a
847 : power of two. The transformation does this (with some extra care
848 : for case n < 0):
849 :
850 : for (i = 0; i < n; i++)
851 : body;
852 :
853 : ==> (LOOP->LPT_DECISION.TIMES == 3)
854 :
855 : i = 0;
856 : mod = n % 4;
857 :
858 : switch (mod)
859 : {
860 : case 3:
861 : body; i++;
862 : case 2:
863 : body; i++;
864 : case 1:
865 : body; i++;
866 : case 0: ;
867 : }
868 :
869 : while (i < n)
870 : {
871 : body; i++;
872 : body; i++;
873 : body; i++;
874 : body; i++;
875 : }
876 : */
static void
unroll_loop_runtime_iterations (class loop *loop)
{
  rtx old_niter, niter, tmp;
  rtx_insn *init_code, *branch_code;
  unsigned i;
  profile_probability p;
  basic_block preheader, *body, swtch, ezc_swtch = NULL;
  int may_exit_copy;
  profile_count iter_count, new_count;
  unsigned n_peel;
  edge e;
  bool extra_zero_check, last_may_exit;
  unsigned max_unroll = loop->lpt_decision.times;
  class niter_desc *desc = get_simple_loop_desc (loop);
  bool exit_at_end = loop_exit_at_end_p (loop);
  struct opt_info *opt_info = NULL;
  bool ok;

  /* Collect induction variables to split and accumulators to expand
     before the CFG is rewritten below, if those optimizations were
     requested.  */
  if (flag_split_ivs_in_unroller
      || flag_variable_expansion_in_unroller)
    opt_info = analyze_insns_in_loop (loop);

  /* Remember blocks whose dominators will have to be updated.  */
  auto_vec<basic_block> dom_bbs;

  body = get_loop_body (loop);
  for (i = 0; i < loop->num_nodes; i++)
    {
      /* Only blocks outside the loop need fixing; blocks inside are
	 handled by the duplication machinery itself.  */
      for (basic_block bb : get_dominated_by (CDI_DOMINATORS, body[i]))
	if (!flow_bb_inside_loop_p (loop, bb))
	  dom_bbs.safe_push (bb);
    }
  free (body);

  if (!exit_at_end)
    {
      /* Leave exit in first copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = 0;
      n_peel = max_unroll - 1;
      extra_zero_check = true;
      last_may_exit = false;
    }
  else
    {
      /* Leave exit in last copy (for explanation why see comment in
	 unroll_loop_constant_iterations).  */
      may_exit_copy = max_unroll;
      n_peel = max_unroll;
      extra_zero_check = false;
      last_may_exit = true;
    }

  /* Get expression for number of iterations.  */
  start_sequence ();
  old_niter = niter = gen_reg_rtx (desc->mode);
  tmp = force_operand (copy_rtx (desc->niter_expr), niter);
  if (tmp != niter)
    emit_move_insn (niter, tmp);

  /* For loops that exit at end and whose number of iterations is reliable,
     add one to niter to account for first pass through loop body before
     reaching exit test.  */
  if (exit_at_end && !desc->noloop_assumptions)
    {
      niter = expand_simple_binop (desc->mode, PLUS,
				   niter, const1_rtx,
				   NULL_RTX, 0, OPTAB_LIB_WIDEN);
      old_niter = niter;
    }

  /* Count modulo by ANDing it with max_unroll; we use the fact that
     the number of unrollings is a power of two, and thus this is correct
     even if there is overflow in the computation.  */
  niter = expand_simple_binop (desc->mode, AND,
			       niter, gen_int_mode (max_unroll, desc->mode),
			       NULL_RTX, 0, OPTAB_LIB_WIDEN);

  init_code = end_sequence ();
  unshare_all_rtl_in_chain (init_code);

  /* Precondition the loop.  */
  split_edge_and_insert (loop_preheader_edge (loop), init_code);

  auto_vec<edge> remove_edges;

  auto_sbitmap wont_exit (max_unroll + 2);

  if (extra_zero_check || desc->noloop_assumptions)
    {
      /* Peel the first copy of loop body.  Leave the exit test if the number
	 of iterations is not reliable.  Also record the place of the extra zero
	 check.  */
      bitmap_clear (wont_exit);
      if (!desc->noloop_assumptions)
	bitmap_set_bit (wont_exit, 1);
      ezc_swtch = loop_preheader_edge (loop)->src;
      ok = duplicate_loop_body_to_header_edge (loop, loop_preheader_edge (loop),
					       1, wont_exit, desc->out_edge,
					       &remove_edges,
					       DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);
    }

  /* Record the place where switch will be built for preconditioning.  */
  swtch = split_edge (loop_preheader_edge (loop));

  /* Compute count increments for each switch block and initialize
     innermost switch block.  Switch blocks and peeled loop copies are built
     from innermost outward.  */
  iter_count = new_count = swtch->count / (max_unroll + 1);
  swtch->count = new_count;

  for (i = 0; i < n_peel; i++)
    {
      /* Peel the copy.  */
      bitmap_clear (wont_exit);
      if (i != n_peel - 1 || !last_may_exit)
	bitmap_set_bit (wont_exit, 1);
      ok = duplicate_loop_body_to_header_edge (loop, loop_preheader_edge (loop),
					       1, wont_exit, desc->out_edge,
					       &remove_edges,
					       DLTHE_FLAG_UPDATE_FREQ);
      gcc_assert (ok);

      /* Create item for switch.  The arm branches to this copy's
	 preheader when the masked NITER equals J.  */
      unsigned j = n_peel - i - (extra_zero_check ? 0 : 1);
      p = profile_probability::always () / (i + 2);

      preheader = split_edge (loop_preheader_edge (loop));
      /* Add in count of edge from switch block.  */
      preheader->count += iter_count;
      branch_code = compare_and_jump_seq (copy_rtx (niter),
					  gen_int_mode (j, desc->mode), EQ,
					  block_label (preheader), p, NULL);

      /* We rely on the fact that the compare and jump cannot be optimized out,
	 and hence the cfg we create is correct.  */
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_pred_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = p.invert ();
      new_count += iter_count;
      swtch->count = new_count;
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  if (extra_zero_check)
    {
      /* Add branch for zero iterations.  */
      p = profile_probability::always () / (max_unroll + 1);
      swtch = ezc_swtch;
      preheader = split_edge (loop_preheader_edge (loop));
      /* Recompute count adjustments since initial peel copy may
	 have exited and reduced those values that were computed above.  */
      iter_count = swtch->count / (max_unroll + 1);
      /* Add in count of edge from switch block.  */
      preheader->count += iter_count;
      branch_code = compare_and_jump_seq (copy_rtx (niter), const0_rtx, EQ,
					  block_label (preheader), p,
					  NULL);
      gcc_assert (branch_code != NULL_RTX);

      swtch = split_edge_and_insert (single_succ_edge (swtch), branch_code);
      set_immediate_dominator (CDI_DOMINATORS, preheader, swtch);
      single_succ_edge (swtch)->probability = p.invert ();
      e = make_edge (swtch, preheader,
		     single_succ_edge (swtch)->flags & EDGE_IRREDUCIBLE_LOOP);
      e->probability = p;
    }

  /* Recount dominators for outer blocks.  */
  iterate_fix_dominators (CDI_DOMINATORS, dom_bbs, false);

  /* And unroll loop.  All copies but MAY_EXIT_COPY lose their exit
     edges (collected in REMOVE_EDGES and deleted below).  */

  bitmap_ones (wont_exit);
  bitmap_clear_bit (wont_exit, may_exit_copy);
  opt_info_start_duplication (opt_info);

  ok = duplicate_loop_body_to_header_edge (
    loop, loop_latch_edge (loop), max_unroll, wont_exit, desc->out_edge,
    &remove_edges,
    DLTHE_FLAG_UPDATE_FREQ | (opt_info ? DLTHE_RECORD_COPY_NUMBER : 0));
  gcc_assert (ok);

  if (opt_info)
    {
      apply_opt_in_copies (opt_info, max_unroll, true, true);
      free_opt_info (opt_info);
    }

  if (exit_at_end)
    {
      basic_block exit_block = get_bb_copy (desc->in_edge->src);
      /* Find a new in and out edge; they are in the last copy we have
	 made.  */

      if (EDGE_SUCC (exit_block, 0)->dest == desc->out_edge->dest)
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 0);
	  desc->in_edge = EDGE_SUCC (exit_block, 1);
	}
      else
	{
	  desc->out_edge = EDGE_SUCC (exit_block, 1);
	  desc->in_edge = EDGE_SUCC (exit_block, 0);
	}
    }

  /* Remove the edges.  */
  FOR_EACH_VEC_ELT (remove_edges, i, e)
    remove_path (e);

  /* We must be careful when updating the number of iterations due to
     preconditioning and the fact that the value must be valid at entry
     of the loop.  After passing through the above code, we see that
     the correct new number of iterations is this: */
  gcc_assert (!desc->const_iter);
  desc->niter_expr =
    simplify_gen_binary (UDIV, desc->mode, old_niter,
			 gen_int_mode (max_unroll + 1, desc->mode));
  loop->nb_iterations_upper_bound
    = wi::udiv_trunc (loop->nb_iterations_upper_bound, max_unroll + 1);
  if (loop->any_estimate)
    loop->nb_iterations_estimate
      = wi::udiv_trunc (loop->nb_iterations_estimate, max_unroll + 1);
  if (loop->any_likely_upper_bound)
    loop->nb_iterations_likely_upper_bound
      = wi::udiv_trunc (loop->nb_iterations_likely_upper_bound, max_unroll + 1);
  if (exit_at_end)
    {
      /* The exit-at-end loop executes the body once more before the
	 exit test, so one fewer full round remains; drop any bound or
	 estimate that would underflow below zero.  */
      desc->niter_expr =
	simplify_gen_binary (MINUS, desc->mode, desc->niter_expr, const1_rtx);
      desc->noloop_assumptions = NULL_RTX;
      --loop->nb_iterations_upper_bound;
      if (loop->any_estimate
	  && loop->nb_iterations_estimate != 0)
	--loop->nb_iterations_estimate;
      else
	loop->any_estimate = false;
      if (loop->any_likely_upper_bound
	  && loop->nb_iterations_likely_upper_bound != 0)
	--loop->nb_iterations_likely_upper_bound;
      else
	loop->any_likely_upper_bound = false;
    }

  if (dump_file)
    fprintf (dump_file,
	     ";; Unrolled loop %d times, counting # of iterations "
	     "in runtime, %i insns\n",
	     max_unroll, num_loop_insns (loop));
}
1135 :
1136 : /* Decide whether to unroll LOOP stupidly and how much. */
1137 : static void
1138 371595 : decide_unroll_stupid (class loop *loop, int flags)
1139 : {
1140 371595 : unsigned nunroll, nunroll_by_av, i;
1141 371595 : class niter_desc *desc;
1142 371595 : widest_int iterations;
1143 :
1144 : /* If we were not asked to unroll this loop, just return back silently. */
1145 371595 : if (!(flags & UAP_UNROLL_ALL) && !loop->unroll)
1146 : return;
1147 :
1148 159 : if (dump_enabled_p ())
1149 33 : dump_printf (MSG_NOTE, "considering unrolling loop stupidly\n");
1150 :
1151 : /* nunroll = total number of copies of the original loop body in
1152 : unrolled loop (i.e. if it is 2, we have to duplicate loop body once. */
1153 159 : nunroll = param_max_unrolled_insns / loop->ninsns;
1154 159 : nunroll_by_av
1155 159 : = param_max_average_unrolled_insns / loop->av_ninsns;
1156 159 : if (nunroll > nunroll_by_av)
1157 : nunroll = nunroll_by_av;
1158 159 : if (nunroll > (unsigned) param_max_unroll_times)
1159 : nunroll = param_max_unroll_times;
1160 :
1161 159 : if (targetm.loop_unroll_adjust)
1162 159 : nunroll = targetm.loop_unroll_adjust (nunroll, loop);
1163 :
1164 159 : if (loop->unroll > 0 && loop->unroll < USHRT_MAX)
1165 143 : nunroll = loop->unroll;
1166 :
1167 : /* Skip big loops. */
1168 159 : if (nunroll <= 1)
1169 : {
1170 1 : if (dump_file)
1171 0 : fprintf (dump_file, ";; Not considering loop, is too big\n");
1172 1 : return;
1173 : }
1174 :
1175 : /* Check for simple loops. */
1176 158 : desc = get_simple_loop_desc (loop);
1177 :
1178 : /* Check simpleness. */
1179 158 : if (desc->simple_p && !desc->assumptions)
1180 : {
1181 25 : if (dump_file)
1182 9 : fprintf (dump_file, ";; Loop is simple\n");
1183 25 : return;
1184 : }
1185 :
1186 : /* Do not unroll loops with branches inside -- it increases number
1187 : of mispredicts.
1188 : TODO: this heuristic needs tuning; call inside the loop body
1189 : is also relatively good reason to not unroll. */
1190 133 : if (num_loop_branches (loop) > 1)
1191 : {
1192 30 : if (dump_file)
1193 0 : fprintf (dump_file, ";; Not unrolling, contains branches\n");
1194 30 : return;
1195 : }
1196 :
1197 : /* Check whether the loop rolls. */
1198 103 : if ((get_estimated_loop_iterations (loop, &iterations)
1199 103 : || get_likely_max_loop_iterations (loop, &iterations))
1200 206 : && wi::ltu_p (iterations, 2 * nunroll))
1201 : {
1202 1 : if (dump_file)
1203 0 : fprintf (dump_file, ";; Not unrolling loop, doesn't roll\n");
1204 1 : return;
1205 : }
1206 :
1207 : /* Success. Now force nunroll to be power of 2, as it seems that this
1208 : improves results (partially because of better alignments, partially
1209 : because of some dark magic). */
1210 279 : for (i = 1; 2 * i <= nunroll; i *= 2)
1211 177 : continue;
1212 :
1213 102 : loop->lpt_decision.decision = LPT_UNROLL_STUPID;
1214 102 : loop->lpt_decision.times = i - 1;
1215 371595 : }
1216 :
1217 : /* Unroll a LOOP LOOP->LPT_DECISION.TIMES times. The transformation does this:
1218 :
1219 : while (cond)
1220 : body;
1221 :
1222 : ==> (LOOP->LPT_DECISION.TIMES == 3)
1223 :
1224 : while (cond)
1225 : {
1226 : body;
1227 : if (!cond) break;
1228 : body;
1229 : if (!cond) break;
1230 : body;
1231 : if (!cond) break;
1232 : body;
1233 : }
1234 : */
1235 : static void
1236 102 : unroll_loop_stupid (class loop *loop)
1237 : {
1238 102 : unsigned nunroll = loop->lpt_decision.times;
1239 102 : class niter_desc *desc = get_simple_loop_desc (loop);
1240 102 : struct opt_info *opt_info = NULL;
1241 102 : bool ok;
1242 :
1243 102 : if (flag_split_ivs_in_unroller
1244 0 : || flag_variable_expansion_in_unroller)
1245 102 : opt_info = analyze_insns_in_loop (loop);
1246 :
1247 102 : auto_sbitmap wont_exit (nunroll + 1);
1248 102 : bitmap_clear (wont_exit);
1249 306 : opt_info_start_duplication (opt_info);
1250 :
1251 102 : ok = duplicate_loop_body_to_header_edge (
1252 : loop, loop_latch_edge (loop), nunroll, wont_exit, NULL, NULL,
1253 : DLTHE_FLAG_UPDATE_FREQ | (opt_info ? DLTHE_RECORD_COPY_NUMBER : 0));
1254 102 : gcc_assert (ok);
1255 :
1256 102 : if (opt_info)
1257 : {
1258 102 : apply_opt_in_copies (opt_info, nunroll, true, true);
1259 102 : free_opt_info (opt_info);
1260 : }
1261 :
1262 102 : if (desc->simple_p)
1263 : {
1264 : /* We indeed may get here provided that there are nontrivial assumptions
1265 : for a loop to be really simple. We could update the counts, but the
1266 : problem is that we are unable to decide which exit will be taken
1267 : (not really true in case the number of iterations is constant,
1268 : but no one will do anything with this information, so we do not
1269 : worry about it). */
1270 1 : desc->simple_p = false;
1271 : }
1272 :
1273 102 : if (dump_file)
1274 24 : fprintf (dump_file, ";; Unrolled loop %d times, %i insns\n",
1275 : nunroll, num_loop_insns (loop));
1276 102 : }
1277 :
1278 : /* Returns true if REG is referenced in one nondebug insn in LOOP.
1279 : Set *DEBUG_USES to the number of debug insns that reference the
1280 : variable. */
1281 :
1282 : static bool
1283 3 : referenced_in_one_insn_in_loop_p (class loop *loop, rtx reg,
1284 : int *debug_uses)
1285 : {
1286 3 : basic_block *body, bb;
1287 3 : unsigned i;
1288 3 : int count_ref = 0;
1289 3 : rtx_insn *insn;
1290 :
1291 3 : body = get_loop_body (loop);
1292 12 : for (i = 0; i < loop->num_nodes; i++)
1293 : {
1294 6 : bb = body[i];
1295 :
1296 30 : FOR_BB_INSNS (bb, insn)
1297 24 : if (!rtx_referenced_p (reg, insn))
1298 21 : continue;
1299 3 : else if (DEBUG_INSN_P (insn))
1300 0 : ++*debug_uses;
1301 3 : else if (++count_ref > 1)
1302 : break;
1303 : }
1304 3 : free (body);
1305 3 : return (count_ref == 1);
1306 : }
1307 :
1308 : /* Reset the DEBUG_USES debug insns in LOOP that reference REG. */
1309 :
1310 : static void
1311 0 : reset_debug_uses_in_loop (class loop *loop, rtx reg, int debug_uses)
1312 : {
1313 0 : basic_block *body, bb;
1314 0 : unsigned i;
1315 0 : rtx_insn *insn;
1316 :
1317 0 : body = get_loop_body (loop);
1318 0 : for (i = 0; debug_uses && i < loop->num_nodes; i++)
1319 : {
1320 0 : bb = body[i];
1321 :
1322 0 : FOR_BB_INSNS (bb, insn)
1323 0 : if (!DEBUG_INSN_P (insn) || !rtx_referenced_p (reg, insn))
1324 0 : continue;
1325 : else
1326 : {
1327 0 : validate_change (insn, &INSN_VAR_LOCATION_LOC (insn),
1328 : gen_rtx_UNKNOWN_VAR_LOC (), 0);
1329 0 : if (!--debug_uses)
1330 : break;
1331 : }
1332 : }
1333 0 : free (body);
1334 0 : }
1335 :
1336 : /* Determine whether INSN contains an accumulator
1337 : which can be expanded into separate copies,
1338 : one for each copy of the LOOP body.
1339 :
1340 : for (i = 0 ; i < n; i++)
1341 : sum += a[i];
1342 :
1343 : ==>
1344 :
1345 : sum += a[i]
1346 : ....
1347 : i = i+1;
1348 : sum1 += a[i]
1349 : ....
1350 : i = i+1
1351 : sum2 += a[i];
1352 : ....
1353 :
1354 : Return NULL if INSN contains no opportunity for expansion of accumulator.
1355 : Otherwise, allocate a VAR_TO_EXPAND structure, fill it with the relevant
1356 : information and return a pointer to it.
1357 : */
1358 :
static struct var_to_expand *
analyze_insn_to_expand_var (class loop *loop, rtx_insn *insn)
{
  rtx set, dest, src;
  struct var_to_expand *ves;
  unsigned accum_pos;
  enum rtx_code code;
  int debug_uses = 0;

  /* Only a single-set insn can be an accumulation.  */
  set = single_set (insn);
  if (!set)
    return NULL;

  dest = SET_DEST (set);
  src = SET_SRC (set);
  code = GET_CODE (src);

  /* Only these four operations are supported for expansion.  */
  if (code != PLUS && code != MINUS && code != MULT && code != FMA)
    return NULL;

  /* Reassociating floating-point accumulations changes the result, so
     it is only allowed under the corresponding -f options.  */
  if (FLOAT_MODE_P (GET_MODE (dest)))
    {
      if (!flag_associative_math)
        return NULL;
      /* In the case of FMA, we're also changing the rounding.  */
      if (code == FMA && !flag_unsafe_math_optimizations)
	return NULL;
    }

  /* Hmm, this is a bit paradoxical.  We know that INSN is a valid insn
     in MD.  But if there is no optab to generate the insn, we cannot
     perform the variable expansion.  This can happen if an MD provides
     an insn but not a named pattern to generate it, for example to avoid
     producing code that needs additional mode switches like for x87/mmx.

     So we check have_insn_for which looks for an optab for the operation
     in SRC.  If it doesn't exist, we can't perform the expansion even
     though INSN is valid.  */
  if (!have_insn_for (code, GET_MODE (src)))
    return NULL;

  /* The accumulator must be a register or a subreg of a register.  */
  if (!REG_P (dest)
      && !(GET_CODE (dest) == SUBREG
           && REG_P (SUBREG_REG (dest))))
    return NULL;

  /* Find the accumulator use within the operation.  */
  if (code == FMA)
    {
      /* We only support accumulation via FMA in the ADD position.  */
      if (!rtx_equal_p (dest, XEXP (src, 2)))
	return NULL;
      accum_pos = 2;
    }
  else if (rtx_equal_p (dest, XEXP (src, 0)))
    accum_pos = 0;
  else if (rtx_equal_p (dest, XEXP (src, 1)))
    {
      /* The method of expansion that we are using; which includes the
	 initialization of the expansions with zero and the summation of
         the expansions at the end of the computation will yield wrong
	 results for (x = something - x) thus avoid using it in that case.  */
      if (code == MINUS)
	return NULL;
      accum_pos = 1;
    }
  else
    return NULL;

  /* It must not otherwise be used.  */
  if (code == FMA)
    {
      if (rtx_referenced_p (dest, XEXP (src, 0))
	  || rtx_referenced_p (dest, XEXP (src, 1)))
	return NULL;
    }
  else if (rtx_referenced_p (dest, XEXP (src, 1 - accum_pos)))
    return NULL;

  /* It must be used in exactly one insn.  */
  if (!referenced_in_one_insn_in_loop_p (loop, dest, &debug_uses))
    return NULL;

  if (dump_file)
    {
      fprintf (dump_file, "\n;; Expanding Accumulator ");
      print_rtl (dump_file, dest);
      fprintf (dump_file, "\n");
    }

  if (debug_uses)
    /* Instead of resetting the debug insns, we could replace each
       debug use in the loop with the sum or product of all expanded
       accumulators.  Since we'll only know of all expansions at the
       end, we'd have to keep track of which vars_to_expand a debug
       insn in the loop references, take note of each copy of the
       debug insn during unrolling, and when it's all done, compute
       the sum or product of each variable and adjust the original
       debug insn and each copy thereof.  What a pain!  */
    reset_debug_uses_in_loop (loop, dest, debug_uses);

  /* Record the accumulator to expand.  */
  ves = XNEW (struct var_to_expand);
  ves->insn = insn;
  ves->reg = copy_rtx (dest);
  ves->var_expansions.create (1);
  ves->next = NULL;
  ves->op = GET_CODE (src);
  ves->expansion_count = 0;
  ves->reuse_expansion = 0;
  return ves;
}
1471 :
1472 : /* Determine whether there is an induction variable in INSN that
1473 : we would like to split during unrolling.
1474 :
1475 : I.e. replace
1476 :
1477 : i = i + 1;
1478 : ...
1479 : i = i + 1;
1480 : ...
1481 : i = i + 1;
1482 : ...
1483 :
1484 : type chains by
1485 :
1486 : i0 = i + 1
1487 : ...
1488 : i = i0 + 1
1489 : ...
1490 : i = i0 + 2
1491 : ...
1492 :
1493 : Return NULL if INSN contains no interesting IVs. Otherwise, allocate
1494 : an IV_TO_SPLIT structure, fill it with the relevant information and return a
1495 : pointer to it. */
1496 :
1497 : static struct iv_to_split *
1498 433672 : analyze_iv_to_split_insn (rtx_insn *insn)
1499 : {
1500 433672 : rtx set, dest;
1501 433672 : class rtx_iv iv;
1502 433672 : struct iv_to_split *ivts;
1503 433672 : scalar_int_mode mode;
1504 433672 : bool ok;
1505 :
1506 : /* For now we just split the basic induction variables. Later this may be
1507 : extended for example by selecting also addresses of memory references. */
1508 433672 : set = single_set (insn);
1509 433672 : if (!set)
1510 : return NULL;
1511 :
1512 282352 : dest = SET_DEST (set);
1513 469401 : if (!REG_P (dest) || !is_a <scalar_int_mode> (GET_MODE (dest), &mode))
1514 : return NULL;
1515 :
1516 88697 : if (!biv_p (insn, mode, dest))
1517 : return NULL;
1518 :
1519 52973 : ok = iv_analyze_result (insn, dest, &iv);
1520 :
1521 : /* This used to be an assert under the assumption that if biv_p returns
1522 : true that iv_analyze_result must also return true. However, that
1523 : assumption is not strictly correct as evidenced by pr25569.
1524 :
1525 : Returning NULL when iv_analyze_result returns false is safe and
1526 : avoids the problems in pr25569 until the iv_analyze_* routines
1527 : can be fixed, which is apparently hard and time consuming
1528 : according to their author. */
1529 52973 : if (! ok)
1530 : return NULL;
1531 :
1532 52973 : if (iv.step == const0_rtx
1533 52973 : || iv.mode != iv.extend_mode)
1534 : return NULL;
1535 :
1536 : /* Record the insn to split. */
1537 52968 : ivts = XNEW (struct iv_to_split);
1538 52968 : ivts->insn = insn;
1539 52968 : ivts->orig_var = dest;
1540 52968 : ivts->base_var = NULL_RTX;
1541 52968 : ivts->step = iv.step;
1542 52968 : ivts->next = NULL;
1543 :
1544 52968 : return ivts;
1545 : }
1546 :
1547 : /* Determines which of insns in LOOP can be optimized.
1548 : Return a OPT_INFO struct with the relevant hash tables filled
1549 : with all insns to be optimized. The FIRST_NEW_BLOCK field
1550 : is undefined for the return value. */
1551 :
static struct opt_info *
analyze_insns_in_loop (class loop *loop)
{
  basic_block *body, bb;
  unsigned i;
  struct opt_info *opt_info = XCNEW (struct opt_info);
  rtx_insn *insn;
  struct iv_to_split *ivts = NULL;
  struct var_to_expand *ves = NULL;
  iv_to_split **slot1;
  var_to_expand **slot2;
  auto_vec<edge> edges = get_loop_exit_edges (loop);
  edge exit;
  bool can_apply = false;

  iv_analysis_loop_init (loop);

  body = get_loop_body (loop);

  /* Hash table plus an insertion-ordered list of IVs to split.  */
  if (flag_split_ivs_in_unroller)
    {
      opt_info->insns_to_split
	= new hash_table<iv_split_hasher> (5 * loop->num_nodes);
      opt_info->iv_to_split_head = NULL;
      opt_info->iv_to_split_tail = &opt_info->iv_to_split_head;
    }

  /* Record the loop exit bb and loop preheader before the unrolling.  */
  opt_info->loop_preheader = loop_preheader_edge (loop)->src;

  /* Variable expansion needs a single, non-complex exit edge; split it
     so the expansions can be combined there afterwards.  */
  if (edges.length () == 1)
    {
      exit = edges[0];
      if (!(exit->flags & EDGE_COMPLEX))
	{
	  opt_info->loop_exit = split_edge (exit);
	  can_apply = true;
	}
    }

  if (flag_variable_expansion_in_unroller
      && can_apply)
    {
      opt_info->insns_with_var_to_expand
	= new hash_table<var_expand_hasher> (5 * loop->num_nodes);
      opt_info->var_to_expand_head = NULL;
      opt_info->var_to_expand_tail = &opt_info->var_to_expand_head;
    }

  for (i = 0; i < loop->num_nodes; i++)
    {
      bb = body[i];
      /* Only consider blocks that are executed on every iteration,
	 i.e. that dominate the latch.  */
      if (!dominated_by_p (CDI_DOMINATORS, loop->latch, bb))
	continue;

      FOR_BB_INSNS (bb, insn)
	{
	  if (!INSN_P (insn))
	    continue;

	  if (opt_info->insns_to_split)
	    ivts = analyze_iv_to_split_insn (insn);

	  /* An insn is either an IV to split or a candidate for
	     expansion, never both.  */
	  if (ivts)
	    {
	      slot1 = opt_info->insns_to_split->find_slot (ivts, INSERT);
	      gcc_assert (*slot1 == NULL);
	      *slot1 = ivts;
	      *opt_info->iv_to_split_tail = ivts;
	      opt_info->iv_to_split_tail = &ivts->next;
	      continue;
	    }

	  if (opt_info->insns_with_var_to_expand)
	    ves = analyze_insn_to_expand_var (loop, insn);

	  if (ves)
	    {
	      slot2 = opt_info->insns_with_var_to_expand->find_slot (ves, INSERT);
	      gcc_assert (*slot2 == NULL);
	      *slot2 = ves;
	      *opt_info->var_to_expand_tail = ves;
	      opt_info->var_to_expand_tail = &ves->next;
	    }
	}
    }

  free (body);
  return opt_info;
}
1642 :
1643 : /* Called just before loop duplication. Records start of duplicated area
1644 : to OPT_INFO. */
1645 :
1646 : static void
1647 52171 : opt_info_start_duplication (struct opt_info *opt_info)
1648 : {
1649 52171 : if (opt_info)
1650 52171 : opt_info->first_new_block = last_basic_block_for_fn (cfun);
1651 0 : }
1652 :
1653 : /* Determine the number of iterations between initialization of the base
1654 : variable and the current copy (N_COPY). N_COPIES is the total number
1655 : of newly created copies. UNROLLING is true if we are unrolling
1656 : (not peeling) the loop. */
1657 :
static unsigned
determine_split_iv_delta (unsigned n_copy, unsigned n_copies, bool unrolling)
{
  /* If we are unrolling, initialization is done in the original loop
     body (number 0), so the delta is just the copy number.  */
  if (unrolling)
    return n_copy;

  /* If we are peeling, the copy in that the initialization occurs has
     number 1.  The original loop (number 0) is the last, after all
     N_COPIES peeled copies.  */
  return n_copy ? n_copy - 1 : n_copies;
}
1677 :
1678 : /* Allocate basic variable for the induction variable chain. */
1679 :
1680 : static void
1681 53250 : allocate_basic_variable (struct iv_to_split *ivts)
1682 : {
1683 53250 : rtx expr = SET_SRC (single_set (ivts->insn));
1684 :
1685 53250 : ivts->base_var = gen_reg_rtx (GET_MODE (expr));
1686 53250 : }
1687 :
1688 : /* Insert initialization of basic variable of IVTS before INSN, taking
1689 : the initial value from INSN. */
1690 :
1691 : static void
1692 64623 : insert_base_initialization (struct iv_to_split *ivts, rtx_insn *insn)
1693 : {
1694 64623 : rtx expr = copy_rtx (SET_SRC (single_set (insn)));
1695 64623 : rtx_insn *seq;
1696 :
1697 64623 : start_sequence ();
1698 64623 : expr = force_operand (expr, ivts->base_var);
1699 64623 : if (expr != ivts->base_var)
1700 246 : emit_move_insn (ivts->base_var, expr);
1701 64623 : seq = end_sequence ();
1702 :
1703 64623 : emit_insn_before (seq, insn);
1704 64623 : }
1705 :
1706 : /* Replace the use of induction variable described in IVTS in INSN
1707 : by base variable + DELTA * step. */
1708 :
static void
split_iv (struct iv_to_split *ivts, rtx_insn *insn, unsigned delta)
{
  rtx expr, *loc, incr, var;
  rtx_insn *seq;
  machine_mode mode = GET_MODE (ivts->base_var);
  rtx src, dest, set;

  /* Construct base + DELTA * step.  */
  if (!delta)
    expr = ivts->base_var;
  else
    {
      incr = simplify_gen_binary (MULT, mode,
				  copy_rtx (ivts->step),
				  gen_int_mode (delta, mode));
      expr = simplify_gen_binary (PLUS, GET_MODE (ivts->base_var),
				  ivts->base_var, incr);
    }

  /* Figure out where to do the replacement.  */
  loc = &SET_SRC (single_set (insn));

  /* If we can make the replacement right away, we're done.  */
  if (validate_change (insn, loc, expr, 0))
    return;

  /* Otherwise, force EXPR into a register and try again.  */
  start_sequence ();
  var = gen_reg_rtx (mode);
  expr = force_operand (expr, var);
  if (expr != var)
    emit_move_insn (var, expr);
  seq = end_sequence ();
  emit_insn_before (seq, insn);

  if (validate_change (insn, loc, var, 0))
    return;

  /* The last chance.  Try recreating the assignment in insn
     completely from scratch.  */
  set = single_set (insn);
  gcc_assert (set);

  start_sequence ();
  /* Substitute the register unconditionally; the emitted sequence
     plus deleting INSN below replaces the insn entirely.  */
  *loc = var;
  src = copy_rtx (SET_SRC (set));
  dest = copy_rtx (SET_DEST (set));
  src = force_operand (src, dest);
  if (src != dest)
    emit_move_insn (dest, src);
  seq = end_sequence ();

  emit_insn_before (seq, insn);
  delete_insn (insn);
}
1765 :
1766 :
1767 : /* Return one expansion of the accumulator recorded in struct VE. */
1768 :
1769 : static rtx
1770 18 : get_expansion (struct var_to_expand *ve)
1771 : {
1772 18 : rtx reg;
1773 :
1774 18 : if (ve->reuse_expansion == 0)
1775 9 : reg = ve->reg;
1776 : else
1777 9 : reg = ve->var_expansions[ve->reuse_expansion - 1];
1778 :
1779 36 : if (ve->var_expansions.length () == (unsigned) ve->reuse_expansion)
1780 9 : ve->reuse_expansion = 0;
1781 : else
1782 9 : ve->reuse_expansion++;
1783 :
1784 18 : return reg;
1785 : }
1786 :
1787 :
1788 : /* Given INSN replace the uses of the accumulator recorded in VE
1789 : with a new register. */
1790 :
1791 : static void
1792 21 : expand_var_during_unrolling (struct var_to_expand *ve, rtx_insn *insn)
1793 : {
1794 21 : rtx new_reg, set;
1795 21 : bool really_new_expansion = false;
1796 :
1797 21 : set = single_set (insn);
1798 21 : gcc_assert (set);
1799 :
1800 : /* Generate a new register only if the expansion limit has not been
1801 : reached. Else reuse an already existing expansion. */
1802 21 : if (param_max_variable_expansions > ve->expansion_count)
1803 : {
1804 3 : really_new_expansion = true;
1805 3 : new_reg = gen_reg_rtx (GET_MODE (ve->reg));
1806 : }
1807 : else
1808 18 : new_reg = get_expansion (ve);
1809 :
1810 21 : validate_replace_rtx_group (SET_DEST (set), new_reg, insn);
1811 21 : if (apply_change_group ())
1812 21 : if (really_new_expansion)
1813 : {
1814 3 : ve->var_expansions.safe_push (new_reg);
1815 3 : ve->expansion_count++;
1816 : }
1817 21 : }
1818 :
1819 : /* Initialize the variable expansions in loop preheader. PLACE is the
1820 : loop-preheader basic block where the initialization of the
1821 : expansions should take place. The expansions are initialized with
1822 : (-0) when the operation is plus or minus to honor sign zero. This
1823 : way we can prevent cases where the sign of the final result is
1824 : effected by the sign of the expansion. Here is an example to
1825 : demonstrate this:
1826 :
1827 : for (i = 0 ; i < n; i++)
1828 : sum += something;
1829 :
1830 : ==>
1831 :
1832 : sum += something
1833 : ....
1834 : i = i+1;
1835 : sum1 += something
1836 : ....
1837 : i = i+1
1838 : sum2 += something;
1839 : ....
1840 :
1841 : When SUM is initialized with -zero and SOMETHING is also -zero; the
1842 : final result of sum should be -zero thus the expansions sum1 and sum2
1843 : should be initialized with -zero as well (otherwise we will get +zero
1844 : as the final result). */
1845 :
1846 : static void
1847 3 : insert_var_expansion_initialization (struct var_to_expand *ve,
1848 : basic_block place)
1849 : {
1850 3 : rtx_insn *seq;
1851 3 : rtx var, zero_init;
1852 3 : unsigned i;
1853 3 : machine_mode mode = GET_MODE (ve->reg);
1854 9 : bool has_signed_zero_p = MODE_HAS_SIGNED_ZEROS (mode);
1855 :
1856 3 : if (ve->var_expansions.length () == 0)
1857 3 : return;
1858 :
1859 3 : start_sequence ();
1860 3 : switch (ve->op)
1861 : {
1862 : case FMA:
1863 : /* Note that we only accumulate FMA via the ADD operand. */
1864 : case PLUS:
1865 : case MINUS:
1866 6 : FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1867 : {
1868 3 : if (has_signed_zero_p)
1869 2 : zero_init = simplify_gen_unary (NEG, mode, CONST0_RTX (mode), mode);
1870 : else
1871 1 : zero_init = CONST0_RTX (mode);
1872 3 : emit_move_insn (var, zero_init);
1873 : }
1874 : break;
1875 :
1876 : case MULT:
1877 0 : FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1878 : {
1879 0 : zero_init = CONST1_RTX (GET_MODE (var));
1880 0 : emit_move_insn (var, zero_init);
1881 : }
1882 : break;
1883 :
1884 0 : default:
1885 0 : gcc_unreachable ();
1886 : }
1887 :
1888 3 : seq = end_sequence ();
1889 :
1890 3 : emit_insn_after (seq, BB_END (place));
1891 : }
1892 :
1893 : /* Combine the variable expansions at the loop exit. PLACE is the
1894 : loop exit basic block where the summation of the expansions should
1895 : take place. */
1896 :
1897 : static void
1898 3 : combine_var_copies_in_loop_exit (struct var_to_expand *ve, basic_block place)
1899 : {
1900 3 : rtx sum = ve->reg;
1901 3 : rtx expr, var;
1902 3 : rtx_insn *seq, *insn;
1903 3 : unsigned i;
1904 :
1905 3 : if (ve->var_expansions.length () == 0)
1906 3 : return;
1907 :
1908 : /* ve->reg might be SUBREG or some other non-shareable RTL, and we use
1909 : it both here and as the destination of the assignment. */
1910 3 : sum = copy_rtx (sum);
1911 3 : start_sequence ();
1912 3 : switch (ve->op)
1913 : {
1914 : case FMA:
1915 : /* Note that we only accumulate FMA via the ADD operand. */
1916 : case PLUS:
1917 : case MINUS:
1918 6 : FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1919 3 : sum = simplify_gen_binary (PLUS, GET_MODE (ve->reg), var, sum);
1920 : break;
1921 :
1922 : case MULT:
1923 0 : FOR_EACH_VEC_ELT (ve->var_expansions, i, var)
1924 0 : sum = simplify_gen_binary (MULT, GET_MODE (ve->reg), var, sum);
1925 : break;
1926 :
1927 0 : default:
1928 0 : gcc_unreachable ();
1929 : }
1930 :
1931 3 : expr = force_operand (sum, ve->reg);
1932 3 : if (expr != ve->reg)
1933 0 : emit_move_insn (ve->reg, expr);
1934 3 : seq = end_sequence ();
1935 :
1936 3 : insn = BB_HEAD (place);
1937 3 : while (!NOTE_INSN_BASIC_BLOCK_P (insn))
1938 0 : insn = NEXT_INSN (insn);
1939 :
1940 3 : emit_insn_after (seq, insn);
1941 : }
1942 :
1943 : /* Strip away REG_EQUAL notes for IVs we're splitting.
1944 :
1945 : Updating REG_EQUAL notes for IVs we split is tricky: We
1946 : cannot tell until after unrolling, DF-rescanning, and liveness
1947 : updating, whether an EQ_USE is reached by the split IV while
1948 : the IV reg is still live. See PR55006.
1949 :
1950 : ??? We cannot use remove_reg_equal_equiv_notes_for_regno,
1951 : because RTL loop-iv requires us to defer rescanning insns and
1952 : any notes attached to them. So resort to old techniques... */
1953 :
1954 : static void
1955 138437781 : maybe_strip_eq_note_for_split_iv (struct opt_info *opt_info, rtx_insn *insn)
1956 : {
1957 138437781 : struct iv_to_split *ivts;
1958 138437781 : rtx note = find_reg_equal_equiv_note (insn);
1959 138437781 : if (! note)
1960 : return;
1961 10308704 : for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
1962 5253916 : if (reg_mentioned_p (ivts->orig_var, note))
1963 : {
1964 21043 : remove_note (insn, note);
1965 21043 : return;
1966 : }
1967 : }
1968 :
/* Apply loop optimizations in loop copies using the
   data which gathered during the unrolling.  Structure
   OPT_INFO record that data.

   UNROLLING is true if we unrolled (not peeled) the loop.
   REWRITE_ORIGINAL_LOOP is true if we should also rewrite the original body of
   the loop (as it should happen in complete unrolling, but not in ordinary
   peeling of the loop).  */

static void
apply_opt_in_copies (struct opt_info *opt_info,
		     unsigned n_copies, bool unrolling,
		     bool rewrite_original_loop)
{
  unsigned i, delta;
  basic_block bb, orig_bb;
  rtx_insn *insn, *orig_insn, *next;
  struct iv_to_split ivts_templ, *ivts;
  struct var_to_expand ve_templ, *ves;

  /* Sanity check -- we need to put initialization in the original loop
     body.  */
  gcc_assert (!unrolling || rewrite_original_loop);

  /* Allocate the basic variables (i0).  */
  if (opt_info->insns_to_split)
    for (ivts = opt_info->iv_to_split_head; ivts; ivts = ivts->next)
      allocate_basic_variable (ivts);

  /* First pass: walk every newly created (copied) basic block and apply
     the recorded optimizations to its insns, keeping ORIG_INSN in
     lock-step with INSN so the hash-table lookups (which are keyed on
     the original insn) find the right entry.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);

      /* bb->aux holds position in copy sequence initialized by
	 duplicate_loop_body_to_header_edge.  */
      delta = determine_split_iv_delta ((size_t)bb->aux, n_copies,
					unrolling);
      bb->aux = 0;
      orig_insn = BB_HEAD (orig_bb);
      FOR_BB_INSNS_SAFE (bb, insn, next)
	{
	  /* Skip non-insns, and debug bindings of labels, which do not
	     correspond to anything in the original stream.  */
	  if (!INSN_P (insn)
	      || (DEBUG_BIND_INSN_P (insn)
		  && INSN_VAR_LOCATION_DECL (insn)
		  && TREE_CODE (INSN_VAR_LOCATION_DECL (insn)) == LABEL_DECL))
	    continue;

	  /* Advance ORIG_INSN past the same kind of skippable insns so
	     it stays paired with INSN.  */
	  while (!INSN_P (orig_insn)
		 || (DEBUG_BIND_INSN_P (orig_insn)
		     && INSN_VAR_LOCATION_DECL (orig_insn)
		     && (TREE_CODE (INSN_VAR_LOCATION_DECL (orig_insn))
			 == LABEL_DECL)))
	    orig_insn = NEXT_INSN (orig_insn);

	  ivts_templ.insn = orig_insn;
	  ve_templ.insn = orig_insn;

	  /* Apply splitting iv optimization.  */
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, insn);

	      ivts = opt_info->insns_to_split->find (&ivts_templ);

	      if (ivts)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));

		  /* DELTA is zero only for the copy that should carry
		     the base initialization of the split IV.  */
		  if (!delta)
		    insert_base_initialization (ivts, insn);
		  split_iv (ivts, insn, delta);
		}
	    }
	  /* Apply variable expansion optimization.  */
	  if (unrolling && opt_info->insns_with_var_to_expand)
	    {
	      ves = (struct var_to_expand *)
		opt_info->insns_with_var_to_expand->find (&ve_templ);
	      if (ves)
		{
		  gcc_assert (GET_CODE (PATTERN (insn))
			      == GET_CODE (PATTERN (orig_insn)));
		  expand_var_during_unrolling (ves, insn);
		}
	    }
	  orig_insn = NEXT_INSN (orig_insn);
	}
    }

  if (!rewrite_original_loop)
    return;

  /* Initialize the variable expansions in the loop preheader
     and take care of combining them at the loop exit.  */
  if (opt_info->insns_with_var_to_expand)
    {
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	insert_var_expansion_initialization (ves, opt_info->loop_preheader);
      for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
	combine_var_copies_in_loop_exit (ves, opt_info->loop_exit);
    }

  /* Rewrite also the original loop body.  Find them as originals of the blocks
     in the last copied iteration, i.e. those that have
     get_bb_copy (get_bb_original (bb)) == bb.  */
  for (i = opt_info->first_new_block;
       i < (unsigned) last_basic_block_for_fn (cfun);
       i++)
    {
      bb = BASIC_BLOCK_FOR_FN (cfun, i);
      orig_bb = get_bb_original (bb);
      if (get_bb_copy (orig_bb) != bb)
	continue;

      delta = determine_split_iv_delta (0, n_copies, unrolling);
      /* NOTE(review): the loop bound is NEXT_INSN (BB_END (bb)) — the
	 end of the COPY — while iteration starts at the head of
	 ORIG_BB.  This presumably relies on the insn chain layout after
	 duplication placing the original body before its last copy;
	 confirm before changing.  */
      for (orig_insn = BB_HEAD (orig_bb);
	   orig_insn != NEXT_INSN (BB_END (bb));
	   orig_insn = next)
	{
	  next = NEXT_INSN (orig_insn);

	  if (!INSN_P (orig_insn))
	    continue;

	  ivts_templ.insn = orig_insn;
	  if (opt_info->insns_to_split)
	    {
	      maybe_strip_eq_note_for_split_iv (opt_info, orig_insn);

	      ivts = (struct iv_to_split *)
		opt_info->insns_to_split->find (&ivts_templ);
	      if (ivts)
		{
		  /* DELTA is zero here for peeling; the base
		     initialization goes in front of the rewritten
		     original insn.  */
		  if (!delta)
		    insert_base_initialization (ivts, orig_insn);
		  split_iv (ivts, orig_insn, delta);
		  continue;
		}
	    }

	}
    }
}
2116 :
2117 : /* Release OPT_INFO. */
2118 :
2119 : static void
2120 50739 : free_opt_info (struct opt_info *opt_info)
2121 : {
2122 50739 : delete opt_info->insns_to_split;
2123 50739 : opt_info->insns_to_split = NULL;
2124 50739 : if (opt_info->insns_with_var_to_expand)
2125 : {
2126 6 : struct var_to_expand *ves;
2127 :
2128 9 : for (ves = opt_info->var_to_expand_head; ves; ves = ves->next)
2129 3 : ves->var_expansions.release ();
2130 6 : delete opt_info->insns_with_var_to_expand;
2131 6 : opt_info->insns_with_var_to_expand = NULL;
2132 : }
2133 50739 : free (opt_info);
2134 50739 : }
|