LCOV - code coverage report
Current view: top level - gcc - gimple-loop-jam.cc (source / functions) Coverage Total Hit
Test: gcc.info Lines: 97.8 % 225 220
Test Date: 2024-12-21 13:15:12 Functions: 100.0 % 10 10
Legend: Lines: hit not hit | Branches: + taken - not taken # not executed Branches: - 0 0

             Branch data     Line data    Source code
       1                 :             : /* Loop unroll-and-jam.
       2                 :             :    Copyright (C) 2017-2024 Free Software Foundation, Inc.
       3                 :             : 
       4                 :             : This file is part of GCC.
       5                 :             : 
       6                 :             : GCC is free software; you can redistribute it and/or modify it
       7                 :             : under the terms of the GNU General Public License as published by the
       8                 :             : Free Software Foundation; either version 3, or (at your option) any
       9                 :             : later version.
      10                 :             : 
      11                 :             : GCC is distributed in the hope that it will be useful, but WITHOUT
      12                 :             : ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
      13                 :             : FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
      14                 :             : for more details.
      15                 :             : 
      16                 :             : You should have received a copy of the GNU General Public License
      17                 :             : along with GCC; see the file COPYING3.  If not see
      18                 :             : <http://www.gnu.org/licenses/>.  */
      19                 :             : 
      20                 :             : #include "config.h"
      21                 :             : #include "system.h"
      22                 :             : #include "coretypes.h"
      23                 :             : #include "tree-pass.h"
      24                 :             : #include "backend.h"
      25                 :             : #include "tree.h"
      26                 :             : #include "gimple.h"
      27                 :             : #include "ssa.h"
      28                 :             : #include "fold-const.h"
      29                 :             : #include "tree-cfg.h"
      30                 :             : #include "tree-ssa.h"
      31                 :             : #include "tree-ssa-loop-niter.h"
      32                 :             : #include "tree-ssa-loop.h"
      33                 :             : #include "tree-ssa-loop-manip.h"
      34                 :             : #include "cfgloop.h"
      35                 :             : #include "tree-scalar-evolution.h"
      36                 :             : #include "gimple-iterator.h"
      37                 :             : #include "cfghooks.h"
      38                 :             : #include "tree-data-ref.h"
      39                 :             : #include "tree-ssa-loop-ivopts.h"
      40                 :             : #include "tree-vectorizer.h"
      41                 :             : #include "tree-ssa-sccvn.h"
      42                 :             : #include "tree-cfgcleanup.h"
      43                 :             : 
      44                 :             : /* Unroll and Jam transformation
      45                 :             : 
      46                 :             :    This is a combination of two transformations, where the second
      47                 :             :    is not always valid.  It's applicable if a loop nest has redundancies
      48                 :             :    over the iterations of an outer loop while not having that with
      49                 :             :    an inner loop.
      50                 :             : 
      51                 :             :    Given this nest:
      52                 :             :        for (i) {
      53                 :             :          for (j) {
      54                 :             :            B(i,j)
      55                 :             :          }
      56                 :             :        }
      57                 :             : 
      58                 :             :    first unroll:
      59                 :             :        for (i by 2) {
      60                 :             :          for (j) {
      61                 :             :            B(i,j)
      62                 :             :          }
      63                 :             :          for (j) {
      64                 :             :            B(i+1,j)
      65                 :             :          }
      66                 :             :        }
      67                 :             : 
      68                 :             :    then fuse the two adjacent inner loops resulting from that:
      69                 :             :        for (i by 2) {
      70                 :             :          for (j) {
      71                 :             :            B(i,j)
      72                 :             :            B(i+1,j)
      73                 :             :          }
      74                 :             :        }
      75                 :             : 
      76                 :             :    As the order of evaluations of the body B changes this is valid
      77                 :             :    only in certain situations: all distance vectors need to be forward.
      78                 :             :    Additionally if there are multiple induction variables than just
      79                 :             :    a counting control IV (j above) we can also deal with some situations.
      80                 :             : 
      81                 :             :    The validity is checked by unroll_jam_possible_p, and the data-dep
      82                 :             :    testing below.
      83                 :             : 
      84                 :             :    A trivial example where the fusion is wrong would be when
      85                 :             :    B(i,j) == x[j-1] = x[j];
      86                 :             :        for (i by 2) {
      87                 :             :          for (j) {
      88                 :             :            x[j-1] = x[j];
      89                 :             :          }
      90                 :             :          for (j) {
      91                 :             :            x[j-1] = x[j];
      92                 :             :          }
      93                 :             :        }  effect: move content to front by two elements
      94                 :             :        -->
      95                 :             :        for (i by 2) {
      96                 :             :          for (j) {
      97                 :             :            x[j-1] = x[j];
      98                 :             :            x[j-1] = x[j];
      99                 :             :          }
     100                 :             :        }  effect: move content to front by one element
     101                 :             : */
     102                 :             : 
     103                 :             : /* Modify the loop tree for the fact that all code once belonging
     104                 :             :    to the OLD loop or the outer loop of OLD now is inside LOOP.  */
     105                 :             : 
     106                 :             : static void
     107                 :          71 : merge_loop_tree (class loop *loop, class loop *old)
     108                 :             : {
     109                 :          71 :   basic_block *bbs;
     110                 :          71 :   int i, n;
     111                 :          71 :   class loop *subloop;
     112                 :          71 :   edge e;
     113                 :          71 :   edge_iterator ei;
     114                 :             : 
     115                 :             :   /* Find its nodes.  */
     116                 :          71 :   bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
     117                 :          71 :   n = get_loop_body_with_size (loop, bbs, n_basic_blocks_for_fn (cfun));
     118                 :             : 
     119                 :         624 :   for (i = 0; i < n; i++)
     120                 :             :     {
     121                 :             :       /* If the block was direct child of OLD loop it's now part
     122                 :             :          of LOOP.  If it was outside OLD, then it moved into LOOP
     123                 :             :          as well.  This avoids changing the loop father for BBs
     124                 :             :          in inner loops of OLD.  */
     125                 :        1027 :       if (bbs[i]->loop_father == old
     126                 :         956 :           || loop_depth (bbs[i]->loop_father) < loop_depth (old))
     127                 :             :         {
     128                 :         474 :           remove_bb_from_loops (bbs[i]);
     129                 :         474 :           add_bb_to_loop (bbs[i], loop);
     130                 :         474 :           continue;
     131                 :             :         }
     132                 :             : 
     133                 :             :       /* If we find a direct subloop of OLD, move it to LOOP.  */
     134                 :          79 :       subloop = bbs[i]->loop_father;
     135                 :          79 :       if (loop_outer (subloop) == old && subloop->header == bbs[i])
     136                 :             :         {
     137                 :           0 :           flow_loop_tree_node_remove (subloop);
     138                 :           0 :           flow_loop_tree_node_add (loop, subloop);
     139                 :             :         }
     140                 :             :     }
     141                 :             : 
     142                 :             :   /* Update the information about loop exit edges.  */
     143                 :         624 :   for (i = 0; i < n; i++)
     144                 :             :     {
     145                 :        1201 :       FOR_EACH_EDGE (e, ei, bbs[i]->succs)
     146                 :             :         {
     147                 :         648 :           rescan_loop_exit (e, false, false);
     148                 :             :         }
     149                 :             :     }
     150                 :             : 
     151                 :          71 :   loop->num_nodes = n;
     152                 :             : 
     153                 :          71 :   free (bbs);
     154                 :          71 : }
     155                 :             : 
     156                 :             : /* BB is part of the outer loop of an unroll-and-jam situation.
     157                 :             :    Check if any statements therein would prevent the transformation.  */
     158                 :             : 
     159                 :             : static bool
     160                 :        6750 : bb_prevents_fusion_p (basic_block bb)
     161                 :             : {
     162                 :        6750 :   gimple_stmt_iterator gsi;
     163                 :             :   /* BB is duplicated by outer unrolling and then all N-1 first copies
     164                 :             :      move into the body of the fused inner loop.  If BB exits the outer loop
     165                 :             :      the last copy still does so, and the first N-1 copies are cancelled
     166                 :             :      by loop unrolling, so also after fusion it's the exit block.
     167                 :             :      But there might be other reasons that prevent fusion:
     168                 :             :        * stores or unknown side-effects prevent fusion
     169                 :             :        * loads don't
     170                 :             :        * computations into SSA names: these aren't problematic.  Their
     171                 :             :          result will be unused on the exit edges of the first N-1 copies
     172                 :             :          (those aren't taken after unrolling).  If they are used on the
     173                 :             :          other edge (the one leading to the outer latch block) they are
     174                 :             :          loop-carried (on the outer loop) and the Nth copy of BB will
     175                 :             :          compute them again (i.e. the first N-1 copies will be dead).  */
     176                 :       26724 :   for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi); gsi_next (&gsi))
     177                 :             :     {
     178                 :       13492 :       gimple *g = gsi_stmt (gsi);
     179                 :       22802 :       if (gimple_vdef (g) || gimple_has_side_effects (g))
     180                 :         268 :         return true;
     181                 :             :     }
     182                 :             :   return false;
     183                 :             : }
     184                 :             : 
     185                 :             : /* Given an inner loop LOOP (of some OUTER loop) determine if
     186                 :             :    we can safely fuse copies of it (generated by outer unrolling).
     187                 :             :    If so return true, otherwise return false.  */
     188                 :             : 
     189                 :             : static bool
     190                 :       14189 : unroll_jam_possible_p (class loop *outer, class loop *loop)
     191                 :             : {
     192                 :       14189 :   basic_block *bbs;
     193                 :       14189 :   int i, n;
     194                 :       14189 :   class tree_niter_desc niter;
     195                 :             : 
     196                 :             :   /* When fusing the loops we skip the latch block
     197                 :             :      of the first one, so it mustn't have any effects to
     198                 :             :      preserve.  */
     199                 :       14189 :   if (!empty_block_p (loop->latch))
     200                 :             :     return false;
     201                 :             : 
     202                 :       13614 :   edge exit;
     203                 :       13614 :   if (!(exit = single_exit (loop)))
     204                 :             :     return false;
     205                 :             : 
     206                 :             :   /* We need a perfect nest.  Quick check for adjacent inner loops.  */
     207                 :        9787 :   if (outer->inner != loop || loop->next)
     208                 :             :     return false;
     209                 :             : 
     210                 :             :   /* Prevent head-controlled inner loops, that we usually have.
     211                 :             :      The guard block would need to be accepted
     212                 :             :      (invariant condition either entering or skipping the loop),
     213                 :             :      without also accepting arbitrary control flow.  When unswitching
     214                 :             :      ran before us (as with -O3) this won't be a problem because its
     215                 :             :      outer loop unswitching will have moved out the invariant condition.
     216                 :             : 
     217                 :             :      If we do that we need to extend fuse_loops() to cope with this
     218                 :             :      by threading through the (still invariant) copied condition
     219                 :             :      between the two loop copies.  */
     220                 :        5762 :   if (!dominated_by_p (CDI_DOMINATORS, outer->latch, loop->header))
     221                 :             :     return false;
     222                 :             : 
     223                 :             :   /* The number of iterations of the inner loop must be loop invariant
     224                 :             :      with respect to the outer loop.  */
     225                 :        3582 :   if (!number_of_iterations_exit (loop, single_exit (loop), &niter,
     226                 :             :                                  false, true)
     227                 :        3203 :       || niter.cmp == ERROR_MARK
     228                 :        3203 :       || !integer_zerop (niter.may_be_zero)
     229                 :        6564 :       || !expr_invariant_in_loop_p (outer, niter.niter))
     230                 :         923 :     return false;
     231                 :             : 
     232                 :             :   /* If the inner loop produces any values that are used inside the
     233                 :             :      outer loop (except the virtual op) then it can flow
     234                 :             :      back (perhaps indirectly) into the inner loop.  This prevents
     235                 :             :      fusion: without fusion the value at the last iteration is used,
     236                 :             :      with fusion the value after the initial iteration is used.
     237                 :             : 
     238                 :             :      If all uses are outside the outer loop this doesn't prevent fusion;
     239                 :             :      the value of the last iteration is still used (and the values from
     240                 :             :      all intermediate iterations are dead).  */
     241                 :        2659 :   gphi_iterator psi;
     242                 :        2659 :   for (psi = gsi_start_phis (single_exit (loop)->dest);
     243                 :        5044 :        !gsi_end_p (psi); gsi_next (&psi))
     244                 :             :     {
     245                 :        2754 :       imm_use_iterator imm_iter;
     246                 :        2754 :       use_operand_p use_p;
     247                 :        2754 :       tree op = gimple_phi_result (psi.phi ());
     248                 :        5508 :       if (virtual_operand_p (op))
     249                 :        2295 :         continue;
     250                 :         557 :       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, op)
     251                 :             :         {
     252                 :         467 :           gimple *use_stmt = USE_STMT (use_p);
     253                 :         467 :           if (!is_gimple_debug (use_stmt)
     254                 :         467 :               && flow_bb_inside_loop_p (outer, gimple_bb (use_stmt)))
     255                 :         369 :             return false;
     256                 :             :         }
     257                 :             :     }
     258                 :             : 
     259                 :             :   /* And check blocks belonging to just outer loop.  */
     260                 :        2290 :   bbs = XNEWVEC (basic_block, n_basic_blocks_for_fn (cfun));
     261                 :        2290 :   n = get_loop_body_with_size (outer, bbs, n_basic_blocks_for_fn (cfun));
     262                 :             : 
     263                 :       13379 :   for (i = 0; i < n; i++)
     264                 :       11392 :     if (bbs[i]->loop_father == outer
     265                 :       11392 :         && (bb_prevents_fusion_p (bbs[i])
     266                 :             :             /* Outer loop exits must come after the inner loop, otherwise
     267                 :             :                we'll put the outer loop exit into the fused inner loop.  */
     268                 :        6482 :             || (loop_exits_from_bb_p (outer, bbs[i])
     269                 :        2024 :                 && !dominated_by_p (CDI_DOMINATORS, bbs[i], exit->src))))
     270                 :             :       break;
     271                 :        2290 :   free (bbs);
     272                 :        2290 :   if (i != n)
     273                 :             :     return false;
     274                 :             : 
     275                 :             :   /* For now we can safely fuse copies of LOOP only if all
     276                 :             :      loop carried variables are inductions (or the virtual op).
     277                 :             : 
     278                 :             :      We could handle reductions as well (the initial value in the second
     279                 :             :      body would be the after-iter value of the first body) if it's over
     280                 :             :      an associative and commutative operation.  We wouldn't
     281                 :             :      be able to handle unknown cycles.  */
     282                 :        5240 :   for (psi = gsi_start_phis (loop->header); !gsi_end_p (psi); gsi_next (&psi))
     283                 :             :     {
     284                 :        3642 :       affine_iv iv;
     285                 :        3642 :       tree op = gimple_phi_result (psi.phi ());
     286                 :             : 
     287                 :        7284 :       if (virtual_operand_p (op))
     288                 :        1583 :         continue;
     289                 :        2059 :       if (!simple_iv (loop, loop, op, &iv, true))
     290                 :         389 :         return false;
     291                 :             :       /* The inductions must be regular, loop invariant step and initial
     292                 :             :          value.  */
     293                 :        2038 :       if (!expr_invariant_in_loop_p (outer, iv.step)
     294                 :        2038 :           || !expr_invariant_in_loop_p (outer, iv.base))
     295                 :         368 :         return false;
     296                 :             :       /* XXX With more effort we could also be able to deal with inductions
     297                 :             :          where the initial value is loop variant but a simple IV in the
     298                 :             :          outer loop.  The initial value for the second body would be
     299                 :             :          the original initial value plus iv.base.step.  The next value
     300                 :             :          for the fused loop would be the original next value of the first
     301                 :             :          copy, _not_ the next value of the second body.  */
     302                 :             :     }
     303                 :             : 
     304                 :             :   return true;
     305                 :       14189 : }
     306                 :             : 
     307                 :             : /* Fuse LOOP with all further neighbors.  The loops are expected to
     308                 :             :    be in appropriate form.  */
     309                 :             : 
     310                 :             : static void
     311                 :          71 : fuse_loops (class loop *loop)
     312                 :             : {
     313                 :          71 :   class loop *next = loop->next;
     314                 :             : 
     315                 :         142 :   while (next)
     316                 :             :     {
     317                 :          71 :       edge e;
     318                 :             : 
     319                 :          71 :       remove_branch (single_pred_edge (loop->latch));
     320                 :             :       /* Make delete_basic_block not fiddle with the loop structure.  */
     321                 :          71 :       basic_block oldlatch = loop->latch;
     322                 :          71 :       loop->latch = NULL;
     323                 :          71 :       delete_basic_block (oldlatch);
     324                 :          71 :       e = redirect_edge_and_branch (loop_latch_edge (next),
     325                 :             :                                     loop->header);
     326                 :          71 :       loop->latch = e->src;
     327                 :          71 :       flush_pending_stmts (e);
     328                 :             : 
     329                 :          71 :       gcc_assert (EDGE_COUNT (next->header->preds) == 1);
     330                 :             : 
     331                 :             :       /* The PHI nodes of the second body (single-argument now)
     332                 :             :          need adjustments to use the right values: either directly
     333                 :             :          the value of the corresponding PHI in the first copy or
     334                 :             :          the one leaving the first body which unrolling did for us.
     335                 :             : 
     336                 :             :          See also unroll_jam_possible_p() for further possibilities.  */
     337                 :          71 :       gphi_iterator psi_first, psi_second;
     338                 :          71 :       e = single_pred_edge (next->header);
     339                 :          71 :       for (psi_first = gsi_start_phis (loop->header),
     340                 :          71 :            psi_second = gsi_start_phis (next->header);
     341                 :         198 :            !gsi_end_p (psi_first);
     342                 :         127 :            gsi_next (&psi_first), gsi_next (&psi_second))
     343                 :             :         {
     344                 :         127 :           gphi *phi_first = psi_first.phi ();
     345                 :         127 :           gphi *phi_second = psi_second.phi ();
     346                 :         127 :           tree firstop = gimple_phi_result (phi_first);
     347                 :             :           /* The virtual operand is correct already as it's
     348                 :             :              always live at exit, hence has a LCSSA node and outer
     349                 :             :              loop unrolling updated SSA form.  */
     350                 :         254 :           if (virtual_operand_p (firstop))
     351                 :          55 :             continue;
     352                 :             : 
     353                 :             :           /* Due to unroll_jam_possible_p() we know that this is
     354                 :             :              an induction.  The second body goes over the same
     355                 :             :              iteration space.  */
     356                 :          72 :           add_phi_arg (phi_second, firstop, e,
     357                 :             :                        gimple_location (phi_first));
     358                 :             :         }
     359                 :          71 :       gcc_assert (gsi_end_p (psi_second));
     360                 :             : 
     361                 :          71 :       merge_loop_tree (loop, next);
     362                 :          71 :       gcc_assert (!next->num_nodes);
     363                 :          71 :       class loop *ln = next->next;
     364                 :          71 :       delete_loop (next);
     365                 :          71 :       next = ln;
     366                 :             :     }
     367                 :          71 : }
     368                 :             : 
     369                 :             : /* Return true if any of the access functions for dataref A
     370                 :             :    isn't invariant with respect to loop LOOP_NEST.  */
     371                 :             : static bool
     372                 :        3650 : any_access_function_variant_p (const struct data_reference *a,
     373                 :             :                                const class loop *loop_nest)
     374                 :             : {
     375                 :        3650 :   vec<tree> fns = DR_ACCESS_FNS (a);
     376                 :             : 
     377                 :       11487 :   for (tree t : fns)
     378                 :        4175 :     if (!evolution_function_is_invariant_p (t, loop_nest->num))
     379                 :             :       return true;
     380                 :             : 
     381                 :             :   return false;
     382                 :             : }
     383                 :             : 
     384                 :             : /* Returns true if the distance in DDR can be determined and adjusts
     385                 :             :    the unroll factor in *UNROLL to make unrolling valid for that distance.
     386                 :             :    Otherwise return false.  DDR is with respect to the outer loop of INNER.
     387                 :             : 
     388                 :             :    If this data dep can lead to a removed memory reference, increment
     389                 :             :    *REMOVED and adjust *PROFIT_UNROLL to be the necessary unroll factor
     390                 :             :    for this to happen.  */
     391                 :             : 
     392                 :             : static bool
     393                 :        3627 : adjust_unroll_factor (class loop *inner, struct data_dependence_relation *ddr,
     394                 :             :                       unsigned *unroll, unsigned *profit_unroll,
     395                 :             :                       unsigned *removed)
     396                 :             : {
     397                 :        3627 :   bool ret = false;
     398                 :        3627 :   if (DDR_ARE_DEPENDENT (ddr) != chrec_known)
     399                 :             :     {
     400                 :        3627 :       if (DDR_NUM_DIST_VECTS (ddr) == 0)
     401                 :        3627 :         return false;
     402                 :             :       unsigned i;
     403                 :             :       lambda_vector dist_v;
     404                 :        3705 :       FOR_EACH_VEC_ELT (DDR_DIST_VECTS (ddr), i, dist_v)
     405                 :             :         {
     406                 :             :           /* A distance (a,b) is at worst transformed into (a/N,b) by the
     407                 :             :              unrolling (factor N), so the transformation is valid if
     408                 :             :              a >= N, or b > 0, or b is zero and a > 0.  Otherwise the unroll
     409                 :             :              factor needs to be limited so that the first condition holds.
     410                 :             :              That may limit the factor down to zero in the worst case.  */
     411                 :        2435 :           lambda_int dist = dist_v[0];
     412                 :        2435 :           if (dist < 0)
     413                 :           0 :             gcc_unreachable ();
     414                 :        2435 :           else if (dist >= (lambda_int)*unroll)
     415                 :             :             ;
     416                 :        7143 :           else if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
     417                 :             :             {
     418                 :             :               /* We have (a,0) with a < N, so this will be transformed into
     419                 :             :                  (0,0) after unrolling by N.  This might potentially be a
     420                 :             :                  problem, if it's not a read-read dependency.  */
     421                 :        2303 :               if (DR_IS_READ (DDR_A (ddr)) && DR_IS_READ (DDR_B (ddr)))
     422                 :             :                 ;
     423                 :             :               else
     424                 :             :                 {
     425                 :             :                   /* So, at least one is a write, and we might reduce the
     426                 :             :                      distance vector to (0,0).  This is still no problem
     427                 :             :                      if both data-refs are affine with respect to the inner
     428                 :             :                      loops.  But if one of them is invariant with respect
     429                 :             :                      to an inner loop our reordering implicit in loop fusion
     430                 :             :                      corrupts the program, as our data dependences don't
     431                 :             :                      capture this.  E.g. for:
     432                 :             :                        for (0 <= i < n)
     433                 :             :                          for (0 <= j < m)
     434                 :             :                            a[i][0] = a[i+1][0] + 2;    // (1)
     435                 :             :                            b[i][j] = b[i+1][j] + 2;    // (2)
     436                 :             :                      the distance vector for both statements is (-1,0),
     437                 :             :                      but exchanging the order for (2) is okay, while
     438                 :             :                      for (1) it is not.  To see this, write out the original
     439                 :             :                      accesses (assume m is 2):
     440                 :             :                        a i j original
     441                 :             :                        0 0 0 r a[1][0] b[1][0]
     442                 :             :                        1 0 0 w a[0][0] b[0][0]
     443                 :             :                        2 0 1 r a[1][0] b[1][1]
     444                 :             :                        3 0 1 w a[0][0] b[0][1]
     445                 :             :                        4 1 0 r a[2][0] b[2][0]
     446                 :             :                        5 1 0 w a[1][0] b[1][0]
     447                 :             :                      after unroll-by-2 and fusion the accesses are done in
     448                 :             :                      this order (from column a): 0,1, 4,5, 2,3, i.e. this:
     449                 :             :                        a i j transformed
     450                 :             :                        0 0 0 r a[1][0] b[1][0]
     451                 :             :                        1 0 0 w a[0][0] b[0][0]
     452                 :             :                        4 1 0 r a[2][0] b[2][0]
     453                 :             :                        5 1 0 w a[1][0] b[1][0]
     454                 :             :                        2 0 1 r a[1][0] b[1][1]
     455                 :             :                        3 0 1 w a[0][0] b[0][1]
     456                 :             :                      Note how access 2 accesses the same element as access 5
     457                 :             :                      for array 'a' but not for array 'b'.  */
     458                 :        1831 :                   if (any_access_function_variant_p (DDR_A (ddr), inner)
     459                 :        1831 :                       && any_access_function_variant_p (DDR_B (ddr), inner))
     460                 :             :                     ;
     461                 :             :                   else
     462                 :             :                     /* And if any dataref of this pair is invariant with
     463                 :             :                        respect to the inner loop, we have no chance than
     464                 :             :                        to reduce the unroll factor.  */
     465                 :          12 :                     *unroll = dist;
     466                 :             :                 }
     467                 :             :             }
     468                 :        2479 :           else if (lambda_vector_lexico_pos (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
     469                 :             :             ;
     470                 :             :           else
     471                 :          44 :             *unroll = dist;
     472                 :             : 
     473                 :             :           /* With a distance (a,0) it's always profitable to unroll-and-jam
     474                 :             :              (by a+1), because one memory reference will go away.  With
     475                 :             :              (a,b) and b != 0 that's less clear.  We will increase the
     476                 :             :              number of streams without lowering the number of mem refs.
     477                 :             :              So for now only handle the first situation.  */
     478                 :        7305 :           if (lambda_vector_zerop (dist_v + 1, DDR_NB_LOOPS (ddr) - 1))
     479                 :             :             {
     480                 :        2307 :               *profit_unroll = MAX (*profit_unroll, (unsigned)dist + 1);
     481                 :        2307 :               (*removed)++;
     482                 :             :             }
     483                 :             : 
     484                 :        2435 :           ret = true;
     485                 :             :         }
     486                 :             :     }
     487                 :             :   return ret;
     488                 :             : }
     489                 :             : 
     490                 :             : /* Main entry point for the unroll-and-jam transformation
     491                 :             :    described above.  */
     492                 :             : 
     493                 :             : static unsigned int
     494                 :       25297 : tree_loop_unroll_and_jam (void)
     495                 :             : {
     496                 :       25297 :   unsigned int todo = 0;
     497                 :             : 
     498                 :       25297 :   gcc_assert (scev_initialized_p ());
     499                 :             : 
     500                 :             :   /* Go through all innermost loops.  */
     501                 :      137738 :   for (auto loop : loops_list (cfun, LI_ONLY_INNERMOST))
     502                 :             :     {
     503                 :       61847 :       class loop *outer = loop_outer (loop);
     504                 :             : 
     505                 :       61847 :       if (loop_depth (loop) < 2
     506                 :       61847 :           || optimize_loop_nest_for_size_p (outer))
     507                 :       60662 :         continue;
     508                 :             : 
     509                 :       14189 :       if (!unroll_jam_possible_p (outer, loop))
     510                 :       12591 :         continue;
     511                 :             : 
     512                 :        1598 :       vec<data_reference_p> datarefs = vNULL;
     513                 :        1598 :       vec<ddr_p> dependences = vNULL;
     514                 :        1598 :       unsigned unroll_factor, profit_unroll, removed;
     515                 :        1598 :       class tree_niter_desc desc;
     516                 :        1598 :       bool unroll = false;
     517                 :             : 
     518                 :        1598 :       auto_vec<loop_p, 3> loop_nest;
     519                 :        1598 :       if (!compute_data_dependences_for_loop (outer, true, &loop_nest,
     520                 :             :                                               &datarefs, &dependences))
     521                 :             :         {
     522                 :         322 :           if (dump_file && (dump_flags & TDF_DETAILS))
     523                 :           0 :             fprintf (dump_file, "Cannot analyze data dependencies\n");
     524                 :         322 :           free_data_refs (datarefs);
     525                 :         322 :           free_dependence_relations (dependences);
     526                 :         322 :           continue;
     527                 :             :         }
     528                 :        1276 :       if (!datarefs.length ())
     529                 :          91 :         continue;
     530                 :             : 
     531                 :        1185 :       if (dump_file && (dump_flags & TDF_DETAILS))
     532                 :          14 :         dump_data_dependence_relations (dump_file, dependences);
     533                 :             : 
     534                 :        1185 :       unroll_factor = (unsigned)-1;
     535                 :        1185 :       profit_unroll = 1;
     536                 :        1185 :       removed = 0;
     537                 :             : 
     538                 :             :       /* Check all dependencies.  */
     539                 :        1185 :       unsigned i;
     540                 :        1185 :       struct data_dependence_relation *ddr;
     541                 :       40542 :       FOR_EACH_VEC_ELT (dependences, i, ddr)
     542                 :             :         {
     543                 :       39686 :           struct data_reference *dra, *drb;
     544                 :             : 
     545                 :             :           /* If the refs are independend there's nothing to do.  */
     546                 :       39686 :           if (DDR_ARE_DEPENDENT (ddr) == chrec_known)
     547                 :       32226 :             continue;
     548                 :             : 
     549                 :        7460 :           dra = DDR_A (ddr);
     550                 :        7460 :           drb = DDR_B (ddr);
     551                 :             : 
     552                 :             :           /* Nothing interesting for the self dependencies, except for WAW if
     553                 :             :              the access function is not affine or constant because we may end
     554                 :             :              up reordering writes to the same location.  */
     555                 :        7460 :           if (dra == drb)
     556                 :             :             {
     557                 :        7518 :               if (DR_IS_WRITE (dra)
     558                 :        2037 :                   && !DR_ACCESS_FNS (dra).is_empty ()
     559                 :        5865 :                   && DDR_ARE_DEPENDENT (ddr) == chrec_dont_know)
     560                 :             :                 {
     561                 :         148 :                   unroll_factor = 0;
     562                 :         148 :                   break;
     563                 :             :                 }
     564                 :             :               else
     565                 :        3685 :                 continue;
     566                 :             :             }
     567                 :             : 
     568                 :             :           /* Now check the distance vector, for determining a sensible
     569                 :             :              outer unroll factor, and for validity of merging the inner
     570                 :             :              loop copies.  */
     571                 :        3627 :           if (!adjust_unroll_factor (loop, ddr, &unroll_factor, &profit_unroll,
     572                 :             :                                      &removed))
     573                 :             :             {
     574                 :             :               /* Couldn't get the distance vector.  For two reads that's
     575                 :             :                  harmless (we assume we should unroll).  For at least
     576                 :             :                  one write this means we can't check the dependence direction
     577                 :             :                  and hence can't determine safety.  */
     578                 :             : 
     579                 :        2357 :               if (DR_IS_WRITE (dra) || DR_IS_WRITE (drb))
     580                 :             :                 {
     581                 :         181 :                   unroll_factor = 0;
     582                 :         181 :                   break;
     583                 :             :                 }
     584                 :             :             }
     585                 :             :         }
     586                 :             : 
     587                 :             :       /* We regard a user-specified minimum percentage of zero as a request
     588                 :             :          to ignore all profitability concerns and apply the transformation
     589                 :             :          always.  */
     590                 :        1185 :       if (!param_unroll_jam_min_percent)
     591                 :          24 :         profit_unroll = MAX(2, profit_unroll);
     592                 :        1161 :       else if (removed * 100 / datarefs.length ()
     593                 :        1161 :           < (unsigned)param_unroll_jam_min_percent)
     594                 :         938 :         profit_unroll = 1;
     595                 :        1185 :       if (unroll_factor > profit_unroll)
     596                 :         814 :         unroll_factor = profit_unroll;
     597                 :        1185 :       if (unroll_factor > (unsigned)param_unroll_jam_max_unroll)
     598                 :           0 :         unroll_factor = param_unroll_jam_max_unroll;
     599                 :        2441 :       unroll = (unroll_factor > 1
     600                 :        1185 :                 && can_unroll_loop_p (outer, unroll_factor, &desc));
     601                 :             : 
     602                 :          71 :       if (unroll)
     603                 :             :         {
     604                 :          71 :           if (dump_enabled_p ())
     605                 :           8 :             dump_printf_loc (MSG_OPTIMIZED_LOCATIONS | TDF_DETAILS,
     606                 :           8 :                              find_loop_location (outer),
     607                 :             :                              "applying unroll and jam with factor %d\n",
     608                 :             :                              unroll_factor);
     609                 :          71 :           initialize_original_copy_tables ();
     610                 :          71 :           tree_unroll_loop (outer, unroll_factor, &desc);
     611                 :          71 :           free_original_copy_tables ();
     612                 :          71 :           fuse_loops (outer->inner);
     613                 :          71 :           todo |= TODO_cleanup_cfg;
     614                 :             : 
     615                 :          71 :           auto_bitmap exit_bbs;
     616                 :          71 :           bitmap_set_bit (exit_bbs, single_exit (outer)->dest->index);
     617                 :          71 :           todo |= do_rpo_vn (cfun, loop_preheader_edge (outer), exit_bbs);
     618                 :          71 :         }
     619                 :             : 
     620                 :        1185 :       loop_nest.release ();
     621                 :        1185 :       free_dependence_relations (dependences);
     622                 :        1185 :       free_data_refs (datarefs);
     623                 :        1598 :     }
     624                 :             : 
     625                 :       25297 :   if (todo)
     626                 :             :     {
     627                 :          63 :       free_dominance_info (CDI_DOMINATORS);
     628                 :             :       /* We need to cleanup the CFG first since otherwise SSA form can
     629                 :             :          be not up-to-date from the VN run.  */
     630                 :          63 :       if (todo & TODO_cleanup_cfg)
     631                 :             :         {
     632                 :          63 :           cleanup_tree_cfg ();
     633                 :          63 :           todo &= ~TODO_cleanup_cfg;
     634                 :             :         }
     635                 :          63 :       rewrite_into_loop_closed_ssa (NULL, 0);
     636                 :          63 :       scev_reset ();
     637                 :             :     }
     638                 :       25297 :   return todo;
     639                 :             : }
     640                 :             : 
     641                 :             : /* Pass boilerplate */
     642                 :             : 
     643                 :             : namespace {
     644                 :             : 
     645                 :             : const pass_data pass_data_loop_jam =
     646                 :             : {
     647                 :             :   GIMPLE_PASS, /* type */
     648                 :             :   "unrolljam", /* name */
     649                 :             :   OPTGROUP_LOOP, /* optinfo_flags */
     650                 :             :   TV_LOOP_JAM, /* tv_id */
     651                 :             :   PROP_cfg, /* properties_required */
     652                 :             :   0, /* properties_provided */
     653                 :             :   0, /* properties_destroyed */
     654                 :             :   0, /* todo_flags_start */
     655                 :             :   0, /* todo_flags_finish */
     656                 :             : };
     657                 :             : 
     658                 :             : class pass_loop_jam : public gimple_opt_pass
     659                 :             : {
     660                 :             : public:
     661                 :      280114 :   pass_loop_jam (gcc::context *ctxt)
     662                 :      560228 :     : gimple_opt_pass (pass_data_loop_jam, ctxt)
     663                 :             :   {}
     664                 :             : 
     665                 :             :   /* opt_pass methods: */
     666                 :      225983 :   bool gate (function *) final override { return flag_unroll_jam != 0; }
     667                 :             :   unsigned int execute (function *) final override;
     668                 :             : 
     669                 :             : };
     670                 :             : 
     671                 :             : unsigned int
     672                 :       25297 : pass_loop_jam::execute (function *fun)
     673                 :             : {
     674                 :       50594 :   if (number_of_loops (fun) <= 1)
     675                 :             :     return 0;
     676                 :             : 
     677                 :       25297 :   return tree_loop_unroll_and_jam ();
     678                 :             : }
     679                 :             : 
     680                 :             : }
     681                 :             : 
     682                 :             : gimple_opt_pass *
     683                 :      280114 : make_pass_loop_jam (gcc::context *ctxt)
     684                 :             : {
     685                 :      280114 :   return new pass_loop_jam (ctxt);
     686                 :             : }
        

Generated by: LCOV version 2.1-beta

LCOV profile is generated on x86_64 machine using following configure options: configure --disable-bootstrap --enable-coverage=opt --enable-languages=c,c++,fortran,go,jit,lto,rust,m2 --enable-host-shared. GCC test suite is run with the built compiler.