Hi,
For now, the distribution pass only handles the innermost loop.  This patch
extends the pass to cover two-level innermost loop nests.  It also refactors
code in pass_loop_distribution::execute for better readability.  Note I
restrict it to 2-level loop nests on purpose because of the high cost of
data dependence computation.  Some compilation-time optimizations, like
reusing the data reference finding and data dependence computation, would
require a rewrite of this pass, like the proposed loop interchange
implementation.  But that's another task.

This patch introduces a temporary TODO for loop nest builtin partitions,
which is covered by the next two patches.

With this patch, the kernel loop in bwaves can now be distributed, thus
exposing it for further interchange.  This patch adds a new test for matrix
multiplication, as well as adjusting the test strings of existing tests.
Bootstrapped and tested as part of the patch set on x86_64 and AArch64.  Is it OK?

Thanks,
bin
2017-10-04  Bin Cheng  <bin.ch...@arm.com>

        * tree-loop-distribution.c: Adjust the general comment.
        (NUM_PARTITION_THRESHOLD): New macro.
        (ssa_name_has_uses_outside_loop_p): Support loop nest distribution.
        (classify_partition): Skip builtin pattern of loop nest's inner loop.
        (merge_dep_scc_partitions): New parameter ignore_alias_p and use it
        in call to build_partition_graph.
        (finalize_partitions): New parameter LOOP.  Make loop distribution
        more conservative by fusing more partitions.
        (distribute_loop): Don't do runtime alias check in case of loop nest
        distribution.
        (find_seed_stmts_for_distribution): New function.
        (pass_loop_distribution::execute): Refactor code finding seed stmts
        into above function.  Support loop nest distribution for two-level
        innermost loop nest.  Adjust dump information.

gcc/testsuite/ChangeLog
2017-10-04  Bin Cheng  <bin.ch...@arm.com>

        * gcc.dg/tree-ssa/ldist-7.c: Adjust test string.
        * gcc.dg/tree-ssa/ldist-16.c: Ditto.
        * gcc.dg/tree-ssa/ldist-25.c: Ditto.
        * gcc.dg/tree-ssa/ldist-33.c: Ditto.
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-16.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ldist-16.c
index f43b64e..f4f3a44 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-16.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-16.c
@@ -16,5 +16,5 @@ void foo (int n)
 
 /* We should not apply loop distribution and not generate a memset (0).  */
 
-/* { dg-final { scan-tree-dump "Loop 1 is the same" "ldist" } } */
+/* { dg-final { scan-tree-dump "Loop 1 not distributed" "ldist" } } */
 /* { dg-final { scan-tree-dump-times "generated memset zero" 0 "ldist" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-25.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ldist-25.c
index 699bf38..c0b95fc 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-25.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-25.c
@@ -22,4 +22,4 @@ foo (void)
     }
 }
 
-/* { dg-final { scan-tree-dump "Loop . is the same" "ldist" } } */
+/* { dg-final { scan-tree-dump "Loop . not distributed" "ldist" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-33.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ldist-33.c
new file mode 100644
index 0000000..24d27fd
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-33.c
@@ -0,0 +1,21 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -ftree-loop-distribution -ftree-loop-distribute-patterns 
-fdump-tree-ldist-details" } */
+
+#define N (1024)
+double a[N][N], b[N][N], c[N][N];
+
+void
+foo (void)
+{
+  unsigned i, j, k;
+
+  for (i = 0; i < N; ++i)
+    for (j = 0; j < N; ++j)
+      {
+       c[i][j] = 0.0;
+       for (k = 0; k < N; ++k)
+         c[i][j] += a[i][k] * b[k][j];
+      }
+}
+
+/* { dg-final { scan-tree-dump "Loop nest . distributed: split to 1 loops and 
1 library" "ldist" } } */
diff --git a/gcc/testsuite/gcc.dg/tree-ssa/ldist-7.c 
b/gcc/testsuite/gcc.dg/tree-ssa/ldist-7.c
index f31d051..2eb1f74 100644
--- a/gcc/testsuite/gcc.dg/tree-ssa/ldist-7.c
+++ b/gcc/testsuite/gcc.dg/tree-ssa/ldist-7.c
@@ -28,4 +28,4 @@ int loop1 (int k)
   return a[1000-2] + b[1000-1] + c[1000-2] + d[1000-2];
 }
 
-/* { dg-final { scan-tree-dump-times "distributed" 0 "ldist" } } */
+/* { dg-final { scan-tree-dump-times "distributed: " 0 "ldist" } } */
diff --git a/gcc/tree-loop-distribution.c b/gcc/tree-loop-distribution.c
index 999b32e..59a968c 100644
--- a/gcc/tree-loop-distribution.c
+++ b/gcc/tree-loop-distribution.c
@@ -83,8 +83,8 @@ along with GCC; see the file COPYING3.  If not see
        loops and recover to the original one.
 
    TODO:
-     1) We only distribute innermost loops now.  This pass should handle loop
-       nests in the future.
+     1) We only distribute innermost two-level loop nest now.  We should
+       extend it for arbitrary loop nests in the future.
      2) We only fuse partitions in SCC now.  A better fusion algorithm is
        desired to minimize loop overhead, maximize parallelism and maximize
        data reuse.  */
@@ -118,6 +118,11 @@ along with GCC; see the file COPYING3.  If not see
 #define MAX_DATAREFS_NUM \
        ((unsigned) PARAM_VALUE (PARAM_LOOP_MAX_DATAREFS_FOR_DATADEPS))
 
+/* Threshold controlling the number of distributed partitions.  Since it
+   may become unnecessary if a memory stream cost model is invented in the
+   future, we define it as a temporary macro rather than a parameter.  */
+#define NUM_PARTITION_THRESHOLD (4)
+
 /* Hashtable helpers.  */
 
 struct ddr_hasher : nofree_ptr_hash <struct data_dependence_relation>
@@ -714,9 +719,11 @@ ssa_name_has_uses_outside_loop_p (tree def, loop_p loop)
 
   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, def)
     {
-      gimple *use_stmt = USE_STMT (use_p);
-      if (!is_gimple_debug (use_stmt)
-         && loop != loop_containing_stmt (use_stmt))
+      if (is_gimple_debug (USE_STMT (use_p)))
+       continue;
+
+      basic_block use_bb = gimple_bb (USE_STMT (use_p));
+      if (use_bb == NULL || !flow_bb_inside_loop_p (loop, use_bb))
        return true;
     }
 
@@ -1414,6 +1421,22 @@ classify_partition (loop_p loop, struct graph *rdg, 
partition *partition,
   if (!single_store)
     return;
 
+  /* TODO: We don't support memset/memcpy distribution for loop nest yet.  */
+  if (loop->inner)
+    {
+      basic_block bb = gimple_bb (DR_STMT (single_store));
+
+      if (bb->loop_father != loop)
+       return;
+
+      if (single_load)
+       {
+         bb = gimple_bb (DR_STMT (single_load));
+         if (bb->loop_father != loop)
+           return;
+       }
+    }
+
   nb_iter = number_of_latch_executions (loop);
   gcc_assert (nb_iter && nb_iter != chrec_dont_know);
   if (dominated_by_p (CDI_DOMINATORS, single_exit (loop)->src,
@@ -1964,16 +1987,18 @@ sort_partitions_by_post_order (struct graph *pg,
 }
 
 /* Given reduced dependence graph RDG merge strong connected components
-   of PARTITIONS.  In this function, data dependence caused by possible
-   alias between references is ignored, as if it doesn't exist at all.  */
+   of PARTITIONS.  If IGNORE_ALIAS_P is true, data dependence caused by
+   possible alias between references is ignored, as if it doesn't exist
+   at all; otherwise all dependences are considered.  */
 
 static void
 merge_dep_scc_partitions (struct graph *rdg,
-                         vec<struct partition *> *partitions)
+                         vec<struct partition *> *partitions,
+                         bool ignore_alias_p)
 {
   struct partition *partition1, *partition2;
   struct pg_vdata *data;
-  graph *pg = build_partition_graph (rdg, partitions, true);
+  graph *pg = build_partition_graph (rdg, partitions, ignore_alias_p);
   int i, j, num_sccs = graphds_scc (pg, NULL);
 
   /* Strong connected compoenent means dependence cycle, we cannot distribute
@@ -2325,38 +2350,49 @@ version_for_distribution_p (vec<struct partition *> 
*partitions,
   return (alias_ddrs->length () > 0);
 }
 
-/* Fuse all partitions if necessary before finalizing distribution.  */
+/* Fuse PARTITIONS of LOOP if necessary before finalizing distribution.
+   ALIAS_DDRS contains ddrs which need runtime alias check.  */
 
 static void
-finalize_partitions (vec<struct partition *> *partitions,
+finalize_partitions (struct loop *loop, vec<struct partition *> *partitions,
                     vec<ddr_p> *alias_ddrs)
 {
   unsigned i;
-  struct partition *a, *partition;
+  struct partition *partition, *a;
 
   if (partitions->length () == 1
       || alias_ddrs->length () > 0)
     return;
 
-  a = (*partitions)[0];
-  if (a->kind != PKIND_NORMAL)
-    return;
-
-  for (i = 1; partitions->iterate (i, &partition); ++i)
+  unsigned num_builtin = 0, num_normal = 0;
+  bool same_type_p = true;
+  enum partition_type type = ((*partitions)[0])->type;
+  for (i = 0; partitions->iterate (i, &partition); ++i)
     {
-      /* Don't fuse if partition has different type or it is a builtin.  */
-      if (partition->type != a->type
-         || partition->kind != PKIND_NORMAL)
-       return;
+      same_type_p &= (type == partition->type);
+      if (partition->kind != PKIND_NORMAL)
+       num_builtin++;
+      else
+       num_normal++;
     }
 
-  /* Fuse all partitions.  */
-  for (i = 1; partitions->iterate (i, &partition); ++i)
+  /* Don't distribute current loop into too many loops given we don't have
+     memory stream cost model.  Be even more conservative in case of loop
+     nest distribution.  */
+  if ((same_type_p && num_builtin == 0)
+      || (loop->inner != NULL
+         && i >= NUM_PARTITION_THRESHOLD && num_normal > 1)
+      || (loop->inner == NULL
+         && i >= NUM_PARTITION_THRESHOLD && num_normal > num_builtin))
     {
-      partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
-      partition_free (partition);
+      a = (*partitions)[0];
+      for (i = 1; partitions->iterate (i, &partition); ++i)
+       {
+         partition_merge_into (NULL, a, partition, FUSE_FINALIZE);
+         partition_free (partition);
+       }
+      partitions->truncate (1);
     }
-  partitions->truncate (1);
 }
 
 /* Distributes the code from LOOP in such a way that producer statements
@@ -2509,16 +2545,25 @@ distribute_loop (struct loop *loop, vec<gimple *> stmts,
        i--;
     }
 
-  /* Build the partition dependency graph.  */
+  /* Build the partition dependency graph and fuse partitions in strong
+     connected component.  */
   if (partitions.length () > 1)
     {
-      merge_dep_scc_partitions (rdg, &partitions);
-      alias_ddrs.truncate (0);
-      if (partitions.length () > 1)
-       break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
+      /* Don't support loop nest distribution under runtime alias check
+        since it's not likely to enable many vectorization opportunities.  */
+      if (loop->inner)
+       {
+         merge_dep_scc_partitions (rdg, &partitions, false);
+       }
+      else
+       {
+         merge_dep_scc_partitions (rdg, &partitions, true);
+         if (partitions.length () > 1)
+           break_alias_scc_partitions (rdg, &partitions, &alias_ddrs);
+       }
     }
 
-  finalize_partitions (&partitions, &alias_ddrs);
+  finalize_partitions (loop, &partitions, &alias_ddrs);
 
   nbp = partitions.length ();
   if (nbp == 0
@@ -2599,6 +2644,58 @@ public:
 
 }; // class pass_loop_distribution
 
+
+/* Given LOOP, this function records seed statements for distribution in
+   WORK_LIST.  Return false if there is nothing for distribution.  */
+
+static bool
+find_seed_stmts_for_distribution (struct loop *loop, vec<gimple *> *work_list)
+{
+  basic_block *bbs = get_loop_body_in_dom_order (loop);
+
+  /* Initialize the worklist with stmts we seed the partitions with.  */
+  for (unsigned i = 0; i < loop->num_nodes; ++i)
+    {
+      for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
+          !gsi_end_p (gsi); gsi_next (&gsi))
+       {
+         gphi *phi = gsi.phi ();
+         if (virtual_operand_p (gimple_phi_result (phi)))
+           continue;
+         /* Distribute stmts which have defs that are used outside of
+            the loop.  */
+         if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
+           continue;
+         work_list->safe_push (phi);
+       }
+      for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
+          !gsi_end_p (gsi); gsi_next (&gsi))
+       {
+         gimple *stmt = gsi_stmt (gsi);
+
+         /* If there is a stmt with side-effects bail out - we
+            cannot and should not distribute this loop.  */
+         if (gimple_has_side_effects (stmt))
+           {
+             free (bbs);
+             return false;
+           }
+
+         /* Distribute stmts which have defs that are used outside of
+            the loop.  */
+         if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
+           ;
+         /* Otherwise only distribute stores for now.  */
+         else if (!gimple_vdef (stmt))
+           continue;
+
+         work_list->safe_push (stmt);
+       }
+    }
+  free (bbs);
+  return work_list->length () > 0;
+}
+
 unsigned int
 pass_loop_distribution::execute (function *fun)
 {
@@ -2641,18 +2738,9 @@ pass_loop_distribution::execute (function *fun)
      walking to innermost loops.  */
   FOR_EACH_LOOP (loop, LI_ONLY_INNERMOST)
     {
-      auto_vec<gimple *> work_list;
-      basic_block *bbs;
-      int num = loop->num;
-      unsigned int i;
-
-      /* If the loop doesn't have a single exit we will fail anyway,
-        so do that early.  */
-      if (!single_exit (loop))
-       continue;
-
-      /* Only optimize hot loops.  */
-      if (!optimize_loop_for_speed_p (loop))
+      /* Don't distribute multiple exit edges loop, or cold loop.  */
+      if (!single_exit (loop)
+         || !optimize_loop_for_speed_p (loop))
        continue;
 
       /* Don't distribute loop if niters is unknown.  */
@@ -2660,56 +2748,30 @@ pass_loop_distribution::execute (function *fun)
       if (niters == NULL_TREE || niters == chrec_dont_know)
        continue;
 
-      /* Initialize the worklist with stmts we seed the partitions with.  */
-      bbs = get_loop_body_in_dom_order (loop);
-      for (i = 0; i < loop->num_nodes; ++i)
+      bool loop_nest_p = false;
+      struct loop *outer = loop_outer (loop);
+
+      /* Support loop nest distribution enclosing current innermost loop.
+        For the moment, we only support the innermost two-level loop nest.  */
+      if (flag_tree_loop_distribution
+         && outer->num > 0 && outer->inner == loop && loop->next == NULL
+         && single_exit (outer)
+         && optimize_loop_for_speed_p (loop)
+         && !chrec_contains_symbols_defined_in_loop (niters, outer->num)
+         && (niters = number_of_latch_executions (outer)) != NULL_TREE
+         && niters != chrec_dont_know)
        {
-         for (gphi_iterator gsi = gsi_start_phis (bbs[i]);
-              !gsi_end_p (gsi);
-              gsi_next (&gsi))
-           {
-             gphi *phi = gsi.phi ();
-             if (virtual_operand_p (gimple_phi_result (phi)))
-               continue;
-             /* Distribute stmts which have defs that are used outside of
-                the loop.  */
-             if (!stmt_has_scalar_dependences_outside_loop (loop, phi))
-               continue;
-             work_list.safe_push (phi);
-           }
-         for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
-              !gsi_end_p (gsi);
-              gsi_next (&gsi))
-           {
-             gimple *stmt = gsi_stmt (gsi);
-
-             /* If there is a stmt with side-effects bail out - we
-                cannot and should not distribute this loop.  */
-             if (gimple_has_side_effects (stmt))
-               {
-                 work_list.truncate (0);
-                 goto out;
-               }
-
-             /* Distribute stmts which have defs that are used outside of
-                the loop.  */
-             if (stmt_has_scalar_dependences_outside_loop (loop, stmt))
-               ;
-             /* Otherwise only distribute stores for now.  */
-             else if (!gimple_vdef (stmt))
-               continue;
-
-             work_list.safe_push (stmt);
-           }
+         loop = outer;
+         loop_nest_p = true;
        }
-out:
-      free (bbs);
 
-      int nb_generated_loops = 0;
-      int nb_generated_calls = 0;
-      location_t loc = find_loop_location (loop);
-      if (work_list.length () > 0)
+      for (; loop; loop = loop->inner, loop_nest_p = false)
        {
+         auto_vec<gimple *> work_list;
+         if (!find_seed_stmts_for_distribution (loop, &work_list))
+           break;
+
+         location_t loc = find_loop_location (loop);
          if (!cd)
            {
              calculate_dominance_info (CDI_DOMINATORS);
@@ -2717,24 +2779,30 @@ out:
              cd = new control_dependences ();
              free_dominance_info (CDI_POST_DOMINATORS);
            }
+
          bool destroy_p;
+         int nb_generated_loops, nb_generated_calls;
          nb_generated_loops = distribute_loop (loop, work_list, cd,
                                                &nb_generated_calls,
                                                &destroy_p);
          if (destroy_p)
            loops_to_be_destroyed.safe_push (loop);
-       }
 
-      if (nb_generated_loops + nb_generated_calls > 0)
-       {
-         changed = true;
-         dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
-                          loc, "Loop %d distributed: split to %d loops "
-                          "and %d library calls.\n",
-                          num, nb_generated_loops, nb_generated_calls);
+         if (nb_generated_loops + nb_generated_calls > 0)
+           {
+             changed = true;
+             dump_printf_loc (MSG_OPTIMIZED_LOCATIONS,
+                              loc, "Loop%s %d distributed: split to %d loops "
+                              "and %d library calls.\n",
+                              loop_nest_p ? " nest" : "", loop->num,
+                              nb_generated_loops, nb_generated_calls);
+
+             break;
+           }
+
+         if (dump_file && (dump_flags & TDF_DETAILS))
+           fprintf (dump_file, "Loop %d not distributed.\n", loop->num);
        }
-      else if (dump_file && (dump_flags & TDF_DETAILS))
-       fprintf (dump_file, "Loop %d is the same.\n", num);
     }
 
   if (cd)

Reply via email to