Hi!

With the previously posted patch to use RANGE_EXPRs in build_vec_init,
the following patch attempts to do what the reporters were asking for
- determine if it wouldn't be more efficient to use (perhaps nested) loops
to initialize at runtime some automatic variable over having huge .rodata
constants that are copied over into the automatic variables or used as the
variable itself.

E.g. on the PR87436 testcase, we have RANGE_EXPR x4096 nested in a
RANGE_EXPR x4096 with 24 byte innermost constructor.  Without this patch
we emit 384MB long .rodata constant initializer that is then copied over
in the constructor, with this patch we just have 2 nested loops.

The patch extends categorize_ctor_elements so that it gives next to the
number of all scalars and number of non-zero scalars also something that
allows us to guess how many RANGE_EXPRs are in there, i.e. how large the
runtime code initializing that ctor would be.  It keeps doing the old thing
for all initializers < 64 bytes, I think for those generally using runtime
loops wouldn't be beneficial.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

If the C FE starts using RANGE_EXPRs at some point e.g. for:
struct S { int a, b, c, d; };
void foo (const struct S *);

void
bar (void)
{
  const struct S s[] = { [0 ... 99999] = { 0, 1, 2, 3 } };
  foo (&s[0]);
}
it could benefit from this patch too.

2018-12-13  Jakub Jelinek  <ja...@redhat.com>

        PR c++/82294
        PR c++/87436
        * expr.h (categorize_ctor_elements): Add p_unique_nz_elts argument.
        * expr.c (categorize_ctor_elements_1): Likewise.  Compute it like
        p_nz_elts, except don't multiply it by mult.  Adjust recursive call.
        Fix up COMPLEX_CST handling.
        (categorize_ctor_elements): Add p_unique_nz_elts argument, initialize
        it and pass it through to categorize_ctor_elements_1.
        (mostly_zeros_p, all_zeros_p): Adjust categorize_ctor_elements callers.
        * gimplify.c (gimplify_init_constructor): Likewise.  Don't force
        ctor into readonly data section if num_unique_nonzero_elements is
        smaller or equal to 1/8 of num_nonzero_elements and size is >= 64
        bytes.

        * g++.dg/tree-ssa/pr82294.C: New test.
        * g++.dg/tree-ssa/pr87436.C: New test.

--- gcc/expr.h.jj       2018-11-27 16:37:40.249419631 +0100
+++ gcc/expr.h  2018-12-13 16:39:24.839050215 +0100
@@ -309,7 +309,8 @@ extern bool can_move_by_pieces (unsigned
 extern unsigned HOST_WIDE_INT highest_pow2_factor (const_tree);
 
 extern bool categorize_ctor_elements (const_tree, HOST_WIDE_INT *,
-                                     HOST_WIDE_INT *, bool *);
+                                     HOST_WIDE_INT *, HOST_WIDE_INT *,
+                                     bool *);
 
 extern void expand_operands (tree, tree, rtx, rtx*, rtx*,
                             enum expand_modifier);
--- gcc/expr.c.jj       2018-11-27 16:37:40.249419631 +0100
+++ gcc/expr.c  2018-12-13 17:13:47.083545126 +0100
@@ -5945,10 +5945,11 @@ count_type_elements (const_tree type, bo
 
 static bool
 categorize_ctor_elements_1 (const_tree ctor, HOST_WIDE_INT *p_nz_elts,
+                           HOST_WIDE_INT *p_unique_nz_elts,
                            HOST_WIDE_INT *p_init_elts, bool *p_complete)
 {
   unsigned HOST_WIDE_INT idx;
-  HOST_WIDE_INT nz_elts, init_elts, num_fields;
+  HOST_WIDE_INT nz_elts, unique_nz_elts, init_elts, num_fields;
   tree value, purpose, elt_type;
 
   /* Whether CTOR is a valid constant initializer, in accordance with what
@@ -5958,6 +5959,7 @@ categorize_ctor_elements_1 (const_tree c
   bool const_p = const_from_elts_p ? true : TREE_STATIC (ctor);
 
   nz_elts = 0;
+  unique_nz_elts = 0;
   init_elts = 0;
   num_fields = 0;
   elt_type = NULL_TREE;
@@ -5982,12 +5984,13 @@ categorize_ctor_elements_1 (const_tree c
        {
        case CONSTRUCTOR:
          {
-           HOST_WIDE_INT nz = 0, ic = 0;
+           HOST_WIDE_INT nz = 0, unz = 0, ic = 0;
 
-           bool const_elt_p = categorize_ctor_elements_1 (value, &nz, &ic,
-                                                          p_complete);
+           bool const_elt_p = categorize_ctor_elements_1 (value, &nz, &unz,
+                                                          &ic, p_complete);
 
            nz_elts += mult * nz;
+           unique_nz_elts += unz;
            init_elts += mult * ic;
 
            if (const_from_elts_p && const_p)
@@ -5999,21 +6002,31 @@ categorize_ctor_elements_1 (const_tree c
        case REAL_CST:
        case FIXED_CST:
          if (!initializer_zerop (value))
-           nz_elts += mult;
+           {
+             nz_elts += mult;
+             unique_nz_elts++;
+           }
          init_elts += mult;
          break;
 
        case STRING_CST:
          nz_elts += mult * TREE_STRING_LENGTH (value);
+         unique_nz_elts += TREE_STRING_LENGTH (value);
          init_elts += mult * TREE_STRING_LENGTH (value);
          break;
 
        case COMPLEX_CST:
          if (!initializer_zerop (TREE_REALPART (value)))
-           nz_elts += mult;
+           {
+             nz_elts += mult;
+             unique_nz_elts++;
+           }
          if (!initializer_zerop (TREE_IMAGPART (value)))
-           nz_elts += mult;
-         init_elts += mult;
+           {
+             nz_elts += mult;
+             unique_nz_elts++;
+           }
+         init_elts += 2 * mult;
          break;
 
        case VECTOR_CST:
@@ -6025,7 +6038,10 @@ categorize_ctor_elements_1 (const_tree c
              {
                tree v = VECTOR_CST_ELT (value, i);
                if (!initializer_zerop (v))
-                 nz_elts += mult;
+                 {
+                   nz_elts += mult;
+                   unique_nz_elts++;
+                 }
                init_elts += mult;
              }
          }
@@ -6035,6 +6051,7 @@ categorize_ctor_elements_1 (const_tree c
          {
            HOST_WIDE_INT tc = count_type_elements (elt_type, false);
            nz_elts += mult * tc;
+           unique_nz_elts += tc;
            init_elts += mult * tc;
 
            if (const_from_elts_p && const_p)
@@ -6054,6 +6071,7 @@ categorize_ctor_elements_1 (const_tree c
     *p_complete = false;
 
   *p_nz_elts += nz_elts;
+  *p_unique_nz_elts += unique_nz_elts;
   *p_init_elts += init_elts;
 
   return const_p;
@@ -6062,6 +6080,11 @@ categorize_ctor_elements_1 (const_tree c
 /* Examine CTOR to discover:
    * how many scalar fields are set to nonzero values,
      and place it in *P_NZ_ELTS;
+   * the same, but counting RANGE_EXPRs as multiplier of 1 instead of
+     high - low + 1 (this can be useful for callers to determine ctors
+     that could be cheaply initialized with - perhaps nested - loops
+     compared to copied from huge read-only data),
+     and place it in *P_UNIQUE_NZ_ELTS;
    * how many scalar fields in total are in CTOR,
      and place it in *P_ELT_COUNT.
    * whether the constructor is complete -- in the sense that every
@@ -6073,13 +6096,16 @@ categorize_ctor_elements_1 (const_tree c
 
 bool
 categorize_ctor_elements (const_tree ctor, HOST_WIDE_INT *p_nz_elts,
+                         HOST_WIDE_INT *p_unique_nz_elts,
                          HOST_WIDE_INT *p_init_elts, bool *p_complete)
 {
   *p_nz_elts = 0;
+  *p_unique_nz_elts = 0;
   *p_init_elts = 0;
   *p_complete = true;
 
-  return categorize_ctor_elements_1 (ctor, p_nz_elts, p_init_elts, p_complete);
+  return categorize_ctor_elements_1 (ctor, p_nz_elts, p_unique_nz_elts,
+                                    p_init_elts, p_complete);
 }
 
 /* TYPE is initialized by a constructor with NUM_ELTS elements, the last
@@ -6110,17 +6136,18 @@ complete_ctor_at_level_p (const_tree typ
   return count_type_elements (type, true) == num_elts;
 }
 
-/* Return 1 if EXP contains mostly (3/4)  zeros.  */
+/* Return 1 if EXP contains mostly (3/4) zeros.  */
 
 static int
 mostly_zeros_p (const_tree exp)
 {
   if (TREE_CODE (exp) == CONSTRUCTOR)
     {
-      HOST_WIDE_INT nz_elts, init_elts;
+      HOST_WIDE_INT nz_elts, unz_elts, init_elts;
       bool complete_p;
 
-      categorize_ctor_elements (exp, &nz_elts, &init_elts, &complete_p);
+      categorize_ctor_elements (exp, &nz_elts, &unz_elts, &init_elts,
+                               &complete_p);
       return !complete_p || nz_elts < init_elts / 4;
     }
 
@@ -6134,10 +6161,11 @@ all_zeros_p (const_tree exp)
 {
   if (TREE_CODE (exp) == CONSTRUCTOR)
     {
-      HOST_WIDE_INT nz_elts, init_elts;
+      HOST_WIDE_INT nz_elts, unz_elts, init_elts;
       bool complete_p;
 
-      categorize_ctor_elements (exp, &nz_elts, &init_elts, &complete_p);
+      categorize_ctor_elements (exp, &nz_elts, &unz_elts, &init_elts,
+                               &complete_p);
       return nz_elts == 0;
     }
 
--- gcc/gimplify.c.jj   2018-12-07 00:23:15.721987301 +0100
+++ gcc/gimplify.c      2018-12-13 17:34:05.188733818 +0100
@@ -4778,6 +4778,7 @@ gimplify_init_constructor (tree *expr_p,
       {
        struct gimplify_init_ctor_preeval_data preeval_data;
        HOST_WIDE_INT num_ctor_elements, num_nonzero_elements;
+       HOST_WIDE_INT num_unique_nonzero_elements;
        bool cleared, complete_p, valid_const_initializer;
 
        /* Aggregate types must lower constructors to initialization of
@@ -4795,6 +4796,7 @@ gimplify_init_constructor (tree *expr_p,
           can only do so if it known to be a valid constant initializer.  */
        valid_const_initializer
          = categorize_ctor_elements (ctor, &num_nonzero_elements,
+                                     &num_unique_nonzero_elements,
                                      &num_ctor_elements, &complete_p);
 
        /* If a const aggregate variable is being initialized, then it
@@ -4803,7 +4805,14 @@ gimplify_init_constructor (tree *expr_p,
            && num_nonzero_elements > 1
            && TREE_READONLY (object)
            && VAR_P (object)
-           && (flag_merge_constants >= 2 || !TREE_ADDRESSABLE (object)))
+           && (flag_merge_constants >= 2 || !TREE_ADDRESSABLE (object))
+           /* For ctors that have many repeated nonzero elements
+              represented through RANGE_EXPRs, prefer initializing
+              those through runtime loops over copies of large amounts
+              of data from readonly data section.  */
+           && (num_unique_nonzero_elements > num_nonzero_elements / 8
+               || ((unsigned HOST_WIDE_INT) int_size_in_bytes (type)
+                   < HOST_WIDE_INT_UC (64))))
          {
            if (notify_temp_creation)
              return GS_ERROR;
@@ -4896,6 +4905,12 @@ gimplify_init_constructor (tree *expr_p,
               is so large as to make individual moves inefficient.  */
            if (size > 0
                && num_nonzero_elements > 1
+               /* For ctors that have many repeated nonzero elements
+                  represented through RANGE_EXPRs, prefer initializing
+                  those through runtime loops over copies of large amounts
+                  of data from readonly data section.  */
+               && (num_unique_nonzero_elements > num_nonzero_elements / 8
+                   || size < 64)
                && (size < num_nonzero_elements
                    || !can_move_by_pieces (size, align)))
              {
--- gcc/testsuite/g++.dg/tree-ssa/pr82294.C.jj  2018-12-13 17:42:08.357897946 
+0100
+++ gcc/testsuite/g++.dg/tree-ssa/pr82294.C     2018-12-13 17:41:11.682817078 
+0100
@@ -0,0 +1,13 @@
+// PR c++/82294
+// { dg-do compile { target c++11 } }
+// { dg-options "-O2 -fdump-tree-gimple" }
+
+// Verify we don't "optimize" the ctor as copying a 1KB .rodata
+// object into the variable.  It is better to initialize it through
+// a loop.
+// { dg-final { scan-tree-dump-not "this->arr = " "gimple" } }
+
+struct S { int x; explicit constexpr S (); };
+constexpr S::S () : x{7} {}
+struct T { S arr[256]; explicit T (); };
+T::T () {}
--- gcc/testsuite/g++.dg/tree-ssa/pr87436.C.jj  2018-12-13 17:38:43.701216996 
+0100
+++ gcc/testsuite/g++.dg/tree-ssa/pr87436.C     2018-12-13 17:38:07.978796332 
+0100
@@ -0,0 +1,25 @@
+// PR c++/87436
+// { dg-do compile { target c++11 } }
+// { dg-options "-O2 -fdump-tree-gimple" }
+
+// Verify we don't "optimize" the ctor as copying a 384MB .rodata
+// object into the variable.  It is better to initialize it through
+// two nested loops.
+// { dg-final { scan-tree-dump-not "this->arr = " "gimple" } }
+
+struct S {
+  int a = -1;
+  short b = 3;
+  int x = 0;
+  int y = 1;
+  int z = 42;
+  float f = 0.123f;
+};
+
+struct T { S arr[4096][4096]; };
+
+T *
+foo ()
+{
+  return new T;
+}


        Jakub

Reply via email to