On Fri, 14 Dec 2018, Jakub Jelinek wrote:

> Hi!
> 
> With the previously posted patch to use RANGE_EXPRs in build_vec_init,
> the following patch attempts to do what the reporters were asking for
> - determine if it wouldn't be more efficient to use (perhaps nested) loops
> to initialize at runtime some automatic variable over having huge .rodata
> constants that are copied over into the automatic variables or used as the
> variable itself.
> 
> E.g. on the PR87436 testcase, we have RANGE_EXPR x4096 nested in a
> RANGE_EXPR x4096 with 24 byte innermost constructor.  Without this patch
> we emit 384MB long .rodata constant initializer that is then copied over
> in the constructor, with this patch we just have 2 nested loops.
> 
> The patch extends categorize_ctor_elements so that it gives next to the
> number of all scalars and number of non-zero scalars also something that
> allows us to guess how many RANGE_EXPRs are in there, i.e. how large the
> runtime code initializing that ctor would be.  It keeps doing the old thing
> for all initializers < 64 bytes, I think for those generally using runtime
> loops wouldn't be beneficial.
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

This looks OK to me - the only comment I have is on the two magic
constants (64 and 8) which are used twice in the patch.  Can you
either see to hoist the common condition into sth like

 bool prefer_loop_initializer_p = ...

or add #defines for those constants?  I suppose the hoisting
might be tricky as int_size_in_bytes can return -1 and the
workarounds are different in both places right now (maybe that's
a bug as well...).  OTOH using (unsigned)int_size_in_bytes
looks reasonable in general.

Thanks for tacking this long-standing issue,
Richard.

> If the C FE starts using RANGE_EXPRs at some point e.g. for:
> struct S { int a, b, c, d; };
> void foo (const struct S *);
> 
> void
> bar (void)
> {
>   const struct S s[] = { [0 ... 99999] = { 0, 1, 2, 3 } };
>   foo (&s[0]);
> }
> it could benefit from this patch too.
> 
> 2018-12-13  Jakub Jelinek  <ja...@redhat.com>
> 
>       PR c++/82294
>       PR c++/87436
>       * expr.h (categorize_ctor_elements): Add p_unique_nz_elts argument.
>       * expr.c (categorize_ctor_elements_1): Likewise.  Compute it like
>       p_nz_elts, except don't multiply it by mult.  Adjust recursive call.
>       Fix up COMPLEX_CST handling.
>       (categorize_ctor_elements): Add p_unique_nz_elts argument, initialize
>       it and pass it through to categorize_ctor_elements_1.
>       (mostly_zeros_p, all_zeros_p): Adjust categorize_ctor_elements callers.
>       * gimplify.c (gimplify_init_constructor): Likewise.  Don't force
>       ctor into readonly data section if num_unique_nonzero_elements is
>       smaller or equal to 1/8 of num_nonzero_elements and size is >= 64
>       bytes.
> 
>       * g++.dg/tree-ssa/pr82294.C: New test.
>       * g++.dg/tree-ssa/pr87436.C: New test.
> 
> --- gcc/expr.h.jj     2018-11-27 16:37:40.249419631 +0100
> +++ gcc/expr.h        2018-12-13 16:39:24.839050215 +0100
> @@ -309,7 +309,8 @@ extern bool can_move_by_pieces (unsigned
>  extern unsigned HOST_WIDE_INT highest_pow2_factor (const_tree);
>  
>  extern bool categorize_ctor_elements (const_tree, HOST_WIDE_INT *,
> -                                   HOST_WIDE_INT *, bool *);
> +                                   HOST_WIDE_INT *, HOST_WIDE_INT *,
> +                                   bool *);
>  
>  extern void expand_operands (tree, tree, rtx, rtx*, rtx*,
>                            enum expand_modifier);
> --- gcc/expr.c.jj     2018-11-27 16:37:40.249419631 +0100
> +++ gcc/expr.c        2018-12-13 17:13:47.083545126 +0100
> @@ -5945,10 +5945,11 @@ count_type_elements (const_tree type, bo
>  
>  static bool
>  categorize_ctor_elements_1 (const_tree ctor, HOST_WIDE_INT *p_nz_elts,
> +                         HOST_WIDE_INT *p_unique_nz_elts,
>                           HOST_WIDE_INT *p_init_elts, bool *p_complete)
>  {
>    unsigned HOST_WIDE_INT idx;
> -  HOST_WIDE_INT nz_elts, init_elts, num_fields;
> +  HOST_WIDE_INT nz_elts, unique_nz_elts, init_elts, num_fields;
>    tree value, purpose, elt_type;
>  
>    /* Whether CTOR is a valid constant initializer, in accordance with what
> @@ -5958,6 +5959,7 @@ categorize_ctor_elements_1 (const_tree c
>    bool const_p = const_from_elts_p ? true : TREE_STATIC (ctor);
>  
>    nz_elts = 0;
> +  unique_nz_elts = 0;
>    init_elts = 0;
>    num_fields = 0;
>    elt_type = NULL_TREE;
> @@ -5982,12 +5984,13 @@ categorize_ctor_elements_1 (const_tree c
>       {
>       case CONSTRUCTOR:
>         {
> -         HOST_WIDE_INT nz = 0, ic = 0;
> +         HOST_WIDE_INT nz = 0, unz = 0, ic = 0;
>  
> -         bool const_elt_p = categorize_ctor_elements_1 (value, &nz, &ic,
> -                                                        p_complete);
> +         bool const_elt_p = categorize_ctor_elements_1 (value, &nz, &unz,
> +                                                        &ic, p_complete);
>  
>           nz_elts += mult * nz;
> +         unique_nz_elts += unz;
>           init_elts += mult * ic;
>  
>           if (const_from_elts_p && const_p)
> @@ -5999,21 +6002,31 @@ categorize_ctor_elements_1 (const_tree c
>       case REAL_CST:
>       case FIXED_CST:
>         if (!initializer_zerop (value))
> -         nz_elts += mult;
> +         {
> +           nz_elts += mult;
> +           unique_nz_elts++;
> +         }
>         init_elts += mult;
>         break;
>  
>       case STRING_CST:
>         nz_elts += mult * TREE_STRING_LENGTH (value);
> +       unique_nz_elts += TREE_STRING_LENGTH (value);
>         init_elts += mult * TREE_STRING_LENGTH (value);
>         break;
>  
>       case COMPLEX_CST:
>         if (!initializer_zerop (TREE_REALPART (value)))
> -         nz_elts += mult;
> +         {
> +           nz_elts += mult;
> +           unique_nz_elts++;
> +         }
>         if (!initializer_zerop (TREE_IMAGPART (value)))
> -         nz_elts += mult;
> -       init_elts += mult;
> +         {
> +           nz_elts += mult;
> +           unique_nz_elts++;
> +         }
> +       init_elts += 2 * mult;
>         break;
>  
>       case VECTOR_CST:
> @@ -6025,7 +6038,10 @@ categorize_ctor_elements_1 (const_tree c
>             {
>               tree v = VECTOR_CST_ELT (value, i);
>               if (!initializer_zerop (v))
> -               nz_elts += mult;
> +               {
> +                 nz_elts += mult;
> +                 unique_nz_elts++;
> +               }
>               init_elts += mult;
>             }
>         }
> @@ -6035,6 +6051,7 @@ categorize_ctor_elements_1 (const_tree c
>         {
>           HOST_WIDE_INT tc = count_type_elements (elt_type, false);
>           nz_elts += mult * tc;
> +         unique_nz_elts += tc;
>           init_elts += mult * tc;
>  
>           if (const_from_elts_p && const_p)
> @@ -6054,6 +6071,7 @@ categorize_ctor_elements_1 (const_tree c
>      *p_complete = false;
>  
>    *p_nz_elts += nz_elts;
> +  *p_unique_nz_elts += unique_nz_elts;
>    *p_init_elts += init_elts;
>  
>    return const_p;
> @@ -6062,6 +6080,11 @@ categorize_ctor_elements_1 (const_tree c
>  /* Examine CTOR to discover:
>     * how many scalar fields are set to nonzero values,
>       and place it in *P_NZ_ELTS;
> +   * the same, but counting RANGE_EXPRs as multiplier of 1 instead of
> +     high - low + 1 (this can be useful for callers to determine ctors
> +     that could be cheaply initialized with - perhaps nested - loops
> +     compared to copied from huge read-only data),
> +     and place it in *P_UNIQUE_NZ_ELTS;
>     * how many scalar fields in total are in CTOR,
>       and place it in *P_ELT_COUNT.
>     * whether the constructor is complete -- in the sense that every
> @@ -6073,13 +6096,16 @@ categorize_ctor_elements_1 (const_tree c
>  
>  bool
>  categorize_ctor_elements (const_tree ctor, HOST_WIDE_INT *p_nz_elts,
> +                       HOST_WIDE_INT *p_unique_nz_elts,
>                         HOST_WIDE_INT *p_init_elts, bool *p_complete)
>  {
>    *p_nz_elts = 0;
> +  *p_unique_nz_elts = 0;
>    *p_init_elts = 0;
>    *p_complete = true;
>  
> -  return categorize_ctor_elements_1 (ctor, p_nz_elts, p_init_elts, 
> p_complete);
> +  return categorize_ctor_elements_1 (ctor, p_nz_elts, p_unique_nz_elts,
> +                                  p_init_elts, p_complete);
>  }
>  
>  /* TYPE is initialized by a constructor with NUM_ELTS elements, the last
> @@ -6110,17 +6136,18 @@ complete_ctor_at_level_p (const_tree typ
>    return count_type_elements (type, true) == num_elts;
>  }
>  
> -/* Return 1 if EXP contains mostly (3/4)  zeros.  */
> +/* Return 1 if EXP contains mostly (3/4) zeros.  */
>  
>  static int
>  mostly_zeros_p (const_tree exp)
>  {
>    if (TREE_CODE (exp) == CONSTRUCTOR)
>      {
> -      HOST_WIDE_INT nz_elts, init_elts;
> +      HOST_WIDE_INT nz_elts, unz_elts, init_elts;
>        bool complete_p;
>  
> -      categorize_ctor_elements (exp, &nz_elts, &init_elts, &complete_p);
> +      categorize_ctor_elements (exp, &nz_elts, &unz_elts, &init_elts,
> +                             &complete_p);
>        return !complete_p || nz_elts < init_elts / 4;
>      }
>  
> @@ -6134,10 +6161,11 @@ all_zeros_p (const_tree exp)
>  {
>    if (TREE_CODE (exp) == CONSTRUCTOR)
>      {
> -      HOST_WIDE_INT nz_elts, init_elts;
> +      HOST_WIDE_INT nz_elts, unz_elts, init_elts;
>        bool complete_p;
>  
> -      categorize_ctor_elements (exp, &nz_elts, &init_elts, &complete_p);
> +      categorize_ctor_elements (exp, &nz_elts, &unz_elts, &init_elts,
> +                             &complete_p);
>        return nz_elts == 0;
>      }
>  
> --- gcc/gimplify.c.jj 2018-12-07 00:23:15.721987301 +0100
> +++ gcc/gimplify.c    2018-12-13 17:34:05.188733818 +0100
> @@ -4778,6 +4778,7 @@ gimplify_init_constructor (tree *expr_p,
>        {
>       struct gimplify_init_ctor_preeval_data preeval_data;
>       HOST_WIDE_INT num_ctor_elements, num_nonzero_elements;
> +     HOST_WIDE_INT num_unique_nonzero_elements;
>       bool cleared, complete_p, valid_const_initializer;
>  
>       /* Aggregate types must lower constructors to initialization of
> @@ -4795,6 +4796,7 @@ gimplify_init_constructor (tree *expr_p,
>          can only do so if it known to be a valid constant initializer.  */
>       valid_const_initializer
>         = categorize_ctor_elements (ctor, &num_nonzero_elements,
> +                                   &num_unique_nonzero_elements,
>                                     &num_ctor_elements, &complete_p);
>  
>       /* If a const aggregate variable is being initialized, then it
> @@ -4803,7 +4805,14 @@ gimplify_init_constructor (tree *expr_p,
>           && num_nonzero_elements > 1
>           && TREE_READONLY (object)
>           && VAR_P (object)
> -         && (flag_merge_constants >= 2 || !TREE_ADDRESSABLE (object)))
> +         && (flag_merge_constants >= 2 || !TREE_ADDRESSABLE (object))
> +         /* For ctors that have many repeated nonzero elements
> +            represented through RANGE_EXPRs, prefer initializing
> +            those through runtime loops over copies of large amounts
> +            of data from readonly data section.  */
> +         && (num_unique_nonzero_elements > num_nonzero_elements / 8
> +             || ((unsigned HOST_WIDE_INT) int_size_in_bytes (type)
> +                 < HOST_WIDE_INT_UC (64))))
>         {
>           if (notify_temp_creation)
>             return GS_ERROR;
> @@ -4896,6 +4905,12 @@ gimplify_init_constructor (tree *expr_p,
>              is so large as to make individual moves inefficient.  */
>           if (size > 0
>               && num_nonzero_elements > 1
> +             /* For ctors that have many repeated nonzero elements
> +                represented through RANGE_EXPRs, prefer initializing
> +                those through runtime loops over copies of large amounts
> +                of data from readonly data section.  */
> +             && (num_unique_nonzero_elements > num_nonzero_elements / 8
> +                 || size < 64)
>               && (size < num_nonzero_elements
>                   || !can_move_by_pieces (size, align)))
>             {
> --- gcc/testsuite/g++.dg/tree-ssa/pr82294.C.jj        2018-12-13 
> 17:42:08.357897946 +0100
> +++ gcc/testsuite/g++.dg/tree-ssa/pr82294.C   2018-12-13 17:41:11.682817078 
> +0100
> @@ -0,0 +1,13 @@
> +// PR c++/82294
> +// { dg-do compile { target c++11 } }
> +// { dg-options "-O2 -fdump-tree-gimple" }
> +
> +// Verify we don't "optimize" the ctor as copying a 1KB .rodata
> +// object into the variable.  It is better to initialize it through
> +// a loop.
> +// { dg-final { scan-tree-dump-not "this->arr = " "gimple" } }
> +
> +struct S { int x; explicit constexpr S (); };
> +constexpr S::S () : x{7} {}
> +struct T { S arr[256]; explicit T (); };
> +T::T () {}
> --- gcc/testsuite/g++.dg/tree-ssa/pr87436.C.jj        2018-12-13 
> 17:38:43.701216996 +0100
> +++ gcc/testsuite/g++.dg/tree-ssa/pr87436.C   2018-12-13 17:38:07.978796332 
> +0100
> @@ -0,0 +1,25 @@
> +// PR c++/87436
> +// { dg-do compile { target c++11 } }
> +// { dg-options "-O2 -fdump-tree-gimple" }
> +
> +// Verify we don't "optimize" the ctor as copying a 384MB .rodata
> +// object into the variable.  It is better to initialize it through
> +// two nested loops.
> +// { dg-final { scan-tree-dump-not "this->arr = " "gimple" } }
> +
> +struct S {
> +  int a = -1;
> +  short b = 3;
> +  int x = 0;
> +  int y = 1;
> +  int z = 42;
> +  float f = 0.123f;
> +};
> +
> +struct T { S arr[4096][4096]; };
> +
> +T *
> +foo ()
> +{
> +  return new T;
> +}
> 
> 
>       Jakub
> 
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Graham Norton, HRB 
21284 (AG Nuernberg)

Reply via email to