Greetings,

Here is a new revision of the tree portions of this patch.  I moved the
pattern recognizer to expand, and added additional logic to look for the
same pattern in gimple form.  I added two more tests to verify the new
logic.

I didn't run into any problems with the RTL CSE phases.  I can't recall
for certain what caused me to abandon the expand version previously.
There may not have been good reason; too many versions to keep track of
and too many interruptions, I suppose.  In any case, I'm much happier
having this code in the expander.

Paolo's RTL logic for unpropagating the zero-offset case is not going to
work out as is.  It causes a number of performance degradations, which I
suspect are due to the pass reordering.  That's a separate issue,
though, and not needed for this patch.

Bootstrapped and regression-tested on powerpc64-linux.  SPEC cpu2000
shows a number of small improvements and no significant degradations.
SPEC cpu2006 testing is pending.

Thanks,
Bill


2011-10-18  Bill Schmidt  <wschm...@linux.vnet.ibm.com>

gcc:

        PR rtl-optimization/46556
        * expr.c (tree-pretty-print.h): New include.
        (restructure_base_and_offset): New function.
        (restructure_mem_ref): New function.
        (expand_expr_real_1): In MEM_REF case, attempt restructure_mem_ref
        first.  In normal_inner_ref case, attempt restructure_base_and_offset
        first.
        * Makefile.in: Update dependences for expr.o.

gcc/testsuite:

        PR rtl-optimization/46556
        * gcc.dg/tree-ssa-pr46556-1.c: New test.
        * gcc.dg/tree-ssa-pr46556-2.c: Likewise.
        * gcc.dg/tree-ssa-pr46556-3.c: Likewise.
        * gcc.dg/tree-ssa-pr46556-4.c: Likewise.
        * gcc.dg/tree-ssa-pr46556-5.c: Likewise.


Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c   (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-1.c   (revision 0)
@@ -0,0 +1,22 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand" } */
+
+struct x
+{
+  int a[16];
+  int b[16];
+  int c[16];
+};
+
+extern void foo (int, int, int);
+
+void
+f (struct x *p, unsigned int n)
+{
+  foo (p->a[n], p->c[n], p->b[n]);
+}
+
+/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 2 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 128" 1 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 1 
"expand" } } */
+/* { dg-final { cleanup-rtl-dump "expand" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c   (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-2.c   (revision 0)
@@ -0,0 +1,26 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand" } */
+
+struct x
+{
+  int a[16];
+  int b[16];
+  int c[16];
+};
+
+extern void foo (int, int, int);
+
+void
+f (struct x *p, unsigned int n)
+{
+  foo (p->a[n], p->c[n], p->b[n]);
+  if (n > 12)
+    foo (p->a[n], p->c[n], p->b[n]);
+  else if (n > 3)
+    foo (p->b[n], p->a[n], p->c[n]);
+}
+
+/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 
"expand" } } */
+/* { dg-final { cleanup-rtl-dump "expand" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c   (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-3.c   (revision 0)
@@ -0,0 +1,27 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand" } */
+struct x
+{
+  int a[16];
+  int b[16];
+  int c[16];
+};
+
+extern void foo (int, int, int);
+
+void
+f (struct x *p, unsigned int n)
+{
+  foo (p->a[n], p->c[n], p->b[n]);
+  if (n > 3)
+    {
+      foo (p->a[n], p->c[n], p->b[n]);
+      if (n > 12)
+       foo (p->b[n], p->a[n], p->c[n]);
+    }
+}
+
+/* { dg-final { scan-rtl-dump-times "\\(mem/s:SI \\(plus:" 6 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 
"expand" } } */
+/* { dg-final { cleanup-rtl-dump "expand" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-4.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr46556-4.c   (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-4.c   (revision 0)
@@ -0,0 +1,28 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand" } */
+
+struct x
+{
+  int a[16];
+  int b[16];
+  int c[16];
+};
+
+extern void foo (int, int, int);
+
+void
+f (struct x *p, unsigned int n)
+{
+  foo (*((int *)p + n*4), *((int *)p + 32 + n*4), *((int *)p + 16 + n*4));
+  if (n > 3)
+    {
+      foo (*((int *)p + n*4), *((int *)p + 32 + n*4), *((int *)p + 16 + n*4));
+      if (n > 12)
+       foo (*((int *)p + 16 + n*4), *((int *)p + n*4), *((int *)p + 32 + n*4));
+    }
+}
+
+/* { dg-final { scan-rtl-dump-times "\\(mem:SI \\(plus:" 6 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 
"expand" } } */
+/* { dg-final { cleanup-rtl-dump "expand" } } */
Index: gcc/testsuite/gcc.dg/tree-ssa/pr46556-5.c
===================================================================
--- gcc/testsuite/gcc.dg/tree-ssa/pr46556-5.c   (revision 0)
+++ gcc/testsuite/gcc.dg/tree-ssa/pr46556-5.c   (revision 0)
@@ -0,0 +1,36 @@
+/* { dg-do compile } */
+/* { dg-options "-O2 -fdump-rtl-expand" } */
+/* { dg-skip-if "" { lp64 } { "*" } { "-m32" } } */
+/* { dg-skip-if "" { llp64 } { "*" } { "-m32" } } */
+
+struct x
+{
+  int a[16];
+  int b[16];
+  int c[16];
+};
+
+extern void foo (int, int, int);
+
+void
+f (struct x *p, unsigned int n)
+{
+  foo (*((int *)p + n * 4),
+       *((int *)p + (n + 8) * 4),
+       *((int *)p + (n + 4) * 4));
+  if (n > 3)
+    {
+      foo (*((int *)p + n * 4),
+          *((int *)p + (n + 8) * 4),
+          *((int *)p + (n + 4) * 4));
+      if (n > 12)
+       foo (*((int *)p + (n + 4) * 4),
+            *((int *)p + n * 4),
+            *((int *)p + (n + 8) * 4));
+    }
+}
+
+/* { dg-final { scan-rtl-dump-times "\\(mem:SI \\(plus:" 6 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 128" 3 "expand" } } */
+/* { dg-final { scan-rtl-dump-times "const_int 64 \\\[0x40\\\]\\)\\) \\\[" 3 
"expand" } } */
+/* { dg-final { cleanup-rtl-dump "expand" } } */
Index: gcc/expr.c
===================================================================
--- gcc/expr.c  (revision 180138)
+++ gcc/expr.c  (working copy)
@@ -56,6 +56,7 @@ along with GCC; see the file COPYING3.  If not see
 #include "ssaexpand.h"
 #include "target-globals.h"
 #include "params.h"
+#include "tree-pretty-print.h"
 
 /* Decide whether a function's arguments should be processed
    from first to last or from last to first.
@@ -7648,7 +7649,157 @@ expand_constructor (tree exp, rtx target, enum exp
   return target;
 }
 
+/* Given BASE, OFFSET, and BITPOS derived from EXPR, determine whether
+   there is a profitable opportunity to restructure address arithmetic
+   within BASE and OFFSET.  If so, produce such a restructuring and
+   return it.  */
+/* TODO: This belongs more properly in a separate pass that performs
+   general strength reduction on straight-line code.  Eventually move
+   this there.  */
 
+static tree
+restructure_base_and_offset (tree expr, tree base, tree offset,
+                            HOST_WIDE_INT bitpos)
+{
+  tree mult_op0, mult_op1, t1, t2, offset_type, mult_expr, add_expr, mem_ref;
+  double_int c, c1, c2, c3, c4;
+
+  /* Look for the following pattern:
+
+       base   = MEM_REF (T1, C1);
+       offset = MULT_EXPR (PLUS_EXPR (T2, C2), C3)
+       bitpos = C4 * BITS_PER_UNIT
+
+     If found, convert it to:
+
+       MEM_REF (POINTER_PLUS_EXPR (T1, MULT_EXPR (T2, C3)),
+                C1 + (C2 * C3) + C4)
+   */
+  if (!base
+      || !offset
+      || TREE_CODE (base) != MEM_REF
+      || TREE_CODE (offset) != MULT_EXPR)
+    return NULL_TREE;
+
+  mult_op0 = TREE_OPERAND (offset, 0);
+  mult_op1 = TREE_OPERAND (offset, 1);
+
+  if (TREE_CODE (mult_op0) != PLUS_EXPR
+      || TREE_CODE (mult_op1) != INTEGER_CST
+      || TREE_CODE (TREE_OPERAND (mult_op0, 1)) != INTEGER_CST)
+    return NULL_TREE;
+
+  t1 = TREE_OPERAND (base, 0);
+  t2 = TREE_OPERAND (mult_op0, 0);
+  c1 = mem_ref_offset (base);
+  c2 = tree_to_double_int (TREE_OPERAND (mult_op0, 1));
+  c3 = tree_to_double_int (mult_op1);
+  c4 = uhwi_to_double_int (bitpos / BITS_PER_UNIT);
+  c = double_int_add (double_int_add (c1, double_int_mul (c2, c3)), c4);
+
+  offset_type = TREE_TYPE (TREE_OPERAND (base, 1));
+
+  mult_expr = fold_build2 (MULT_EXPR, sizetype, t2, 
+                          double_int_to_tree (sizetype, c3));
+
+  add_expr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (t1), t1, mult_expr);
+
+  mem_ref = fold_build2 (MEM_REF, TREE_TYPE (expr), add_expr,
+                        double_int_to_tree (offset_type, c));
+  return mem_ref;
+}
+
+/* Look for a hidden array reference in memory reference EXP.  If found,
+   return a revised memory reference that restructures addressability
+   to combine constants.  */
+/* TODO: This belongs more properly in a separate pass that performs
+   general strength reduction on straight-line code.  Eventually move
+   this there.  */
+
+static tree
+restructure_mem_ref (tree exp)
+{
+  tree s2, s3, t1, t2, offset_type, mult_expr, add_expr;
+  tree s2_op0, s2_op1, s3_op0, s3_op1, result;
+  gimple s1_stmt, s2_stmt, s3_stmt;
+  double_int c1, c2, c3, c;
+
+  /* Currently we look for the following pattern, where each Si is an
+     SSA name, each Tj is an arbitrary tree, and each Ck is an integer
+     constant:
+
+       S3 = T2 + C2;
+       S2 = S3 * C3;
+       S1 = T1 ptr+ S2;
+       exp = MEM_REF (S1, C1);
+
+     This is rewritten as:
+
+       exp = MEM_REF (T1 ptr+ (T2 * C3)), C1 + C2 * C3);
+
+  */
+  if (TREE_CODE (TREE_OPERAND (exp, 0)) != SSA_NAME)
+    return NULL_TREE;
+
+  /* We don't use get_def_for_expr for S1 because TER doesn't forward
+     S1 in some situations where this transform is useful, such as
+     when S1 is the base of two MEM_REFs fitting the pattern.  */
+  s1_stmt = SSA_NAME_DEF_STMT (TREE_OPERAND (exp, 0));
+  
+  if (!s1_stmt
+      || !is_gimple_assign (s1_stmt)
+      || gimple_assign_rhs_code (s1_stmt) != POINTER_PLUS_EXPR)
+    return NULL_TREE;
+
+  c1 = mem_ref_offset (exp);
+  t1 = gimple_assign_rhs1 (s1_stmt);
+  s2 = gimple_assign_rhs2 (s1_stmt);
+
+  if (TREE_CODE (s2) != SSA_NAME)
+    return NULL_TREE;
+
+  s2_stmt = get_def_for_expr (s2, MULT_EXPR);
+
+  if (!s2_stmt)
+    return NULL_TREE;
+
+  s2_op0 = gimple_assign_rhs1 (s2_stmt);
+  s2_op1 = gimple_assign_rhs2 (s2_stmt);
+
+  if (TREE_CODE (s2_op1) != INTEGER_CST)
+    return NULL_TREE;
+
+  s3 = s2_op0;
+  c3 = tree_to_double_int (s2_op1);
+  s3_stmt = get_def_for_expr (s3, PLUS_EXPR);
+
+  if (!s3_stmt)
+    return NULL_TREE;
+
+  s3_op0 = gimple_assign_rhs1 (s3_stmt);
+  s3_op1 = gimple_assign_rhs2 (s3_stmt);
+
+  if (TREE_CODE (s3_op1) != INTEGER_CST)
+    return NULL_TREE;
+
+  t2 = s3_op0;
+  c2 = tree_to_double_int (s3_op1);
+  c = double_int_add (c1, double_int_mul (c2, c3));
+
+  offset_type = TREE_TYPE (TREE_OPERAND (exp, 1));
+
+  mult_expr = fold_build2 (MULT_EXPR, sizetype, t2,
+                          double_int_to_tree (sizetype, c3));
+
+  add_expr = fold_build2 (POINTER_PLUS_EXPR, TREE_TYPE (t1), t1, mult_expr);
+
+  result = build2 (MEM_REF, TREE_TYPE (exp), add_expr,
+                  double_int_to_tree (offset_type, c));
+
+  return result;
+}
+
+
 /* expand_expr: generate code for computing expression EXP.
    An rtx for the computed value is returned.  The value is never null.
    In the case of a void EXP, const0_rtx is returned.
@@ -9313,6 +9464,28 @@ expand_expr_real_1 (tree exp, rtx target, enum mac
        gimple def_stmt;
        enum insn_code icode;
        unsigned align;
+       tree tem;
+
+       /* Attempt to restructure a hidden array reference, such
+          as *(p + (n + k) * s).  This generates dead code, so
+          require -O2.  */
+       if (optimize > 1
+           && mode != BLKmode
+           && ((tem = restructure_mem_ref (exp)) != NULL_TREE))
+         {
+           if (dump_file && (dump_flags & TDF_DETAILS))
+             {
+               fprintf (dump_file, "\nRestructured memory reference:\n");
+               fprintf (dump_file, "  Original reference: ");
+               print_generic_expr (dump_file, exp, TDF_VOPS|TDF_MEMSYMS);
+               fprintf (dump_file, "\n  Replacement: ");
+               print_generic_expr (dump_file, tem, TDF_VOPS|TDF_MEMSYMS);
+             }
+           copy_ref_info (tem, exp);
+           exp = tem;
+           base = TREE_OPERAND (exp, 0);
+         }
+
        /* Handle expansion of non-aliased memory with non-BLKmode.  That
           might end up in a register.  */
        if (TREE_CODE (base) == ADDR_EXPR)
@@ -9584,7 +9757,7 @@ expand_expr_real_1 (tree exp, rtx target, enum mac
       {
        enum machine_mode mode1, mode2;
        HOST_WIDE_INT bitsize, bitpos;
-       tree offset;
+       tree offset, tem2;
        int volatilep = 0, must_force_mem;
        bool packedp = false;
        tree tem = get_inner_reference (exp, &bitsize, &bitpos, &offset,
@@ -9601,6 +9774,36 @@ expand_expr_real_1 (tree exp, rtx target, enum mac
                && DECL_PACKED (TREE_OPERAND (exp, 1))))
          packedp = true;
 
+       /* Attempt to restructure the base and offset to combine constants.
+          Bitfield references should eventually be lowered prior to this
+          point and are not dealt with.  Skip BLKmode aggregates as well.
+          This generates dead code, so require -O2.  */
+       if (optimize > 1
+           && code != BIT_FIELD_REF
+           && (code != COMPONENT_REF 
+               || !DECL_BIT_FIELD (TREE_OPERAND (exp, 1)))
+           && TYPE_MODE (TREE_TYPE (exp)) != BLKmode
+           && bitpos % BITS_PER_UNIT == 0
+           && ((tem2 = restructure_base_and_offset (exp, tem, offset, bitpos))
+               != NULL_TREE))
+         {
+           copy_ref_info (tem2, exp);
+           tem = tem2;
+           /* The offset and bitpos were absorbed into the new MEM_REF, so 
+              make sure we don't add them in again.  */
+           offset = NULL_TREE;
+           bitpos = 0;
+
+           if (dump_file && (dump_flags & TDF_DETAILS))
+             {
+               fprintf (dump_file, "\nRestructured inner reference:\n");
+               fprintf (dump_file, "  Original reference: ");
+               print_generic_expr (dump_file, exp, TDF_VOPS|TDF_MEMSYMS);
+               fprintf (dump_file, "\n  Replacement: ");
+               print_generic_expr (dump_file, tem, TDF_VOPS|TDF_MEMSYMS);
+             }
+         }
+
        /* If TEM's type is a union of variable size, pass TARGET to the inner
           computation, since it will need a temporary and TARGET is known
           to have to do.  This occurs in unchecked conversion in Ada.  */
Index: gcc/Makefile.in
===================================================================
--- gcc/Makefile.in     (revision 180138)
+++ gcc/Makefile.in     (working copy)
@@ -2926,7 +2926,7 @@ expr.o : expr.c $(CONFIG_H) $(SYSTEM_H) coretypes.
    reload.h langhooks.h intl.h $(TM_P_H) $(TARGET_H) \
    tree-iterator.h gt-expr.h $(MACHMODE_H) $(TIMEVAR_H) $(TREE_FLOW_H) \
    $(TREE_PASS_H) $(DF_H) $(DIAGNOSTIC_H) vecprim.h $(SSAEXPAND_H) \
-   $(PARAMS_H) $(COMMON_TARGET_H)
+   $(PARAMS_H) $(COMMON_TARGET_H) tree-pretty-print.h
 dojump.o : dojump.c $(CONFIG_H) $(SYSTEM_H) coretypes.h $(TM_H) $(RTL_H) 
$(TREE_H) \
    $(FLAGS_H) $(FUNCTION_H) $(EXPR_H) $(OPTABS_H) $(INSN_ATTR_H) insn-config.h 
\
    langhooks.h $(GGC_H) gt-dojump.h vecprim.h $(BASIC_BLOCK_H) output.h


Reply via email to