On Tue, 15 Dec 2020, Jakub Jelinek wrote:

Hi!

The following patch teaches the bswap pass to handle for small (2/4/8 byte
long) vectors a CONSTRUCTOR by determining if the bytes of the constructor
come from non-vector sources and are either nop or bswap and changing the
CONSTRUCTOR in that case to VIEW_CONVERT_EXPR from scalar integer to
the vector type.

Unfortunately, as I found after the patch was written, due to pass ordering
this doesn't really fix the original testcase, just the one I wrote,
because both loop and slp vectorization is done only after the bswap pass.
A possible way out of that would be to perform just this particular bswap
optimization (i.e. for CONSTRUCTOR assignments with integral vector types
call find_bswap_or_nop and bswap_replace if successful) also during the
store merging pass, it isn't really a store, but the store merging pass
already performs bswapping when handling store, so it wouldn't be that big
hack.  What do you think?

I think it probably makes sense to have some helper split out that
collects & classifies vector constructor components we can use from
both forwprop (where matching the V_C_E from integer could be done
as well IMHO) and bswap (when a permute is involved) and store-merging.

Like with the bug missing integer promotion/demotion from memory
in forwprop we should be able to handle this in several places.

The helper would collect defs from the CTOR, and for loads classify
them so they have the same VUSE and are in the same BB and only then
does some ref analysis (using data-ref analysis is OK I think) and
see whether the defs are linear or permuted.  There should be a
corresponding second helper for emitting a combined load so we do
not have to redo that.

I guess this isn't really stage3 material though.

Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?

I guess this patch is still OK (tracking the bswap state in a generic
helper of course complicates things).

Richard.

2020-12-15  Jakub Jelinek  <ja...@redhat.com>

        PR tree-optimization/96239
        * gimple-ssa-store-merging.c (find_bswap_or_nop): Handle a vector
        CONSTRUCTOR.
        (bswap_replace): Likewise.

        * gcc.dg/pr96239.c: New test.

--- gcc/gimple-ssa-store-merging.c.jj   2020-11-20 12:26:23.777860152 +0100
+++ gcc/gimple-ssa-store-merging.c      2020-12-14 19:10:30.887578930 +0100
@@ -865,7 +865,66 @@ find_bswap_or_nop (gimple *stmt, struct
  gimple *ins_stmt = find_bswap_or_nop_1 (stmt, n, limit);

  if (!ins_stmt)
-    return NULL;
+    {
+      if (gimple_assign_rhs_code (stmt) != CONSTRUCTOR
+         || BYTES_BIG_ENDIAN != WORDS_BIG_ENDIAN)
+       return NULL;
+      unsigned HOST_WIDE_INT sz = tree_to_uhwi (type_size) * BITS_PER_UNIT;
+      if (sz != 16 && sz != 32 && sz != 64)
+       return NULL;
+      tree rhs = gimple_assign_rhs1 (stmt);
+      tree eltype = TREE_TYPE (TREE_TYPE (rhs));
+      unsigned HOST_WIDE_INT eltsz
+       = int_size_in_bytes (eltype) * BITS_PER_UNIT;
+      if (TYPE_PRECISION (eltype) != eltsz)
+       return NULL;
+      constructor_elt *elt;
+      unsigned int i;
+      tree type = build_nonstandard_integer_type (sz, 1);
+      FOR_EACH_VEC_SAFE_ELT (CONSTRUCTOR_ELTS (rhs), i, elt)
+       {
+         if (TREE_CODE (elt->value) != SSA_NAME
+             || !INTEGRAL_TYPE_P (TREE_TYPE (elt->value)))
+           return NULL;
+         struct symbolic_number n1;
+         gimple *source_stmt
+           = find_bswap_or_nop_1 (SSA_NAME_DEF_STMT (elt->value), &n1,
+                                  limit - 1);
+
+         if (!source_stmt)
+           return NULL;
+
+         n1.type = type;
+         if (!n1.base_addr)
+           n1.range = sz / BITS_PER_UNIT;
+
+         if (i == 0)
+           {
+             ins_stmt = source_stmt;
+             *n = n1;
+           }
+         else
+           {
+             if (n->vuse != n1.vuse)
+               return NULL;
+
+             struct symbolic_number n0 = *n;
+
+             if (!BYTES_BIG_ENDIAN)
+               {
+                 if (!do_shift_rotate (LSHIFT_EXPR, &n1, i * eltsz))
+                   return NULL;
+               }
+             else if (!do_shift_rotate (LSHIFT_EXPR, &n0, eltsz))
+               return NULL;
+             ins_stmt
+               = perform_symbolic_merge (ins_stmt, &n0, source_stmt, &n1, n);
+
+             if (!ins_stmt)
+               return NULL;
+           }
+       }
+    }

  uint64_t cmpxchg, cmpnop;
  find_bswap_or_nop_finalize (n, &cmpxchg, &cmpnop);
@@ -939,11 +998,18 @@ bswap_replace (gimple_stmt_iterator gsi,
{
  tree src, tmp, tgt = NULL_TREE;
  gimple *bswap_stmt;
+  tree_code conv_code = NOP_EXPR;

  gimple *cur_stmt = gsi_stmt (gsi);
  src = n->src;
  if (cur_stmt)
-    tgt = gimple_assign_lhs (cur_stmt);
+    {
+      tgt = gimple_assign_lhs (cur_stmt);
+      if (gimple_assign_rhs_code (cur_stmt) == CONSTRUCTOR
+         && tgt
+         && VECTOR_TYPE_P (TREE_TYPE (tgt)))
+       conv_code = VIEW_CONVERT_EXPR;
+    }

  /* Need to load the value from memory first.  */
  if (n->base_addr)
@@ -1031,7 +1097,9 @@ bswap_replace (gimple_stmt_iterator gsi,
              load_stmt = gimple_build_assign (val_tmp, val_expr);
              gimple_set_vuse (load_stmt, n->vuse);
              gsi_insert_before (&gsi, load_stmt, GSI_SAME_STMT);
-             gimple_assign_set_rhs_with_ops (&gsi, NOP_EXPR, val_tmp);
+             if (conv_code == VIEW_CONVERT_EXPR)
+               val_tmp = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (tgt), val_tmp);
+             gimple_assign_set_rhs_with_ops (&gsi, conv_code, val_tmp);
              update_stmt (cur_stmt);
            }
          else if (cur_stmt)
@@ -1073,7 +1141,9 @@ bswap_replace (gimple_stmt_iterator gsi,
        {
          if (!is_gimple_val (src))
            return NULL_TREE;
-         g = gimple_build_assign (tgt, NOP_EXPR, src);
+         if (conv_code == VIEW_CONVERT_EXPR)
+           src = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (tgt), src);
+         g = gimple_build_assign (tgt, conv_code, src);
        }
      else if (cur_stmt)
        g = gimple_build_assign (tgt, src);
@@ -1153,7 +1223,10 @@ bswap_replace (gimple_stmt_iterator gsi,
      gimple *convert_stmt;

      tmp = make_temp_ssa_name (bswap_type, NULL, "bswapdst");
-      convert_stmt = gimple_build_assign (tgt, NOP_EXPR, tmp);
+      tree atmp = tmp;
+      if (conv_code == VIEW_CONVERT_EXPR)
+       atmp = build1 (VIEW_CONVERT_EXPR, TREE_TYPE (tgt), tmp);
+      convert_stmt = gimple_build_assign (tgt, conv_code, atmp);
      gsi_insert_after (&gsi, convert_stmt, GSI_SAME_STMT);
    }

@@ -1258,6 +1331,14 @@ pass_optimize_bswap::execute (function *
              /* Fall through.  */
            case BIT_IOR_EXPR:
              break;
+           case CONSTRUCTOR:
+             {
+               tree rhs = gimple_assign_rhs1 (cur_stmt);
+               if (VECTOR_TYPE_P (TREE_TYPE (rhs))
+                   && INTEGRAL_TYPE_P (TREE_TYPE (TREE_TYPE (rhs))))
+                 break;
+             }
+             continue;
            default:
              continue;
            }
--- gcc/testsuite/gcc.dg/pr96239.c.jj   2020-12-14 20:20:32.668775009 +0100
+++ gcc/testsuite/gcc.dg/pr96239.c      2020-12-14 20:22:05.000746427 +0100
@@ -0,0 +1,54 @@
+/* PR tree-optimization/96239 */
+/* { dg-do run { target { ilp32 || lp64 } } } */
+/* { dg-options "-O2 -fdump-tree-optimized" } */
+/* { dg-final { scan-tree-dump-times " r>> 8;" 1 "optimized" { target bswap } 
} } */
+/* { dg-final { scan-tree-dump-times " = __builtin_bswap64 " 1 "optimized" { 
target bswap } } } */
+/* { dg-final { scan-tree-dump-not " >> \(8\|16\|24\|32\|40\|48\|56\);" 
"optimized" { target bswap } } } */
+
+typedef unsigned char V __attribute__((vector_size (2)));
+typedef unsigned char W __attribute__((vector_size (8)));
+
+__attribute__((noipa)) void
+foo (unsigned short x, V *p)
+{
+  *p = (V) { x >> 8, x };
+}
+
+__attribute__((noipa)) void
+bar (unsigned long long x, W *p)
+{
+  *p = (W) { x >> 56, x >> 48, x >> 40, x >> 32, x >> 24, x >> 16, x >> 8, x };
+}
+
+__attribute__((noipa)) void
+baz (unsigned short x, V *p)
+{
+  *p = (V) { x, x >> 8 };
+}
+
+__attribute__((noipa)) void
+qux (unsigned long long x, W *p)
+{
+  *p = (W) { x, x >> 8, x >> 16, x >> 24, x >> 32, x >> 40, x >> 48, x >> 56 };
+}
+
+int
+main ()
+{
+  V a, c, e, g;
+  W b, d, f, h;
+  foo (0xcafe, &a);
+  bar (0xdeadbeefcafebabeULL, &b);
+  baz (0xdead, &c);
+  qux (0xfeedbac1beefdeadULL, &d);
+  e = (V) { 0xca, 0xfe };
+  f = (W) { 0xde, 0xad, 0xbe, 0xef, 0xca, 0xfe, 0xba, 0xbe };
+  g = (V) { 0xad, 0xde };
+  h = (W) { 0xad, 0xde, 0xef, 0xbe, 0xc1, 0xba, 0xed, 0xfe };
+  if (__builtin_memcmp (&a, &e, sizeof (V))
+      || __builtin_memcmp (&b, &f, sizeof (W))
+      || __builtin_memcmp (&c, &g, sizeof (V))
+      || __builtin_memcmp (&d, &h, sizeof (W)))
+    __builtin_abort ();
+  return 0;
+}

        Jakub


Reply via email to