Optimize _1 = *srcp_3(D); _4 = VEC_PERM_EXPR <_1, _1, { 4, 5, 6, 7, 4, 5, 6, 7 }>; _5 = BIT_FIELD_REF <_4, 128, 0>;
to _1 = *srcp_3(D); _5 = BIT_FIELD_REF <_1, 128, 128>; the above will finally be optimized to _5 = BIT_FIELD_REF <*srcp_3(D), 128, 128>; Bootstrapped and regtested on x86_64-pc-linux-gnu{-m32,}. Ok for trunk? gcc/ChangeLog: PR tree-optimization/102583 * gimple.h (gate_optimize_vector_load): Declare. * match.pd: Simplify (BIT_FIELD_REF (vec_perm *p *p { 4, 5, 6, 7, 4, 5, 6, 7 }) 128 0) to (BIT_FIELD_REF *p 128 128). * tree-ssa-forwprop.cc (gate_optimize_vector_load): New function. (pass_forwprop::execute): Move the condition checks into the new function. gcc/testsuite/ChangeLog: * gcc.target/i386/pr102583.c: New test. --- gcc/gimple.h | 1 + gcc/match.pd | 56 ++++++++++++++++++++++++ gcc/testsuite/gcc.target/i386/pr102583.c | 30 +++++++++++++ gcc/tree-ssa-forwprop.cc | 32 +++++++++----- 4 files changed, 109 insertions(+), 10 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr102583.c diff --git a/gcc/gimple.h b/gcc/gimple.h index 6b1e89ad74e..1747dae1193 100644 --- a/gcc/gimple.h +++ b/gcc/gimple.h @@ -1638,6 +1638,7 @@ extern void maybe_remove_unused_call_args (struct function *, gimple *); extern bool gimple_inexpensive_call_p (gcall *); extern bool stmt_can_terminate_bb_p (gimple *); extern location_t gimple_or_expr_nonartificial_location (gimple *, tree); +extern bool gate_optimize_vector_load (gimple *); /* Return the disposition for a warning (or all warnings by default) for a statement. 
*/ diff --git a/gcc/match.pd index 6d691d302b3..ac214310251 100644 --- a/gcc/match.pd +++ b/gcc/match.pd @@ -6832,6 +6832,62 @@ DEFINE_INT_AND_FLOAT_ROUND_FN (RINT) } (cmp @0 { res; }))))))))) +#if GIMPLE +/* Simplify partial vector access, transform + + V8SI A; + V4SI B; + A = *PA; + B = VEC_PERM_EXPR (A, A, { 4, 5, 6, 7, 4, 5, 6, 7 }); + C = BIT_FIELD_REF (B, 128, 0) + +to + + A = *PA; + C = BIT_FIELD_REF (A, 128, 128); + +optimize_vector_load will eventually optimize the above to + + C = BIT_FIELD_REF (*PA, 128, 128); */ + +(simplify + (BIT_FIELD_REF (vec_perm@2 SSA_NAME@0 @0 VECTOR_CST@1) @rsize @rpos) + (if (VECTOR_TYPE_P (type) + && TYPE_MODE (type) != BLKmode + && single_use (@2) + && gate_optimize_vector_load (SSA_NAME_DEF_STMT (@0)) + && types_match (TREE_TYPE (type), TREE_TYPE (TREE_TYPE (@0)))) + (with + { + unsigned HOST_WIDE_INT nelts = -1; + if (!VECTOR_CST_NELTS (@1).is_constant (&nelts)) + return NULL_TREE; + tree inner_type = TREE_TYPE (type); + unsigned HOST_WIDE_INT elt_w = tree_to_uhwi (TYPE_SIZE (inner_type)); + unsigned HOST_WIDE_INT pos = tree_to_uhwi (@rpos); + unsigned HOST_WIDE_INT size = tree_to_uhwi (@rsize); + unsigned HOST_WIDE_INT start + = tree_to_uhwi (vector_cst_elt (@1, pos / elt_w)); + + for (unsigned HOST_WIDE_INT i = pos / elt_w + 1; i != size / elt_w; i++) + { + /* Continuous area. */ + if (tree_to_uhwi (vector_cst_elt (@1, i)) - 1 + != tree_to_uhwi (vector_cst_elt (@1, i - 1))) + return NULL_TREE; + } + + /* Aligned or support movmisalign_optab. */ + unsigned HOST_WIDE_INT dest_align = tree_to_uhwi (TYPE_SIZE (type)); + if ((TYPE_ALIGN (TREE_TYPE (@0)) % dest_align + || start * elt_w % dest_align) + && (optab_handler (movmisalign_optab, TYPE_MODE (type)) + == CODE_FOR_nothing)) + return NULL_TREE; + } + (BIT_FIELD_REF @0 @rsize { bitsize_int (start * elt_w); })))) +#endif + /* Canonicalizations of BIT_FIELD_REFs. 
*/ (simplify diff --git a/gcc/testsuite/gcc.target/i386/pr102583.c b/gcc/testsuite/gcc.target/i386/pr102583.c new file mode 100644 index 00000000000..ff2ffb5e671 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr102583.c @@ -0,0 +1,30 @@ +/* { dg-do compile } */ +/* { dg-options "-mavx512f -O2" } */ +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+20\(%.*%ymm} 1 } } */ +/* { dg-final { scan-assembler-times {(?n)vcvtdq2ps[ \t]+8\(%.*%xmm} 1 } } */ +/* { dg-final { scan-assembler-times {(?n)vmovq[ \t]+16\(%.*%xmm} 1 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-not {(?n)vpermd[ \t]+.*%zmm} } } */ + +typedef int v16si __attribute__((vector_size(64))); +typedef float v8sf __attribute__((vector_size(32))); +typedef float v4sf __attribute__((vector_size(16))); +typedef float v2sf __attribute__((vector_size(8))); + +v8sf part (v16si *srcp) +{ + v16si src = *srcp; + return (v8sf) { (float)src[5], (float)src[6], (float)src[7], (float)src[8], + (float)src[9], (float)src[10], (float)src[11], (float)src[12] }; +} + +v4sf part1 (v16si *srcp) +{ + v16si src = *srcp; + return (v4sf) { (float)src[2], (float)src[3], (float)src[4], (float)src[5] }; +} + +v2sf part2 (v16si *srcp) +{ + v16si src = *srcp; + return (v2sf) { (float)src[4], (float)src[5] }; +} diff --git a/gcc/tree-ssa-forwprop.cc b/gcc/tree-ssa-forwprop.cc index 484491fa1c5..2c8d8bc6dce 100644 --- a/gcc/tree-ssa-forwprop.cc +++ b/gcc/tree-ssa-forwprop.cc @@ -3074,6 +3074,27 @@ simplify_vector_constructor (gimple_stmt_iterator *gsi) return true; } +/* Gate for optimize_vector_load. 
*/ +bool +gate_optimize_vector_load (gimple *stmt) +{ + if (!is_gimple_assign (stmt)) + return false; + + tree lhs = gimple_assign_lhs (stmt); + tree rhs = gimple_assign_rhs1 (stmt); + return (cfun + && TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE + && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode + /* After vector lowering rewrite all loads, but + initially do not since this conflicts with + vector CONSTRUCTOR to shuffle optimization. */ + || (cfun->curr_properties & PROP_gimple_lvec)) + && gimple_assign_load_p (stmt) + && !gimple_has_volatile_ops (stmt) + && !stmt_can_throw_internal (cfun, stmt) + && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs))); +} /* Rewrite the vector load at *GSI to component-wise loads if the load is only used in BIT_FIELD_REF extractions with eventual intermediate @@ -3500,16 +3521,7 @@ pass_forwprop::execute (function *fun) else gsi_next (&gsi); } - else if (TREE_CODE (TREE_TYPE (lhs)) == VECTOR_TYPE - && (TYPE_MODE (TREE_TYPE (lhs)) == BLKmode - /* After vector lowering rewrite all loads, but - initially do not since this conflicts with - vector CONSTRUCTOR to shuffle optimization. */ - || (fun->curr_properties & PROP_gimple_lvec)) - && gimple_assign_load_p (stmt) - && !gimple_has_volatile_ops (stmt) - && !stmt_can_throw_internal (cfun, stmt) - && (!VAR_P (rhs) || !DECL_HARD_REGISTER (rhs))) + else if (gate_optimize_vector_load (stmt)) optimize_vector_load (&gsi); else if (code == COMPLEX_EXPR) -- 2.18.1