Re: [PATCH] vect: Try to remove single-vector permutes from SLP graph

2022-09-01 Thread Richard Biener via Gcc-patches
On Thu, 1 Sep 2022, Richard Sandiford wrote:

> This patch extends the SLP layout optimisation pass so that it
> tries to remove layout changes that are brought about by permutes
> of existing vectors.  This fixes the bb-slp-pr54400.c regression on
> x86_64 and also means that we can remove the permutes in cases like:
> 
> typedef float v4sf __attribute__((vector_size(sizeof(float)*4)));
> 
> float __attribute__((noipa))
> f(v4sf v0, v4sf v1)
> {
>   return v0[0]*v1[0]+v0[1]*v1[1]+v0[2]*v1[2]+v0[3]*v1[3];
> }
> 
> The new test is a simple adaption of bb-slp-pr54400.c, with the
> same style of markup.
> 
> Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

OK.

Thanks,
Richard.

> 
> Richard
> 
> PS. Sorry for missing the regression during testing.  The initial
> version of the new bb-slp-layout* tests had markup that led
> to a lot of FAILs on x86_64.  I fixed the markup to avoid those,
> but didn't notice this extra (unrelated) failure at the end.
> 
> 
> gcc/
>   * tree-vect-slp.cc (vect_build_slp_tree_2): When building a
>   VEC_PERM_EXPR of an existing vector, set the SLP_TREE_LANES
>   to the number of vector elements, if that's a known constant.
>   (vect_optimize_slp_pass::is_compatible_layout): Remove associated
>   comment about zero SLP_TREE_LANES.
>   (vect_optimize_slp_pass::start_choosing_layouts): Iterate over
>   all partition members when looking for potential layouts.
>   Handle existing permutes of fixed-length vectors.
> 
> gcc/testsuite/
>   * gcc.dg/vect/bb-slp-pr54400.c: Extend to aarch64.
>   * gcc.dg/vect/bb-slp-layout-18.c: New test.
> ---
>  gcc/testsuite/gcc.dg/vect/bb-slp-layout-18.c | 15 +
>  gcc/testsuite/gcc.dg/vect/bb-slp-pr54400.c   |  4 +-
>  gcc/tree-vect-slp.cc | 67 +---
>  3 files changed, 61 insertions(+), 25 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-layout-18.c
> 
> diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-18.c 
> b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-18.c
> new file mode 100644
> index 000..ff462722507
> --- /dev/null
> +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-18.c
> @@ -0,0 +1,15 @@
> +/* { dg-do compile } */
> +/* { dg-require-effective-target vect_float} */
> +/* { dg-additional-options "-w -Wno-psabi -ffast-math" } */
> +
> +typedef float v4sf __attribute__((vector_size(sizeof(float)*4)));
> +
> +float __attribute__((noipa))
> +f(v4sf v0, v4sf v1)
> +{
> +  return v0[0]*v1[0]+v0[1]*v1[1]+v0[2]*v1[2]+v0[3]*v1[3];
> +}
> +
> +/* We are lacking an effective target for .REDUC_PLUS support.  */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized" 1 "slp2" 
> { target x86_64-*-* aarch64*-*-* } } } */
> +/* { dg-final { scan-tree-dump-not " = VEC_PERM_EXPR" "slp2" { target 
> x86_64-*-* aarch64*-*-* } } } */
> diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr54400.c 
> b/gcc/testsuite/gcc.dg/vect/bb-slp-pr54400.c
> index 6b427aac774..8aec2092f4d 100644
> --- a/gcc/testsuite/gcc.dg/vect/bb-slp-pr54400.c
> +++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr54400.c
> @@ -39,5 +39,5 @@ main ()
>  }
>  
>  /* We are lacking an effective target for .REDUC_PLUS support.  */
> -/* { dg-final { scan-tree-dump-times "basic block part vectorized" 3 "slp2" 
> { target x86_64-*-* } } } */
> -/* { dg-final { scan-tree-dump-not " = VEC_PERM_EXPR" "slp2" { target 
> x86_64-*-* } } } */
> +/* { dg-final { scan-tree-dump-times "basic block part vectorized" 3 "slp2" 
> { target x86_64-*-* aarch64*-*-* } } } */
> +/* { dg-final { scan-tree-dump-not " = VEC_PERM_EXPR" "slp2" { target 
> x86_64-*-* aarch64*-*-* } } } */
> diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
> index 226550635cc..cc04ef350a6 100644
> --- a/gcc/tree-vect-slp.cc
> +++ b/gcc/tree-vect-slp.cc
> @@ -1840,6 +1840,10 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
>TREE_TYPE (TREE_TYPE (vec;
> SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
>   }
> +  auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
> +  unsigned HOST_WIDE_INT const_nunits;
> +  if (nunits.is_constant (_nunits))
> + SLP_TREE_LANES (vnode) = const_nunits;
>SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
>/* We are always building a permutation node even if it is an identity
>permute to shield the rest of the vectorizer from the odd node
> @@ -4325,8 +4329,6 @@ vect_optimize_slp_pass::is_compatible_layout (slp_tree 
> node,
>if (layout_i == 0)
>  return true;
>  
> -  /* SLP_TREE_LANES is zero for existing vectors, but those only support
> - layout 0 anyway.  */
>if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
>  return false;
>  
> @@ -4521,38 +4523,57 @@ vect_optimize_slp_pass::start_choosing_layouts ()
>m_perms.safe_push (vNULL);
>  
>/* Create layouts from existing permutations.  */
> -  for (unsigned int node_i : m_leafs)

[PATCH] vect: Try to remove single-vector permutes from SLP graph

2022-09-01 Thread Richard Sandiford via Gcc-patches
This patch extends the SLP layout optimisation pass so that it
tries to remove layout changes that are brought about by permutes
of existing vectors.  This fixes the bb-slp-pr54400.c regression on
x86_64 and also means that we can remove the permutes in cases like:

typedef float v4sf __attribute__((vector_size(sizeof(float)*4)));

float __attribute__((noipa))
f(v4sf v0, v4sf v1)
{
  return v0[0]*v1[0]+v0[1]*v1[1]+v0[2]*v1[2]+v0[3]*v1[3];
}

The new test is a simple adaption of bb-slp-pr54400.c, with the
same style of markup.

Tested on aarch64-linux-gnu and x86_64-linux-gnu.  OK to install?

Richard

PS. Sorry for missing the regression during testing.  The initial
version of the new bb-slp-layout* tests had markup that led
to a lot of FAILs on x86_64.  I fixed the markup to avoid those,
but didn't notice this extra (unrelated) failure at the end.


gcc/
* tree-vect-slp.cc (vect_build_slp_tree_2): When building a
VEC_PERM_EXPR of an existing vector, set the SLP_TREE_LANES
to the number of vector elements, if that's a known constant.
(vect_optimize_slp_pass::is_compatible_layout): Remove associated
comment about zero SLP_TREE_LANES.
(vect_optimize_slp_pass::start_choosing_layouts): Iterate over
all partition members when looking for potential layouts.
Handle existing permutes of fixed-length vectors.

gcc/testsuite/
* gcc.dg/vect/bb-slp-pr54400.c: Extend to aarch64.
* gcc.dg/vect/bb-slp-layout-18.c: New test.
---
 gcc/testsuite/gcc.dg/vect/bb-slp-layout-18.c | 15 +
 gcc/testsuite/gcc.dg/vect/bb-slp-pr54400.c   |  4 +-
 gcc/tree-vect-slp.cc | 67 +---
 3 files changed, 61 insertions(+), 25 deletions(-)
 create mode 100644 gcc/testsuite/gcc.dg/vect/bb-slp-layout-18.c

diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-layout-18.c 
b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-18.c
new file mode 100644
index 000..ff462722507
--- /dev/null
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-layout-18.c
@@ -0,0 +1,15 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target vect_float} */
+/* { dg-additional-options "-w -Wno-psabi -ffast-math" } */
+
+typedef float v4sf __attribute__((vector_size(sizeof(float)*4)));
+
+float __attribute__((noipa))
+f(v4sf v0, v4sf v1)
+{
+  return v0[0]*v1[0]+v0[1]*v1[1]+v0[2]*v1[2]+v0[3]*v1[3];
+}
+
+/* We are lacking an effective target for .REDUC_PLUS support.  */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized" 1 "slp2" { 
target x86_64-*-* aarch64*-*-* } } } */
+/* { dg-final { scan-tree-dump-not " = VEC_PERM_EXPR" "slp2" { target 
x86_64-*-* aarch64*-*-* } } } */
diff --git a/gcc/testsuite/gcc.dg/vect/bb-slp-pr54400.c 
b/gcc/testsuite/gcc.dg/vect/bb-slp-pr54400.c
index 6b427aac774..8aec2092f4d 100644
--- a/gcc/testsuite/gcc.dg/vect/bb-slp-pr54400.c
+++ b/gcc/testsuite/gcc.dg/vect/bb-slp-pr54400.c
@@ -39,5 +39,5 @@ main ()
 }
 
 /* We are lacking an effective target for .REDUC_PLUS support.  */
-/* { dg-final { scan-tree-dump-times "basic block part vectorized" 3 "slp2" { 
target x86_64-*-* } } } */
-/* { dg-final { scan-tree-dump-not " = VEC_PERM_EXPR" "slp2" { target 
x86_64-*-* } } } */
+/* { dg-final { scan-tree-dump-times "basic block part vectorized" 3 "slp2" { 
target x86_64-*-* aarch64*-*-* } } } */
+/* { dg-final { scan-tree-dump-not " = VEC_PERM_EXPR" "slp2" { target 
x86_64-*-* aarch64*-*-* } } } */
diff --git a/gcc/tree-vect-slp.cc b/gcc/tree-vect-slp.cc
index 226550635cc..cc04ef350a6 100644
--- a/gcc/tree-vect-slp.cc
+++ b/gcc/tree-vect-slp.cc
@@ -1840,6 +1840,10 @@ vect_build_slp_tree_2 (vec_info *vinfo, slp_tree node,
 TREE_TYPE (TREE_TYPE (vec;
  SLP_TREE_VECTYPE (vnode) = TREE_TYPE (vec);
}
+  auto nunits = TYPE_VECTOR_SUBPARTS (SLP_TREE_VECTYPE (vnode));
+  unsigned HOST_WIDE_INT const_nunits;
+  if (nunits.is_constant (_nunits))
+   SLP_TREE_LANES (vnode) = const_nunits;
   SLP_TREE_VEC_DEFS (vnode).safe_push (vec);
   /* We are always building a permutation node even if it is an identity
 permute to shield the rest of the vectorizer from the odd node
@@ -4325,8 +4329,6 @@ vect_optimize_slp_pass::is_compatible_layout (slp_tree 
node,
   if (layout_i == 0)
 return true;
 
-  /* SLP_TREE_LANES is zero for existing vectors, but those only support
- layout 0 anyway.  */
   if (SLP_TREE_LANES (node) != m_perms[layout_i].length ())
 return false;
 
@@ -4521,38 +4523,57 @@ vect_optimize_slp_pass::start_choosing_layouts ()
   m_perms.safe_push (vNULL);
 
   /* Create layouts from existing permutations.  */
-  for (unsigned int node_i : m_leafs)
+  auto_load_permutation_t tmp_perm;
+  for (unsigned int node_i : m_partitioned_nodes)
 {
-  auto  = m_vertices[node_i];
-  if (vertex.partition < 0)
-   continue;
-
   /* Leafs also double as entries to the reverse graph.  Allow the
 layout of