Re: [PATCH] i386: Optimize vshuf{i, f}{32x4, 64x2} ymm and vperm{i, f}128 ymm

2023-04-18 Thread Hongtao Liu via Gcc-patches
On Tue, Apr 18, 2023 at 2:52 PM Hu, Lin1 via Gcc-patches wrote:
>
> Hi, all
>
> The patch aims to optimize vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128.
> It has been regtested on x86_64-pc-linux-gnu. OK for trunk?
Ok.

[PATCH] i386: Optimize vshuf{i, f}{32x4, 64x2} ymm and vperm{i, f}128 ymm

2023-04-18 Thread Hu, Lin1 via Gcc-patches
Hi, all

The patch aims to optimize vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128.
It has been regtested on x86_64-pc-linux-gnu. OK for trunk?

Thanks.
Lin

vshuf{i,f}{32x4,64x2} ymm and vperm{i,f}128 ymm take 3 clock cycles.
We can optimize them to vblendps or vmovaps when there is no cross-lane shuffle.
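
As an illustration (this is not one of the new tests, whose contents are not
quoted here), a shuffle whose immediate keeps each 128-bit half in its own
lane can now become a single blend.  A minimal sketch, assuming -O2 -mavx512vl
and a made-up function name:

/* Hypothetical example, not taken from the patch or its testsuite.
   With imm8 == 2 the result takes its low 128 bits from a and its
   high 128 bits from b, so no data crosses a 128-bit lane and the
   shuffle can be emitted as
     vblendps $240, %ymm1, %ymm0, %ymm0
   instead of vshuff32x4.  */
#include <immintrin.h>

__m256
no_cross_lane (__m256 a, __m256 b)
{
  return _mm256_shuffle_f32x4 (a, b, 2);
}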

gcc/ChangeLog:

* config/i386/sse.md: Modify insn vperm{i,f}
and vshuf{i,f}.

gcc/testsuite/ChangeLog:

* gcc.target/i386/avx512vl-vshuff32x4-1.c: Modify test.
* gcc.target/i386/avx512vl-vshuff64x2-1.c: Ditto.
* gcc.target/i386/avx512vl-vshufi32x4-1.c: Ditto.
* gcc.target/i386/avx512vl-vshufi64x2-1.c: Ditto.
* gcc.target/i386/opt-vperm-vshuf-1.c: New test.
* gcc.target/i386/opt-vperm-vshuf-2.c: Ditto.
* gcc.target/i386/opt-vperm-vshuf-3.c: Ditto.
---
 gcc/config/i386/sse.md| 36 --
 .../gcc.target/i386/avx512vl-vshuff32x4-1.c   |  2 +-
 .../gcc.target/i386/avx512vl-vshuff64x2-1.c   |  2 +-
 .../gcc.target/i386/avx512vl-vshufi32x4-1.c   |  2 +-
 .../gcc.target/i386/avx512vl-vshufi64x2-1.c   |  2 +-
 .../gcc.target/i386/opt-vperm-vshuf-1.c   | 51 ++
 .../gcc.target/i386/opt-vperm-vshuf-2.c   | 68 +++
 .../gcc.target/i386/opt-vperm-vshuf-3.c   | 63 +
 8 files changed, 218 insertions(+), 8 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-1.c
 create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-2.c
 create mode 100644 gcc/testsuite/gcc.target/i386/opt-vperm-vshuf-3.c

diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index 513960e8f33..5b6b2427460 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -18437,6 +18437,8 @@
   mask = INTVAL (operands[3]) / 2;
   mask |= (INTVAL (operands[5]) - 4) / 2 << 1;
   operands[3] = GEN_INT (mask);
+  if (INTVAL (operands[3]) == 2 && !<mask_applied>)
+return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
   return "vshuf64x2\t{%3, %2, %1, 
%0|%0, %1, %2, %3}";
 }
   [(set_attr "type" "sselog")
@@ -18595,6 +18597,9 @@
   mask |= (INTVAL (operands[7]) - 8) / 4 << 1;
   operands[3] = GEN_INT (mask);
 
+  if (INTVAL (operands[3]) == 2 && !<mask_applied>)
+return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+
   return "vshuf32x4\t{%3, %2, %1, 
%0|%0, %1, %2, %3}";
 }
   [(set_attr "type" "sselog")
@@ -25663,7 +25668,28 @@
   (match_operand:SI 3 "const_0_to_255_operand")]
  UNSPEC_VPERMTI))]
   "TARGET_AVX2"
-  "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}"
+  {
+int mask = INTVAL (operands[3]);
+if ((mask & 0xbb) == 16)
+  {
+   if (rtx_equal_p (operands[0], operands[1]))
+ return "";
+   else
+ return "vmovaps\t{%1, %0|%0, %1}";
+  }
+if ((mask & 0xbb) == 50)
+  {
+   if (rtx_equal_p (operands[0], operands[2]))
+ return "";
+   else
+ return "vmovaps\t{%2, %0|%0, %2}";
+  }
+if ((mask & 0xbb) == 18)
+  return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+if ((mask & 0xbb) == 48)
+  return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+return "vperm2i128\t{%3, %2, %1, %0|%0, %1, %2, %3}";
+  }
   [(set_attr "type" "sselog")
(set_attr "prefix" "vex")
(set_attr "mode" "OI")])
@@ -26226,9 +26252,11 @@
&& avx_vperm2f128_parallel (operands[3], <MODE>mode)"
 {
   int mask = avx_vperm2f128_parallel (operands[3], <MODE>mode) - 1;
-  if (mask == 0x12)
-return "vinsert\t{$0, %x2, %1, %0|%0, %1, %x2, 0}";
-  if (mask == 0x20)
+  if ((mask & 0xbb) == 0x12)
+return "vblendps\t{$15, %2, %1, %0|%0, %1, %2, 15}";
+  if ((mask & 0xbb) == 0x30)
+return "vblendps\t{$240, %2, %1, %0|%0, %1, %2, 240}";
+  if ((mask & 0xbb) == 0x20)
 return "vinsert\t{$1, %x2, %1, %0|%0, %1, %x2, 1}";
   operands[3] = GEN_INT (mask);
   return "vperm2\t{%3, %2, %1, %0|%0, %1, %2, %3}";
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c
index 6c2fb2f184a..02aecf4edce 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff32x4-1.c
@@ -12,7 +12,7 @@ volatile __mmask8 m;
 void extern
 avx512vl_test (void)
 {
-  x = _mm256_shuffle_f32x4 (x, x, 2);
+  x = _mm256_shuffle_f32x4 (x, x, 3);
   x = _mm256_mask_shuffle_f32x4 (x, m, x, x, 2);
   x = _mm256_maskz_shuffle_f32x4 (m, x, x, 2);
 }
diff --git a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c
index 1191b400134..563ded5d9df 100644
--- a/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c
+++ b/gcc/testsuite/gcc.target/i386/avx512vl-vshuff64x2-1.c
@@ -12,7 +12,7 @@ volatile __mmask8 m;
 void extern
 avx512vl_test (void)
 {
-  x = _mm256_shuffle_f64x2 (x, x, 2);
+  x = _mm256_shuffle_f64x2 (x, x, 3);
   x = _mm256_mask_shuffle_f64x2 (x, m, x, x, 2);
   x = _mm256_maskz_shuffle_f64x2 (m, x, x, 2);
 }
diff --git
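
For reference (illustrative only, not part of the patch): in the
vperm2{i,f}128 immediate, bits 1:0 select the source of the destination's low
128-bit lane, bits 5:4 the high lane, bits 3 and 7 zero a lane, and bits 2
and 6 are ignored, which is what masking with 0xbb expresses.  A small sketch
with a made-up helper that classifies an imm8 the same way the new
vperm2i128 output code above does:

/* Illustrative decoder, not part of the patch.  cheaper_form () is a
   made-up helper mirroring the (mask & 0xbb) tests in the vperm2i128
   hunk; the real code additionally emits nothing at all when the
   destination already equals the selected source.  */
#include <stdio.h>

static const char *
cheaper_form (int imm8)
{
  switch (imm8 & 0xbb)  /* bits 2 and 6 are don't-care */
    {
    case 0x10: return "vmovaps src1, dst";           /* dst = src1 */
    case 0x32: return "vmovaps src2, dst";           /* dst = src2 */
    case 0x12: return "vblendps $15, src2, src1";    /* low lane from src2 */
    case 0x30: return "vblendps $240, src2, src1";   /* high lane from src2 */
    default:   return "vperm2i128 (cross-lane or zeroing)";
    }
}

int
main (void)
{
  printf ("imm8 0x30 -> %s\n", cheaper_form (0x30));
  printf ("imm8 0x21 -> %s\n", cheaper_form (0x21));
  return 0;
}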