Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3.

2024-05-23 Thread Hongtao Liu
On Thu, May 23, 2024 at 3:17 PM Hu, Lin1  wrote:
>
> > -----Original Message-----
> > From: Hongtao Liu 
> > Sent: Thursday, May 23, 2024 2:42 PM
> > To: Hu, Lin1 
> > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ;
> > ubiz...@gmail.com; rguent...@suse.de
> > Subject: Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3.
> >
> > On Thu, May 23, 2024 at 2:38 PM Hu, Lin1  wrote:
> > >
> > > gcc/ChangeLog:
> > >
> > > PR 107432
> > > * config/i386/i386-expand.cc 
> > > (ix86_expand_trunc_with_avx2_noavx512f):
> > > New function for generate a series of suitable insn.
> > > * config/i386/i386-protos.h 
> > > (ix86_expand_trunc_with_avx2_noavx512f):
> > > Define new function.
> > > * config/i386/sse.md: Extend trunc2 for x86-64-v3.
> > I have some concern for this patch since
> > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069, let's hold on to this
> > patch.
>
> OK, maybe we need to modify ix86_expand_vec_perm_const_1, let it emit some 
> better code. Maybe like clang (https://godbolt.org/z/rTKPq9oj5).
> Or we can disable some of the optimization via vpermq. In pr107432-8.c, there 
> are only 5 tests that use vpermq.
On second thought, we can go ahead with the patch. For PR115069,
there's an alternative that avoids cross-lane truncation, but for this
one there's no alternative. Although cross-lane permutation is not
very efficient, it should still be better than the original code.
>
> BRs,
> Lin
>
> > > gcc/testsuite/ChangeLog:
> > >
> > > PR 107432
> > > * gcc.target/i386/pr107432-8.c: New test.
> > > * gcc.target/i386/pr107432-9.c: Ditto.
> > > * gcc.target/i386/pr92645-4.c: Modify test.
> > > ---
> > >  gcc/config/i386/i386-expand.cc |  47 +++-
> > >  gcc/config/i386/i386-protos.h  |   3 +
> > >  gcc/config/i386/sse.md |  87 +++
> > >  gcc/testsuite/gcc.target/i386/pr107432-8.c |  73 +
> > > gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +
> > >  gcc/testsuite/gcc.target/i386/pr92645-4.c  |   2 -
> > >  6 files changed, 304 insertions(+), 29 deletions(-)  create mode
> > > 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c
> > >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c
> > >
> > > diff --git a/gcc/config/i386/i386-expand.cc
> > > b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..bca8b85c9d1 100644
> > > --- a/gcc/config/i386/i386-expand.cc
> > > +++ b/gcc/config/i386/i386-expand.cc
> > > @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[])
> > >emit_insn (gen_xorv4si3 (value, value, large));  }
> > >
> > > -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
> > > -machine_mode mode, rtx 
> > > target,
> > > -rtx var, int one_var);
> > > -
> > >  /* Convert an unsigned DImode value into a DFmode, using only SSE.
> > > Expects the 64-bit DImode to be supplied in a pair of integral
> > > registers.  Requires SSE2; will use SSE3 if available.  For
> > > x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool
> > mmx_ok, machine_mode mode,
> > > whose ONE_VAR element is VAR, and other elements are zero.  Return 
> > > true
> > > if successful.  */
> > >
> > > -static bool
> > > +bool
> > >  ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
> > >  rtx target, rtx var, int one_var)
> > > { @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val)
> > >return ret;
> > >  }
> > >
> > > +/* Trunc a vector to a narrow vector, like v4di -> v4si.  */
> > > +
> > > +bool
> > > +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) {
> > > +  machine_mode out_mode = GET_MODE (output);
> > > +  machine_mode in_mode = GET_MODE (input);
> > > +  int len = GET_MODE_SIZE (in_mode);
> > > +  gcc_assert (len == 16 || len == 32);
> > > +  machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode;
> > > +  int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
> > > +  int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode));
> > > +
> > >

RE: [PATCH 3/3] vect: support direct conversion under x86-64-v3.

2024-05-23 Thread Hu, Lin1
> -----Original Message-----
> From: Hongtao Liu 
> Sent: Thursday, May 23, 2024 2:42 PM
> To: Hu, Lin1 
> Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ;
> ubiz...@gmail.com; rguent...@suse.de
> Subject: Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3.
> 
> On Thu, May 23, 2024 at 2:38 PM Hu, Lin1  wrote:
> >
> > gcc/ChangeLog:
> >
> > PR 107432
> > * config/i386/i386-expand.cc 
> > (ix86_expand_trunc_with_avx2_noavx512f):
> > New function for generate a series of suitable insn.
> > * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f):
> > Define new function.
> > * config/i386/sse.md: Extend trunc2 for x86-64-v3.
> I have some concern for this patch since
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069, let's hold on to this
> patch.

OK, maybe we need to modify ix86_expand_vec_perm_const_1 so that it emits
better code, perhaps like clang does (https://godbolt.org/z/rTKPq9oj5).
Alternatively, we could disable those optimizations that are implemented
via vpermq; in pr107432-8.c, only 5 tests use vpermq.

BRs,
Lin
 
> > gcc/testsuite/ChangeLog:
> >
> > PR 107432
> > * gcc.target/i386/pr107432-8.c: New test.
> > * gcc.target/i386/pr107432-9.c: Ditto.
> > * gcc.target/i386/pr92645-4.c: Modify test.
> > ---
> >  gcc/config/i386/i386-expand.cc |  47 +++-
> >  gcc/config/i386/i386-protos.h  |   3 +
> >  gcc/config/i386/sse.md |  87 +++
> >  gcc/testsuite/gcc.target/i386/pr107432-8.c |  73 +
> > gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +
> >  gcc/testsuite/gcc.target/i386/pr92645-4.c  |   2 -
> >  6 files changed, 304 insertions(+), 29 deletions(-)  create mode
> > 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c
> >  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c
> >
> > diff --git a/gcc/config/i386/i386-expand.cc
> > b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..bca8b85c9d1 100644
> > --- a/gcc/config/i386/i386-expand.cc
> > +++ b/gcc/config/i386/i386-expand.cc
> > @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[])
> >emit_insn (gen_xorv4si3 (value, value, large));  }
> >
> > -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
> > -machine_mode mode, rtx 
> > target,
> > -rtx var, int one_var);
> > -
> >  /* Convert an unsigned DImode value into a DFmode, using only SSE.
> > Expects the 64-bit DImode to be supplied in a pair of integral
> > registers.  Requires SSE2; will use SSE3 if available.  For
> > x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool
> mmx_ok, machine_mode mode,
> > whose ONE_VAR element is VAR, and other elements are zero.  Return true
> > if successful.  */
> >
> > -static bool
> > +bool
> >  ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
> >  rtx target, rtx var, int one_var)
> > { @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val)
> >return ret;
> >  }
> >
> > +/* Trunc a vector to a narrow vector, like v4di -> v4si.  */
> > +
> > +bool
> > +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) {
> > +  machine_mode out_mode = GET_MODE (output);
> > +  machine_mode in_mode = GET_MODE (input);
> > +  int len = GET_MODE_SIZE (in_mode);
> > +  gcc_assert (len == 16 || len == 32);
> > +  machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode;
> > +  int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
> > +  int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode));
> > +
> > +  struct expand_vec_perm_d d;
> > +  d.target = gen_reg_rtx (cvt_mode);
> > +  d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input),
> > + in_mode);
> > +  d.op1 = d.op0;
> > +  d.vmode = cvt_mode;
> > +  d.nelt = len;
> > +  d.testing_p = false;
> > +  d.one_operand_p = true;
> > +
> > +  /* Init perm. Put the needed bits of input in order and
> > + fill the rest of bits by default.  */  int tot = 0;  for (int i
> > + = 0; i < len; ++i)
> > +{
> > +  d.perm[i] = i;
> > +  if ((i % in_innersize) < out_innersize)
> > +   d.perm[tot++] = i;
> > +}
> > +
> > +  if (ix86_expand_vec_perm_const_1(&d))
> > +{
> > +  emit_move_i

Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3.

2024-05-22 Thread Hongtao Liu
On Thu, May 23, 2024 at 2:38 PM Hu, Lin1  wrote:
>
> gcc/ChangeLog:
>
> PR 107432
> * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f):
> New function for generate a series of suitable insn.
> * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f):
> Define new function.
> * config/i386/sse.md: Extend trunc2 for x86-64-v3.
I have some concerns about this patch because of
https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069; let's hold off on
this patch for now.
> gcc/testsuite/ChangeLog:
>
> PR 107432
> * gcc.target/i386/pr107432-8.c: New test.
> * gcc.target/i386/pr107432-9.c: Ditto.
> * gcc.target/i386/pr92645-4.c: Modify test.
> ---
>  gcc/config/i386/i386-expand.cc |  47 +++-
>  gcc/config/i386/i386-protos.h  |   3 +
>  gcc/config/i386/sse.md |  87 +++
>  gcc/testsuite/gcc.target/i386/pr107432-8.c |  73 +
>  gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +
>  gcc/testsuite/gcc.target/i386/pr92645-4.c  |   2 -
>  6 files changed, 304 insertions(+), 29 deletions(-)
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c
>
> diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
> index 2f27bfb484c..bca8b85c9d1 100644
> --- a/gcc/config/i386/i386-expand.cc
> +++ b/gcc/config/i386/i386-expand.cc
> @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[])
>emit_insn (gen_xorv4si3 (value, value, large));
>  }
>
> -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
> -machine_mode mode, rtx 
> target,
> -rtx var, int one_var);
> -
>  /* Convert an unsigned DImode value into a DFmode, using only SSE.
> Expects the 64-bit DImode to be supplied in a pair of integral
> registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
> @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, 
> machine_mode mode,
> whose ONE_VAR element is VAR, and other elements are zero.  Return true
> if successful.  */
>
> -static bool
> +bool
>  ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
>  rtx target, rtx var, int one_var)
>  {
> @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val)
>return ret;
>  }
>
> +/* Trunc a vector to a narrow vector, like v4di -> v4si.  */
> +
> +bool
> +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input)
> +{
> +  machine_mode out_mode = GET_MODE (output);
> +  machine_mode in_mode = GET_MODE (input);
> +  int len = GET_MODE_SIZE (in_mode);
> +  gcc_assert (len == 16 || len == 32);
> +  machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode;
> +  int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
> +  int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode));
> +
> +  struct expand_vec_perm_d d;
> +  d.target = gen_reg_rtx (cvt_mode);
> +  d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), in_mode);
> +  d.op1 = d.op0;
> +  d.vmode = cvt_mode;
> +  d.nelt = len;
> +  d.testing_p = false;
> +  d.one_operand_p = true;
> +
> +  /* Init perm. Put the needed bits of input in order and
> + fill the rest of bits by default.  */
> +  int tot = 0;
> +  for (int i = 0; i < len; ++i)
> +{
> +  d.perm[i] = i;
> +  if ((i % in_innersize) < out_innersize)
> +   d.perm[tot++] = i;
> +}
> +
> +  if (ix86_expand_vec_perm_const_1(&d))
> +{
> +  emit_move_insn (output, gen_lowpart (out_mode, d.target));
> +  return true;
> +}
> +
> +  return false;
> +}
> +
>  #include "gt-i386-expand.h"
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index dbc861fb1ea..ac29fb34028 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, 
> rtx, enum rtx_code,
>  extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx,
>   bool, rtx_code_label *);
>  extern rtx ix86_expand_fast_convert_bf_to_sf (rtx);
> +extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx);
>  extern rtx ix86_memtag_untagged_pointer (rtx, rtx);
>  extern bool ix86_memtag_can_tag_addresses (void);
>
> @@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
>  extern void ix86_expand_sse2_abs (rtx, rtx);
>  extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
>rtx);
> +extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, rtx,
> +rtx, int);
>  extern bool ix86_extract_perm_from_pool_constant (int*, rtx);
>
>  /* In i386-c.cc  */
> diff --git a/g

[PATCH 3/3] vect: support direct conversion under x86-64-v3.

2024-05-22 Thread Hu, Lin1
gcc/ChangeLog:

PR 107432
* config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f):
New function to generate a series of suitable insns.
* config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f):
Define new function.
* config/i386/sse.md: Extend trunc2 for x86-64-v3.

gcc/testsuite/ChangeLog:

PR 107432
* gcc.target/i386/pr107432-8.c: New test.
* gcc.target/i386/pr107432-9.c: Ditto.
* gcc.target/i386/pr92645-4.c: Modify test.
---
 gcc/config/i386/i386-expand.cc |  47 +++-
 gcc/config/i386/i386-protos.h  |   3 +
 gcc/config/i386/sse.md |  87 +++
 gcc/testsuite/gcc.target/i386/pr107432-8.c |  73 +
 gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 +
 gcc/testsuite/gcc.target/i386/pr92645-4.c  |   2 -
 6 files changed, 304 insertions(+), 29 deletions(-)
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c
 create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c

diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc
index 2f27bfb484c..bca8b85c9d1 100644
--- a/gcc/config/i386/i386-expand.cc
+++ b/gcc/config/i386/i386-expand.cc
@@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[])
   emit_insn (gen_xorv4si3 (value, value, large));
 }
 
-static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok,
-machine_mode mode, rtx target,
-rtx var, int one_var);
-
 /* Convert an unsigned DImode value into a DFmode, using only SSE.
Expects the 64-bit DImode to be supplied in a pair of integral
registers.  Requires SSE2; will use SSE3 if available.  For x86_32,
@@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, 
machine_mode mode,
whose ONE_VAR element is VAR, and other elements are zero.  Return true
if successful.  */
 
-static bool
+bool
 ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
 rtx target, rtx var, int one_var)
 {
@@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val)
   return ret;
 }
 
+/* Truncate a vector to a narrower vector, e.g. v4di -> v4si.  */
+
+bool
+ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input)
+{
+  machine_mode out_mode = GET_MODE (output);
+  machine_mode in_mode = GET_MODE (input);
+  int len = GET_MODE_SIZE (in_mode);
+  gcc_assert (len == 16 || len == 32);
+  machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode;
+  int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode));
+  int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode));
+
+  struct expand_vec_perm_d d;
+  d.target = gen_reg_rtx (cvt_mode);
+  d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), in_mode);
+  d.op1 = d.op0;
+  d.vmode = cvt_mode;
+  d.nelt = len;
+  d.testing_p = false;
+  d.one_operand_p = true;
+
+  /* Init perm. Put the needed bits of input in order and
+ fill the rest of bits by default.  */
+  int tot = 0;
+  for (int i = 0; i < len; ++i)
+{
+  d.perm[i] = i;
+  if ((i % in_innersize) < out_innersize)
+   d.perm[tot++] = i;
+}
+
+  if (ix86_expand_vec_perm_const_1(&d))
+{
+  emit_move_insn (output, gen_lowpart (out_mode, d.target));
+  return true;
+}
+
+  return false;
+}
+
 #include "gt-i386-expand.h"
diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
index dbc861fb1ea..ac29fb34028 100644
--- a/gcc/config/i386/i386-protos.h
+++ b/gcc/config/i386/i386-protos.h
@@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, 
rtx, enum rtx_code,
 extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx,
  bool, rtx_code_label *);
 extern rtx ix86_expand_fast_convert_bf_to_sf (rtx);
+extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx);
 extern rtx ix86_memtag_untagged_pointer (rtx, rtx);
 extern bool ix86_memtag_can_tag_addresses (void);
 
@@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx);
 extern void ix86_expand_sse2_abs (rtx, rtx);
 extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx,
   rtx);
+extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, rtx,
+rtx, int);
 extern bool ix86_extract_perm_from_pool_constant (int*, rtx);
 
 /* In i386-c.cc  */
diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md
index f57f36ae380..0b14b3dc1ac 100644
--- a/gcc/config/i386/sse.md
+++ b/gcc/config/i386/sse.md
@@ -14373,14 +14373,25 @@ (define_expand "avx512bw_v32hiv32qi2_mask_store"
 
 (define_mode_iterator PMOV_DST_MODE_2
   [V4SI V8HI (V16QI "TARGET_AVX512BW")])
+(define_mode_iterator PMOV_DST_MODE_2_AVX2
+  [V4SI V8HI V16QI])
 (define_mode_attr pmov_suff_2