Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3.
On Thu, May 23, 2024 at 3:17 PM Hu, Lin1 wrote: > > > -Original Message- > > From: Hongtao Liu > > Sent: Thursday, May 23, 2024 2:42 PM > > To: Hu, Lin1 > > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; > > ubiz...@gmail.com; rguent...@suse.de > > Subject: Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3. > > > > On Thu, May 23, 2024 at 2:38 PM Hu, Lin1 wrote: > > > > > > gcc/ChangeLog: > > > > > > PR 107432 > > > * config/i386/i386-expand.cc > > > (ix86_expand_trunc_with_avx2_noavx512f): > > > New function for generate a series of suitable insn. > > > * config/i386/i386-protos.h > > > (ix86_expand_trunc_with_avx2_noavx512f): > > > Define new function. > > > * config/i386/sse.md: Extend trunc2 for x86-64-v3. > > I have some concern for this patch since > > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069, let's hold on to this > > patch. > > OK, maybe we need to modify ix86_expand_vec_perm_const_1, let it emit some > better code. Maybe like clang (https://godbolt.org/z/rTKPq9oj5). > Or we can disable some of the optimization via vpermq. In pr107432-8.c, there > are only 5 tests that use vpermq. On second thought, we may go ahead with the patch. For PR115069 there is an alternative that avoids cross-lane truncation, but for this one there is no alternative. Although cross-lane permutation is not very efficient, it should still be better than the original code. > > BRs, > Lin > > > > gcc/testsuite/ChangeLog: > > > > > > PR 107432 > > > * gcc.target/i386/pr107432-8.c: New test. > > > * gcc.target/i386/pr107432-9.c: Ditto. > > > * gcc.target/i386/pr92645-4.c: Modify test. 
> > > --- > > > gcc/config/i386/i386-expand.cc | 47 +++- > > > gcc/config/i386/i386-protos.h | 3 + > > > gcc/config/i386/sse.md | 87 +++ > > > gcc/testsuite/gcc.target/i386/pr107432-8.c | 73 + > > > gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 + > > > gcc/testsuite/gcc.target/i386/pr92645-4.c | 2 - > > > 6 files changed, 304 insertions(+), 29 deletions(-) create mode > > > 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c > > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c > > > > > > diff --git a/gcc/config/i386/i386-expand.cc > > > b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..bca8b85c9d1 100644 > > > --- a/gcc/config/i386/i386-expand.cc > > > +++ b/gcc/config/i386/i386-expand.cc > > > @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[]) > > >emit_insn (gen_xorv4si3 (value, value, large)); } > > > > > > -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, > > > -machine_mode mode, rtx > > > target, > > > -rtx var, int one_var); > > > - > > > /* Convert an unsigned DImode value into a DFmode, using only SSE. > > > Expects the 64-bit DImode to be supplied in a pair of integral > > > registers. Requires SSE2; will use SSE3 if available. For > > > x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool > > mmx_ok, machine_mode mode, > > > whose ONE_VAR element is VAR, and other elements are zero. Return > > > true > > > if successful. */ > > > > > > -static bool > > > +bool > > > ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, > > > rtx target, rtx var, int one_var) > > > { @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val) > > >return ret; > > > } > > > > > > +/* Trunc a vector to a narrow vector, like v4di -> v4si. 
*/ > > > + > > > +bool > > > +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) { > > > + machine_mode out_mode = GET_MODE (output); > > > + machine_mode in_mode = GET_MODE (input); > > > + int len = GET_MODE_SIZE (in_mode); > > > + gcc_assert (len == 16 || len == 32); > > > + machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode; > > > + int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode)); > > > + int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode)); > > > + > > >
RE: [PATCH 3/3] vect: support direct conversion under x86-64-v3.
> -Original Message- > From: Hongtao Liu > Sent: Thursday, May 23, 2024 2:42 PM > To: Hu, Lin1 > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; > ubiz...@gmail.com; rguent...@suse.de > Subject: Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3. > > On Thu, May 23, 2024 at 2:38 PM Hu, Lin1 wrote: > > > > gcc/ChangeLog: > > > > PR 107432 > > * config/i386/i386-expand.cc > > (ix86_expand_trunc_with_avx2_noavx512f): > > New function for generate a series of suitable insn. > > * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f): > > Define new function. > > * config/i386/sse.md: Extend trunc2 for x86-64-v3. > I have some concern for this patch since > https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069, let's hold on to this > patch. OK, maybe we need to modify ix86_expand_vec_perm_const_1, let it emit some better code. Maybe like clang (https://godbolt.org/z/rTKPq9oj5). Or we can disable some of the optimization via vpermq. In pr107432-8.c, there are only 5 tests that use vpermq. BRs, Lin > > gcc/testsuite/ChangeLog: > > > > PR 107432 > > * gcc.target/i386/pr107432-8.c: New test. > > * gcc.target/i386/pr107432-9.c: Ditto. > > * gcc.target/i386/pr92645-4.c: Modify test. 
> > --- > > gcc/config/i386/i386-expand.cc | 47 +++- > > gcc/config/i386/i386-protos.h | 3 + > > gcc/config/i386/sse.md | 87 +++ > > gcc/testsuite/gcc.target/i386/pr107432-8.c | 73 + > > gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 + > > gcc/testsuite/gcc.target/i386/pr92645-4.c | 2 - > > 6 files changed, 304 insertions(+), 29 deletions(-) create mode > > 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c > > > > diff --git a/gcc/config/i386/i386-expand.cc > > b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..bca8b85c9d1 100644 > > --- a/gcc/config/i386/i386-expand.cc > > +++ b/gcc/config/i386/i386-expand.cc > > @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[]) > >emit_insn (gen_xorv4si3 (value, value, large)); } > > > > -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, > > -machine_mode mode, rtx > > target, > > -rtx var, int one_var); > > - > > /* Convert an unsigned DImode value into a DFmode, using only SSE. > > Expects the 64-bit DImode to be supplied in a pair of integral > > registers. Requires SSE2; will use SSE3 if available. For > > x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool > mmx_ok, machine_mode mode, > > whose ONE_VAR element is VAR, and other elements are zero. Return true > > if successful. */ > > > > -static bool > > +bool > > ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, > > rtx target, rtx var, int one_var) > > { @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val) > >return ret; > > } > > > > +/* Trunc a vector to a narrow vector, like v4di -> v4si. */ > > + > > +bool > > +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) { > > + machine_mode out_mode = GET_MODE (output); > > + machine_mode in_mode = GET_MODE (input); > > + int len = GET_MODE_SIZE (in_mode); > > + gcc_assert (len == 16 || len == 32); > > + machine_mode cvt_mode = (len == 16) ? 
V16QImode : V32QImode; > > + int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode)); > > + int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode)); > > + > > + struct expand_vec_perm_d d; > > + d.target = gen_reg_rtx (cvt_mode); > > + d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), > > + in_mode); > > + d.op1 = d.op0; > > + d.vmode = cvt_mode; > > + d.nelt = len; > > + d.testing_p = false; > > + d.one_operand_p = true; > > + > > + /* Init perm. Put the needed bits of input in order and > > + fill the rest of bits by default. */ int tot = 0; for (int i > > + = 0; i < len; ++i) > > +{ > > + d.perm[i] = i; > > + if ((i % in_innersize) < out_innersize) > > + d.perm[tot++] = i; > > +} > > + > > + if (ix86_expand_vec_perm_const_1(&d)) > > +{ > > + emit_move_i
Re: [PATCH 3/3] vect: support direct conversion under x86-64-v3.
On Thu, May 23, 2024 at 2:38 PM Hu, Lin1 wrote: > > gcc/ChangeLog: > > PR 107432 > * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f): > New function for generate a series of suitable insn. > * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f): > Define new function. > * config/i386/sse.md: Extend trunc2 for x86-64-v3. I have some concerns about this patch because of https://gcc.gnu.org/bugzilla/show_bug.cgi?id=115069; let's hold off on this patch for now. > gcc/testsuite/ChangeLog: > > PR 107432 > * gcc.target/i386/pr107432-8.c: New test. > * gcc.target/i386/pr107432-9.c: Ditto. > * gcc.target/i386/pr92645-4.c: Modify test. > --- > gcc/config/i386/i386-expand.cc | 47 +++- > gcc/config/i386/i386-protos.h | 3 + > gcc/config/i386/sse.md | 87 +++ > gcc/testsuite/gcc.target/i386/pr107432-8.c | 73 + > gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 + > gcc/testsuite/gcc.target/i386/pr92645-4.c | 2 - > 6 files changed, 304 insertions(+), 29 deletions(-) > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c > > diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc > index 2f27bfb484c..bca8b85c9d1 100644 > --- a/gcc/config/i386/i386-expand.cc > +++ b/gcc/config/i386/i386-expand.cc > @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[]) >emit_insn (gen_xorv4si3 (value, value, large)); > } > > -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, > -machine_mode mode, rtx > target, > -rtx var, int one_var); > - > /* Convert an unsigned DImode value into a DFmode, using only SSE. > Expects the 64-bit DImode to be supplied in a pair of integral > registers. Requires SSE2; will use SSE3 if available. For x86_32, > @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, > machine_mode mode, > whose ONE_VAR element is VAR, and other elements are zero. Return true > if successful. 
*/ > > -static bool > +bool > ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, > rtx target, rtx var, int one_var) > { > @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val) >return ret; > } > > +/* Trunc a vector to a narrow vector, like v4di -> v4si. */ > + > +bool > +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) > +{ > + machine_mode out_mode = GET_MODE (output); > + machine_mode in_mode = GET_MODE (input); > + int len = GET_MODE_SIZE (in_mode); > + gcc_assert (len == 16 || len == 32); > + machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode; > + int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode)); > + int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode)); > + > + struct expand_vec_perm_d d; > + d.target = gen_reg_rtx (cvt_mode); > + d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), in_mode); > + d.op1 = d.op0; > + d.vmode = cvt_mode; > + d.nelt = len; > + d.testing_p = false; > + d.one_operand_p = true; > + > + /* Init perm. Put the needed bits of input in order and > + fill the rest of bits by default. 
*/ > + int tot = 0; > + for (int i = 0; i < len; ++i) > +{ > + d.perm[i] = i; > + if ((i % in_innersize) < out_innersize) > + d.perm[tot++] = i; > +} > + > + if (ix86_expand_vec_perm_const_1(&d)) > +{ > + emit_move_insn (output, gen_lowpart (out_mode, d.target)); > + return true; > +} > + > + return false; > +} > + > #include "gt-i386-expand.h" > diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h > index dbc861fb1ea..ac29fb34028 100644 > --- a/gcc/config/i386/i386-protos.h > +++ b/gcc/config/i386/i386-protos.h > @@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, > rtx, enum rtx_code, > extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx, > bool, rtx_code_label *); > extern rtx ix86_expand_fast_convert_bf_to_sf (rtx); > +extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx); > extern rtx ix86_memtag_untagged_pointer (rtx, rtx); > extern bool ix86_memtag_can_tag_addresses (void); > > @@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); > extern void ix86_expand_sse2_abs (rtx, rtx); > extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx, >rtx); > +extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, rtx, > +rtx, int); > extern bool ix86_extract_perm_from_pool_constant (int*, rtx); > > /* In i386-c.cc */ > diff --git a/g
[PATCH 3/3] vect: support direct conversion under x86-64-v3.
gcc/ChangeLog: PR 107432 * config/i386/i386-expand.cc (ix86_expand_trunc_with_avx2_noavx512f): New function for generate a series of suitable insn. * config/i386/i386-protos.h (ix86_expand_trunc_with_avx2_noavx512f): Define new function. * config/i386/sse.md: Extend trunc2 for x86-64-v3. gcc/testsuite/ChangeLog: PR 107432 * gcc.target/i386/pr107432-8.c: New test. * gcc.target/i386/pr107432-9.c: Ditto. * gcc.target/i386/pr92645-4.c: Modify test. --- gcc/config/i386/i386-expand.cc | 47 +++- gcc/config/i386/i386-protos.h | 3 + gcc/config/i386/sse.md | 87 +++ gcc/testsuite/gcc.target/i386/pr107432-8.c | 73 + gcc/testsuite/gcc.target/i386/pr107432-9.c | 121 + gcc/testsuite/gcc.target/i386/pr92645-4.c | 2 - 6 files changed, 304 insertions(+), 29 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-8.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-9.c diff --git a/gcc/config/i386/i386-expand.cc b/gcc/config/i386/i386-expand.cc index 2f27bfb484c..bca8b85c9d1 100644 --- a/gcc/config/i386/i386-expand.cc +++ b/gcc/config/i386/i386-expand.cc @@ -1896,10 +1896,6 @@ ix86_split_convert_uns_si_sse (rtx operands[]) emit_insn (gen_xorv4si3 (value, value, large)); } -static bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, -machine_mode mode, rtx target, -rtx var, int one_var); - /* Convert an unsigned DImode value into a DFmode, using only SSE. Expects the 64-bit DImode to be supplied in a pair of integral registers. Requires SSE2; will use SSE3 if available. For x86_32, @@ -16418,7 +16414,7 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, whose ONE_VAR element is VAR, and other elements are zero. Return true if successful. */ -static bool +bool ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, rtx target, rtx var, int one_var) { @@ -25551,4 +25547,45 @@ ix86_expand_fast_convert_bf_to_sf (rtx val) return ret; } +/* Trunc a vector to a narrow vector, like v4di -> v4si. 
*/ + +bool +ix86_expand_trunc_with_avx2_noavx512f (rtx output, rtx input) +{ + machine_mode out_mode = GET_MODE (output); + machine_mode in_mode = GET_MODE (input); + int len = GET_MODE_SIZE (in_mode); + gcc_assert (len == 16 || len == 32); + machine_mode cvt_mode = (len == 16) ? V16QImode : V32QImode; + int in_innersize = GET_MODE_SIZE (GET_MODE_INNER (in_mode)); + int out_innersize = GET_MODE_SIZE (GET_MODE_INNER (out_mode)); + + struct expand_vec_perm_d d; + d.target = gen_reg_rtx (cvt_mode); + d.op0 = lowpart_subreg (cvt_mode, force_reg (in_mode, input), in_mode); + d.op1 = d.op0; + d.vmode = cvt_mode; + d.nelt = len; + d.testing_p = false; + d.one_operand_p = true; + + /* Init perm. Put the needed bits of input in order and + fill the rest of bits by default. */ + int tot = 0; + for (int i = 0; i < len; ++i) +{ + d.perm[i] = i; + if ((i % in_innersize) < out_innersize) + d.perm[tot++] = i; +} + + if (ix86_expand_vec_perm_const_1(&d)) +{ + emit_move_insn (output, gen_lowpart (out_mode, d.target)); + return true; +} + + return false; +} + #include "gt-i386-expand.h" diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h index dbc861fb1ea..ac29fb34028 100644 --- a/gcc/config/i386/i386-protos.h +++ b/gcc/config/i386/i386-protos.h @@ -242,6 +242,7 @@ extern void ix86_expand_atomic_fetch_op_loop (rtx, rtx, rtx, enum rtx_code, extern void ix86_expand_cmpxchg_loop (rtx *, rtx, rtx, rtx, rtx, rtx, bool, rtx_code_label *); extern rtx ix86_expand_fast_convert_bf_to_sf (rtx); +extern bool ix86_expand_trunc_with_avx2_noavx512f (rtx, rtx); extern rtx ix86_memtag_untagged_pointer (rtx, rtx); extern bool ix86_memtag_can_tag_addresses (void); @@ -288,6 +289,8 @@ extern void ix86_expand_sse2_mulvxdi3 (rtx, rtx, rtx); extern void ix86_expand_sse2_abs (rtx, rtx); extern bool ix86_expand_vector_init_duplicate (bool, machine_mode, rtx, rtx); +extern bool ix86_expand_vector_init_one_nonzero (bool, machine_mode, rtx, +rtx, int); extern bool 
ix86_extract_perm_from_pool_constant (int*, rtx); /* In i386-c.cc */ diff --git a/gcc/config/i386/sse.md b/gcc/config/i386/sse.md index f57f36ae380..0b14b3dc1ac 100644 --- a/gcc/config/i386/sse.md +++ b/gcc/config/i386/sse.md @@ -14373,14 +14373,25 @@ (define_expand "avx512bw_v32hiv32qi2_mask_store" (define_mode_iterator PMOV_DST_MODE_2 [V4SI V8HI (V16QI "TARGET_AVX512BW")]) +(define_mode_iterator PMOV_DST_MODE_2_AVX2 + [V4SI V8HI V16QI]) (define_mode_attr pmov_suff_2