[PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
Hi, all This patch aims to optimize __builtin_convertvector. We want the function to generate more efficient insns in some situations, such as v2si -> v2di. The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu. OK for trunk? BRs, Lin gcc/ChangeLog: PR target/107432 * tree-vect-generic.cc (expand_vector_conversion): Support convert for int -> int, float -> float and int <-> float. (expand_vector_conversion_no_vec_pack): Check if can convert int <-> int, float <-> float and int <-> float, directly. Support indirect convert, when direct optab is not supported. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-1.c: New test. * gcc.target/i386/pr107432-2.c: Ditto. * gcc.target/i386/pr107432-3.c: Ditto. * gcc.target/i386/pr107432-4.c: Ditto. * gcc.target/i386/pr107432-5.c: Ditto. * gcc.target/i386/pr107432-6.c: Ditto. * gcc.target/i386/pr107432-7.c: Ditto. --- gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 + gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 + gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 + gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 + gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 +++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++ gcc/tree-vect-generic.cc | 107 +- 8 files changed, 918 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ 
+/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! ia32 } } } } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); +typedef char __v2qi __attribute__ ((__vector_size__ (2))); +typedef char __v4qi __attribute__ ((__vector_size__ (4))); +typedef char __v8qi __attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2si); +} + +__m128imm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); +} + +__m256imm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); +} + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); +} + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); +} + +__m128imm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + 
return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); +} + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); +} + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4qi); +} + +__v8qi mm512_cvtepi64_epi8_builtin_convertvector(__m512i a) +{ + return __builtin_convertvector((__v8di)a, __v8qi); +} + +__v2hi mm64_cvtepi32_epi16_builtin_convertvector(__v2si a) +{ + return __builtin_convertvector((__v2si)a, __v2hi); +} + +__v4hi mm_cvtepi32_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v4si)a, __v4hi); +} + +__m128imm256_cvtepi32_epi16_builtin_convertvector
RE: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
Do you have any advice? BRs, Lin -Original Message- From: Hu, Lin1 Sent: Wednesday, May 8, 2024 9:38 AM To: gcc-patches@gcc.gnu.org Cc: Liu, Hongtao ; ubiz...@gmail.com Subject: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float. Hi, all This patch aims to optimize __builtin_convertvector. We want the function can generate more efficient insn for some situations. Like v2si -> v2di. The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK for trunk? BRs, Lin gcc/ChangeLog: PR target/107432 * tree-vect-generic.cc (expand_vector_conversion): Support convert for int -> int, float -> float and int <-> float. (expand_vector_conversion_no_vec_pack): Check if can convert int <-> int, float <-> float and int <-> float, directly. Support indirect convert, when direct optab is not supported. gcc/testsuite/ChangeLog: PR target/107432 * gcc.target/i386/pr107432-1.c: New test. * gcc.target/i386/pr107432-2.c: Ditto. * gcc.target/i386/pr107432-3.c: Ditto. * gcc.target/i386/pr107432-4.c: Ditto. * gcc.target/i386/pr107432-5.c: Ditto. * gcc.target/i386/pr107432-6.c: Ditto. * gcc.target/i386/pr107432-7.c: Ditto. 
--- gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 + gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 + gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 + gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 + gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 +++ gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++ gcc/tree-vect-generic.cc | 107 +- 8 files changed, 918 insertions(+), 6 deletions(-) create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c b/gcc/testsuite/gcc.target/i386/pr107432-1.c new file mode 100644 index 000..a4f37447eb4 --- /dev/null +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c @@ -0,0 +1,234 @@ +/* { dg-do compile } */ +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } +} */ +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! 
ia32 } } +} } */ +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ + +#include + +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi +__attribute__ ((__vector_size__ (4))); typedef char __v8qi +__attribute__ ((__vector_size__ (8))); + +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); +typedef unsigned short __v4hu __attribute__ ((__vector_size__ (8))); +typedef unsigned char __v2qu __attribute__ ((__vector_size__ (2))); +typedef unsigned char __v4qu __attribute__ ((__vector_size__ (4))); +typedef unsigned char __v8qu __attribute__ ((__vector_size__ (8))); +typedef unsigned int __v2su __attribute__ ((__vector_size__ (8))); + +__v2si mm_cvtepi64_epi32_builtin_convertvector(__m128i a) { + return __builtin_convertvector((__v2di)a, __v2si); } + +__m128imm256_cvtepi64_epi32_builtin_convertvector(__m256i a) +{ + return (__m128i)__builtin_convertvector((__v4di)a, __v4si); } + +__m256imm512_cvtepi64_epi32_builtin_convertvector(__m512i a) +{ + return (__m256i)__builtin_convertvector((__v8di)a, __v8si); } + +__v2hi mm_cvtepi64_epi16_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2hi); } + +__v4hi mm256_cvtepi64_epi16_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v4di)a, __v4hi); } + +__m128imm512_cvtepi64_epi16_builtin_convertvector(__m512i a) +{ + return (__m128i)__builtin_convertvector((__v8di)a, __v8hi); } + +__v2qi mm_cvtepi64_epi8_builtin_convertvector(__m128i a) +{ + return __builtin_convertvector((__v2di)a, __v2qi); } + +__v4qi mm256_cvtepi64_epi8_builtin_convertvector(__m256i a) +{ + return __builtin_convertvector((__v
RE: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
On Tue, 14 May 2024, Hu, Lin1 wrote: > Do you have any advice? > > BRs, > Lin > > -Original Message- > From: Hu, Lin1 > Sent: Wednesday, May 8, 2024 9:38 AM > To: gcc-patches@gcc.gnu.org > Cc: Liu, Hongtao ; ubiz...@gmail.com > Subject: [PATCH] vect: generate suitable convert insn for int -> int, float > -> float and int <-> float. > > Hi, all > > This patch aims to optimize __builtin_convertvector. We want the function can > generate more efficient insn for some situations. Like v2si -> v2di. > > The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK for > trunk? I don't like the new code to be in a separate function, not integrated with the existing handling. Note the existing handling should get, say, V8DF -> V8SI correct for SSE by splitting the operation into smaller vectors but your code seems to just handle the cases the vectors are already properly sized. Without checking it seems you are basing the code on what the vectorizer does? Maybe we should have some common code that computes intermediate conversion steps supported by the HW unifying what for example supportable_widening_operation or supportable_narrowing_operation can do to also cover int <-> float conversions. That said, if you don't want to do that please still think about the core part of tree-vect-generic.cc which is breaking down large emulated vectors into small supported vectors. Richard. > BRs, > Lin > > gcc/ChangeLog: > > PR target/107432 > * tree-vect-generic.cc (expand_vector_conversion): Support > convert for int -> int, float -> float and int <-> float. > (expand_vector_conversion_no_vec_pack): Check if can convert > int <-> int, float <-> float and int <-> float, directly. > Support indirect convert, when direct optab is not supported. > > gcc/testsuite/ChangeLog: > > PR target/107432 > * gcc.target/i386/pr107432-1.c: New test. > * gcc.target/i386/pr107432-2.c: Ditto. > * gcc.target/i386/pr107432-3.c: Ditto. > * gcc.target/i386/pr107432-4.c: Ditto. 
> * gcc.target/i386/pr107432-5.c: Ditto. > * gcc.target/i386/pr107432-6.c: Ditto. > * gcc.target/i386/pr107432-7.c: Ditto. > --- > gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 + > gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 + > gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 + > gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 + > gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 +++ > gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 > gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++ > gcc/tree-vect-generic.cc | 107 +- > 8 files changed, 918 insertions(+), 6 deletions(-) create mode 100644 > gcc/testsuite/gcc.target/i386/pr107432-1.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c > b/gcc/testsuite/gcc.target/i386/pr107432-1.c > new file mode 100644 > index 000..a4f37447eb4 > --- /dev/null > +++ b/gcc/testsuite/gcc.target/i386/pr107432-1.c > @@ -0,0 +1,234 @@ > +/* { dg-do compile } */ > +/* { dg-options "-march=x86-64 -mavx512bw -mavx512vl -O3" } */ > +/* { dg-final { scan-assembler-times "vpmovqd" 6 } } */ > +/* { dg-final { scan-assembler-times "vpmovqw" 6 } } */ > +/* { dg-final { scan-assembler-times "vpmovqb" 6 } } */ > +/* { dg-final { scan-assembler-times "vpmovdw" 6 { target { ia32 } } } > +} */ > +/* { dg-final { scan-assembler-times "vpmovdw" 8 { target { ! ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vpmovdb" 6 { target { ia32 } } } > +} */ > +/* { dg-final { scan-assembler-times "vpmovdb" 8 { target { ! 
ia32 } } > +} } */ > +/* { dg-final { scan-assembler-times "vpmovwb" 8 } } */ > + > +#include > + > +typedef short __v2hi __attribute__ ((__vector_size__ (4))); typedef > +char __v2qi __attribute__ ((__vector_size__ (2))); typedef char __v4qi > +__attribute__ ((__vector_size__ (4))); typedef char __v8qi > +__attribute__ ((__vector_size__ (8))); > + > +typedef unsigned short __v2hu __attribute__ ((__vector_size__ (4))); > +typedef unsigned short __v4hu __attribute__ ((__v
RE: [PATCH] vect: generate suitable convert insn for int -> int, float -> float and int <-> float.
> -Original Message- > From: Richard Biener > Sent: Tuesday, May 14, 2024 8:23 PM > To: Hu, Lin1 > Cc: gcc-patches@gcc.gnu.org; Liu, Hongtao ; > ubiz...@gmail.com > Subject: RE: [PATCH] vect: generate suitable convert insn for int -> int, > float -> > float and int <-> float. > > On Tue, 14 May 2024, Hu, Lin1 wrote: > > > Do you have any advice? > > > > BRs, > > Lin > > > > -Original Message- > > From: Hu, Lin1 > > Sent: Wednesday, May 8, 2024 9:38 AM > > To: gcc-patches@gcc.gnu.org > > Cc: Liu, Hongtao ; ubiz...@gmail.com > > Subject: [PATCH] vect: generate suitable convert insn for int -> int, float > > -> > float and int <-> float. > > > > Hi, all > > > > This patch aims to optimize __builtin_convertvector. We want the function > can generate more efficient insn for some situations. Like v2si -> v2di. > > > > The patch has been bootstrapped and regtested on x86_64-pc-linux-gnu, OK > for trunk? > > I don't like the new code to be in a separate function, not integrated with > the > existing handling. Note the existing handling should get, say, V8DF -> V8SI > correct for SSE by splitting the operation into smaller vectors but your code > seems to just handle the cases the vectors are already properly sized. > Yes, my code only handles some cases, but others are handled by the core part of tree-vect-generic.cc. I just take care of some special cases up front. So, V8DF -> V8SI is still split into smaller vectors for SSE. And for SSE, I have another patch to expand the available direct optab environment with ix86_expand_vec_perm_const_1 (...). This patch hasn't been sent yet. I will send it out together with this patch once I have revised it, to give an overall view of my changes to this function. > > Without checking it seems you are basing the code on what the vectorizer does? 
> Maybe we should have some common code that computes intermediate > conversion steps supported by the HW unifying what for example > supportable_widening_operation or supportable_narrowing_operation can do > to also cover int <-> float conversions. > Yes, my code is based on vectorizable_conversion(...). I will consider splitting the function and defining some new functions, as you advise, to make my code more general. BRs, Lin > > That said, if you don't want to do that please still think about the core > part of > tree-vect-generic.cc which is breaking down large emulated vectors into small > supported vectors. > > Richard. > > > BRs, > > Lin > > > > gcc/ChangeLog: > > > > PR target/107432 > > * tree-vect-generic.cc (expand_vector_conversion): Support > > convert for int -> int, float -> float and int <-> float. > > (expand_vector_conversion_no_vec_pack): Check if can convert > > int <-> int, float <-> float and int <-> float, directly. > > Support indirect convert, when direct optab is not supported. > > > > gcc/testsuite/ChangeLog: > > > > PR target/107432 > > * gcc.target/i386/pr107432-1.c: New test. > > * gcc.target/i386/pr107432-2.c: Ditto. > > * gcc.target/i386/pr107432-3.c: Ditto. > > * gcc.target/i386/pr107432-4.c: Ditto. > > * gcc.target/i386/pr107432-5.c: Ditto. > > * gcc.target/i386/pr107432-6.c: Ditto. > > * gcc.target/i386/pr107432-7.c: Ditto. 
> > --- > > gcc/testsuite/gcc.target/i386/pr107432-1.c | 234 + > gcc/testsuite/gcc.target/i386/pr107432-2.c | 105 + > gcc/testsuite/gcc.target/i386/pr107432-3.c | 55 + > gcc/testsuite/gcc.target/i386/pr107432-4.c | 56 + > gcc/testsuite/gcc.target/i386/pr107432-5.c | 72 +++ > gcc/testsuite/gcc.target/i386/pr107432-6.c | 139 > gcc/testsuite/gcc.target/i386/pr107432-7.c | 156 ++ > > gcc/tree-vect-generic.cc | 107 +- > > 8 files changed, 918 insertions(+), 6 deletions(-) create mode > > 100644 gcc/testsuite/gcc.target/i386/pr107432-1.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-2.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-3.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-4.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-5.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-6.c > > create mode 100644 gcc/testsuite/gcc.target/i386/pr107432-7.c > > > > diff --git a/gcc/testsuite/gcc.target/i386/pr107432-1.c > > b/gcc/testsuite/gcc.target/i386/pr107432-1.c > > new file mode