Re: [PATCH] rs6000: Add Power10 optimization for most _mm_movemask*
On Fri, Jan 7, 2022 at 3:35 PM Paul A. Clarke wrote: > > On Fri, Jan 07, 2022 at 02:23:14PM -0500, David Edelsohn wrote: > > > Power10 ISA added `vextract*` instructions which are realized in the > > > `vec_extractm` intrinsic. > > > > > > Use `vec_extractm` for `_mm_movemask_ps`, `_mm_movemask_pd`, and > > > `_mm_movemask_epi8` compatibility intrinsics, when `_ARCH_PWR10`. > > > > > > 2021-10-21 Paul A. Clarke > > > > > > gcc > > > * config/rs6000/xmmintrin.h (_mm_movemask_ps): Use vec_extractm > > > when _ARCH_PWR10. > > > * config/rs6000/emmintrin.h (_mm_movemask_pd): Likewise. > > > (_mm_movemask_epi8): Likewise. > > > --- > > > Tested on Power10 powerpc64le-linux (compiled with and without > > > `-mcpu=power10`). > > > > > > OK for trunk? > > > > This is okay modulo > > > > > + return vec_extractm ((__v16qu) __A); > > > > Should the above be __v16qi like x86? > > That would match x86 better, but we don't have a function signature > for vec_extractm which accepts a signed type. Okay, never mind. I thought that vec_extractm also allowed signed. Thanks, David
Re: [PATCH] rs6000: Add Power10 optimization for most _mm_movemask*
On Fri, Jan 07, 2022 at 02:23:14PM -0500, David Edelsohn wrote: > > Power10 ISA added `vextract*` instructions which are realized in the > > `vec_extractm` intrinsic. > > > > Use `vec_extractm` for `_mm_movemask_ps`, `_mm_movemask_pd`, and > > `_mm_movemask_epi8` compatibility intrinsics, when `_ARCH_PWR10`. > > > > 2021-10-21 Paul A. Clarke > > > > gcc > > * config/rs6000/xmmintrin.h (_mm_movemask_ps): Use vec_extractm > > when _ARCH_PWR10. > > * config/rs6000/emmintrin.h (_mm_movemask_pd): Likewise. > > (_mm_movemask_epi8): Likewise. > > --- > > Tested on Power10 powerpc64le-linux (compiled with and without > > `-mcpu=power10`). > > > > OK for trunk? > > This is okay modulo > > > + return vec_extractm ((__v16qu) __A); > > Should the above be __v16qi like x86? That would match x86 better, but we don't have a function signature for vec_extractm which accepts a signed type. PC
Re: [PATCH] rs6000: Add Power10 optimization for most _mm_movemask*
> Power10 ISA added `vextract*` instructions which are realized in the > `vec_extractm` intrinsic. > > Use `vec_extractm` for `_mm_movemask_ps`, `_mm_movemask_pd`, and > `_mm_movemask_epi8` compatibility intrinsics, when `_ARCH_PWR10`. > > 2021-10-21 Paul A. Clarke > > gcc > * config/rs6000/xmmintrin.h (_mm_movemask_ps): Use vec_extractm > when _ARCH_PWR10. > * config/rs6000/emmintrin.h (_mm_movemask_pd): Likewise. > (_mm_movemask_epi8): Likewise. > --- > Tested on Power10 powerpc64le-linux (compiled with and without > `-mcpu=power10`). > > OK for trunk? This is okay modulo > + return vec_extractm ((__v16qu) __A); Should the above be __v16qi like x86? Thanks, David
Re: [PATCH] rs6000: Add Power10 optimization for most _mm_movemask*
On Thu, Oct 21, 2021 at 12:22:12PM -0500, Paul A. Clarke wrote: > Power10 ISA added `vextract*` instructions which are realized in the > `vec_extractm` intrinsic. > > Use `vec_extractm` for `_mm_movemask_ps`, `_mm_movemask_pd`, and > `_mm_movemask_epi8` compatibility intrinsics, when `_ARCH_PWR10`. > > 2021-10-21 Paul A. Clarke > > gcc > * config/rs6000/xmmintrin.h (_mm_movemask_ps): Use vec_extractm > when _ARCH_PWR10. > * config/rs6000/emmintrin.h (_mm_movemask_pd): Likewise. > (_mm_movemask_epi8): Likewise. Okay for trunk. Thanks! Segher
Re: [PING^2 PATCH] rs6000: Add Power10 optimization for most _mm_movemask*
On Mon, Nov 08, 2021 at 11:42:56AM -0600, Paul A. Clarke via Gcc-patches wrote: > Gentle ping... Gentle re-ping. > On Thu, Oct 21, 2021 at 12:22:12PM -0500, Paul A. Clarke via Gcc-patches > wrote: > > Power10 ISA added `vextract*` instructions which are realized in the > > `vec_extractm` instrinsic. > > > > Use `vec_extractm` for `_mm_movemask_ps`, `_mm_movemask_pd`, and > > `_mm_movemask_epi8` compatibility intrinsics, when `_ARCH_PWR10`. > > > > 2021-10-21 Paul A. Clarke > > > > gcc > > * config/rs6000/xmmintrin.h (_mm_movemask_ps): Use vec_extractm > > when _ARCH_PWR10. > > * config/rs6000/emmintrin.h (_mm_movemask_pd): Likewise. > > (_mm_movemask_epi8): Likewise. > > --- > > Tested on Power10 powerpc64le-linux (compiled with and without > > `-mcpu=power10`). > > > > OK for trunk? > > > > gcc/config/rs6000/emmintrin.h | 8 > > gcc/config/rs6000/xmmintrin.h | 4 > > 2 files changed, 12 insertions(+) > > > > diff --git a/gcc/config/rs6000/emmintrin.h b/gcc/config/rs6000/emmintrin.h > > index 32ad72b4cc35..ab16c13c379e 100644 > > --- a/gcc/config/rs6000/emmintrin.h > > +++ b/gcc/config/rs6000/emmintrin.h > > @@ -1233,6 +1233,9 @@ _mm_loadl_pd (__m128d __A, double const *__B) > > extern __inline int __attribute__((__gnu_inline__, __always_inline__, > > __artificial__)) > > _mm_movemask_pd (__m128d __A) > > { > > +#ifdef _ARCH_PWR10 > > + return vec_extractm ((__v2du) __A); > > +#else > >__vector unsigned long long result; > >static const __vector unsigned int perm_mask = > > { > > @@ -1252,6 +1255,7 @@ _mm_movemask_pd (__m128d __A) > > #else > >return result[0]; > > #endif > > +#endif /* !_ARCH_PWR10 */ > > } > > #endif /* _ARCH_PWR8 */ > > > > @@ -2030,6 +2034,9 @@ _mm_min_epu8 (__m128i __A, __m128i __B) > > extern __inline int __attribute__((__gnu_inline__, __always_inline__, > > __artificial__)) > > _mm_movemask_epi8 (__m128i __A) > > { > > +#ifdef _ARCH_PWR10 > > + return vec_extractm ((__v16qu) __A); > > +#else > >__vector unsigned long long result; > >static 
const __vector unsigned char perm_mask = > > { > > @@ -2046,6 +2053,7 @@ _mm_movemask_epi8 (__m128i __A) > > #else > >return result[0]; > > #endif > > +#endif /* !_ARCH_PWR10 */ > > } > > #endif /* _ARCH_PWR8 */ > > > > diff --git a/gcc/config/rs6000/xmmintrin.h b/gcc/config/rs6000/xmmintrin.h > > index ae1a33e8d95b..4c093fd1d5ae 100644 > > --- a/gcc/config/rs6000/xmmintrin.h > > +++ b/gcc/config/rs6000/xmmintrin.h > > @@ -1352,6 +1352,9 @@ _mm_storel_pi (__m64 *__P, __m128 __A) > > extern __inline int __attribute__((__gnu_inline__, __always_inline__, > > __artificial__)) > > _mm_movemask_ps (__m128 __A) > > { > > +#ifdef _ARCH_PWR10 > > + return vec_extractm ((vector unsigned int) __A); > > +#else > >__vector unsigned long long result; > >static const __vector unsigned int perm_mask = > > { > > @@ -1371,6 +1374,7 @@ _mm_movemask_ps (__m128 __A) > > #else > >return result[0]; > > #endif > > +#endif /* !_ARCH_PWR10 */ > > } > > #endif /* _ARCH_PWR8 */ > > > > -- > > 2.27.0 > >
[PING PATCH] rs6000: Add Power10 optimization for most _mm_movemask*
Gentle ping... On Thu, Oct 21, 2021 at 12:22:12PM -0500, Paul A. Clarke via Gcc-patches wrote: > Power10 ISA added `vextract*` instructions which are realized in the > `vec_extractm` instrinsic. > > Use `vec_extractm` for `_mm_movemask_ps`, `_mm_movemask_pd`, and > `_mm_movemask_epi8` compatibility intrinsics, when `_ARCH_PWR10`. > > 2021-10-21 Paul A. Clarke > > gcc > * config/rs6000/xmmintrin.h (_mm_movemask_ps): Use vec_extractm > when _ARCH_PWR10. > * config/rs6000/emmintrin.h (_mm_movemask_pd): Likewise. > (_mm_movemask_epi8): Likewise. > --- > Tested on Power10 powerpc64le-linux (compiled with and without > `-mcpu=power10`). > > OK for trunk? > > gcc/config/rs6000/emmintrin.h | 8 > gcc/config/rs6000/xmmintrin.h | 4 > 2 files changed, 12 insertions(+) > > diff --git a/gcc/config/rs6000/emmintrin.h b/gcc/config/rs6000/emmintrin.h > index 32ad72b4cc35..ab16c13c379e 100644 > --- a/gcc/config/rs6000/emmintrin.h > +++ b/gcc/config/rs6000/emmintrin.h > @@ -1233,6 +1233,9 @@ _mm_loadl_pd (__m128d __A, double const *__B) > extern __inline int __attribute__((__gnu_inline__, __always_inline__, > __artificial__)) > _mm_movemask_pd (__m128d __A) > { > +#ifdef _ARCH_PWR10 > + return vec_extractm ((__v2du) __A); > +#else >__vector unsigned long long result; >static const __vector unsigned int perm_mask = > { > @@ -1252,6 +1255,7 @@ _mm_movemask_pd (__m128d __A) > #else >return result[0]; > #endif > +#endif /* !_ARCH_PWR10 */ > } > #endif /* _ARCH_PWR8 */ > > @@ -2030,6 +2034,9 @@ _mm_min_epu8 (__m128i __A, __m128i __B) > extern __inline int __attribute__((__gnu_inline__, __always_inline__, > __artificial__)) > _mm_movemask_epi8 (__m128i __A) > { > +#ifdef _ARCH_PWR10 > + return vec_extractm ((__v16qu) __A); > +#else >__vector unsigned long long result; >static const __vector unsigned char perm_mask = > { > @@ -2046,6 +2053,7 @@ _mm_movemask_epi8 (__m128i __A) > #else >return result[0]; > #endif > +#endif /* !_ARCH_PWR10 */ > } > #endif /* _ARCH_PWR8 */ > > diff --git 
a/gcc/config/rs6000/xmmintrin.h b/gcc/config/rs6000/xmmintrin.h > index ae1a33e8d95b..4c093fd1d5ae 100644 > --- a/gcc/config/rs6000/xmmintrin.h > +++ b/gcc/config/rs6000/xmmintrin.h > @@ -1352,6 +1352,9 @@ _mm_storel_pi (__m64 *__P, __m128 __A) > extern __inline int __attribute__((__gnu_inline__, __always_inline__, > __artificial__)) > _mm_movemask_ps (__m128 __A) > { > +#ifdef _ARCH_PWR10 > + return vec_extractm ((vector unsigned int) __A); > +#else >__vector unsigned long long result; >static const __vector unsigned int perm_mask = > { > @@ -1371,6 +1374,7 @@ _mm_movemask_ps (__m128 __A) > #else >return result[0]; > #endif > +#endif /* !_ARCH_PWR10 */ > } > #endif /* _ARCH_PWR8 */ > > -- > 2.27.0 >
[PATCH] rs6000: Add Power10 optimization for most _mm_movemask*
Power10 ISA added `vextract*` instructions which are realized in the `vec_extractm` intrinsic. Use `vec_extractm` for `_mm_movemask_ps`, `_mm_movemask_pd`, and `_mm_movemask_epi8` compatibility intrinsics, when `_ARCH_PWR10`. 2021-10-21 Paul A. Clarke gcc * config/rs6000/xmmintrin.h (_mm_movemask_ps): Use vec_extractm when _ARCH_PWR10. * config/rs6000/emmintrin.h (_mm_movemask_pd): Likewise. (_mm_movemask_epi8): Likewise. --- Tested on Power10 powerpc64le-linux (compiled with and without `-mcpu=power10`). OK for trunk? gcc/config/rs6000/emmintrin.h | 8 gcc/config/rs6000/xmmintrin.h | 4 2 files changed, 12 insertions(+) diff --git a/gcc/config/rs6000/emmintrin.h b/gcc/config/rs6000/emmintrin.h index 32ad72b4cc35..ab16c13c379e 100644 --- a/gcc/config/rs6000/emmintrin.h +++ b/gcc/config/rs6000/emmintrin.h @@ -1233,6 +1233,9 @@ _mm_loadl_pd (__m128d __A, double const *__B) extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_pd (__m128d __A) { +#ifdef _ARCH_PWR10 + return vec_extractm ((__v2du) __A); +#else __vector unsigned long long result; static const __vector unsigned int perm_mask = { @@ -1252,6 +1255,7 @@ _mm_movemask_pd (__m128d __A) #else return result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ @@ -2030,6 +2034,9 @@ _mm_min_epu8 (__m128i __A, __m128i __B) extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_epi8 (__m128i __A) { +#ifdef _ARCH_PWR10 + return vec_extractm ((__v16qu) __A); +#else __vector unsigned long long result; static const __vector unsigned char perm_mask = { @@ -2046,6 +2053,7 @@ _mm_movemask_epi8 (__m128i __A) #else return result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ diff --git a/gcc/config/rs6000/xmmintrin.h b/gcc/config/rs6000/xmmintrin.h index ae1a33e8d95b..4c093fd1d5ae 100644 --- a/gcc/config/rs6000/xmmintrin.h +++ b/gcc/config/rs6000/xmmintrin.h @@ -1352,6 +1352,9 @@ _mm_storel_pi (__m64 *__P, 
__m128 __A) extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) _mm_movemask_ps (__m128 __A) { +#ifdef _ARCH_PWR10 + return vec_extractm ((vector unsigned int) __A); +#else __vector unsigned long long result; static const __vector unsigned int perm_mask = { @@ -1371,6 +1374,7 @@ _mm_movemask_ps (__m128 __A) #else return result[0]; #endif +#endif /* !_ARCH_PWR10 */ } #endif /* _ARCH_PWR8 */ -- 2.27.0