[PATCH 3/4] rs6000: Add support for SSE4.1 "blend" intrinsics

2021-06-29 Thread Paul A. Clarke via Gcc-patches
_mm_blend_epi16 and _mm_blendv_epi8 were added earlier.
Add these four to complete the set.

2021-06-29  Paul A. Clarke  

gcc/ChangeLog:
* config/rs6000/smmintrin.h (_mm_blend_pd, _mm_blendv_pd,
_mm_blend_ps, _mm_blendv_ps): New.
---
 gcc/config/rs6000/smmintrin.h | 46 +++
 1 file changed, 46 insertions(+)

diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h
index 1b8cad135ed0..fa17a8b2f478 100644
--- a/gcc/config/rs6000/smmintrin.h
+++ b/gcc/config/rs6000/smmintrin.h
@@ -116,6 +116,52 @@ _mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask)
   return (__m128i) vec_sel ((__v16qu) __A, (__v16qu) __B, __lmask);
 }
 
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8)
+{
+  const signed char __tmp = (__imm8 & 0b10) * 0b01111000 |
+                            (__imm8 & 0b01) * 0b00001111;
+  __v16qi __charmask = vec_splats ((signed char) __tmp);
+  __charmask = vec_gb (__charmask);
+  __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);
+  #ifdef __BIG_ENDIAN__
+  __shortmask = vec_reve (__shortmask);
+  #endif
+  return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __shortmask);
+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask)
+{
+  const __v2di __zero = {0};
+  const vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, __zero);
+  return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8)
+{
+  const signed char __mask = (__imm8 & 0b1000) * 0b00011000 |
+                             (__imm8 & 0b0100) * 0b1100 |
+                             (__imm8 & 0b0010) * 0b0110 |
+                             (__imm8 & 0b0001) * 0b0011;
+  __v16qi __charmask = vec_splats ( __mask);
+  __charmask = vec_gb (__charmask);
+  __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);
+  #ifdef __BIG_ENDIAN__
+  __shortmask = vec_reve (__shortmask);
+  #endif
+  return (__m128) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask)
+{
+  const __v4si __zero = {0};
+  const vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero);
+  return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask);
+}
+
 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
 _mm_testz_si128 (__m128i __A, __m128i __B)
 {
-- 
2.27.0



Re: [PATCH 3/4] rs6000: Add support for SSE4.1 "blend" intrinsics

2021-07-11 Thread Bill Schmidt via Gcc-patches

Hi Paul,

On 6/29/21 1:08 PM, Paul A. Clarke via Gcc-patches wrote:

_mm_blend_epi16 and _mm_blendv_epi8 were added earlier.
Add these four to complete the set.

2021-06-29  Paul A. Clarke  

gcc/ChangeLog:
* config/rs6000/smmintrin.h (_mm_blend_pd, _mm_blendv_pd,
_mm_blend_ps, _mm_blendv_ps): New.
---
  gcc/config/rs6000/smmintrin.h | 46 +++
  1 file changed, 46 insertions(+)

diff --git a/gcc/config/rs6000/smmintrin.h b/gcc/config/rs6000/smmintrin.h
index 1b8cad135ed0..fa17a8b2f478 100644
--- a/gcc/config/rs6000/smmintrin.h
+++ b/gcc/config/rs6000/smmintrin.h
@@ -116,6 +116,52 @@ _mm_blendv_epi8 (__m128i __A, __m128i __B, __m128i __mask)
return (__m128i) vec_sel ((__v16qu) __A, (__v16qu) __B, __lmask);
  }

+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))

Usual line length complaint. :)  Here and below...

+_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8)
+{
+  const signed char __tmp = (__imm8 & 0b10) * 0b01111000 |
+   (__imm8 & 0b01) * 0b00001111;
+  __v16qi __charmask = vec_splats ((signed char) __tmp);
+  __charmask = vec_gb (__charmask);
+  __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);
+  #ifdef __BIG_ENDIAN__
+  __shortmask = vec_reve (__shortmask);
+  #endif
+  return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __shortmask);


This seems way too complex, and needs commentary to explain what you're 
doing.  Doesn't this instruction just translate into some form of 
xxpermdi?  Different ones for BE and LE, but still just xxpermdi, I think.



+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask)
+{
+  const __v2di __zero = {0};
+  const vector __bool long long __boolmask = vec_cmplt ((__v2di) __mask, 
__zero);
+  return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) __boolmask);
+}


Okay.


+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8)
+{
+  const signed char __mask = (__imm8 & 0b1000) * 0b00011000 |
+(__imm8 & 0b0100) * 0b1100 |
+(__imm8 & 0b0010) * 0b0110 |
+(__imm8 & 0b0001) * 0b0011;
+  __v16qi __charmask = vec_splats ( __mask);
+  __charmask = vec_gb (__charmask);
+  __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);


This is a good trick, but you need comments to explain what you're 
doing, including how you build __mask.  I recommend you include 
alternate code for P10, where you can just use vec_genwm to expand from 
__mask to a mask of word elements.
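
Roughly, the Power10 path could be as simple as the following (untested
sketch on my part; the exact bit-to-word-element mapping of vec_genwm
should be double-checked against the ISA before relying on it):

#ifdef _ARCH_PWR10
  /* Sketch: mtvsrwm (via vec_genwm) expands the low four bits of
     __imm8 into a mask of word elements, one all-ones word per set
     bit, which feeds vec_sel directly.  */
  __v4su __wordmask = vec_genwm (__imm8 & 0xf);
  return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, __wordmask);
#endif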


I don't understand how you're getting away with a v8hu mask for word 
elements.  This seems wrong to me.  Adequate testing?



+  #ifdef __BIG_ENDIAN__
+  __shortmask = vec_reve (__shortmask);
+  #endif
+  return (__m128) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
+_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask)
+{
+  const __v4si __zero = {0};
+  const vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, __zero);
+  return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) __boolmask);
+}
+


Okay.

Please have a look at the above issues and resubmit.  Thanks!
Bill


  extern __inline int __attribute__((__gnu_inline__, __always_inline__, 
__artificial__))
  _mm_testz_si128 (__m128i __A, __m128i __B)
  {


Re: [PATCH 3/4] rs6000: Add support for SSE4.1 "blend" intrinsics

2021-07-11 Thread Bill Schmidt via Gcc-patches

On 7/11/21 11:17 AM, Bill Schmidt wrote:

Hi Paul,

On 6/29/21 1:08 PM, Paul A. Clarke via Gcc-patches wrote:

_mm_blend_epi16 and _mm_blendv_epi8 were added earlier.
Add these four to complete the set.

2021-06-29  Paul A. Clarke  

gcc/ChangeLog:
* config/rs6000/smmintrin.h (_mm_blend_pd, _mm_blendv_pd,
_mm_blend_ps, _mm_blendv_ps): New.
---
  gcc/config/rs6000/smmintrin.h | 46 +++
  1 file changed, 46 insertions(+)

diff --git a/gcc/config/rs6000/smmintrin.h 
b/gcc/config/rs6000/smmintrin.h

index 1b8cad135ed0..fa17a8b2f478 100644
--- a/gcc/config/rs6000/smmintrin.h
+++ b/gcc/config/rs6000/smmintrin.h
@@ -116,6 +116,52 @@ _mm_blendv_epi8 (__m128i __A, __m128i __B, 
__m128i __mask)

    return (__m128i) vec_sel ((__v16qu) __A, (__v16qu) __B, __lmask);
  }

+extern __inline __m128d __attribute__((__gnu_inline__, 
__always_inline__, __artificial__))

Usual line length complaint. :)  Here and below...

+_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8)
+{
+  const signed char __tmp = (__imm8 & 0b10) * 0b01111000 |
+    (__imm8 & 0b01) * 0b00001111;
+  __v16qi __charmask = vec_splats ((signed char) __tmp);
+  __charmask = vec_gb (__charmask);
+  __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);
+  #ifdef __BIG_ENDIAN__
+  __shortmask = vec_reve (__shortmask);
+  #endif
+  return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) 
__shortmask);


This seems way too complex, and needs commentary to explain what 
you're doing.  Doesn't this instruction just translate into some form 
of xxpermdi?  Different ones for BE and LE, but still just xxpermdi, I 
think.



+}
+
+extern __inline __m128d __attribute__((__gnu_inline__, 
__always_inline__, __artificial__))

+_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask)
+{
+  const __v2di __zero = {0};
+  const vector __bool long long __boolmask = vec_cmplt ((__v2di) 
__mask, __zero);
+  return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du) 
__boolmask);

+}


Okay.


+
+extern __inline __m128 __attribute__((__gnu_inline__, 
__always_inline__, __artificial__))

+_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8)
+{
+  const signed char __mask = (__imm8 & 0b1000) * 0b00011000 |
+ (__imm8 & 0b0100) * 0b1100 |
+ (__imm8 & 0b0010) * 0b0110 |
+ (__imm8 & 0b0001) * 0b0011;
+  __v16qi __charmask = vec_splats ( __mask);
+  __charmask = vec_gb (__charmask);
+  __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);


This is a good trick, but you need comments to explain what you're 
doing, including how you build __mask.  I recommend you include 
alternate code for P10, where you can just use vec_genwm to expand 
from __mask to a mask of word elements.


I don't understand how you're getting away with a v8hu mask for word 
elements.  This seems wrong to me.  Adequate testing?


As an alternate approach, I suppose you could use vec_perm / vec_permr 
with one of sixteen possible masks, which would seem faster than the 
splat/gather/unpack/select approach.  Something to consider.


Bill




+  #ifdef __BIG_ENDIAN__
+  __shortmask = vec_reve (__shortmask);
+  #endif
+  return (__m128) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask);
+}
+
+extern __inline __m128 __attribute__((__gnu_inline__, 
__always_inline__, __artificial__))

+_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask)
+{
+  const __v4si __zero = {0};
+  const vector __bool int __boolmask = vec_cmplt ((__v4si) __mask, 
__zero);
+  return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su) 
__boolmask);

+}
+


Okay.

Please have a look at the above issues and resubmit.  Thanks!
Bill

  extern __inline int __attribute__((__gnu_inline__, 
__always_inline__, __artificial__))

  _mm_testz_si128 (__m128i __A, __m128i __B)
  {


Re: [PATCH 3/4] rs6000: Add support for SSE4.1 "blend" intrinsics

2021-07-12 Thread Paul A. Clarke via Gcc-patches
On Sun, Jul 11, 2021 at 11:29:24AM -0500, Bill Schmidt wrote:
> On 7/11/21 11:17 AM, Bill Schmidt wrote:
> > On 6/29/21 1:08 PM, Paul A. Clarke via Gcc-patches wrote:
> > > _mm_blend_epi16 and _mm_blendv_epi8 were added earlier.
> > > Add these four to complete the set.
> > > 
> > > 2021-06-29  Paul A. Clarke  
> > > 
> > > gcc/ChangeLog:
> > > * config/rs6000/smmintrin.h (_mm_blend_pd, _mm_blendv_pd,
> > > _mm_blend_ps, _mm_blendv_ps): New.
> > > ---
> > >   gcc/config/rs6000/smmintrin.h | 46 +++
> > >   1 file changed, 46 insertions(+)
> > > 
> > > diff --git a/gcc/config/rs6000/smmintrin.h
> > > b/gcc/config/rs6000/smmintrin.h
> > > index 1b8cad135ed0..fa17a8b2f478 100644
> > > --- a/gcc/config/rs6000/smmintrin.h
> > > +++ b/gcc/config/rs6000/smmintrin.h
> > > @@ -116,6 +116,52 @@ _mm_blendv_epi8 (__m128i __A, __m128i __B,
> > > __m128i __mask)
> > >     return (__m128i) vec_sel ((__v16qu) __A, (__v16qu) __B, __lmask);
> > >   }
> > > 
> > > +extern __inline __m128d __attribute__((__gnu_inline__,
> > > __always_inline__, __artificial__))
> > Usual line length complaint. :)  Here and below...
> > > +_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8)
> > > +{
> > > +  const signed char __tmp = (__imm8 & 0b10) * 0b01111000 |
> > > +    (__imm8 & 0b01) * 0b00001111;
> > > +  __v16qi __charmask = vec_splats ((signed char) __tmp);
> > > +  __charmask = vec_gb (__charmask);
> > > +  __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);
> > > +  #ifdef __BIG_ENDIAN__
> > > +  __shortmask = vec_reve (__shortmask);
> > > +  #endif
> > > +  return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du)
> > > __shortmask);
> > 
> > This seems way too complex, and needs commentary to explain what you're
> > doing.  Doesn't this instruction just translate into some form of
> > xxpermdi?  Different ones for BE and LE, but still just xxpermdi, I
> > think.

xxpermdi won't work because the operation here is different.
blend_pd is "for each element, select a value from A or B"
(more like a "select" operation than a permute, such that the result may be
entirely identical to an input), whereas xxpermdi always takes one value from
each input.

Your suggestion, below, to use vperm can also be used here.
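
Something along these lines, perhaps (just a sketch, not the respin):
pick each doubleword of the result from __A or __B with one of four
precomputed permute control vectors indexed by the 2-bit immediate.
If I remember the endian handling of vec_perm correctly, no BE/LE
special-casing is needed, but I'd double-check that on BE:

_mm_blend_pd (__m128d __A, __m128d __B, const int __imm8)
{
  /* Bit k of __imm8 set: take doubleword k (bytes 8k..8k+7) from __B
     (vec_perm indices 16-31); otherwise from __A (indices 0-15).  */
  __v16qu __pcv[] =
    {
      {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
      { 16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15 },
      {  0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31 },
      { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }
    };
  __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu) __B, __pcv[__imm8 & 0x3]);
  return (__m128d) __r;
}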

> > > +}
> > > +
> > > +extern __inline __m128d __attribute__((__gnu_inline__,
> > > __always_inline__, __artificial__))
> > > +_mm_blendv_pd (__m128d __A, __m128d __B, __m128d __mask)
> > > +{
> > > +  const __v2di __zero = {0};
> > > +  const vector __bool long long __boolmask = vec_cmplt ((__v2di)
> > > __mask, __zero);
> > > +  return (__m128d) vec_sel ((__v2du) __A, (__v2du) __B, (__v2du)
> > > __boolmask);
> > > +}
> > 
> > Okay.
> > 
> > > +
> > > +extern __inline __m128 __attribute__((__gnu_inline__,
> > > __always_inline__, __artificial__))
> > > +_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8)
> > > +{
> > > +  const signed char __mask = (__imm8 & 0b1000) * 0b00011000 |
> > > + (__imm8 & 0b0100) * 0b1100 |
> > > + (__imm8 & 0b0010) * 0b0110 |
> > > + (__imm8 & 0b0001) * 0b0011;
> > > +  __v16qi __charmask = vec_splats ( __mask);
> > > +  __charmask = vec_gb (__charmask);
> > > +  __v8hu __shortmask = (__v8hu) vec_unpackh (__charmask);
> > 
> > This is a good trick, but you need comments to explain what you're
> > doing, including how you build __mask.  I recommend you include
> > alternate code for P10, where you can just use vec_genwm to expand from
> > __mask to a mask of word elements.
> > 
> > I don't understand how you're getting away with a v8hu mask for word
> > elements.  This seems wrong to me.  Adequate testing?

chars unpack (sign-extend) to shorts, so each set bit in the char mask
becomes a run of 16 set bits. If you construct the char vector carefully,
those runs land exactly on the lanes being blended; in the end it's just a
128-bit select mask, and vec_sel doesn't care what element type it's typed as.
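
For example, for _mm_blend_ps with __imm8 = 0b0101, __mask comes out as
0b00110011 (two mask bits per float lane), and on LE the expansion goes
roughly:

  vec_splats:   0x33 in all 16 bytes
  vec_gb:       each bit of 0x33 fans out to a full byte within each
                doubleword:  ff ff 00 00 ff ff 00 00  (twice)
  vec_unpackh:  sign-extends 8 chars to 8 shorts:
                ffff ffff 0000 0000 ffff ffff 0000 0000

Viewed as 128 bits, that's all-ones over exactly word lanes 0 and 2,
which is what __imm8 = 0b0101 selects, so the blend is correct even
though the mask is typed as __v8hu.  (The vec_reve handles BE.)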

Regardless, I'll go with your alternate approach, below...

> As an alternate approach, I suppose you could use vec_perm / vec_permr with
> one of sixteen possible masks, which would seem faster than the
> splat/gather/unpack/select approach.  Something to consider.

vperm will work, and is a bit more straightforward, if just a bit more code.
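
Roughly this shape, I expect (sketch only; the sixteen table entries are
mechanical, with bit k of __imm8 pulling bytes 4k..4k+3 from __B, and the
same endianness caveat as the _pd sketch above):

_mm_blend_ps (__m128 __A, __m128 __B, const int __imm8)
{
  /* Bit k of __imm8 set: take word k (bytes 4k..4k+3) from __B
     (vec_perm indices 16-31); otherwise from __A (indices 0-15).  */
  __v16qu __pcv[] =
    {
      {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
      { 16, 17, 18, 19,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
      {  0,  1,  2,  3, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15 },
      { 16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 12, 13, 14, 15 },
      {  0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 12, 13, 14, 15 },
      { 16, 17, 18, 19,  4,  5,  6,  7, 24, 25, 26, 27, 12, 13, 14, 15 },
      {  0,  1,  2,  3, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
      { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 12, 13, 14, 15 },
      {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 28, 29, 30, 31 },
      { 16, 17, 18, 19,  4,  5,  6,  7,  8,  9, 10, 11, 28, 29, 30, 31 },
      {  0,  1,  2,  3, 20, 21, 22, 23,  8,  9, 10, 11, 28, 29, 30, 31 },
      { 16, 17, 18, 19, 20, 21, 22, 23,  8,  9, 10, 11, 28, 29, 30, 31 },
      {  0,  1,  2,  3,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31 },
      { 16, 17, 18, 19,  4,  5,  6,  7, 24, 25, 26, 27, 28, 29, 30, 31 },
      {  0,  1,  2,  3, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 },
      { 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 }
    };
  __v16qu __r = vec_perm ((__v16qu) __A, (__v16qu) __B, __pcv[__imm8 & 0xf]);
  return (__m128) __r;
}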

> > > +  #ifdef __BIG_ENDIAN__
> > > +  __shortmask = vec_reve (__shortmask);
> > > +  #endif
> > > +  return (__m128) vec_sel ((__v8hu) __A, (__v8hu) __B, __shortmask);
> > > +}
> > > +
> > > +extern __inline __m128 __attribute__((__gnu_inline__,
> > > __always_inline__, __artificial__))
> > > +_mm_blendv_ps (__m128 __A, __m128 __B, __m128 __mask)
> > > +{
> > > +  const __v4si __zero = {0};
> > > +  const vector __bool int __boolmask = vec_cmplt ((__v4si) __mask,
> > > __zero);
> > > +  return (__m128) vec_sel ((__v4su) __A, (__v4su) __B, (__v4su)
> > > __boolmask);
> > > +}
> > > +
> > 
> > Okay.
> > 
> > Please have a look at the above issues and resubmit.  Thanks!

Will do. Thanks for the reviews!

PC