Hi!

On Wed, Aug 16, 2017 at 03:35:40PM -0500, Steven Munroe wrote:
> +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_add_ss (__m128 __A, __m128 __B)
> +{
> +#ifdef _ARCH_PWR7
> +  __m128 a, b, c;
> +  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
> +  /* PowerISA VSX does not allow partial (for just lower double)
> +   * results. So to insure we don't generate spurious exceptions
> +   * (from the upper double values) we splat the lower double
> +   * before we to the operation. */

No leading stars in comments please.

> +  a = vec_splat (__A, 0);
> +  b = vec_splat (__B, 0);
> +  c = a + b;
> +  /* Then we merge the lower float result with the original upper
> +   * float elements from __A. */
> +  return (vec_sel (__A, c, mask));
> +#else
> +  __A[0] = __A[0] + __B[0];
> +  return (__A);
> +#endif
> +}

It would be nice if we could just write the #else version and get the
more optimised code, but I guess we get something horrible going
through memory, instead?

> +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_rcp_ps (__m128 __A)
> +{
> +  __v4sf result;
> +
> +  __asm__(
> +      "xvresp %x0,%x1;\n"
> +      : "=v" (result)
> +      : "v" (__A)
> +      : );
> +
> +  return (result);
> +}

There is a builtin for this (__builtin_vec_re).
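Untested sketch of what I mean (vec_re from altivec.h resolves to that
builtin; the __v4sf cast is just illustrative):

  extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  _mm_rcp_ps (__m128 __A)
  {
    return (vec_re ((__v4sf) __A));
  }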
> +/* Convert the lower SPFP value to a 32-bit integer according to the current
> +   rounding mode. */
> +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cvtss_si32 (__m128 __A)
> +{
> +  __m64 res = 0;
> +#ifdef _ARCH_PWR8
> +  __m128 vtmp;
> +  __asm__(
> +      "xxsldwi %x1,%x2,%x2,3;\n"
> +      "xscvspdp %x1,%x1;\n"
> +      "fctiw %1,%1;\n"
> +      "mfvsrd %0,%x1;\n"
> +      : "=r" (res),
> +        "=&wi" (vtmp)
> +      : "wa" (__A)
> +      : );
> +#endif
> +  return (res);
> +}

Maybe it could do something better than return the wrong answer for
non-p8?

> +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))

> +#ifdef __LITTLE_ENDIAN__
> +  return result[1];
> +#elif __BIG_ENDIAN__
> +  return result [0];

Remove the extra space here?

> +_mm_max_pi16 (__m64 __A, __m64 __B)

> +  res.as_short[0] = (m1.as_short[0] > m2.as_short[0])? m1.as_short[0]: m2.as_short[0];
> +  res.as_short[1] = (m1.as_short[1] > m2.as_short[1])? m1.as_short[1]: m2.as_short[1];
> +  res.as_short[2] = (m1.as_short[2] > m2.as_short[2])? m1.as_short[2]: m2.as_short[2];
> +  res.as_short[3] = (m1.as_short[3] > m2.as_short[3])? m1.as_short[3]: m2.as_short[3];

Space before ? and : .

> +_mm_min_pi16 (__m64 __A, __m64 __B)

In this function, too.

> +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_m_pmovmskb (__m64 __A)
> +{
> +  return _mm_movemask_pi8 (__A);
> +}
> +/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
> +   in B and produce the high 16 bits of the 32-bit results. */
> +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))

Newline before the comment?

> +_mm_sad_pu8 (__m64 __A, __m64 __B)

> +  /* Sum four groups of bytes into integers. */
> +  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
> +  /* sum across four integers with integer result. */
> +  vsum = vec_sums (vsum, (__vector signed int) zero);
> +  /* the sum is in the right most 32-bits of the vector result.
> +  Transfer the a GPR and truncate to 16 bits. */

That last line has an indentation problem. Sentences should start with
a capital, in those last two comments.

> +/* Stores the data in A to the address P without polluting the caches. */
> +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_stream_pi (__m64 *__P, __m64 __A)
> +{
> +  /* Use the data cache block touch for store transient. */
> +  __asm__ (
> +    "  dcbtstt 0,%0;"

No need for the ";".

dcbtstt isn't really the same semantics but I don't think you can do
better.

The rest looks great. Please do something about the _mm_cvtss_si32,
have a look at the other comments, and it is ready to commit.


Thanks,
Segher
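P.S. For the non-p8 path of _mm_cvtss_si32, an untested thought:
__builtin_rint rounds according to the current rounding mode, so
something like

  #else
    res = __builtin_rint (__A[0]);
  #endif

should beat silently returning 0.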