Hi!

On Wed, Aug 16, 2017 at 03:35:40PM -0500, Steven Munroe wrote:
> +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_add_ss (__m128 __A, __m128 __B)
> +{
> +#ifdef _ARCH_PWR7
> +  __m128 a, b, c;
> +  static const __vector unsigned int mask = {0xffffffff, 0, 0, 0};
> +  /* PowerISA VSX does not allow partial (for just lower double)
> +   * results. So to insure we don't generate spurious exceptions
> +   * (from the upper double values) we splat the lower double
> +   * before we to the operation. */

No leading stars in comments please.

> +  a = vec_splat (__A, 0);
> +  b = vec_splat (__B, 0);
> +  c = a + b;
> +  /* Then we merge the lower float result with the original upper
> +   * float elements from __A. */
> +  return (vec_sel (__A, c, mask));
> +#else
> +  __A[0] = __A[0] + __B[0];
> +  return (__A);
> +#endif
> +}

It would be nice if we could just write the #else version and get the
more optimised code, but I guess we get something horrible going
through memory, instead?

> +extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_rcp_ps (__m128 __A)
> +{
> +  __v4sf result;
> +
> +  __asm__(
> +      "xvresp %x0,%x1;\n"
> +      : "=v" (result)
> +      : "v" (__A)
> +      : );
> +
> +  return (result);
> +}

There is a builtin for this (__builtin_vec_re).
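Untested sketch of what I mean (vec_re from altivec.h resolves to that
builtin; the __v4sf cast is just illustrative):

  extern __inline __m128 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
  _mm_rcp_ps (__m128 __A)
  {
    return (vec_re ((__v4sf) __A));
  }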
> +/* Convert the lower SPFP value to a 32-bit integer according to the current
> +   rounding mode. */
> +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_cvtss_si32 (__m128 __A)
> +{
> +  __m64 res = 0;
> +#ifdef _ARCH_PWR8
> +  __m128 vtmp;
> +  __asm__(
> +      "xxsldwi %x1,%x2,%x2,3;\n"
> +      "xscvspdp %x1,%x1;\n"
> +      "fctiw %1,%1;\n"
> +      "mfvsrd %0,%x1;\n"
> +      : "=r" (res),
> +        "=&wi" (vtmp)
> +      : "wa" (__A)
> +      : );
> +#endif
> +  return (res);
> +}

Maybe it could do something better than return the wrong answer for
non-p8?

> +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))

> +#ifdef __LITTLE_ENDIAN__
> +  return result[1];
> +#elif __BIG_ENDIAN__
> +  return result [0];

Remove the extra space here?

> +_mm_max_pi16 (__m64 __A, __m64 __B)

> +  res.as_short[0] = (m1.as_short[0] > m2.as_short[0])? m1.as_short[0]: m2.as_short[0];
> +  res.as_short[1] = (m1.as_short[1] > m2.as_short[1])? m1.as_short[1]: m2.as_short[1];
> +  res.as_short[2] = (m1.as_short[2] > m2.as_short[2])? m1.as_short[2]: m2.as_short[2];
> +  res.as_short[3] = (m1.as_short[3] > m2.as_short[3])? m1.as_short[3]: m2.as_short[3];

Space before ? and : .

> +_mm_min_pi16 (__m64 __A, __m64 __B)

In this function, too.

> +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_m_pmovmskb (__m64 __A)
> +{
> +  return _mm_movemask_pi8 (__A);
> +}
> +/* Multiply four unsigned 16-bit values in A by four unsigned 16-bit values
> +   in B and produce the high 16 bits of the 32-bit results. */
> +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))

Newline before the comment?

> +_mm_sad_pu8 (__m64 __A, __m64 __B)

> +  /* Sum four groups of bytes into integers. */
> +  vsum = (__vector signed int) vec_sum4s (vabsdiff, zero);
> +  /* sum across four integers with integer result. */
> +  vsum = vec_sums (vsum, (__vector signed int) zero);
> +  /* the sum is in the right most 32-bits of the vector result.
> +  Transfer the a GPR and truncate to 16 bits. */

That last line has an indentation problem. Sentences should start with
a capital, in those last two comments.

> +/* Stores the data in A to the address P without polluting the caches. */
> +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
> +_mm_stream_pi (__m64 *__P, __m64 __A)
> +{
> +  /* Use the data cache block touch for store transient. */
> +  __asm__ (
> +    "  dcbtstt 0,%0;"

No need for the ";".

dcbtstt isn't really the same semantics but I don't think you can do
better.

The rest looks great. Please do something about the _mm_cvtss_si32,
have a look at the other comments, and it is ready to commit.


Thanks,
Segher
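P.S. For the non-p8 path of _mm_cvtss_si32, an untested thought:
__builtin_rint rounds according to the current rounding mode, so
something like

  #else
    res = __builtin_rint (__A[0]);
  #endif

should beat silently returning 0.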