Re: [PATCH v2 13/42] i386: Destructive vector helpers for AVX

2022-04-26 Thread Paolo Bonzini

On 4/25/22 00:01, Paul Brook wrote:

+#define SHUFFLE4(F, a, b, offset) do {  \
+r0 = a->F((order & 3) + offset);\
+r1 = a->F(((order >> 2) & 3) + offset); \
+r2 = b->F(((order >> 4) & 3) + offset); \
+r3 = b->F(((order >> 6) & 3) + offset); \
+d->F(offset) = r0;  \
+d->F(offset + 1) = r1;  \
+d->F(offset + 2) = r2;  \
+d->F(offset + 3) = r3;  \
+} while (0)
+
  #if SHIFT == 0
  void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
  {
-Reg r;
+uint16_t r0, r1, r2, r3;
  
-r.W(0) = s->W(order & 3);

-r.W(1) = s->W((order >> 2) & 3);
-r.W(2) = s->W((order >> 4) & 3);
-r.W(3) = s->W((order >> 6) & 3);
-MOVE(*d, r);
+SHUFFLE4(W, s, s, 0);


I am not particularly attached to the MOVE macro, but replacing the Reg 
variable with scalars seems worse.


Paolo



[PATCH v2 13/42] i386: Destructive vector helpers for AVX

2022-04-24 Thread Paul Brook
These helpers need to take special care to avoid overwriting source values
before the whole result has been calculated.  Currently they use a dummy
Reg-typed variable to store the result, then assign the whole register.
This will cause 128-bit operations to corrupt the upper half of the register,
so replace it with explicit temporaries and element assignments.

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h | 707 ++
 1 file changed, 437 insertions(+), 270 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index d0424140d9..c645d2ddbf 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -680,71 +680,85 @@ void glue(helper_movq_mm_T0, SUFFIX)(Reg *d, uint64_t val)
 }
 #endif
 
+#define SHUFFLE4(F, a, b, offset) do {  \
+r0 = a->F((order & 3) + offset);\
+r1 = a->F(((order >> 2) & 3) + offset); \
+r2 = b->F(((order >> 4) & 3) + offset); \
+r3 = b->F(((order >> 6) & 3) + offset); \
+d->F(offset) = r0;  \
+d->F(offset + 1) = r1;  \
+d->F(offset + 2) = r2;  \
+d->F(offset + 3) = r3;  \
+} while (0)
+
 #if SHIFT == 0
 void glue(helper_pshufw, SUFFIX)(Reg *d, Reg *s, int order)
 {
-Reg r;
+uint16_t r0, r1, r2, r3;
 
-r.W(0) = s->W(order & 3);
-r.W(1) = s->W((order >> 2) & 3);
-r.W(2) = s->W((order >> 4) & 3);
-r.W(3) = s->W((order >> 6) & 3);
-MOVE(*d, r);
+SHUFFLE4(W, s, s, 0);
 }
 #else
 void helper_shufps(Reg *d, Reg *s, int order)
 {
-Reg r;
+Reg *v = d;
+uint32_t r0, r1, r2, r3;
 
-r.L(0) = d->L(order & 3);
-r.L(1) = d->L((order >> 2) & 3);
-r.L(2) = s->L((order >> 4) & 3);
-r.L(3) = s->L((order >> 6) & 3);
-MOVE(*d, r);
+SHUFFLE4(L, v, s, 0);
+#if SHIFT == 2
+SHUFFLE4(L, v, s, 4);
+#endif
 }
 
 void helper_shufpd(Reg *d, Reg *s, int order)
 {
-Reg r;
+Reg *v = d;
+uint64_t r0, r1;
 
-r.Q(0) = d->Q(order & 1);
-r.Q(1) = s->Q((order >> 1) & 1);
-MOVE(*d, r);
+r0 = v->Q(order & 1);
+r1 = s->Q((order >> 1) & 1);
+d->Q(0) = r0;
+d->Q(1) = r1;
+#if SHIFT == 2
+r0 = v->Q(((order >> 2) & 1) + 2);
+r1 = s->Q(((order >> 3) & 1) + 2);
+d->Q(2) = r0;
+d->Q(3) = r1;
+#endif
 }
 
 void glue(helper_pshufd, SUFFIX)(Reg *d, Reg *s, int order)
 {
-Reg r;
+uint32_t r0, r1, r2, r3;
 
-r.L(0) = s->L(order & 3);
-r.L(1) = s->L((order >> 2) & 3);
-r.L(2) = s->L((order >> 4) & 3);
-r.L(3) = s->L((order >> 6) & 3);
-MOVE(*d, r);
+SHUFFLE4(L, s, s, 0);
+#if SHIFT ==  2
+SHUFFLE4(L, s, s, 4);
+#endif
 }
 
 void glue(helper_pshuflw, SUFFIX)(Reg *d, Reg *s, int order)
 {
-Reg r;
+uint16_t r0, r1, r2, r3;
 
-r.W(0) = s->W(order & 3);
-r.W(1) = s->W((order >> 2) & 3);
-r.W(2) = s->W((order >> 4) & 3);
-r.W(3) = s->W((order >> 6) & 3);
-r.Q(1) = s->Q(1);
-MOVE(*d, r);
+SHUFFLE4(W, s, s, 0);
+d->Q(1) = s->Q(1);
+#if SHIFT == 2
+SHUFFLE4(W, s, s, 8);
+d->Q(3) = s->Q(3);
+#endif
 }
 
 void glue(helper_pshufhw, SUFFIX)(Reg *d, Reg *s, int order)
 {
-Reg r;
+uint16_t r0, r1, r2, r3;
 
-r.Q(0) = s->Q(0);
-r.W(4) = s->W(4 + (order & 3));
-r.W(5) = s->W(4 + ((order >> 2) & 3));
-r.W(6) = s->W(4 + ((order >> 4) & 3));
-r.W(7) = s->W(4 + ((order >> 6) & 3));
-MOVE(*d, r);
+d->Q(0) = s->Q(0);
+SHUFFLE4(W, s, s, 4);
+#if SHIFT == 2
+d->Q(2) = s->Q(2);
+SHUFFLE4(W, s, s, 12);
+#endif
 }
 #endif
 
@@ -1320,156 +1334,190 @@ uint32_t glue(helper_pmovmskb, SUFFIX)(CPUX86State *env, Reg *s)
 return val;
 }
 
-void glue(helper_packsswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
-Reg r;
-
-r.B(0) = satsb((int16_t)d->W(0));
-r.B(1) = satsb((int16_t)d->W(1));
-r.B(2) = satsb((int16_t)d->W(2));
-r.B(3) = satsb((int16_t)d->W(3));
-#if SHIFT == 1
-r.B(4) = satsb((int16_t)d->W(4));
-r.B(5) = satsb((int16_t)d->W(5));
-r.B(6) = satsb((int16_t)d->W(6));
-r.B(7) = satsb((int16_t)d->W(7));
-#endif
-r.B((4 << SHIFT) + 0) = satsb((int16_t)s->W(0));
-r.B((4 << SHIFT) + 1) = satsb((int16_t)s->W(1));
-r.B((4 << SHIFT) + 2) = satsb((int16_t)s->W(2));
-r.B((4 << SHIFT) + 3) = satsb((int16_t)s->W(3));
-#if SHIFT == 1
-r.B(12) = satsb((int16_t)s->W(4));
-r.B(13) = satsb((int16_t)s->W(5));
-r.B(14) = satsb((int16_t)s->W(6));
-r.B(15) = satsb((int16_t)s->W(7));
-#endif
-MOVE(*d, r);
-}
-
-void glue(helper_packuswb, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
-{
-Reg r;
-
-r.B(0) = satub((int16_t)d->W(0));
-r.B(1) = satub((int16_t)d->W(1));
-r.B(2) = satub((int16_t)d->W(2));
-r.B(3) = satub((int16_t)d->W(3));
-#if SHIFT == 1
-r.B(4) = satub((int16_t)d->W(4));
-r.B(5) = satub((int16_t)d->W(5));
-r.B(6) = satub((int16_t)d->W(6));
-r.B(7) = satub((int16_t)d->W(7));
-#endif
-r.B((4 << SHIFT) + 0) = satub((int16_t)s->W(0));