Re: [PATCH v2 10/42] i386: Rewrite vector shift helper

2022-04-27 Thread Paolo Bonzini

On 4/25/22 23:33, Richard Henderson wrote:

I do not think it worthwhile to unroll these loops by hand.


Totally agree, as it would also remove most of the uses of 
XMM_ONLY/YMM_ONLY.


I also saw GCC -Warray-bounds complain about

if (SHIFT >= 1) {
d->elem[8] = s->elem[8];
}

though this should probably treated as a GCC bug.

Paolo


If we're that keen on it, it should be written

#pragma GCC unroll 4 << SHIFT
     for (i = 0; i < 4 << SHIFT; ++i) {
     something
     }





Re: [PATCH v2 10/42] i386: Rewrite vector shift helper

2022-04-25 Thread Richard Henderson

On 4/24/22 15:01, Paul Brook wrote:

Rewrite the vector shift helpers in preperation for AVX support (3 operand
form and 256 bit vectors).

For now keep the existing two operand interface.

No functional changes to existing helpers.

Signed-off-by: Paul Brook 
---
  target/i386/ops_sse.h | 250 ++
  1 file changed, 133 insertions(+), 117 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 23daab6b50..9297c96d04 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -63,199 +63,215 @@
  #define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE)
  #endif
  
-void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)

+#if SHIFT == 0
+#define SHIFT_HELPER_BODY(n, elem, F) do {  \
+d->elem(0) = F(s->elem(0), shift);  \
+if ((n) > 1) {  \
+d->elem(1) = F(s->elem(1), shift);  \
+}   \
+if ((n) > 2) {  \
+d->elem(2) = F(s->elem(2), shift);  \
+d->elem(3) = F(s->elem(3), shift);  \
+}   \
+if ((n) > 4) {  \
+d->elem(4) = F(s->elem(4), shift);  \
+d->elem(5) = F(s->elem(5), shift);  \
+d->elem(6) = F(s->elem(6), shift);  \
+d->elem(7) = F(s->elem(7), shift);  \
+}   \
+if ((n) > 8) {  \
+d->elem(8) = F(s->elem(8), shift);  \
+d->elem(9) = F(s->elem(9), shift);  \
+d->elem(10) = F(s->elem(10), shift);\
+d->elem(11) = F(s->elem(11), shift);\
+d->elem(12) = F(s->elem(12), shift);\
+d->elem(13) = F(s->elem(13), shift);\
+d->elem(14) = F(s->elem(14), shift);\
+d->elem(15) = F(s->elem(15), shift);\
+}   \
+} while (0)
+
+#define FPSRL(x, c) ((x) >> shift)
+#define FPSRAW(x, c) ((int16_t)(x) >> shift)
+#define FPSRAL(x, c) ((int32_t)(x) >> shift)
+#define FPSLL(x, c) ((x) << shift)
+#endif
+
+void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
  {
+Reg *s = d;
  int shift;
-
-if (s->Q(0) > 15) {
+if (c->Q(0) > 15) {
  d->Q(0) = 0;
-#if SHIFT == 1
-d->Q(1) = 0;
-#endif
+XMM_ONLY(d->Q(1) = 0;)
+YMM_ONLY(
+d->Q(2) = 0;
+d->Q(3) = 0;
+)
  } else {
-shift = s->B(0);
-d->W(0) >>= shift;
-d->W(1) >>= shift;
-d->W(2) >>= shift;
-d->W(3) >>= shift;
-#if SHIFT == 1
-d->W(4) >>= shift;
-d->W(5) >>= shift;
-d->W(6) >>= shift;
-d->W(7) >>= shift;
-#endif
+shift = c->B(0);
+SHIFT_HELPER_BODY(4 << SHIFT, W, FPSRL);
  }


I do not think it worthwhile to unroll these loops by hand.
If we're that keen on it, it should be written

#pragma GCC unroll 4 << SHIFT
for (i = 0; i < 4 << SHIFT; ++i) {
something
}

However, I would much rather you rework the users to use tcg_gen_gvec_3.  Note that you 
can't use tcg_gen_gvec_shls directly because of the shift-overflow-to-zero behaviour. 
There are examples in target/arm/translate.c, though of course the arm shift semantics are 
different, so it's not cut-and-paste.




r~



[PATCH v2 10/42] i386: Rewrite vector shift helper

2022-04-24 Thread Paul Brook
Rewrite the vector shift helpers in preperation for AVX support (3 operand
form and 256 bit vectors).

For now keep the existing two operand interface.

No functional changes to existing helpers.

Signed-off-by: Paul Brook 
---
 target/i386/ops_sse.h | 250 ++
 1 file changed, 133 insertions(+), 117 deletions(-)

diff --git a/target/i386/ops_sse.h b/target/i386/ops_sse.h
index 23daab6b50..9297c96d04 100644
--- a/target/i386/ops_sse.h
+++ b/target/i386/ops_sse.h
@@ -63,199 +63,215 @@
 #define MOVE(d, r) memcpy(&(d).B(0), &(r).B(0), SIZE)
 #endif
 
-void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+#if SHIFT == 0
+#define SHIFT_HELPER_BODY(n, elem, F) do {  \
+d->elem(0) = F(s->elem(0), shift);  \
+if ((n) > 1) {  \
+d->elem(1) = F(s->elem(1), shift);  \
+}   \
+if ((n) > 2) {  \
+d->elem(2) = F(s->elem(2), shift);  \
+d->elem(3) = F(s->elem(3), shift);  \
+}   \
+if ((n) > 4) {  \
+d->elem(4) = F(s->elem(4), shift);  \
+d->elem(5) = F(s->elem(5), shift);  \
+d->elem(6) = F(s->elem(6), shift);  \
+d->elem(7) = F(s->elem(7), shift);  \
+}   \
+if ((n) > 8) {  \
+d->elem(8) = F(s->elem(8), shift);  \
+d->elem(9) = F(s->elem(9), shift);  \
+d->elem(10) = F(s->elem(10), shift);\
+d->elem(11) = F(s->elem(11), shift);\
+d->elem(12) = F(s->elem(12), shift);\
+d->elem(13) = F(s->elem(13), shift);\
+d->elem(14) = F(s->elem(14), shift);\
+d->elem(15) = F(s->elem(15), shift);\
+}   \
+} while (0)
+
+#define FPSRL(x, c) ((x) >> shift)
+#define FPSRAW(x, c) ((int16_t)(x) >> shift)
+#define FPSRAL(x, c) ((int32_t)(x) >> shift)
+#define FPSLL(x, c) ((x) << shift)
+#endif
+
+void glue(helper_psrlw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
 {
+Reg *s = d;
 int shift;
-
-if (s->Q(0) > 15) {
+if (c->Q(0) > 15) {
 d->Q(0) = 0;
-#if SHIFT == 1
-d->Q(1) = 0;
-#endif
+XMM_ONLY(d->Q(1) = 0;)
+YMM_ONLY(
+d->Q(2) = 0;
+d->Q(3) = 0;
+)
 } else {
-shift = s->B(0);
-d->W(0) >>= shift;
-d->W(1) >>= shift;
-d->W(2) >>= shift;
-d->W(3) >>= shift;
-#if SHIFT == 1
-d->W(4) >>= shift;
-d->W(5) >>= shift;
-d->W(6) >>= shift;
-d->W(7) >>= shift;
-#endif
+shift = c->B(0);
+SHIFT_HELPER_BODY(4 << SHIFT, W, FPSRL);
 }
 }
 
-void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
 {
+Reg *s = d;
 int shift;
-
-if (s->Q(0) > 15) {
-shift = 15;
+if (c->Q(0) > 15) {
+d->Q(0) = 0;
+XMM_ONLY(d->Q(1) = 0;)
+YMM_ONLY(
+d->Q(2) = 0;
+d->Q(3) = 0;
+)
 } else {
-shift = s->B(0);
+shift = c->B(0);
+SHIFT_HELPER_BODY(4 << SHIFT, W, FPSLL);
 }
-d->W(0) = (int16_t)d->W(0) >> shift;
-d->W(1) = (int16_t)d->W(1) >> shift;
-d->W(2) = (int16_t)d->W(2) >> shift;
-d->W(3) = (int16_t)d->W(3) >> shift;
-#if SHIFT == 1
-d->W(4) = (int16_t)d->W(4) >> shift;
-d->W(5) = (int16_t)d->W(5) >> shift;
-d->W(6) = (int16_t)d->W(6) >> shift;
-d->W(7) = (int16_t)d->W(7) >> shift;
-#endif
 }
 
-void glue(helper_psllw, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_psraw, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
 {
+Reg *s = d;
 int shift;
-
-if (s->Q(0) > 15) {
-d->Q(0) = 0;
-#if SHIFT == 1
-d->Q(1) = 0;
-#endif
+if (c->Q(0) > 15) {
+shift = 15;
 } else {
-shift = s->B(0);
-d->W(0) <<= shift;
-d->W(1) <<= shift;
-d->W(2) <<= shift;
-d->W(3) <<= shift;
-#if SHIFT == 1
-d->W(4) <<= shift;
-d->W(5) <<= shift;
-d->W(6) <<= shift;
-d->W(7) <<= shift;
-#endif
+shift = c->B(0);
 }
+SHIFT_HELPER_BODY(4 << SHIFT, W, FPSRAW);
 }
 
-void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *s)
+void glue(helper_psrld, SUFFIX)(CPUX86State *env, Reg *d, Reg *c)
 {
+Reg *s = d;
 int shift;
-
-if (s->Q(0) > 31) {
+if (c->Q(0) > 31) {
 d->Q(0) = 0;
-#if SHIFT == 1
-d->Q(1) = 0;
-#endif
+XMM_ONLY(d->Q(1) = 0;)
+YMM_ONLY(
+d->Q(2) = 0;
+d->Q(3) = 0;
+)
 } else {
-shift = s->B(0);
-d->L(0) >>= shift;
-d->L(1) >>= shift;
-#if SHIFT == 1
-d->L(2) >>= shift;