Re: [PATCH v3 01/13] tcg/s390x: Use register pair allocation for div and mulu2

2022-12-06 Thread Ilya Leoshkevich
On Thu, Dec 01, 2022 at 10:51:48PM -0800, Richard Henderson wrote:
> Previously we hard-coded R2 and R3.
> 
> Signed-off-by: Richard Henderson 
> ---
>  tcg/s390x/tcg-target-con-set.h |  4 ++--
>  tcg/s390x/tcg-target-con-str.h |  8 +--
>  tcg/s390x/tcg-target.c.inc | 43 +-
>  3 files changed, 35 insertions(+), 20 deletions(-)

Reviewed-by: Ilya Leoshkevich 



[PATCH v3 01/13] tcg/s390x: Use register pair allocation for div and mulu2

2022-12-01 Thread Richard Henderson
Previously we hard-coded R2 and R3.

Signed-off-by: Richard Henderson 
---
 tcg/s390x/tcg-target-con-set.h |  4 ++--
 tcg/s390x/tcg-target-con-str.h |  8 +--
 tcg/s390x/tcg-target.c.inc | 43 +-
 3 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/tcg/s390x/tcg-target-con-set.h b/tcg/s390x/tcg-target-con-set.h
index 426dd92e51..00ba727b70 100644
--- a/tcg/s390x/tcg-target-con-set.h
+++ b/tcg/s390x/tcg-target-con-set.h
@@ -29,8 +29,8 @@ C_O1_I2(v, v, v)
 C_O1_I3(v, v, v, v)
 C_O1_I4(r, r, ri, r, 0)
 C_O1_I4(r, r, ri, rI, 0)
-C_O2_I2(b, a, 0, r)
-C_O2_I3(b, a, 0, 1, r)
+C_O2_I2(o, m, 0, r)
+C_O2_I3(o, m, 0, 1, r)
 C_O2_I4(r, r, 0, 1, rA, r)
 C_O2_I4(r, r, 0, 1, ri, r)
 C_O2_I4(r, r, 0, 1, r, r)
diff --git a/tcg/s390x/tcg-target-con-str.h b/tcg/s390x/tcg-target-con-str.h
index 8bb0358ae5..76446aecae 100644
--- a/tcg/s390x/tcg-target-con-str.h
+++ b/tcg/s390x/tcg-target-con-str.h
@@ -11,13 +11,7 @@
 REGS('r', ALL_GENERAL_REGS)
 REGS('L', ALL_GENERAL_REGS & ~SOFTMMU_RESERVE_REGS)
 REGS('v', ALL_VECTOR_REGS)
-/*
- * A (single) even/odd pair for division.
- * TODO: Add something to the register allocator to allow
- * this kind of regno+1 pairing to be done more generally.
- */
-REGS('a', 1u << TCG_REG_R2)
-REGS('b', 1u << TCG_REG_R3)
+REGS('o', 0xaaaa) /* odd numbered general regs */
 
 /*
  * Define constraint letters for constants:
diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
index b9ba7b605e..cb00bb6999 100644
--- a/tcg/s390x/tcg-target.c.inc
+++ b/tcg/s390x/tcg-target.c.inc
@@ -2264,10 +2264,18 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
 break;
 
 case INDEX_op_div2_i32:
-tcg_out_insn(s, RR, DR, TCG_REG_R2, args[4]);
+tcg_debug_assert(args[0] == args[2]);
+tcg_debug_assert(args[1] == args[3]);
+tcg_debug_assert((args[1] & 1) == 0);
+tcg_debug_assert(args[0] == args[1] + 1);
+tcg_out_insn(s, RR, DR, args[1], args[4]);
 break;
 case INDEX_op_divu2_i32:
-tcg_out_insn(s, RRE, DLR, TCG_REG_R2, args[4]);
+tcg_debug_assert(args[0] == args[2]);
+tcg_debug_assert(args[1] == args[3]);
+tcg_debug_assert((args[1] & 1) == 0);
+tcg_debug_assert(args[0] == args[1] + 1);
+tcg_out_insn(s, RRE, DLR, args[1], args[4]);
 break;
 
 case INDEX_op_shl_i32:
@@ -2521,17 +2529,30 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
 break;
 
 case INDEX_op_div2_i64:
-/* ??? We get an unnecessary sign-extension of the dividend
-   into R3 with this definition, but as we do in fact always
-   produce both quotient and remainder using INDEX_op_div_i64
-   instead requires jumping through even more hoops.  */
-tcg_out_insn(s, RRE, DSGR, TCG_REG_R2, args[4]);
+/*
+ * ??? We get an unnecessary sign-extension of the dividend
+ * into op0 with this definition, but as we do in fact always
+ * produce both quotient and remainder using INDEX_op_div_i64
+ * instead requires jumping through even more hoops.
+ */
+tcg_debug_assert(args[0] == args[2]);
+tcg_debug_assert(args[1] == args[3]);
+tcg_debug_assert((args[1] & 1) == 0);
+tcg_debug_assert(args[0] == args[1] + 1);
+tcg_out_insn(s, RRE, DSGR, args[1], args[4]);
 break;
 case INDEX_op_divu2_i64:
-tcg_out_insn(s, RRE, DLGR, TCG_REG_R2, args[4]);
+tcg_debug_assert(args[0] == args[2]);
+tcg_debug_assert(args[1] == args[3]);
+tcg_debug_assert((args[1] & 1) == 0);
+tcg_debug_assert(args[0] == args[1] + 1);
+tcg_out_insn(s, RRE, DLGR, args[1], args[4]);
 break;
 case INDEX_op_mulu2_i64:
-tcg_out_insn(s, RRE, MLGR, TCG_REG_R2, args[3]);
+tcg_debug_assert(args[0] == args[2]);
+tcg_debug_assert((args[1] & 1) == 0);
+tcg_debug_assert(args[0] == args[1] + 1);
+tcg_out_insn(s, RRE, MLGR, args[1], args[3]);
 break;
 
 case INDEX_op_shl_i64:
@@ -3226,10 +3247,10 @@ static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
 case INDEX_op_div2_i64:
 case INDEX_op_divu2_i32:
 case INDEX_op_divu2_i64:
-return C_O2_I3(b, a, 0, 1, r);
+return C_O2_I3(o, m, 0, 1, r);
 
 case INDEX_op_mulu2_i64:
-return C_O2_I2(b, a, 0, r);
+return C_O2_I2(o, m, 0, r);
 
 case INDEX_op_add2_i32:
 case INDEX_op_sub2_i32:
-- 
2.34.1