On Thu, Dec 01, 2022 at 10:51:49PM -0800, Richard Henderson wrote:
> This reverts 829e1376d940 ("tcg/s390: Introduce TCG_REG_TB"), and
> several follow-up patches. The primary motivation is to reduce the
> less-tested code paths, pre-z10. Secondarily, this allows the
> unconditional use of TCG_TARGET_HAS_direct_jump, which might be more
> important for performance than any slight increase in code size.
>
> Signed-off-by: Richard Henderson <richard.hender...@linaro.org>
> ---
>  tcg/s390x/tcg-target.h     |   2 +-
>  tcg/s390x/tcg-target.c.inc | 176 +++++--------------------------------
>  2 files changed, 23 insertions(+), 155 deletions(-)
Reviewed-by: Ilya Leoshkevich <i...@linux.ibm.com>

I have a few questions/ideas for the future below.

> diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
> index 22d70d431b..645f522058 100644
> --- a/tcg/s390x/tcg-target.h
> +++ b/tcg/s390x/tcg-target.h
> @@ -103,7 +103,7 @@ extern uint64_t s390_facilities[3];
>  #define TCG_TARGET_HAS_mulsh_i32      0
>  #define TCG_TARGET_HAS_extrl_i64_i32  0
>  #define TCG_TARGET_HAS_extrh_i64_i32  0
> -#define TCG_TARGET_HAS_direct_jump    HAVE_FACILITY(GEN_INST_EXT)
> +#define TCG_TARGET_HAS_direct_jump    1

This change doesn't seem to affect that, but what is the minimum
supported s390x qemu host? z900?

>  #define TCG_TARGET_HAS_qemu_st8_i32   0
>
>  #define TCG_TARGET_HAS_div2_i64       1
> diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
> index cb00bb6999..8a4bec0a28 100644
> --- a/tcg/s390x/tcg-target.c.inc
> +++ b/tcg/s390x/tcg-target.c.inc
> @@ -65,12 +65,6 @@
>  /* A scratch register that may be be used throughout the backend.  */
>  #define TCG_TMP0        TCG_REG_R1
>
> -/* A scratch register that holds a pointer to the beginning of the TB.
> -   We don't need this when we have pc-relative loads with the general
> -   instructions extension facility.  */
> -#define TCG_REG_TB      TCG_REG_R12
> -#define USE_REG_TB      (!HAVE_FACILITY(GEN_INST_EXT))
> -
>  #ifndef CONFIG_SOFTMMU
>  #define TCG_GUEST_BASE_REG TCG_REG_R13
>  #endif
> @@ -813,8 +807,8 @@ static bool maybe_out_small_movi(TCGContext *s, TCGType type,
>  }
>
>  /* load a register with an immediate value */
> -static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
> -                             tcg_target_long sval, bool in_prologue)
> +static void tcg_out_movi(TCGContext *s, TCGType type,
> +                         TCGReg ret, tcg_target_long sval)
>  {
>      tcg_target_ulong uval;
>
> @@ -853,14 +847,6 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
>              tcg_out_insn(s, RIL, LARL, ret, off);
>              return;
>          }
> -    } else if (USE_REG_TB && !in_prologue) {
> -        ptrdiff_t off = tcg_tbrel_diff(s, (void *)sval);
> -        if (off == sextract64(off, 0, 20)) {
> -            /* This is certain to be an address within TB, and therefore
> -               OFF will be negative; don't try RX_LA.  */
> -            tcg_out_insn(s, RXY, LAY, ret, TCG_REG_TB, TCG_REG_NONE, off);
> -            return;
> -        }
>      }
>
>      /* A 32-bit unsigned value can be loaded in 2 insns.  And given
> @@ -876,10 +862,6 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
>      if (HAVE_FACILITY(GEN_INST_EXT)) {
>          tcg_out_insn(s, RIL, LGRL, ret, 0);
>          new_pool_label(s, sval, R_390_PC32DBL, s->code_ptr - 2, 2);
> -    } else if (USE_REG_TB && !in_prologue) {
> -        tcg_out_insn(s, RXY, LG, ret, TCG_REG_TB, TCG_REG_NONE, 0);
> -        new_pool_label(s, sval, R_390_20, s->code_ptr - 2,
> -                       tcg_tbrel_diff(s, NULL));
>      } else {
>          TCGReg base = ret ? ret : TCG_TMP0;
>          tcg_out_insn(s, RIL, LARL, base, 0);
> @@ -888,12 +870,6 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
>      }
>  }

I did some benchmarking of various ways to load constants in the context
of GCC in the past, and it turned out that LLIHF+OILF is more efficient
than a literal pool load [1].
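To make that concrete, such a fallback could look roughly like this
(untested sketch; the helper name is mine, but LLIHF and OILF should
already be in this backend's opcode table as RIL_LLIHF/RIL_OILF, and
both require the extended-immediate facility):

/* Load an arbitrary 64-bit constant without touching the literal pool:
   LLIHF clears the register and inserts the high 32 bits, then OILF
   ORs in the low 32 bits.  Two 6-byte insns and no memory access.  */
static void tcg_out_movi_llihf_oilf(TCGContext *s, TCGReg ret,
                                    tcg_target_long sval)
{
    tcg_target_ulong uval = sval;

    tcg_out_insn(s, RIL, LLIHF, ret, uval >> 32);
    tcg_out_insn(s, RIL, OILF, ret, uval);
}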
> -static void tcg_out_movi(TCGContext *s, TCGType type,
> -                         TCGReg ret, tcg_target_long sval)
> -{
> -    tcg_out_movi_int(s, type, ret, sval, false);
> -}
> -
>  /* Emit a load/store type instruction.  Inputs are:
>     DATA:     The register to be loaded or stored.
>     BASE+OFS: The effective address.
> @@ -1020,35 +996,6 @@ static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
>      return false;
>  }
>
> -/* load data from an absolute host address */
> -static void tcg_out_ld_abs(TCGContext *s, TCGType type,
> -                           TCGReg dest, const void *abs)
> -{
> -    intptr_t addr = (intptr_t)abs;
> -
> -    if (HAVE_FACILITY(GEN_INST_EXT) && !(addr & 1)) {
> -        ptrdiff_t disp = tcg_pcrel_diff(s, abs) >> 1;
> -        if (disp == (int32_t)disp) {
> -            if (type == TCG_TYPE_I32) {
> -                tcg_out_insn(s, RIL, LRL, dest, disp);
> -            } else {
> -                tcg_out_insn(s, RIL, LGRL, dest, disp);
> -            }
> -            return;
> -        }
> -    }
> -    if (USE_REG_TB) {
> -        ptrdiff_t disp = tcg_tbrel_diff(s, abs);
> -        if (disp == sextract64(disp, 0, 20)) {
> -            tcg_out_ld(s, type, dest, TCG_REG_TB, disp);
> -            return;
> -        }
> -    }
> -
> -    tcg_out_movi(s, TCG_TYPE_PTR, dest, addr & ~0xffff);
> -    tcg_out_ld(s, type, dest, dest, addr & 0xffff);
> -}
> -
>  static inline void tcg_out_risbg(TCGContext *s, TCGReg dest, TCGReg src,
>                                   int msb, int lsb, int ofs, int z)
>  {
> @@ -1243,17 +1190,7 @@ static void tgen_andi(TCGContext *s, TCGType type, TCGReg dest, uint64_t val)
>          return;
>      }
>
> -    /* Use the constant pool if USE_REG_TB, but not for small constants.  */
> -    if (USE_REG_TB) {
> -        if (!maybe_out_small_movi(s, type, TCG_TMP0, val)) {
> -            tcg_out_insn(s, RXY, NG, dest, TCG_REG_TB, TCG_REG_NONE, 0);
> -            new_pool_label(s, val & valid, R_390_20, s->code_ptr - 2,
> -                           tcg_tbrel_diff(s, NULL));
> -            return;
> -        }
> -    } else {
> -        tcg_out_movi(s, type, TCG_TMP0, val);
> -    }
> +    tcg_out_movi(s, type, TCG_TMP0, val);
>      if (type == TCG_TYPE_I32) {
>          tcg_out_insn(s, RR, NR, dest, TCG_TMP0);
>      } else {
> @@ -1297,24 +1234,11 @@ static void tgen_ori(TCGContext *s, TCGType type, TCGReg dest, uint64_t val)
>          }
>      }
>
> -    /* Use the constant pool if USE_REG_TB, but not for small constants.  */
> -    if (maybe_out_small_movi(s, type, TCG_TMP0, val)) {
> -        if (type == TCG_TYPE_I32) {
> -            tcg_out_insn(s, RR, OR, dest, TCG_TMP0);
> -        } else {
> -            tcg_out_insn(s, RRE, OGR, dest, TCG_TMP0);
> -        }
> -    } else if (USE_REG_TB) {
> -        tcg_out_insn(s, RXY, OG, dest, TCG_REG_TB, TCG_REG_NONE, 0);
> -        new_pool_label(s, val, R_390_20, s->code_ptr - 2,
> -                       tcg_tbrel_diff(s, NULL));
> +    tcg_out_movi(s, type, TCG_TMP0, val);
> +    if (type == TCG_TYPE_I32) {
> +        tcg_out_insn(s, RR, OR, dest, TCG_TMP0);
>      } else {
> -        /* Perform the OR via sequential modifications to the high and
> -           low parts.  Do this via recursion to handle 16-bit vs 32-bit
> -           masks in each half.  */
> -        tcg_debug_assert(HAVE_FACILITY(EXT_IMM));
> -        tgen_ori(s, type, dest, val & 0x00000000ffffffffull);
> -        tgen_ori(s, type, dest, val & 0xffffffff00000000ull);
> +        tcg_out_insn(s, RRE, OGR, dest, TCG_TMP0);
>      }
>  }
>
> @@ -1332,26 +1256,11 @@ static void tgen_xori(TCGContext *s, TCGType type, TCGReg dest, uint64_t val)
>          }
>      }
>
> -    /* Use the constant pool if USE_REG_TB, but not for small constants.  */
> -    if (maybe_out_small_movi(s, type, TCG_TMP0, val)) {
> -        if (type == TCG_TYPE_I32) {
> -            tcg_out_insn(s, RR, XR, dest, TCG_TMP0);
> -        } else {
> -            tcg_out_insn(s, RRE, XGR, dest, TCG_TMP0);
> -        }
> -    } else if (USE_REG_TB) {
> -        tcg_out_insn(s, RXY, XG, dest, TCG_REG_TB, TCG_REG_NONE, 0);
> -        new_pool_label(s, val, R_390_20, s->code_ptr - 2,
> -                       tcg_tbrel_diff(s, NULL));
> +    tcg_out_movi(s, type, TCG_TMP0, val);
> +    if (type == TCG_TYPE_I32) {
> +        tcg_out_insn(s, RR, XR, dest, TCG_TMP0);
>      } else {
> -        /* Perform the xor by parts.  */
> -        tcg_debug_assert(HAVE_FACILITY(EXT_IMM));
> -        if (val & 0xffffffff) {
> -            tcg_out_insn(s, RIL, XILF, dest, val);
> -        }
> -        if (val > 0xffffffff) {
> -            tcg_out_insn(s, RIL, XIHF, dest, val >> 32);
> -        }
> +        tcg_out_insn(s, RRE, XGR, dest, TCG_TMP0);
>      }
>  }

Wouldn't it be worth keeping XILF/XIHF here? I don't have any numbers
right now, but it looks more compact/efficient than a load + XGR. Same
for OGR above; I even wonder if both implementations could be unified
(rough sketch at the end of this mail).

> @@ -1395,19 +1304,6 @@ static int tgen_cmp(TCGContext *s, TCGType type, TCGCond c, TCGReg r1,
>      if (maybe_out_small_movi(s, type, TCG_TMP0, c2)) {
>          c2 = TCG_TMP0;
>          /* fall through to reg-reg */
> -    } else if (USE_REG_TB) {
> -        if (type == TCG_TYPE_I32) {
> -            op = (is_unsigned ? RXY_CLY : RXY_CY);
> -            tcg_out_insn_RXY(s, op, r1, TCG_REG_TB, TCG_REG_NONE, 0);
> -            new_pool_label(s, (uint32_t)c2, R_390_20, s->code_ptr - 2,
> -                           4 - tcg_tbrel_diff(s, NULL));
> -        } else {
> -            op = (is_unsigned ? RXY_CLG : RXY_CG);
> -            tcg_out_insn_RXY(s, op, r1, TCG_REG_TB, TCG_REG_NONE, 0);
> -            new_pool_label(s, c2, R_390_20, s->code_ptr - 2,
> -                           tcg_tbrel_diff(s, NULL));
> -        }
> -        goto exit;
>      } else {
>          if (type == TCG_TYPE_I32) {
>              op = (is_unsigned ? RIL_CLRL : RIL_CRL);
> @@ -2101,43 +1997,22 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>
>      case INDEX_op_goto_tb:
>          a0 = args[0];
> -        if (s->tb_jmp_insn_offset) {
> -            /*
> -             * branch displacement must be aligned for atomic patching;
> -             * see if we need to add extra nop before branch
> -             */
> -            if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) {
> -                tcg_out16(s, NOP);
> -            }
> -            tcg_debug_assert(!USE_REG_TB);
> -            tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4));
> -            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
> -            s->code_ptr += 2;
> -        } else {
> -            /* load address stored at s->tb_jmp_target_addr + a0 */
> -            tcg_out_ld_abs(s, TCG_TYPE_PTR, TCG_REG_TB,
> -                           tcg_splitwx_to_rx(s->tb_jmp_target_addr + a0));
> -            /* and go there */
> -            tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, TCG_REG_TB);
> +        tcg_debug_assert(s->tb_jmp_insn_offset);
> +        /*
> +         * branch displacement must be aligned for atomic patching;
> +         * see if we need to add extra nop before branch
> +         */
> +        if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) {
> +            tcg_out16(s, NOP);
>          }
> +        tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4));
> +        s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
> +        tcg_out32(s, 0);
>          set_jmp_reset_offset(s, a0);

This seems to work in practice, but I don't think patching branch
offsets is allowed by the PoP (Principles of Operation) in a
multi-threaded environment. For example, I had to do [2] in order to
work around this limitation in ftrace.
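For reference, the other half of this is tb_target_set_jmp_target() in
tcg-target.h, which does a single 4-byte qatomic_set() of the BRCL
displacement (quoting from memory, so details may be slightly off):

static inline void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
                                            uintptr_t jmp_rw, uintptr_t addr)
{
    /* patch the branch destination; BRCL's displacement is in halfwords */
    intptr_t disp = addr - (jmp_rx - 2);
    qatomic_set((int32_t *)jmp_rw, disp / 2);
    /* no need to flush icache explicitly */
}

The NOP padding emitted above guarantees that the displacement field is
4-byte aligned, so this store is atomic; the open question is only what
the PoP guarantees for a CPU that is concurrently executing the patched
code.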
> -
> -        /* For the unlinked path of goto_tb, we need to reset
> -           TCG_REG_TB to the beginning of this TB.  */
> -        if (USE_REG_TB) {
> -            int ofs = -tcg_current_code_size(s);
> -            /* All TB are restricted to 64KiB by unwind info.  */
> -            tcg_debug_assert(ofs == sextract64(ofs, 0, 20));
> -            tcg_out_insn(s, RXY, LAY, TCG_REG_TB,
> -                         TCG_REG_TB, TCG_REG_NONE, ofs);
> -        }
>          break;
>
>      case INDEX_op_goto_ptr:
>          a0 = args[0];
> -        if (USE_REG_TB) {
> -            tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB, a0);
> -        }
>          tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, a0);
>          break;
>
> @@ -3405,9 +3280,6 @@ static void tcg_target_init(TCGContext *s)
>      /* XXX many insns can't be used with R0, so we better avoid it for now */
>      tcg_regset_set_reg(s->reserved_regs, TCG_REG_R0);
>      tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
> -    if (USE_REG_TB) {
> -        tcg_regset_set_reg(s->reserved_regs, TCG_REG_TB);
> -    }
>  }

A third benefit seems to be that we now have one more register available
for allocation.

>
>  #define FRAME_SIZE  ((int)(TCG_TARGET_CALL_STACK_OFFSET          \
> @@ -3428,16 +3300,12 @@ static void tcg_target_qemu_prologue(TCGContext *s)
>
>  #ifndef CONFIG_SOFTMMU
>      if (guest_base >= 0x80000) {
> -        tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base, true);
> +        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base);
>          tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
>      }
>  #endif
>
>      tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
> -    if (USE_REG_TB) {
> -        tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB,
> -                    tcg_target_call_iarg_regs[1]);
> -    }
>
>      /* br %r3 (go to TB) */
>      tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, tcg_target_call_iarg_regs[1]);
> --
> 2.34.1

[1] https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=9776b4653bc4f8b568ea49fea4a7d7460e58b68a
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=de5012b41e5c900a8a3875c7a825394c5f624c05
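P.S. Here is the rough sketch of the OILF/XILF unification mentioned
above (untested; the helper name is mine, but the RIL_{OILF,OIHF,XILF,XIHF}
opcodes and the tcg_out_insn_RIL() emitter already exist in this backend):

/* Apply a 64-bit OR/XOR immediate via the two 32-bit register halves,
   avoiding a constant load plus OGR/XGR.  A half whose immediate is
   zero is skipped, since x OR 0 == x and x XOR 0 == x.  */
static void tgen_logicali(TCGContext *s, TCGReg dest, uint64_t val,
                          S390Opcode op_low, S390Opcode op_high)
{
    tcg_debug_assert(HAVE_FACILITY(EXT_IMM));
    if (val & 0xffffffffull) {
        tcg_out_insn_RIL(s, op_low, dest, val);
    }
    if (val >> 32) {
        tcg_out_insn_RIL(s, op_high, dest, val >> 32);
    }
}

tgen_ori() would then call it with RIL_OILF/RIL_OIHF, and tgen_xori()
with RIL_XILF/RIL_XIHF.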