On Thu, Dec 01, 2022 at 10:51:49PM -0800, Richard Henderson wrote:
> This reverts 829e1376d940 ("tcg/s390: Introduce TCG_REG_TB"), and
> several follow-up patches. The primary motivation is to reduce the
> less-tested code paths, pre-z10. Secondarily, this allows the
> unconditional use of TCG_TARGET_HAS_direct_jump, which might be more
> important for performance than any slight increase in code size.
>
> Signed-off-by: Richard Henderson <richard.hender...@linaro.org>
> ---
>  tcg/s390x/tcg-target.h     |   2 +-
>  tcg/s390x/tcg-target.c.inc | 176 +++++--------------------------------
>  2 files changed, 23 insertions(+), 155 deletions(-)
Reviewed-by: Ilya Leoshkevich <i...@linux.ibm.com>

I have a few questions/ideas for the future below.

> diff --git a/tcg/s390x/tcg-target.h b/tcg/s390x/tcg-target.h
> index 22d70d431b..645f522058 100644
> --- a/tcg/s390x/tcg-target.h
> +++ b/tcg/s390x/tcg-target.h
> @@ -103,7 +103,7 @@ extern uint64_t s390_facilities[3];
>  #define TCG_TARGET_HAS_mulsh_i32      0
>  #define TCG_TARGET_HAS_extrl_i64_i32  0
>  #define TCG_TARGET_HAS_extrh_i64_i32  0
> -#define TCG_TARGET_HAS_direct_jump    HAVE_FACILITY(GEN_INST_EXT)
> +#define TCG_TARGET_HAS_direct_jump    1

This change doesn't seem to affect that, but what is the minimum
supported s390x qemu host? z900?

>  #define TCG_TARGET_HAS_qemu_st8_i32   0
>
>  #define TCG_TARGET_HAS_div2_i64       1
> diff --git a/tcg/s390x/tcg-target.c.inc b/tcg/s390x/tcg-target.c.inc
> index cb00bb6999..8a4bec0a28 100644
> --- a/tcg/s390x/tcg-target.c.inc
> +++ b/tcg/s390x/tcg-target.c.inc
> @@ -65,12 +65,6 @@
>  /* A scratch register that may be be used throughout the backend.  */
>  #define TCG_TMP0        TCG_REG_R1
>
> -/* A scratch register that holds a pointer to the beginning of the TB.
> -   We don't need this when we have pc-relative loads with the general
> -   instructions extension facility.  */
> -#define TCG_REG_TB      TCG_REG_R12
> -#define USE_REG_TB      (!HAVE_FACILITY(GEN_INST_EXT))
> -
>  #ifndef CONFIG_SOFTMMU
>  #define TCG_GUEST_BASE_REG TCG_REG_R13
>  #endif
> @@ -813,8 +807,8 @@ static bool maybe_out_small_movi(TCGContext *s, TCGType type,
>  }
>
>  /* load a register with an immediate value */
> -static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
> -                             tcg_target_long sval, bool in_prologue)
> +static void tcg_out_movi(TCGContext *s, TCGType type,
> +                         TCGReg ret, tcg_target_long sval)
>  {
>      tcg_target_ulong uval;
>
> @@ -853,14 +847,6 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
>              tcg_out_insn(s, RIL, LARL, ret, off);
>              return;
>          }
> -    } else if (USE_REG_TB && !in_prologue) {
> -        ptrdiff_t off = tcg_tbrel_diff(s, (void *)sval);
> -        if (off == sextract64(off, 0, 20)) {
> -            /* This is certain to be an address within TB, and therefore
> -               OFF will be negative; don't try RX_LA.  */
> -            tcg_out_insn(s, RXY, LAY, ret, TCG_REG_TB, TCG_REG_NONE, off);
> -            return;
> -        }
>      }
>
>      /* A 32-bit unsigned value can be loaded in 2 insns.  And given
> @@ -876,10 +862,6 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
>      if (HAVE_FACILITY(GEN_INST_EXT)) {
>          tcg_out_insn(s, RIL, LGRL, ret, 0);
>          new_pool_label(s, sval, R_390_PC32DBL, s->code_ptr - 2, 2);
> -    } else if (USE_REG_TB && !in_prologue) {
> -        tcg_out_insn(s, RXY, LG, ret, TCG_REG_TB, TCG_REG_NONE, 0);
> -        new_pool_label(s, sval, R_390_20, s->code_ptr - 2,
> -                       tcg_tbrel_diff(s, NULL));
>      } else {
>          TCGReg base = ret ? ret : TCG_TMP0;
>          tcg_out_insn(s, RIL, LARL, base, 0);
> @@ -888,12 +870,6 @@ static void tcg_out_movi_int(TCGContext *s, TCGType type, TCGReg ret,
>      }
>  }

I did some benchmarking of various ways to load constants in the context
of GCC in the past, and it turned out that LLIHF+OILF is more efficient
than a literal pool load [1].
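To make that concrete, such a fallback could look roughly like this
(untested sketch; the helper name is mine, but LLIHF and OILF should
already be in this backend's opcode table as RIL_LLIHF/RIL_OILF, and
both require the extended-immediate facility):

/* Load an arbitrary 64-bit constant without touching the literal pool:
   LLIHF clears the register and inserts the high 32 bits, then OILF
   ORs in the low 32 bits.  Two 6-byte insns and no memory access.  */
static void tcg_out_movi_llihf_oilf(TCGContext *s, TCGReg ret,
                                    tcg_target_long sval)
{
    tcg_target_ulong uval = sval;

    tcg_out_insn(s, RIL, LLIHF, ret, uval >> 32);
    tcg_out_insn(s, RIL, OILF, ret, uval);
}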
> -static void tcg_out_movi(TCGContext *s, TCGType type,
> -                         TCGReg ret, tcg_target_long sval)
> -{
> -    tcg_out_movi_int(s, type, ret, sval, false);
> -}
> -
>  /* Emit a load/store type instruction.  Inputs are:
>     DATA:     The register to be loaded or stored.
>     BASE+OFS: The effective address.
> @@ -1020,35 +996,6 @@ static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
>      return false;
>  }
>
> -/* load data from an absolute host address */
> -static void tcg_out_ld_abs(TCGContext *s, TCGType type,
> -                           TCGReg dest, const void *abs)
> -{
> -    intptr_t addr = (intptr_t)abs;
> -
> -    if (HAVE_FACILITY(GEN_INST_EXT) && !(addr & 1)) {
> -        ptrdiff_t disp = tcg_pcrel_diff(s, abs) >> 1;
> -        if (disp == (int32_t)disp) {
> -            if (type == TCG_TYPE_I32) {
> -                tcg_out_insn(s, RIL, LRL, dest, disp);
> -            } else {
> -                tcg_out_insn(s, RIL, LGRL, dest, disp);
> -            }
> -            return;
> -        }
> -    }
> -    if (USE_REG_TB) {
> -        ptrdiff_t disp = tcg_tbrel_diff(s, abs);
> -        if (disp == sextract64(disp, 0, 20)) {
> -            tcg_out_ld(s, type, dest, TCG_REG_TB, disp);
> -            return;
> -        }
> -    }
> -
> -    tcg_out_movi(s, TCG_TYPE_PTR, dest, addr & ~0xffff);
> -    tcg_out_ld(s, type, dest, dest, addr & 0xffff);
> -}
> -
>  static inline void tcg_out_risbg(TCGContext *s, TCGReg dest, TCGReg src,
>                                   int msb, int lsb, int ofs, int z)
>  {
> @@ -1243,17 +1190,7 @@ static void tgen_andi(TCGContext *s, TCGType type, TCGReg dest, uint64_t val)
>          return;
>      }
>
> -    /* Use the constant pool if USE_REG_TB, but not for small constants.  */
> -    if (USE_REG_TB) {
> -        if (!maybe_out_small_movi(s, type, TCG_TMP0, val)) {
> -            tcg_out_insn(s, RXY, NG, dest, TCG_REG_TB, TCG_REG_NONE, 0);
> -            new_pool_label(s, val & valid, R_390_20, s->code_ptr - 2,
> -                           tcg_tbrel_diff(s, NULL));
> -            return;
> -        }
> -    } else {
> -        tcg_out_movi(s, type, TCG_TMP0, val);
> -    }
> +    tcg_out_movi(s, type, TCG_TMP0, val);
>      if (type == TCG_TYPE_I32) {
>          tcg_out_insn(s, RR, NR, dest, TCG_TMP0);
>      } else {
> @@ -1297,24 +1234,11 @@ static void tgen_ori(TCGContext *s, TCGType type, TCGReg dest, uint64_t val)
>          }
>      }
>
> -    /* Use the constant pool if USE_REG_TB, but not for small constants.  */
> -    if (maybe_out_small_movi(s, type, TCG_TMP0, val)) {
> -        if (type == TCG_TYPE_I32) {
> -            tcg_out_insn(s, RR, OR, dest, TCG_TMP0);
> -        } else {
> -            tcg_out_insn(s, RRE, OGR, dest, TCG_TMP0);
> -        }
> -    } else if (USE_REG_TB) {
> -        tcg_out_insn(s, RXY, OG, dest, TCG_REG_TB, TCG_REG_NONE, 0);
> -        new_pool_label(s, val, R_390_20, s->code_ptr - 2,
> -                       tcg_tbrel_diff(s, NULL));
> +    tcg_out_movi(s, type, TCG_TMP0, val);
> +    if (type == TCG_TYPE_I32) {
> +        tcg_out_insn(s, RR, OR, dest, TCG_TMP0);
>      } else {
> -        /* Perform the OR via sequential modifications to the high and
> -           low parts.  Do this via recursion to handle 16-bit vs 32-bit
> -           masks in each half.  */
> -        tcg_debug_assert(HAVE_FACILITY(EXT_IMM));
> -        tgen_ori(s, type, dest, val & 0x00000000ffffffffull);
> -        tgen_ori(s, type, dest, val & 0xffffffff00000000ull);
> +        tcg_out_insn(s, RRE, OGR, dest, TCG_TMP0);
>      }
>  }
>
> @@ -1332,26 +1256,11 @@ static void tgen_xori(TCGContext *s, TCGType type, TCGReg dest, uint64_t val)
>          }
>      }
>
> -    /* Use the constant pool if USE_REG_TB, but not for small constants.  */
> -    if (maybe_out_small_movi(s, type, TCG_TMP0, val)) {
> -        if (type == TCG_TYPE_I32) {
> -            tcg_out_insn(s, RR, XR, dest, TCG_TMP0);
> -        } else {
> -            tcg_out_insn(s, RRE, XGR, dest, TCG_TMP0);
> -        }
> -    } else if (USE_REG_TB) {
> -        tcg_out_insn(s, RXY, XG, dest, TCG_REG_TB, TCG_REG_NONE, 0);
> -        new_pool_label(s, val, R_390_20, s->code_ptr - 2,
> -                       tcg_tbrel_diff(s, NULL));
> +    tcg_out_movi(s, type, TCG_TMP0, val);
> +    if (type == TCG_TYPE_I32) {
> +        tcg_out_insn(s, RR, XR, dest, TCG_TMP0);
>      } else {
> -        /* Perform the xor by parts.  */
> -        tcg_debug_assert(HAVE_FACILITY(EXT_IMM));
> -        if (val & 0xffffffff) {
> -            tcg_out_insn(s, RIL, XILF, dest, val);
> -        }
> -        if (val > 0xffffffff) {
> -            tcg_out_insn(s, RIL, XIHF, dest, val >> 32);
> -        }
> +        tcg_out_insn(s, RRE, XGR, dest, TCG_TMP0);
>      }
>  }

Wouldn't it be worth keeping XILF/XIHF here? I don't have any numbers
right now, but it looks more compact/efficient than a load + XGR. Same
for OGR above; I even wonder if both implementations could be unified
(rough sketch at the end of this mail).

> @@ -1395,19 +1304,6 @@ static int tgen_cmp(TCGContext *s, TCGType type, TCGCond c, TCGReg r1,
>      if (maybe_out_small_movi(s, type, TCG_TMP0, c2)) {
>          c2 = TCG_TMP0;
>          /* fall through to reg-reg */
> -    } else if (USE_REG_TB) {
> -        if (type == TCG_TYPE_I32) {
> -            op = (is_unsigned ? RXY_CLY : RXY_CY);
> -            tcg_out_insn_RXY(s, op, r1, TCG_REG_TB, TCG_REG_NONE, 0);
> -            new_pool_label(s, (uint32_t)c2, R_390_20, s->code_ptr - 2,
> -                           4 - tcg_tbrel_diff(s, NULL));
> -        } else {
> -            op = (is_unsigned ? RXY_CLG : RXY_CG);
> -            tcg_out_insn_RXY(s, op, r1, TCG_REG_TB, TCG_REG_NONE, 0);
> -            new_pool_label(s, c2, R_390_20, s->code_ptr - 2,
> -                           tcg_tbrel_diff(s, NULL));
> -        }
> -        goto exit;
>      } else {
>          if (type == TCG_TYPE_I32) {
>              op = (is_unsigned ? RIL_CLRL : RIL_CRL);
> @@ -2101,43 +1997,22 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
>
>      case INDEX_op_goto_tb:
>          a0 = args[0];
> -        if (s->tb_jmp_insn_offset) {
> -            /*
> -             * branch displacement must be aligned for atomic patching;
> -             * see if we need to add extra nop before branch
> -             */
> -            if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) {
> -                tcg_out16(s, NOP);
> -            }
> -            tcg_debug_assert(!USE_REG_TB);
> -            tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4));
> -            s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
> -            s->code_ptr += 2;
> -        } else {
> -            /* load address stored at s->tb_jmp_target_addr + a0 */
> -            tcg_out_ld_abs(s, TCG_TYPE_PTR, TCG_REG_TB,
> -                           tcg_splitwx_to_rx(s->tb_jmp_target_addr + a0));
> -            /* and go there */
> -            tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, TCG_REG_TB);
> +        tcg_debug_assert(s->tb_jmp_insn_offset);
> +        /*
> +         * branch displacement must be aligned for atomic patching;
> +         * see if we need to add extra nop before branch
> +         */
> +        if (!QEMU_PTR_IS_ALIGNED(s->code_ptr + 1, 4)) {
> +            tcg_out16(s, NOP);
>          }
> +        tcg_out16(s, RIL_BRCL | (S390_CC_ALWAYS << 4));
> +        s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
> +        tcg_out32(s, 0);
>          set_jmp_reset_offset(s, a0);

This seems to work in practice, but I don't think patching branch
offsets is allowed by the PoP (Principles of Operation) in a
multi-threaded environment. For example, I had to do [2] in order to
work around this limitation in ftrace.
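For reference, the other half of this is tb_target_set_jmp_target() in
tcg-target.h, which does a single 4-byte qatomic_set() of the BRCL
displacement (quoting from memory, so details may be slightly off):

static inline void tb_target_set_jmp_target(uintptr_t tc_ptr, uintptr_t jmp_rx,
                                            uintptr_t jmp_rw, uintptr_t addr)
{
    /* patch the branch destination; BRCL's displacement is in halfwords */
    intptr_t disp = addr - (jmp_rx - 2);
    qatomic_set((int32_t *)jmp_rw, disp / 2);
    /* no need to flush icache explicitly */
}

The NOP padding emitted above guarantees that the displacement field is
4-byte aligned, so this store is atomic; the open question is only what
the PoP guarantees for a CPU that is concurrently executing the patched
code.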
> -
> -        /* For the unlinked path of goto_tb, we need to reset
> -           TCG_REG_TB to the beginning of this TB.  */
> -        if (USE_REG_TB) {
> -            int ofs = -tcg_current_code_size(s);
> -            /* All TB are restricted to 64KiB by unwind info.  */
> -            tcg_debug_assert(ofs == sextract64(ofs, 0, 20));
> -            tcg_out_insn(s, RXY, LAY, TCG_REG_TB,
> -                         TCG_REG_TB, TCG_REG_NONE, ofs);
> -        }
>          break;
>
>      case INDEX_op_goto_ptr:
>          a0 = args[0];
> -        if (USE_REG_TB) {
> -            tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB, a0);
> -        }
>          tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, a0);
>          break;
>
> @@ -3405,9 +3280,6 @@ static void tcg_target_init(TCGContext *s)
>      /* XXX many insns can't be used with R0, so we better avoid it for now */
>      tcg_regset_set_reg(s->reserved_regs, TCG_REG_R0);
>      tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
> -    if (USE_REG_TB) {
> -        tcg_regset_set_reg(s->reserved_regs, TCG_REG_TB);
> -    }
>  }

A third benefit seems to be that we now have one more register available
for allocation.

>
>  #define FRAME_SIZE  ((int)(TCG_TARGET_CALL_STACK_OFFSET          \
> @@ -3428,16 +3300,12 @@ static void tcg_target_qemu_prologue(TCGContext *s)
>
>  #ifndef CONFIG_SOFTMMU
>      if (guest_base >= 0x80000) {
> -        tcg_out_movi_int(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base, true);
> +        tcg_out_movi(s, TCG_TYPE_PTR, TCG_GUEST_BASE_REG, guest_base);
>          tcg_regset_set_reg(s->reserved_regs, TCG_GUEST_BASE_REG);
>      }
>  #endif
>
>      tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
> -    if (USE_REG_TB) {
> -        tcg_out_mov(s, TCG_TYPE_PTR, TCG_REG_TB,
> -                    tcg_target_call_iarg_regs[1]);
> -    }
>
>      /* br %r3 (go to TB) */
>      tcg_out_insn(s, RR, BCR, S390_CC_ALWAYS, tcg_target_call_iarg_regs[1]);
> --
> 2.34.1

[1] https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=9776b4653bc4f8b568ea49fea4a7d7460e58b68a
[2] https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=de5012b41e5c900a8a3875c7a825394c5f624c05
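P.S. Here is the rough sketch of the OILF/XILF unification mentioned
above (untested; the helper name is mine, but the RIL_{OILF,OIHF,XILF,XIHF}
opcodes and the tcg_out_insn_RIL() emitter already exist in this backend):

/* Apply a 64-bit OR/XOR immediate via the two 32-bit register halves,
   avoiding a constant load plus OGR/XGR.  A half whose immediate is
   zero is skipped, since x OR 0 == x and x XOR 0 == x.  */
static void tgen_logicali(TCGContext *s, TCGReg dest, uint64_t val,
                          S390Opcode op_low, S390Opcode op_high)
{
    tcg_debug_assert(HAVE_FACILITY(EXT_IMM));
    if (val & 0xffffffffull) {
        tcg_out_insn_RIL(s, op_low, dest, val);
    }
    if (val >> 32) {
        tcg_out_insn_RIL(s, op_high, dest, val >> 32);
    }
}

tgen_ori() would then call it with RIL_OILF/RIL_OIHF, and tgen_xori()
with RIL_XILF/RIL_XIHF.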