On Tue, Apr 23, 2013 at 01:46:51PM -0700, Richard Henderson wrote: > Move the slow path out of line, as the TODO's mention. > This allows the fast path to be unconditional, which can > speed up the fast path as well, depending on the core. > > Signed-off-by: Richard Henderson <r...@twiddle.net> > --- > configure | 2 +- > include/exec/exec-all.h | 17 +++ > tcg/arm/tcg-target.c | 309 > +++++++++++++++++++++++++++++++----------------- > 3 files changed, 220 insertions(+), 108 deletions(-) > > diff --git a/configure b/configure > index 51a6c56..ececfe2 100755 > --- a/configure > +++ b/configure > @@ -3616,7 +3616,7 @@ echo "libs_softmmu=$libs_softmmu" >> $config_host_mak > echo "ARCH=$ARCH" >> $config_host_mak > > case "$cpu" in > - i386|x86_64|ppc) > + arm|i386|x86_64|ppc) > # The TCG interpreter currently does not support ld/st optimization. > if test "$tcg_interpreter" = "no" ; then > echo "CONFIG_QEMU_LDST_OPTIMIZATION=y" >> $config_host_mak > diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h > index e856191..6362074 100644 > --- a/include/exec/exec-all.h > +++ b/include/exec/exec-all.h > @@ -338,6 +338,23 @@ extern uintptr_t tci_tb_ptr; > # elif defined (_ARCH_PPC) && !defined (_ARCH_PPC64) > # define GETRA() ((uintptr_t)__builtin_return_address(0)) > # define GETPC_LDST() ((uintptr_t) ((*(int32_t *)(GETRA() - 4)) - 1)) > +# elif defined(__arm__) > +/* We define two insns between the return address and the branch back to > + straight-line. Find and decode that branch insn. 
*/ > +# define GETRA() ((uintptr_t)__builtin_return_address(0)) > +# define GETPC_LDST() tcg_getpc_ldst(GETRA()) > +static inline uintptr_t tcg_getpc_ldst(uintptr_t ra) > +{ > + int32_t b; > + ra += 8; /* skip the two insns */ > + b = *(int32_t *)ra; /* load the branch insn */ > + b = (b << 8) >> (8 - 2); /* extract the displacement */ > + ra += 8; /* branches are relative to pc+8 */ > + ra += b; /* apply the displacement */ > + ra -= 4; /* return a pointer into the current opcode, > + not the start of the next opcode */ > + return ra; > +} > # else > # error "CONFIG_QEMU_LDST_OPTIMIZATION needs GETPC_LDST() implementation!" > # endif > diff --git a/tcg/arm/tcg-target.c b/tcg/arm/tcg-target.c > index eb697f2..d6afa2f 100644 > --- a/tcg/arm/tcg-target.c > +++ b/tcg/arm/tcg-target.c > @@ -419,6 +419,20 @@ static inline void tcg_out_dat_reg(TCGContext *s, > (rn << 16) | (rd << 12) | shift | rm); > } > > +static inline void tcg_out_nop(TCGContext *s) > +{ > + if (use_armv7_instructions) { > + /* Architected nop introduced in v6k. */ > + /* ??? This is an MSR (imm) 0,0,0 insn. Anyone know if this > + also Just So Happened to do nothing on pre-v6k so that we > + don't need to conditionalize it? */ > + tcg_out32(s, 0xe320f000); > + } else { > + /* Prior to that the assembler uses mov r0, r0. */ > + tcg_out_dat_reg(s, COND_AL, ARITH_MOV, 0, 0, 0, SHIFT_IMM_LSL(0)); > + } > +} > + > static inline void tcg_out_mov_reg(TCGContext *s, int cond, int rd, int rm) > { > /* Simple reg-reg move, optimising out the 'do nothing' case */ > @@ -1200,6 +1214,134 @@ static void tcg_out_tlb_read(TCGContext *s, TCGReg > addrlo, TCGReg addrhi, > TCG_REG_R1, addrhi, SHIFT_IMM_LSL(0)); > } > } > + > +/* Record the context of a call to the out of line helper code for the slow > + path for a load or store, so that we can later generate the correct > + helper code. 
*/ > +static void add_qemu_ldst_label(TCGContext *s, int is_ld, int opc, > + int data_reg, int data_reg2, int addrlo_reg, > + int addrhi_reg, int mem_index, > + uint8_t *raddr, uint8_t *label_ptr) > +{ > + int idx; > + TCGLabelQemuLdst *label; > + > + if (s->nb_qemu_ldst_labels >= TCG_MAX_QEMU_LDST) { > + tcg_abort(); > + } > + > + idx = s->nb_qemu_ldst_labels++; > + label = (TCGLabelQemuLdst *)&s->qemu_ldst_labels[idx]; > + label->is_ld = is_ld; > + label->opc = opc; > + label->datalo_reg = data_reg; > + label->datahi_reg = data_reg2; > + label->addrlo_reg = addrlo_reg; > + label->addrhi_reg = addrhi_reg; > + label->mem_index = mem_index; > + label->raddr = raddr; > + label->label_ptr[0] = label_ptr; > +} > + > +static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) > +{ > + TCGReg argreg, data_reg, data_reg2; > + uint8_t *start; > + > + reloc_pc24(lb->label_ptr[0], (tcg_target_long)s->code_ptr); > + > + argreg = tcg_out_arg_reg32(s, TCG_REG_R0, TCG_AREG0); > + if (TARGET_LONG_BITS == 64) { > + argreg = tcg_out_arg_reg64(s, argreg, lb->addrlo_reg, > lb->addrhi_reg); > + } else { > + argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg); > + } > + argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index); > + tcg_out_call(s, (tcg_target_long) qemu_ld_helpers[lb->opc & 3]); > + > + data_reg = lb->datalo_reg; > + data_reg2 = lb->datahi_reg; > + > + start = s->code_ptr; > + switch (lb->opc) { > + case 0 | 4: > + tcg_out_ext8s(s, COND_AL, data_reg, TCG_REG_R0); > + break; > + case 1 | 4: > + tcg_out_ext16s(s, COND_AL, data_reg, TCG_REG_R0); > + break; > + case 0: > + case 1: > + case 2: > + default: > + tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0); > + break; > + case 3: > + tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0); > + tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1); > + break; > + } > + > + /* For GETPC_LDST in exec-all.h, we architect exactly 2 insns between > + the call and the branch back to straight-line code. 
Note that the > + moves above could be elided by register allocation, and we do not know > + which code alternative we chose for extension. */ > + switch (s->code_ptr - start) { > + case 0: > + tcg_out_nop(s); > + /* FALLTHRU */ > + case 4: > + tcg_out_nop(s); > + /* FALLTHRU */ > + case 8: > + break; > + default: > + abort(); > + } > + > + tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr); > +} > + > +static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb) > +{ > + TCGReg argreg, data_reg, data_reg2; > + > + reloc_pc24(lb->label_ptr[0], (tcg_target_long)s->code_ptr); > + > + argreg = TCG_REG_R0; > + argreg = tcg_out_arg_reg32(s, argreg, TCG_AREG0); > + if (TARGET_LONG_BITS == 64) { > + argreg = tcg_out_arg_reg64(s, argreg, lb->addrlo_reg, > lb->addrhi_reg); > + } else { > + argreg = tcg_out_arg_reg32(s, argreg, lb->addrlo_reg); > + } > + > + data_reg = lb->datalo_reg; > + data_reg2 = lb->datahi_reg; > + switch (lb->opc) { > + case 0: > + argreg = tcg_out_arg_reg8(s, argreg, data_reg); > + break; > + case 1: > + argreg = tcg_out_arg_reg16(s, argreg, data_reg); > + break; > + case 2: > + argreg = tcg_out_arg_reg32(s, argreg, data_reg); > + break; > + case 3: > + argreg = tcg_out_arg_reg64(s, argreg, data_reg, data_reg2); > + break; > + } > + > + argreg = tcg_out_arg_imm32(s, argreg, lb->mem_index); > + tcg_out_call(s, (tcg_target_long) qemu_st_helpers[lb->opc & 3]); > + > + /* For GETPC_LDST in exec-all.h, we architect exactly 2 insns between > + the call and the branch back to straight-line code. 
*/ > + tcg_out_nop(s); > + tcg_out_nop(s); > + tcg_out_goto(s, COND_AL, (tcg_target_long)lb->raddr); > +} > #endif /* SOFTMMU */ > > static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, int opc) > @@ -1208,8 +1350,8 @@ static void tcg_out_qemu_ld(TCGContext *s, const TCGArg > *args, int opc) > bool bswap; > #ifdef CONFIG_SOFTMMU > int mem_index, s_bits; > - TCGReg argreg, addr_reg2; > - uint32_t *label_ptr; > + TCGReg addr_reg2; > + uint8_t *label_ptr; > #endif > #ifdef TARGET_WORDS_BIGENDIAN > bswap = 1; > @@ -1228,89 +1370,56 @@ static void tcg_out_qemu_ld(TCGContext *s, const > TCGArg *args, int opc) > tcg_out_tlb_read(s, addr_reg, addr_reg2, s_bits, > offsetof(CPUArchState, > tlb_table[mem_index][0].addr_read)); > > - tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R2, > + label_ptr = s->code_ptr; > + tcg_out_b_noaddr(s, COND_NE); > + > + tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, > offsetof(CPUTLBEntry, addend) > - offsetof(CPUTLBEntry, addr_read)); > > switch (opc) { > case 0: > - tcg_out_ld8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); > + tcg_out_ld8_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); > break; > case 0 | 4: > - tcg_out_ld8s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); > + tcg_out_ld8s_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); > break; > case 1: > - tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); > + tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); > if (bswap) { > - tcg_out_bswap16(s, COND_EQ, data_reg, data_reg); > + tcg_out_bswap16(s, COND_AL, data_reg, data_reg); > } > break; > case 1 | 4: > if (bswap) { > - tcg_out_ld16u_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); > - tcg_out_bswap16s(s, COND_EQ, data_reg, data_reg); > + tcg_out_ld16u_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); > + tcg_out_bswap16s(s, COND_AL, data_reg, data_reg); > } else { > - tcg_out_ld16s_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); > + tcg_out_ld16s_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); > } > break; > 
case 2: > default: > - tcg_out_ld32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); > + tcg_out_ld32_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); > if (bswap) { > - tcg_out_bswap32(s, COND_EQ, data_reg, data_reg); > + tcg_out_bswap32(s, COND_AL, data_reg, data_reg); > } > break; > case 3: > if (bswap) { > - tcg_out_ld32_rwb(s, COND_EQ, data_reg2, TCG_REG_R1, addr_reg); > - tcg_out_ld32_12(s, COND_EQ, data_reg, TCG_REG_R1, 4); > - tcg_out_bswap32(s, COND_EQ, data_reg2, data_reg2); > - tcg_out_bswap32(s, COND_EQ, data_reg, data_reg); > + tcg_out_ld32_rwb(s, COND_AL, data_reg2, TCG_REG_R1, addr_reg); > + tcg_out_ld32_12(s, COND_AL, data_reg, TCG_REG_R1, 4); > + tcg_out_bswap32(s, COND_AL, data_reg2, data_reg2); > + tcg_out_bswap32(s, COND_AL, data_reg, data_reg); > } else { > - tcg_out_ld32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg); > - tcg_out_ld32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4); > + tcg_out_ld32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg); > + tcg_out_ld32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4); > } > break; > } > > - label_ptr = (void *) s->code_ptr; > - tcg_out_b_noaddr(s, COND_EQ); > - > - /* TODO: move this code to where the constants pool will be */ > - /* Note that this code relies on the constraints we set in arm_op_defs[] > - * to ensure that later arguments are not passed to us in registers we > - * trash by moving the earlier arguments into them. 
> - */ > - argreg = TCG_REG_R0; > - argreg = tcg_out_arg_reg32(s, argreg, TCG_AREG0); > - if (TARGET_LONG_BITS == 64) { > - argreg = tcg_out_arg_reg64(s, argreg, addr_reg, addr_reg2); > - } else { > - argreg = tcg_out_arg_reg32(s, argreg, addr_reg); > - } > - argreg = tcg_out_arg_imm32(s, argreg, mem_index); > - tcg_out_call(s, (tcg_target_long) qemu_ld_helpers[s_bits]); > - > - switch (opc) { > - case 0 | 4: > - tcg_out_ext8s(s, COND_AL, data_reg, TCG_REG_R0); > - break; > - case 1 | 4: > - tcg_out_ext16s(s, COND_AL, data_reg, TCG_REG_R0); > - break; > - case 0: > - case 1: > - case 2: > - default: > - tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0); > - break; > - case 3: > - tcg_out_mov_reg(s, COND_AL, data_reg, TCG_REG_R0); > - tcg_out_mov_reg(s, COND_AL, data_reg2, TCG_REG_R1); > - break; > - } > - > - reloc_pc24(label_ptr, (tcg_target_long)s->code_ptr); > + add_qemu_ldst_label(s, 1, opc, data_reg, data_reg2, addr_reg, addr_reg2, > + mem_index, s->code_ptr, label_ptr); > #else /* !CONFIG_SOFTMMU */ > if (GUEST_BASE) { > uint32_t offset = GUEST_BASE; > @@ -1379,8 +1488,8 @@ static void tcg_out_qemu_st(TCGContext *s, const TCGArg > *args, int opc) > bool bswap; > #ifdef CONFIG_SOFTMMU > int mem_index, s_bits; > - TCGReg argreg, addr_reg2; > - uint32_t *label_ptr; > + TCGReg addr_reg2; > + uint8_t *label_ptr; > #endif > #ifdef TARGET_WORDS_BIGENDIAN > bswap = 1; > @@ -1400,79 +1509,49 @@ static void tcg_out_qemu_st(TCGContext *s, const > TCGArg *args, int opc) > offsetof(CPUArchState, > tlb_table[mem_index][0].addr_write)); > > - tcg_out_ld32_12(s, COND_EQ, TCG_REG_R1, TCG_REG_R2, > + label_ptr = s->code_ptr; > + tcg_out_b_noaddr(s, COND_NE); > + > + tcg_out_ld32_12(s, COND_AL, TCG_REG_R1, TCG_REG_R2, > offsetof(CPUTLBEntry, addend) > - offsetof(CPUTLBEntry, addr_write)); > > switch (opc) { > case 0: > - tcg_out_st8_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); > + tcg_out_st8_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); > break; > case 1: > if (bswap) { > - 
tcg_out_bswap16st(s, COND_EQ, TCG_REG_R0, data_reg); > - tcg_out_st16_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1); > + tcg_out_bswap16st(s, COND_AL, TCG_REG_R0, data_reg); > + tcg_out_st16_r(s, COND_AL, TCG_REG_R0, addr_reg, TCG_REG_R1); > } else { > - tcg_out_st16_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); > + tcg_out_st16_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); > } > break; > case 2: > default: > if (bswap) { > - tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg); > - tcg_out_st32_r(s, COND_EQ, TCG_REG_R0, addr_reg, TCG_REG_R1); > + tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg); > + tcg_out_st32_r(s, COND_AL, TCG_REG_R0, addr_reg, TCG_REG_R1); > } else { > - tcg_out_st32_r(s, COND_EQ, data_reg, addr_reg, TCG_REG_R1); > + tcg_out_st32_r(s, COND_AL, data_reg, addr_reg, TCG_REG_R1); > } > break; > case 3: > if (bswap) { > - tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg2); > - tcg_out_st32_rwb(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, addr_reg); > - tcg_out_bswap32(s, COND_EQ, TCG_REG_R0, data_reg); > - tcg_out_st32_12(s, COND_EQ, TCG_REG_R0, TCG_REG_R1, 4); > + tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg2); > + tcg_out_st32_rwb(s, COND_AL, TCG_REG_R0, TCG_REG_R1, addr_reg); > + tcg_out_bswap32(s, COND_AL, TCG_REG_R0, data_reg); > + tcg_out_st32_12(s, COND_AL, TCG_REG_R0, TCG_REG_R1, 4); > } else { > - tcg_out_st32_rwb(s, COND_EQ, data_reg, TCG_REG_R1, addr_reg); > - tcg_out_st32_12(s, COND_EQ, data_reg2, TCG_REG_R1, 4); > + tcg_out_st32_rwb(s, COND_AL, data_reg, TCG_REG_R1, addr_reg); > + tcg_out_st32_12(s, COND_AL, data_reg2, TCG_REG_R1, 4); > } > break; > } > > - label_ptr = (void *) s->code_ptr; > - tcg_out_b_noaddr(s, COND_EQ); > - > - /* TODO: move this code to where the constants pool will be */ > - /* Note that this code relies on the constraints we set in arm_op_defs[] > - * to ensure that later arguments are not passed to us in registers we > - * trash by moving the earlier arguments into them. 
> - */ > - argreg = TCG_REG_R0; > - argreg = tcg_out_arg_reg32(s, argreg, TCG_AREG0); > - if (TARGET_LONG_BITS == 64) { > - argreg = tcg_out_arg_reg64(s, argreg, addr_reg, addr_reg2); > - } else { > - argreg = tcg_out_arg_reg32(s, argreg, addr_reg); > - } > - > - switch (opc) { > - case 0: > - argreg = tcg_out_arg_reg8(s, argreg, data_reg); > - break; > - case 1: > - argreg = tcg_out_arg_reg16(s, argreg, data_reg); > - break; > - case 2: > - argreg = tcg_out_arg_reg32(s, argreg, data_reg); > - break; > - case 3: > - argreg = tcg_out_arg_reg64(s, argreg, data_reg, data_reg2); > - break; > - } > - > - argreg = tcg_out_arg_imm32(s, argreg, mem_index); > - tcg_out_call(s, (tcg_target_long) qemu_st_helpers[s_bits]); > - > - reloc_pc24(label_ptr, (tcg_target_long)s->code_ptr); > + add_qemu_ldst_label(s, 0, opc, data_reg, data_reg2, addr_reg, addr_reg2, > + mem_index, s->code_ptr, label_ptr); > #else /* !CONFIG_SOFTMMU */ > if (GUEST_BASE) { > uint32_t offset = GUEST_BASE; > @@ -1872,6 +1951,22 @@ static inline void tcg_out_op(TCGContext *s, TCGOpcode > opc, > } > } > > +#ifdef CONFIG_SOFTMMU > +/* Generate TB finalization at the end of block. */ > +void tcg_out_tb_finalize(TCGContext *s) > +{ > + int i; > + for (i = 0; i < s->nb_qemu_ldst_labels; i++) { > + TCGLabelQemuLdst *label = &s->qemu_ldst_labels[i]; > + if (label->is_ld) { > + tcg_out_qemu_ld_slow_path(s, label); > + } else { > + tcg_out_qemu_st_slow_path(s, label); > + } > + } > +} > +#endif /* SOFTMMU */ > + > static const TCGTargetOpDef arm_op_defs[] = { > { INDEX_op_exit_tb, { } }, > { INDEX_op_goto_tb, { } },
Reviewed-by: Aurelien Jarno <aurel...@aurel32.net> -- Aurelien Jarno GPG: 1024D/F1BCDB73 aurel...@aurel32.net http://www.aurel32.net