On Mon, Nov 02, 2009 at 05:16:44PM +0200, Ulrich Hecht wrote: > On Thursday 22 October 2009, Aurelien Jarno wrote: > > Probably the second. Changing the instruction pointer in the helper > > instead of using the proper goto_tb TCG op prevents TB chaining, and > > therefore as a huge impact on performance. > > > > It's something not difficult to implement, and that I would definitely > > want to see in the patch before getting it merged. > > OK, I implemented it, and the surprising result is that performance does > not get any better; in fact it even suffers a little bit. (My standard > quick test, the polarssl test suite, shows about a 2% performance impact > when profiled with cachegrind).
That looks really strange, as TB chaining clearly reduces the number of instructions to execute, by not having to look up the TB after each branch. Also using a brcond instead of a helper should change nothing as it is located at the end of the TB, where all the globals must be saved anyway. Also a recent bug found on an ARM host with regard to TB chaining has shown it can give a noticeable speed gain. On what host are you doing your benchmarks? > Could there be anything I overlooked? I modelled my implementation after > those in the existing targets. (See the attached patch that goes on top > of my other S/390 patches.) > Your patch looks good overall; I have minor comments (see below), but nothing that should improve the speed noticeably. > -- > SUSE LINUX Products GmbH, GF: Markus Rex, HRB 16746 (AG Nürnberg) > diff --git a/target-s390/helpers.h b/target-s390/helpers.h > index 0d16760..6009312 100644 > --- a/target-s390/helpers.h > +++ b/target-s390/helpers.h > @@ -15,7 +15,6 @@ DEF_HELPER_FLAGS_1(set_cc_comp_s64, > TCG_CALL_PURE|TCG_CALL_CONST, i32, s64) > DEF_HELPER_FLAGS_1(set_cc_nz_u32, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32) > DEF_HELPER_FLAGS_1(set_cc_nz_u64, TCG_CALL_PURE|TCG_CALL_CONST, i32, i64) > DEF_HELPER_FLAGS_2(set_cc_icm, TCG_CALL_PURE|TCG_CALL_CONST, i32, i32, i32) > -DEF_HELPER_4(brc, void, i32, i32, i64, s32) > DEF_HELPER_3(brctg, void, i64, i64, s32) > DEF_HELPER_3(brct, void, i32, i64, s32) > DEF_HELPER_4(brcl, void, i32, i32, i64, s64) > diff --git a/target-s390/op_helper.c b/target-s390/op_helper.c > index 637d22f..f7f52ba 100644 > --- a/target-s390/op_helper.c > +++ b/target-s390/op_helper.c > @@ -218,17 +218,6 @@ uint32_t HELPER(set_cc_icm)(uint32_t mask, uint32_t val) > return cc; > } > > -/* relative conditional branch */ > -void HELPER(brc)(uint32_t cc, uint32_t mask, uint64_t pc, int32_t offset) > -{ > - if ( mask & ( 1 << (3 - cc) ) ) { > - env->psw.addr = pc + offset; > - } > - else { > - env->psw.addr = pc + 4; > - } > -} > - > /* 
branch relative on 64-bit count (condition is computed inline, this only > does the branch */ > void HELPER(brctg)(uint64_t flag, uint64_t pc, int32_t offset) > diff --git a/target-s390/translate.c b/target-s390/translate.c > index 9ffa7bd..5a7cfe7 100644 > --- a/target-s390/translate.c > +++ b/target-s390/translate.c > @@ -49,6 +49,7 @@ struct DisasContext { > uint64_t pc; > int is_jmp; > CPUS390XState *env; > + struct TranslationBlock *tb; > }; > > #define DISAS_EXCP 4 > @@ -359,23 +360,55 @@ static void gen_bcr(uint32_t mask, int tr, uint64_t > offset) > tcg_temp_free(target); > } > > -static void gen_brc(uint32_t mask, uint64_t pc, int32_t offset) > +static inline void gen_goto_tb(DisasContext *s, int tb_num, target_ulong pc) > { > - TCGv p; > - TCGv_i32 m, o; > + TranslationBlock *tb; > + > + tb = s->tb; > + /* NOTE: we handle the case where the TB spans two pages here */ > + if ((pc & TARGET_PAGE_MASK) == (tb->pc & TARGET_PAGE_MASK) || > + (pc & TARGET_PAGE_MASK) == ((s->pc - 1) & TARGET_PAGE_MASK)) { I have difficulty figuring out why the second comparison is needed. I know it comes from target-i386, but on the other hand it is not present in other targets. 
> + /* jump to same page: we can use a direct jump */ > + tcg_gen_mov_i32(global_cc, cc); > + tcg_gen_goto_tb(tb_num); > + tcg_gen_movi_i64(psw_addr, pc); > + tcg_gen_exit_tb((long)tb + tb_num); > + } else { > + /* jump to another page: currently not optimized */ > + tcg_gen_movi_i64(psw_addr, pc); > + tcg_gen_mov_i32(global_cc, cc); > + tcg_gen_exit_tb(0); > + } > +} > + > +static void gen_brc(uint32_t mask, DisasContext *s, int32_t offset) > +{ > + TCGv_i32 r; > + TCGv_i32 tmp, tmp2; > + int skip; > > if (mask == 0xf) { /* unconditional */ > - tcg_gen_movi_i64(psw_addr, pc + offset); > + //tcg_gen_movi_i64(psw_addr, s->pc + offset); > + gen_goto_tb(s, 0, s->pc + offset); > } > else { > - m = tcg_const_i32(mask); > - p = tcg_const_i64(pc); > - o = tcg_const_i32(offset); > - gen_helper_brc(cc, m, p, o); > - tcg_temp_free(m); > - tcg_temp_free(p); > - tcg_temp_free(o); > + tmp = tcg_const_i32(3); > + tcg_gen_sub_i32(tmp, tmp, cc); /* 3 - cc */ > + tmp2 = tcg_const_i32(1); > + tcg_gen_shl_i32(tmp2, tmp2, tmp); /* 1 << (3 - cc) */ > + r = tcg_const_i32(mask); > + tcg_gen_and_i32(r, r, tmp2); /* mask & (1 << (3 - cc)) */ > + tcg_temp_free(tmp); > + tcg_temp_free(tmp2); > + skip = gen_new_label(); > + tcg_gen_brcondi_i32(TCG_COND_EQ, r, 0, skip); > + gen_goto_tb(s, 0, s->pc + offset); > + gen_set_label(skip); > + gen_goto_tb(s, 1, s->pc + 4); > + tcg_gen_mov_i32(global_cc, cc); This is probably not needed as this is already done in gen_goto_tb(). 
> + tcg_temp_free(r); > } > + s->is_jmp = DISAS_TB_JUMP; > } > > static void gen_set_cc_add64(TCGv v1, TCGv v2, TCGv vr) > @@ -1143,9 +1176,7 @@ static void disas_a7(DisasContext *s, int op, int r1, > int i2) > tcg_temp_free(tmp2); > break; > case 0x4: /* brc m1, i2 */ > - /* FIXME: optimize m1 == 0xf (unconditional) case */ > - gen_brc(r1, s->pc, i2 * 2); > - s->is_jmp = DISAS_JUMP; > + gen_brc(r1, s, i2 * 2); > return; > case 0x5: /* BRAS R1,I2 [RI] */ > tmp = tcg_const_i64(s->pc + 4); > @@ -2739,6 +2770,7 @@ static inline void gen_intermediate_code_internal > (CPUState *env, > dc.env = env; > dc.pc = pc_start; > dc.is_jmp = DISAS_NEXT; > + dc.tb = tb; > > gen_opc_end = gen_opc_buf + OPC_MAX_SIZE; > > @@ -2778,8 +2810,11 @@ static inline void gen_intermediate_code_internal > (CPUState *env, > num_insns++; > } while (!dc.is_jmp && gen_opc_ptr < gen_opc_end && dc.pc < > next_page_start > && num_insns < max_insns && !env->singlestep_enabled); > - tcg_gen_mov_i32(global_cc, cc); > - tcg_temp_free(cc); > + > + if (dc.is_jmp != DISAS_TB_JUMP) { > + tcg_gen_mov_i32(global_cc, cc); > + tcg_temp_free(cc); > + } > > if (!dc.is_jmp) { > tcg_gen_st_i64(tcg_const_i64(dc.pc), cpu_env, offsetof(CPUState, > psw.addr)); > @@ -2794,7 +2829,9 @@ static inline void gen_intermediate_code_internal > (CPUState *env, > if (tb->cflags & CF_LAST_IO) > gen_io_end(); > /* Generate the return instruction */ > - tcg_gen_exit_tb(0); > + if (dc.is_jmp != DISAS_TB_JUMP) { > + tcg_gen_exit_tb(0); > + } > gen_icount_end(tb, num_insns); > *gen_opc_ptr = INDEX_op_end; > if (search_pc) { -- Aurelien Jarno GPG: 1024D/F1BCDB73 aurel...@aurel32.net http://www.aurel32.net