PF/ZF/SF are computed the same way for almost all CC_OP values,
depending only on the operand size in the case of ZF and SF.  The only
exception is PF for CC_OP_BLSI* and CC_OP_BMILG*, which used to be left
undefined; however, AMD documents that PF should be computed normally,
so sharing the code there is also a small bug fix.
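For reference, the common computation being factored out boils down to
the following size-generic helper (an illustrative sketch, not code
from this patch: compute_psz is a hypothetical name, compute_pf stands
in for the QEMU helper of the same name, and the CC_* masks match
target/i386/cpu.h):

    #include <stdint.h>

    #define CC_P 0x0004  /* EFLAGS parity flag */
    #define CC_Z 0x0040  /* EFLAGS zero flag */
    #define CC_S 0x0080  /* EFLAGS sign flag */

    /* PF is the even parity of the low 8 bits, whatever the operand size. */
    static uint32_t compute_pf(uint64_t dst)
    {
        return __builtin_parity((uint8_t)dst) ? 0 : CC_P;
    }

    /* PF/ZF/SF for a result of "width" bits (8, 16, 32 or 64): shift the
     * result so its sign bit lands in bit 63, then test the whole
     * register.  The new common tail of helper_cc_compute_all does
     * exactly this, accumulating the shift via the psz_* labels. */
    static uint32_t compute_psz(uint64_t dst, int width)
    {
        uint32_t flags = compute_pf(dst);

        dst <<= 64 - width;
        flags += dst == 0 ? CC_Z : 0;
        flags += (int64_t)dst < 0 ? CC_S : 0;
        return flags;
    }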
Put the common code at the end of helper_cc_compute_all, shaving
another kB from its text.

Signed-off-by: Paolo Bonzini <[email protected]>
---
 target/i386/cpu.h                        |   4 +-
 target/i386/tcg/cc_helper_template.h.inc | 112 +++------
 target/i386/tcg/cc_helper.c              | 274 +++++++++++++++--------
 3 files changed, 209 insertions(+), 181 deletions(-)

diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index cee1f692a1c..ecca38ed0b5 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -1495,12 +1495,12 @@ typedef enum {
     CC_OP_SARL,
     CC_OP_SARQ,
 
-    CC_OP_BMILGB, /* Z,S via CC_DST, C = SRC==0; O=0; P,A undefined */
+    CC_OP_BMILGB, /* P,Z,S via CC_DST, C = SRC==0; A=O=0 */
     CC_OP_BMILGW,
     CC_OP_BMILGL,
     CC_OP_BMILGQ,
 
-    CC_OP_BLSIB, /* Z,S via CC_DST, C = SRC!=0; O=0; P,A undefined */
+    CC_OP_BLSIB, /* P,Z,S via CC_DST, C = SRC!=0; A=O=0 */
     CC_OP_BLSIW,
     CC_OP_BLSIL,
     CC_OP_BLSIQ,
diff --git a/target/i386/tcg/cc_helper_template.h.inc b/target/i386/tcg/cc_helper_template.h.inc
index d8fd976ca15..af58c2409f7 100644
--- a/target/i386/tcg/cc_helper_template.h.inc
+++ b/target/i386/tcg/cc_helper_template.h.inc
@@ -1,5 +1,5 @@
 /*
- * x86 condition code helpers
+ * x86 condition code helpers for AF/CF/OF
  *
  * Copyright (c) 2008 Fabrice Bellard
  *
@@ -44,14 +44,9 @@
 
 /* dynamic flags computation */
 
-static uint32_t glue(compute_all_cout, SUFFIX)(DATA_TYPE dst, DATA_TYPE carries)
+static uint32_t glue(compute_aco_cout, SUFFIX)(DATA_TYPE carries)
 {
-    uint32_t af_cf, pf, zf, sf, of;
-
-    /* PF, ZF, SF computed from result. */
-    pf = compute_pf(dst);
-    zf = (dst == 0) * CC_Z;
-    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
+    uint32_t af_cf, of;
 
     /*
      * AF, CF, OF computed from carry out vector. To compute AF and CF, rotate it
@@ -62,14 +57,14 @@ static uint32_t glue(compute_all_cout, SUFFIX)(DATA_TYPE dst, DATA_TYPE carries
      */
     af_cf = ((carries << 1) | (carries >> (DATA_BITS - 1))) & (CC_A | CC_C);
     of = (lshift(carries, 12 - DATA_BITS) + CC_O / 2) & CC_O;
-    return pf + zf + sf + af_cf + of;
+    return af_cf + of;
 }
 
-static uint32_t glue(compute_all_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+static uint32_t glue(compute_aco_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
     DATA_TYPE src2 = dst - src1;
     DATA_TYPE carries = ADD_COUT_VEC(src1, src2, dst);
-    return glue(compute_all_cout, SUFFIX)(dst, carries);
+    return glue(compute_aco_cout, SUFFIX)(carries);
 }
 
 static int glue(compute_c_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -77,12 +72,12 @@ static int glue(compute_c_add, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     return dst < src1;
 }
 
-static uint32_t glue(compute_all_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
+static uint32_t glue(compute_aco_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
                                               DATA_TYPE src3)
 {
     DATA_TYPE src2 = dst - src1 - src3;
     DATA_TYPE carries = ADD_COUT_VEC(src1, src2, dst);
-    return glue(compute_all_cout, SUFFIX)(dst, carries);
+    return glue(compute_aco_cout, SUFFIX)(carries);
 }
 
 static int glue(compute_c_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
@@ -97,11 +92,11 @@ static int glue(compute_c_adc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1,
 #endif
 }
 
-static uint32_t glue(compute_all_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
+static uint32_t glue(compute_aco_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
 {
     DATA_TYPE src1 = dst + src2;
     DATA_TYPE carries = SUB_COUT_VEC(src1, src2, dst);
-    return glue(compute_all_cout, SUFFIX)(dst, carries);
+    return glue(compute_aco_cout, SUFFIX)(carries);
 }
 
 static int glue(compute_c_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
@@ -111,12 +106,12 @@ static int glue(compute_c_sub, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2)
     return src1 < src2;
 }
 
-static uint32_t glue(compute_all_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
+static uint32_t glue(compute_aco_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
                                               DATA_TYPE src3)
 {
     DATA_TYPE src1 = dst + src2 + src3;
     DATA_TYPE carries = SUB_COUT_VEC(src1, src2, dst);
-    return glue(compute_all_cout, SUFFIX)(dst, carries);
+    return glue(compute_aco_cout, SUFFIX)(carries);
 }
 
 static int glue(compute_c_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
@@ -134,57 +129,35 @@ static int glue(compute_c_sbb, SUFFIX)(DATA_TYPE dst, DATA_TYPE src2,
 #endif
 }
 
-static uint32_t glue(compute_all_logic, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+static uint32_t glue(compute_aco_inc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    uint32_t cf, pf, af, zf, sf, of;
-
-    cf = 0;
-    pf = compute_pf(dst);
-    af = 0;
-    zf = (dst == 0) * CC_Z;
-    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
-    of = 0;
-    return cf + pf + af + zf + sf + of;
-}
-
-static uint32_t glue(compute_all_inc, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
-{
-    uint32_t cf, pf, af, zf, sf, of;
+    uint32_t cf, af, of;
 
     cf = src1;
-    pf = compute_pf(dst);
     af = (dst ^ (dst - 1)) & CC_A; /* bits 0..3 are all clear */
-    zf = (dst == 0) * CC_Z;
-    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = (dst == SIGN_MASK) * CC_O;
-    return cf + pf + af + zf + sf + of;
+    return cf + af + of;
 }
 
-static uint32_t glue(compute_all_dec, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+static uint32_t glue(compute_aco_dec, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    uint32_t cf, pf, af, zf, sf, of;
+    uint32_t cf, af, of;
 
     cf = src1;
-    pf = compute_pf(dst);
     af = (dst ^ (dst + 1)) & CC_A; /* bits 0..3 are all set */
-    zf = (dst == 0) * CC_Z;
-    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = (dst == SIGN_MASK - 1) * CC_O;
-    return cf + pf + af + zf + sf + of;
+    return cf + af + of;
 }
 
-static uint32_t glue(compute_all_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+static uint32_t glue(compute_aco_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    uint32_t cf, pf, af, zf, sf, of;
+    uint32_t cf, af, of;
 
     cf = (src1 >> (DATA_BITS - 1)) & CC_C;
-    pf = compute_pf(dst);
     af = 0; /* undefined */
-    zf = (dst == 0) * CC_Z;
-    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     /* of is defined iff shift count == 1 */
     of = lshift(src1 ^ dst, 12 - DATA_BITS) & CC_O;
-    return cf + pf + af + zf + sf + of;
+    return cf + af + of;
 }
 
 static int glue(compute_c_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -192,47 +165,25 @@ static int glue(compute_c_shl, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     return (src1 >> (DATA_BITS - 1)) & CC_C;
 }
 
-static uint32_t glue(compute_all_sar, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+static uint32_t glue(compute_aco_sar, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    uint32_t cf, pf, af, zf, sf, of;
+    uint32_t cf, af, of;
 
     cf = src1 & 1;
-    pf = compute_pf(dst);
     af = 0; /* undefined */
-    zf = (dst == 0) * CC_Z;
-    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     /* of is defined iff shift count == 1 */
     of = lshift(src1 ^ dst, 12 - DATA_BITS) & CC_O;
-    return cf + pf + af + zf + sf + of;
+    return cf + af + of;
 }
 
-/* NOTE: we compute the flags like the P4. On olders CPUs, only OF and
-   CF are modified and it is slower to do that. Note as well that we
-   don't truncate SRC1 for computing carry to DATA_TYPE. */
-static uint32_t glue(compute_all_mul, SUFFIX)(DATA_TYPE dst, target_long src1)
+static uint32_t glue(compute_aco_bmilg, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    uint32_t cf, pf, af, zf, sf, of;
-
-    cf = (src1 != 0);
-    pf = compute_pf(dst);
-    af = 0; /* undefined */
-    zf = (dst == 0) * CC_Z;
-    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
-    of = cf * CC_O;
-    return cf + pf + af + zf + sf + of;
-}
-
-static uint32_t glue(compute_all_bmilg, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
-{
-    uint32_t cf, pf, af, zf, sf, of;
+    uint32_t cf, af, of;
 
     cf = (src1 == 0);
-    pf = 0; /* undefined */
     af = 0; /* undefined */
-    zf = (dst == 0) * CC_Z;
-    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = 0;
-    return cf + pf + af + zf + sf + of;
+    return cf + af + of;
 }
 
 static int glue(compute_c_bmilg, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
@@ -240,17 +191,14 @@ static int glue(compute_c_bmilg, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
     return src1 == 0;
 }
 
-static int glue(compute_all_blsi, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
+static int glue(compute_aco_blsi, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
 {
-    uint32_t cf, pf, af, zf, sf, of;
+    uint32_t cf, af, of;
 
     cf = (src1 != 0);
-    pf = 0; /* undefined */
     af = 0; /* undefined */
-    zf = (dst == 0) * CC_Z;
-    sf = lshift(dst, 8 - DATA_BITS) & CC_S;
     of = 0;
-    return cf + pf + af + zf + sf + of;
+    return cf + af + of;
 }
 
 static int glue(compute_c_blsi, SUFFIX)(DATA_TYPE dst, DATA_TYPE src1)
diff --git a/target/i386/tcg/cc_helper.c b/target/i386/tcg/cc_helper.c
index f1940b40927..2c4170b5b77 100644
--- a/target/i386/tcg/cc_helper.c
+++ b/target/i386/tcg/cc_helper.c
@@ -73,9 +73,25 @@ target_ulong helper_cc_compute_nz(target_ulong dst, target_ulong src1,
     }
 }
 
+/* NOTE: we compute the flags like the P4. On olders CPUs, only OF and
+   CF are modified and it is slower to do that. Note as well that we
+   don't truncate SRC1 for computing carry to DATA_TYPE. */
+static inline uint32_t compute_aco_mul(target_long src1)
+{
+    uint32_t cf, af, of;
+
+    cf = (src1 != 0);
+    af = 0; /* undefined */
+    of = cf * CC_O;
+    return cf + af + of;
+}
+
 target_ulong helper_cc_compute_all(target_ulong dst, target_ulong src1,
                                    target_ulong src2, int op)
 {
+    uint32_t flags = 0;
+    int shift = 0;
+
     switch (op) {
     default: /* should never happen */
         return 0;
@@ -85,90 +101,6 @@ target_ulong helper_cc_compute_all(target_ulong dst, target_ulong src1,
     case CC_OP_POPCNT:
         return dst ? 0 : CC_Z;
 
-    case CC_OP_MULB:
-        return compute_all_mulb(dst, src1);
-    case CC_OP_MULW:
-        return compute_all_mulw(dst, src1);
-    case CC_OP_MULL:
-        return compute_all_mull(dst, src1);
-
-    case CC_OP_ADDB:
-        return compute_all_addb(dst, src1);
-    case CC_OP_ADDW:
-        return compute_all_addw(dst, src1);
-    case CC_OP_ADDL:
-        return compute_all_addl(dst, src1);
-
-    case CC_OP_ADCB:
-        return compute_all_adcb(dst, src1, src2);
-    case CC_OP_ADCW:
-        return compute_all_adcw(dst, src1, src2);
-    case CC_OP_ADCL:
-        return compute_all_adcl(dst, src1, src2);
-
-    case CC_OP_SUBB:
-        return compute_all_subb(dst, src1);
-    case CC_OP_SUBW:
-        return compute_all_subw(dst, src1);
-    case CC_OP_SUBL:
-        return compute_all_subl(dst, src1);
-
-    case CC_OP_SBBB:
-        return compute_all_sbbb(dst, src1, src2);
-    case CC_OP_SBBW:
-        return compute_all_sbbw(dst, src1, src2);
-    case CC_OP_SBBL:
-        return compute_all_sbbl(dst, src1, src2);
-
-    case CC_OP_LOGICB:
-        return compute_all_logicb(dst, src1);
-    case CC_OP_LOGICW:
-        return compute_all_logicw(dst, src1);
-    case CC_OP_LOGICL:
-        return compute_all_logicl(dst, src1);
-
-    case CC_OP_INCB:
-        return compute_all_incb(dst, src1);
-    case CC_OP_INCW:
-        return compute_all_incw(dst, src1);
-    case CC_OP_INCL:
-        return compute_all_incl(dst, src1);
-
-    case CC_OP_DECB:
-        return compute_all_decb(dst, src1);
-    case CC_OP_DECW:
-        return compute_all_decw(dst, src1);
-    case CC_OP_DECL:
-        return compute_all_decl(dst, src1);
-
-    case CC_OP_SHLB:
-        return compute_all_shlb(dst, src1);
-    case CC_OP_SHLW:
-        return compute_all_shlw(dst, src1);
-    case CC_OP_SHLL:
-        return compute_all_shll(dst, src1);
-
-    case CC_OP_SARB:
-        return compute_all_sarb(dst, src1);
-    case CC_OP_SARW:
-        return compute_all_sarw(dst, src1);
-    case CC_OP_SARL:
-        return compute_all_sarl(dst, src1);
-
-    case CC_OP_BMILGB:
-        return compute_all_bmilgb(dst, src1);
-    case CC_OP_BMILGW:
-        return compute_all_bmilgw(dst, src1);
-    case CC_OP_BMILGL:
-        return compute_all_bmilgl(dst, src1);
-
-    case CC_OP_BLSIB:
-        return compute_all_blsib(dst, src1);
-    case CC_OP_BLSIW:
-        return compute_all_blsiw(dst, src1);
-    case CC_OP_BLSIL:
-        return compute_all_blsil(dst, src1);
-
     case CC_OP_ADCX:
         return compute_all_adcx(dst, src1, src2);
     case CC_OP_ADOX:
@@ -176,33 +108,181 @@ target_ulong helper_cc_compute_all(target_ulong dst, target_ulong src1,
     case CC_OP_ADCOX:
         return compute_all_adcox(dst, src1, src2);
 
+    case CC_OP_MULB:
+        flags = compute_aco_mul(src1);
+        goto psz_b;
+    case CC_OP_MULW:
+        flags = compute_aco_mul(src1);
+        goto psz_w;
+    case CC_OP_MULL:
+        flags = compute_aco_mul(src1);
+        goto psz_l;
+
+    case CC_OP_ADDB:
+        flags = compute_aco_addb(dst, src1);
+        goto psz_b;
+    case CC_OP_ADDW:
+        flags = compute_aco_addw(dst, src1);
+        goto psz_w;
+    case CC_OP_ADDL:
+        flags = compute_aco_addl(dst, src1);
+        goto psz_l;
+
+    case CC_OP_ADCB:
+        flags = compute_aco_adcb(dst, src1, src2);
+        goto psz_b;
+    case CC_OP_ADCW:
+        flags = compute_aco_adcw(dst, src1, src2);
+        goto psz_w;
+    case CC_OP_ADCL:
+        flags = compute_aco_adcl(dst, src1, src2);
+        goto psz_l;
+
+    case CC_OP_SUBB:
+        flags = compute_aco_subb(dst, src1);
+        goto psz_b;
+    case CC_OP_SUBW:
+        flags = compute_aco_subw(dst, src1);
+        goto psz_w;
+    case CC_OP_SUBL:
+        flags = compute_aco_subl(dst, src1);
+        goto psz_l;
+
+    case CC_OP_SBBB:
+        flags = compute_aco_sbbb(dst, src1, src2);
+        goto psz_b;
+    case CC_OP_SBBW:
+        flags = compute_aco_sbbw(dst, src1, src2);
+        goto psz_w;
+    case CC_OP_SBBL:
+        flags = compute_aco_sbbl(dst, src1, src2);
+        goto psz_l;
+
+    case CC_OP_LOGICB:
+        flags = 0;
+        goto psz_b;
+    case CC_OP_LOGICW:
+        flags = 0;
+        goto psz_w;
+    case CC_OP_LOGICL:
+        flags = 0;
+        goto psz_l;
+
+    case CC_OP_INCB:
+        flags = compute_aco_incb(dst, src1);
+        goto psz_b;
+    case CC_OP_INCW:
+        flags = compute_aco_incw(dst, src1);
+        goto psz_w;
+    case CC_OP_INCL:
+        flags = compute_aco_incl(dst, src1);
+        goto psz_l;
+
+    case CC_OP_DECB:
+        flags = compute_aco_decb(dst, src1);
+        goto psz_b;
+    case CC_OP_DECW:
+        flags = compute_aco_decw(dst, src1);
+        goto psz_w;
+    case CC_OP_DECL:
+        flags = compute_aco_decl(dst, src1);
+        goto psz_l;
+
+    case CC_OP_SHLB:
+        flags = compute_aco_shlb(dst, src1);
+        goto psz_b;
+    case CC_OP_SHLW:
+        flags = compute_aco_shlw(dst, src1);
+        goto psz_w;
+    case CC_OP_SHLL:
+        flags = compute_aco_shll(dst, src1);
+        goto psz_l;
+
+    case CC_OP_SARB:
+        flags = compute_aco_sarb(dst, src1);
+        goto psz_b;
+    case CC_OP_SARW:
+        flags = compute_aco_sarw(dst, src1);
+        goto psz_w;
+    case CC_OP_SARL:
+        flags = compute_aco_sarl(dst, src1);
+        goto psz_l;
+
+    case CC_OP_BMILGB:
+        flags = compute_aco_bmilgb(dst, src1);
+        goto psz_b;
+    case CC_OP_BMILGW:
+        flags = compute_aco_bmilgw(dst, src1);
+        goto psz_w;
+    case CC_OP_BMILGL:
+        flags = compute_aco_bmilgl(dst, src1);
+        goto psz_l;
+
+    case CC_OP_BLSIB:
+        flags = compute_aco_blsib(dst, src1);
+        goto psz_b;
+    case CC_OP_BLSIW:
+        flags = compute_aco_blsiw(dst, src1);
+        goto psz_w;
+    case CC_OP_BLSIL:
+        flags = compute_aco_blsil(dst, src1);
+        goto psz_l;
+
 #ifdef TARGET_X86_64
     case CC_OP_MULQ:
-        return compute_all_mulq(dst, src1);
+        flags = compute_aco_mul(src1);
+        goto psz_q;
     case CC_OP_ADDQ:
-        return compute_all_addq(dst, src1);
+        flags = compute_aco_addq(dst, src1);
+        goto psz_q;
     case CC_OP_ADCQ:
-        return compute_all_adcq(dst, src1, src2);
+        flags = compute_aco_adcq(dst, src1, src2);
+        goto psz_q;
     case CC_OP_SUBQ:
-        return compute_all_subq(dst, src1);
+        flags = compute_aco_subq(dst, src1);
+        goto psz_q;
     case CC_OP_SBBQ:
-        return compute_all_sbbq(dst, src1, src2);
-    case CC_OP_LOGICQ:
-        return compute_all_logicq(dst, src1);
+        flags = compute_aco_sbbq(dst, src1, src2);
+        goto psz_q;
     case CC_OP_INCQ:
-        return compute_all_incq(dst, src1);
+        flags = compute_aco_incq(dst, src1);
+        goto psz_q;
     case CC_OP_DECQ:
-        return compute_all_decq(dst, src1);
+        flags = compute_aco_decq(dst, src1);
+        goto psz_q;
+    case CC_OP_LOGICQ:
+        flags = 0;
+        goto psz_q;
     case CC_OP_SHLQ:
-        return compute_all_shlq(dst, src1);
+        flags = compute_aco_shlq(dst, src1);
+        goto psz_q;
     case CC_OP_SARQ:
-        return compute_all_sarq(dst, src1);
+        flags = compute_aco_sarq(dst, src1);
+        goto psz_q;
     case CC_OP_BMILGQ:
-        return compute_all_bmilgq(dst, src1);
+        flags = compute_aco_bmilgq(dst, src1);
+        goto psz_q;
     case CC_OP_BLSIQ:
-        return compute_all_blsiq(dst, src1);
+        flags = compute_aco_blsiq(dst, src1);
+        goto psz_q;
 #endif
     }
+
+psz_b:
+    shift += 8;
+psz_w:
+    shift += 16;
+psz_l:
+#ifdef TARGET_X86_64
+    shift += 32;
+psz_q:
+#endif
+
+    flags += compute_pf(dst);
+    dst <<= shift;
+    flags += dst == 0 ? CC_Z : 0;
+    flags += (target_long)dst < 0 ? CC_S : 0;
+    return flags;
 }
 
 uint32_t cpu_cc_compute_all(CPUX86State *env)
-- 
2.52.0
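For readers skimming the psz_* tail above: the labels fall through, so
the accumulated shift is always 64 minus the operand width (56 for a
byte, 48 for a word, 32 for a long, 0 for a quad on a 64-bit build),
exactly what is needed to move the result's sign bit into bit 63.  A
self-contained sketch, with tail_shift a hypothetical name rather than
code from the patch:

    #include <stdio.h>

    /* Mirror the fallthrough shift accumulation of the new tail. */
    static int tail_shift(int width)
    {
        int shift = 0;

        switch (width) {
        case 8:
            goto psz_b;
        case 16:
            goto psz_w;
        case 32:
            goto psz_l;
        default:
            goto psz_q;
        }

    psz_b:
        shift += 8;   /* falls through */
    psz_w:
        shift += 16;  /* falls through */
    psz_l:
        shift += 32;  /* falls through */
    psz_q:
        return shift; /* always 64 - width */
    }

    int main(void)
    {
        for (int width = 8; width <= 64; width *= 2) {
            printf("width %2d -> shift %2d\n", width, tail_shift(width));
        }
        return 0;
    }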
