The movbe instruction has been added on some Intel Atom CPUs and on recent Intel Haswell CPUs. It loads or stores a value and byte-swaps it at the same time.
This patch detects the availability of this instruction and, when it is available, uses it in the qemu load/store routines in place of load/store + bswap. Note that for 16-bit unsigned loads, movbe + movzw is basically the same as movzw + bswap, so the patch doesn't touch this case. Signed-off-by: Aurelien Jarno <aurel...@aurel32.net> --- tcg/i386/tcg-target.c | 152 ++++++++++++++++++++++++++++++++++--------------- 1 file changed, 107 insertions(+), 45 deletions(-) diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c index e247829..8fbb0be 100644 --- a/tcg/i386/tcg-target.c +++ b/tcg/i386/tcg-target.c @@ -99,18 +99,31 @@ static const int tcg_target_call_oarg_regs[] = { # define TCG_REG_L1 TCG_REG_EDX #endif +/* The host compiler should supply <cpuid.h> to enable runtime features + detection, as we're not going to go so far as our own inline assembly. + If not available, default values will be assumed. */ +#if defined(CONFIG_CPUID_H) +#include <cpuid.h> +#endif + /* For 32-bit, we are going to attempt to determine at runtime whether cmov - is available. However, the host compiler must supply <cpuid.h>, as we're - not going to go so far as our own inline assembly. */ + is available. */ #if TCG_TARGET_REG_BITS == 64 # define have_cmov 1 #elif defined(CONFIG_CPUID_H) -#include <cpuid.h> static bool have_cmov; #else # define have_cmov 0 #endif +/* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are + going to attempt to determine at runtime whether movbe is available. 
*/ +#if defined(CONFIG_CPUID_H) && defined(bit_MOVBE) +static bool have_movbe; +#else +# define have_movbe 0 +#endif + static uint8_t *tb_ret_addr; static void patch_reloc(uint8_t *code_ptr, int type, @@ -280,6 +293,8 @@ static inline int tcg_target_const_match(tcg_target_long val, #define OPC_MOVB_EvIz (0xc6) #define OPC_MOVL_EvIz (0xc7) #define OPC_MOVL_Iv (0xb8) +#define OPC_MOVBE_GyMy (0xf0 | P_EXT2) +#define OPC_MOVBE_MyGy (0xf1 | P_EXT2) #define OPC_MOVSBL (0xbe | P_EXT) #define OPC_MOVSWL (0xbf | P_EXT) #define OPC_MOVSLQ (0x63 | P_REXW) @@ -1363,8 +1378,13 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, break; case MO_SW: if (bswap) { - tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs); - tcg_out_rolw_8(s, datalo); + if (have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg, + datalo, base, ofs); + } else { + tcg_out_modrm_offset(s, OPC_MOVZWL + seg, datalo, base, ofs); + tcg_out_rolw_8(s, datalo); + } tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo); } else { tcg_out_modrm_offset(s, OPC_MOVSWL + P_REXW + seg, @@ -1372,16 +1392,25 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, } break; case MO_UL: - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs); - if (bswap) { - tcg_out_bswap32(s, datalo); + if (bswap && have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + seg, datalo, base, ofs); + } else { + tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs); + if (bswap) { + tcg_out_bswap32(s, datalo); + } } break; #if TCG_TARGET_REG_BITS == 64 case MO_SL: if (bswap) { - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs); - tcg_out_bswap32(s, datalo); + if (have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + seg, + datalo, base, ofs); + } else { + tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, datalo, base, ofs); + tcg_out_bswap32(s, datalo); + } tcg_out_ext32s(s, datalo, datalo); } else { tcg_out_modrm_offset(s, 
OPC_MOVSLQ + seg, datalo, base, ofs); @@ -1390,29 +1419,34 @@ static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, #endif case MO_Q: if (TCG_TARGET_REG_BITS == 64) { - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg, - datalo, base, ofs); - if (bswap) { - tcg_out_bswap64(s, datalo); + if (bswap && have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_GyMy + P_REXW + seg, + datalo, base, ofs); + } else { + tcg_out_modrm_offset(s, OPC_MOVL_GvEv + P_REXW + seg, + datalo, base, ofs); + if (bswap) { + tcg_out_bswap64(s, datalo); + } } } else { + int opc = OPC_MOVL_GvEv; if (bswap) { int t = datalo; datalo = datahi; datahi = t; + if (have_movbe) { + opc = OPC_MOVBE_GyMy; + } } if (base != datalo) { - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, - datalo, base, ofs); - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, - datahi, base, ofs + 4); + tcg_out_modrm_offset(s, opc + seg, datalo, base, ofs); + tcg_out_modrm_offset(s, opc + seg, datahi, base, ofs + 4); } else { - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, - datahi, base, ofs + 4); - tcg_out_modrm_offset(s, OPC_MOVL_GvEv + seg, - datalo, base, ofs); + tcg_out_modrm_offset(s, opc + seg, datahi, base, ofs + 4); + tcg_out_modrm_offset(s, opc + seg, datalo, base, ofs); } - if (bswap) { + if (bswap && opc != OPC_MOVBE_GyMy) { tcg_out_bswap32(s, datalo); tcg_out_bswap32(s, datahi); } @@ -1506,31 +1540,48 @@ static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi, datalo, base, ofs); break; case MO_16: - if (bswap) { - tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo); - tcg_out_rolw_8(s, scratch); - datalo = scratch; + if (bswap & have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_MyGy + P_DATA16 + seg, + datalo, base, ofs); + } else { + if (bswap) { + tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo); + tcg_out_rolw_8(s, scratch); + datalo = scratch; + } + tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_DATA16 + seg, + datalo, base, ofs); } - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + 
P_DATA16 + seg, - datalo, base, ofs); break; case MO_32: - if (bswap) { - tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo); - tcg_out_bswap32(s, scratch); - datalo = scratch; + if (bswap & have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_MyGy + seg, datalo, base, ofs); + } else { + if (bswap) { + tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo); + tcg_out_bswap32(s, scratch); + datalo = scratch; + } + tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs); } - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, datalo, base, ofs); break; case MO_64: if (TCG_TARGET_REG_BITS == 64) { - if (bswap) { - tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo); - tcg_out_bswap64(s, scratch); - datalo = scratch; + if (bswap && have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_MyGy + P_REXW + seg, + datalo, base, ofs); + } else { + if (bswap) { + tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo); + tcg_out_bswap64(s, scratch); + datalo = scratch; + } + tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg, + datalo, base, ofs); } - tcg_out_modrm_offset(s, OPC_MOVL_EvGv + P_REXW + seg, - datalo, base, ofs); + } else if (bswap && have_movbe) { + tcg_out_modrm_offset(s, OPC_MOVBE_MyGy + seg, datahi, base, ofs); + tcg_out_modrm_offset(s, OPC_MOVBE_MyGy + seg, datalo, base, ofs+4); } else if (bswap) { tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi); tcg_out_bswap32(s, scratch); @@ -2167,13 +2218,24 @@ static void tcg_target_qemu_prologue(TCGContext *s) static void tcg_target_init(TCGContext *s) { - /* For 32-bit, 99% certainty that we're running on hardware that supports - cmov, but we still need to check. In case cmov is not available, we'll - use a small forward branch. 
*/ -#ifndef have_cmov +#if !(defined(have_cmov) && defined(have_movbe)) { unsigned a, b, c, d; - have_cmov = (__get_cpuid(1, &a, &b, &c, &d) && (d & bit_CMOV)); + int ret; + ret = __get_cpuid(1, &a, &b, &c, &d); + +# ifndef have_cmov + /* For 32-bit, 99% certainty that we're running on hardware that + supports cmov, but we still need to check. In case cmov is not + available, we'll use a small forward branch. */ + have_cmov = ret && (d & bit_CMOV); +# endif + +# ifndef have_movbe + /* MOVBE is only available on Intel Atom and Haswell CPUs, so we + need to probe for it. */ + have_movbe = ret && (c & bit_MOVBE); +# endif } #endif -- 1.7.10.4