On 04.06.2010, at 02:35, Richard Henderson wrote: > For 32-bit, using a segment override is smaller than the 4-byte > immediate offset. For 64-bit, segments can hold the entire 64-bit > offset whereas the 4-byte immediate cannot.
Very nice idea indeed :). Have you found it to be faster? IIRC segment accesses are slower when seg_offs != 0. But then again the code is smaller, so the size reduction might outweigh that. > Only implemented for linux, with fallback to the immediate offset > if the system call fails. > > Signed-off-by: Richard Henderson <r...@twiddle.net> > --- > tcg/i386/tcg-target.c | 206 +++++++++++++++++++++++++++++++++++------------- > 1 files changed, 150 insertions(+), 56 deletions(-) > > diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c > index fab2a30..e34254f 100644 > --- a/tcg/i386/tcg-target.c > +++ b/tcg/i386/tcg-target.c > @@ -240,6 +240,8 @@ static inline int tcg_target_const_match(tcg_target_long > val, > # define P_REXB_R 0 > # define P_REXB_RM 0 > #endif > +#define P_FS 0x4000 > +#define P_GS 0x8000 > > #define OPC_ARITH_EvIz (0x81) > #define OPC_ARITH_EvIb (0x83) > @@ -347,11 +349,29 @@ static const uint8_t tcg_cond_to_jcc[10] = { > [TCG_COND_GTU] = JCC_JA, > }; > > +static inline void tcg_out_seg_prefix(TCGContext *s, int opc) > +{ > + switch (opc & (P_FS | P_GS)) { > + case 0: > + break; > + case P_FS: > + tcg_out8(s, 0x64); > + break; > + case P_GS: > + tcg_out8(s, 0x65); > + break; > + default: > + tcg_abort(); > + } > +} > + > #if TCG_TARGET_REG_BITS == 64 > static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x) > { > int rex; > > + tcg_out_seg_prefix(s, opc); > + > if (opc & P_DATA16) { > /* We should never be asking for both 16 and 64-bit operation. 
*/ > assert((opc & P_REXW) == 0); > @@ -387,6 +407,8 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, > int rm, int x) > #else > static void tcg_out_opc(TCGContext *s, int opc) > { > + tcg_out_seg_prefix(s, opc); > + > if (opc & P_DATA16) { > tcg_out8(s, 0x66); > } > @@ -956,6 +978,48 @@ static void tcg_out_jmp(TCGContext *s, tcg_target_long > dest) > tcg_out_branch(s, 0, dest); > } > > +#ifndef GUEST_BASE > +#define GUEST_BASE 0 > +#endif > + > +#if defined(__x86_64__) && defined(__linux__) > +# include <sys/syscall.h> > +# include <asm/prctl.h> > + > +static int guest_base_flags; > +static inline void setup_guest_base_seg(void) > +{ > + if (syscall(__NR_arch_prctl, ARCH_SET_GS, GUEST_BASE) == 0) { > + guest_base_flags = P_GS; I'd like to see a comment here stating that FS is used for TLS. > + } > +} > +#elif defined(__i386__) && defined(__linux__) > +# include <sys/syscall.h> > +# include <asm/ldt.h> > + > +static int guest_base_flags; > +static inline void setup_guest_base_seg(void) > +{ > + struct user_desc d; > + > + memset(&d, 0, sizeof(d)); > + d.entry_number = -1; /* let the kernel choose */ > + d.base_addr = GUEST_BASE; > + d.limit = 0xfffff; /* 4GB segment */ > + d.seg_32bit = 1; > + d.limit_in_pages = 1; > + d.useable = 1; > + > + if (syscall(__NR_set_thread_area, &d) == 0) { > + asm volatile("movw %w0, %%fs" : : "r"(d.entry_number * 8 + 3)); Same here for %gs. [snip] > @@ -1945,6 +2031,14 @@ static void tcg_target_qemu_prologue(TCGContext *s) > tcg_out_pop(s, tcg_target_callee_save_regs[i]); > } > tcg_out_opc(s, OPC_RET, 0, 0, 0); > + > + /* Try to set up %fs or %gs (whichever isn't already used for TLS) > + to point to GUEST_BASE. The 1-byte segment override prefix is > + always smaller than the 4-byte offset we'd have to encode into > + the address, and is also able to handle the full 64-bit offset. */ Ah, so that's where the comment hides. Uh. Better be safe than sorry and have it in both locations, no? :) Alex