On 04.06.2010, at 02:35, Richard Henderson wrote:

> For 32-bit, using a segment override is smaller than the 4-byte
> immediate offset.  For 64-bit, segments can hold the entire 64-bit
> offset whereas the 4-byte immediate cannot.

Very nice idea indeed :). Have you found it to be faster? IIRC segment accesses 
are slower when seg_offs != 0. But then again the code is smaller, so that might 
outweigh the penalty.

> Only implemented for linux, with fallback to the immediate offset
> if the system call fails.
> 
> Signed-off-by: Richard Henderson <r...@twiddle.net>
> ---
> tcg/i386/tcg-target.c |  206 +++++++++++++++++++++++++++++++++++-------------
> 1 files changed, 150 insertions(+), 56 deletions(-)
> 
> diff --git a/tcg/i386/tcg-target.c b/tcg/i386/tcg-target.c
> index fab2a30..e34254f 100644
> --- a/tcg/i386/tcg-target.c
> +++ b/tcg/i386/tcg-target.c
> @@ -240,6 +240,8 @@ static inline int tcg_target_const_match(tcg_target_long 
> val,
> # define P_REXB_R     0
> # define P_REXB_RM    0
> #endif
> +#define P_FS         0x4000
> +#define P_GS         0x8000
> 
> #define OPC_ARITH_EvIz        (0x81)
> #define OPC_ARITH_EvIb        (0x83)
> @@ -347,11 +349,29 @@ static const uint8_t tcg_cond_to_jcc[10] = {
>     [TCG_COND_GTU] = JCC_JA,
> };
> 
> +static inline void tcg_out_seg_prefix(TCGContext *s, int opc)
> +{
> +    switch (opc & (P_FS | P_GS)) {
> +    case 0:
> +        break;
> +    case P_FS:
> +        tcg_out8(s, 0x64);
> +        break;
> +    case P_GS:
> +        tcg_out8(s, 0x65);
> +        break;
> +    default:
> +        tcg_abort();
> +    }
> +}
> +
> #if TCG_TARGET_REG_BITS == 64
> static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
> {
>     int rex;
> 
> +    tcg_out_seg_prefix(s, opc);
> +
>     if (opc & P_DATA16) {
>         /* We should never be asking for both 16 and 64-bit operation.  */
>         assert((opc & P_REXW) == 0);
> @@ -387,6 +407,8 @@ static void tcg_out_opc(TCGContext *s, int opc, int r, 
> int rm, int x)
> #else
> static void tcg_out_opc(TCGContext *s, int opc)
> {
> +    tcg_out_seg_prefix(s, opc);
> +
>     if (opc & P_DATA16) {
>         tcg_out8(s, 0x66);
>     }
> @@ -956,6 +978,48 @@ static void tcg_out_jmp(TCGContext *s, tcg_target_long 
> dest)
>     tcg_out_branch(s, 0, dest);
> }
> 
> +#ifndef GUEST_BASE
> +#define GUEST_BASE 0
> +#endif
> +
> +#if defined(__x86_64__) && defined(__linux__)
> +# include <sys/syscall.h>
> +# include <asm/prctl.h>
> +
> +static int guest_base_flags;
> +static inline void setup_guest_base_seg(void)
> +{
> +    if (syscall(__NR_arch_prctl, ARCH_SET_GS, GUEST_BASE) == 0) {
> +        guest_base_flags = P_GS;

I'd like to see a comment here stating that FS is used for TLS.

> +    }
> +}
> +#elif defined(__i386__) && defined(__linux__)
> +# include <sys/syscall.h>
> +# include <asm/ldt.h>
> +
> +static int guest_base_flags;
> +static inline void setup_guest_base_seg(void)
> +{
> +    struct user_desc d;
> +
> +    memset(&d, 0, sizeof(d));
> +    d.entry_number = -1;                /* let the kernel choose */
> +    d.base_addr = GUEST_BASE;
> +    d.limit = 0xfffff;                  /* 4GB segment */
> +    d.seg_32bit = 1;
> +    d.limit_in_pages = 1;
> +    d.useable = 1;
> +
> +    if (syscall(__NR_set_thread_area, &d) == 0) {
> +        asm volatile("movw %w0, %%fs" : : "r"(d.entry_number * 8 + 3));

Same here for %gs.

[snip]

> @@ -1945,6 +2031,14 @@ static void tcg_target_qemu_prologue(TCGContext *s)
>         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
>     }
>     tcg_out_opc(s, OPC_RET, 0, 0, 0);
> +
> +    /* Try to set up %fs or %gs (whichever isn't already used for TLS)
> +       to point to GUEST_BASE.  The 1-byte segment override prefix is
> +       always smaller than the 4-byte offset we'd have to encode into
> +       the address, and is also able to handle the full 64-bit offset.  */

Ah, so that's where the comment hides. Uh. Better be safe than sorry and have 
it in both locations, no? :)

Alex


Reply via email to