Thanks, and sorry for all the round-trips... I have one last (I hope) issue
below (two comments, actually):


--
Nadav Har'El
n...@scylladb.com

On Tue, May 15, 2018 at 8:01 AM, Waldemar Kozaczuk <jwkozac...@gmail.com>
wrote:

> This patch implements separate syscall call stack needed
> when runtimes like Golang use SYSCALL instruction to execute
> system calls. More specifically in case of Golang tiny stacks
> used by coroutines are not deep enough to execute system
> calls on OSv which handles them as regular function calls.
> In case of OS like Linux tiny Golang stacks are not a problem as
> the system calls are executed on separate kernel stack. For example
> golang-httpserver app should now function well under load (at least 100
> concurrent requests per second using ab).
>
> More specifically each application thread pre-allocates
> "tiny" (1024-bytes deep) syscall call stack. When SYSCALL
> instruction is called first time for given thread the call stack
> is switched to the tiny syscall stack in order to set up the "large"
> stack and switch to it. Eventually execution of the syscall continues
> on large syscall stack. From this point, second and subsequent
> executions of SYSCALL instruction are handled on large syscall stack.
>
> Please note that because the stack is switched from original
> caller stack to the syscall stack that is allocated lazily
> the stack pointers "jumps" to higher addresses. From gdb
> perspective this results in "Backtrace stopped: previous
> frame inner to this frame (corrupt stack?)" message.
> Gdb assumes that stack pointer should decrease in value as the stack
> "grows"
> which is not the case when stack is switched to syscall stack.
>
> This patch is based on the original patch provided by @Hawxchen.
>
> Fixes #808
>
> Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com>
> ---
>  arch/x64/arch-switch.hh | 108 ++++++++++++++++++++++++++++++
> ++++++++++++++++++
>  arch/x64/arch-tls.hh    |  19 +++++++++
>  arch/x64/entry.S        |  99 ++++++++++++++++++++++++++++++
> +-------------
>  include/osv/sched.hh    |   4 +-
>  4 files changed, 201 insertions(+), 29 deletions(-)
>
> diff --git a/arch/x64/arch-switch.hh b/arch/x64/arch-switch.hh
> index d1a039a..e376954 100644
> --- a/arch/x64/arch-switch.hh
> +++ b/arch/x64/arch-switch.hh
> @@ -12,6 +12,37 @@
>  #include <osv/barrier.hh>
>  #include <string.h>
>
> +//
> +// The last 16 bytes of the syscall stack are reserved for -
> +// tiny/large indicator and extra 8 bytes to make it 16 bytes aligned
> +// as Linux x64 ABI mandates.
> +#define SYSCALL_STACK_RESERVED_SPACE_SIZE (2 * 8)
> +//
> +// The tiny stack has to be large enough to allow for execution of
> +// thread::setup_large_syscall_stack() that allocates and sets up
> +// large syscall stack. It was measured that as of this writing
> +// setup_large_syscall_stack() needs a little over 600 bytes of stack
> +// to properly operate. This makes 1024 bytes to be an adequate size
> +// of tiny stack.
> +// All application threads pre-allocate tiny syscall stack so there
> +// is a tiny penalty with this solution.
> +#define TINY_SYSCALL_STACK_SIZE 1024
> +#define TINY_SYSCALL_STACK_DEPTH (TINY_SYSCALL_STACK_SIZE -
> SYSCALL_STACK_RESERVED_SPACE_SIZE)
> +//
> +// The large syscall stack is setup and switched to on first
> +// execution of SYSCALL instruction for given application thread.
> +#define LARGE_SYSCALL_STACK_SIZE (16 * PAGE_SIZE)
> +#define LARGE_SYSCALL_STACK_DEPTH (LARGE_SYSCALL_STACK_SIZE -
> SYSCALL_STACK_RESERVED_SPACE_SIZE)
> +
> +#define SET_SYSCALL_STACK_TYPE_INDICATOR(value) \
> +*((long*)(_tcb->syscall_stack_top)) = value;
> +
> +#define GET_SYSCALL_STACK_TYPE_INDICATOR() \
> +*((long*)(_tcb->syscall_stack_top))
> +
> +#define TINY_SYSCALL_STACK_INDICATOR 0l
> +#define LARGE_SYSCALL_STACK_INDICATOR 1l
> +
>  extern "C" {
>  void thread_main(void);
>  void thread_main_c(sched::thread* t);
> @@ -146,6 +177,66 @@ void thread::setup_tcb()
>      _tcb = static_cast<thread_control_block*>(p + tls.size +
> user_tls_size);
>      _tcb->self = _tcb;
>      _tcb->tls_base = p + user_tls_size;
> +
> +    if (is_app()) {
> +        //
> +        // Allocate TINY syscall call stack
> +        void* tiny_syscall_stack_begin = malloc(TINY_SYSCALL_STACK_SIZE);
> +        assert(tiny_syscall_stack_begin);
> +        //
> +        // The top of the stack needs to be 16 bytes lower to make space
> for
> +        // OSv syscall stack type indicator and extra 8 bytes to make it
> 16-bytes aligned
> +        _tcb->syscall_stack_top = tiny_syscall_stack_begin +
> TINY_SYSCALL_STACK_DEPTH;
> +        SET_SYSCALL_STACK_TYPE_INDICATOR(TINY_SYSCALL_STACK_INDICATOR);
> +    }
> +    else {
> +        _tcb->syscall_stack_top = 0;
> +    }
> +}
> +
> +void thread::setup_large_syscall_stack()
> +{
> +    assert(is_app());
> +    assert(GET_SYSCALL_STACK_TYPE_INDICATOR() ==
> TINY_SYSCALL_STACK_INDICATOR);
> +    //
> +    // Allocate LARGE syscall stack
> +    void* large_syscall_stack_begin = malloc(LARGE_SYSCALL_STACK_SIZE);
> +    void* large_syscall_stack_top = large_syscall_stack_begin +
> LARGE_SYSCALL_STACK_DEPTH;
> +    //
> +    // Copy all of the tiny stack to the area of the last 1024 bytes of large
> stack.
> +    // This way we do not have to pop and push the same registers to be
> saved again.
> +    // Also the caller stack pointer is also copied.
> +    // We could have copied only last 128 (registers) + 16 bytes (2
> fields) instead
> +    // of all of the stack but copying 1024 is simpler and happens
> +    // only once per thread.
> +    void* tiny_syscall_stack_top = _tcb->syscall_stack_top;
> +    memcpy(large_syscall_stack_top - TINY_SYSCALL_STACK_DEPTH,
> +           tiny_syscall_stack_top - TINY_SYSCALL_STACK_DEPTH,
> TINY_SYSCALL_STACK_SIZE);
> +    //
> +    // Save beginning of tiny stack at the bottom of LARGE stack so
> +    // that we can deallocate it in free_tiny_syscall_stack
> +    *((void**)large_syscall_stack_begin) = tiny_syscall_stack_top -
> TINY_SYSCALL_STACK_DEPTH;
> +    //
> +    // Set canary value (0xDEADBEAFDEADBEAF) under bottom + 8 of LARGE
> stack
> +    *((long*)(large_syscall_stack_begin + 8)) = 0xdeadbeafdeadbeaf;
> +    //
> +    // Switch syscall stack address value in TCB to the top of the LARGE
> one
> +    _tcb->syscall_stack_top = large_syscall_stack_top;
> +    SET_SYSCALL_STACK_TYPE_INDICATOR(LARGE_SYSCALL_STACK_INDICATOR);
> +}
> +
> +void thread::free_tiny_syscall_stack()
> +{
> +    assert(is_app());
> +    assert(GET_SYSCALL_STACK_TYPE_INDICATOR() ==
> LARGE_SYSCALL_STACK_INDICATOR);
> +
> +    void* large_syscall_stack_top = _tcb->syscall_stack_top;
> +    void* large_syscall_stack_begin = large_syscall_stack_top -
> LARGE_SYSCALL_STACK_DEPTH;
> +    //
> +    // Lookup address of tiny stack saved by setup_large_syscall_stack()
> +    // at the bottom of LARGE stack (current syscall stack)
> +    void* tiny_syscall_stack_begin = *((void**)large_syscall_stack_
> begin);
> +    free(tiny_syscall_stack_begin);
>  }
>
>  void thread::free_tcb()
> @@ -156,6 +247,13 @@ void thread::free_tcb()
>      } else {
>          free(_tcb->tls_base);
>      }
> +
> +    if (_tcb->syscall_stack_top) {
> +        void* syscall_stack_begin = GET_SYSCALL_STACK_TYPE_INDICATOR()
> == TINY_SYSCALL_STACK_INDICATOR ?
> +            _tcb->syscall_stack_top - TINY_SYSCALL_STACK_DEPTH :
> +            _tcb->syscall_stack_top - LARGE_SYSCALL_STACK_DEPTH;
> +        free(syscall_stack_begin);
> +    }
>  }
>
>  void thread_main_c(thread* t)
> @@ -171,6 +269,16 @@ void thread_main_c(thread* t)
>      t->complete();
>  }
>
> +extern "C" void setup_large_syscall_stack()
> +{
> +    sched::thread::current()->setup_large_syscall_stack();
> +}
> +
> +extern "C" void free_tiny_syscall_stack()
> +{
> +    sched::thread::current()->free_tiny_syscall_stack();
> +}
> +
>  }
>
>  #endif /* ARCH_SWITCH_HH_ */
> diff --git a/arch/x64/arch-tls.hh b/arch/x64/arch-tls.hh
> index 1bf86fd..82871d6 100644
> --- a/arch/x64/arch-tls.hh
> +++ b/arch/x64/arch-tls.hh
> @@ -8,9 +8,28 @@
>  #ifndef ARCH_TLS_HH
>  #define ARCH_TLS_HH
>
> +// Don't change the declaration sequence of all existing members'.
> +// Please add new members from the last.
>  struct thread_control_block {
>      thread_control_block* self;
>      void* tls_base;
> +    //
> +    // This field, a per-thread stack for SYSCALL instruction, is  used in
> +    // arch/x64/entry.S for %fs's offset.  We currently keep this field
> in the TCB
> +    // to make it easier to access in assembly code through a known
> offset at %fs:16.
> +    // But with more effort, we could have used an ordinary thread-local
> variable
> +    // instead and avoided extending the TCB here.
> +    //
> +    // The 8 bytes at the top of the syscall stack are used to identify if
> +    // the stack is tiny (0) or large (1). So the size of the syscall
> stack is in
> +    // reality smaller by 16 bytes from what was originally allocated
> because we need
> +    // to make it 16-bytes aligned.
> +    void* syscall_stack_top;
> +    //
> +    // This field is used to store the syscall caller stack pointer
> (value of RSP when
> +    // SYSCALL was called) so that it can be restored when syscall
> completed.
> +    // Same as above this field could be an ordinary thread-local
> variable.
> +    void* syscall_caller_stack_pointer;
>  };
>
>  #endif /* ARCH_TLS_HH */
> diff --git a/arch/x64/entry.S b/arch/x64/entry.S
> index 04d809d..3cbad68 100644
> --- a/arch/x64/entry.S
> +++ b/arch/x64/entry.S
> @@ -169,27 +169,69 @@ syscall_entry:
>      .cfi_register rip, rcx # rcx took previous rip value
>      .cfi_register rflags, r11 # r11 took previous rflags value
>      # There is no ring transition and rflags are left unchanged.
> -
> -    # Skip the "red zone" allowed by the AMD64 ABI (the caller used a
> -    # SYSCALL instruction and doesn't know he called a function):
> -    subq $128, %rsp
> -
> -    # Align the stack to 16 bytes. We align it now because of limitations
> of
> -    # the CFI language, but need to ensure it is still aligned before we
> call
> -    # syscall_wrapper(), so must ensure that the number of pushes below
> are
> -    # even.
> -    # An additional complication is that we need to restore %rsp later
> without
> -    # knowing how it was previously aligned. In the following trick,
> without
> -    # using an additional register, the two pushes leave the stack with
> the
> -    # same alignment it had originally, and a copy of the original %rsp at
> -    # (%rsp) and 8(%rsp). The andq then aligns the stack - if it was
> already
> -    # 16 byte aligned nothing changes, if it was 8 byte aligned then it
> -    # subtracts 8 from %rsp, meaning that the original %rsp is now at
> 8(%rsp)
> -    # and 16(%rsp). In both cases we can restore it below from 8(%rsp).
> -    pushq %rsp
> -    pushq (%rsp)
> -    andq $-16, %rsp
> -
> +    #
> +    # Unfortunately the mov instruction cannot be used to dereference an
> address
> +    # on syscall stack pointed by address in TCB (%fs:16) - double memory
> dereference.
> +    # Therefore we are forced to save caller stack address in a field in
> TCB.
> +    movq %rsp, %fs:24 # syscall_caller_stack_pointer
> +    #
> +    # Switch stack to "tiny" syscall stack that should be large
> +    # enough to setup "large" syscall stack (only when first SYSCALL on
> this thread)
> +    movq %fs:16, %rsp
> +
> +    # Skip large syscall stack setup if it has been already setup
> +    cmpq $0, (%rsp)  // Check if we are on tiny or large stack
> +    jne large_stack_has_been_setup
> +
> +    # We are on small stack
> +    # Save all registers
> +    pushq %rcx
> +    pushq %rcx
> +    pushq %rbp
> +    pushq %rbx
> +    pushq %rax
> +    pushq %rdx
> +    pushq %rsi
> +    pushq %rdi
> +    pushq %r8
> +    pushq %r9
> +    pushq %r10
> +    pushq %r11 # contains rflags before syscall instruction
> +    pushq %r12
> +    pushq %r13
> +    pushq %r14
> +    pushq %r15
> +    # Please note we pushed rcx twice to make stack 16-bytes aligned
> +
> +    # Call setup_large_syscall_stack to setup large call stack
> +    # This function does not take any arguments nor returns anything.
> +    # It ends up allocating large stack and storing its address in tcb
> +    callq setup_large_syscall_stack
> +    movq %fs:16, %rsp  // Switch stack to large stack
> +    subq $128, %rsp    // Skip 128 bytes of large stack so that we can
> restore all registers saved above (16 pushes).
> +                       // Please note that these 128 bytes have been
> copied by setup_large_syscall_stack function
> +                       // so that we do not have to pop and then push
> same registers again.
> +    callq free_tiny_syscall_stack
> +
> +    # Restore all registers
> +    popq %r15
> +    popq %r14
> +    popq %r13
> +    popq %r12
> +    popq %r11
> +    popq %r10
> +    popq %r9
> +    popq %r8
> +    popq %rdi
> +    popq %rsi
> +    popq %rdx
> +    popq %rax
> +    popq %rbx
> +    popq %rbp
> +    popq %rcx
> +    popq %rcx
> +
> +large_stack_has_been_setup:
>      .cfi_def_cfa %rsp, 0
>
>      # We need to save and restore the caller's %rbp anyway, so let's also
> @@ -204,9 +246,12 @@ syscall_entry:
>      # We do this just so we can refer to it with CFI and help gdb's DWARF
>      # stack unwinding. This saving not otherwise needed for correct
> operation
>      # (we anyway restore it below by undoing all our modifications).
> -    movq 24(%rsp), %rbp
> -    addq $128, %rbp
> -    pushq %rbp
> +    #
> +    # The caller stack rsp was saved at the address of syscall stack
> pointer.
>

Wait, do we now? This happened in the previous patch, but now you save
it in %fs:24, not in 16(%rsp), no? (If you change the push code below, also
don't forget to change the comment you added here. Or just remove the new
part - my original comment already explained that we are saving the
caller's rsp.)

To really test this, you will need to try gdb (as I wrote in the comment
above,
the only effect this code has is on GDB - it's not needed for correctness).

+    # Since we pushed twice since switching rsp to syscall stack pointer
> +    # we simply have to dereference the value of the caller stack like
> below.
> +    pushq 16(%rsp)
> +
>      .cfi_adjust_cfa_offset 8

     .cfi_rel_offset %rsp, 0
>

> @@ -275,16 +320,14 @@ syscall_entry:
>      popq_cfi %rbp
>      popq_cfi %rcx
>
> -    movq 8(%rsp), %rsp # undo alignment (as explained above)
> -
>      # restore rflags
>      # push the rflag state syscall saved in r11 to the stack
>      pushq %r11
>      # pop the stack value in flag register
>      popfq
>
> -    #undo red-zone skip without altering restored flags
> -    lea 128(%rsp), %rsp
> +    # Restore caller stack pointer
> +    movq %fs:(24), %rsp
>

What do these parentheses do?

>
>      # jump to rcx where the syscall instruction put rip
>      # (sysret would leave rxc cloberred so we have nothing to do to
> restore it)
> diff --git a/include/osv/sched.hh b/include/osv/sched.hh
> index dada8f5..033dfae 100644
> --- a/include/osv/sched.hh
> +++ b/include/osv/sched.hh
> @@ -591,7 +591,9 @@ public:
>        * wait queue for) a crucial mutex (e.g., for I/O or memory
> allocation),
>        * which could cause the whole system to block. So use at your own
> peril.
>        */
> -     bool unsafe_stop();
> +    bool unsafe_stop();
> +    void setup_large_syscall_stack();
> +    void free_tiny_syscall_stack();
>  private:
>      static void wake_impl(detached_state* st,
>              unsigned allowed_initial_states_mask = 1 <<
> unsigned(status::waiting));
> --
> 2.7.4
>
> --
> You received this message because you are subscribed to the Google Groups
> "OSv Development" group.
> To unsubscribe from this group and stop receiving emails from it, send an
> email to osv-dev+unsubscr...@googlegroups.com.
> For more options, visit https://groups.google.com/d/optout.
>

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to