This patch changes syscall stack implementation to use
mmap/munmap/mprotect combination instead of malloc/free
for large syscall stack. This makes it possible to add guard page
to detect stack overflow scenarios. Lastly new implementation
also detects if tiny stack is not overflown when executing logic
to setup large stack.

Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com>
---
 arch/x64/arch-switch.hh | 55 +++++++++++++++++++++++++++++++------------------
 1 file changed, 35 insertions(+), 20 deletions(-)

diff --git a/arch/x64/arch-switch.hh b/arch/x64/arch-switch.hh
index e376954..223e4db 100644
--- a/arch/x64/arch-switch.hh
+++ b/arch/x64/arch-switch.hh
@@ -11,6 +11,7 @@
 #include "msr.hh"
 #include <osv/barrier.hh>
 #include <string.h>
+#include <sys/mman.h>
 
 //
 // The last 16 bytes of the syscall stack are reserved for -
@@ -20,14 +21,19 @@
 //
 // The tiny stack has to be large enough to allow for execution of
 // thread::setup_large_syscall_stack() that allocates and sets up
-// large syscall stack. It was measured that as of this writing
-// setup_large_syscall_stack() needs a little over 600 bytes of stack
-// to properly operate. This makes 1024 bytes to be an adequate size
-// of tiny stack.
+// large syscall stack. Based on some rudimentary experiments 1280
+// bytes should be enough for this purpose. However given that memory
+// allocator would still allocate full page
+// (see 
https://github.com/cloudius-systems/osv/blob/a3cd022fcda2c88eae89476aa6c29e3c4be04926/core/mempool.cc#L1544-L1565),
+// we are going to use full page for it anyway.
+// In any case we put a canary value in the bottom of tiny stack
+// to at least detect stack overflow.
+//
 // All application threads pre-allocate tiny syscall stack so there
 // is a tiny penalty with this solution.
-#define TINY_SYSCALL_STACK_SIZE 1024
+#define TINY_SYSCALL_STACK_SIZE PAGE_SIZE
 #define TINY_SYSCALL_STACK_DEPTH (TINY_SYSCALL_STACK_SIZE - 
SYSCALL_STACK_RESERVED_SPACE_SIZE)
+#define TINY_SYSCALL_STACK_CANARY (0xdeadbeafdeadbeaf)
 //
 // The large syscall stack is setup and switched to on first
 // execution of SYSCALL instruction for given application thread.
@@ -188,6 +194,9 @@ void thread::setup_tcb()
         // OSv syscall stack type indicator and extra 8 bytes to make it 
16-bytes aligned
         _tcb->syscall_stack_top = tiny_syscall_stack_begin + 
TINY_SYSCALL_STACK_DEPTH;
         SET_SYSCALL_STACK_TYPE_INDICATOR(TINY_SYSCALL_STACK_INDICATOR);
+        //
+        // Set canary value at the bottom of the TINY stack
+        *((long*)(tiny_syscall_stack_begin)) = TINY_SYSCALL_STACK_CANARY;
     }
     else {
         _tcb->syscall_stack_top = 0;
@@ -199,30 +208,37 @@ void thread::setup_large_syscall_stack()
     assert(is_app());
     assert(GET_SYSCALL_STACK_TYPE_INDICATOR() == TINY_SYSCALL_STACK_INDICATOR);
     //
+    // Check canary (a little bit paranoid)
+    void* tiny_syscall_stack_begin = _tcb->syscall_stack_top - 
TINY_SYSCALL_STACK_DEPTH;
+    assert(TINY_SYSCALL_STACK_CANARY == *((unsigned 
long*)tiny_syscall_stack_begin));
+    //
     // Allocate LARGE syscall stack
-    void* large_syscall_stack_begin = malloc(LARGE_SYSCALL_STACK_SIZE);
+    void* large_syscall_stack_begin = mmap(NULL, LARGE_SYSCALL_STACK_SIZE,
+        PROT_READ|PROT_WRITE, MAP_STACK|MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
+    assert(large_syscall_stack_begin != MAP_FAILED );
     void* large_syscall_stack_top = large_syscall_stack_begin + 
LARGE_SYSCALL_STACK_DEPTH;
     //
-    // Copy all of the tiny stack to the are of last 1024 bytes of large stack.
+    // Copy all of the tiny stack to the top area of large stack.
     // This way we do not have to pop and push the same registers to be saved 
again.
     // Also the caller stack pointer is also copied.
     // We could have copied only last 128 (registers) + 16 bytes (2 fields) 
instead
-    // of all of the stack but copying 1024 is simpler and happens
+    // of all of the stack but copying entire tiny stack is simpler and happens
     // only once per thread.
     void* tiny_syscall_stack_top = _tcb->syscall_stack_top;
     memcpy(large_syscall_stack_top - TINY_SYSCALL_STACK_DEPTH,
            tiny_syscall_stack_top - TINY_SYSCALL_STACK_DEPTH, 
TINY_SYSCALL_STACK_SIZE);
     //
-    // Save beginning of tiny stack at the bottom of LARGE stack so
+    // Save beginning of tiny stack at the very top of LARGE stack so
     // that we can deallocate it in free_tiny_syscall_stack
-    *((void**)large_syscall_stack_begin) = tiny_syscall_stack_top - 
TINY_SYSCALL_STACK_DEPTH;
-    //
-    // Set canary value (0xDEADBEAFDEADBEAF) under bottom + 8 of LARGE stack
-    *((long*)(large_syscall_stack_begin + 8)) = 0xdeadbeafdeadbeaf;
+    *((void**)(large_syscall_stack_top + 8)) = tiny_syscall_stack_top - 
TINY_SYSCALL_STACK_DEPTH;
     //
     // Switch syscall stack address value in TCB to the top of the LARGE one
     _tcb->syscall_stack_top = large_syscall_stack_top;
     SET_SYSCALL_STACK_TYPE_INDICATOR(LARGE_SYSCALL_STACK_INDICATOR);
+    assert(0 == mprotect(large_syscall_stack_begin, PAGE_SIZE, PROT_NONE));
+    //
+    // Check canary of tiny stack to verify it was not overflown
+    assert(TINY_SYSCALL_STACK_CANARY == *((unsigned 
long*)tiny_syscall_stack_begin));
 }
 
 void thread::free_tiny_syscall_stack()
@@ -231,11 +247,10 @@ void thread::free_tiny_syscall_stack()
     assert(GET_SYSCALL_STACK_TYPE_INDICATOR() == 
LARGE_SYSCALL_STACK_INDICATOR);
 
     void* large_syscall_stack_top = _tcb->syscall_stack_top;
-    void* large_syscall_stack_begin = large_syscall_stack_top - 
LARGE_SYSCALL_STACK_DEPTH;
     //
     // Lookup address of tiny stack saved by setup_large_syscall_stack()
-    // at the bottom of LARGE stack (current syscall stack)
-    void* tiny_syscall_stack_begin = *((void**)large_syscall_stack_begin);
+    // at the very top of LARGE stack (current syscall stack)
+    void* tiny_syscall_stack_begin = *((void**)(large_syscall_stack_top + 8));
     free(tiny_syscall_stack_begin);
 }
 
@@ -249,10 +264,10 @@ void thread::free_tcb()
     }
 
     if (_tcb->syscall_stack_top) {
-        void* syscall_stack_begin = GET_SYSCALL_STACK_TYPE_INDICATOR() == 
TINY_SYSCALL_STACK_INDICATOR ?
-            _tcb->syscall_stack_top - TINY_SYSCALL_STACK_DEPTH :
-            _tcb->syscall_stack_top - LARGE_SYSCALL_STACK_DEPTH;
-        free(syscall_stack_begin);
+        if(GET_SYSCALL_STACK_TYPE_INDICATOR() == TINY_SYSCALL_STACK_INDICATOR)
+           free(_tcb->syscall_stack_top - TINY_SYSCALL_STACK_DEPTH);
+        else
+           assert(0 == munmap(_tcb->syscall_stack_top - 
LARGE_SYSCALL_STACK_DEPTH, LARGE_SYSCALL_STACK_SIZE));
     }
 }
 
-- 
2.7.4

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to