As issue #143 explains, application stacks are currently eagerly
allocated (a.k.a. pre-populated), which may lead to substantial memory waste.
We want those stacks to be lazily allocated instead, so that the physical
memory behind a stack gets allocated as needed by the standard fault handling
mechanism.

The page fault handling logic requires that both interrupts and preemption
are enabled. Therefore this patch implements a simple strategy: read a single
byte on the stack one page (4K) deeper before disabling interrupts or preemption.
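
For reference, the pre-fault itself boils down to a single read one page below
the current stack pointer; this is condensed from the inline assembly added to
arch/x64/arch.hh in the diff below:

    // Touch one byte 4096 bytes below %rsp so that, if that page of a lazily
    // allocated stack is not mapped yet, the fault is taken now, while it can
    // still be handled.
    char i;
    asm volatile("movb -4096(%%rsp), %0" : "=r"(i));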

It turns out we cannot read that single byte "blindly", because in some scenarios
interrupts may already be disabled when we are about to disable preemption,
and vice versa. Similarly, and more speculatively, disabling preemption (and
possibly interrupts) may nest.
Therefore we need a "conditional" read of that byte on the stack. More specifically,
we should ONLY read the byte IF both interrupts and preemption are enabled.

In contrast to option 1, this patch achieves this by using the already
available irq_enabled() function. Every time before we disable interrupts or
preemption, we check whether the preemption counter is 0 and interrupts are
enabled, and only then do we read a byte to potentially trigger a fault.
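
Condensed from the arch/x64/arch.hh and include/osv/sched.hh hunks below, the
guard and one of its call sites look roughly like this:

    // Only pre-fault when a page fault could actually be handled, i.e. when
    // preemption is on (counter == 0) and interrupts are enabled.
    inline void ensure_next_stack_page() {
        if (sched::preempt_counter || !irq_enabled()) {
            return;
        }
        char i;
        asm volatile("movb -4096(%%rsp), %0" : "=r"(i));
    }

    // Every "disable" path touches the next stack page first, while a fault
    // can still be taken:
    inline void irq_disable()
    {
        ensure_next_stack_page();
        processor::cli();
    }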

Please note that, performance-wise, this patch is not as "cheap" as was
possibly hoped. As a matter of fact, based on the disassembled snippets
it seems we are adding 13 extra instructions, so option 2 is even more
expensive.

As an example of the memory savings, Tomcat with ~300 threads ended up using
365MB instead of 620MB before the patch.

Fixes #143

Signed-off-by: Matthew Pabst <pabstmatt...@gmail.com>
Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com>
---
 arch/x64/arch-switch.hh |  5 +++++
 arch/x64/arch.hh        | 19 +++++++++++++++++++
 arch/x64/exceptions.cc  |  1 +
 include/osv/mmu-defs.hh |  1 +
 include/osv/sched.hh    |  1 +
 libc/mman.cc            |  7 +------
 libc/pthread.cc         |  2 +-
 7 files changed, 29 insertions(+), 7 deletions(-)

diff --git a/arch/x64/arch-switch.hh b/arch/x64/arch-switch.hh
index 6803498f..2cb21f53 100644
--- a/arch/x64/arch-switch.hh
+++ b/arch/x64/arch-switch.hh
@@ -148,6 +148,11 @@ void thread::init_stack()
     _state.rip = reinterpret_cast<void*>(thread_main);
     _state.rsp = stacktop;
     _state.exception_stack = _arch.exception_stack + sizeof(_arch.exception_stack);
+
+    // If the thread stack is lazily allocated and given that the thread initially starts
+    // running with preemption disabled, we need to pre-fault the first stack page.
+    volatile char r = *((char *) (stacktop - 1));
+    (void) r; // trick the compiler into thinking that r is used
 }
 
 void thread::setup_tcb()
diff --git a/arch/x64/arch.hh b/arch/x64/arch.hh
index 17df5f5c..51ad1d55 100644
--- a/arch/x64/arch.hh
+++ b/arch/x64/arch.hh
@@ -10,18 +10,36 @@
 
 #include "processor.hh"
 #include "msr.hh"
+#include <osv/barrier.hh>
 
 // namespace arch - architecture independent interface for architecture
 //                  dependent operations (e.g. irq_disable vs. cli)
 
+namespace sched {
+extern unsigned __thread preempt_counter;
+}
+
 namespace arch {
 
 #define CACHELINE_ALIGNED __attribute__((aligned(64)))
 #define INSTR_SIZE_MIN 1
 #define ELF_IMAGE_START OSV_KERNEL_BASE
 
+inline bool irq_enabled();
+inline void ensure_next_stack_page() {
+    //TODO: Possibly check if this is a kernel or, in general, a non-lazy stack and return
+    //if so - no need to pre-fault the stack as it should be fully pre-populated
+    if (sched::preempt_counter || !irq_enabled()) {
+        return;
+    }
+
+    char i;
+    asm volatile("movb -4096(%%rsp), %0" : "=r"(i));
+}
+
 inline void irq_disable()
 {
+    ensure_next_stack_page();
     processor::cli();
 }
 
@@ -78,6 +96,7 @@ private:
 
 inline void irq_flag_notrace::save()
 {
+    ensure_next_stack_page();
     asm volatile("sub $128, %%rsp; pushfq; popq %0; add $128, %%rsp" : 
"=r"(_rflags));
 }
 
diff --git a/arch/x64/exceptions.cc b/arch/x64/exceptions.cc
index 7c9eaf51..889ca48c 100644
--- a/arch/x64/exceptions.cc
+++ b/arch/x64/exceptions.cc
@@ -302,6 +302,7 @@ extern "C" void simd_exception(exception_frame *ef)
 
 extern "C" void nmi(exception_frame* ef)
 {
+    //TODO: Possibly needs to be handled
     while (true) {
         processor::cli_hlt();
     }
diff --git a/include/osv/mmu-defs.hh b/include/osv/mmu-defs.hh
index 18edf441..694815f8 100644
--- a/include/osv/mmu-defs.hh
+++ b/include/osv/mmu-defs.hh
@@ -84,6 +84,7 @@ enum {
     mmap_small       = 1ul << 5,
     mmap_jvm_balloon = 1ul << 6,
     mmap_file        = 1ul << 7,
+    mmap_stack       = 1ul << 8,
 };
 
 enum {
diff --git a/include/osv/sched.hh b/include/osv/sched.hh
index 0fb44f77..0471927e 100644
--- a/include/osv/sched.hh
+++ b/include/osv/sched.hh
@@ -1005,6 +1005,7 @@ inline void preempt()
 
 inline void preempt_disable()
 {
+    arch::ensure_next_stack_page();
     ++preempt_counter;
     barrier();
 }
diff --git a/libc/mman.cc b/libc/mman.cc
index d0803ac4..de7ee4dd 100644
--- a/libc/mman.cc
+++ b/libc/mman.cc
@@ -38,12 +38,7 @@ unsigned libc_flags_to_mmap(int flags)
         mmap_flags |= mmu::mmap_populate;
     }
     if (flags & MAP_STACK) {
-        // OSv currently requires that stacks be pinned (see issue #143). So
-        // if an application wants to mmap() a stack for pthread_attr_setstack
-        // and did us the courtesy of telling this to ue (via MAP_STACK),
-        // let's return the courtesy by returning pre-faulted memory.
-        // FIXME: If issue #143 is fixed, this workaround should be removed.
-        mmap_flags |= mmu::mmap_populate;
+        mmap_flags |= mmu::mmap_stack;
     }
     if (flags & MAP_SHARED) {
         mmap_flags |= mmu::mmap_shared;
diff --git a/libc/pthread.cc b/libc/pthread.cc
index 8c976bf6..93bada47 100644
--- a/libc/pthread.cc
+++ b/libc/pthread.cc
@@ -140,7 +140,7 @@ namespace pthread_private {
             return {attr.stack_begin, attr.stack_size};
         }
         size_t size = attr.stack_size;
-        void *addr = mmu::map_anon(nullptr, size, mmu::mmap_populate, mmu::perm_rw);
+        void *addr = mmu::map_anon(nullptr, size, mmu::mmap_stack, mmu::perm_rw);
         mmu::mprotect(addr, attr.guard_size, 0);
         sched::thread::stack_info si{addr, size};
         si.deleter = free_stack;
-- 
2.20.1
