On 12/11/2016 01:58 AM, Nadav Har'El wrote:

On Sat, Dec 10, 2016 at 12:57 AM, Avi Kivity <a...@scylladb.com <mailto:a...@scylladb.com>> wrote:

    Since some applications are hitting the current 16 TB RAM limit (and


Really? :-)

No, it was a joke.


    the 128 TB virtual address space), extend the physical RAM limit to
    8 PB and the virtual address space to 64 PB using la57 addressing.

    TODO: compile- and run- time switch to support pre-la57 processors.
    ---

    Note: requires qemu patch to emulate la57.


Thanks Avi, this is a really interesting (and unexpected) patch.

Apparently Intel only released the white paper about 5-level page tables this month (https://software.intel.com/sites/default/files/managed/2b/80/5-level_paging_white_paper.pdf) and Linux patches were released 2 days ago, so you were pretty fast about this patch. Cool :-)

Just showing off the power of our template-based mmu instantiating the code for the fifth level automatically.


If I understand correctly, your patch will not work on processors and hypervisors that don't support 5-level paging (i.e., all of them ;-)),

Who wants to use a 4-level processor anyway.

so your TODO is pretty much mandatory before we can commit this, right? But regarding your TODO:

1. I wouldn't like a compile-time switch, because that would mean we would need to compile OSv differently for different hosts, something we tried so far to avoid. Sadly, because of all the compile-time trickery in our code, making the number of levels a run-time parameter might be tricky :-( Maybe we need two copies of the template instances?


Yes, that's the plan.

struct mmu_defs_std { /* defs for 4-level mmu */ };
struct mmu_defs_la57 { /* defs for 5-level mmu */ };

using supported_mmu_defs = std::tuple<mmu_defs_la57, mmu_defs_std>;


2. At runtime, we should have such a switch, but also enable it automatically if we know the physical memory is too big. Also, of course we need to check the cpuid for the availability of this feature (the pdf explains how).

Finally, how the heck will we test this patch? :-)

It runs on qemu (without kvm).


P.S. I'm sure the chemistry buffs are amused by the name "La57" ;-)



     arch/x64/arch-cpu.hh       |  1 +
     arch/x64/arch-mmu.hh       |  5 +++++
     arch/x64/arch-setup.cc     |  2 +-
     arch/x64/boot.S            |  8 +++++++-
     arch/x64/mmu.cc            |  4 ++--
     arch/x64/processor-flags.h |  1 +
     arch/x64/processor.hh      |  1 +
     core/mmu.cc                |  6 +++---
     include/osv/mmu-defs.hh    | 10 +++++-----
     9 files changed, 26 insertions(+), 12 deletions(-)

    diff --git a/arch/x64/arch-cpu.hh b/arch/x64/arch-cpu.hh
    index b1cb0bf..72e3e1a 100644
    --- a/arch/x64/arch-cpu.hh
    +++ b/arch/x64/arch-cpu.hh
    @@ -161,6 +161,7 @@ inline void arch_cpu::init_on_cpu()
         if (features().xsave) {
             cr4 |= cr4_osxsave;
         }
    +    cr4 |= cr4_la57;
         write_cr4(cr4);

         if (features().xsave) {
    diff --git a/arch/x64/arch-mmu.hh b/arch/x64/arch-mmu.hh
    index 2e6ea13..8dfbfcf 100644
    --- a/arch/x64/arch-mmu.hh
    +++ b/arch/x64/arch-mmu.hh
    @@ -13,6 +13,11 @@ extern uint8_t phys_bits, virt_bits;
     constexpr uint8_t rsvd_bits_used = 1;
     constexpr uint8_t max_phys_bits = 52 - rsvd_bits_used;

    +constexpr int page_table_levels() { return 5; }

    +constexpr unsigned virt_addr_bits() { return 12 + 9 *
    page_table_levels(); }
    +constexpr uintptr_t virt_addr_valid_mask() { return (uintptr_t(1)
    << virt_addr_bits()) - 1; }
    +constexpr uintptr_t virt_addr_invalid_mask() { return
    ~virt_addr_valid_mask(); }
    +
     enum class mattr {
         normal
     };
    diff --git a/arch/x64/arch-setup.cc b/arch/x64/arch-setup.cc
    index 0fabfec..7ee6c9b 100644
    --- a/arch/x64/arch-setup.cc
    +++ b/arch/x64/arch-setup.cc
    @@ -75,7 +75,7 @@ void setup_temporary_phys_map()
         auto pt = reinterpret_cast<u64*>(cr3);
         for (auto&& area : mmu::identity_mapped_areas) {
             auto base = reinterpret_cast<void*>(get_mem_area_base(area));
    -        pt[mmu::pt_index(base, 3)] = pt[0];
    +        pt[mmu::pt_index(base, 4)] = pt[0];
         }
     }

    diff --git a/arch/x64/boot.S b/arch/x64/boot.S
    index bfca8c7..8de4366 100644
    --- a/arch/x64/boot.S
    +++ b/arch/x64/boot.S
    @@ -10,6 +10,11 @@

     .data
     .align 4096
    +ident_pt_l5:
    +    .quad ident_pt_l4 + 0x67
    +    .rept 511
    +    .quad 0
    +    .endr
     ident_pt_l4:
         .quad ident_pt_l3 + 0x67
         .rept 511
    @@ -62,6 +67,7 @@ interrupt_stack_top = .
                      | X86_CR4_PAE        \
                      | X86_CR4_PGE        \
                      | X86_CR4_PCE        \
    +                 | X86_CR4_LA57       \
                      | X86_CR4_OSFXSR     \
                      | X86_CR4_OSXMMEXCPT )

    @@ -81,7 +87,7 @@ start32:
         and $~7, %esp
         mov $BOOT_CR4, %eax
         mov %eax, %cr4
    -    lea ident_pt_l4, %eax
    +    lea ident_pt_l5, %eax
         mov %eax, %cr3
         mov $0xc0000080, %ecx
         mov $0x00000900, %eax
    diff --git a/arch/x64/mmu.cc b/arch/x64/mmu.cc
    index 2f1ba5e..2ee01d3 100644
    --- a/arch/x64/mmu.cc
    +++ b/arch/x64/mmu.cc
    @@ -113,9 +113,9 @@ void flush_tlb_all()
         tlb_flush_waiter.clear();
     }

    -static pt_element<4> page_table_root
    __attribute__((init_priority((int)init_prio::pt_root)));
    +static pt_element<page_table_levels()> page_table_root
    __attribute__((init_priority((int)init_prio::pt_root)));

    -pt_element<4> *get_root_pt(uintptr_t virt __attribute__((unused))) {
    +pt_element<page_table_levels()> *get_root_pt(uintptr_t virt
    __attribute__((unused))) {
         return &page_table_root;
     }

    diff --git a/arch/x64/processor-flags.h b/arch/x64/processor-flags.h
    index 6531152..ef87c35 100644
    --- a/arch/x64/processor-flags.h
    +++ b/arch/x64/processor-flags.h
    @@ -24,6 +24,7 @@
     #define X86_CR4_PCE            (1 << 8)
     #define X86_CR4_OSFXSR         (1 << 9)
     #define X86_CR4_OSXMMEXCPT     (1 << 10)
    +#define X86_CR4_LA57        (1 << 12)
     #define X86_CR4_VMXE           (1 << 13)
     #define X86_CR4_SMXE           (1 << 14)
     #define X86_CR4_FSGSBASE       (1 << 16)
    diff --git a/arch/x64/processor.hh b/arch/x64/processor.hh
    index 250153b..5cd239a 100644
    --- a/arch/x64/processor.hh
    +++ b/arch/x64/processor.hh
    @@ -36,6 +36,7 @@ constexpr ulong cr4_pge = 1u << 7;
     constexpr ulong cr4_pce = 1u << 8;
     constexpr ulong cr4_osfxsr = 1u << 9;
     constexpr ulong cr4_osxmmexcpt = 1u << 10;
    +constexpr ulong cr4_la57 = 1u << 12;
     constexpr ulong cr4_vmxe = 1u << 13;
     constexpr ulong cr4_smxe = 1u << 14;
     constexpr ulong cr4_fsgsbase = 1u << 16;
    diff --git a/core/mmu.cc b/core/mmu.cc
    index f929412..f1b29cc 100644
    --- a/core/mmu.cc
    +++ b/core/mmu.cc
    @@ -351,8 +351,8 @@ template<typename PageOp, int ParentLevel>
    class map_level;
     template<typename PageOp>
             void map_range(uintptr_t vma_start, uintptr_t vstart,
    size_t size, PageOp& page_mapper, size_t slop = page_size)
     {
    -    map_level<PageOp, 4> pt_mapper(vma_start, vstart, size,
    page_mapper, slop);
    -    pt_mapper(hw_ptep<4>::force(mmu::get_root_pt(vstart)));
    +    map_level<PageOp, page_table_levels()> pt_mapper(vma_start,
    vstart, size, page_mapper, slop);
    +    pt_mapper(hw_ptep<page_table_levels()>::force(mmu::get_root_pt(vstart)));
     }

     template<typename PageOp, int ParentLevel> class map_level {
    @@ -425,7 +425,7 @@ private:
             auto idx = pt_index(vcur, level);
             auto eidx = pt_index(vend, level);
             base_virt += idx * step;
    -        base_virt = (int64_t(base_virt) << 16) >> 16; // extend
    47th bit
    +        base_virt = (int64_t(base_virt) << 7) >> 7; // extend
    56th bit

             do {
                 auto ptep = pt.at(idx);
    diff --git a/include/osv/mmu-defs.hh b/include/osv/mmu-defs.hh
    index 18edf44..9e75335 100644
    --- a/include/osv/mmu-defs.hh
    +++ b/include/osv/mmu-defs.hh
    @@ -42,16 +42,16 @@ constexpr mem_area identity_mapped_areas[] = {
         mem_area::mempool,
     };

    -constexpr uintptr_t mem_area_size = uintptr_t(1) << 44;
    +constexpr uintptr_t mem_area_size = uintptr_t(1) << 53;

     constexpr uintptr_t get_mem_area_base(mem_area area)
     {
    -    return 0xffff800000000000 | uintptr_t(area) << 44;
    +    return 0xff00000000000000 | uintptr_t(area) << 53;
     }

     static inline mem_area get_mem_area(void* addr)
     {
    -    return mem_area(reinterpret_cast<uintptr_t>(addr) >> 44 & 7);
    +    return mem_area(reinterpret_cast<uintptr_t>(addr) >> 53 & 7);
     }

     constexpr void* translate_mem_area(mem_area from, mem_area to,
    void* addr)
    @@ -175,7 +175,7 @@ template<int N>
     pt_element<N> make_empty_pte() { return pt_element<N>(); }

     /* get the root of the page table responsible for virtual address
    virt */
    -pt_element<4> *get_root_pt(uintptr_t virt);
    +pt_element<page_table_levels()> *get_root_pt(uintptr_t virt);

     /* take an error code coming from the exception frame, and return
        whether the error reports a page fault (insn/write) */
    @@ -230,7 +230,7 @@ using hw_ptep_base = typename std::conditional<
        The arch must implement change_perm for this class. */
     template <int N>
     class hw_ptep : public hw_ptep_base<N> {
    -    static_assert(N >= 0 && N <= 4, "Wrong hw_pte level");
    +    static_assert(N >= 0 && N <= page_table_levels(), "Wrong
    hw_pte level");
     public:
         hw_ptep(const hw_ptep& a) : hw_ptep_base<N>(a.p) {}
         hw_ptep& operator=(const hw_ptep& a) = default;
    --
    2.9.3

    --
    You received this message because you are subscribed to the Google
    Groups "OSv Development" group.
    To unsubscribe from this group and stop receiving emails from it,
    send an email to osv-dev+unsubscr...@googlegroups.com
    <mailto:osv-dev%2bunsubscr...@googlegroups.com>.
    For more options, visit https://groups.google.com/d/optout
    <https://groups.google.com/d/optout>.



--
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
For more options, visit https://groups.google.com/d/optout.

Reply via email to