From: Waldemar Kozaczuk <jwkozac...@gmail.com>
Committer: Nadav Har'El <n...@scylladb.com>
Branch: master

memory: move malloc virtual address space below 0x800000000000

This patch changes the layout of the virtual address space to make
all virtual memory fit below the 0x0000800000000000 address.

As Nadav Har'El explains in the description of the issue #1196:

"Although x86 is nominally a 64-bit address space, it didn't fully
support the entire 64 bits and doesn't even now. Rather (see a good explanation
in https://en.wikipedia.org/wiki/X86-64, "canonical form address") it only
supported 48 bits.

Moreover, all the highest bits must be copies of the bit 47. So basically
you have 47 bits (128 TB) with all highest bits 0, and another 128 TB with
the highest bits 1 - these are the 0xfffff... addresses.

So it was convenient for OSv to divide the address space with one half for mmap
and one half for malloc."

As it turns out, the virtual address space story on AArch64 is similar
(for details read https://www.kernel.org/doc/html/latest/arm64/memory.html),
where all the bits 63:48 are set to either 0 or 1. The difference from x86 is
that the 0-addresses mapping is specified by the table pointed to by the TTBR0
system register and the 1-addresses mapping by the table pointed to by the
TTBR1 register.

So the virtual-physical linear memory mapping before this patch would
look like this per the 'osv linear_mmap' gdb command:

x86_64)
           vaddr            paddr     size perm memattr name
        40200000           200000   67d434 rwxp  normal kernel
ffff800000000000                0 40000000 rwxp  normal main
ffff8000000f0000            f0000    10000 rwxp  normal dmi
ffff8000000f5a10            f5a10      17c rwxp  normal smbios
ffff800040000000         40000000 3ffdd000 rwxp  normal main
ffff80007fe00000         7fe00000   200000 rwxp  normal acpi
ffff8000feb91000         feb91000     1000 rwxp  normal pci_bar
ffff8000feb92000         feb92000     1000 rwxp  normal pci_bar
ffff8000fec00000         fec00000     1000 rwxp  normal ioapic
ffff900000000000                0 40000000 rwxp  normal page
ffff900040000000         40000000 3ffdd000 rwxp  normal page
ffffa00000000000                0 40000000 rwxp  normal mempool
ffffa00040000000         40000000 3ffdd000 rwxp  normal mempool

aarch64)
           vaddr            paddr     size perm memattr name
         8000000          8000000    10000 rwxp     dev gic_dist
         8010000          8010000    10000 rwxp     dev gic_cpu
         9000000          9000000     1000 rwxp     dev pl011
         9010000          9010000     1000 rwxp     dev pl031
        10000000         10000000 2eff0000 rwxp     dev pci_mem
        3eff0000         3eff0000    10000 rwxp     dev pci_io
       fc0000000         40000000   84e000 rwxp  normal kernel
      4010000000       4010000000 10000000 rwxp     dev pci_cfg
ffff80000a000000          a000000      200 rwxp  normal virtio_mmio_cfg
ffff80000a000200          a000200      200 rwxp  normal virtio_mmio_cfg
ffff80000a000400          a000400      200 rwxp  normal virtio_mmio_cfg
ffff80000a000600          a000600      200 rwxp  normal virtio_mmio_cfg
ffff80000a000800          a000800      200 rwxp  normal virtio_mmio_cfg
ffff80000a000a00          a000a00      200 rwxp  normal virtio_mmio_cfg
ffff80000a000c00          a000c00      200 rwxp  normal virtio_mmio_cfg
ffff80000a000e00          a000e00      200 rwxp  normal virtio_mmio_cfg
ffff80004084e000         4084e000 7f7b2000 rwxp  normal main
ffff90004084e000         4084e000 7f7b2000 rwxp  normal page
ffffa0004084e000         4084e000 7f7b2000 rwxp  normal mempool

The mappings above include the kernel code, memory-mapped devices
as well as the malloc-related areas marked by "main", "page" and
"mempool" names.

There are also mmap-related areas as indicated by this gdb example:
osv mmap
0x0000000000000000 0x0000000000000000 [0.0 kB]         flags=none     perm=none
0x0000200000000000 0x0000200000001000 [4.0 kB]         flags=p        perm=none
0x0000800000000000 0x0000800000000000 [0.0 kB]         flags=none     perm=none

Unfortunately, this virtual memory layout, while convenient,
prevents some Linux applications from running correctly on OSv.
More specifically, the RapidJSON C++ library (see
https://rapidjson.org/index.html) on x86_64
and the Java JIT compiler on AArch64 (see #1145 and #1157) use bits 63:48 to
"pack" some extra information for some optimizations and thus assume
that these bits are 0.

So this patch changes the virtual memory layout to make "malloc" areas
fall under 0x0000800000000000. In short we effectively move the areas:

ffff800000000000 - ffff8fffffffffff (main)
ffff900000000000 - ffff9fffffffffff (page)
ffffa00000000000 - ffffafffffffffff (mempool)
ffffb00000000000 - ffffbfffffffffff (debug)

to:

0000400000000000 - 00004fffffffffff (main)
0000500000000000 - 00005fffffffffff (page)
0000600000000000 - 00006fffffffffff (mempool)
0000700000000000 - 00007fffffffffff (debug)

We also squeeze the mmap area from:

0000000000000000 - 0000800000000000

to:

0000000000000000 - 0000400000000000

As a result the linear mappings after the patch look like this:

x86_64)
           vaddr            paddr     size perm memattr name
        40200000           200000   67c434 rwxp  normal kernel
    400000000000                0 40000000 rwxp  normal main
    4000000f0000            f0000    10000 rwxp  normal dmi
    4000000f5a10            f5a10      17c rwxp  normal smbios
    400040000000         40000000 3ffdd000 rwxp  normal main
    40007fe00000         7fe00000   200000 rwxp  normal acpi
    4000feb91000         feb91000     1000 rwxp  normal pci_bar
    4000feb92000         feb92000     1000 rwxp  normal pci_bar
    4000fec00000         fec00000     1000 rwxp  normal ioapic
    500000000000                0 40000000 rwxp  normal page
    500040000000         40000000 3ffdd000 rwxp  normal page
    600000000000                0 40000000 rwxp  normal mempool
    600040000000         40000000 3ffdd000 rwxp  normal mempool

aarch64)
           vaddr            paddr     size perm memattr name
         8000000          8000000    10000 rwxp     dev gic_dist
         8010000          8010000    10000 rwxp     dev gic_cpu
         9000000          9000000     1000 rwxp     dev pl011
         9010000          9010000     1000 rwxp     dev pl031
        10000000         10000000 2eff0000 rwxp     dev pci_mem
        3eff0000         3eff0000    10000 rwxp     dev pci_io
       fc0000000         40000000   7de000 rwxp  normal kernel
      4010000000       4010000000 10000000 rwxp     dev pci_cfg
    40000a000000          a000000      200 rwxp  normal virtio_mmio_cfg
    40000a000200          a000200      200 rwxp  normal virtio_mmio_cfg
    40000a000400          a000400      200 rwxp  normal virtio_mmio_cfg
    40000a000600          a000600      200 rwxp  normal virtio_mmio_cfg
    40000a000800          a000800      200 rwxp  normal virtio_mmio_cfg
    40000a000a00          a000a00      200 rwxp  normal virtio_mmio_cfg
    40000a000c00          a000c00      200 rwxp  normal virtio_mmio_cfg
    40000a000e00          a000e00      200 rwxp  normal virtio_mmio_cfg
    4000407de000         407de000 7f822000 rwxp  normal main
    5000407de000         407de000 7f822000 rwxp  normal page
    6000407de000         407de000 7f822000 rwxp  normal mempool

Fixes #1196
Fixes #1145
Fixes #1157

Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com>
Message-Id: <20221014000810.7323-1-jwkozac...@gmail.com>

---
diff --git a/arch/aarch64/arch-setup.cc b/arch/aarch64/arch-setup.cc
--- a/arch/aarch64/arch-setup.cc
+++ b/arch/aarch64/arch-setup.cc
@@ -36,12 +36,11 @@
 
 void setup_temporary_phys_map()
 {
-    // duplicate 1:1 mapping into phys_mem
+    // duplicate 1:1 mapping into the lower part of phys_mem
     u64 *pt_ttbr0 = reinterpret_cast<u64*>(processor::read_ttbr0());
-    u64 *pt_ttbr1 = reinterpret_cast<u64*>(processor::read_ttbr1());
     for (auto&& area : mmu::identity_mapped_areas) {
         auto base = reinterpret_cast<void*>(get_mem_area_base(area));
-        pt_ttbr1[mmu::pt_index(base, 3)] = pt_ttbr0[0];
+        pt_ttbr0[mmu::pt_index(base, 3)] = pt_ttbr0[0];
     }
     mmu::flush_tlb_all();
 }
diff --git a/core/mmu.cc b/core/mmu.cc
--- a/core/mmu.cc
+++ b/core/mmu.cc
@@ -78,7 +78,7 @@ class vma_compare {
 };
 
 constexpr uintptr_t lower_vma_limit = 0x0;
-constexpr uintptr_t upper_vma_limit = 0x800000000000;
+constexpr uintptr_t upper_vma_limit = 0x400000000000;
 
 typedef boost::intrusive::set<vma,
                               bi::compare<vma_compare>,
diff --git a/include/osv/mmu-defs.hh b/include/osv/mmu-defs.hh
--- a/include/osv/mmu-defs.hh
+++ b/include/osv/mmu-defs.hh
@@ -46,12 +46,12 @@ constexpr uintptr_t mem_area_size = uintptr_t(1) << 44;
 
 constexpr uintptr_t get_mem_area_base(mem_area area)
 {
-    return 0xffff800000000000 | uintptr_t(area) << 44;
+    return 0x400000000000 | uintptr_t(area) << 44;
 }
 
 static inline mem_area get_mem_area(void* addr)
 {
-    return mem_area(reinterpret_cast<uintptr_t>(addr) >> 44 & 7);
+    return mem_area(reinterpret_cast<uintptr_t>(addr) >> 44 & 3);
 }
 
 constexpr void* translate_mem_area(mem_area from, mem_area to, void* addr)
diff --git a/scripts/loader.py b/scripts/loader.py
--- a/scripts/loader.py
+++ b/scripts/loader.py
@@ -27,7 +27,7 @@ class status_enum_class(object):
     pass
 status_enum = status_enum_class()
 
-phys_mem = 0xffff800000000000
+phys_mem = 0x400000000000
 
 def pt_index(addr, level):
     return (addr >> (12 + 9 * level)) & 511

-- 
You received this message because you are subscribed to the Google Groups "OSv 
Development" group.
To unsubscribe from this group and stop receiving emails from it, send an email 
to osv-dev+unsubscr...@googlegroups.com.
To view this discussion on the web visit 
https://groups.google.com/d/msgid/osv-dev/00000000000053db2705eb22a89d%40google.com.

Reply via email to