This patch fixes ZFS support on aarch64. As issue #1131 explains,
the ZFS page scanner logic clears the access flag in the PTEs of the
relevant memory-mapped chunks of files. On Intel, the CPU automatically
sets the flag again on the first access (read or write) to those pages.
On ARM, however, this may need to be done in software if the CPU cannot
update the access flag in hardware (the RPI 4 and Odroid setups I have
been using cannot, possibly due to a QEMU limitation).
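
For reference, whether the CPU can update the access flag (and the
dirty state) in hardware is advertised by the HAFDBS field of the
ID_AA64MMFR1_EL1 register. A minimal sketch of such a probe (not part
of this patch; the helper name is made up for illustration):

    #include <cstdint>

    // HAFDBS, bits [3:0] of ID_AA64MMFR1_EL1: 0 = no hardware updates,
    // 1 = hardware update of the access flag, 2 = of the dirty state too
    static inline bool cpu_manages_access_flag()
    {
        std::uint64_t mmfr1;
        asm volatile("mrs %0, id_aa64mmfr1_el1" : "=r"(mmfr1));
        return (mmfr1 & 0xf) != 0;
    }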

So, to set the access flag in software, this patch enhances the page
fault handler to detect whether a fault is an access-flag fault and,
if so, to perform a manual page walk all the way down to the leaf PTE
based on the virtual address retrieved from far_el1. It then sets the
access flag of that PTE, and the dirty flag as well if the fault was
triggered by a write. It concludes by writing the PTE back to memory
and issuing the necessary `dsb ishst` barrier to force completion of
the writes to the page table entries and flush the CPU pipeline.
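
The detection relies on the ESR_EL1 syndrome of the data abort: the
DFSC field (bits [5:0]) holds 0b001011 (0xb) for an access flag fault
at translation level 3, and the WnR bit (bit 6) is set when the abort
was caused by a write. A standalone sketch of that decoding (the helper
names are illustrative, not from this patch):

    #include <cstdint>

    static inline bool is_access_flag_fault_l3(std::uint64_t esr)
    {
        return (esr & 0x3f) == 0x0b; // DFSC: access flag fault, level 3
    }

    static inline bool is_write_fault(std::uint64_t esr)
    {
        return (esr >> 6) & 1;       // WnR: abort caused by a write
    }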

Finally, this patch adjusts `scripts/build` to support building ZFS
images on aarch64 and makes ZFS the default filesystem, as on x86_64.
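
For example, assuming the usual key=value argument syntax of
scripts/build, an aarch64 ZFS image can now be built with:

    ./scripts/build arch=aarch64 fs=zfs image=native-example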

Besides running all unit tests on a ZFS image, I have also verified
that more involved tests like misc-zfs-io.cc pass as well.

Fixes #1131

Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com>
---
 arch/aarch64/mmu.cc        | 57 ++++++++++++++++++++++++++++++++++++++
 scripts/build              | 20 ++++++-------
 scripts/upload_manifest.py | 10 +++++--
 3 files changed, 74 insertions(+), 13 deletions(-)

diff --git a/arch/aarch64/mmu.cc b/arch/aarch64/mmu.cc
index ccf40667..aff7cc61 100644
--- a/arch/aarch64/mmu.cc
+++ b/arch/aarch64/mmu.cc
@@ -14,6 +14,59 @@
 #include "arch-cpu.hh"
 #include "exceptions.hh"
 
+#define ACCESS_FLAG_FAULT_LEVEL_3(esr)            ((esr & 0b0111111) == 0x0b) // DFSC 0xb = 0b1011: access flag fault, level 3
+#define ACCESS_FLAG_FAULT_LEVEL_3_WHEN_WRITE(esr) ((esr & 0b1111111) == 0x4b) // as above, with WnR (bit 6) set: caused by a write
+
+TRACEPOINT(trace_mmu_vm_access_flag_fault, "addr=%p", void *);
+
+template <typename T>
+T* phys_to_virt_cast(mmu::phys pa)
+{
+    void *virt = mmu::phys_mem + pa;
+    return static_cast<T*>(virt);
+}
+
+static void handle_access_flag_fault(exception_frame *ef, u64 addr) {
+    trace_mmu_vm_access_flag_fault((void*)addr);
+
+    // The access bit of a PTE (Page Table Entry) at level 3 got cleared and we
+    // need to set it to handle this page fault. Therefore we do a page walk
+    // to navigate down to level 3 and identify the relevant PTE.
+
+    // Start with root PTE
+    auto root_pt = mmu::get_root_pt(addr);
+    auto root_ptep = mmu::hw_ptep<4>::force(root_pt);
+
+    // Identify the PTEP (PTE Pointer) at ARM level 0 (the template parameter counts from the leaf, so it is reversed)
+    // First identify the ptep table at this level
+    auto l3_ptep_table = mmu::hw_ptep<3>::force(phys_to_virt_cast<mmu::pt_element<3>>(root_ptep.read().next_pt_addr()));
+    // Then access the ptep at the index encoded in the virtual address
+    auto l3_ptep = l3_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 3));
+
+    // Identify PTEP at level 1 (first identify the ptep table and then the relevant ptep)
+    auto l2_ptep_table = mmu::hw_ptep<2>::force(phys_to_virt_cast<mmu::pt_element<2>>(l3_ptep.read().next_pt_addr()));
+    auto l2_ptep = l2_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 2));
+
+    // Identify PTEP at level 2 (first identify the ptep table and then the relevant ptep)
+    auto l1_ptep_table = mmu::hw_ptep<1>::force(phys_to_virt_cast<mmu::pt_element<1>>(l2_ptep.read().next_pt_addr()));
+    auto l1_ptep = l1_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 1));
+
+    // Identify PTEP at level 3 (first identify the ptep table and then the relevant ptep)
+    auto l0_ptep_table = mmu::hw_ptep<0>::force(phys_to_virt_cast<mmu::pt_element<0>>(l1_ptep.read().next_pt_addr()));
+    auto l0_ptep = l0_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 0));
+
+    // Read leaf PTE
+    auto leaf_pte = l0_ptep.read();
+
+    leaf_pte.set_accessed(true);
+    if (ACCESS_FLAG_FAULT_LEVEL_3_WHEN_WRITE(ef->esr)) { // mark dirty only on a write fault
+        leaf_pte.set_dirty(true);
+    }
+
+    l0_ptep.write(leaf_pte);
+    mmu::synchronize_page_table_modifications(); // dsb ishst: force completion of the PTE write
+}
+
 void page_fault(exception_frame *ef)
 {
     sched::fpu_lock fpu;
@@ -39,6 +92,10 @@ void page_fault(exception_frame *ef)
         abort("trying to execute null pointer");
     }
 
+    if (ACCESS_FLAG_FAULT_LEVEL_3(ef->esr)) {
+        return handle_access_flag_fault(ef, addr);
+    }
+
     /* vm_fault might sleep, so check that the thread is preemptable,
      * and that interrupts in the saved pstate are enabled.
      * Then enable interrupts for the vm_fault.
diff --git a/scripts/build b/scripts/build
index ffae67b3..38aa70d5 100755
--- a/scripts/build
+++ b/scripts/build
@@ -190,15 +190,7 @@ host_arch=$(uname -m)
 
 # Default manifest
 manifest=bootfs.manifest.skel
-if [[ "$host_arch" == "aarch64" || "$arch" == "aarch64" ]]; then
-       # We default to ROFS as ZFS is not supported on ARM until the issue #1131 is fixed
-       fs_type=${vars[fs]-rofs}
-       if [[ "$fs_type" == "rofs" ]]; then
-               vars[create_disk]="true"
-       fi
-else
-       fs_type=${vars[fs]-zfs}
-fi
+fs_type=${vars[fs]-zfs}
 usrskel_arg=
 case $fs_type in
 zfs)
@@ -215,6 +207,10 @@ ramfs)
        exit 2
 esac
 
+if [[ "$host_arch" == "aarch64" || "$arch" == "aarch64" ]]; then
+       vars[create_disk]="true"
+fi
+
 if test -n "${vars[usrskel]}"
 then
        # Override default skel
@@ -305,7 +301,9 @@ if [[ ${vars[create_disk]} == "true" ]]; then
        bare="$SRC"/scripts/disk.bin
        raw_disk=disk
        qcow2_disk=disk
-       upload_kernel_mode="-k"
+       if [[ "$arch" == 'x64' ]]; then
+               upload_kernel_mode="-k"
+       fi
 else
        partition_offset=$kernel_end
        bare=loader.img
@@ -318,7 +316,7 @@ create_zfs_disk() {
        "$SRC"/scripts/imgedit.py setpartition "-f raw ${raw_disk}.raw" 2 
$partition_offset $partition_size
        qemu-img convert -f raw -O qcow2 $raw_disk.raw $qcow2_disk.img
        qemu-img resize $qcow2_disk.img ${image_size}b >/dev/null 2>&1
-       "$SRC"/scripts/upload_manifest.py -o $qcow2_disk.img -m usr.manifest -D libgcc_s_dir="$libgcc_s_dir" $upload_kernel_mode
+       "$SRC"/scripts/upload_manifest.py --arch=$arch -o $qcow2_disk.img -m usr.manifest -D libgcc_s_dir="$libgcc_s_dir" $upload_kernel_mode
 }
 
 create_rofs_disk() {
diff --git a/scripts/upload_manifest.py b/scripts/upload_manifest.py
index 97c59e5b..aa144837 100755
--- a/scripts/upload_manifest.py
+++ b/scripts/upload_manifest.py
@@ -7,6 +7,8 @@ from contextlib import closing
 import io
 StringIO = io.StringIO
 
+host_arch = os.uname().machine
+
 def find_free_port():
     with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s:
         s.bind(('localhost', 0))
@@ -137,7 +139,11 @@ def main():
             make_option('-k',
                         dest='kernel',
                         action='store_true',
-                        help='run OSv in direct kernel mode')
+                        help='run OSv in direct kernel mode'),
+            make_option('--arch',
+                        dest='arch',
+                        default=host_arch,
+                        help="specify QEMU architecture: x86_64, aarch64")
     ])
 
     (options, args) = opt.parse_args()
@@ -155,7 +161,7 @@ def main():
         kernel_mode_flag = '-k --kernel-path build/release/loader-stripped.elf'
     else:
         kernel_mode_flag = ''
-    osv = subprocess.Popen('cd ../..; scripts/run.py %s --vnc none -m 512 -c1 -i "%s" --block-device-cache unsafe -s -e "--norandom --nomount --noinit /tools/mkfs.so; /tools/cpiod.so --prefix /zfs/zfs/; /zfs.so set compression=off osv" --forward tcp:127.0.0.1:%s-:10000' % (kernel_mode_flag,image_path,upload_port), shell=True, stdout=subprocess.PIPE)
+    osv = subprocess.Popen('cd ../..; scripts/run.py %s --arch=%s --vnc none -m 512 -c1 -i "%s" --block-device-cache unsafe -s -e "--norandom --nomount --noinit /tools/mkfs.so; /tools/cpiod.so --prefix /zfs/zfs/; /zfs.so set compression=off osv" --forward tcp:127.0.0.1:%s-:10000' % (kernel_mode_flag,options.arch,image_path,upload_port), shell=True, stdout=subprocess.PIPE)
 
     upload(osv, manifest, depends, upload_port)
 
-- 
2.27.0
