This patch fixes ZFS support on aarch64. As issue #1131 explains, the ZFS page scanner logic clears the access flag of the PTEs for the relevant memory-mapped chunks of files. On Intel, the CPU automatically sets the flag on first access (read or write) to those pages of memory. But on ARM this may need to be done in software if the CPU does not have that capability (it does not on the RPI 4 and Odroid boards I have been using, possibly due to a QEMU limitation).
So to set the access flags in software, this patch enhances the page fault handler to detect whether the relevant fault is access-flag related, and does a manual page walk to navigate all the way down to the leaf PTE based on the virtual memory address retrieved from far_el1. Then it sets the access flag of the PTE, and the dirty flag if the fault was triggered by a write. Eventually it writes the PTE back to memory and issues the necessary `dsb ishst` to force completion of writes to page table entries and flush the CPU pipeline. Finally, this patch adjusts `scripts/build` to support building ZFS images on ARM and makes ZFS the default filesystem, as on x86_64. Besides running all unit tests on a ZFS image, I have also verified that more involved tests like misc-zfs-io.cc work as well. Fixes #1131 Signed-off-by: Waldemar Kozaczuk <jwkozac...@gmail.com> --- arch/aarch64/mmu.cc | 57 ++++++++++++++++++++++++++++++++++++++ scripts/build | 20 ++++++------- scripts/upload_manifest.py | 10 +++++-- 3 files changed, 74 insertions(+), 13 deletions(-) diff --git a/arch/aarch64/mmu.cc b/arch/aarch64/mmu.cc index ccf40667..aff7cc61 100644 --- a/arch/aarch64/mmu.cc +++ b/arch/aarch64/mmu.cc @@ -14,6 +14,59 @@ #include "arch-cpu.hh" #include "exceptions.hh" +#define ACCESS_FLAG_FAULT_LEVEL_3(esr) ((esr & 0b0111111) == 0x0b) // 0xb = 0b1011 indicates level 3 +#define ACCESS_FLAG_FAULT_LEVEL_3_WHEN_WRITE(esr) ((esr & 0b1111111) == 0x4b) + +TRACEPOINT(trace_mmu_vm_access_flag_fault, "addr=%p", void *); + +template <typename T> +T* phys_to_virt_cast(mmu::phys pa) +{ + void *virt = mmu::phys_mem + pa; + return static_cast<T*>(virt); +} + +static void handle_access_flag_fault(exception_frame *ef, u64 addr) { + trace_mmu_vm_access_flag_fault((void*)addr); + + // The access bit of a PTE (Page Table Entry) at level 3 got cleared and we need + // to set it to handle this page fault. Therefore we need to do a page walk + // to navigate down to the level 3 and identify relevant PTE. 
+ + // Start with root PTE + auto root_pt = mmu::get_root_pt(addr); + auto root_ptep = mmu::hw_ptep<4>::force(root_pt); + + // Identify PTEP (PTE Pointer) at level 0 (the template parameter is reversed) + // First identify the ptep table at this level + auto l3_ptep_table = mmu::hw_ptep<3>::force(phys_to_virt_cast<mmu::pt_element<3>>(root_ptep.read().next_pt_addr())); + // Then access ptep at the index encoded in the virtual address + auto l3_ptep = l3_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 3)); + + // Identify PTEP at level 1 (first identify the ptep table and then the relevant ptep) + auto l2_ptep_table = mmu::hw_ptep<2>::force(phys_to_virt_cast<mmu::pt_element<2>>(l3_ptep.read().next_pt_addr())); + auto l2_ptep = l2_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 2)); + + // Identify PTEP at level 2 (first identify the ptep table and then the relevant ptep) + auto l1_ptep_table = mmu::hw_ptep<1>::force(phys_to_virt_cast<mmu::pt_element<1>>(l2_ptep.read().next_pt_addr())); + auto l1_ptep = l1_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 1)); + + // Identify PTEP at level 3 (first identify the ptep table and then the relevant ptep) + auto l0_ptep_table = mmu::hw_ptep<0>::force(phys_to_virt_cast<mmu::pt_element<0>>(l1_ptep.read().next_pt_addr())); + auto l0_ptep = l0_ptep_table.at(mmu::pt_index(reinterpret_cast<void*>(addr), 0)); + + // Read leaf PTE + auto leaf_pte = l0_ptep.read(); + + leaf_pte.set_accessed(true); + if (ACCESS_FLAG_FAULT_LEVEL_3(ef->esr)) { + leaf_pte.set_dirty(true); + } + + l0_ptep.write(leaf_pte); + mmu::synchronize_page_table_modifications(); +} + void page_fault(exception_frame *ef) { sched::fpu_lock fpu; @@ -39,6 +92,10 @@ void page_fault(exception_frame *ef) abort("trying to execute null pointer"); } + if (ACCESS_FLAG_FAULT_LEVEL_3(ef->esr)) { + return handle_access_flag_fault(ef, addr); + } + /* vm_fault might sleep, so check that the thread is preemptable, * and that interrupts in the saved 
pstate are enabled. * Then enable interrupts for the vm_fault. diff --git a/scripts/build b/scripts/build index ffae67b3..38aa70d5 100755 --- a/scripts/build +++ b/scripts/build @@ -190,15 +190,7 @@ host_arch=$(uname -m) # Default manifest manifest=bootfs.manifest.skel -if [[ "$host_arch" == "aarch64" || "$arch" == "aarch64" ]]; then - # We default to ROFS as ZFS is not supported on ARM until the issue #1131 is fixed - fs_type=${vars[fs]-rofs} - if [[ "$fs_type" == "rofs" ]]; then - vars[create_disk]="true" - fi -else - fs_type=${vars[fs]-zfs} -fi +fs_type=${vars[fs]-zfs} usrskel_arg= case $fs_type in zfs) @@ -215,6 +207,10 @@ ramfs) exit 2 esac +if [[ "$host_arch" == "aarch64" || "$arch" == "aarch64" ]]; then + vars[create_disk]="true" +fi + if test -n "${vars[usrskel]}" then # Override default skel @@ -305,7 +301,9 @@ if [[ ${vars[create_disk]} == "true" ]]; then bare="$SRC"/scripts/disk.bin raw_disk=disk qcow2_disk=disk - upload_kernel_mode="-k" + if [[ "$arch" == 'x64' ]]; then + upload_kernel_mode="-k" + fi else partition_offset=$kernel_end bare=loader.img @@ -318,7 +316,7 @@ create_zfs_disk() { "$SRC"/scripts/imgedit.py setpartition "-f raw ${raw_disk}.raw" 2 $partition_offset $partition_size qemu-img convert -f raw -O qcow2 $raw_disk.raw $qcow2_disk.img qemu-img resize $qcow2_disk.img ${image_size}b >/dev/null 2>&1 - "$SRC"/scripts/upload_manifest.py -o $qcow2_disk.img -m usr.manifest -D libgcc_s_dir="$libgcc_s_dir" $upload_kernel_mode + "$SRC"/scripts/upload_manifest.py --arch=$arch -o $qcow2_disk.img -m usr.manifest -D libgcc_s_dir="$libgcc_s_dir" $upload_kernel_mode } create_rofs_disk() { diff --git a/scripts/upload_manifest.py b/scripts/upload_manifest.py index 97c59e5b..aa144837 100755 --- a/scripts/upload_manifest.py +++ b/scripts/upload_manifest.py @@ -7,6 +7,8 @@ from contextlib import closing import io StringIO = io.StringIO +host_arch = os.uname().machine + def find_free_port(): with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: 
s.bind(('localhost', 0)) @@ -137,7 +139,11 @@ def main(): make_option('-k', dest='kernel', action='store_true', - help='run OSv in direct kernel mode') + help='run OSv in direct kernel mode'), + make_option('--arch', + dest='arch', + default=host_arch, + help="specify QEMU architecture: x86_64, aarch64") ]) (options, args) = opt.parse_args() @@ -155,7 +161,7 @@ def main(): kernel_mode_flag = '-k --kernel-path build/release/loader-stripped.elf' else: kernel_mode_flag = '' - osv = subprocess.Popen('cd ../..; scripts/run.py %s --vnc none -m 512 -c1 -i "%s" --block-device-cache unsafe -s -e "--norandom --nomount --noinit /tools/mkfs.so; /tools/cpiod.so --prefix /zfs/zfs/; /zfs.so set compression=off osv" --forward tcp:127.0.0.1:%s-:10000' % (kernel_mode_flag,image_path,upload_port), shell=True, stdout=subprocess.PIPE) + osv = subprocess.Popen('cd ../..; scripts/run.py %s --arch=%s --vnc none -m 512 -c1 -i "%s" --block-device-cache unsafe -s -e "--norandom --nomount --noinit /tools/mkfs.so; /tools/cpiod.so --prefix /zfs/zfs/; /zfs.so set compression=off osv" --forward tcp:127.0.0.1:%s-:10000' % (kernel_mode_flag,options.arch,image_path,upload_port), shell=True, stdout=subprocess.PIPE) upload(osv, manifest, depends, upload_port) -- 2.27.0 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220503223436.42029-1-jwkozaczuk%40gmail.com.