[osv-dev] [PATCH] trace: add mechanism to resolve application symbols
Enhance the tracing/profiling tool by adding a new -S option that instructs trace.py to use the tracefile.symbols file for symbol resolution instead of the addr2line utility: ./scripts/trace.py prof -FLS In essence, this new patch enhances loader.py to generate a new file, tracefile.symbols, alongside the trace file with information like this: 0x10119277 /usr/lib/golang/src/runtime/mheap.go 1950 runtime.newMarkBits 0x10114a04 /usr/lib/golang/src/runtime/mgcsweep.go 471 runtime.(*mspan).sweep 0x1010824a /usr/lib/golang/src/runtime/mcentral.go 214 runtime.(*mcentral).uncacheSpan 0x10107c3b /usr/lib/golang/src/runtime/mcache.go 276 runtime.(*mcache).releaseAll 0x40351825 core/mempool.cc 356 memory::pool::free(void*) 0x402403b6 ./bsd/sys/sys/mbuf.h 609 m_freem 0x4034 /usr/include/c++/11/bits/std_mutex.h 228 epoll_file::wait(epoll_event*, int, int) Each line above contains the whitespace-separated address, filename, line number (if any) and name of each referenced symbol, for both the kernel and the application. Please note that 'trace.py extract' calls 'osv syms' to load all application objects. But this only helps if the objects are still loaded by OSv. For example, if one runs some server app to profile under load and uses 'trace.py extract', then it will work because the app would still be running as OSv has not entered the shutdown phase. However, if one tests it with any app that simply completes, like native-example, then it will not work because the objects are already unloaded. In this case, one has to invoke run.py with the -w option, set a breakpoint in OSv code after it loads objects but before it starts the app (for example at core/app.cc:220) and then manually run 'osv syms' to force resolution of application symbols. The extra benefit of this new approach is that 'trace.py prof' works much faster, as it simply reads the new .symbols file instead of calling addr2line against loader.elf for each symbol, which can take half a minute or longer. 
Signed-off-by: Waldemar Kozaczuk --- scripts/loader.py| 21 + scripts/osv/debug.py | 36 scripts/trace.py | 6 ++ 3 files changed, 63 insertions(+) diff --git a/scripts/loader.py b/scripts/loader.py index 97c831e9..a7f82e6c 100755 --- a/scripts/loader.py +++ b/scripts/loader.py @@ -101,6 +101,15 @@ class syminfo_resolver(object): def clear_cache(clazz): clazz.cache.clear() +@classmethod +def output_cache(clazz, output_func): +for source_addr in clazz.cache.values(): +addr = source_addr[0] +if addr.line: +output_func("0x%x %s %d %s\n" % (addr.addr, addr.filename, addr.line, addr.name)) +else: +output_func("0x%x %s %s\n" % (addr.addr, addr.filename, addr.name)) + symbol_resolver = syminfo_resolver() def symbol_formatter(src_addr): @@ -1304,6 +1313,17 @@ def all_traces(): def save_traces_to_file(filename): trace.write_to_file(filename, list(all_traces())) +def save_backtrace_symbols_to_file(filename): +# Iterate over all traces and force resolution of symbols in +# included backtrace if any +for trace in all_traces(): +if trace.backtrace: +for address in list(x - 1 for x in trace.backtrace if x): +symbol_resolver(address) +# Save resolved symbol information from cache into a file +with open(filename, 'wt') as sout: +syminfo_resolver.output_cache(sout.write) + def make_symbolic(addr): return str(syminfo(addr)) @@ -1503,6 +1523,7 @@ class osv_trace_save(gdb.Command): gdb.write('Saving traces to %s ...\n' % arg) save_traces_to_file(arg) +save_backtrace_symbols_to_file("%s.symbols" % arg) class osv_trace_file(gdb.Command): def __init__(self): diff --git a/scripts/osv/debug.py b/scripts/osv/debug.py index 83372ada..d7d34637 100644 --- a/scripts/osv/debug.py +++ b/scripts/osv/debug.py @@ -99,6 +99,42 @@ class SymbolResolver(object): self.addr2line.stdin.close() self.addr2line.wait() +class SymbolsFileResolver(object): +def __init__(self, symbols_file, fallback_resolver=DummyResolver()): +if not os.path.exists(symbols_file): +raise Exception('File not found: ' + object_path) 
+self.fallback_resolver = fallback_resolver +self.cache = dict() + +try: +symbol_lines = open(symbols_file).read().split('\n') +except IOError: +symbol_lines = [] + +for symbol_line in symbol_lines: +tokens = symbol_line.split(maxsplit=3) +if len(tokens) > 0: +addr = int(tokens[0], 16) +filename = tokens[1] +if tokens[2] == '': +line = None +else: +
[osv-dev] [PATCH] trace.py: inspect backtrace filename in a safe manner
Some collected tracepoints may be missing the filename field. So before trying to filter the frames by filename, test whether it is present. Signed-off-by: Waldemar Kozaczuk --- scripts/osv/trace.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/osv/trace.py b/scripts/osv/trace.py index fb14c4fe..1b979215 100644 --- a/scripts/osv/trace.py +++ b/scripts/osv/trace.py @@ -33,7 +33,7 @@ class BacktraceFormatter: frames = list(debug.resolve_all(self.resolver, (x - 1 for x in backtrace if x))) -while frames[0].name and (frames[0].name.startswith("tracepoint") or frames[0].filename.endswith("trace.hh")): +while frames[0].name and (frames[0].name.startswith("tracepoint") or (frames[0].filename and frames[0].filename.endswith("trace.hh"))): frames.pop(0) if self.multiline: -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20230404024308.169022-3-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] prof.py: remove extra noise from profiler stack traces
Eliminate any tracepoint code specific frames from the profiler backtraces. Signed-off-by: Waldemar Kozaczuk --- scripts/osv/prof.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/osv/prof.py b/scripts/osv/prof.py index 2de0d4ac..edb8a5c1 100644 --- a/scripts/osv/prof.py +++ b/scripts/osv/prof.py @@ -103,6 +103,8 @@ def strip_garbage(backtrace): def is_good(src_addr): if not src_addr.name: return True +if src_addr.filename and src_addr.filename.endswith("trace.hh"): +return False return not src_addr.name in unimportant_functions for chain in unimportant_prefixes: -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20230404024308.169022-2-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] loader.py: use with statement when writing trace file
The 'osv trace2file' does not work with python3 so let us fix it by replacing the code to open and write to a file with more portable and succinct "with" construct. Signed-off-by: Waldemar Kozaczuk --- scripts/loader.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/loader.py b/scripts/loader.py index 0ce782d0..97c831e9 100755 --- a/scripts/loader.py +++ b/scripts/loader.py @@ -1508,9 +1508,8 @@ class osv_trace_file(gdb.Command): def __init__(self): gdb.Command.__init__(self, 'osv trace2file', gdb.COMMAND_USER, gdb.COMPLETE_NONE) def invoke(self, arg, from_tty): -fout = file("trace.txt", "wt") -dump_trace(fout.write) -fout.close() +with open("trace.txt", 'wt') as fout: +dump_trace(fout.write) class osv_leak(gdb.Command): def __init__(self): -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20230404024308.169022-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] libc: add __wcsncpy_chk() to support python 3.10
Signed-off-by: Waldemar Kozaczuk --- Makefile| 1 + libc/string/__wcsncpy_chk.c | 15 +++ 2 files changed, 16 insertions(+) create mode 100644 libc/string/__wcsncpy_chk.c diff --git a/Makefile b/Makefile index fcc55e29..b8d5d924 100644 --- a/Makefile +++ b/Makefile @@ -1796,6 +1796,7 @@ musl += string/wcsncasecmp_l.o musl += string/wcsncat.o musl += string/wcsncmp.o musl += string/wcsncpy.o +libc += string/__wcsncpy_chk.o musl += string/wcsnlen.o musl += string/wcspbrk.o musl += string/wcsrchr.o diff --git a/libc/string/__wcsncpy_chk.c b/libc/string/__wcsncpy_chk.c new file mode 100644 index ..ff3da7f5 --- /dev/null +++ b/libc/string/__wcsncpy_chk.c @@ -0,0 +1,15 @@ +/* + * Copyright (C) 2022 Waldemar Kozaczuk + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +#include +#include + +wchar_t *__wcsncpy_chk(wchar_t * dest, const wchar_t * src, size_t n, size_t destlen) +{ +assert(wcslen(src) + sizeof(L'\0') <= destlen); +return wcsncpy(dest, src, n); +} -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20221107152837.142732-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] aarch64: enable JIT in java tests
After the patch to move the malloc address space below 0x8000 is applied, we can enable JIT when running Java tests on aarch64. So this patch simply modifies the tests makefile to make it no longer add the '-Xint' JVM flag which enables interpreted execution mode and disables JIT. Signed-off-by: Waldemar Kozaczuk --- modules/java-tests/Makefile | 12 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/modules/java-tests/Makefile b/modules/java-tests/Makefile index e5a046cb..4646fc08 100644 --- a/modules/java-tests/Makefile +++ b/modules/java-tests/Makefile @@ -66,16 +66,12 @@ $(tests-jar-target): $(wildcard $(java-base-path)/tests-for-java9_1x/src/main/ja module: $(tests-jar-target) test_commands endif -ifeq ($(arch),aarch64) -java_arch_options := -Xint -endif - -java_isolated_cmd := 'java_isolated: /java_isolated.so $(java_arch_options) -cp /tests/java/tests.jar:/tests/java/isolates.jar \ +java_isolated_cmd := 'java_isolated: /java_isolated.so -cp /tests/java/tests.jar:/tests/java/isolates.jar \ -Disolates.jar=/tests/java/isolates.jar org.junit.runner.JUnitCore io.osv.AllTestsThatTestIsolatedApp' -java_non_isolated_cmd := 'java_non_isolated: /java.so $(java_arch_options) -cp /tests/java/tests.jar:/tests/java/isolates.jar \ +java_non_isolated_cmd := 'java_non_isolated: /java.so -cp /tests/java/tests.jar:/tests/java/isolates.jar \ -Disolates.jar=/tests/java/isolates.jar org.junit.runner.JUnitCore io.osv.AllTestsThatTestNonIsolatedApp' -java_no_wrapper_cmd := 'java_no_wrapper: /usr/bin/java $(java_arch_options) -cp /tests/java/tests.jar org.junit.runner.JUnitCore io.osv.BasicTests !' -java_perms_cmd := 'java-perms: /usr/bin/java $(java_arch_options) -cp /tests/java/tests.jar io.osv.TestDomainPermissions !' +java_no_wrapper_cmd := 'java_no_wrapper: /usr/bin/java -cp /tests/java/tests.jar org.junit.runner.JUnitCore io.osv.BasicTests !' +java_perms_cmd := 'java-perms: /usr/bin/java -cp /tests/java/tests.jar io.osv.TestDomainPermissions !' 
.PHONY: test_commands -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20221014190814.10795-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] memory: move malloc virtual address space below 0x800000000000
00 to: - 4000 As a result the linear mappings after the patch look like this: x86_64) vaddrpaddr size perm memattr name 4020 20 67c434 rwxp normal kernel 40000 4000 rwxp normal main 400ff1 rwxp normal dmi 400f5a10f5a10 17c rwxp normal smbios 40004000 4000 3ffdd000 rwxp normal main 40007fe0 7fe0 20 rwxp normal acpi 4000feb91000 feb91000 1000 rwxp normal pci_bar 4000feb92000 feb92000 1000 rwxp normal pci_bar 4000fec0 fec0 1000 rwxp normal ioapic 50000 4000 rwxp normal page 50004000 4000 3ffdd000 rwxp normal page 60000 4000 rwxp normal mempool 60004000 4000 3ffdd000 rwxp normal mempool aarch64) vaddrpaddr size perm memattr name 800 8001 rwxp dev gic_dist 801 8011 rwxp dev gic_cpu 900 900 1000 rwxp dev pl011 901 901 1000 rwxp dev pl031 1000 1000 2eff rwxp dev pci_mem 3eff 3eff1 rwxp dev pci_io fc000 4000 7de000 rwxp normal kernel 401000 401000 1000 rwxp dev pci_cfg 4a00 a00 200 rwxp normal virtio_mmio_cfg 4a000200 a000200 200 rwxp normal virtio_mmio_cfg 4a000400 a000400 200 rwxp normal virtio_mmio_cfg 4a000600 a000600 200 rwxp normal virtio_mmio_cfg 4a000800 a000800 200 rwxp normal virtio_mmio_cfg 4a000a00 a000a00 200 rwxp normal virtio_mmio_cfg 4a000c00 a000c00 200 rwxp normal virtio_mmio_cfg 4a000e00 a000e00 200 rwxp normal virtio_mmio_cfg 4000407de000 407de000 7f822000 rwxp normal main 5000407de000 407de000 7f822000 rwxp normal page 6000407de000 407de000 7f822000 rwxp normal mempool Fixes #1196 Fixes #1145 Fixes #1157 Signed-off-by: Waldemar Kozaczuk --- arch/aarch64/arch-setup.cc | 5 ++--- core/mmu.cc| 2 +- include/osv/mmu-defs.hh| 4 ++-- scripts/loader.py | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/arch/aarch64/arch-setup.cc b/arch/aarch64/arch-setup.cc index 24e007c4..89dceae9 100644 --- a/arch/aarch64/arch-setup.cc +++ b/arch/aarch64/arch-setup.cc @@ -36,12 +36,11 @@ void setup_temporary_phys_map() { -// duplicate 1:1 mapping into phys_mem +// duplicate 1:1 mapping into the lower part of phys_mem u64 *pt_ttbr0 = 
reinterpret_cast(processor::read_ttbr0()); -u64 *pt_ttbr1 = reinterpret_cast(processor::read_ttbr1()); for (auto&& area : mmu::identity_mapped_areas) { auto base = reinterpret_cast(get_mem_area_base(area)); -pt_ttbr1[mmu::pt_index(base, 3)] = pt_ttbr0[0]; +pt_ttbr0[mmu::pt_index(base, 3)] = pt_ttbr0[0]; } mmu::flush_tlb_all(); } diff --git a/core/mmu.cc b/core/mmu.cc index 007d4331..33ae8407 100644 --- a/core/mmu.cc +++ b/core/mmu.cc @@ -78,7 +78,7 @@ public: }; constexpr uintptr_t lower_vma_limit = 0x0; -constexpr uintptr_t upper_vma_limit = 0x8000; +constexpr uintptr_t upper_vma_limit = 0x4000; typedef boost::intrusive::set, diff --git a/include/osv/mmu-defs.hh b/include/osv/mmu-defs.hh index 18edf441..fd6a85a6 100644 --- a/include/osv/mmu-defs.hh +++ b/include/osv/mmu-defs.hh @@ -46,12 +46,12 @@ constexpr uintptr_t mem_area_size = uintptr_t(1) << 44; constexpr uintptr_t get_mem_area_base(mem_area area) { -return 0x8000 | uintptr_t(area) << 44; +return 0x4000 | uintptr_t(area) << 44; } static inline mem_area get_mem_area(void* addr) { -return mem_area(reinterpret_cast(addr) >> 44 & 7); +return mem_area(reinterpret_cast(addr) >> 44 & 3); } constexpr void* translate_mem_area(mem_area from, mem_area to, void* addr) diff --git a/scripts/loader.py b/scripts/loader.py index 6878a7a3..0ce782d0 100755 --- a/scripts/loader.py +++ b/scripts/loader.py @@ -27,7 +27,7 @@ class status_enum_class(object): pass status_enum = status_enum_class() -phys_mem = 0x8000 +phys_mem = 0x4000 def pt_index(addr, level): return (addr >> (12 + 9 * level)) & 511 -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20221014000810.7323-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] futex: add benchmark test
This patch adds a new benchmark - misc-futex-perf.cc. The goal is to indirectly measure the performance of the futex syscall implemented in OSv and compare it to Linux, and thus guide us in implementing the improvements described in issue #853. The benchmark program does this by implementing a special mutex - fmutex - based on the futex syscall according to the algorithm specified in Ulrich Drepper's paper "Futexes Are Tricky". The test is similar to misc-mutex2.cc written by Nadav Har'El. It takes three parameters: a mandatory number of threads (nthreads), a computation length (worklen), and an optional number of mutexes (nmutexes), which is equal to 1 by default. The test groups all threads (nthreads * nmutexes) into nmutexes sets of nthreads threads trying to take the group mutex (one out of nmutexes) in a loop and increment the group counter and then do some short computation of the specified length outside the loop. The test runs for 30 seconds, and shows the average total number of lock-protected counter increments per second. The number of cpus is set by using the '-c' option passed to run.py in the case of OSv, and by using taskset -c 0..n when running the same program on the host. The results of the test, which show the total number of increments (across the counters of all groups of threads) per second for both OSv and the Linux host, are below. They also show the total number of futex syscall calls (wake), captured by adding an atomic counter in the futex implementation for OSv. 
+--++--+ | Run parameters | On OSv guest | On Linux host (op/s) | | | (op/s) (futex called) | +--++--+ | 1 0 1 (1 cpu) | 5.1353e+07 0 | 5.21169e+07 | | 2 0 1 (2 cpus) | 2.26067e+07 345,745 | 1.78575e+07 | | 4 0 1 (4 cpus) | 4.93204e+07 2342 | 1.41494e+07 | | 1 500 1 (1 cpu) | 5.67558e+06 0 | 5.7e+06 | | 2 500 1 (2 cpus) | 9.19294e+06 3618 | 9.78263e+06 | | 4 500 1 (4 cpus) | 5.65933e+0638,243 | 6.87465e+06 | | 4 500 2 (4 cpus) | 8.30834e+06 266 | 1.15537e+07 | | 4 500 4 (4 cpus) | 1.06216e+07 111 | 1.16908e+07 | | 4 500 8 (4 cpus) | 1.39291e+07 101 | 1.31845e+07 | +--++--+ The results are surprising and somewhat confusing. For example the lines 2 and 3 show OSv outperforming Linux by a lot. Also the line 7 (4 500 2) shows OSv peformance worse by ~30% even when number of futex calls is pretty low. Possibly there is a flaw in this test, or some kind of different explanation. Signed-off-by: Waldemar Kozaczuk --- modules/tests/Makefile | 3 +- tests/misc-futex-perf.cc | 190 +++ 2 files changed, 192 insertions(+), 1 deletion(-) create mode 100644 tests/misc-futex-perf.cc diff --git a/modules/tests/Makefile b/modules/tests/Makefile index 9552fa2e..ed064d3d 100644 --- a/modules/tests/Makefile +++ b/modules/tests/Makefile @@ -134,7 +134,8 @@ tests := tst-pthread.so misc-ramdisk.so tst-vblk.so tst-bsd-evh.so \ tst-elf-init.so tst-realloc.so tst-setjmp.so \ libtls.so libtls_gold.so tst-tls.so tst-tls-gold.so tst-tls-pie.so \ tst-sigaction.so tst-syscall.so tst-ifaddrs.so tst-getdents.so \ - tst-netlink.so misc-zfs-io.so misc-zfs-arc.so tst-pthread-create.so + tst-netlink.so misc-zfs-io.so misc-zfs-arc.so tst-pthread-create.so \ + misc-futex-perf.so # libstatic-thread-variable.so tst-static-thread-variable.so \ ifeq ($(arch),x64) diff --git a/tests/misc-futex-perf.cc b/tests/misc-futex-perf.cc new file mode 100644 index ..a2f80f8b --- /dev/null +++ b/tests/misc-futex-perf.cc @@ -0,0 +1,190 @@ +/* + * Copyright (C) 2022 Waldemar Kozaczuk + * + * This work is open source 
software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// This test is based on misc-mutex2.cc written by Nadav Har'El. But unlike +// the other one, it focuses on measuring the performance of the futex() +// syscall implementation. It does it indirectly by implementing mutex based +// on futex syscall according to the formula specified in the Ulrich Drepper's +// paper "Futexes Are Tricky". +// It takes three parameters: mandatory number of threads (nthreads) and +// a computation length (worklen) and optional number of mutexes (nmutexes). +// The test groups all threads (nthreads * nmutexes) into nmutexes sets +// where nthreads threads loop trying to take the group mutex (one out of nmutexe
[osv-dev] [PATCH 8/8] lazy stack: new tracepoint for stack pre-faults
This last patch of the series adds new tracepoint - mmu_vm_stack_fault - which when enabled allows one to see how particular app triggers the stack page faults. The tracepoint captures the stack fault address, the thread id and number of the page (0 being the 1st page). Please note this does not capture the 1st page of the stack (page_no 0) as this one pre-faulted by the parent thread that creates a new one. ./scripts/run.py -e /tests/tst-pipe.so --trace=mmu_vm_stack_fault --trace-backtrace -H ./scripts/trace.py extract && ./scripts/trace.py list -bl 0x816b7040 >init0 0.002215401 mmu_vm_stack_fault thread=32, addr=0x200ff9d0, page_no=1 mmu::vm_fault(unsigned long, exception_frame*) page_fault ex_pf std_malloc(unsigned long, unsigned long) malloc operator new(unsigned long) do_main_thread(void*) std::_Function_handler::_M_invoke(std::_Any_data const&) __invoke_impl&>__invoke_r&> _M_invoke sched::thread::main() thread_main_c ... 0x816b7040 >init0 0.084799151 mmu_vm_stack_fault thread=32, addr=0x200f8440, page_no=8 mmu::vm_fault(unsigned long, exception_frame*) page_fault ex_pf memory::page_pool::l1::alloc_page() untracked_alloc_page memory::alloc_page() std_malloc(unsigned long, unsigned long) malloc operator new(unsigned long) lookup sys_lstat Signed-off-by: Waldemar Kozaczuk --- core/mmu.cc | 11 +++ 1 file changed, 11 insertions(+) diff --git a/core/mmu.cc b/core/mmu.cc index e41de215..afab7cc9 100644 --- a/core/mmu.cc +++ b/core/mmu.cc @@ -1413,6 +1413,9 @@ bool access_fault(vma& vma, unsigned int error_code) TRACEPOINT(trace_mmu_vm_fault, "addr=%p, error_code=%x", uintptr_t, unsigned int); TRACEPOINT(trace_mmu_vm_fault_sigsegv, "addr=%p, error_code=%x, %s", uintptr_t, unsigned int, const char*); TRACEPOINT(trace_mmu_vm_fault_ret, "addr=%p, error_code=%x", uintptr_t, unsigned int); +#if CONF_lazy_stack +TRACEPOINT(trace_mmu_vm_stack_fault, "thread=%d, addr=%p, page_no=%d", unsigned int, uintptr_t, unsigned int); +#endif static void vm_sigsegv(uintptr_t addr, 
exception_frame* ef) { @@ -1438,6 +1441,14 @@ void vm_fault(uintptr_t addr, exception_frame* ef) trace_mmu_vm_fault_sigsegv(addr, ef->get_error(), "fast"); return; } +#if CONF_lazy_stack +auto stack = sched::thread::current()->get_stack_info(); +void *v_addr = reinterpret_cast(addr); +if (v_addr >= stack.begin && v_addr < stack.begin + stack.size) { +trace_mmu_vm_stack_fault(sched::thread::current()->id(), addr, +((u64)(stack.begin + stack.size - addr)) / 4096); +} +#endif addr = align_down(addr, mmu::page_size); WITH_LOCK(vma_list_mutex.for_read()) { auto vma = find_intersecting_vma(addr); -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220831042433.140243-8-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH 7/8] lazy stack: activate lazy stack in pthreads
This patch adds new mmap flag - mmap_stack - that is used when mmaping a stack when creating new pthread. This new flag is only used when the build parameter CONF_lazy_stack is enabled. Signed-off-by: Waldemar Kozaczuk --- include/osv/mmu-defs.hh | 1 + libc/mman.cc| 7 +-- libc/pthread.cc | 7 ++- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/osv/mmu-defs.hh b/include/osv/mmu-defs.hh index 18edf441..694815f8 100644 --- a/include/osv/mmu-defs.hh +++ b/include/osv/mmu-defs.hh @@ -84,6 +84,7 @@ enum { mmap_small = 1ul << 5, mmap_jvm_balloon = 1ul << 6, mmap_file= 1ul << 7, +mmap_stack = 1ul << 8, }; enum { diff --git a/libc/mman.cc b/libc/mman.cc index 75a94eb0..115b0313 100644 --- a/libc/mman.cc +++ b/libc/mman.cc @@ -43,12 +43,7 @@ unsigned libc_flags_to_mmap(int flags) mmap_flags |= mmu::mmap_populate; } if (flags & MAP_STACK) { -// OSv currently requires that stacks be pinned (see issue #143). So -// if an application wants to mmap() a stack for pthread_attr_setstack -// and did us the courtesy of telling this to ue (via MAP_STACK), -// let's return the courtesy by returning pre-faulted memory. -// FIXME: If issue #143 is fixed, this workaround should be removed. 
-mmap_flags |= mmu::mmap_populate; +mmap_flags |= mmu::mmap_stack; } if (flags & MAP_SHARED) { mmap_flags |= mmu::mmap_shared; diff --git a/libc/pthread.cc b/libc/pthread.cc index cda6cf90..de5979e8 100644 --- a/libc/pthread.cc +++ b/libc/pthread.cc @@ -141,7 +141,12 @@ namespace pthread_private { return {attr.stack_begin, attr.stack_size}; } size_t size = attr.stack_size; -void *addr = mmu::map_anon(nullptr, size, mmu::mmap_populate, mmu::perm_rw); +#if CONF_lazy_stack +unsigned stack_flags = mmu::mmap_stack; +#else +unsigned stack_flags = mmu::mmap_populate; +#endif +void *addr = mmu::map_anon(nullptr, size, stack_flags, mmu::perm_rw); mmu::mprotect(addr, attr.guard_size, 0); sched::thread::stack_info si{addr, size}; si.deleter = free_stack; -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220831042433.140243-7-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH 5/8] lazy stack: ensure next stack page conditionally if interrupts and preemption enabled
This patch modifies the last set of call sites where we do not know statically if any of interrupts or preemption are enabled. In those cases we dynamically check if both preemtion and interrupts are enabled and only then pre-fault the stack. Most of these places are in the tracepoint and sampler implementation. This is obviously the most costly operation but even here we are lucky that it affects the performance only when tracepoints are enabled and even then the code already saves the state of the interrupts using the arch::irq_flag_notrace so checking if interrupts are enabled is pretty cheap. With sampler on other hand, the performance is only affected when starting or stoping the sampler which is quite rare. Signed-off-by: Waldemar Kozaczuk --- arch/aarch64/interrupt.cc | 5 + core/sampler.cc | 31 +++ core/trace.cc | 17 + include/osv/trace.hh | 5 + 4 files changed, 54 insertions(+), 4 deletions(-) diff --git a/arch/aarch64/interrupt.cc b/arch/aarch64/interrupt.cc index e26e10ee..0b244e2e 100644 --- a/arch/aarch64/interrupt.cc +++ b/arch/aarch64/interrupt.cc @@ -31,6 +31,11 @@ void sgi_interrupt::send(sched::cpu* cpu) void sgi_interrupt::send_allbutself() { +#if CONF_lazy_stack +if (sched::preemptable() && arch::irq_enabled()) { +arch::ensure_next_stack_page(); +} +#endif gic::gic->send_sgi(gic::sgi_filter::SGI_TARGET_ALL_BUT_SELF, 0, get_id()); } diff --git a/core/sampler.cc b/core/sampler.cc index 9e37ba4f..b9e2b05c 100644 --- a/core/sampler.cc +++ b/core/sampler.cc @@ -32,11 +32,16 @@ private: sched::timer_base _timer; bool _active; -void rearm() +void arm() { _timer.set(_config.period); } +void rearm() +{ +_timer.set_with_irq_disabled(_config.period); +} + public: cpu_sampler() : _timer(*this) @@ -54,7 +59,11 @@ public: { assert(!_active); _active = true; -rearm(); +if (arch::irq_enabled()) { +arm(); +} else { +rearm(); +} } void stop() @@ -97,7 +106,11 @@ static void start_on_current() if (prev_active + 1 == _n_cpus) { _started = true; -_controller.wake(); 
+if (arch::irq_enabled()) { +_controller.wake(); +} else { +_controller.wake_from_kernel_or_with_irq_disabled(); +} } } @@ -110,7 +123,11 @@ static void stop_on_current() _sampler->stop(); if (--_active_cpus == 0) { -_controller.wake(); +if (arch::irq_enabled()) { +_controller.wake(); +} else { +_controller.wake_from_kernel_or_with_irq_disabled(); +} } } @@ -170,6 +187,12 @@ void stop_sampler() throw() WITH_LOCK(migration_lock) { stop_sampler_ipi.send_allbutself(); +#if CONF_lazy_stack_invariant +assert(arch::irq_enabled()); +#endif +#if CONF_lazy_stack +sched::ensure_next_stack_page_if_preemptable(); +#endif stop_on_current(); } diff --git a/core/trace.cc b/core/trace.cc index dc69c807..c9ed2bab 100644 --- a/core/trace.cc +++ b/core/trace.cc @@ -247,6 +247,13 @@ void tracepoint_base::update() WITH_LOCK(trace_control_lock) { bool empty; +#if CONF_lazy_stack_invariant +assert(arch::irq_enabled()); +assert(sched::preemptable()); +#endif +#if CONF_lazy_stack +arch::ensure_next_stack_page(); +#endif WITH_LOCK(osv::rcu_read_lock) { auto& probes = *probes_ptr.read(); @@ -376,6 +383,11 @@ extern "C" void __cyg_profile_func_enter(void *this_fn, void *call_site) } arch::irq_flag_notrace irq; irq.save(); +#if CONF_lazy_stack +if (sched::preemptable() && irq.enabled()) { +arch::ensure_next_stack_page(); +} +#endif arch::irq_disable_notrace(); if (func_trace_nesting++ == 0) { trace_function_entry(this_fn, call_site); @@ -391,6 +403,11 @@ extern "C" void __cyg_profile_func_exit(void *this_fn, void *call_site) } arch::irq_flag_notrace irq; irq.save(); +#if CONF_lazy_stack +if (sched::preemptable() && irq.enabled()) { +arch::ensure_next_stack_page(); +} +#endif arch::irq_disable_notrace(); if (func_trace_nesting++ == 0) { trace_function_exit(this_fn, call_site); diff --git a/include/osv/trace.hh b/include/osv/trace.hh index d735575c..01d72022 100644 --- a/include/osv/trace.hh +++ b/include/osv/trace.hh @@ -348,6 +348,11 @@ public: if (active) { arch::irq_flag_notrace irq; 
irq.save(); +#if CONF_lazy_stack +if (sched::preemptable() && irq.enabled()) { +arch::ensure_next_stack_page(); +} +#endif arch::irq_disable_notrace(); log(as)
[osv-dev] [PATCH 6/8] lazy stack: prevent deadlock when taking vma_list_mutex for write
This patch makes all functions in core/mmu.cc that take vma_list_mutex for write pre-fault the stack two pages deep before using the mutex. This is necessary to prevent any follow-up stack faults down the call stack after the vma_list_mutex is taken for write, as this would lead to a deadlock, experienced when testing one of the apps, where the page fault handler and the mmu::vm_fault() function would try to take the same vma_list_mutex for read. Signed-off-by: Waldemar Kozaczuk --- core/mmu.cc | 17 + 1 file changed, 17 insertions(+) diff --git a/core/mmu.cc b/core/mmu.cc index 007d4331..e41de215 100644 --- a/core/mmu.cc +++ b/core/mmu.cc @@ -47,6 +47,17 @@ extern const char text_start[], text_end[]; namespace mmu { +#if CONF_lazy_stack +// We need to ensure that lazy stack is populated deeply enough (2 pages) +// for all the cases when the vma_list_mutex is taken for write to prevent +// page faults triggered on stack. The page-fault handling logic would +// attempt to take same vma_list_mutex fo read and end up with a deadlock. 
+#define PREVENT_STACK_PAGE_FAULT \ +arch::ensure_next_two_stack_pages(); +#else +#define PREVENT_STACK_PAGE_FAULT +#endif + struct vma_range_compare { bool operator()(const vma_range& a, const vma_range& b) { return a.start() < b.start(); @@ -1271,6 +1282,7 @@ static void nohugepage(void* addr, size_t length) error advise(void* addr, size_t size, int advice) { +PREVENT_STACK_PAGE_FAULT WITH_LOCK(vma_list_mutex.for_write()) { if (!ismapped(addr, size)) { return make_error(ENOMEM); @@ -1310,6 +1322,7 @@ void* map_anon(const void* addr, size_t size, unsigned flags, unsigned perm) size = align_up(size, mmu::page_size); auto start = reinterpret_cast(addr); auto* vma = new mmu::anon_vma(addr_range(start, start + size), perm, flags); +PREVENT_STACK_PAGE_FAULT SCOPE_LOCK(vma_list_mutex.for_write()); auto v = (void*) allocate(vma, start, size, search); if (flags & mmap_populate) { @@ -1336,6 +1349,7 @@ void* map_file(const void* addr, size_t size, unsigned flags, unsigned perm, auto start = reinterpret_cast(addr); auto *vma = f->mmap(addr_range(start, start + size), flags | mmap_file, perm, offset).release(); void *v; +PREVENT_STACK_PAGE_FAULT WITH_LOCK(vma_list_mutex.for_write()) { v = (void*) allocate(vma, start, size, search); if (flags & mmap_populate) { @@ -1708,6 +1722,7 @@ ulong map_jvm(unsigned char* jvm_addr, size_t size, size_t align, balloon_ptr b) auto* vma = new mmu::jvm_balloon_vma(jvm_addr, start, start + size, b, v->perm(), v->flags()); +PREVENT_STACK_PAGE_FAULT WITH_LOCK(vma_list_mutex.for_write()) { // This means that the mapping that we had before was a balloon mapping // that was laying around and wasn't updated to an anon mapping. 
If we @@ -2014,6 +2029,7 @@ void free_initial_memory_range(uintptr_t addr, size_t size) error mprotect(const void *addr, size_t len, unsigned perm) { +PREVENT_STACK_PAGE_FAULT SCOPE_LOCK(vma_list_mutex.for_write()); if (!ismapped(addr, len)) { @@ -2025,6 +2041,7 @@ error mprotect(const void *addr, size_t len, unsigned perm) error munmap(const void *addr, size_t length) { +PREVENT_STACK_PAGE_FAULT SCOPE_LOCK(vma_list_mutex.for_write()); length = align_up(length, mmu::page_size); -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220831042433.140243-6-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH 4/8] lazy stack: ensure next stack page dynamically if preemption enabled
This patch modifies a number of critical places mostly in the scheduler code to dynamically pre-fault the stack if preemption is enabled. In all these places we can be statically sure that interrupts are enabled but not so sure about preemption (maybe in future we can prove it is enabled at least in some of the cases and replace conditional ensure_next_stack_page_if_preemptable() with cheaper single-instruction ensure_next_stack_page()). The three call sites before irq_lock is taken (interrupts are disabled) include: - cpu::schedule() before WITH_LOCK(irq_lock) - thread::pin() before WITH_LOCK(irq_lock) - thread::yield() before guard*(irq_lock) The reasoning above goes like this: the methods were designed with intention to be used when interrupts are enabled because otherwise the act of using WITH_LOCK or guard construct would imply that interrupts would be re-enabled after the block so the code does not care about restoring interrupts to the proper state it was before. If that was the intention, these methods would use irq_save_lock_type. Two other call sites in the scheduler are: - timer_base destructor before calling timer_base::cancel() which disables interrupts - thread::wake_with_from_mutex() only called from waiter::wake() follow the reasoning that interrupts are enabled most of the time except a few places in scheduler and interrupt handler. Also we assume the mutex is not used when interrupts are disabled. And based on my analysis of all the code that disables/enables interrupts that seems to be the case. In any case we put invariants in these two places to verify that interrupts are enabled indeed. The last call site in abort() assumes that calling irq_disable() implies that interrupts are enabled before and we only need to check preemption status. 
Signed-off-by: Waldemar Kozaczuk --- core/sched.cc| 24 include/osv/sched.hh | 17 + runtime.cc | 3 +++ 3 files changed, 44 insertions(+) diff --git a/core/sched.cc b/core/sched.cc index 1e109694..65842ff3 100644 --- a/core/sched.cc +++ b/core/sched.cc @@ -224,6 +224,9 @@ void thread::cputime_estimator_get( // scheduler on a different CPU would be disastrous. void cpu::schedule() { +#if CONF_lazy_stack +sched::ensure_next_stack_page_if_preemptable(); +#endif WITH_LOCK(irq_lock) { #ifdef __aarch64__ reschedule_from_interrupt(sched::cpu::current(), false, thyst); @@ -566,6 +569,9 @@ void thread::pin(cpu *target_cpu) t.wake(); }, sched::thread::attr().pin(source_cpu))); wakeme->start(); +#if CONF_lazy_stack +sched::ensure_next_stack_page_if_preemptable(); +#endif WITH_LOCK(irq_lock) { trace_sched_migrate(&t, target_cpu->id); t.stat_migrations.incr(); @@ -822,6 +828,12 @@ void thread::yield(thread_runtime::duration preempt_after) { trace_sched_yield(); auto t = current(); +#if CONF_lazy_stack_invariant +assert(arch::irq_enabled()); +#endif +#if CONF_lazy_stack +sched::ensure_next_stack_page_if_preemptable(); +#endif std::lock_guard guard(irq_lock); // FIXME: drive by IPI cpu::current()->handle_incoming_wakeups(); @@ -1258,6 +1270,12 @@ void thread::wake_impl(detached_state* st, unsigned allowed_initial_states_mask) void thread::wake() { +#if CONF_lazy_stack_invariant +assert(arch::irq_enabled()); +#endif +#if CONF_lazy_stack +sched::ensure_next_stack_page_if_preemptable(); +#endif WITH_LOCK(rcu_read_lock) { wake_impl(_detached_state.get()); } @@ -1604,6 +1622,12 @@ timer_base::timer_base(timer_base::client& t) timer_base::~timer_base() { +#if CONF_lazy_stack_invariant +assert(arch::irq_enabled()); +#endif +#if CONF_lazy_stack +sched::ensure_next_stack_page_if_preemptable(); +#endif cancel(); } diff --git a/include/osv/sched.hh b/include/osv/sched.hh index 1691bed8..8a2694cb 100644 --- a/include/osv/sched.hh +++ b/include/osv/sched.hh @@ -1043,6 +1043,15 @@ inline bool 
preemptable() return !get_preempt_counter(); } +#if CONF_lazy_stack +inline void ensure_next_stack_page_if_preemptable() { +if (!preemptable()) { +return; +} +arch::ensure_next_stack_page(); +} +#endif + inline void preempt() { if (preemptable()) { @@ -1350,6 +1359,14 @@ template inline void thread::wake_with_from_mutex(Action action) { +#if CONF_lazy_stack_invariant +assert(arch::irq_enabled()); +#endif +#if CONF_lazy_stack +if (preemptable()) { +arch::ensure_next_stack_page(); +} +#endif return do_wake_with(action, (1 << unsigned(status::waiting)) | (1 << unsigned(status::sending_lock))); } diff --git a/runtime.cc b/runtime.cc index 521b5c24..5f67e79b 100644 --- a/runtime.cc +++ b/runtime.cc @@ -113,6 +113,9 @@ void abort(const char *fmt, ...) do {} while (true); } +#if CONF_lazy_stack +sched::ensure_next_stack_page_if_pre
[osv-dev] [PATCH 3/8] lazy stack: ensure next stack page statically
This patch modifies all relevant places, where we statically know that both interrupts and preemption should be enabled, to unconditionally pre-fault the stack. These include places in code before: - WITH_LOCK(preemption_lock) block - WITH_LOCK(rcu_read_lock) block - sched::preempt_disable() call - WITH_LOCK(irq_lock) block in one case In general, these are the places that follow the assumption that most of the time preemption and interrupts are enabled. And the functions/methods below are called directly or indirectly by an application and there is no other kernel code in that application call stack above that also disables interrupts or preemption. One good example is the memory allocation code in mempool.cc that disables preemption in quite a few places. Many of those use WITH_LOCK/DROP_LOCK(preempt_lock) combination that implies they were intended to be called with preemption enabled otherwise code in DROP_LOCK would not work. Also all of those end up calling some form of thread::wait() method that asserts that both interrupts and preemption are enabled. To validate the reasoning, we add relevant invariants before we pre-fault the stack. 
Signed-off-by: Waldemar Kozaczuk --- bsd/porting/uma_stub.cc | 12 + bsd/sys/net/routecache.hh | 6 + bsd/sys/netinet/arpcache.hh | 7 + core/async.cc | 18 + core/condvar.cc | 7 + core/elf.cc | 12 + core/mempool.cc | 50 ++- core/rcu.cc | 6 + core/sched.cc | 52 + fs/vfs/kern_descrip.cc | 6 + include/osv/elf.hh | 7 + include/osv/percpu_xmit.hh | 13 ++ include/osv/sched.hh| 7 + 13 files changed, 202 insertions(+), 1 deletion(-) diff --git a/bsd/porting/uma_stub.cc b/bsd/porting/uma_stub.cc index cf320b13..a9d689c3 100644 --- a/bsd/porting/uma_stub.cc +++ b/bsd/porting/uma_stub.cc @@ -34,6 +34,12 @@ void * uma_zalloc_arg(uma_zone_t zone, void *udata, int flags) { void * ptr; +#if CONF_lazy_stack_invariant +assert(sched::preemptable() && arch::irq_enabled()); +#endif +#if CONF_lazy_stack +arch::ensure_next_stack_page(); +#endif WITH_LOCK(preempt_lock) { ptr = (*zone->percpu_cache)->alloc(); } @@ -101,6 +107,12 @@ void uma_zfree_arg(uma_zone_t zone, void *item, void *udata) zone->uz_dtor(item, zone->uz_size, udata); } +#if CONF_lazy_stack_invariant +assert(sched::preemptable() && arch::irq_enabled()); +#endif +#if CONF_lazy_stack +arch::ensure_next_stack_page(); +#endif WITH_LOCK(preempt_lock) { if ((*zone->percpu_cache)->free(item)) { return; diff --git a/bsd/sys/net/routecache.hh b/bsd/sys/net/routecache.hh index bdcdf496..e2140f06 100644 --- a/bsd/sys/net/routecache.hh +++ b/bsd/sys/net/routecache.hh @@ -162,6 +162,12 @@ public: // route.cc). 
assert(fibnum == 0); +#if CONF_lazy_stack_invariant +assert(sched::preemptable() && arch::irq_enabled()); +#endif +#if CONF_lazy_stack +arch::ensure_next_stack_page(); +#endif WITH_LOCK(osv::rcu_read_lock) { auto *c = cache.read(); auto entry = c->search(dst->sin_addr.s_addr); diff --git a/bsd/sys/netinet/arpcache.hh b/bsd/sys/netinet/arpcache.hh index 4556d40e..289a2f29 100644 --- a/bsd/sys/netinet/arpcache.hh +++ b/bsd/sys/netinet/arpcache.hh @@ -16,6 +16,7 @@ #include #include #include +#include #include #include @@ -98,6 +99,12 @@ struct arp_cache { boost::optional lookup(const in_addr ip) { +#if CONF_lazy_stack_invariant +assert(sched::preemptable() && arch::irq_enabled()); +#endif +#if CONF_lazy_stack +arch::ensure_next_stack_page(); +#endif WITH_LOCK(osv::rcu_read_lock) { auto i = _entries.reader_find(ip, std::hash(), entry_compare()); return boost::optional(!!i, *i); diff --git a/core/async.cc b/core/async.cc index 9f3d61d2..ada19fba 100644 --- a/core/async.cc +++ b/core/async.cc @@ -104,6 +104,12 @@ public: void insert(percpu_timer_task& task) { +#if CONF_lazy_stack_invariant +assert(arch::irq_enabled() && sched::preemptable()); +#endif +#if CONF_lazy_stack +arch::ensure_next_stack_page(); +#endif WITH_LOCK(preempt_lock) { trace_async_timer_task_insert(this, &task); @@ -125,6 +131,12 @@ public: percpu_timer_task& borrow_task() { +#if CONF_lazy_stack_invariant +assert(arch::irq_enabled() && sched::preemptable()); +#endif +#if CONF_lazy_stack +arch::ensure_next_stack_page(); +#endif WITH_LOCK(preempt_lock) { auto task = released_timer_tasks.pop(); if (task) { @@ -142,6 +154,12 @@ public: { auto task = new one_shot_task(std:
[osv-dev] [PATCH 2/8] lazy stack: do nothing in kernel threads and on populated stack
This patch annotates the relevant call sites with the invariant assert expressions to validate assumptions that let us do "nothing" in all these cases. We also reorganize some code in the scheduler to help differentiate between cases when given function/method is called with interrupts or preemption disabled or from kernel thread or by interrupt handler. Following methods get added to scheduler code with names describing state of interrupts or preemption or kernel caller: - timer_base::set_with_irq_disabled(osv::clock::uptime::time_point time) - timer_base::set_with_irq_disabled(std::chrono::duration duration) - thread::wake_with_irq_disabled() - thread::wake_with_irq_or_preemption_disabled(Action action) - thread_handle::wake_from_kernel_or_with_irq_disabled() In general: - we modify all interrupt handlers (those that are executed on interrupt stack) to call one of the 3 new wake_...() methods (mostly wake_with_irq_disabled()) to indicate we do not need/should not pre-fault the stack; most of those are in device drivers code - we modify all code executed on kernel threads that disables preemption or interrupts by adding relevant invariant - assert(!sched::thread::current()->is_app()); we do not need to pre-fault because the stack is populated - we also modify the code which is indirectly called from kernel threads like classifier::post_packet() in net channels - finally we also modify the scheduler code to use timer_base::set_with_irq_disabled() mostly around preemption_timer to indicate that we should not pre-fault the stack downstream Signed-off-by: Waldemar Kozaczuk --- arch/aarch64/exceptions.cc | 3 ++ arch/aarch64/interrupt.cc | 3 ++ arch/x64/exceptions.cc | 3 ++ arch/x64/mmu.cc| 2 +- arch/x64/msi.cc| 2 +- core/async.cc | 6 +++ core/epoll.cc | 2 +- core/mempool.cc| 12 -- core/net_channel.cc| 8 +++- core/rcu.cc| 5 ++- core/sched.cc | 79 ++ drivers/acpi.cc| 2 +- drivers/ahci.cc| 2 +- drivers/ahci.hh| 2 +- drivers/cadence-uart.cc| 2 +- drivers/isa-serial.cc | 2 
+- drivers/kbd.cc | 2 +- drivers/mmio-isa-serial.cc | 2 +- drivers/pl011.cc | 2 +- drivers/virtio-blk.cc | 6 +-- drivers/virtio-fs.cc | 4 +- drivers/virtio-net.cc | 6 +-- drivers/virtio-rng.cc | 2 +- drivers/virtio-scsi.cc | 2 +- drivers/virtio-vring.hh| 2 +- drivers/vmw-pvscsi.cc | 2 +- drivers/xenconsole.cc | 2 +- include/osv/net_channel.hh | 5 ++- include/osv/sched.hh | 16 include/osv/xen_intr.hh| 2 +- libc/signal.cc | 3 ++ libc/timerfd.cc| 3 ++ 32 files changed, 157 insertions(+), 39 deletions(-) diff --git a/arch/aarch64/exceptions.cc b/arch/aarch64/exceptions.cc index cadbb3a2..5c2c59ab 100644 --- a/arch/aarch64/exceptions.cc +++ b/arch/aarch64/exceptions.cc @@ -122,6 +122,9 @@ void interrupt_table::unregister_interrupt(interrupt *interrupt) bool interrupt_table::invoke_interrupt(unsigned int id) { +#if CONF_lazy_stack_invariant +assert(!arch::irq_enabled()); +#endif WITH_LOCK(osv::rcu_read_lock) { assert(id < this->nr_irqs); interrupt_desc *desc = this->irq_desc[id].read(); diff --git a/arch/aarch64/interrupt.cc b/arch/aarch64/interrupt.cc index b8337e23..e26e10ee 100644 --- a/arch/aarch64/interrupt.cc +++ b/arch/aarch64/interrupt.cc @@ -22,6 +22,9 @@ sgi_interrupt::~sgi_interrupt() void sgi_interrupt::send(sched::cpu* cpu) { +#if CONF_lazy_stack_invariant +assert(!arch::irq_enabled() || !sched::preemptable()); +#endif gic::gic->send_sgi(gic::sgi_filter::SGI_TARGET_LIST, cpu->arch.smp_idx, get_id()); } diff --git a/arch/x64/exceptions.cc b/arch/x64/exceptions.cc index 7c9eaf51..fbf6be65 100644 --- a/arch/x64/exceptions.cc +++ b/arch/x64/exceptions.cc @@ -220,6 +220,9 @@ void interrupt_descriptor_table::unregister_interrupt(gsi_level_interrupt *inter void interrupt_descriptor_table::invoke_interrupt(unsigned vector) { +#if CONF_lazy_stack_invariant +assert(!arch::irq_enabled()); +#endif WITH_LOCK(osv::rcu_read_lock) { unsigned i, nr_shared; bool handled = false; diff --git a/arch/x64/mmu.cc b/arch/x64/mmu.cc index 1af268c0..675410d0 100644 --- 
a/arch/x64/mmu.cc +++ b/arch/x64/mmu.cc @@ -64,7 +64,7 @@ std::atomic tlb_flush_pendingconfirms; inter_processor_interrupt tlb_flush_ipi{IPI_TLB_FLUSH, [] { mmu::flush_tlb_local(); if (tlb_flush_pendingconfirms.fetch_add(-1) == 1) { -tlb_flush_waiter.wake(); +tlb_flush_waiter.wake_from_kernel_or_with_irq_disabled(); } }}; diff --git a/arch/x64/msi.cc b/arch/x64/msi.cc index 9a28e3a5..cf0c3dc5 100644 --- a/arch/x64/msi.cc +++ b/arch/x64/msi.c
[osv-dev] [PATCH 1/8] lazy stack: inline assembly to pre-fault stack
importantly indirect usages of the call sites identified above. So this patch lays a ground work by defining the inline assembly to pre-fault the stack where necessary and introduces two build parameters - CONF_lazy_stack and CONF_lazy_stack_invariant - that are disabled by default. The first one is used in all places to enable the lazy stack logic and the second one is used to add code with some related invariants that will help us to reason about the code and whether we should do nothing, pre-fault stack "blindly" or conditionally. The remaining 7 patches mostly add the pre-fault code in relevant places but also annotate code with some invariants using assert(). Signed-off-by: Waldemar Kozaczuk --- Makefile | 3 ++- arch/aarch64/arch.hh | 14 ++ arch/x64/arch.hh | 13 + conf/base.mk | 3 +++ 4 files changed, 32 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index dab704af..fcc55e29 100644 --- a/Makefile +++ b/Makefile @@ -371,7 +371,8 @@ $(out)/bsd/%.o: INCLUDES += -isystem bsd/ # for machine/ $(out)/bsd/%.o: INCLUDES += -isystem bsd/$(arch) -configuration-defines = conf-preempt conf-debug_memory conf-logger_debug conf-debug_elf +configuration-defines = conf-preempt conf-debug_memory conf-logger_debug conf-debug_elf \ + conf-lazy_stack conf-lazy_stack_invariant configuration = $(foreach cf,$(configuration-defines), \ -D$(cf:conf-%=CONF_%)=$($(cf))) diff --git a/arch/aarch64/arch.hh b/arch/aarch64/arch.hh index abee0984..6756ed2c 100644 --- a/arch/aarch64/arch.hh +++ b/arch/aarch64/arch.hh @@ -20,6 +20,20 @@ namespace arch { #define INSTR_SIZE_MIN 4 #define ELF_IMAGE_START (OSV_KERNEL_VM_BASE + 0x1) +#if CONF_lazy_stack +inline void ensure_next_stack_page() { +u64 i, offset = -4096; +asm volatile("ldr %0, [sp, %1]" : "=r"(i) : "r"(offset)); +} + +inline void ensure_next_two_stack_pages() { +u64 i, offset = -4096; +asm volatile("ldr %0, [sp, %1]" : "=r"(i) : "r"(offset)); +offset = -8192; +asm volatile("ldr %0, [sp, %1]" : "=r"(i) : "r"(offset)); +} 
+#endif + inline void irq_disable() { processor::irq_disable(); diff --git a/arch/x64/arch.hh b/arch/x64/arch.hh index 17df5f5c..0ecc123c 100644 --- a/arch/x64/arch.hh +++ b/arch/x64/arch.hh @@ -20,6 +20,19 @@ namespace arch { #define INSTR_SIZE_MIN 1 #define ELF_IMAGE_START OSV_KERNEL_BASE +#if CONF_lazy_stack +inline void ensure_next_stack_page() { +char i; +asm volatile("movb -4096(%%rsp), %0" : "=r"(i)); +} + +inline void ensure_next_two_stack_pages() { +char i; +asm volatile("movb -4096(%%rsp), %0" : "=r"(i)); +asm volatile("movb -8192(%%rsp), %0" : "=r"(i)); +} +#endif + inline void irq_disable() { processor::cli(); diff --git a/conf/base.mk b/conf/base.mk index b4415a74..6b40da9f 100644 --- a/conf/base.mk +++ b/conf/base.mk @@ -13,3 +13,6 @@ conf-DEBUG_BUILD=0 conf-debug_elf=0 conf_hide_symbols=0 conf_linker_extra_options= + +conf-lazy_stack=0 +conf-lazy_stack_invariant=0 -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220831042433.140243-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] trace: do not use malloc/free when interrupts are disabled
This patch fixes a subtle bug found when working on lazy stack changes and trying to establish some invariants in relevant places in the kernel code. One of the findings was that the memory allocation logic or more specifically the functions malloc() and free() require both interrupts and preemption enabled to function correctly. If one starts analysis with memory::pool::alloc(), he/she will notice a DROP_LOCK(preempt_lock) executed under WITH_LOCK(preempt_lock) if the pool is empty. This means that the code assumes the preemption must be enabled before calling alloc(), otherwise the code in DROP_LOCK would run with preemption disabled which is contrary to the intention. Also, after drilling down the add_page() calling path, one eventually would find the call to reclaimers::wait() which calls thread::wait_until(). The thread::do_wait_until() called by the former enforces that both interrupts and preemption are enabled with an assertion. The bottom line of the analysis above is that both interrupts and preemption must be enabled before calling malloc() (same applies to free()). This effectively means that trying to call malloc() after disabling interrupts might not work (which would only happen if relevant pool was out of free pages AND l1 pool was empty and had to refill from l2 pool which probably is quite rare). So this patch changes the code in trace::create_trace_dump() to avoid calling new/malloc() (see copies.emplace_back()) and instead makes it preallocate the vector ahead of time and then simply copy the trace buffer to a relevant spot in the vector. 
Signed-off-by: Waldemar Kozaczuk --- core/trace.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/core/trace.cc b/core/trace.cc index 60fa18b9..dc69c807 100644 --- a/core/trace.cc +++ b/core/trace.cc @@ -697,7 +697,7 @@ std::string trace::create_trace_dump() { semaphore signal(0); -std::vector copies; +std::vector copies(sched::cpus.size()); auto is_valid_tracepoint = [](const tracepoint_base * tp_test) { for (auto & tp : tracepoint_base::tp_list) { @@ -717,7 +717,7 @@ trace::create_trace_dump() irq.save(); arch::irq_disable_notrace(); auto * tbp = percpu_trace_buffer.for_cpu(cpu); -copies.emplace_back(*tbp); +copies[i] = trace_buf(*tbp); irq.restore(); signal.post(); }, sched::thread::attr().pin(cpu))); -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220829011346.432532-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] aarch64 trace: make compiler pick a register instead of x0
This subtle 1-character patch fixes a nasty bug that causes interrupts to be enabled instead of correctly restored to the state it was when saving the state. This bug would affect the tracing logic and result in crashes described by the issues #1158 and #1195. This bug in inline assembly was most likely a typo as I am sure the intention was to use '%0' instead of 'x0' to let compiler correctly pick a register instead of using any garbage in the x0 register. Fixes #1158 Fixes #1195 Signed-off-by: Waldemar Kozaczuk --- arch/aarch64/arch.hh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/aarch64/arch.hh b/arch/aarch64/arch.hh index c07e..abee0984 100644 --- a/arch/aarch64/arch.hh +++ b/arch/aarch64/arch.hh @@ -79,7 +79,7 @@ inline void irq_flag_notrace::save() { } inline void irq_flag_notrace::restore() { -asm volatile("msr daif, x0" :: "r"(daif) : "memory"); +asm volatile("msr daif, %0" :: "r"(daif) : "memory"); } inline bool irq_flag_notrace::enabled() const { -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220828223629.429177-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] routecache: handle case when sent to our non-loopback address
mbuf*, int) core/net_trace.cc:143 netisr_queue_workstream(netisr_workstream*, unsigned int, netisr_work*, mbuf*, int*) [clone .constprop.0] bsd/sys/net/netisr.cc:633 netisr_queue_src bsd/sys/net/netisr.cc:684 netisr_queue_src bsd/sys/net/netisr.cc:712 if_simloop bsd/sys/net/if_loop.cc:291 ether_output bsd/sys/net/if_ethersubr.cc:256 ip_output(mbuf*, mbuf*, route*, int, ip_moptions*, inpcb*) bsd/sys/netinet/ip_output.cc:621 tcp_output bsd/sys/netinet/tcp_output.cc:1385 tcp_usr_connect bsd/sys/netinet/tcp_usrreq.cc:465 tcp_usr_connect bsd/sys/netinet/tcp_usrreq.cc:436 kern_connect bsd/sys/kern/uipc_syscalls.cc:374 So this patch fixes this problem by changing the search() method to handle the first scenario. In essence, instead of simply comparing the networks of the entry IP address and dst, it first identifies type of the device the entry is associated with. If non-loopback it does the same as before, if device is loopback if checks if dst is loopback in which case it is a match otherwise it compares full IP addresses. With the patch applied, in first scenario when the second external call is handled the entry device is checked which is loopback, and new logic returns null as full addresses would not match. This patch has been backported from the original one by Jan-Michael Kho contributed to the Spirent fork of OSv - https://github.com/SpirentOrion/osv/commit/f6e6e54ba14a7e9c1e04574769d0dd1832d66b92. Co-authored-by: "Jan-Michael Kho" Co-authored-by: Waldemar Kozaczuk Signed-off-by: Waldemar Kozaczuk --- bsd/sys/net/routecache.hh | 25 +++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/bsd/sys/net/routecache.hh b/bsd/sys/net/routecache.hh index 0c5c1f40..f71ac8f6 100644 --- a/bsd/sys/net/routecache.hh +++ b/bsd/sys/net/routecache.hh @@ -54,6 +54,7 @@ #include #include #include +#include #include @@ -88,6 +89,10 @@ public: #endif return *this; } + +bool is_loopback(void) const { +return (rt_ifp && (rt_ifp->if_flags & IFF_LOOPBACK)) ? 
true : false; +} }; // Silly routing table implementation, allowing search given address in list @@ -116,10 +121,26 @@ public: } entries.emplace_front(a, n, r); } +// address should be in host order +bool is_loopback_net(u32 address) const { +return ((address >> IN_CLASSA_NSHIFT) == IN_LOOPBACKNET) ? true : false; +} nonlockable_rtentry *search(u32 address) { for (silly_rtable_entry &e : entries) { -if ((e.address & e.netmask) == (address & e.netmask)) { -return &e.rte; +if (e.rte.is_loopback() == false) { +if ((e.address & e.netmask) == (address & e.netmask)) { +return &e.rte; +} +} else { +if (is_loopback_net(address)) { +return &e.rte; +} +// We shouldn't use this entry on IP addresses just because they're +// on the same network as our non-loopback address. So match the entire +// address. +if (e.address == address) { +return &e.rte; +} } } return nullptr; -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220723181515.293695-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] zfs: allow mounting and building on host
This requires OpenZFS installed on host (see https://openzfs.github.io/openzfs-docs/Getting%20Started/Fedora/index.html for Fedora and https://openzfs.github.io/openzfs-docs/Getting%20Started/Ubuntu/index.html for Ubuntu). In essence this patch adds new script zfs-image-on-host.sh that allows mounting and build ZFS images using OpenZFS without running OSv. It also modifies the build script to support new option: '--use-openzfs' that delegates to zfs-image-on-host.sh to build the ZFS image. Please see examples above: ./scripts/build image=native-example fs=zfs -j$(nproc) --use-openzfs ./scripts/build image=native-example fs=rofs_with_zfs -j$(nproc) --use-openzfs ./scripts/build image=native-example fs=rofs -j$(nproc) --use-openzfs --create-zfs-disk Fixes #1068 Signed-off-by: Waldemar Kozaczuk --- modules/zfs-tools/usr.manifest | 1 + scripts/build | 51 +--- scripts/imgedit.py | 8 ++ scripts/zfs-image-on-host.sh | 227 + 4 files changed, 269 insertions(+), 18 deletions(-) create mode 100755 scripts/zfs-image-on-host.sh diff --git a/modules/zfs-tools/usr.manifest b/modules/zfs-tools/usr.manifest index 8be0e5d0..ccc9becd 100644 --- a/modules/zfs-tools/usr.manifest +++ b/modules/zfs-tools/usr.manifest @@ -1,4 +1,5 @@ [manifest] /zpool.so: zpool.so +/zfs.so: zfs.so /libzfs.so: libzfs.so /libuutil.so: libuutil.so diff --git a/scripts/build b/scripts/build index 64a55516..df4b7c70 100755 --- a/scripts/build +++ b/scripts/build @@ -38,6 +38,7 @@ usage() { --append-manifest Append build//append.manifest to usr.manifest --create-disk Instead of usr.img create kernel-less disk.img --create-zfs-disk Create extra empty disk with ZFS filesystem + --use-openzfs Build and manipulate ZFS images using on host OpenZFS tools Examples: ./scripts/build -j4 fs=rofs image=native-example # Create image with native-example app @@ -79,7 +80,7 @@ do case $i in --help|-h) usage ;; - image=*|modules=*|fs=*|usrskel=*|check|--append-manifest|--create-disk|--create-zfs-disk) ;; + 
image=*|modules=*|fs=*|usrskel=*|check|--append-manifest|--create-disk|--create-zfs-disk|--use-openzfs) ;; clean) stage1_args=clean ;; arch=*) @@ -163,11 +164,13 @@ do vars[create_disk]="true";; --create-zfs-disk) vars[create_zfs_disk]="true";; + --use-openzfs) + vars[use_openzfs]="true";; esac done # fs_size_mb is in megabytes (1024*1024 bytes) -fs_size_mb=${vars[fs_size_mb]-256} +fs_size_mb=${vars[fs_size_mb]-512} # fs_size is in bytes fs_size=${vars[fs_size]-$(($fs_size_mb*1024*1024))} # size must be a multiple of 512. Round it down @@ -316,10 +319,17 @@ fi create_zfs_disk() { cp $bare $raw_disk.raw "$SRC"/scripts/imgedit.py setpartition "-f raw ${raw_disk}.raw" 2 $partition_offset $partition_size - qemu-img convert -f raw -O qcow2 $raw_disk.raw $qcow2_disk.img - qemu-img resize $qcow2_disk.img ${image_size}b >/dev/null 2>&1 - "$SRC"/scripts/upload_manifest.py --arch=$arch -o $qcow2_disk.img -m usr.manifest -D libgcc_s_dir="$libgcc_s_dir" - #"$SRC"/scripts/zfs-image-on-host.sh build $qcow2_disk.img $partition_offset osv zfs + if [[ ${vars[use_openzfs]} == "true" ]]; then + #We use raw disk on purpose so that zfs-image-on-host.sh can use loop device which is faster to copy files to + qemu-img resize ${raw_disk}.raw ${image_size}b >/dev/null 2>&1 + "$SRC"/scripts/zfs-image-on-host.sh build ${raw_disk}.raw 1 osv zfs true + qemu-img convert -f raw -O qcow2 $raw_disk.raw $qcow2_disk.img + else + qemu-img convert -f raw -O qcow2 $raw_disk.raw $qcow2_disk.img + qemu-img resize $qcow2_disk.img ${image_size}b >/dev/null 2>&1 + "$SRC"/scripts/upload_manifest.py --arch=$arch -o $qcow2_disk.img -m usr.manifest -D libgcc_s_dir="$libgcc_s_dir" + fi + rm ${raw_disk}.raw } create_rofs_disk() { @@ -332,18 +342,23 @@ create_rofs_disk() { create_zfs_filesystem() { local image_path=$1 - local device_path=$2 - local qemu_arch=$arch - if [[ "$qemu_arch" == 'aarch64' ]]; then - console='' - zfs_builder_name='zfs_builder.img' + if [[ ${vars[use_openzfs]} == "true" ]]; then + local 
partition=$3 + "$SRC"/scripts/zfs-image-on-host.sh build $image_path $partition osv zfs false else - qemu_arch='x86_64' -
[osv-dev] [PATCH] tests: fix misc-zfs-arc.cc
This patch also enhances this test to make it support running with ZFS mounted from non-root. Signed-off-by: Waldemar Kozaczuk --- bsd/sys/cddl/compat/opensolaris/sys/mman.h| 1 + .../opensolaris/uts/common/fs/zfs/arc.c | 4 +- modules/tests/Makefile| 11 +-- tests/misc-zfs-arc.cc | 86 --- 4 files changed, 63 insertions(+), 39 deletions(-) diff --git a/bsd/sys/cddl/compat/opensolaris/sys/mman.h b/bsd/sys/cddl/compat/opensolaris/sys/mman.h index ca746898..ec1f17ac 100644 --- a/bsd/sys/cddl/compat/opensolaris/sys/mman.h +++ b/bsd/sys/cddl/compat/opensolaris/sys/mman.h @@ -32,6 +32,7 @@ #include_next +#undef mmap64 #definemmap64(_a,_b,_c,_d,_e,_f) mmap(_a,_b,_c,_d,_e,_f) #endif diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index 15b7a59d..ba339ed1 100644 --- a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -134,6 +134,7 @@ #include #include +#include #ifdef illumos #ifndef _KERNEL @@ -444,6 +445,7 @@ static arc_stats_t arc_stats = { } \ } +OSV_LIB_SOLARIS_API kstat_t*arc_ksp; static arc_state_t *arc_anon; static arc_state_t *arc_mru; @@ -2328,7 +2330,7 @@ arc_flush(spa_t *spa) ASSERT(spa || arc_eviction_list == NULL); } -void +OSV_LIB_SOLARIS_API void arc_shrink(void) { if (arc_c > arc_c_min) { diff --git a/modules/tests/Makefile b/modules/tests/Makefile index d1732e75..d084e357 100644 --- a/modules/tests/Makefile +++ b/modules/tests/Makefile @@ -134,13 +134,9 @@ tests := tst-pthread.so misc-ramdisk.so tst-vblk.so tst-bsd-evh.so \ tst-elf-init.so tst-realloc.so tst-setjmp.so \ libtls.so libtls_gold.so tst-tls.so tst-tls-gold.so tst-tls-pie.so \ tst-sigaction.so tst-syscall.so tst-ifaddrs.so tst-getdents.so \ - tst-netlink.so misc-zfs-io.so + tst-netlink.so misc-zfs-io.so misc-zfs-arc.so # libstatic-thread-variable.so tst-static-thread-variable.so \ -#TODO For now let us disable these tests for aarch64 until -# we 
support floating point numbers, TLS and correct syscall handling -# The tst-ifaddrs.so is an exception and it does not compile due to some -# missing headers ifeq ($(arch),x64) tests += tst-mmx-fpu.so endif @@ -222,10 +218,9 @@ tests += $(boost-tests) solaris-tests := tst-solaris-taskq.so -# FIXME: two of the test below can't compile now because of include path -# (BSD and OSv header files get mixed up, etc.). +#FIXME: the misc-zfs-disk.c does not compile due to some header issues #zfs-tests := misc-zfs-disk.so misc-zfs-io.so misc-zfs-arc.so -zfs-tests := misc-zfs-io.so +zfs-tests := misc-zfs-io.so misc-zfs-arc.so solaris-tests += $(zfs-tests) $(zfs-tests:%=$(out)/tests/%): COMMON+= \ diff --git a/tests/misc-zfs-arc.cc b/tests/misc-zfs-arc.cc index b24dd56b..24ce2e26 100644 --- a/tests/misc-zfs-arc.cc +++ b/tests/misc-zfs-arc.cc @@ -5,9 +5,6 @@ * BSD license as described in the LICENSE file in the top-level directory. */ -#include -#include -#include #include #include "stat.hh" @@ -19,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -31,6 +29,9 @@ typedef u_long ulong_t; #include #include +#include +#include + #define MB (1024 * 1024) using namespace std; @@ -52,7 +53,7 @@ struct arc_data { uint64_t size; }; -static mutex_t kstat_map_mutex; +static pthread_mutex_t kstat_map_mutex; static unordered_map kstat_map; static struct kstat_named *kstat_map_lookup(const char *name) @@ -86,18 +87,20 @@ static uint64_t *get_kstat_by_name(const kstat_t *ksp, const char *name) assert(ksp && ksp->ks_data); -WITH_LOCK(kstat_map_mutex) { -knp = kstat_map_lookup(name); +pthread_mutex_lock(&kstat_map_mutex); +knp = kstat_map_lookup(name); -/* If knp is NULL, kstat_named wasn't found in the hash */ +/* If knp is NULL, kstat_named wasn't found in the hash */ +if (!knp) { +/* Then do the manual search and insert it into the hash */ +knp = kstat_map_insert(ksp, name); if (!knp) { -/* Then do the manual search and insert it into the hash */ -knp = 
kstat_map_insert(ksp, name); -if (!knp) { -return 0; -} +pthread_mutex_unlock(&kstat_map_mutex); +return 0; } } +pthread_mutex_unlock(&kstat_map_mutex); + assert(knp->data_type == KSTAT_DATA_UINT64); return &(knp->value.ui64); @@ -297,7 +300,6 @@ static int run_test(const kstat_t *ksp, int argc, char **argv) struct arc_data data; struct stat st; char path[PATH_MAX]; -int
[osv-dev] [PATCH] zpool import: do not try devices where ZFS is mounted
Signed-off-by: Waldemar Kozaczuk --- .../lib/libzfs/common/libzfs_import.c | 48 +-- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/bsd/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c b/bsd/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c index 1f8fe36f..908d4b4d 100644 --- a/bsd/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c +++ b/bsd/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_import.c @@ -1127,6 +1127,7 @@ zpool_clear_label(int fd) * poolname or guid (but not both) are provided by the caller when trying * to import a specific pool. */ +#define MAX_MOUNTED_DEVS 64 static nvlist_t * zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) { @@ -1146,6 +1147,8 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) avl_tree_t slice_cache; rdsk_node_t *slice; void *cookie; + char *excluded_dev_names[MAX_MOUNTED_DEVS]; + int excluded_dev_count = 0; if (dirs == 0) { dirs = 1; @@ -1229,6 +1232,28 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) } #endif +#ifdef __OSV__ + // Iterate over all mounts and identify the devices we wanted to exlude + FILE *ent = fopen("/etc/fstab", "r"); + if (ent) { + struct mnttab m; + while (getmntent(ent, &m) == 0) { + if (strcmp("none", m.mnt_special) == 0) { + continue; + } + char *dev_name = excluded_dev_names[excluded_dev_count++] = strdup(m.mnt_special + 5); + // If the device has a '.' in it it means it corresponds to a disk partion + // and in this case we should skip the parent disk as well as it will make + // the pool discovery slow. 
For example for 'vblk0.1' exclude 'vblk0' as well but + // not 'vblk0.2' + char *dot_pos = strchr(dev_name, '.'); + if (dot_pos) { + excluded_dev_names[excluded_dev_count++] = strndup(dev_name, dot_pos - dev_name); + } + } + fclose(ent); + } +#endif /* * This is not MT-safe, but we have no MT consumers of libzfs */ @@ -1238,12 +1263,20 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) (name[1] == 0 || (name[1] == '.' && name[2] == 0))) continue; #ifdef __OSV__ - /* In OSv, mount_zfs_roofs() always mounts /dev/vblk0.1 -* before calling zpool import, so this device is -* already mounted, and trying to do it again while -* it is already mounted is surprisingly slow. + /* Trying to call zpool import on a device that we +* have already mounted ZFS root pool from before, +* is surprisingly slow. So let us try to avoid it +* by filtering it out using a list of mounted devices +* identified before in excluded_dev_names. */ - if (!strcmp(name, "vblk0.1")) + bool skip_entry = false; + for (int i = 0; i < excluded_dev_count; i++) { + if (!strcmp(name, excluded_dev_names[i])) { + skip_entry = true; + break; + } + } + if (skip_entry) continue; #endif @@ -1255,6 +1288,11 @@ zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) slice->rn_nozpool = B_FALSE; avl_add(&slice_cache, slice); } +#ifdef __OSV__ + while (excluded_dev_count) { + free(excluded_dev_names[--excluded_dev_count]); + } +#endif #ifndef __OSV__ skipdir: -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220717053624.96106-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] tests: add misc-zfs-io to all images
Signed-off-by: Waldemar Kozaczuk --- modules/tests/Makefile | 4 ++-- tests/misc-zfs-io.cc | 6 -- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/modules/tests/Makefile b/modules/tests/Makefile index 7d15522c..d1732e75 100644 --- a/modules/tests/Makefile +++ b/modules/tests/Makefile @@ -91,7 +91,7 @@ rofs-only-tests := rofs/tst-chdir.so rofs/tst-symlink.so rofs/tst-readdir.so \ rofs/tst-concurrent-read.so zfs-only-tests := tst-readdir.so tst-fallocate.so tst-fs-link.so \ - tst-concurrent-read.so misc-zfs-io.so tst-solaris-taskq.so + tst-concurrent-read.so tst-solaris-taskq.so specific-fs-tests := $($(fs_type)-only-tests) @@ -134,7 +134,7 @@ tests := tst-pthread.so misc-ramdisk.so tst-vblk.so tst-bsd-evh.so \ tst-elf-init.so tst-realloc.so tst-setjmp.so \ libtls.so libtls_gold.so tst-tls.so tst-tls-gold.so tst-tls-pie.so \ tst-sigaction.so tst-syscall.so tst-ifaddrs.so tst-getdents.so \ - tst-netlink.so + tst-netlink.so misc-zfs-io.so # libstatic-thread-variable.so tst-static-thread-variable.so \ #TODO For now let us disable these tests for aarch64 until diff --git a/tests/misc-zfs-io.cc b/tests/misc-zfs-io.cc index 96c4194d..ba0b81d0 100644 --- a/tests/misc-zfs-io.cc +++ b/tests/misc-zfs-io.cc @@ -72,7 +72,7 @@ static void seq_read(int fd, char *buf, unsigned long size, unsigned long offset int main(int argc, char **argv) { -char fpath[64] = "/zfs-io-file"; +const char *fpath = "/zfs-io-file"; char buf[BUF_SIZE]; unsigned size; int fd; @@ -90,6 +90,8 @@ int main(int argc, char **argv) all_cached = true; } else if (!strcmp("--no-unlink", argv[i])) { unlink_file = false; +} else if (!strcmp("--file-path", argv[i]) && (i + 1) < argc) { +fpath = argv[i + 1]; } } @@ -140,7 +142,7 @@ int main(int argc, char **argv) close(fd); if (unlink_file) { -unlink("/zfs-io-file"); +unlink(fpath); } return 0; -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. 
To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220717053540.96073-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] zfs: support building rofs+zfs image and second ZFS disk
This patch enhances the build scripts and run.py to allow building the images as described by #1200: 1. Run OSv from a single disk with two partitions: ROFS + ZFS (on /dev/vblk0.2) ./scripts/build image=tests,zfs,zfs-tools fs=rofs_with_zfs fs_size_mb=5000 ./scripts/run.py --execute='--mount-fs=zfs,/dev/vblk0.2,/data /tests/misc-zfs-io.so --random --file-path /data/file' 2. Run OSv with 2 disks: 1st one with ROFS and second one with ZFS (/dev/vblk1.1): ./scripts/build image=tests,zfs,zfs-tools fs=rofs fs_size_mb=5000 --create-zfs-disk ./scripts/run.py --execute='--mount-fs=zfs,/dev/vblk1.1,/data /tests/misc-zfs-io.so --random --file-path /data/file' --second-disk-image build/release/zfs_disk.img Fixes #1200 Signed-off-by: Waldemar Kozaczuk --- scripts/build | 59 ++ scripts/export_manifest.py | 7 ++--- scripts/run.py | 12 tools/mkfs/mkfs.cc | 8 +++--- 4 files changed, 73 insertions(+), 13 deletions(-) diff --git a/scripts/build b/scripts/build index b31b8172..64a55516 100755 --- a/scripts/build +++ b/scripts/build @@ -26,7 +26,8 @@ usage() { mode=release|debugSpecify the build mode; default is release export=none|selected|all If 'selected' or 'all' export the app files to export_dir= The directory to export the files to; default is build/export - fs=zfs|rofs|ramfs|virtiofsSpecify the filesystem of the image partition + fs=zfs|rofs|rofs_with_zfs|Specify the filesystem of the image partition +ramfs|virtiofs fs_size=N Specify the size of the image in bytes fs_size_mb=N Specify the size of the image in MiB app_local_exec_tls_size=N Specify the size of app local TLS in bytes; the default is 64 @@ -36,6 +37,7 @@ usage() { -j Set number of parallel jobs for make --append-manifest Append build//append.manifest to usr.manifest --create-disk Instead of usr.img create kernel-less disk.img + --create-zfs-disk Create extra empty disk with ZFS filesystem Examples: ./scripts/build -j4 fs=rofs image=native-example # Create image with native-example app @@ -77,7 +79,7 @@ do case $i in 
--help|-h) usage ;; - image=*|modules=*|fs=*|usrskel=*|check|--append-manifest|--create-disk) ;; + image=*|modules=*|fs=*|usrskel=*|check|--append-manifest|--create-disk|--create-zfs-disk) ;; clean) stage1_args=clean ;; arch=*) @@ -159,6 +161,8 @@ do vars[append_manifest]="true";; --create-disk) vars[create_disk]="true";; + --create-zfs-disk) + vars[create_zfs_disk]="true";; esac done @@ -195,7 +199,7 @@ usrskel_arg= case $fs_type in zfs) ;; # Nothing to change here. This is our default behavior -rofs|virtiofs) +rofs|rofs_with_zfs|virtiofs) # Both are read-only (in OSv) and require nothing extra on bootfs to work manifest=bootfs_empty.manifest.skel usrskel_arg="--usrskel usr_rofs.manifest.skel";; @@ -293,6 +297,7 @@ cd $OUT if [ "$export" != "none" ]; then export_dir=${vars[export_dir]-$SRC/build/export} + rm -rf "$export_dir" "$SRC"/scripts/export_manifest.py -e "$export_dir" -m usr.manifest -D libgcc_s_dir="$libgcc_s_dir" fi @@ -314,6 +319,7 @@ create_zfs_disk() { qemu-img convert -f raw -O qcow2 $raw_disk.raw $qcow2_disk.img qemu-img resize $qcow2_disk.img ${image_size}b >/dev/null 2>&1 "$SRC"/scripts/upload_manifest.py --arch=$arch -o $qcow2_disk.img -m usr.manifest -D libgcc_s_dir="$libgcc_s_dir" + #"$SRC"/scripts/zfs-image-on-host.sh build $qcow2_disk.img $partition_offset osv zfs } create_rofs_disk() { @@ -324,6 +330,22 @@ create_rofs_disk() { qemu-img convert -f raw -O qcow2 $raw_disk.raw $qcow2_disk.img } +create_zfs_filesystem() { + local image_path=$1 + local device_path=$2 + local qemu_arch=$arch + if [[ "$qemu_arch" == 'aarch64' ]]; then + console='' + zfs_builder_name='zfs_builder.img' + else + qemu_arch='x86_64' + console='--console=serial' + zfs_builder_name='zfs_builder-stripped.elf' + fi + "$SRC"/scripts/run.py -k --kernel-path $zfs_builder_name --arch=$qemu_arch --vnc none -m 512 -c1 -i ${image_path} \ + --block-device-cache unsafe -s -e "${console} --norandom --nomount --noinit --preloa
[osv-dev] [PATCH] vfs: support mounting ZFS from non-root partition
This patch mainly enhances the VFS layer to support dynamically loading the ZFS library libsolaris.so from another root filesystem like ROFS and mounting a ZFS filesystem from devices different than /dev/vblk0.1. The supported scenarios include specifying a loader option '--mount-fs' or adding an entry to /etc/fstab. In this patch we take advantage of the existing logic in VFS pivot_rootfs() that implicitly loads shared libraries from the directory /usr/lib/fs which is where we place libsolaris.so in the image. This was done as part of the commit 4ffb0fa9329849cd587d62f91b6979bc0e0ce6d1 to support dynamically loading the NFS filesystem library. To support a similar scenario with ZFS we need, on top of this, to enhance mount_fs() to detect the ZFS case, call zfsdev_init() and initialize the BSD shrinker. We also enhance unmount_rootfs() to make it unmount ZFS from non-root mount points. This patch also adds a new module - zfs - which is intended to be used when building two types of images as described by #1200. Please note the next patch will enhance the build script to support building such images. 1. Run OSv from a single disk with two partitions: ROFS + ZFS (on /dev/vblk0.2) ./scripts/run.py --execute='--mount-fs=zfs,/dev/vblk0.2,/data /zpool.so list' 2. 
Run OSv with 2 disks: 1st one with ROFS and second one with ZFS (/dev/vblk1.1): ./scripts/run.py --execute='--mount-fs=zfs,/dev/vblk1.1,/data /zpool.so list' --second-disk-image build/release/zfs_disk.img Refs #1200 Signed-off-by: Waldemar Kozaczuk --- Makefile | 2 +- bootfs.manifest.skel | 2 +- drivers/zfs.cc | 12 - exported_symbols/osv_libsolaris.so.symbols | 1 + fs/vfs/main.cc | 52 +++--- fs/zfs/zfs_initialize.c| 6 +++ fs/zfs/zfs_null_vfsops.cc | 5 ++- loader.cc | 34 ++ modules/zfs-tools/usr.manifest | 1 - modules/zfs/usr.manifest | 2 + scripts/upload_manifest.py | 2 +- usr_rofs.manifest.skel | 1 + zfs_builder_bootfs.manifest.skel | 2 +- 13 files changed, 100 insertions(+), 22 deletions(-) create mode 100644 modules/zfs/usr.manifest diff --git a/Makefile b/Makefile index 6ac9c792..0625524b 100644 --- a/Makefile +++ b/Makefile @@ -2383,7 +2383,7 @@ $(out)/bsd/cddl/contrib/opensolaris/lib/libzfs/common/zprop_common.o: bsd/sys/cd $(out)/libzfs.so: $(libzfs-objects) $(out)/libuutil.so $(out)/libsolaris.so $(makedir) - $(call quiet, $(CC) $(CFLAGS) -o $@ $(libzfs-objects) -L$(out) -luutil -lsolaris, LINK libzfs.so) + $(call quiet, $(CC) $(CFLAGS) -o $@ $(libzfs-objects) -L$(out) -luutil, LINK libzfs.so) #include $(src)/bsd/cddl/contrib/opensolaris/cmd/zpool/build.mk: zpool-cmd-file-list = zpool_iter zpool_main zpool_util zpool_vdev diff --git a/bootfs.manifest.skel b/bootfs.manifest.skel index bab4c606..5e71fb3b 100644 --- a/bootfs.manifest.skel +++ b/bootfs.manifest.skel @@ -1,2 +1,2 @@ [manifest] -/libsolaris.so: libsolaris.so +/usr/lib/fs/libsolaris.so: libsolaris.so diff --git a/drivers/zfs.cc b/drivers/zfs.cc index fb335340..02c011ac 100644 --- a/drivers/zfs.cc +++ b/drivers/zfs.cc @@ -61,16 +61,24 @@ zfs_device::~zfs_device() device_destroy(_zfs_dev); } +static bool zfsdev_initialized = false; + void zfsdev_init(void) { -new zfs_device(); +if (!zfsdev_initialized) { +new zfs_device(); +zfsdev_initialized = true; +} } } extern "C" OSV_LIBSOLARIS_API void 
zfsdev_init() { -new zfsdev::zfs_device(); +if (!zfsdev::zfsdev_initialized) { +new zfsdev::zfs_device(); +zfsdev::zfsdev_initialized = true; +} } diff --git a/exported_symbols/osv_libsolaris.so.symbols b/exported_symbols/osv_libsolaris.so.symbols index c115cb02..9b959675 100644 --- a/exported_symbols/osv_libsolaris.so.symbols +++ b/exported_symbols/osv_libsolaris.so.symbols @@ -91,4 +91,5 @@ vrele vttoif_tab wakeup zfsdev_init +zfs_driver_initialized zfs_update_vfsops diff --git a/fs/vfs/main.cc b/fs/vfs/main.cc index 8fa99b00..df2becbe 100644 --- a/fs/vfs/main.cc +++ b/fs/vfs/main.cc @@ -62,6 +62,7 @@ #include #include #include +#include #include #include "vfs.h" @@ -83,6 +84,9 @@ #include #include +#include "drivers/zfs.hh" +#include "bsd/porting/shrinker.h" + using namespace std; @@ -2493,6 +2497,18 @@ static void mount_fs(mntent *m) return; } +bool zfs = strcmp(m->mnt_type, "zfs") == 0; +if (zfs) { +// Ignore if ZFS root pool is already mounted because we can only have one root pool +std::vector mounts = osv::current_mounts(); +for (auto &mount : mounts) { +if (mount.type == "zfs" &&am
[osv-dev] [PATCH] devfs: print details of mounted partition
Print information about the partition and offset that each child device is created for. This is very helpful to understand the process of mounting the filesystems. Signed-off-by: Waldemar Kozaczuk --- fs/devfs/device.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/devfs/device.cc b/fs/devfs/device.cc index 3a5c9f7c..4730f411 100644 --- a/fs/devfs/device.cc +++ b/fs/devfs/device.cc @@ -142,6 +142,8 @@ void read_partition_table(struct device *dev) new_dev->max_io_size = dev->max_io_size; new_dev->private_data = dev->private_data; device_set_softc(new_dev, device_get_softc(dev)); + + kprintf("devfs: created device %s for a partition at offset:%ld with size:%ld\n", dev_name, new_dev->offset, new_dev->size); } sched_unlock(); -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220717043714.95626-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] zfs: refactor loader to handle loading libsolaris.so
This patch refactors the code that loads libsolaris.so to mount ZFS filesystem by extracting the common code into the load_zfs_library_and_mount_zfs_root() function. This will help us enhance the loader and VFS code to support mounting ZFS filesystem from devices different than /dev/vblk0.1. Refs #1200 Signed-off-by: Waldemar Kozaczuk --- loader.cc | 61 --- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/loader.cc b/loader.cc index ee05033b..3db560e1 100644 --- a/loader.cc +++ b/loader.cc @@ -406,6 +406,31 @@ static void stop_all_remaining_app_threads() } } +static void load_zfs_library_and_mount_zfs_root(const char* mount_error_msg, bool pivot_when_error = false) +{ +// Load and initialize ZFS filesystem driver implemented in libsolaris.so +const auto libsolaris_file_name = "libsolaris.so"; +//TODO: Consider calling dlclose() somewhere after ZFS is unmounted +if (dlopen(libsolaris_file_name, RTLD_LAZY)) { +zfsdev::zfsdev_init(); +auto error = mount_zfs_rootfs(opt_pivot, opt_extra_zfs_pools); +if (error) { +debug(mount_error_msg); +if (pivot_when_error) { +// Continue with ramfs (already mounted) +// TODO: Avoid the hack of using pivot_rootfs() just for +// mounting the fstab entries. 
+pivot_rootfs("/"); +} +} else { +bsd_shrinker_init(); +boot_time.event("ZFS mounted"); +} +} else { +debug("Could not load and/or initialize %s.\n", libsolaris_file_name); +} +} + void* do_main_thread(void *_main_args) { auto app_cmdline = static_cast(_main_args); @@ -424,7 +449,6 @@ void* do_main_thread(void *_main_args) if (opt_mount) { unmount_devfs(); -const auto libsolaris_file_name = "libsolaris.so"; if (opt_rootfs.compare("rofs") == 0) { auto error = mount_rofs_rootfs(opt_pivot); if (error) { @@ -437,20 +461,7 @@ void* do_main_thread(void *_main_args) } boot_time.event("ROFS mounted"); } else if (opt_rootfs.compare("zfs") == 0) { -//Initialize ZFS filesystem driver implemented in libsolaris.so -//TODO: Consider calling dlclose() somewhere after ZFS is unmounted -if (dlopen(libsolaris_file_name, RTLD_LAZY)) { -zfsdev::zfsdev_init(); -auto error = mount_zfs_rootfs(opt_pivot, opt_extra_zfs_pools); -if (error) { -debug("Could not mount zfs root filesystem.\n"); -} - -bsd_shrinker_init(); -boot_time.event("ZFS mounted"); -} else { -debug("Could not load and/or initialize %s.\n", libsolaris_file_name); -} +load_zfs_library_and_mount_zfs_root("Could not mount zfs root filesystem.\n"); } else if (opt_rootfs.compare("ramfs") == 0) { // NOTE: The ramfs is already mounted, we just need to mount fstab // entries. That's the only difference between this and --nomount. 
@@ -476,25 +487,7 @@ void* do_main_thread(void *_main_args) } else if (mount_virtiofs_rootfs(opt_pivot) == 0) { boot_time.event("Virtio-fs mounted"); } else { -//Initialize ZFS filesystem driver implemented in libsolaris.so -//TODO: Consider calling dlclose() somewhere after ZFS is unmounted -if (dlopen("libsolaris.so", RTLD_LAZY)) { -zfsdev::zfsdev_init(); -auto error = mount_zfs_rootfs(opt_pivot, opt_extra_zfs_pools); -if (error) { -debug("Could not mount zfs root filesystem (while " - "auto-discovering).\n"); -// Continue with ramfs (already mounted) -// TODO: Avoid the hack of using pivot_rootfs() just for -// mounting the fstab entries. -pivot_rootfs("/"); -} else { -bsd_shrinker_init(); -boot_time.event("ZFS mounted"); -} -} else { -debug("Could not load and/or initialize %s.\n", libsolaris_file_name); -} +load_zfs_library_and_mount_zfs_root("Could not mount zfs root filesystem (while auto-discovering).\n", true); } } } -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe
[osv-dev] [PATCH] zfs: use spa_dev_path instead of defaulting to /dev/vblk0.1
The commit c9640a385c44704626a9169c03cff0752bfe764d addressing the issue #918, tweaked the vdev disk mounting logic to default to import the root pool from the device /dev/vblk0.1. This was really a hack that was satisfactory to support mounting a ZFS image created or modified on host. However, if we want to be able to import root pool and mount ZFS filesystem from arbitrary device and partition like /dev/vblk0.2 or /dev/vblk1.1, we have to pass the specific device path to all places in ZFS code where it references it. There are 4 code paths that end up calling vdev_alloc() but unfortunately changing all relevant functions and its callers to pass the device path would be quite untenable. So instead, this patch adds new field spa_dev_path to the spa structure that holds the information about the Storage Pool Allocator in memory. This new field is set to point to the device we want to import the ZFS root pool from in spa_import_rootpool() function called by ZFS mount disk process and then used by vdev_alloc() downstream. Refs #1200 Signed-off-by: Waldemar Kozaczuk --- bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c | 1 + .../cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c | 1 + .../contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h | 1 + bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c | 8 ++-- .../contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c | 3 --- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index 6cee8352..70f0c5b1 100644 --- a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -4206,6 +4206,7 @@ spa_import_rootpool(const char *name) } spa->spa_is_root = B_TRUE; spa->spa_import_flags = ZFS_IMPORT_VERBATIM; + spa->spa_dev_path = name; /* * Build up a vdev tree based on the boot device's label config. 
diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c index 2ea8b577..b61c308c 100644 --- a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c +++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c @@ -464,6 +464,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot) spa->spa_load_max_txg = UINT64_MAX; spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; + spa->spa_dev_path = NULL; refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h index 043370e4..dfd0fc89 100644 --- a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h +++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h @@ -243,6 +243,7 @@ struct spa { #ifndef sun boolean_t spa_splitting_newspa; /* creating new spa in split */ #endif + const char *spa_dev_path; /* device spa is mounted */ }; extern const char *spa_config_path; diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c index 2a265f7a..9dc2278b 100644 --- a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -442,8 +442,12 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, vd->vdev_islog = islog; vd->vdev_nparity = nparity; - if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) - vd->vdev_path = spa_strdup(vd->vdev_path); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) { + if (spa->spa_dev_path) + vd->vdev_path = strdup(spa->spa_dev_path); + else + vd->vdev_path = spa_strdup(vd->vdev_path); + } if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) vd->vdev_devid = spa_strdup(vd->vdev_devid); if 
(nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, diff --git a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c index 3d0d6324..650c969c 100644 --- a/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c +++ b/bsd/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_disk.c @@ -74,9 +74,6 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, dvd = vd->vdev_tsd = kmem_zalloc(sizeof(struct vdev_disk), KM_SLEEP); device_name = vd->vdev_path + 5; -
[osv-dev] [PATCH] syscalls: allocate local iovec copies on the stack instead of the heap
This commit is mostly based on the one from the Spirent fork of OSv - https://github.com/SpirentOrion/osv/commit/29e4d2bbc23d6ddbf8d4b065fc3388c9931e705a - and its original description reads: "Multiple syscalls used std::vector to manage local iovec copies. For our uses cases, this is totally unnecessary overhead and results in 1000's of small object memory allocations. So, just use the stack instead." In a nutshell, this patch slightly optimizes 4 functions - 2 that are part of the networking stack and 2 others in the VFS layer - by tweaking them to use the stack instead of malloc()/free(), which are relatively costly. Please note that the objects in question - copies of iovec - are pretty tiny, typically 16 bytes in size so it does make sense to use stack. Obviously it is hard to tell without measuring how significant this change is in terms of performance benefits and what use cases it would benefit. I did however find two use cases where I could observe some significant decrease of malloc invocations. The first is actually the cpiod app used to upload ZFS files during upload file which seems to be hitting the VFS code path in question. In this case I could see ~25% drop of malloc/free invocations. The second use case involved a test misc-tcp.cc which sends and receives data over socket in multiple threads and seems to be hitting the networking stack paths. In this case I could see 15-6% drop. The program was called like so: ./scripts/run.py -e '/tests/misc-tcp.so --remote=192.168.122.1 -c 10 -n 10 -l 5' with netcat running like so: ncat -l -k -p -e /bin/cat This patch also updates the test makefile to make it build misc-tcp.so after kernel no longer includes program options. It also slightly updates the test itself to output some helpful information in progress. Co-authored-by: "Timmons C. 
Player" Co-authored-by: Waldemar Kozaczuk Signed-off-by: Waldemar Kozaczuk --- bsd/sys/kern/uipc_syscalls.cc | 26 -- fs/vfs/vfs_syscalls.cc| 14 ++ modules/tests/Makefile| 8 ++-- tests/misc-tcp.cc | 12 +--- 4 files changed, 41 insertions(+), 19 deletions(-) diff --git a/bsd/sys/kern/uipc_syscalls.cc b/bsd/sys/kern/uipc_syscalls.cc index 9f8db3f6..1ae4090e 100644 --- a/bsd/sys/kern/uipc_syscalls.cc +++ b/bsd/sys/kern/uipc_syscalls.cc @@ -494,10 +494,12 @@ kern_sendit(int s, so = (struct socket *)file_data(fp); // Create a local copy of the user's iovec - sosend() is going to change it! - std::vector uio_iov(mp->msg_iov, mp->msg_iov + mp->msg_iovlen); + assert(mp->msg_iovlen <= UIO_MAXIOV); + struct iovec uio_iov[mp->msg_iovlen]; + memcpy(uio_iov, mp->msg_iov, sizeof(uio_iov)); - auio.uio_iov = uio_iov.data(); - auio.uio_iovcnt = uio_iov.size(); + auio.uio_iov = uio_iov; + auio.uio_iovcnt = mp->msg_iovlen;; auio.uio_rw = UIO_WRITE; auio.uio_offset = 0;/* XXX */ auio.uio_resid = 0; @@ -585,10 +587,12 @@ kern_recvit(int s, struct msghdr *mp, struct mbuf **controlp, ssize_t* bytes) so = (socket*)file_data(fp); // Create a local copy of the user's iovec - sorecieve() is going to change it! - std::vector uio_iov(mp->msg_iov, mp->msg_iov + mp->msg_iovlen); + assert(mp->msg_iovlen <= UIO_MAXIOV); + struct iovec uio_iov[mp->msg_iovlen]; + memcpy(uio_iov, mp->msg_iov, sizeof(uio_iov)); - auio.uio_iov = uio_iov.data(); - auio.uio_iovcnt = uio_iov.size(); + auio.uio_iov = uio_iov; + auio.uio_iovcnt = mp->msg_iovlen; auio.uio_rw = UIO_READ; auio.uio_offset = 0;/* XXX */ auio.uio_resid = 0; @@ -653,7 +657,7 @@ out: if (fromsa) free(fromsa); - if (error == 0 && controlp != NULL) + if (error == 0 && controlp != NULL) *controlp = control; else if (control) m_freem(control); @@ -1038,10 +1042,12 @@ zcopy_tx(int s, struct zmsghdr *zm) if (so->so_type != SOCK_STREAM) return (EINVAL); // Create a local copy of the user's iovec - sosend() is going to change it! 
- std::vector uio_iov(mp->msg_iov, mp->msg_iov + mp->msg_iovlen); + assert(mp->msg_iovlen <= UIO_MAXIOV); + struct iovec uio_iov[mp->msg_iovlen]; + memcpy(uio_iov, mp->msg_iov, sizeof(uio_iov)); - auio.uio_iov = uio_iov.data(); - auio.uio_iovcnt = uio_iov.size(); + auio.uio_iov = uio_iov; + auio.uio_iovcnt = mp->msg_iovlen; auio.uio_rw = UIO_WRITE; auio.uio_offset = 0; auio.uio_resid = 0; diff --git a/fs/vfs/vfs_syscalls.cc b/fs/vfs/vfs_syscalls.cc index cd0d1745..055a32c7 100644 --- a/fs/vfs/vfs_syscalls.cc +++ b/fs/vfs/vfs_syscalls.cc
[osv-dev] [PATCH] virtio-blk: Use multiplex strategy for I/O
This allows arbitrarily large block sizes to be used for I/O requests and matches the behavior of most other block drivers. This commit is mostly based on the one from the Spirent fork of OSv - https://github.com/SpirentOrion/osv/commit/45306415040521ac875ec7f6ba0ad6671ea8ad11. However, it differs slightly by handling the situation when the VIRTIO_BLK_F_SEG_MAX capability is not detected. In this case the driver behaves effectively as if there was no limit on the size of the request which was the original behavior. It is worth noting that we already had a check on the size of the request at runtime and would return EIO error (see make_request() method), but this change makes the driver correctly chunk the request using the multiplexing strategy if the request is larger than the maximum imposed by the hypervisor. In practical terms this commit can be tested using the misc-bdev-rw test which has been adjusted to request reads and writes as large as 2 MB which are higher than the limit on QEMU (the VIRTIO_BLK_F_SEG_MAX is equal to 254 = eq 1MB - 8K). Before the commit this test would hang with 'i' equal to 255 and now it behaves correctly. Finally some hypervisors like Firecracker do not provide the VIRTIO_BLK_F_SEG_MAX capability, in which case our driver behaves as if there was no limit which is also the old behavior. Co-authored-by: "Timmons C. 
Player" Co-authored-by: Waldemar Kozaczuk Signed-off-by: Waldemar Kozaczuk --- drivers/virtio-blk.cc | 8 ++-- tests/misc-bdev-rw.cc | 15 +-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/drivers/virtio-blk.cc b/drivers/virtio-blk.cc index e03d41f7..643ca275 100644 --- a/drivers/virtio-blk.cc +++ b/drivers/virtio-blk.cc @@ -54,6 +54,7 @@ int blk::_instance = 0; struct blk_priv { +devop_strategy_t strategy; blk* drv; }; @@ -63,7 +64,6 @@ blk_strategy(struct bio *bio) struct blk_priv *prv = reinterpret_cast(bio->bio_dev->private_data); trace_virtio_blk_strategy(bio); -bio->bio_offset += bio->bio_dev->offset; prv->drv->make_request(bio); } @@ -90,7 +90,7 @@ static struct devops blk_devops { blk_write, no_ioctl, no_devctl, -blk_strategy, +multiplex_strategy, }; struct driver blk_driver = { @@ -180,8 +180,10 @@ blk::blk(virtio_device& virtio_dev) dev = device_create(&blk_driver, dev_name.c_str(), D_BLK); prv = reinterpret_cast(dev->private_data); +prv->strategy = blk_strategy; prv->drv = this; dev->size = prv->drv->size(); +dev->max_io_size = _config.seg_max ? 
(_config.seg_max - 1) * mmu::page_size : UINT_MAX; read_partition_table(dev); debugf("virtio-blk: Add blk device instances %d as %s, devsize=%lld\n", _id, dev_name.c_str(), dev->size); @@ -208,6 +210,8 @@ void blk::read_config() if (get_guest_feature_bit(VIRTIO_BLK_F_SEG_MAX)) { READ_CONFIGURATION_FIELD(blk_config,seg_max,_config.seg_max) trace_virtio_blk_read_config_seg_max(_config.seg_max); +} else { +_config.seg_max = 0; } if (get_guest_feature_bit(VIRTIO_BLK_F_GEOMETRY)) { READ_CONFIGURATION_FIELD(blk_config,geometry,_config.geometry) diff --git a/tests/misc-bdev-rw.cc b/tests/misc-bdev-rw.cc index ae8eaf64..ce81c1fa 100644 --- a/tests/misc-bdev-rw.cc +++ b/tests/misc-bdev-rw.cc @@ -8,6 +8,17 @@ #define MB (1024*1024) +/* +This test requires a standalone block device (not a one +actively used by given filesystem) and can be created and +mounted like so: + +dd if=/dev/zero of=/tmp/test1.raw bs=1M count=512 +qemu-img convert -O qcow2 /tmp/test1.raw /tmp/test1.img + +./scripts/run.py -e '/tests/misc-bdev-rw.so vblk1' --cloud-init-image /tmp/test1.img +*/ + using namespace std; atomic bio_inflights(0); @@ -89,7 +100,7 @@ int main(int argc, char const *argv[]) long written = 0; //Do all writes -for(auto i = 1; i < 32; i++) +for(auto i = 1; i < 511; i++) { const size_t buff_size = i * memory::page_size; @@ -142,4 +153,4 @@ int main(int argc, char const *argv[]) << "Test " << (test_failed.load() ? "FAILED" : "PASSED") << endl; return test_failed.load() ? 1 : 0; -} \ No newline at end of file +} -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220628154711.108483-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] loader: don't leak directory entries when processing init entries
This commit originates from the Spirent fork of OSv - https://github.com/SpirentOrion/osv/commit/502fa31d631bba073e7bb9bc7ce6623e9159dbdd Authored-by: "Timmons C. Player" Reviewed-by: Waldemar Kozaczuk Signed-off-by: Waldemar Kozaczuk --- loader.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/loader.cc b/loader.cc index b98f9681..ee05033b 100644 --- a/loader.cc +++ b/loader.cc @@ -607,6 +607,7 @@ void* do_main_thread(void *_main_args) for (int i = 0; i < count; i++) { if (!strcmp(".", namelist[i]->d_name) || !strcmp("..", namelist[i]->d_name)) { +free(namelist[i]); continue; } std::string fn("/init/"); -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220628140241.97140-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] x64: fix direct kernel mode boot on qemu
I cannot pretend I fully understand all details, but at some point the gnu assembler (possibly starting with 2.36) on Fedora started generating ELF notes named note.gnu.property.* that somehow when linked produce an loader.elf that both readelf and QEMU report has malformed notes. The readelf reports this: readelf -Wn build/release/loader-strippef.elf Displaying notes found in: .note OwnerData sizeDescription GNU 0x0020 NT_GNU_PROPERTY_TYPE_0 Properties: x86 feature used: x86, x87, MMX, XMM, FXSR, XSAVE, x86 ISA u Xen 0x0008 NT_VERSION (version) description data: 00 10 31 40 00 00 00 00 Xen 0x0008 NT_ARCH (architecture) description data: 00 00 31 40 00 00 00 00 Xen 0x0008 Unknown note type: (0x0003) description data: 00 00 00 40 00 00 00 00 Xen 0x0008 Unknown note type: (0x0005) description data: 78 65 6e 2d 33 2e 30 00 Xen 0x0004 Unknown note type: (0x0006) description data: 6f 73 76 00 ?.? 0x0007 Unknown note type: (0x006e6558) description data: 04 00 00 00 08 00 00 readelf: build/release/loader-stripped.elf: Warning: note with invalid namesz and/or descsz found at offset 0xc0 readelf: build/release/loader-stripped.elf: Warning: type: 0x656e6567, namesize: 0x0008, descsize: 0x006e6558, alignment: 8 QEMU on other hand fails to run OSv in the direct kernel mode like so: ./script/run.py -k qemu-system-x86_64: Error loading uncompressed kernel without PVH ELF Note qemu failed. The logic in QEMU to load kernel in direct mode relies on the ELF note XEN_ELFNOTE_PHYS32_ENTRY that specifies an address of hvm_xen_start used to jump to when booting OSv. This ELF note as well as other XEN notes are defined in assembly in arch/x64/entry-xen.S. Because something is wrong with those notes, QEMU can not boot OSv. 
After some research I realized that the entry-xen object file has one extra note - GNU in .note.gnu.property section: readelf -nW build/release/arch/x64/entry-xen.o Displaying notes found in: .note.xen OwnerData sizeDescription Xen 0x0008 NT_VERSION (version) description data: 00 10 00 00 00 00 00 00 Xen 0x0008 NT_ARCH (architecture) description data: 00 00 00 00 00 00 00 00 Xen 0x0008 Unknown note type: (0x0003) description data: 00 00 00 40 00 00 00 00 Xen 0x0008 Unknown note type: (0x0005) description data: 78 65 6e 2d 33 2e 30 00 Xen 0x0004 Unknown note type: (0x0006) description data: 6f 73 76 00 Xen 0x0004 Unknown note type: (0x0007) description data: 3f 2e 3f 00 Xen 0x0008 Unknown note type: (0x0008) description data: 67 65 6e 65 72 69 63 00 Xen 0x0016 Unknown note type: (0x000a) description data: 21 77 72 69 74 61 62 6c 65 5f 70 61 67 65 5f 74 61 62 6c 65 73 00 Xen 0x0004 Unknown note type: (0x000b) description data: 79 65 73 00 Xen 0x0008 Unknown note type: (0x0012) description data: 1c 10 00 c0 ff ff ff ff Displaying notes found in: .note.gnu.property OwnerData sizeDescription GNU 0x0020 NT_GNU_PROPERTY_TYPE_0 Properties: x86 ISA used: , x86 feature used: x86 I have also found somebody reported similar issue - https://sourceware.org/bugzilla/show_bug.cgi?id=27753 - which can be fixed by adding DISCARD to the linker script to filter out unwanted section from the target ELF. Besides adding DISCARD rule to the linker script this patch also cleans some issue with the assembly code to add the elf notes in entry-xen.S. 
Signed-off-by: Waldemar Kozaczuk --- arch/x64/entry-xen.S | 4 +++- arch/x64/loader.ld | 5 + 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/x64/entry-xen.S b/arch/x64/entry-xen.S index 451d35e4..0146a799 100644 --- a/arch/x64/entry-xen.S +++ b/arch/x64/entry-xen.S @@ -8,6 +8,7 @@ #define elfnote(type, valtype, value) \ .pushsection .note.xen, "a", @note; \ +.align 4; \ .long 2f-1f; \ .long 3f-2f; \ .long type; \ @@ -17,7 +18,8 @@ 2: \ valtype value; \ 3: \ -.align 4 +.align 4; \ +.popsection #define elfnote_val(type, value) elfnote(type, .quad, value) #define elfnote_str(type, value) elfnote(type, .asciz, value) diff --git a/arch/x64/loader.ld b/arch/x64/loader.ld index debdbf15..65f88d5b 100644 --- a/arch/x64/loader.ld +++ b/arch/x64/loader.ld @@ -57,6 +57,11 @@ SECTIONS HIDDEN(memcpy_decode_end = .);
[osv-dev] [PATCH] iconv: use stubs if iconv functions not used by an app
The GCC libstdc++ commit https://github.com/gcc-mirror/gcc/commit/c0ace69ec677d1f85f6a433c8fae2d4df6f75714 which I believe fixes the bug https://gcc.gnu.org/bugzilla/show_bug.cgi?format=multiple&id=64132 makes the C++ standard library depend on libc iconv* functions. The implementation of these functions in musl employs many large arrays used to translate characters from one encoding to another and inflates the kernel by around 156K. After some research to see where exactly the related code - _M_initialize_numpunct() and _M_initialize_moneypunct() - is executed, it turns out that OSv itself internally does not trigger the code path leading to the execution of iconv functions because it uses a default C locale. This observation in general would be a moot point, as by default OSv provides iconv* functions as part of libc anyway, so the C++ standard library using them does not really change anything. The exception is when one builds a custom kernel that removes all symbols except those the application specifically needs, like so (for details see d19ccb1cde100ab4a5c8e6db9a0d69560cabbd04): ./scripts/build fs=rofs conf_hide_symbols=1 image=native-example \ conf_version_script=build/last/app_version_script In this case we would like to optimize kernel size and replace real iconv functions which would be referenced by _M_initialize_numpunct and _M_initialize_moneypunct with simple stubs in libc/locale/iconv_stubs.cc. This optimization allows us to reduce kernel size by another 156K. 
Signed-off-by: Waldemar Kozaczuk --- Makefile | 10 -- libc/locale/iconv_stubs.cc | 26 ++ 2 files changed, 34 insertions(+), 2 deletions(-) create mode 100644 libc/locale/iconv_stubs.cc diff --git a/Makefile b/Makefile index d0971a16..dd495e83 100644 --- a/Makefile +++ b/Makefile @@ -1142,8 +1142,6 @@ musl += locale/catgets.o libc += locale/catopen.o libc += locale/duplocale.o libc += locale/freelocale.o -musl += locale/iconv.o -musl += locale/iconv_close.o libc += locale/intl.o libc += locale/langinfo.o musl += locale/localeconv.o @@ -2085,8 +2083,16 @@ endif endif linker_archives_options = --no-whole-archive $(libstdc++.a) $(libgcc.a) $(libgcc_eh.a) $(boost-libs) \ --exclude-libs libstdc++.a --gc-sections +ifneq ($(shell grep -c iconv $(out)/version_script),0) +musl += locale/iconv.o +musl += locale/iconv_close.o +else +libc += locale/iconv_stubs.o +endif else linker_archives_options = --whole-archive $(libstdc++.a) $(libgcc_eh.a) $(boost-libs) --no-whole-archive $(libgcc.a) +musl += locale/iconv.o +musl += locale/iconv_close.o endif $(out)/default_version_script: exported_symbols/*.symbols exported_symbols/$(arch)/*.symbols diff --git a/libc/locale/iconv_stubs.cc b/libc/locale/iconv_stubs.cc new file mode 100644 index ..4c7b7d7d --- /dev/null +++ b/libc/locale/iconv_stubs.cc @@ -0,0 +1,26 @@ +/* + * Copyright (C) 2022 Waldemar Kozaczuk + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +#include +#include +#include + +iconv_t iconv_open(const char *to, const char *from) { +WARN_STUBBED(); +errno = EINVAL; +return (iconv_t)-1; +} + +size_t iconv(iconv_t cd, char **in, size_t *inb, char **out, size_t *outb) { +WARN_STUBBED(); +return 0l; +} + +int iconv_close(iconv_t cd) { +WARN_STUBBED(); +return 0l; +} -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. 
To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220627185608.37930-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] libc: fix bugs in if_nametoindex/if_indextoname
This patch fixes the bug described in the issue #1096 and makes both if_nametoindex() and if_indextoname() work correctly on OSv. The implementation of both functions comes from musl 1.1.24 AS IS and this patch removes the old sources from the libc/network folder which seem to have come from an older version of musl. However, the musl implementation opens a socket using the AF_UNIX family which is not supported by OSv. As Charles Myers suggested (he also authored a relevant patch of his own, which is part of the ipv6 branch), changing AF_UNIX to AF_INET provides the necessary workaround but is not enough to make these functions work correctly. The if_nametoindex() and if_indextoname() call ioctl() with commands SIOCGIFINDEX and SIOCGIFNAME respectively. However, OSv has a bug in the FreeBSD/Linux translation logic of SIOCGIFINDEX and does not support SIOCGIFNAME at all. This patch fixes the first issue; that fix actually comes from a different commit on the ipv6 branch and was authored by Spirent developers. Secondly, it also provides support of SIOCGIFNAME. Please note the new field ifru_ifindex added to the l_ifreq struct. Finally, this patch also adds a unit test borrowed from the Android project and adapted to run using the Boost unit test framework. 
Fixes #1096 Co-authored-by: Charles Myers Co-authored-by: Waldemar Kozaczuk Signed-off-by: Waldemar Kozaczuk --- Makefile| 6 +- bsd/sys/compat/linux/linux.h| 1 + bsd/sys/compat/linux/linux_ioctl.cc | 33 ++- include/osv/ioctl.h | 1 - libc/network/README.md | 14 + libc/network/__socket.h | 5 ++ libc/network/if_indextoname.c | 18 -- libc/network/if_nametoindex.c | 19 -- modules/tests/Makefile | 3 +- tests/tst-net_if_test.cc| 89 + 10 files changed, 133 insertions(+), 56 deletions(-) create mode 100644 libc/network/__socket.h delete mode 100644 libc/network/if_indextoname.c delete mode 100644 libc/network/if_nametoindex.c create mode 100644 tests/tst-net_if_test.cc diff --git a/Makefile b/Makefile index ad70580c..8f95e1fb 100644 --- a/Makefile +++ b/Makefile @@ -1495,8 +1495,10 @@ musl += network/inet_aton.o musl += network/inet_pton.o musl += network/inet_ntop.o musl += network/proto.o -libc += network/if_indextoname.o -libc += network/if_nametoindex.o +musl += network/if_indextoname.o +$(out)/musl/src/network/if_indextoname.o: CFLAGS += --include libc/syscall_to_function.h --include libc/network/__socket.h +musl += network/if_nametoindex.o +$(out)/musl/src/network/if_nametoindex.o: CFLAGS += --include libc/syscall_to_function.h --include libc/network/__socket.h musl += network/gai_strerror.o musl += network/h_errno.o musl += network/getservbyname_r.o diff --git a/bsd/sys/compat/linux/linux.h b/bsd/sys/compat/linux/linux.h index 1e6116aa..c406580a 100644 --- a/bsd/sys/compat/linux/linux.h +++ b/bsd/sys/compat/linux/linux.h @@ -144,6 +144,7 @@ struct l_ifreq { struct l_ifmap ifru_map; charifru_slave[LINUX_IFNAMSIZ]; l_uintptr_t ifru_data; + int ifru_ifindex; } ifr_ifru; } __packed; diff --git a/bsd/sys/compat/linux/linux_ioctl.cc b/bsd/sys/compat/linux/linux_ioctl.cc index 43a50bb5..43a081f2 100644 --- a/bsd/sys/compat/linux/linux_ioctl.cc +++ b/bsd/sys/compat/linux/linux_ioctl.cc @@ -199,7 +199,6 @@ linux_gifhwaddr(struct ifnet *ifp, struct l_ifreq *ifr) return 
(ENOENT); } - /* * Fix the interface address field in bsd_ifreq. The bsd stack expects a * length/family byte members, while linux and everyone else use a short family @@ -222,6 +221,16 @@ linux_to_bsd_ifreq(struct bsd_ifreq *ifr_p) ifr_p->ifr_addr.sa_len= 16 ; } +/* + * FreeBSD ifru_index is short but Linux is an int so need to clear extra bits. + */ +static inline void +bsd_to_linux_ifreq_ifindex(struct bsd_ifreq *ifr_p) +{ +void *ptr = &ifr_p->ifr_index; +*(int *)(ptr) = ifr_p->ifr_index; +} + /* * Socket related ioctls */ @@ -241,8 +250,8 @@ linux_ioctl_socket(socket_file *fp, u_long cmd, void *data) switch (cmd) { case SIOCSIFADDR: case SIOCSIFNETMASK: -case SIOCSIFDSTADDR: -case SIOCSIFBRDADDR: +case SIOCSIFDSTADDR: +case SIOCSIFBRDADDR: if ((ifp = ifunit_ref((char *)data)) == NULL) return (EINVAL); linux_to_bsd_ifreq((struct bsd_ifreq *)data) ; @@ -251,11 +260,29 @@ linux_ioctl_socket(socket_file *fp, u_long cmd, void *data) case SIOCGIFMTU: case SIOCSIFMTU: +if ((ifp = ifunit_ref((char *)data)) == NULL) +return (EINVAL); +error = fp->bsd_ioctl(cmd, data); +break; + case SIOCGIFINDEX: if ((ifp = ifunit_ref((char *)data)) == NULL) return (EINVAL); error = fp->bsd_ioctl(cmd, data); +bsd_to_linux_ifreq_ifindex((stru
[osv-dev] [PATCH] libc: use musl implementation of getifaddrs and if_nameindex
This patch replaces our home grown (possibly old musl version based) implementation of getifaddrs() and if_nameindex() with the implemantation of those provided by modern version musl 1.1.24 that uses netlink interface. The advantage is that it will also support IPV6 once we merge the ipv6 branch. Please note that we are applying simple header trick with __netlink.h to counter a bug in musl netlink.c. In essence the __netlink_enumerate() in musl netlink.c calls recv() with MSG_DONTWAIT flag may (and it does sometimes on OSv) yield EAGAIN or EWOULDBLOCK errors and there is no error handling logic of those. Instead of adding the error handling we change recv call to use 0 flags by re-defining MSG_DONTWAIT as 0 to enforce blocking call which is for example what Golang runtime does to implement similar functionality. Eventually we should try to upstream a patch to musl. Signed-off-by: Waldemar Kozaczuk --- Makefile| 6 +- libc/network/__netlink.h| 4 + libc/network/getifaddrs.c | 252 libc/network/if_nameindex.c | 58 - 4 files changed, 8 insertions(+), 312 deletions(-) create mode 100644 libc/network/__netlink.h delete mode 100644 libc/network/getifaddrs.c delete mode 100644 libc/network/if_nameindex.c diff --git a/Makefile b/Makefile index 2d1ba6a8..ad70580c 100644 --- a/Makefile +++ b/Makefile @@ -1503,10 +1503,12 @@ musl += network/getservbyname_r.o musl += network/getservbyname.o musl += network/getservbyport_r.o musl += network/getservbyport.o -libc += network/getifaddrs.o -libc += network/if_nameindex.o +musl += network/getifaddrs.o +musl += network/if_nameindex.o musl += network/if_freenameindex.o musl += network/res_init.o +musl += network/netlink.o +$(out)/musl/src/network/netlink.o: CFLAGS += --include libc/syscall_to_function.h --include libc/network/__netlink.h musl += prng/rand.o musl += prng/rand_r.o diff --git a/libc/network/__netlink.h b/libc/network/__netlink.h new file mode 100644 index ..4be572a5 --- /dev/null +++ b/libc/network/__netlink.h @@ -0,0 +1,4 
@@ +#include +#undef MSG_DONTWAIT +//Let us disable the non-blocking call to recv() netlink.c by re-defining MSG_DONTWAIT as 0 +#define MSG_DONTWAIT 0 diff --git a/libc/network/getifaddrs.c b/libc/network/getifaddrs.c deleted file mode 100644 index 435c93b3.. --- a/libc/network/getifaddrs.c +++ /dev/null @@ -1,252 +0,0 @@ -/* (C) 2013 John Spencer. released under musl's standard MIT license. */ -#undef _GNU_SOURCE -#define _GNU_SOURCE -#include -#include -#include /* IFNAMSIZ, ifreq, ifconf */ -#include -#include -#include -#include -#include /* inet_pton */ -#include -#include -#include -#include - -typedef union { - struct sockaddr_in6 v6; - struct sockaddr_in v4; - struct sockaddr_ll hw; -} soa; - -typedef struct ifaddrs_storage { - struct ifaddrs ifa; - soa addr; - soa netmask; - soa dst; - char name[IFNAMSIZ+1]; -} stor; -#define next ifa.ifa_next - -static stor* list_add(stor** list, stor** head, char* ifname) -{ - stor* curr = calloc(1, sizeof(stor)); - if(curr) { - strcpy(curr->name, ifname); - curr->ifa.ifa_name = curr->name; - if(*head) (*head)->next = (struct ifaddrs*) curr; - *head = curr; - if(!*list) *list = curr; - } - return curr; -} - -void freeifaddrs(struct ifaddrs *ifp) -{ - stor *head = (stor *) ifp; - while(head) { - void *p = head; - head = (stor *) head->next; - free(p); - } -} - -static void ipv6netmask(unsigned prefix_length, struct sockaddr_in6 *sa) -{ - unsigned char* hb = sa->sin6_addr.s6_addr; - unsigned onebytes = prefix_length / 8; - unsigned bits = prefix_length % 8; - unsigned nullbytes = 16 - onebytes; - memset(hb, -1, onebytes); - memset(hb+onebytes, 0, nullbytes); - if(bits) { - unsigned char x = -1; - x <<= 8 - bits; - hb[onebytes] = x; - } -} - -static void dealwithipv6(stor **list, stor** head) -{ - FILE* f = fopen("/proc/net/if_inet6", "r"); - /* 0001 01 80 10 80 lo - AB C D E F - all numbers in hex - A = addr B=netlink device#, C=prefix length, - D = scope value (ipv6.h) E = interface flags (rnetlink.h, addrconf.c) - F = if 
name */ - char v6conv[32 + 7 + 1], *v6; - char *line, linebuf[512]; - if(!f) return; - while((line = fgets(linebuf, sizeof linebuf, f))) { - v6 = v6conv; - size_t i = 0; - for(; i < 8; i++) { - memcpy(v6, line, 4); - v6+=4; - *v6++=':'; -
[osv-dev] [PATCH 10/10] netlink: enable it and add unit test
This changes bsd/net.cc to enables netlink by registering netlink domain and calling netlink_init(). It also adds a unit test to verify the netlink implementation. Signed-off-by: Waldemar Kozaczuk --- bsd/net.cc | 5 + modules/tests/Makefile | 3 +- tests/tst-netlink.c| 441 + 3 files changed, 448 insertions(+), 1 deletion(-) create mode 100644 tests/tst-netlink.c diff --git a/bsd/net.cc b/bsd/net.cc index 3e427575..f548e091 100644 --- a/bsd/net.cc +++ b/bsd/net.cc @@ -23,6 +23,7 @@ #include #include #include +#include /* Generation of ip ids */ void ip_initid(void); @@ -32,6 +33,8 @@ extern "C" { extern struct domain inetdomain; /* AF_ROUTE */ extern struct domain routedomain; +/* AF_NETLINK */ +extern struct domain netlinkdomain; } void net_init(void) @@ -53,9 +56,11 @@ void net_init(void) domaininit(NULL); OSV_DOMAIN_SET(inet); OSV_DOMAIN_SET(route); +OSV_DOMAIN_SET(netlink); rts_init(); route_init(); vnet_route_init(); +netlink_init(); ipport_tick_init(NULL); arp_init(); domainfinalize(NULL); diff --git a/modules/tests/Makefile b/modules/tests/Makefile index e462ebc8..f79da870 100644 --- a/modules/tests/Makefile +++ b/modules/tests/Makefile @@ -133,7 +133,8 @@ tests := tst-pthread.so misc-ramdisk.so tst-vblk.so tst-bsd-evh.so \ tst-getopt.so tst-getopt-pie.so tst-non-pie.so tst-semaphore.so \ tst-elf-init.so tst-realloc.so tst-setjmp.so \ libtls.so libtls_gold.so tst-tls.so tst-tls-gold.so tst-tls-pie.so \ - tst-sigaction.so tst-syscall.so tst-ifaddrs.so tst-getdents.so + tst-sigaction.so tst-syscall.so tst-ifaddrs.so tst-getdents.so \ + tst-netlink.so # libstatic-thread-variable.so tst-static-thread-variable.so \ #TODO For now let us disable these tests for aarch64 until diff --git a/tests/tst-netlink.c b/tests/tst-netlink.c new file mode 100644 index ..aebc9dd5 --- /dev/null +++ b/tests/tst-netlink.c @@ -0,0 +1,441 @@ +/* Unit test that verifies limited netlink support in OSv + * + * Copyright (C) 2022 Waldemar Kozaczuk + * + * This work is open source software, 
licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +// This test should run on Linux: +// gcc tests/tst-netlink.c -o tst-netlink +// ./tst-netlink + +#include //printf, perror +#include//memset, strlen +#include//exit +#include//close +#include//msghdr +#include //inet_ntop +#include //sockaddr_nl +#include //rtgenmsg,ifinfomsg +#include +#include +#include + +#define BUFSIZE 8192 + +void die(const char *s) +{ +perror(s); +exit(1); +} + +int called_response_handler = 0; + +int test_netlink(struct nlmsghdr* req, pid_t pid, void (*handle_response)(struct nlmsghdr *)) +{ +struct sockaddr_nl src_addr, dst_addr, src_addr2; +int s, len, end = 0; +struct msghdr msg; +struct iovec iov[1]; +char buf[BUFSIZE]; + +//create a netlink socket +if ((s=socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE)) < 0) +{ +die("socket FAILED"); +} + +//bind socket +memset(&src_addr, 0, sizeof(src_addr)); +src_addr.nl_family = AF_NETLINK; +src_addr.nl_pid = pid; // if 0 kernel will assign unique id +src_addr.nl_groups = 0; /* not in mcast groups */ +if (bind(s, (struct sockaddr*) &src_addr, sizeof(src_addr))) +{ +die("bind FAILED"); +} + +//get sock name to check pid +memset(&src_addr2, 0, sizeof(src_addr2)); +socklen_t addr_len = sizeof(src_addr2); +if (getsockname(s, (struct sockaddr*)&src_addr2, &addr_len)) { +die("getsockname FAILED"); +} +if (src_addr.nl_pid != 0) { +assert(src_addr.nl_pid == src_addr2.nl_pid); +} + +//build destination - kernel netlink address +memset(&dst_addr, 0, sizeof(dst_addr)); +dst_addr.nl_family = AF_NETLINK; +dst_addr.nl_pid = 0; // should be 0 if destination is kernel +//dst_addr.nl_pid = 1; //TODO: check that non-0 errors with "sendmsg: Operation not permitted" +dst_addr.nl_groups = 0; + +//build netlink message +iov[0].iov_base = req; +iov[0].iov_len = req->nlmsg_len; + +memset(&msg, 0, sizeof(msg)); +msg.msg_iov = iov; +msg.msg_iovlen = 1; +msg.msg_name = &dst_addr; +msg.msg_namelen = 
sizeof(dst_addr); + +//send the message +if (sendmsg(s, &msg, 0) < 0) +{ +die("sendmsg FAILED"); +} + +called_response_handler = 0; +//parse reply +while (!end) +{ +memset(&msg, 0, sizeof(msg)); //These and 2 lines below are needed to reset msg - otherwise weird page faults happen +msg.msg_iov = iov;//Check if we can improve things downs
[osv-dev] [PATCH 09/10] netlink: set negative errno in error responses
The netlink specification requires that the error field contain a negative errno value. Signed-off-by: Waldemar Kozaczuk --- bsd/sys/compat/linux/linux_netlink.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bsd/sys/compat/linux/linux_netlink.cc b/bsd/sys/compat/linux/linux_netlink.cc index be9ea1b8..ec7e9341 100644 --- a/bsd/sys/compat/linux/linux_netlink.cc +++ b/bsd/sys/compat/linux/linux_netlink.cc @@ -506,7 +506,7 @@ netlink_senderr(struct socket *so, struct nlmsghdr *nlm, int error) return ENOBUFS; } err = (struct nlmsgerr *) nlmsg_data(hdr); - err->error = error; + err->error = -error; //Per netlink spec - "Negative errno or 0 for acknowledgements" if (nlm) { err->msg = *nlm; } else { -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220604012837.214986-9-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH 08/10] netlink: made some functions static
Signed-off-by: Waldemar Kozaczuk --- bsd/sys/compat/linux/linux_netlink.cc | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/bsd/sys/compat/linux/linux_netlink.cc b/bsd/sys/compat/linux/linux_netlink.cc index 180d81b5..be9ea1b8 100644 --- a/bsd/sys/compat/linux/linux_netlink.cc +++ b/bsd/sys/compat/linux/linux_netlink.cc @@ -118,6 +118,7 @@ static int get_sockaddr_mask_prefix_len(struct bsd_sockaddr *sa) } +static void *nl_m_put(struct mbuf *m0, int len) { struct mbuf *m, *n; @@ -151,6 +152,7 @@ void *nl_m_put(struct mbuf *m0, int len) return data; } +static struct nlmsghdr * nlmsg_put(struct mbuf *m, uint32_t pid, uint32_t seq, int type, int len, int flags) { struct nlmsghdr *nlh; @@ -170,16 +172,19 @@ struct nlmsghdr * nlmsg_put(struct mbuf *m, uint32_t pid, uint32_t seq, int type return nlh; } +static struct nlmsghdr * nlmsg_begin(struct mbuf *m, uint32_t pid, uint32_t seq, int type, int len, int flags) { return nlmsg_put(m, pid, seq, type, len, flags); } +static void nlmsg_end(struct mbuf *m, struct nlmsghdr *nlh) { nlh->nlmsg_len = m->M_dat.MH.MH_pkthdr.len - ((uintptr_t)nlh - (uintptr_t)m->m_hdr.mh_data); } +static int nla_put(struct mbuf *m, int attrtype, int len, const void *src) { struct nlattr *nla; @@ -198,16 +203,18 @@ int nla_put(struct mbuf *m, int attrtype, int len, const void *src) } template -int nla_put_type(struct mbuf *m, int attrtype, T val) +static int nla_put_type(struct mbuf *m, int attrtype, T val) { return nla_put(m, attrtype, sizeof(val), &val); } +static int nla_put_string(struct mbuf *m, int attrtype, const char *str) { return nla_put(m, attrtype, strlen(str) + 1, str); } +static int nla_put_sockaddr(struct mbuf *m, int attrtype, struct bsd_sockaddr *sa) { void *data; -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. 
To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220604012837.214986-8-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH 06/10] netlink: return stashed pid
There are three types of pid used in the netlink interface: - the nl_pid on the source (app) side (part of sockaddr_nl) set before bind(); it may be 0 to request that the kernel generate a new one - the nl_pid on the destination (kernel) side set into dst_addr that always needs to be 0 if we communicate with the kernel - the nlmsg_pid (sender port ID) that is part of the netlink message header sent to and received from the kernel Some relevant information from Linux docs: " nlmsg_seq and nlmsg_pid are used to track messages. nlmsg_pid shows the origin of the message. Note that there isn't a 1:1 relationship between nlmsg_pid and the PID of the process if the message originated from a netlink socket. See the ADDRESS FORMATS section for further information. Both nlmsg_seq and nlmsg_pid are opaque to netlink core." and: " nl_pid is the unicast address of netlink socket. It's always 0 if the destination is in the kernel. For a user-space process, nl_pid is usually the PID of the process owning the destination socket. However, nl_pid identifies a netlink socket, not a process. If a process owns several netlink sockets, then nl_pid can be equal to the process ID only for at most one socket. There are two ways to assign nl_pid to a netlink socket. If the application sets nl_pid before calling bind(2), then it is up to the application to make sure that nl_pid is unique. If the application sets it to 0, the kernel takes care of assigning it. The kernel assigns the process ID to the first netlink socket the process opens and assigns a unique nl_pid to every netlink socket that the process subsequently creates." The first one (the source nl_pid) needs to be stashed or generated (if 0) and then set as the nlmsg_pid of each response so that the application receiving it can distinguish it if necessary. The Golang runtime actually calls getsockname() and verifies that the nlmsg_pid in the replies matches the nl_pid on the source socket. 
The patch modifies relevant code that builds netlink responses to put the nl_pid stashed during socket attach process to set it as value of inlmsg_pid. It also re-implements the netlink_sockaddr() to make it return information including the source PID. Signed-off-by: Waldemar Kozaczuk --- bsd/sys/compat/linux/linux_netlink.cc | 41 +++ bsd/sys/net/if_llatbl.cc | 8 +++--- bsd/sys/net/if_llatbl.h | 4 +-- 3 files changed, 35 insertions(+), 18 deletions(-) diff --git a/bsd/sys/compat/linux/linux_netlink.cc b/bsd/sys/compat/linux/linux_netlink.cc index fcdab06b..82205d2b 100644 --- a/bsd/sys/compat/linux/linux_netlink.cc +++ b/bsd/sys/compat/linux/linux_netlink.cc @@ -436,10 +436,27 @@ netlink_shutdown(struct socket *so) return (raw_usrreqs.pru_shutdown(so)); } +static pid_t +get_socket_pid(struct socket *so) +{ + struct rawcb *rp = sotorawcb(so); + struct netlinkcb *ncb = (netlinkcb *)rp; + return ncb->nl_pid; +} + static int netlink_sockaddr(struct socket *so, struct bsd_sockaddr **nam) { - return (raw_usrreqs.pru_sockaddr(so, nam)); + struct bsd_sockaddr_nl *sin; + + sin = (bsd_sockaddr_nl*)malloc(sizeof *sin); + bzero(sin, sizeof *sin); + sin->nl_family = AF_NETLINK; + sin->nl_len = sizeof(*sin); + sin->nl_pid = get_socket_pid(so); + + *nam = (bsd_sockaddr*)sin; + return 0; } static struct pr_usrreqs netlink_usrreqs = initialize_with([] (pr_usrreqs& x) { @@ -474,7 +491,7 @@ netlink_senderr(struct socket *so, struct nlmsghdr *nlm, int error) } if ((hdr = (struct nlmsghdr *)nlmsg_put(m, - nlm ? nlm->nlmsg_pid : 0, + get_socket_pid(so), nlm ? nlm->nlmsg_seq : 0, NLMSG_ERROR, sizeof(*err), nlm ? 
nlm->nlmsg_flags : 0)) == NULL) { @@ -513,7 +530,7 @@ netlink_process_getlink_msg(struct socket *so, struct nlmsghdr *nlm) TAILQ_FOREACH(ifp, &V_ifnet, if_link) { IF_ADDR_RLOCK(ifp); - nlh = nlmsg_begin(m, nlm->nlmsg_pid, nlm->nlmsg_seq, LINUX_RTM_NEWLINK, sizeof(*ifm), nlm->nlmsg_flags); + nlh = nlmsg_begin(m, get_socket_pid(so), nlm->nlmsg_seq, LINUX_RTM_NEWLINK, sizeof(*ifm), nlm->nlmsg_flags); if (!nlh) { error = ENOBUFS; goto done; @@ -547,7 +564,7 @@ netlink_process_getlink_msg(struct socket *so, struct nlmsghdr *nlm) IF_ADDR_RUNLOCK(ifp); nlmsg_end(m, nlh); } - nlh = nlmsg_put(m, nlm->nlmsg_pid, nlm->nlmsg_seq, NLMSG_DONE, 0, nlm->nlmsg_flags);
[osv-dev] [PATCH 07/10] netlink: fix error handling
Fix netlink_process_msg() to propagate potential error from netlink_senderr(). Normally netlink_senderr() should return 0 indicating that the error response was built successfully. This patch tweaks the logic to make sure the error response in such case is sent back as a NLMSG_ERROR reply accordingly instead of making sendmsg() return error. Signed-off-by: Waldemar Kozaczuk --- bsd/sys/compat/linux/linux_netlink.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bsd/sys/compat/linux/linux_netlink.cc b/bsd/sys/compat/linux/linux_netlink.cc index 82205d2b..180d81b5 100644 --- a/bsd/sys/compat/linux/linux_netlink.cc +++ b/bsd/sys/compat/linux/linux_netlink.cc @@ -830,7 +830,7 @@ netlink_process_getneigh_msg(struct socket *so, struct nlmsghdr *nlm) struct netlink_getneigh_lle_cbdata cbdata; int error; - if (nlm->nlmsg_len < sizeof (struct ndmsg)) { + if (nlm->nlmsg_len < NLMSG_LENGTH(sizeof (struct ndmsg))) { return EINVAL; } @@ -892,7 +892,7 @@ netlink_process_msg(struct mbuf *m, struct socket *so) flush: if (error) { - netlink_senderr(so, nlm, error); + error = netlink_senderr(so, nlm, error); } if (m) { m_freem(m); -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220604012837.214986-7-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH 05/10] netlink: stash nl_pid into netlinkcb
This enhances netlink_attach() and netlink_bind() to capture or generate the source nl_pid (if 0) and save it in the control block so that it can be fetched later when necessary. This will be useful in the next patch. Signed-off-by: Waldemar Kozaczuk --- bsd/sys/compat/linux/linux_netlink.cc | 25 + 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/bsd/sys/compat/linux/linux_netlink.cc b/bsd/sys/compat/linux/linux_netlink.cc index 7e743db8..fcdab06b 100644 --- a/bsd/sys/compat/linux/linux_netlink.cc +++ b/bsd/sys/compat/linux/linux_netlink.cc @@ -61,6 +61,14 @@ struct bsd_sockaddr_nl { uint32_tnl_groups;/* Multicast groups mask */ }; +struct netlinkcb { + struct rawcbraw; + pid_t nl_pid; +}; + +std::atomic _nl_next_gen_pid(2); + + MALLOC_DEFINE(M_NETLINK, "netlink", "netlink socket"); static struct bsd_sockaddr netlink_src = { 2, PF_NETLINK, }; @@ -311,16 +319,18 @@ netlink_close(struct socket *so) static int netlink_attach(struct socket *so, int proto, struct thread *td) { + struct netlinkcb *ncb; struct rawcb *rp; int s, error; KASSERT(so->so_pcb == NULL, ("netlink_attach: so_pcb != NULL")); /* XXX */ - rp = (rawcb *)malloc(sizeof *rp); - if (rp == NULL) + ncb = (netlinkcb *)malloc(sizeof *ncb); + if (ncb == NULL) return ENOBUFS; - bzero(rp, sizeof *rp); + bzero(ncb, sizeof *ncb); + rp = &ncb->raw; /* * The splnet() is necessary to block protocols from sending @@ -362,7 +372,14 @@ netlink_bind(struct socket *so, struct bsd_sockaddr *nam, struct thread *td) __FILE__, __LINE__, __FUNCTION__, nam->sa_len, sizeof(struct bsd_sockaddr_nl)); return EINVAL; } - // TODO: stash the nl_pid somewhere + auto *ncb = reinterpret_cast(rp); + bsd_sockaddr_nl *nl_sock_addr = (bsd_sockaddr_nl*)nam; + if (nl_sock_addr->nl_pid == 0) { // kernel needs to assign pid + auto assigned_pid = _nl_next_gen_pid.fetch_add(1, std::memory_order_relaxed); + ncb->nl_pid = assigned_pid; + } else { + ncb->nl_pid = nl_sock_addr->nl_pid; + } return 0; } return (raw_usrreqs.pru_bind(so, nam, td)); /* xxx just 
EINVAL */ -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220604012837.214986-5-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH 04/10] netlink: do not put IFA_BROADCAST for loopback address
This is a minor adjustment to make OSv implementation match what Linux does - skip IFA_BROADCAST attributes for loopback address in NEWADDR response. Signed-off-by: Waldemar Kozaczuk --- bsd/sys/compat/linux/linux_netlink.cc | 14 ++ 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/bsd/sys/compat/linux/linux_netlink.cc b/bsd/sys/compat/linux/linux_netlink.cc index 4208ce7f..7e743db8 100644 --- a/bsd/sys/compat/linux/linux_netlink.cc +++ b/bsd/sys/compat/linux/linux_netlink.cc @@ -616,8 +616,11 @@ netlink_process_getaddr_msg(struct socket *so, struct nlmsghdr *nlm) in6_clearscope(&broadaddr.sin6_addr); p_broadaddr = (struct bsd_sockaddr *)&broadaddr; } - if (nla_put_sockaddr(m, IFA_ADDRESS, p_addr) || - nla_put_sockaddr(m, IFA_BROADCAST, p_broadaddr)){ + if (nla_put_sockaddr(m, IFA_ADDRESS, p_addr)){ + error = ENOBUFS; + goto done; + } + if (!(ifm->ifa_flags & IFF_LOOPBACK) && nla_put_sockaddr(m, IFA_BROADCAST, p_broadaddr)){ error = ENOBUFS; goto done; } @@ -625,8 +628,11 @@ netlink_process_getaddr_msg(struct socket *so, struct nlmsghdr *nlm) else #endif { - if (nla_put_sockaddr(m, IFA_ADDRESS, ifa->ifa_addr) || - nla_put_sockaddr(m, IFA_BROADCAST, ifa->ifa_broadaddr)){ + if (nla_put_sockaddr(m, IFA_ADDRESS, ifa->ifa_addr)){ + error = ENOBUFS; + goto done; + } + if (!(ifm->ifa_flags & IFF_LOOPBACK) && nla_put_sockaddr(m, IFA_BROADCAST, ifa->ifa_broadaddr)){ error = ENOBUFS; goto done; } -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220604012837.214986-4-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH 03/10] netlink: IFA_ADDRESS needs to go first
Golang uses the netlink interface RTM_GETADDR to query the network interfaces and IPs. It assumes that the 1st attribute in the RTM_NEWADDR response is IFA_ADDRESS. This patch changes the order in which RTM_NEWADDR attributes are sent to make sure the IFA_ADDRESS goes first and IFA_LABEL last. This does not seem to be documented anywhere but Linux sends RTM_NEWADDR responses with the IFA_ADDRESS attribute first so we follow suit. Signed-off-by: Waldemar Kozaczuk --- bsd/sys/compat/linux/linux_netlink.cc | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bsd/sys/compat/linux/linux_netlink.cc b/bsd/sys/compat/linux/linux_netlink.cc index ea0cf609..4208ce7f 100644 --- a/bsd/sys/compat/linux/linux_netlink.cc +++ b/bsd/sys/compat/linux/linux_netlink.cc @@ -599,10 +599,6 @@ netlink_process_getaddr_msg(struct socket *so, struct nlmsghdr *nlm) ifm->ifa_prefixlen = get_sockaddr_mask_prefix_len(ifa->ifa_netmask); ifm->ifa_flags = ifp->if_flags | ifp->if_drv_flags; ifm->ifa_scope = 0; // FIXME: - if (nla_put_string(m, IFA_LABEL, ifp->if_xname)) { - error = ENOBUFS; - goto done; - } #ifdef INET6 if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET6){ // FreeBSD embeds the IPv6 scope ID in the IPv6 address @@ -635,6 +631,10 @@ netlink_process_getaddr_msg(struct socket *so, struct nlmsghdr *nlm) goto done; } } + if (nla_put_string(m, IFA_LABEL, ifp->if_xname)) { + error = ENOBUFS; + goto done; + } nlmsg_end(m, nlh); } -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220604012837.214986-3-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH 02/10] netlink: set LINUX_RTM_NEWADDR and LINUX_RTM_NEWNEIGH on responses
This patch fixes a minor bug in handling RTM_GETADDR and RTM_GETNEIGH requests. It tweaks the relevant code to set the RTM_NEWADDR and RTM_NEWNEIGH type for the responses respectively. This is important as for example Golang runtime tests the nlmsg_type of the netlink response and breaks if it is wrong. Signed-off-by: Waldemar Kozaczuk --- bsd/sys/compat/linux/linux_netlink.cc | 20 ++-- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/bsd/sys/compat/linux/linux_netlink.cc b/bsd/sys/compat/linux/linux_netlink.cc index bc02bb7f..ea0cf609 100644 --- a/bsd/sys/compat/linux/linux_netlink.cc +++ b/bsd/sys/compat/linux/linux_netlink.cc @@ -588,7 +588,7 @@ netlink_process_getaddr_msg(struct socket *so, struct nlmsghdr *nlm) if (!ifa->ifa_addr) continue; - nlh = nlmsg_begin(m, nlm->nlmsg_pid, nlm->nlmsg_seq, LINUX_RTM_GETADDR, sizeof(*ifm), nlm->nlmsg_flags); + nlh = nlmsg_begin(m, nlm->nlmsg_pid, nlm->nlmsg_seq, LINUX_RTM_NEWADDR, sizeof(*ifm), nlm->nlmsg_flags); if (!nlh) { error = ENOBUFS; goto done; @@ -720,7 +720,7 @@ netlink_getneigh_lle_cb(struct lltable *llt, struct llentry *lle, void *data) struct nlmsghdr *nlm = cbdata->nlm; struct mbuf *m = cbdata->m; struct ndmsg *ndm; - struct nlmsghdr *nlh = nlmsg_begin(m, nlm->nlmsg_pid, nlm->nlmsg_seq, LINUX_RTM_GETNEIGH, sizeof(*ndm), nlm->nlmsg_flags); + struct nlmsghdr *nlh = nlmsg_begin(m, nlm->nlmsg_pid, nlm->nlmsg_seq, LINUX_RTM_NEWNEIGH, sizeof(*ndm), nlm->nlmsg_flags); if (!nlh) { return ENOBUFS; @@ -753,7 +753,7 @@ netlink_getneigh_lle_cb(struct lltable *llt, struct llentry *lle, void *data) } } #endif - + if (nla_put(m, NDA_LLADDR, 6, lle->ll_addr.mac16)) { return ENOBUFS; } @@ -875,29 +875,29 @@ extern struct domain netlinkdomain; /* or at least forward */ static struct protosw netlinksw[] = { initialize_with([] (protosw& x) { - x.pr_type = SOCK_RAW; + x.pr_type = SOCK_RAW; x.pr_domain = &netlinkdomain; x.pr_flags =PR_ATOMIC|PR_ADDR; x.pr_output = netlink_output; x.pr_ctlinput = raw_ctlinput; - 
x.pr_init = raw_init; + x.pr_init = raw_init; x.pr_usrreqs = &netlink_usrreqs; }), initialize_with([] (protosw& x) { - x.pr_type = SOCK_DGRAM; + x.pr_type = SOCK_DGRAM; x.pr_domain = &netlinkdomain; x.pr_flags =PR_ATOMIC|PR_ADDR; x.pr_output = netlink_output; x.pr_ctlinput = raw_ctlinput; - x.pr_init = raw_init; + x.pr_init = raw_init; x.pr_usrreqs = &netlink_usrreqs; }), }; struct domain netlinkdomain = initialize_with([] (domain& x) { - x.dom_family = PF_NETLINK; - x.dom_name ="netlink"; - x.dom_protosw = netlinksw; + x.dom_family = PF_NETLINK; + x.dom_name ="netlink"; + x.dom_protosw = netlinksw; x.dom_protoswNPROTOSW = &netlinksw[sizeof(netlinksw)/sizeof(netlinksw[0])]; }); -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220604012837.214986-2-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH 01/10] netlink: minimal Linux rtnetlink support
This 1st of the 10 patches brings support of the minimal subset of the rtnetlink (Linux routing socket) interface as described here - https://man7.org/linux/man-pages/man7/rtnetlink.7.html. The rtnetlink is actually a subset of the even richer netlink interface described here - https://man7.org/linux/man-pages/man7/netlink.7.html. In other words, rtnetlink covers the NETLINK_ROUTE family of the broader netlink interface. We need rtnetlink in order to support the implementation of if_nameindex() and getifaddrs() in modern musl 1.1.24. In addition, Golang uses the netlink interface to discover the interfaces and IP addresses as well. Please note this is an original copy of Charles Myers' two commits: f1cd48e0f192564d64e7b1e1caccc8df05e7cd5d except for the modifications to bsd/net.cc that are part of the last commit, and a subset of 64a0c1affe9921e6a5a5b164edf1a544a7297393 that adds lltable_foreach() and lltable_foreach_lle(). The next 8 much smaller patches fix various small bugs and slightly enhance this implementation. The last one enables the netlink support and adds a unit test. The netlink interface is pretty rich and not very precisely documented. I have actually used a unit test to discover in more detail what the netlink responses should look like. In general, the application would use standard socket API to open a socket with the domain and protocol equal to AF_NETLINK and NETLINK_ROUTE respectively and typically use SOCK_RAW as type. Then it would optionally bind the socket and build a request sent using standard sendmsg(). Finally it would receive all replies from kernel using recvmsg(). 
To illustrate, the incomplete code might look like this: //step 1 int s = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); //step 2 src_addr.nl_family = AF_NETLINK; src_addr.nl_pid = pid; // if 0 kernel will assign unique id bind(s, (struct sockaddr*) &src_addr, sizeof(src_addr)) // step 3 dst_addr.nl_family = AF_NETLINK; dst_addr.nl_pid = 0; // should be 0 if destination is kernel iov[0].iov_base = req; iov[0].iov_len = req->nlmsg_len; snd_msg.msg_iov = iov; snd_msg.msg_iovlen = 1; snd_msg.msg_name = &dst_addr; snd_msg.msg_namelen = sizeof(dst_addr); sendmsg(s, &snd_msg, 0) //step 4 rcv_msg.msg_iov[0].iov_base = buf; rcv_msg.msg_iov[0].iov_len = BUFSIZE; recvmsg(s, &rcv_msg, 0) //process replies received in buf This patch implements support of only 3 rtnetlink types of requests: - RTM_GETLINK - RTM_GETADDR - RTM_GETNEIGH The bulk of the implementation is in the linux_netlink.cc and mostly centers around following functions: - netlink_attach() - netlink_bind() - netlink_output() - netlink_process_msg() - netlink_process_getlink_msg() - netlink_process_getaddr_msg() - netlink_process_getneigh_msg() Most other pru_* functions delegate to raw_usrreqs as is. 
Authored-by: Charles Myers Signed-off-by: Waldemar Kozaczuk --- Makefile | 1 + bsd/sys/compat/linux/linux_netlink.cc | 904 ++ bsd/sys/compat/linux/linux_netlink.h | 175 + bsd/sys/compat/linux/linux_socket.cc | 5 + bsd/sys/compat/linux/linux_socket.h | 1 + bsd/sys/net/if_llatbl.cc | 46 +- bsd/sys/net/if_llatbl.h | 13 + bsd/sys/net/netisr.h | 1 + 8 files changed, 1143 insertions(+), 3 deletions(-) create mode 100644 bsd/sys/compat/linux/linux_netlink.cc create mode 100644 bsd/sys/compat/linux/linux_netlink.h diff --git a/Makefile b/Makefile index 19a4571b..2d1ba6a8 100644 --- a/Makefile +++ b/Makefile @@ -593,6 +593,7 @@ bsd += bsd/porting/bus_dma.o bsd += bsd/sys/netinet/if_ether.o bsd += bsd/sys/compat/linux/linux_socket.o bsd += bsd/sys/compat/linux/linux_ioctl.o +bsd += bsd/sys/compat/linux/linux_netlink.o bsd += bsd/sys/net/if_ethersubr.o bsd += bsd/sys/net/if_llatbl.o bsd += bsd/sys/net/radix.o diff --git a/bsd/sys/compat/linux/linux_netlink.cc b/bsd/sys/compat/linux/linux_netlink.cc new file mode 100644 index ..bc02bb7f --- /dev/null +++ b/bsd/sys/compat/linux/linux_netlink.cc @@ -0,0 +1,904 @@ +/* + * Linux NETLINK socket implementation. + * + * NETLINK is used to support IPv4/IPv6 LIBC getifaddrs(), if_nameindex(). + * + * Warning: Tx/Rx messages are compatible with Linux not FreeBSD. + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#ifdef INET6 +#include +#include +#include +#include +#include +#endif + +#include +#include +#include + +#if !defined(offsetof) +#define offsetof(TYPE, MEMBER) __builtin_offsetof (TYPE, MEMBER) +#endif + +mutex netlink_mtx; + +#define NETLINK_LOCK() mutex_lock(&netlink_mtx) +#define NETLINK_UNLOCK() mutex_unlock(&netlink_m
[osv-dev] [PATCH] vfs: implement renameat()
This patch implements the renameat() function and enhances tst-rename.cc to unit test it. This patch also exposes renameat as a syscall. #Refs 1188 Signed-off-by: Waldemar Kozaczuk --- exported_symbols/osv_ld-musl.so.1.symbols | 1 + exported_symbols/osv_libc.so.6.symbols| 1 + fs/vfs/main.cc| 30 ++ linux.cc | 1 + tests/tst-rename.cc | 48 ++- 5 files changed, 79 insertions(+), 2 deletions(-) diff --git a/exported_symbols/osv_ld-musl.so.1.symbols b/exported_symbols/osv_ld-musl.so.1.symbols index 3db22e0d..6d88fda4 100644 --- a/exported_symbols/osv_ld-musl.so.1.symbols +++ b/exported_symbols/osv_ld-musl.so.1.symbols @@ -868,6 +868,7 @@ remquo remquof remquol rename +renameat res_init res_mkquery rewind diff --git a/exported_symbols/osv_libc.so.6.symbols b/exported_symbols/osv_libc.so.6.symbols index e29059bb..b27fbba1 100644 --- a/exported_symbols/osv_libc.so.6.symbols +++ b/exported_symbols/osv_libc.so.6.symbols @@ -696,6 +696,7 @@ register_printf_specifier register_printf_type remove rename +renameat __res_init rewind rewinddir diff --git a/fs/vfs/main.cc b/fs/vfs/main.cc index bdedc6c6..76e1ee0e 100644 --- a/fs/vfs/main.cc +++ b/fs/vfs/main.cc @@ -1043,6 +1043,36 @@ int rename(const char *oldpath, const char *newpath) return -1; } +OSV_LIBC_API +int renameat(int olddirfd, const char *oldpath, + int newdirfd, const char *newpath) +{ +if (!oldpath || !newpath) { +errno = EINVAL; +return -1; +} + +if (newpath[0] == '/' || newdirfd == AT_FDCWD) { +return vfs_fun_at2(olddirfd, oldpath, [newpath](const char *path) { +return rename(path, newpath); +}); +} else { +char absolute_newpath[PATH_MAX]; +auto error = vfs_fun_at(newdirfd, newpath, [&absolute_newpath](const char *absolute_path) { +strcpy(absolute_newpath, absolute_path); +return 0; +}); + +if (error) { +return error; +} else { +return vfs_fun_at2(olddirfd, oldpath, [absolute_newpath](const char *path) { +return rename(path, absolute_newpath); +}); +} +} +} + TRACEPOINT(trace_vfs_chdir, "\"%s\"", const char*); 
TRACEPOINT(trace_vfs_chdir_ret, ""); TRACEPOINT(trace_vfs_chdir_err, "%d", int); diff --git a/linux.cc b/linux.cc index f60489e3..1fa01326 100644 --- a/linux.cc +++ b/linux.cc @@ -516,6 +516,7 @@ OSV_LIBC_API long syscall(long number, ...) SYSCALL3(unlinkat, int, const char *, int); SYSCALL3(symlinkat, const char *, int, const char *); SYSCALL3(sys_getdents64, int, void *, size_t); +SYSCALL4(renameat, int, const char *, int, const char *); } debug_always("syscall(): unimplemented system call %d\n", number); diff --git a/tests/tst-rename.cc b/tests/tst-rename.cc index 722fdc4c..0668bf77 100644 --- a/tests/tst-rename.cc +++ b/tests/tst-rename.cc @@ -73,6 +73,9 @@ static void assert_rename_fails(const fs::path &src, const fs::path &dst, std::v BOOST_TEST_MESSAGE("Renaming " + src.string() + " to " + dst.string()); BOOST_REQUIRE(rename(src.c_str(), dst.c_str()) == -1); assert_one_of(errno, errnos); +BOOST_TEST_MESSAGE("Renaming(at) " + src.string() + " to " + dst.string()); +BOOST_REQUIRE(renameat(AT_FDCWD, src.c_str(), AT_FDCWD, dst.c_str()) == -1); +assert_one_of(errno, errnos); } static void assert_renames(const fs::path src, const fs::path dst) @@ -82,11 +85,27 @@ static void assert_renames(const fs::path src, const fs::path dst) BOOST_REQUIRE_MESSAGE(result == 0, fmt("Rename should succeed, errno=%d") % errno); } -static void test_rename(const fs::path &src, const fs::path &dst) +static void assert_renames_at(const fs::path src, const fs::path dst) +{ +BOOST_TEST_MESSAGE("Renaming " + src.string() + " to " + dst.string()); +auto src_dir_fd = open(src.parent_path().c_str(), O_DIRECTORY); +auto dst_dir_fd = open(dst.parent_path().c_str(), O_DIRECTORY); +int result = renameat(src_dir_fd, src.filename().c_str(), dst_dir_fd, dst.filename().c_str()); +BOOST_REQUIRE_MESSAGE(result == 0, fmt("Renameat should succeed, errno=%d") % errno); +close(src_dir_fd); +close(dst_dir_fd); +} + +static void test_rename(const fs::path &src, const fs::path &dst, bool at = false) { 
prepare_file(src); -assert_renames(src, dst); +if (at) { +assert_renames_at(src, dst); +} +else { +assert_renames(src, dst); +} check_file(dst); BOOST_CHECK_MESSAGE(!fs::exists(src), "Old file should not exist"); @@ -136,6 +155,10 @@ BOOST_AUTO_TEST_CASE(test_renaming_in_the_same_directory)
[osv-dev] [PATCH] multibyte: add __mbstowcs_chk
Signed-off-by: Waldemar Kozaczuk --- Makefile| 1 + libc/multibyte/__mbstowcs_chk.c | 17 + 2 files changed, 18 insertions(+) create mode 100644 libc/multibyte/__mbstowcs_chk.c diff --git a/Makefile b/Makefile index 3e87a16d..19a4571b 100644 --- a/Makefile +++ b/Makefile @@ -1451,6 +1451,7 @@ libc += multibyte/__mbsnrtowcs_chk.o musl += multibyte/mbsrtowcs.o libc += multibyte/__mbsrtowcs_chk.o musl += multibyte/mbstowcs.o +libc += multibyte/__mbstowcs_chk.o musl += multibyte/mbtowc.o musl += multibyte/wcrtomb.o musl += multibyte/wcsnrtombs.o diff --git a/libc/multibyte/__mbstowcs_chk.c b/libc/multibyte/__mbstowcs_chk.c new file mode 100644 index ..fcc390df --- /dev/null +++ b/libc/multibyte/__mbstowcs_chk.c @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2022 Waldemar Kozaczuk + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +#include +#include + +size_t __mbstowcs_chk(wchar_t *restrict dest, const char *restrict src, size_t n, size_t dstlen) +{ +if (n > dstlen) { +_chk_fail("mbstowcs"); +} +return mbstowcs(dest, src, n); +} -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220520193257.146906-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH V2] syscall: implement getdents64
V2: The only difference is the removal of the delete_dir() function, which was accidentally left over from previous attempts to implement this syscall. It looks like the golang apps that need to iterate over entries in a directory use a system call getdents64 which is documented in https://man7.org/linux/man-pages/man2/getdents.2.html. Normally this functionality is provided by the libc functions like opendir(), readdir(), etc which actually do delegate to getdents64. Go is known for bypassing libc in such cases. So this patch implements the syscall getdents64 by adding a utility function to VFS main.cc that is then called by syscall in linux.cc. For details of how this function works please look at the comments. This patch also adds a unit test to verify this syscall works. Refs #1188 Signed-off-by: Waldemar Kozaczuk --- fs/vfs/main.cc | 65 linux.cc | 4 ++ modules/tests/Makefile | 2 +- tests/tst-getdents.cc | 111 + 4 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 tests/tst-getdents.cc diff --git a/fs/vfs/main.cc b/fs/vfs/main.cc index 8e3d4e5e..bdedc6c6 100644 --- a/fs/vfs/main.cc +++ b/fs/vfs/main.cc @@ -790,6 +790,71 @@ int readdir64_r(DIR *dir, struct dirent64 *entry, extern "C" OSV_LIBC_API struct dirent *readdir64(DIR *dir) __attribute__((alias("readdir"))); +struct linux_dirent64 { +u64d_ino; +s64d_off; +unsigned short d_reclen; +unsigned char d_type; +char d_name[]; +}; + +#undef getdents64 +extern "C" +ssize_t sys_getdents64(int fd, void *dirp, size_t count) +{ +auto *dir = fdopendir(fd); +if (dir) { +// We have verified that fd points to a valid directory +// but we do NOT need the DIR handle so just delete it +delete dir; + +struct file *fp; +int error = fget(fd, &fp); +if (error) { +errno = error; +return -1; +} + +size_t bytes_read = 0; +off_t last_off = -1; +errno = 0; + +// Iterate over as many entries as there is space in the buffer +// by directly calling sys_readdir() +struct dirent entry; +while ((error = sys_readdir(fp, &entry)) == 0) { +auto 
rec_len = offsetof(linux_dirent64, d_name) + strlen(entry.d_name) + 1; +if (rec_len <= count) { +auto *ldirent = static_cast(dirp + bytes_read); +ldirent->d_ino = entry.d_ino; +ldirent->d_off = entry.d_off; +ldirent->d_type = entry.d_type; +strcpy(ldirent->d_name, entry.d_name); +ldirent->d_reclen = rec_len; +count -= rec_len; +bytes_read += rec_len; +last_off = entry.d_off; +} else { +if (last_off >= 0) +sys_seekdir(fp, last_off); +break; +} +} + +fdrop(fp); + +if (error && error != ENOENT) { +errno = error; +return -1; +} else { +errno = 0; +return bytes_read; +} +} else { +return -1; +} +} + OSV_LIBC_API void rewinddir(DIR *dirp) { diff --git a/linux.cc b/linux.cc index 85c08981..f60489e3 100644 --- a/linux.cc +++ b/linux.cc @@ -424,6 +424,9 @@ static int tgkill(int tgid, int tid, int sig) return -1; } +#define __NR_sys_getdents64 __NR_getdents64 +extern "C" ssize_t sys_getdents64(int fd, void *dirp, size_t count); + OSV_LIBC_API long syscall(long number, ...) { // Save FPU state and restore it at the end of this function @@ -512,6 +515,7 @@ OSV_LIBC_API long syscall(long number, ...) 
SYSCALL2(statfs, const char *, struct statfs *); SYSCALL3(unlinkat, int, const char *, int); SYSCALL3(symlinkat, const char *, int, const char *); +SYSCALL3(sys_getdents64, int, void *, size_t); } debug_always("syscall(): unimplemented system call %d\n", number); diff --git a/modules/tests/Makefile b/modules/tests/Makefile index ca489341..e462ebc8 100644 --- a/modules/tests/Makefile +++ b/modules/tests/Makefile @@ -133,7 +133,7 @@ tests := tst-pthread.so misc-ramdisk.so tst-vblk.so tst-bsd-evh.so \ tst-getopt.so tst-getopt-pie.so tst-non-pie.so tst-semaphore.so \ tst-elf-init.so tst-realloc.so tst-setjmp.so \ libtls.so libtls_gold.so tst-tls.so tst-tls-gold.so tst-tls-pie.so \ - tst-sigaction.so tst-syscall.so tst-ifaddrs.so + tst-sigaction.so tst-syscall.so tst-ifaddrs.so tst-getdents.so # libstatic-thread-variable.so tst-static-thread-variable.so \ #TODO For now let us disable these tests for aarch64 until diff --git a/tests/tst-getdents.cc b/tests/tst-getdents.cc new file mode 100644 index ..5803aaeb --- /dev/null +++ b/tests/
[osv-dev] [PATCH] syscall: implement getcwd
#Refs 1188 Signed-off-by: Waldemar Kozaczuk --- linux.cc | 15 +++ tests/tst-syscall.cc | 7 +++ 2 files changed, 22 insertions(+) diff --git a/linux.cc b/linux.cc index dd0dabd1..85c08981 100644 --- a/linux.cc +++ b/linux.cc @@ -366,6 +366,20 @@ static int sys_exit_group(int ret) return 0; } +#define __NR_sys_getcwd __NR_getcwd +static long sys_getcwd(char *buf, unsigned long size) +{ +if (!buf) { +errno = EINVAL; +return -1; +} +auto ret = getcwd(buf, size); +if (!ret) { +return -1; +} +return strlen(ret) + 1; +} + #define __NR_sys_ioctl __NR_ioctl // // We need to define explicit sys_ioctl that takes these 3 parameters to conform @@ -482,6 +496,7 @@ OSV_LIBC_API long syscall(long number, ...) SYSCALL2(nanosleep, const struct timespec*, struct timespec *); SYSCALL4(fstatat, int, const char *, struct stat *, int); SYSCALL1(sys_exit_group, int); +SYSCALL2(sys_getcwd, char *, unsigned long); SYSCALL4(readlinkat, int, const char *, char *, size_t); SYSCALL0(getpid); SYSCALL3(set_mempolicy, int, unsigned long *, unsigned long); diff --git a/tests/tst-syscall.cc b/tests/tst-syscall.cc index 12722f1b..0fbd9c35 100644 --- a/tests/tst-syscall.cc +++ b/tests/tst-syscall.cc @@ -13,6 +13,7 @@ #include #include #include +#include #include @@ -117,6 +118,12 @@ int main(int argc, char **argv) assert(close(fd) == 0); +assert(chdir("/proc") == 0); +unsigned long size = 4096; +char path[size]; +assert(syscall(__NR_getcwd, path, size) == 6); +assert(strcmp("/proc", path) == 0); + // test that unknown system call results in a ENOSYS (see issue #757) expect_errno_l(syscall(999), ENOSYS); -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220520191546.141775-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH V2] vfs: implement symlinkat
V2: The implementation uses vfs_fun_at2() instead of vfs_fun_at() to further simplify code. We also expose symlinkat through a syscall. This patch implements the symlinkat() function and enhances tst-symlink.cc to unit test it. #Refs 1188 Signed-off-by: Waldemar Kozaczuk --- exported_symbols/osv_ld-musl.so.1.symbols | 1 + exported_symbols/osv_libc.so.6.symbols| 1 + fs/vfs/main.cc| 13 + linux.cc | 1 + tests/tst-symlink.cc | 18 +++--- 5 files changed, 31 insertions(+), 3 deletions(-) diff --git a/exported_symbols/osv_ld-musl.so.1.symbols b/exported_symbols/osv_ld-musl.so.1.symbols index f1c61a3f..3db22e0d 100644 --- a/exported_symbols/osv_ld-musl.so.1.symbols +++ b/exported_symbols/osv_ld-musl.so.1.symbols @@ -1081,6 +1081,7 @@ swab swprintf swscanf symlink +symlinkat sync syscall sysconf diff --git a/exported_symbols/osv_libc.so.6.symbols b/exported_symbols/osv_libc.so.6.symbols index 7ae57c38..e29059bb 100644 --- a/exported_symbols/osv_libc.so.6.symbols +++ b/exported_symbols/osv_libc.so.6.symbols @@ -887,6 +887,7 @@ swprintf __swprintf_chk swscanf symlink +symlinkat sync syscall sysconf diff --git a/fs/vfs/main.cc b/fs/vfs/main.cc index 9b2e2c02..8e3d4e5e 100644 --- a/fs/vfs/main.cc +++ b/fs/vfs/main.cc @@ -1132,6 +1132,19 @@ int symlink(const char *oldpath, const char *newpath) return 0; } +OSV_LIBC_API +int symlinkat(const char *oldpath, int newdirfd, const char *newpath) +{ +if (!oldpath) { +errno = EINVAL; +return -1; +} + +return vfs_fun_at2(newdirfd, newpath, [oldpath](const char * path) { +return symlink(oldpath, path); +}); +} + TRACEPOINT(trace_vfs_unlink, "\"%s\"", const char*); TRACEPOINT(trace_vfs_unlink_ret, ""); TRACEPOINT(trace_vfs_unlink_err, "%d", int); diff --git a/linux.cc b/linux.cc index 5c271df1..dd0dabd1 100644 --- a/linux.cc +++ b/linux.cc @@ -496,6 +496,7 @@ OSV_LIBC_API long syscall(long number, ...) 
SYSCALL3(lseek, int, off_t, int); SYSCALL2(statfs, const char *, struct statfs *); SYSCALL3(unlinkat, int, const char *, int); +SYSCALL3(symlinkat, const char *, int, const char *); } debug_always("syscall(): unimplemented system call %d\n", number); diff --git a/tests/tst-symlink.cc b/tests/tst-symlink.cc index 978cfda3..1322e79e 100644 --- a/tests/tst-symlink.cc +++ b/tests/tst-symlink.cc @@ -25,6 +25,9 @@ #define N1"f1" #define N2"f2_AAA" +#define N2B "f2_BBB" +#define N2B "f2_BBB" +#define N2C "f2_CCC" #define N3"f3" #define N4"f4" #define N5"f5" @@ -91,6 +94,8 @@ int main(int argc, char **argv) #endif report(chdir(TESTDIR) == 0, "chdir"); +auto test_dir = opendir(TESTDIR); +report(test_dir, "opendir"); /* * test to check @@ -115,6 +120,10 @@ int main(int argc, char **argv) #else report(symlink(N1, N2) == 0, "symlink"); report(search_dir(TESTDIR, N2) == true, "search dir"); +report(symlinkat(N1, dirfd(test_dir), N2B) == 0, "symlinkat"); +report(search_dir(TESTDIR, N2B) == true, "search dir N2B"); +report(symlinkat(N1, AT_FDCWD, N2C) == 0, "symlinkat"); +report(search_dir(TESTDIR, N2C) == true, "search dir N2B"); #endif #if defined(READ_ONLY_FS) @@ -125,6 +134,8 @@ int main(int argc, char **argv) #else report(access(N1, R_OK | W_OK) == 0, "access"); report(access(N2, R_OK | W_OK) == 0, "access"); +report(access(N2B, R_OK | W_OK) == 0, "access"); +report(access(N2C, R_OK | W_OK) == 0, "access"); #endif rc = readlink(N2, path, sizeof(path)); @@ -157,6 +168,8 @@ int main(int argc, char **argv) error = errno; report(rc < 0 && errno == ENOENT, "ENOENT expected"); report(unlink(N2) == 0, "unlink"); +report(unlinkat(dirfd(test_dir),N2B,0) == 0, "unlinkat"); +report(unlinkat(dirfd(test_dir),N2C,0) == 0, "unlinkat"); /* * IO Tests 1: write(file), read(symlink), truncate(symlink) @@ -365,8 +378,6 @@ int main(int argc, char **argv) report(search_dir(D2, N5) == true, "Symlink search"); report(rename(D2, D3) == 0, "rename(d2, d3)"); -auto test_dir = opendir(TESTDIR); 
-report(test_dir, "opendir"); rc = readlinkat(dirfd(test_dir), D3, path, sizeof(path)); report(rc >= 0, "readlinkat"); path[rc] = 0; @@ -381,7 +392,6 @@ int main(int argc, char **argv) report(rc >= 0, "readlinkat"); path[rc] = 0; report(strcmp(path, D1) == 0, "readlinkat path"); -report(closedir(test_dir) == 0, "closedir(test_dir)");
[osv-dev] [PATCH V2] unlinkat: fill the gaps in the implementation
V2: The implementation uses vfs_fun_at2() instead of vfs_fun_at() to further simplify code. We also expose unlinkat through a syscall. This patch enhances the unlinkat() implementation to handle the AT_FDCWD dirfd and AT_REMOVEDIR flags. We also enhance tst-remove.cc to test unlinkat. #Refs 1188 Signed-off-by: Waldemar Kozaczuk --- fs/vfs/main.cc | 12 +++- linux.cc| 1 + tests/tst-remove.cc | 18 -- 3 files changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/vfs/main.cc b/fs/vfs/main.cc index 1b7eb588..9b2e2c02 100644 --- a/fs/vfs/main.cc +++ b/fs/vfs/main.cc @@ -1164,11 +1164,13 @@ int unlink(const char *pathname) OSV_LIBC_API int unlinkat(int dirfd, const char *pathname, int flags) { -//TODO: Really implement it -if (dirfd != AT_FDCWD || flags) { -UNIMPLEMENTED("unlinkat() with non-zero flags or dirfd != AT_FDCWD"); -} -return unlink(pathname); +return vfs_fun_at2(dirfd, pathname, [flags](const char *path) { +if (flags & AT_REMOVEDIR) { +return rmdir(path); +} else { +return unlink(path); +} +}); } TRACEPOINT(trace_vfs_stat, "\"%s\" %p", const char*, struct stat*); diff --git a/linux.cc b/linux.cc index c9b6b7b6..5c271df1 100644 --- a/linux.cc +++ b/linux.cc @@ -495,6 +495,7 @@ OSV_LIBC_API long syscall(long number, ...) 
SYSCALL0(getuid); SYSCALL3(lseek, int, off_t, int); SYSCALL2(statfs, const char *, struct statfs *); +SYSCALL3(unlinkat, int, const char *, int); } debug_always("syscall(): unimplemented system call %d\n", number); diff --git a/tests/tst-remove.cc b/tests/tst-remove.cc index 6851cba0..fdf4037d 100644 --- a/tests/tst-remove.cc +++ b/tests/tst-remove.cc @@ -42,6 +42,8 @@ bool do_expect(T actual, T expected, const char *actuals, const char *expecteds, int main(int argc, char **argv) { expect(mkdir("/tmp/tst-remove", 0777), 0); +auto tst_remove_dir = open("/tmp/tst-remove", O_DIRECTORY); +expect(tst_remove_dir != -1, true); /* test unlink() **/ // unlink() non-existant file returns ENOENT @@ -79,12 +81,24 @@ int main(int argc, char **argv) expect_errno(rmdir("/tmp/tst-remove/f"), ENOTDIR); expect(unlink("/tmp/tst-remove/f"), 0); -/* test remove() ***/ -// TODO... +/* test unlinkat() ***/ +expect(mknod("/tmp/tst-remove/u", 0777|S_IFREG, 0), 0); +expect(unlinkat(tst_remove_dir, "u", 0), 0); +expect(mknod("/tmp/tst-remove/u2", 0777|S_IFREG, 0), 0); +expect(chdir("/tmp/tst-remove"), 0); +expect(unlinkat(AT_FDCWD, "u2", 0), 0); + +expect(mkdir("/tmp/tst-remove/ud", 0777), 0); +expect(unlinkat(tst_remove_dir, "ud", AT_REMOVEDIR), 0); + +expect(mkdir("/tmp/tst-remove/ud2", 0777), 0); +expect(chdir("/tmp/tst-remove"), 0); +expect(unlinkat(AT_FDCWD, "ud2", AT_REMOVEDIR), 0); // Finally remove the temporary directory (assumes the above left // nothing in it) +expect(close(tst_remove_dir), 0); expect(rmdir("/tmp/tst-remove"), 0); -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220520185638.136645-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH V2] vfs: refactor the *at() functions
Compared to the 1st version, this one adds another helper function - vfs_fun_at2() - which calls supplied lambda if dirfd == AT_FDCWD or pathname is an absolute path and otherwise delegates to vfs_fun_at(). It also checks if pathname is not null. The __fxstatat() and futimesat() call vfs_fun_at() directly as their logic is slightly different. The VFS functions like openat(), faccessat() and others alike take a directory descriptor argument intended to make a filesystem action happen relative to that directory. The implementations of those functions repeat almost the same code over and over. So this patch makes vfs more DRY by introducing a helper function - vfs_fun_at() - which implements common logic and executes a lambda function specific to the given VFS action. This patch also adds some unit tests around readlinkat() and mkdirat(). #Refs 1188 Signed-off-by: Waldemar Kozaczuk --- fs/vfs/main.cc | 216 --- tests/tst-readdir.cc | 9 ++ tests/tst-symlink.cc | 12 ++- 3 files changed, 81 insertions(+), 156 deletions(-) diff --git a/fs/vfs/main.cc b/fs/vfs/main.cc index ca357cc8..1b7eb588 100644 --- a/fs/vfs/main.cc +++ b/fs/vfs/main.cc @@ -160,21 +160,8 @@ int open(const char *pathname, int flags, ...) LFS64(open); -OSV_LIBC_API -int openat(int dirfd, const char *pathname, int flags, ...) +static int vfs_fun_at(int dirfd, const char *pathname, std::function fun) { -mode_t mode = 0; -if (flags & O_CREAT) { -va_list ap; -va_start(ap, flags); -mode = apply_umask(va_arg(ap, mode_t)); -va_end(ap); -} - -if (pathname[0] == '/' || dirfd == AT_FDCWD) { -return open(pathname, flags, mode); -} - struct file *fp; int error = fget(dirfd, &fp); if (error) { @@ -191,16 +178,48 @@ int openat(int dirfd, const char *pathname, int flags, ...) 
/* build absolute path */ strlcpy(p, fp->f_dentry->d_mount->m_path, PATH_MAX); strlcat(p, fp->f_dentry->d_path, PATH_MAX); -strlcat(p, "/", PATH_MAX); -strlcat(p, pathname, PATH_MAX); +if (pathname) { +strlcat(p, "/", PATH_MAX); +strlcat(p, pathname, PATH_MAX); +} -error = open(p, flags, mode); +error = fun(p); vn_unlock(vp); fdrop(fp); return error; } + +static int vfs_fun_at2(int dirfd, const char *pathname, std::function fun) +{ +if (!pathname) { +errno = EINVAL; +return -1; +} + +if (pathname[0] == '/' || dirfd == AT_FDCWD) { +return fun(pathname); +} + +return vfs_fun_at(dirfd, pathname, fun); +} + +OSV_LIBC_API +int openat(int dirfd, const char *pathname, int flags, ...) +{ +mode_t mode = 0; +if (flags & O_CREAT) { +va_list ap; +va_start(ap, flags); +mode = apply_umask(va_arg(ap, mode_t)); +va_end(ap); +} + +return vfs_fun_at2(dirfd, pathname, [flags, mode](const char *path) { +return open(path, flags, mode); +}); +} LFS64(openat); // open() has an optional third argument, "mode", which is only needed in @@ -602,6 +621,11 @@ extern "C" OSV_LIBC_API int __fxstatat(int ver, int dirfd, const char *pathname, struct stat *st, int flags) { +if (!pathname) { +errno = EINVAL; +return -1; +} + if (pathname[0] == '/' || dirfd == AT_FDCWD) { return stat(pathname, st); } @@ -611,35 +635,14 @@ int __fxstatat(int ver, int dirfd, const char *pathname, struct stat *st, return fstat(dirfd, st); } -struct file *fp; -int error = fget(dirfd, &fp); -if (error) { -errno = error; -return -1; -} - -struct vnode *vp = fp->f_dentry->d_vnode; -vn_lock(vp); - -std::unique_ptr up (new char[PATH_MAX]); -char *p = up.get(); -/* build absolute path */ -strlcpy(p, fp->f_dentry->d_mount->m_path, PATH_MAX); -strlcat(p, fp->f_dentry->d_path, PATH_MAX); -strlcat(p, "/", PATH_MAX); -strlcat(p, pathname, PATH_MAX); - -if (flags & AT_SYMLINK_NOFOLLOW) { -error = lstat(p, st); -} -else { -error = stat(p, st); -} - -vn_unlock(vp); -fdrop(fp); - -return error; +return vfs_fun_at(dirfd, pathname, 
[flags,st](const char *absolute_path) { +if (flags & AT_SYMLINK_NOFOLLOW) { +return lstat(absolute_path, st); +} +else { +return stat(absolute_path, st); +} +}); } LFS64(__fxstatat); @@ -870,37 +873,9 @@ int mkdirat(int dirfd, const char *pathname, mode_t mode) { mode = apply_umask(mode); - if (pathname[0] == '/' || dirfd == AT_FDCWD) { -// Supplied path is either absolute or relative to cwd -return mkdir(pathname, mode); -} - -// Supplied path is relative to folder specified by dirfd
[osv-dev] [PATCH] syscall: implement getdents64
It looks like golang apps that need to iterate over the entries in a directory use the getdents64 system call, which is documented at https://man7.org/linux/man-pages/man2/getdents.2.html. Normally this functionality is provided by libc functions like opendir(), readdir(), etc., which actually delegate to getdents64. Go is known for bypassing libc in such cases. So this patch implements the getdents64 syscall by adding a utility function to VFS main.cc that is then called by syscall in linux.cc. For details of how this function works, please look at the comments. This patch also adds a unit test to verify this syscall works. Refs #1188 Signed-off-by: Waldemar Kozaczuk --- fs/vfs/main.cc | 68 + linux.cc | 4 ++ modules/tests/Makefile | 2 +- tests/tst-getdents.cc | 111 + 4 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 tests/tst-getdents.cc diff --git a/fs/vfs/main.cc b/fs/vfs/main.cc index 1b0d7c11..f5db6be0 100644 --- a/fs/vfs/main.cc +++ b/fs/vfs/main.cc @@ -668,6 +668,10 @@ struct __dirstream int fd; }; +void _delete_dir(DIR *dir) { +delete dir; +} + OSV_LIBC_API DIR *opendir(const char *path) { @@ -775,6 +779,70 @@ int readdir64_r(DIR *dir, struct dirent64 *entry, extern "C" OSV_LIBC_API struct dirent *readdir64(DIR *dir) __attribute__((alias("readdir"))); +struct linux_dirent64 { +u64d_ino; +s64d_off; +unsigned short d_reclen; +unsigned char d_type; +char d_name[]; +}; + +extern "C" +ssize_t __get_dents_64(int fd, void *dirp, size_t count) +{ +auto *dir = fdopendir(fd); +if (dir) { +// We have verified that fd points to a valid directory +// but we do NOT need the DIR handle so just delete it +delete dir; + +struct file *fp; +int error = fget(fd, &fp); +if (error) { +errno = error; +return -1; +} + +size_t bytes_read = 0; +off_t last_off = -1; +errno = 0; + +// Iterate over as many entries as there is space in the buffer +// by directly calling sys_readdir() +struct dirent entry; +while ((error = sys_readdir(fp, &entry)) == 0) { +auto rec_len 
= offsetof(linux_dirent64, d_name) + strlen(entry.d_name) + 1; +if (rec_len <= count) { +auto *ldirent = static_cast(dirp + bytes_read); +ldirent->d_ino = entry.d_ino; +ldirent->d_off = entry.d_off; +ldirent->d_type = entry.d_type; +strcpy(ldirent->d_name, entry.d_name); +ldirent->d_reclen = rec_len; +count -= rec_len; +bytes_read += rec_len; +last_off = entry.d_off; +} else { +if (last_off >= 0) +sys_seekdir(fp, last_off); +break; +} +} + +fdrop(fp); + +if (error && error != ENOENT) { +errno = error; +return -1; +} else { +errno = 0; +return bytes_read; +} +} else { +return -1; +} +} + OSV_LIBC_API void rewinddir(DIR *dirp) { diff --git a/linux.cc b/linux.cc index c9b6b7b6..235ba1cf 100644 --- a/linux.cc +++ b/linux.cc @@ -410,6 +410,9 @@ static int tgkill(int tgid, int tid, int sig) return -1; } +#define __NR___get_dents_64 __NR_getdents64 +extern "C" ssize_t __get_dents_64(int fd, void *dirp, size_t count); + OSV_LIBC_API long syscall(long number, ...) { // Save FPU state and restore it at the end of this function @@ -495,6 +498,7 @@ OSV_LIBC_API long syscall(long number, ...) 
SYSCALL0(getuid); SYSCALL3(lseek, int, off_t, int); SYSCALL2(statfs, const char *, struct statfs *); +SYSCALL3(__get_dents_64, int, void *, size_t); } debug_always("syscall(): unimplemented system call %d\n", number); diff --git a/modules/tests/Makefile b/modules/tests/Makefile index ca489341..e462ebc8 100644 --- a/modules/tests/Makefile +++ b/modules/tests/Makefile @@ -133,7 +133,7 @@ tests := tst-pthread.so misc-ramdisk.so tst-vblk.so tst-bsd-evh.so \ tst-getopt.so tst-getopt-pie.so tst-non-pie.so tst-semaphore.so \ tst-elf-init.so tst-realloc.so tst-setjmp.so \ libtls.so libtls_gold.so tst-tls.so tst-tls-gold.so tst-tls-pie.so \ - tst-sigaction.so tst-syscall.so tst-ifaddrs.so + tst-sigaction.so tst-syscall.so tst-ifaddrs.so tst-getdents.so # libstatic-thread-variable.so tst-static-thread-variable.so \ #TODO For now let us disable these tests for aarch64 until diff --git a/tests/tst-getdents.cc b/tests/tst-getdents.cc new file mode 100644 index ..5803aaeb --- /dev/null +++ b/tests/tst-getdents.cc @@ -0,0 +1,111 @@
[osv-dev] [PATCH] vfs: implement symlinkat
This patch implements the symlinkat() function and enhances tst-symlink.cc to unit test it. Signed-off-by: Waldemar Kozaczuk --- exported_symbols/osv_ld-musl.so.1.symbols | 1 + exported_symbols/osv_libc.so.6.symbols| 1 + fs/vfs/main.cc| 12 tests/tst-symlink.cc | 18 +++--- 4 files changed, 29 insertions(+), 3 deletions(-) diff --git a/exported_symbols/osv_ld-musl.so.1.symbols b/exported_symbols/osv_ld-musl.so.1.symbols index f1c61a3f..3db22e0d 100644 --- a/exported_symbols/osv_ld-musl.so.1.symbols +++ b/exported_symbols/osv_ld-musl.so.1.symbols @@ -1081,6 +1081,7 @@ swab swprintf swscanf symlink +symlinkat sync syscall sysconf diff --git a/exported_symbols/osv_libc.so.6.symbols b/exported_symbols/osv_libc.so.6.symbols index 7ae57c38..e29059bb 100644 --- a/exported_symbols/osv_libc.so.6.symbols +++ b/exported_symbols/osv_libc.so.6.symbols @@ -887,6 +887,7 @@ swprintf __swprintf_chk swscanf symlink +symlinkat sync syscall sysconf diff --git a/fs/vfs/main.cc b/fs/vfs/main.cc index 4f0ce463..1b0d7c11 100644 --- a/fs/vfs/main.cc +++ b/fs/vfs/main.cc @@ -1122,6 +1122,18 @@ int symlink(const char *oldpath, const char *newpath) return 0; } +OSV_LIBC_API +int symlinkat(const char *oldpath, int newdirfd, const char *newpath) +{ +if (newpath[0] == '/' || newdirfd == AT_FDCWD) { +return symlink(oldpath, newpath); +} + +return vfs_fun_at(newdirfd, newpath, [oldpath](const char *absolute_path) { +return symlink(oldpath, absolute_path); +}); +} + TRACEPOINT(trace_vfs_unlink, "\"%s\"", const char*); TRACEPOINT(trace_vfs_unlink_ret, ""); TRACEPOINT(trace_vfs_unlink_err, "%d", int); diff --git a/tests/tst-symlink.cc b/tests/tst-symlink.cc index 978cfda3..1322e79e 100644 --- a/tests/tst-symlink.cc +++ b/tests/tst-symlink.cc @@ -25,6 +25,9 @@ #define N1"f1" #define N2"f2_AAA" +#define N2B "f2_BBB" +#define N2B "f2_BBB" +#define N2C "f2_CCC" #define N3"f3" #define N4"f4" #define N5"f5" @@ -91,6 +94,8 @@ int main(int argc, char **argv) #endif report(chdir(TESTDIR) == 0, "chdir"); +auto 
test_dir = opendir(TESTDIR); +report(test_dir, "opendir"); /* * test to check @@ -115,6 +120,10 @@ int main(int argc, char **argv) #else report(symlink(N1, N2) == 0, "symlink"); report(search_dir(TESTDIR, N2) == true, "search dir"); +report(symlinkat(N1, dirfd(test_dir), N2B) == 0, "symlinkat"); +report(search_dir(TESTDIR, N2B) == true, "search dir N2B"); +report(symlinkat(N1, AT_FDCWD, N2C) == 0, "symlinkat"); +report(search_dir(TESTDIR, N2C) == true, "search dir N2B"); #endif #if defined(READ_ONLY_FS) @@ -125,6 +134,8 @@ int main(int argc, char **argv) #else report(access(N1, R_OK | W_OK) == 0, "access"); report(access(N2, R_OK | W_OK) == 0, "access"); +report(access(N2B, R_OK | W_OK) == 0, "access"); +report(access(N2C, R_OK | W_OK) == 0, "access"); #endif rc = readlink(N2, path, sizeof(path)); @@ -157,6 +168,8 @@ int main(int argc, char **argv) error = errno; report(rc < 0 && errno == ENOENT, "ENOENT expected"); report(unlink(N2) == 0, "unlink"); +report(unlinkat(dirfd(test_dir),N2B,0) == 0, "unlinkat"); +report(unlinkat(dirfd(test_dir),N2C,0) == 0, "unlinkat"); /* * IO Tests 1: write(file), read(symlink), truncate(symlink) @@ -365,8 +378,6 @@ int main(int argc, char **argv) report(search_dir(D2, N5) == true, "Symlink search"); report(rename(D2, D3) == 0, "rename(d2, d3)"); -auto test_dir = opendir(TESTDIR); -report(test_dir, "opendir"); rc = readlinkat(dirfd(test_dir), D3, path, sizeof(path)); report(rc >= 0, "readlinkat"); path[rc] = 0; @@ -381,7 +392,6 @@ int main(int argc, char **argv) report(rc >= 0, "readlinkat"); path[rc] = 0; report(strcmp(path, D1) == 0, "readlinkat path"); -report(closedir(test_dir) == 0, "closedir(test_dir)"); rc = readlink(D3, path, sizeof(path)); report(rc >= 0, "readlink"); path[rc] = 0; @@ -399,6 +409,8 @@ int main(int argc, char **argv) report(rmdir(D4) == 0, "rmdir"); #endif +report(closedir(test_dir) == 0, "closedir(test_dir)"); + #if defined(READ_ONLY_FS) report(-1 == rmdir(TESTDIR) && errno == ENOTEMPTY, "rmdir"); #else -- 
2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220518032648.76794-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] unlinkat: fill the gaps in the implementation
This patch enhances the unlinkat() implementation to handle the AT_FDCWD dirfd and AT_REMOVEDIR flags. We also enhance tst-remove.cc to test unlinkat. Signed-off-by: Waldemar Kozaczuk --- fs/vfs/main.cc | 18 ++ tests/tst-remove.cc | 18 -- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/fs/vfs/main.cc b/fs/vfs/main.cc index a72042b2..4f0ce463 100644 --- a/fs/vfs/main.cc +++ b/fs/vfs/main.cc @@ -1154,11 +1154,21 @@ int unlink(const char *pathname) OSV_LIBC_API int unlinkat(int dirfd, const char *pathname, int flags) { -//TODO: Really implement it -if (dirfd != AT_FDCWD || flags) { -UNIMPLEMENTED("unlinkat() with non-zero flags or dirfd != AT_FDCWD"); +if (pathname[0] == '/' || dirfd == AT_FDCWD) { +if (flags & AT_REMOVEDIR) { +return rmdir(pathname); +} else { +return unlink(pathname); +} } -return unlink(pathname); + +return vfs_fun_at(dirfd, pathname, [flags](const char *absolute_path) { +if (flags & AT_REMOVEDIR) { +return rmdir(absolute_path); +} else { +return unlink(absolute_path); +} +}); } TRACEPOINT(trace_vfs_stat, "\"%s\" %p", const char*, struct stat*); diff --git a/tests/tst-remove.cc b/tests/tst-remove.cc index 6851cba0..fdf4037d 100644 --- a/tests/tst-remove.cc +++ b/tests/tst-remove.cc @@ -42,6 +42,8 @@ bool do_expect(T actual, T expected, const char *actuals, const char *expecteds, int main(int argc, char **argv) { expect(mkdir("/tmp/tst-remove", 0777), 0); +auto tst_remove_dir = open("/tmp/tst-remove", O_DIRECTORY); +expect(tst_remove_dir != -1, true); /* test unlink() **/ // unlink() non-existant file returns ENOENT @@ -79,12 +81,24 @@ int main(int argc, char **argv) expect_errno(rmdir("/tmp/tst-remove/f"), ENOTDIR); expect(unlink("/tmp/tst-remove/f"), 0); -/* test remove() ***/ -// TODO... 
+/* test unlinkat() ***/ +expect(mknod("/tmp/tst-remove/u", 0777|S_IFREG, 0), 0); +expect(unlinkat(tst_remove_dir, "u", 0), 0); +expect(mknod("/tmp/tst-remove/u2", 0777|S_IFREG, 0), 0); +expect(chdir("/tmp/tst-remove"), 0); +expect(unlinkat(AT_FDCWD, "u2", 0), 0); + +expect(mkdir("/tmp/tst-remove/ud", 0777), 0); +expect(unlinkat(tst_remove_dir, "ud", AT_REMOVEDIR), 0); + +expect(mkdir("/tmp/tst-remove/ud2", 0777), 0); +expect(chdir("/tmp/tst-remove"), 0); +expect(unlinkat(AT_FDCWD, "ud2", AT_REMOVEDIR), 0); // Finally remove the temporary directory (assumes the above left // nothing in it) +expect(close(tst_remove_dir), 0); expect(rmdir("/tmp/tst-remove"), 0); -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220518032614.76774-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] vfs: refactor the *at() functions
The VFS functions like openat(), faccessat() and others alike take a directory descriptor argument intended to make a filesystem action happen relative to that directory. The implementations of those functions repeat almost the same code over and over. So this patch makes vfs more DRY by introducing a helper function - vfs_fun_at() - which implements the common logic and executes a lambda function specific to the given VFS action. This patch also adds some unit tests around readlinkat() and mkdirat(). Signed-off-by: Waldemar Kozaczuk --- fs/vfs/main.cc | 188 +++ tests/tst-readdir.cc | 9 +++ tests/tst-symlink.cc | 12 ++- 3 files changed, 66 insertions(+), 143 deletions(-) diff --git a/fs/vfs/main.cc b/fs/vfs/main.cc index ca357cc8..a72042b2 100644 --- a/fs/vfs/main.cc +++ b/fs/vfs/main.cc @@ -160,21 +160,8 @@ int open(const char *pathname, int flags, ...) LFS64(open); -OSV_LIBC_API -int openat(int dirfd, const char *pathname, int flags, ...) +static int vfs_fun_at(int dirfd, const char *pathname, std::function fun) { -mode_t mode = 0; -if (flags & O_CREAT) { -va_list ap; -va_start(ap, flags); -mode = apply_umask(va_arg(ap, mode_t)); -va_end(ap); -} - -if (pathname[0] == '/' || dirfd == AT_FDCWD) { -return open(pathname, flags, mode); -} - struct file *fp; int error = fget(dirfd, &fp); if (error) { @@ -191,16 +178,38 @@ int openat(int dirfd, const char *pathname, int flags, ...) /* build absolute path */ strlcpy(p, fp->f_dentry->d_mount->m_path, PATH_MAX); strlcat(p, fp->f_dentry->d_path, PATH_MAX); -strlcat(p, "/", PATH_MAX); -strlcat(p, pathname, PATH_MAX); +if (pathname) { +strlcat(p, "/", PATH_MAX); +strlcat(p, pathname, PATH_MAX); +} -error = open(p, flags, mode); +error = fun(p); vn_unlock(vp); fdrop(fp); return error; } + +OSV_LIBC_API +int openat(int dirfd, const char *pathname, int flags, ...) 
+{ +mode_t mode = 0; +if (flags & O_CREAT) { +va_list ap; +va_start(ap, flags); +mode = apply_umask(va_arg(ap, mode_t)); +va_end(ap); +} + +if (pathname[0] == '/' || dirfd == AT_FDCWD) { +return open(pathname, flags, mode); +} + +return vfs_fun_at(dirfd, pathname, [flags, mode](const char *absolute_path) { +return open(absolute_path, flags, mode); +}); +} LFS64(openat); // open() has an optional third argument, "mode", which is only needed in @@ -611,35 +620,14 @@ int __fxstatat(int ver, int dirfd, const char *pathname, struct stat *st, return fstat(dirfd, st); } -struct file *fp; -int error = fget(dirfd, &fp); -if (error) { -errno = error; -return -1; -} - -struct vnode *vp = fp->f_dentry->d_vnode; -vn_lock(vp); - -std::unique_ptr up (new char[PATH_MAX]); -char *p = up.get(); -/* build absolute path */ -strlcpy(p, fp->f_dentry->d_mount->m_path, PATH_MAX); -strlcat(p, fp->f_dentry->d_path, PATH_MAX); -strlcat(p, "/", PATH_MAX); -strlcat(p, pathname, PATH_MAX); - -if (flags & AT_SYMLINK_NOFOLLOW) { -error = lstat(p, st); -} -else { -error = stat(p, st); -} - -vn_unlock(vp); -fdrop(fp); - -return error; +return vfs_fun_at(dirfd, pathname, [flags,st](const char *absolute_path) { +if (flags & AT_SYMLINK_NOFOLLOW) { +return lstat(absolute_path, st); +} +else { +return stat(absolute_path, st); +} +}); } LFS64(__fxstatat); @@ -875,32 +863,9 @@ int mkdirat(int dirfd, const char *pathname, mode_t mode) return mkdir(pathname, mode); } -// Supplied path is relative to folder specified by dirfd -struct file *fp; -int error = fget(dirfd, &fp); -if (error) { -errno = error; -return -1; -} - -struct vnode *vp = fp->f_dentry->d_vnode; -vn_lock(vp); - -std::unique_ptr up (new char[PATH_MAX]); -char *p = up.get(); - -/* build absolute path */ -strlcpy(p, fp->f_dentry->d_mount->m_path, PATH_MAX); -strlcat(p, fp->f_dentry->d_path, PATH_MAX); -strlcat(p, "/", PATH_MAX); -strlcat(p, pathname, PATH_MAX); - -error = mkdir(p, mode); - -vn_unlock(vp); -fdrop(fp); - -return error; +return 
vfs_fun_at(dirfd, pathname, [mode](const char *absolute_path) { +return mkdir(absolute_path, mode); +}); } TRACEPOINT(trace_vfs_rmdir, "\"%s\"", const char*); @@ -1721,31 +1686,9 @@ int faccessat(int dirfd, const char *pathname, int mode, int flags) return access(pathname, mode); } -struct file *fp; -int error = fget(dirfd, &fp); -if (error) {
[osv-dev] [PATCH] syscall: support getgid, getuid, lseek and statfs
This patch adds 4 new syscalls that map one-to-one to the four functions listed in the title above. These are required to run SeaweedFS on OSv. Refs #1188 Signed-off-by: Waldemar Kozaczuk --- linux.cc | 5 + 1 file changed, 5 insertions(+) diff --git a/linux.cc b/linux.cc index d3823d00..c9b6b7b6 100644 --- a/linux.cc +++ b/linux.cc @@ -38,6 +38,7 @@ #include #include #include +#include #include @@ -490,6 +491,10 @@ OSV_LIBC_API long syscall(long number, ...) #endif SYSCALL3(mkdirat, int, char*, mode_t); SYSCALL3(tgkill, int, int, int); +SYSCALL0(getgid); +SYSCALL0(getuid); +SYSCALL3(lseek, int, off_t, int); +SYSCALL2(statfs, const char *, struct statfs *); } debug_always("syscall(): unimplemented system call %d\n", number); -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220516164321.130748-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] sysconf: handle _SC_MINSIGSTKSZ and _SC_SIGSTKSZ
New versions (>= ~1.74) of the boost unit test library used by some of our tests started using sysconf() with the argument _SC_SIGSTKSZ to determine the size of the signal stack. This patch enhances the sysconf() implementation to handle _SC_MINSIGSTKSZ and _SC_SIGSTKSZ. Signed-off-by: Waldemar Kozaczuk --- include/api/unistd.h | 2 ++ runtime.cc | 2 ++ 2 files changed, 4 insertions(+) diff --git a/include/api/unistd.h b/include/api/unistd.h index 4cf86adb..cb5f4489 100644 --- a/include/api/unistd.h +++ b/include/api/unistd.h @@ -470,6 +470,8 @@ void syncfs(int); #define _SC_XOPEN_STREAMS 246 #define _SC_THREAD_ROBUST_PRIO_INHERIT 247 #define _SC_THREAD_ROBUST_PRIO_PROTECT 248 +#define _SC_MINSIGSTKSZ 249 +#define _SC_SIGSTKSZ 250 #define _CS_PATH 0 #define _CS_POSIX_V6_WIDTH_RESTRICTED_ENVS 1 diff --git a/runtime.cc b/runtime.cc index 3942982c..521b5c24 100644 --- a/runtime.cc +++ b/runtime.cc @@ -379,6 +379,8 @@ long sysconf(int name) case _SC_THREAD_SAFE_FUNCTIONS: return 1; case _SC_GETGR_R_SIZE_MAX: return 1; case _SC_OPEN_MAX: return FDMAX; +case _SC_MINSIGSTKSZ: return MINSIGSTKSZ; +case _SC_SIGSTKSZ: return SIGSTKSZ; default: debug(fmt("sysconf(): stubbed for parameter %1%\n") % name); errno = EINVAL; -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220511215346.84070-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] tests: detect boost setup errors
New boost-based (>= ~1.74) unit tests started using sysconf() to determine the signal stack size, which at this moment is not supported by the OSv implementation. This results in errors looking like this: "Test setup error: std::bad_alloc: std::bad_alloc", which are happily ignored by our error pattern detection logic in testing.py. This patch adds one more failure pattern. Signed-off-by: Waldemar Kozaczuk --- scripts/tests/testing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/tests/testing.py b/scripts/tests/testing.py index a1cdec35..50b49c08 100644 --- a/scripts/tests/testing.py +++ b/scripts/tests/testing.py @@ -81,7 +81,8 @@ def scan_errors(s,scan_for_failed_to_load_object_error=True): "at org.junit.runner.JUnitCore.main", "ContextFailedException", "AppThreadTerminatedWithUncaughtException", - "\[backtrace\]" + "\[backtrace\]", +"Test setup error" ] if scan_for_failed_to_load_object_error: -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220511214915.143995-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] libc: implement _dl_find_object()
Ubuntu 22.04 comes with a new version of GCC 11.2.0 that somehow includes an instance of libgcc_s.so.1 destined for GCC_12.0.0, at least based on what readelf shows. The implication of this is that during exception handling and stack unwinding, this version of libgcc_s.so.1 uses the _dl_find_object() function that was very recently added to glibc. For more details please read the following: - https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg275982.html - https://www.mail-archive.com/gcc-patches@gcc.gnu.org/msg273082.html - https://github.com/gcc-mirror/gcc/commit/790854ea7670f11c14d431c102a49181d2915965 - http://www.gnu.org/software/libc/manual/html_node/Dynamic-Linker-Introspection.html So this patch adds a basic (slightly incomplete) implementation of _dl_find_object() that satisfies the need of the new libgcc_s.so.1 - the dlfo_eh_frame field of the struct dl_find_object. Please note that for now we do not populate the dlfo_link_map field as it is not clear what exactly goes in there and how it is used. We may need to revisit this later. Signed-off-by: Waldemar Kozaczuk --- exported_symbols/osv_libc.so.6.symbols | 1 + include/api/__dlfcn.h | 26 ++ libc/dlfcn.cc | 20 3 files changed, 47 insertions(+) create mode 100644 include/api/__dlfcn.h diff --git a/exported_symbols/osv_libc.so.6.symbols b/exported_symbols/osv_libc.so.6.symbols index 7bae56c4..7ae57c38 100644 --- a/exported_symbols/osv_libc.so.6.symbols +++ b/exported_symbols/osv_libc.so.6.symbols @@ -86,6 +86,7 @@ dirfd dirname div dl_iterate_phdr +_dl_find_object dngettext dprintf drand48 diff --git a/include/api/__dlfcn.h b/include/api/__dlfcn.h new file mode 100644 index ..228122d1 --- /dev/null +++ b/include/api/__dlfcn.h @@ -0,0 +1,26 @@ +#ifndef___DLFCN_H +#define___DLFCN_H + +#ifdef __cplusplus +extern "C" { +#endif + +struct dl_find_object +{ + __extension__ unsigned long long int dlfo_flags; + void *dlfo_map_start;/* Beginning of mapping containing address. */ + void *dlfo_map_end; /* End of mapping. 
*/ + struct link_map *dlfo_link_map; + void *dlfo_eh_frame; /* Exception handling data of the object. */ + __extension__ unsigned long long int __dflo_reserved[7]; +}; + +/* If ADDRESS is found in an object, fill in *RESULT and return 0. + Otherwise, return -1. */ +int _dl_find_object (void *__address, struct dl_find_object *__result); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libc/dlfcn.cc b/libc/dlfcn.cc index fa496e4e..8ec37148 100644 --- a/libc/dlfcn.cc +++ b/libc/dlfcn.cc @@ -6,6 +6,7 @@ */ #include +#include <__dlfcn.h> #include #include #include @@ -142,3 +143,22 @@ extern "C" char *dlerror(void) { return dlerror_set(nullptr); } + +extern "C" int _dl_find_object(void *address, dl_find_object* result) +{ // +// Find ELF object with a mapping containing the passed in +// address and if found populate the result structure as described +// in http://www.gnu.org/software/libc/manual/html_node/Dynamic-Linker-Introspection.html +auto eo = elf::get_program()->object_containing_addr(address); +if (eo) { +result->dlfo_map_start = eo->base(); +result->dlfo_map_end = eo->end(); +result->dlfo_eh_frame = eo->eh_frame_addr(); +//TODO: For now we are neglecting to populate the result->dlfo_link_map field +//as it is not very well documented what exactly should go there. Eventually, +//once we understand the purpose of this field better, we should populate it as well. + return 0; + } else { + return -1; + } +} -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220511194936.46011-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] elf: capture address of PT_GNU_EH_FRAME
Signed-off-by: Waldemar Kozaczuk --- core/elf.cc| 5 - include/osv/elf.hh | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/core/elf.cc b/core/elf.cc index 5170bf53..c96a45ec 100644 --- a/core/elf.cc +++ b/core/elf.cc @@ -124,6 +124,7 @@ object::object(program& prog, std::string pathname) , _module_index(_prog.register_dtv(this)) , _is_executable(false) , _init_called(false) +, _eh_frame(0) , _visibility_thread(nullptr) , _visibility_level(VisibilityLevel::Public) { @@ -517,10 +518,12 @@ void object::process_headers() case PT_PHDR: case PT_GNU_STACK: case PT_GNU_RELRO: -case PT_GNU_EH_FRAME: case PT_PAX_FLAGS: case PT_GNU_PROPERTY: break; +case PT_GNU_EH_FRAME: +_eh_frame = _base + phdr.p_vaddr; +break; case PT_TLS: _tls_segment = _base + phdr.p_vaddr; _tls_init_size = phdr.p_filesz; diff --git a/include/osv/elf.hh b/include/osv/elf.hh index afc9c9a5..31702bf8 100644 --- a/include/osv/elf.hh +++ b/include/osv/elf.hh @@ -383,6 +383,7 @@ public: ulong get_tls_size(); ulong get_aligned_tls_size(); void copy_local_tls(void* to_addr); +void* eh_frame_addr() { return _eh_frame; } protected: virtual void load_segment(const Elf64_Phdr& segment) = 0; virtual void unload_segment(const Elf64_Phdr& segment) = 0; @@ -436,6 +437,7 @@ protected: bool _is_executable; bool is_core(); bool _init_called; +void* _eh_frame; std::unordered_map _cached_symbols; -- 2.34.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220511141000.37478-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] scripts: expand %(libgcc_s_dir) when generating usr.manifest
The manifest skeleton files like usr.manifest.skel have an entry for libgcc_s_dir looking like this: /usr/lib/libgcc_s.so.1: %(libgcc_s_dir)s/libgcc_s.so.1 This actually gets expanded quite late during the build process by upload_manifest.py. The unfortunate consequence of this is that loader.py used during debugging which reads usr.manifest does not have any logic to load libgcc_s.so.1. And this makes stack traces look useless in those cases that involve libgcc_s.so.1. So this patch slightly changes the scripts/build and scripts/module.py to expand %(libgcc_s_dir) when writing to build/release/usr.manifest. As a result of this the stack trace of the crash I have been working on looks much more reasonable: (gdb) bt '#0 0x403047c2 in processor::cli_hlt () at arch/x64/processor.hh:247 #1 arch::halt_no_interrupts () at arch/x64/arch.hh:48 #2 osv::halt () at arch/x64/power.cc:29 #3 0x40239504 in abort (fmt=fmt@entry=0x405b1e93 "Aborted\n") at runtime.cc:142 #4 0x40202e80 in abort () at runtime.cc:106 #5 0x1002b6b6 in ?? () #6 0x1003f5cb in _Unwind_Resume () #7 0x10062daa in ?? () #8 0x10075b5c in boost::execution_monitor::vexecute(boost::function const&) () #9 0x1007f0a9 in boost::unit_test::framework::init(bool (*)(), int, char**) () #10 0x1009254d in boost::unit_test::unit_test_main(bool (*)(), int, char**) () #11 0x4039d021 in osv::application::run_main (this=0xa0bd8c10) at core/app.cc:416 #12 0x4039d22d in operator() (app=, __closure=0x0) at core/app.cc:236 #13 _FUN () at core/app.cc:238 #14 0x403d089a in operator() (__closure=0xa0d57800) at libc/pthread.cc:116 #15 std::__invoke_impl&> (__f=...) at /usr/include/c++/11/bits/invoke.h:61 #16 std::__invoke_r&> (__fn=...) at /usr/include/c++/11/bits/invoke.h:154 #17 std::_Function_handler >::_M_invoke(const std::_Any_data &) (__functor=...) 
at /usr/include/c++/11/bits/std_function.h:290 #18 0x4036b5ae in sched::thread::main (this=0x80f6a040) at core/sched.cc:1267 #19 sched::thread_main_c (t=0x80f6a040) at arch/x64/arch-switch.hh:325 #20 0x402fda43 in thread_main () at arch/x64/entry.S:116 Signed-off-by: Waldemar Kozaczuk --- scripts/build | 32 scripts/module.py | 11 +++ 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/scripts/build b/scripts/build index 38aa70d5..fbfe0ae3 100755 --- a/scripts/build +++ b/scripts/build @@ -227,6 +227,21 @@ if [[ ${vars[append_manifest]} == "true" && $modules == "!default" ]]; then modules="empty" fi +CC=gcc +if [[ "$host_arch" == "x86_64" && "$arch" == 'aarch64' ]]; then +CC=${CROSS_PREFIX:-aarch64-linux-gnu-}gcc +fi + +libgcc_s_path=$(${CC} -print-file-name=libgcc_s.so.1) +if [[ "$libgcc_s_path" == "libgcc_s.so.1" ]]; then + cat <<-EOF + Unable to resolve libgcc_s.so.1 using "${CC}". + Looking in build/downloaded_packages/aarch64/gcc/install/lib64 + EOF + libgcc_s_path="build/downloaded_packages/aarch64/gcc/install/lib64/libgcc_s.so.1" +fi +libgcc_s_dir=$(dirname $(readlink -f ${libgcc_s_path})) + # The parentheses start a subshell. Whatever is exported there, doesn't affect the external shell ( # Note: the double-quotes and almost everything in the line below is important to correctly allow spaces @@ -240,7 +255,7 @@ fi esac done # Export the variables we already have. This makes it unnecessary to do "fs__type=$fstype ..." - export fs_type mode OSV_BUILD_PATH + export fs_type mode OSV_BUILD_PATH libgcc_s_dir # Other variables we wanted to rename, I don't know why export ARCH=$arch OSV_BASE=$SRC # Run what we wanted to run. It will inherit everything we exported above. @@ -276,21 +291,6 @@ kernel_end=$(($loader_size+2097151 & ~2097151)) # the case in our old build.mk). 
cd $OUT -CC=gcc -if [[ "$host_arch" == "x86_64" && "$arch" == 'aarch64' ]]; then -CC=${CROSS_PREFIX:-aarch64-linux-gnu-}gcc -fi - -libgcc_s_path=$(${CC} -print-file-name=libgcc_s.so.1) -if [[ "$libgcc_s_path" == "libgcc_s.so.1" ]]; then - cat <<-EOF - Unable to resolve libgcc_s.so.1 using "${CC}". - Looking in ../downloaded_packages/aarch64/gcc/install/lib64 - EOF - libgcc_s_path="../downloaded_packages/aarch64/gcc/install/lib64/libgcc_s.so.1" -fi -libgcc_s_dir=$(dirname $(readlink -f ${libgcc_s_path})) - if [ "$export" != "none" ]; then export_dir=${vars[export_dir]-$SRC/build/export} "$SRC"/scripts/export_manifest.py -e "
[osv-dev] [PATCH] scripts: tweak loader.py to load libstdc++.so.*-gdb.py on Ubuntu 22.04
On newest version of Ubuntu, the location of the python script libstdc++.so.*-gdb.py has moved to a subdirectory under /usr/share/gcc/python. This patch tweaks the relevant logic to try this new place if the /usr/share/gcc-*/python does not work. Signed-off-by: Waldemar Kozaczuk --- scripts/loader.py | 20 +++- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/scripts/loader.py b/scripts/loader.py index cfa87e0c..f154da75 100644 --- a/scripts/loader.py +++ b/scripts/loader.py @@ -1191,24 +1191,18 @@ def setup_libstdcxx(): # "libstdc++.so.6.0.20" shared object is loaded into the debugger. # But because OSv is statically linked, we miss that auto-loading, so we # need to look for, and run, this script explicitly. -sys.path += [glob('/usr/share/gcc-*/python')[0]] +gcc_python_dirs = glob('/usr/share/gcc-*/python') +if len(gcc_python_dirs) == 0: #If the above does not work try different place +gcc_python_dirs = glob('/usr/share/gcc/python') +if len(gcc_python_dirs) == 0: + print("!!! Could not locate the libstdc++.so.6.0.20-gdb.py") + return +sys.path += [gcc_python_dirs[0]] for base, dirnames, filenames in os.walk(gdb.PYTHONDIR + '/../auto-load'): for filename in fnmatch.filter(filenames, 'libstdc++.so.*-gdb.py'): script = os.path.join(base, filename) exec(compile(open(script).read(), script, 'exec')) return -# The following commented code is similar, but takes the python script -# from external/ instead of the one installed on the system. This might -# be useful if "make build_env=external" was used. However, there's a -# snag - the Python script we have in external/ might not be compatible -# with the version of Python installed on the system (there's right now -# a transition between Python 2 and Python 3 making things difficult). 
-#gcc = external + '/gcc.bin' -#sys.path += [gcc + '/usr/share/gdb/auto-load/usr/lib64', -# glob(gcc + '/usr/share/gcc-*/python')[0], -# ] -#main = glob(gcc + '/usr/share/gdb/auto-load/usr/lib64/libstdc++.so.*.py')[0] -#exec(compile(open(main).read(), main, 'exec')) def sig_to_string(sig): '''Convert a tracepoing signature to a string''' -- 2.35.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220510174012.118360-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] aarch64: parse correctly cpu nodes in DTB tree
It looks like the code to parse the /cpus node in DTB tree assumed that all subnodes would represent the cpus. Unfortunately, it is not the case any more as recent QEMU on both RPI 4 and Odroid show a structure looking like this: cpus { #address-cells = <0x02>; #size-cells = <0x00>; cpu-map { cluster0 { core0 { cpu = <0x09>; }; core1 { cpu = <0x0a>; }; }; cluster1 { core0 { cpu = <0x0b>; }; core1 { cpu = <0x0c>; }; core2 { cpu = <0x0d>; }; core3 { cpu = <0x0e>; }; }; }; cpu@0 { device_type = "cpu"; compatible = "arm,cortex-a53"; reg = <0x00 0x00>; enable-method = "psci"; capacity-dmips-mhz = <0x250>; next-level-cache = <0x4c>; #cooling-cells = <0x02>; cpu-supply = <0x4d>; operating-points-v2 = <0x4e>; clocks = <0x02 0xbb>; clock-latency = <0xc350>; phandle = <0x09>; }; cpu@1 { device_type = "cpu"; compatible = "arm,cortex-a53"; reg = <0x00 0x01>; enable-method = "psci"; capacity-dmips-mhz = <0x250>; next-level-cache = <0x4c>; #cooling-cells = <0x02>; cpu-supply = <0x4d>; operating-points-v2 = <0x4e>; clocks = <0x02 0xbb>; clock-latency = <0xc350>; phandle = <0x0a>; }; }; As one can notice, besides the cpu subnodes there is also "cpu-map" one which at this moment we do not care about. And more to the point the dtb_parse_cpus_count() and dtb_parse_cpus_mpid() functions in arch-dtb.cc would try to parse those non-cpu subnodes and lead to detecting wrong number of CPUs and OSv would end up hanging while waiting for all secondary CPUs to come up. So this patch fixes the code to parse the /cpus node by iterating over all subnodes but filtering only those that have the 'device_type' property set to 'cpu'. This patch also combines dtb_parse_cpus_count() and dtb_parse_cpus_mpid() into a single dtb_parse_cpus() function. For more information about cpus information in DTB tree please read https://www.kernel.org/doc/Documentation/devicetree/bindings/cpu/cpu-topology.txt. 
Signed-off-by: Waldemar Kozaczuk --- arch/aarch64/arch-dtb.cc | 57 +--- 1 file changed, 24 insertions(+), 33 deletions(-) diff --git a/arch/aarch64/arch-dtb.cc b/arch/aarch64/arch-dtb.cc index d7241963..9aeeaa28 100644 --- a/arch/aarch64/arch-dtb.cc +++ b/arch/aarch64/arch-dtb.cc @@ -381,9 +381,12 @@ bool dtb_get_gic_v2(u64 *dist, size_t *dist_len, u64 *cpu, size_t *cpu_len) return true; } -/* this gets the cpus node and returns the number of cpu elements in it. */ +/* this parses the cpus node and mpidr values and returns the number of cpu in it. */ +#define DTB_MAX_CPU_COUNT 32 static int dtb_cpu_count = -1; -static int dtb_parse_cpus_count() +static u64 dtb_cpus_mpids[DTB_MAX_CPU_COUNT]; + +static int dtb_parse_cpus() { int node, subnode, count; if (!dtb) @@ -393,9 +396,24 @@ static int dtb_parse_cpus_count() if (node < 0) return -1; +u64 *mpids = dtb_cpus_mpids; for (count = 0, subnode = fdt_first_subnode(dtb, node); subnode >= 0; - count++, subnode = fdt_next_subnode(dtb, subnode)) { + subnode = fdt_next_subnode(dtb, subnode)) { + +if (count > DTB_MAX_CPU_COUNT) { +abort("dtb_parse_cpus_mpid: number of cpus greater than maximum. Increase the DTB_MAX_CPU_COUNT!\n"); +} + +// Only count subnode that have a property "device_type" with value "cpu" +auto property = fdt_get_property(dtb, subnode, "device_type", NULL); +if (property) { +if (!strncmp("cpu", property->data, 3)) { +(void)dtb_get_reg(subnode, mpids); +mpids++; +count++; +} +} } return count; } @@ -405,33 +423,6 @@ int dtb_get_cpus_count() r
[osv-dev] [PATCH] libc: add number of glibc _chk extension functions
This patch adds 5 new glibc extension functions that are part of the glibc binary standard and are needed to build OSv kernel on Ubuntu 22.04: - __mbsnrtowcs_chk - __mbsrtowcs_chk - __wmemcpy_chk - __wmemmove_chk - __wmemset_chk The functions are implemented to meet the specification defined in here as an example - http://refspecs.linux-foundation.org/LSB_4.1.0/LSB-Core-generic/LSB-Core-generic/libc---wmemset-chk-1.html. Signed-off-by: Waldemar Kozaczuk --- Makefile | 5 + exported_symbols/osv_libc.so.6.symbols | 5 + libc/multibyte/__mbsnrtowcs_chk.c | 18 ++ libc/multibyte/__mbsrtowcs_chk.c | 18 ++ libc/string/__wmemcpy_chk.c| 17 + libc/string/__wmemmove_chk.c | 17 + libc/string/__wmemset_chk.c| 17 + 7 files changed, 97 insertions(+) create mode 100644 libc/multibyte/__mbsnrtowcs_chk.c create mode 100644 libc/multibyte/__mbsrtowcs_chk.c create mode 100644 libc/string/__wmemcpy_chk.c create mode 100644 libc/string/__wmemmove_chk.c create mode 100644 libc/string/__wmemset_chk.c diff --git a/Makefile b/Makefile index c6dff37f..3e87a16d 100644 --- a/Makefile +++ b/Makefile @@ -1447,7 +1447,9 @@ musl += multibyte/mbrlen.o musl += multibyte/mbrtowc.o musl += multibyte/mbsinit.o musl += multibyte/mbsnrtowcs.o +libc += multibyte/__mbsnrtowcs_chk.o musl += multibyte/mbsrtowcs.o +libc += multibyte/__mbsrtowcs_chk.o musl += multibyte/mbstowcs.o musl += multibyte/mbtowc.o musl += multibyte/wcrtomb.o @@ -1780,8 +1782,11 @@ musl += string/wcswcs.o musl += string/wmemchr.o musl += string/wmemcmp.o musl += string/wmemcpy.o +libc += string/__wmemcpy_chk.o musl += string/wmemmove.o +libc += string/__wmemmove_chk.o musl += string/wmemset.o +libc += string/__wmemset_chk.o musl += temp/__randname.o musl += temp/mkdtemp.o diff --git a/exported_symbols/osv_libc.so.6.symbols b/exported_symbols/osv_libc.so.6.symbols index 39e79692..7bae56c4 100644 --- a/exported_symbols/osv_libc.so.6.symbols +++ b/exported_symbols/osv_libc.so.6.symbols @@ -508,7 +508,9 @@ mbrlen mbrtowc mbsinit mbsnrtowcs 
+__mbsnrtowcs_chk mbsrtowcs +__mbsrtowcs_chk mbstowcs mbtowc memalign @@ -1040,8 +1042,11 @@ wcwidth wmemchr wmemcmp wmemcpy +__wmemcpy_chk wmemmove +__wmemmove_chk wmemset +__wmemset_chk wprintf write writev diff --git a/libc/multibyte/__mbsnrtowcs_chk.c b/libc/multibyte/__mbsnrtowcs_chk.c new file mode 100644 index ..45fcacf3 --- /dev/null +++ b/libc/multibyte/__mbsnrtowcs_chk.c @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2022 Waldemar Kozaczuk + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +#include +#include +#include + +size_t __mbsnrtowcs_chk(wchar_t *dst, const char **src, size_t nmc, size_t len, mbstate_t *ps, size_t dstlen) +{ +if (len > dstlen) { +_chk_fail("mbsnrtowcs"); +} +return mbsnrtowcs (dst, src, nmc, len, ps); +} diff --git a/libc/multibyte/__mbsrtowcs_chk.c b/libc/multibyte/__mbsrtowcs_chk.c new file mode 100644 index ..57194703 --- /dev/null +++ b/libc/multibyte/__mbsrtowcs_chk.c @@ -0,0 +1,18 @@ +/* + * Copyright (C) 2022 Waldemar Kozaczuk + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +#include +#include +#include + +size_t __mbsrtowcs_chk(wchar_t *dst, const char **src, size_t len, mbstate_t *ps, size_t dstlen) +{ +if (len > dstlen) { +_chk_fail("mbsrtowcs"); +} +return mbsrtowcs(dst, src, len, ps); +} diff --git a/libc/string/__wmemcpy_chk.c b/libc/string/__wmemcpy_chk.c new file mode 100644 index ..a7b9e5e7 --- /dev/null +++ b/libc/string/__wmemcpy_chk.c @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2022 Waldemar Kozaczuk + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. 
+ */ + +#include +#include + +wchar_t * __wmemcpy_chk(wchar_t *restrict dest, const wchar_t *restrict src, size_t len, size_t destlen) +{ +if (len > destlen) { +_chk_fail("wmemcpy"); +} +return wmemcpy(dest, src, len); +} diff --git a/libc/string/__wmemmove_chk.c b/libc/string/__wmemmove_chk.c new file mode 100644 index ..11ca9617 --- /dev/null +++ b/libc/string/__wmemmove_chk.c @@ -0,0 +1,17 @@ +/* + * Copyright (C) 2022 Waldemar Kozaczuk + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ + +#include +#include + +wchar_t * __wmemmove_chk(wchar_t *restrict dest, const wchar_t *restrict src, size_t len, size_t destlen) +{ +if (len > destlen) { +_chk_fail("wmemmov
[osv-dev] [PATCH] aarch64: fix zfs support
This patch fixes ZFS support on aarch64. As the issue #1131 explains, the ZFS page scanner logic clears the access flag of PTEs of relevant memory-mapped chunks of the files. On Intel, the CPU automatically sets the flags on first access (read or write) to those pages of memory. But on ARM it may need to be done by software if the CPU does not have this capability (it does not on RPI 4 and Odroid I have been using, possibly due to a QEMU limitation). So to set the access flags in software, this patch enhances the page fault handler to detect if the relevant fault is access flag related and does the manual page walk to navigate all the way down to the leaf PTE based on the virtual memory address retrieved from far_el1. Then it sets the access flag of the PTE and the dirty flag if the fault was triggered by a write. Eventually it writes the PTE back to memory and issues the necessary `dsb ishst` to force completion of writes to page table entries and flush the CPU pipeline. Finally, this patch adjusts `scripts/build` to support building ZFS images on arm and makes ZFS the default filesystem as on x86_64. Besides running all unit tests on a ZFS image I have also verified that more involved tests like misc-zfs-io.cc work as well. 
Fixes #1131 Signed-off-by: Waldemar Kozaczuk --- arch/aarch64/mmu.cc| 57 ++ scripts/build | 20 ++--- scripts/upload_manifest.py | 10 +-- 3 files changed, 74 insertions(+), 13 deletions(-) diff --git a/arch/aarch64/mmu.cc b/arch/aarch64/mmu.cc index ccf40667..aff7cc61 100644 --- a/arch/aarch64/mmu.cc +++ b/arch/aarch64/mmu.cc @@ -14,6 +14,59 @@ #include "arch-cpu.hh" #include "exceptions.hh" +#define ACCESS_FLAG_FAULT_LEVEL_3(esr)((esr & 0b011) == 0x0b) // 0xb = 0b1011 indicates level 3 +#define ACCESS_FLAG_FAULT_LEVEL_3_WHEN_WRITE(esr) ((esr & 0b111) == 0x4b) + +TRACEPOINT(trace_mmu_vm_access_flag_fault, "addr=%p", void *); + +template +T* phys_to_virt_cast(mmu::phys pa) +{ +void *virt = mmu::phys_mem + pa; +return static_cast(virt); +} + +static void handle_access_flag_fault(exception_frame *ef, u64 addr) { +trace_mmu_vm_access_flag_fault((void*)addr); + +// The access bit of a PTE (Page Table Entry) at level 3 got cleared and we need +// to set it to handle this page fault. Therefore we need to do a page walk +// to navigate down to the level 3 and identify relevant PTE. 
+ +// Start with root PTE +auto root_pt = mmu::get_root_pt(addr); +auto root_ptep = mmu::hw_ptep<4>::force(root_pt); + +// Identify PTEP (PTE Pointer) at level 0 (the template parameter is reversed) +// First identify the ptep table at this level +auto l3_ptep_table = mmu::hw_ptep<3>::force(phys_to_virt_cast>(root_ptep.read().next_pt_addr())); +// Then access ptep at the index encoded in the virtual address +auto l3_ptep = l3_ptep_table.at(mmu::pt_index(reinterpret_cast(addr), 3)); + +// Identify PTEP at level 1 (first identify the ptep table and then the relevant ptep) +auto l2_ptep_table = mmu::hw_ptep<2>::force(phys_to_virt_cast>(l3_ptep.read().next_pt_addr())); +auto l2_ptep = l2_ptep_table.at(mmu::pt_index(reinterpret_cast(addr), 2)); + +// Identify PTEP at level 2 (first identify the ptep table and then the relevant ptep) +auto l1_ptep_table = mmu::hw_ptep<1>::force(phys_to_virt_cast>(l2_ptep.read().next_pt_addr())); +auto l1_ptep = l1_ptep_table.at(mmu::pt_index(reinterpret_cast(addr), 1)); + +// Identify PTEP at level 3 (first identify the ptep table and then the relevant ptep) +auto l0_ptep_table = mmu::hw_ptep<0>::force(phys_to_virt_cast>(l1_ptep.read().next_pt_addr())); +auto l0_ptep = l0_ptep_table.at(mmu::pt_index(reinterpret_cast(addr), 0)); + +// Read leaf PTE +auto leaf_pte = l0_ptep.read(); + +leaf_pte.set_accessed(true); +if (ACCESS_FLAG_FAULT_LEVEL_3(ef->esr)) { +leaf_pte.set_dirty(true); +} + +l0_ptep.write(leaf_pte); +mmu::synchronize_page_table_modifications(); +} + void page_fault(exception_frame *ef) { sched::fpu_lock fpu; @@ -39,6 +92,10 @@ void page_fault(exception_frame *ef) abort("trying to execute null pointer"); } +if (ACCESS_FLAG_FAULT_LEVEL_3(ef->esr)) { +return handle_access_flag_fault(ef, addr); +} + /* vm_fault might sleep, so check that the thread is preemptable, * and that interrupts in the saved pstate are enabled. * Then enable interrupts for the vm_fault. 
diff --git a/scripts/build b/scripts/build index ffae67b3..38aa70d5 100755 --- a/scripts/build +++ b/scripts/build @@ -190,15 +190,7 @@ host_arch=$(uname -m) # Default manifest manifest=bootfs.manifest.skel -if [[ "$host_arch" == "aarch64" || "$arch" == "aarch64" ]]; then - # We default to ROFS as
[osv-dev] [PATCH] aarch64: implement signal handler
This patch implements the signal handling on aarch64. I will not be repeating the details of what it changes and why as it is quite well explained in the code changes. But in essence, this patch updates the build_signal_frame() which I believe was based on the x86_64 version of it with the changes specific to aarch64. It also adds missing handling of the SA_ONSTACK flag. Secondly, this patch also enhances entry.S to implement the call_signal_handler_thunk which is probably the most tricky part. The call_signal_handler_thunk is called on exit from a page fault as an effect of build_signal_frame() setting the `elr` field in the exception frame. Unlike on x86_64, the stack pointer register (sp) is not changed automatically based on the content of the frame on exit. To that end, the call_signal_handler_thunk has to carefully switch to SP_EL0 (exception stack) to read the value of the sp field from the exception frame in order to set SP_EL1 which is used normally for non-exception-handling code by kernel and apps. Eventually, it calls the call_signal_handler() which is implemented logically in a similar fashion as on x86_64 except for different registers. Finally, this patch also enables 3 unit tests to run on aarch64. 
Fixes #1154 Fixes #1151 Fixes #1152 Fixes #1153 Signed-off-by: Waldemar Kozaczuk --- arch/aarch64/entry.S | 52 + arch/aarch64/exceptions.hh | 2 +- arch/aarch64/signal.cc | 115 +++-- scripts/test.py| 9 - 4 files changed, 152 insertions(+), 26 deletions(-) diff --git a/arch/aarch64/entry.S b/arch/aarch64/entry.S index 8322ee90..8cbc0f57 100644 --- a/arch/aarch64/entry.S +++ b/arch/aarch64/entry.S @@ -92,8 +92,10 @@ exception_vectors: .endif mrs x2, elr_el1 mrs x3, spsr_el1 +mrs x4, far_el1 stp x30, x1, [sp, #240] // store lr, old SP stp x2, x3, [sp, #256] // store elr_el1, spsr_el1 +str x4, [sp, #280] // store far_el1 .endm /* push_state_to_exception_frame */ .macro pop_state_from_exception_frame @@ -294,18 +296,44 @@ entry_curr_el_irq x, 1 // the asynchronous exception handler used when the SP_EL call_signal_handler_thunk: .type call_signal_handler_thunk, @function .cfi_startproc simple -# stack contains a signal_frame -/* -.cfi_offset reg, offset -... -mov x0, sp -call call_signal_handler -# FIXME: fpu - -pop_pair... -add sp, sp, 16 # error_code -*/ -ret +.cfi_signal_frame +.cfi_def_cfa %sp, 0 +.cfi_offset x30, -32 // Point to the elr register located at the -32 offset + // of the exception frame to help gdb link to the + // address when interrupt was raised + +# The call_signal_handler_thunk gets called on exit from the synchronous exception +# (most likely page fault handler) as a result of build_signal_frame placing the address +# of call_signal_handler_thunk into elr field of the exception frame. + +# On exit from the exception, the stack selector is reset to point to SP_EL1 which +# is where we are now. However the build_signal_frame() placed the address of the stack +# we are supposed to use in the field 'sp' of the original exception frame still present +# on the exception stack (please note the exception have been disabled). So in order +# to read the value of the 'sp' field we need to switch back briefly to the exception +# stack. 
+mrs x1, SPsel +msr SPsel, #0 // switch back to SP_EL0 so we can see original exception frame +ldr x0, [sp, #-40] // read 'sp' field placed by build_signal_frame() in the original exception frame +msr SPsel, x1 // switch stack selector to the original value +mov sp, x0 // set sp to the stack setup by build_signal_frame() + // sp points to the signal frame and original exception frame at the same time +//TODO: Fix cfa to help debugger +msr daifclr, #2// enable interrupts which were disabled by build_signal_frame() +isb + +bl call_signal_handler //x0 (1st argument) points to the signal frame + +pop_state_from_exception_frame +# Adjust stack pointer by the remaining part of the signal frame to get back +# to the position in the stack we should be according to the logic in build_signal_frame(). +add sp, sp, #288 +# Please note we may not be on the original stack when exception was triggered. +# We would be IF the signal handler was executed on the same stack. However if user set +# up his own stack and passed using sigalstack() with SA_ONSTACK to make it handle +
[osv-dev] [PATCH] aarch64: force TLB flush when mprotect changes permission
When testing the tst-map.cc and tst-elf-permissions.cc with the upcoming patch to add signals support on aarch64, I noticed that sometimes they would kind of "hang" for a while and eventually complete successfully. This would happen especially when running in non-SMP mode (1 CPU). After more investigation I discovered that the tests would actually get into a page fault "loop" after calling mprotect() in the signal handler and stay like so until the page table changes were flushed which I am explaining below. Analysis of the mmu::protect() and the protection vma operation made me realize that it has an optimization to trigger full TLB flush only if permissions are reduced for any of the relevant pages (see eefcb083a65dd5693b9dbe792f5f3e3ef0b167df). The problem is that on ARM any change to the page entry table (regardless if reduction or expansion) needs a forced completion of writes achieved by the sequence of `dsb ishst` followed by `isb`. Full flush of TLB on ARM does that but on top of the expensive `tlbi vmalle1is`. So to fix this problem, this patch changes change_perm() to return true if permission changes on aarch64 which would then trigger TLB flush when the protection vma operation completes. In future we may optimize it. Signed-off-by: Waldemar Kozaczuk --- core/mmu.cc | 8 1 file changed, 8 insertions(+) diff --git a/core/mmu.cc b/core/mmu.cc index 62aebc35..007d4331 100644 --- a/core/mmu.cc +++ b/core/mmu.cc @@ -242,7 +242,15 @@ bool change_perm(hw_ptep ptep, unsigned int perm) pte.set_rsvd_bit(0, !perm); ptep.write(pte); +#ifdef __x86_64__ return old & ~perm; +#endif +#ifdef __aarch64__ +//TODO: This will trigger full tlb flush in slightly more cases than on x64 +//and in future we should investigate more precise and hopefully lighter +//mechanism. But for now it will do it. +return old != perm; +#endif } template -- 2.27.0 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. 
To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220503214128.41746-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] aarch64: build tst-elf-permissions.cc with correctly named sections
The assembler on aarch64 uses different characters - '//' - for the end of line comment (see https://sourceware.org/binutils/docs-2.26/as/i386_002dChars.html#i386_002dChars and https://sourceware.org/binutils/docs-2.26/as/AArch64_002dChars.html#AArch64_002dChars respectively). So we add the relevant ifdef directives to make it build correctly on each arch. Signed-off-by: Waldemar Kozaczuk --- tests/tst-elf-permissions.cc | 10 ++ 1 file changed, 10 insertions(+) diff --git a/tests/tst-elf-permissions.cc b/tests/tst-elf-permissions.cc index 3b704787..4ff045a4 100644 --- a/tests/tst-elf-permissions.cc +++ b/tests/tst-elf-permissions.cc @@ -20,9 +20,19 @@ static int test_text_section() __attribute__((noinline)); // solution is to take advantage from the fact that gcc passes section name // verbatim to the assembler and thus adding '#' makes whatever gcc appends // to the directive ignored. +#ifdef __x86_64__ static int test_data_section() __attribute__((noinline, section(".data #"))); +#endif +#ifdef __aarch64__ +static int test_data_section() __attribute__((noinline, section(".data //"))); +#endif +#ifdef __x86_64__ static int test_gnu_relro __attribute__((section(".got #"))); +#endif +#ifdef __aarch64__ +static int test_gnu_relro __attribute__((section(".got //"))); +#endif volatile int value = 123; static int test_text_section() -- 2.27.0 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220503205612.41541-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] lockfree ring_spsc: make counter type a template parameter
This patch does not change the ring_spsc template in any significant way but merely makes the counter type (of _begin and _end) a parameter. This allows us to use smaller type - unsigned short (2 bytes) - in the unit test tst-ring-spsc-wraparound.cc to make it execute more rapidly. Before this change this unit test would run for almost 3 minutes on aarch64 and almost 10 seconds on x64. Now it executes way under a second and still verifies the same edge condition (see 1ba76eb03cba4431b557183d1001b16991cd1fa4). Signed-off-by: Waldemar Kozaczuk --- include/lockfree/ring.hh | 22 +++--- include/lockfree/unordered-queue-spsc.hh | 2 +- include/lockfree/unordered_ring_mpsc.hh | 2 +- include/osv/net_channel.hh | 2 +- include/osv/percpu_xmit.hh | 2 +- tests/misc-free-perf.cc | 2 +- tests/misc-lfring.cc | 2 +- tests/tst-nway-merger.cc | 2 +- tests/tst-ring-spsc-wraparound.cc| 4 ++-- 9 files changed, 20 insertions(+), 20 deletions(-) diff --git a/include/lockfree/ring.hh b/include/lockfree/ring.hh index c9fefccc..434a14b7 100644 --- a/include/lockfree/ring.hh +++ b/include/lockfree/ring.hh @@ -20,7 +20,7 @@ // // spsc ring of fixed size // -template +template class ring_spsc { public: ring_spsc(): _begin(0), _end(0) @@ -31,7 +31,7 @@ public: template inline bool emplace(Args&&... 
args) { -unsigned end = _end.load(std::memory_order_relaxed); +COUNTER_TYPE end = _end.load(std::memory_order_relaxed); // // It's ok to load _begin with relaxed ordering (in the size()) since @@ -56,7 +56,7 @@ public: bool pop(T& element) { -unsigned beg = _begin.load(std::memory_order_relaxed); +COUNTER_TYPE beg = _begin.load(std::memory_order_relaxed); if (empty()) { return false; @@ -83,15 +83,15 @@ public: * @return TRUE if there are no elements */ bool empty() const { -unsigned beg = _begin.load(std::memory_order_relaxed); -unsigned end = _end.load(std::memory_order_acquire); +COUNTER_TYPE beg = _begin.load(std::memory_order_relaxed); +COUNTER_TYPE end = _end.load(std::memory_order_acquire); return beg == end; } const T& front() const { DEBUG_ASSERT(!empty(), "calling front() on an empty queue!"); -unsigned beg = _begin.load(std::memory_order_relaxed); +COUNTER_TYPE beg = _begin.load(std::memory_order_relaxed); return _ring[beg & MaxSizeMask]; } @@ -102,16 +102,16 @@ public: * * @return the current number of the elements. 
*/ -unsigned size() const { -unsigned end = _end.load(std::memory_order_relaxed); -unsigned beg = _begin.load(std::memory_order_relaxed); +COUNTER_TYPE size() const { +COUNTER_TYPE end = _end.load(std::memory_order_relaxed); +COUNTER_TYPE beg = _begin.load(std::memory_order_relaxed); return (end - beg); } private: -std::atomic _begin CACHELINE_ALIGNED; -std::atomic _end CACHELINE_ALIGNED; +std::atomic _begin CACHELINE_ALIGNED; +std::atomic _end CACHELINE_ALIGNED; T _ring[MaxSize]; }; diff --git a/include/lockfree/unordered-queue-spsc.hh b/include/lockfree/unordered-queue-spsc.hh index 72f77790..702da681 100644 --- a/include/lockfree/unordered-queue-spsc.hh +++ b/include/lockfree/unordered-queue-spsc.hh @@ -26,7 +26,7 @@ namespace lockfree { template class unordered_queue_spsc { private: -ring_spsc _ring; +ring_spsc _ring; unordered_queue_mpsc _queue; public: diff --git a/include/lockfree/unordered_ring_mpsc.hh b/include/lockfree/unordered_ring_mpsc.hh index 72599bae..6c6d1165 100644 --- a/include/lockfree/unordered_ring_mpsc.hh +++ b/include/lockfree/unordered_ring_mpsc.hh @@ -26,7 +26,7 @@ template class unordered_ring_mpsc { private: -std::vector> rings; +std::vector> rings; public: using ring_mpsc_t = unordered_ring_mpsc; diff --git a/include/osv/net_channel.hh b/include/osv/net_channel.hh index 2784e9e7..11cc09cb 100644 --- a/include/osv/net_channel.hh +++ b/include/osv/net_channel.hh @@ -33,7 +33,7 @@ extern void* memory::alloc_page(); class net_channel { private: std::function _process_packet; -ring_spsc _queue; +ring_spsc _queue; sched::thread_handle _waiting_thread CACHELINE_ALIGNED; // extra list of threads to wake osv::rcu_ptr> _pollers; diff --git a/include/osv/percpu_xmit.hh b/include/osv/percpu_xmit.hh index 7ec6f1be..4b44bf6a 100644 --- a/include/osv/percpu_xmit.hh +++ b/include/osv/percpu_xmit.hh @@ -151,7 +151,7 @@ public: private: lockfree::queue_mpsc _waitq; -ring_spsc _r; +ring_spsc _r; // // We don't want to wake the waiters when the Tx worker 
is going to sleep. diff --git a/tests/misc-free
[osv-dev] [PATCH] aarch64: handle exceptions on dedicated stack
This patch changes the exception handling mechanism to use a dedicated exception stack instead of the default stack provided for kernel and application threads. This is critical to support Golang apps which are known to use tiny stacks in coroutines, and the exception handler of the svc instruction cannot use a single byte of the application stack in such a case. Having a separate exception stack has other benefits for debugging and will allow future implementation of "lazy" stacks. This also makes the aarch64 port similar to x64 where we use dedicated stacks as well. To support dedicated stacks, we take advantage of the fact that at every exception level but EL0 there are two stack registers available - SP_ELx and SP_EL0. OSv runs at the exception level EL1 and in boot.S selects SP_EL1 to be used by default. The SP effectively is an alias to one of the two stack registers and can be changed by setting the system register SPSel (stack selector). This patch changes all exception handlers (both synchronous and asynchronous (interrupts)) in entry.S to switch to the new exception stack before pushing a frame by setting the SPSel to #0 which makes SP point to SP_EL0. We have to switch to SP_EL0 even in the case of the nested exception when we are on SP_EL0, because as per the ARM specification the SP is always reset to SP_ELx (in our case SP_EL1) after taking an exception. The typical case of a nested exception is handling of a page fault where we enable exceptions downstream in the page fault handler (arch/aarch64/mmu.cc) and it may be interrupted by an asynchronous exception like a timer one. To that end we also add the exception handlers for curr_el_sp0 which the system invokes when code is running with SP pointing to SP_EL0. Finally, we also change the context switch code in sched.S to make it save not only the default stack register but explicitly save SP_EL0 and SP_EL1 and SPSel for the old thread and then restore those from arch_thread_state for the new thread. 
This makes context switch slightly more expensive and has been measured to add around 5% of overhead. This patch effectively enhances OSv to allow runing Golang apps on AArch64. Fixes #1155 Signed-off-by: Waldemar Kozaczuk --- arch/aarch64/arch-cpu.hh | 1 + arch/aarch64/arch-switch.hh | 12 +++-- arch/aarch64/arch-thread-state.hh | 3 ++ arch/aarch64/entry.S | 73 +-- arch/aarch64/sched.S | 18 +++- 5 files changed, 78 insertions(+), 29 deletions(-) diff --git a/arch/aarch64/arch-cpu.hh b/arch/aarch64/arch-cpu.hh index 15edbdaa..8848d880 100644 --- a/arch/aarch64/arch-cpu.hh +++ b/arch/aarch64/arch-cpu.hh @@ -33,6 +33,7 @@ struct arch_cpu { }; struct arch_thread { +char exception_stack[4096*4] __attribute__((aligned(16))); }; struct arch_fpu { diff --git a/arch/aarch64/arch-switch.hh b/arch/aarch64/arch-switch.hh index 0401a4b8..c8848605 100644 --- a/arch/aarch64/arch-switch.hh +++ b/arch/aarch64/arch-switch.hh @@ -33,13 +33,15 @@ void thread::switch_to_first() remote_thread_local_var(percpu_base) = _detached_state->_cpu->percpu_base; asm volatile("\n" - "ldp x29, x0, %2 \n" - "ldp x22, x21, %3 \n" + "ldp x29, x0, %3 \n" + "ldp x22, x21, %4 \n" "mov sp, x22 \n" + "ldr x22, %5 \n" + "msr sp_el0, x22 \n" "blr x21 \n" : // No output operands - this is to designate the input operands as earlyclobbers - "=&Ump"(this->_state.fp), "=&Ump"(this->_state.sp) - : "Ump"(this->_state.fp), "Ump"(this->_state.sp) +"=&Ump"(this->_state.fp), "=&Ump"(this->_state.sp), "=&Ump"(this->_state.exception_sp) + : "Ump"(this->_state.fp), "Ump"(this->_state.sp), "Ump"(this->_state.exception_sp) : "x0", "x19", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28", "x30", "memory"); } @@ -59,6 +61,8 @@ void thread::init_stack() _state.thread = this; _state.sp = stacktop; _state.pc = reinterpret_cast(thread_main); +_state.exception_sp = _arch.exception_stack + sizeof(_arch.exception_stack); +_state.stack_selector = 1; //Select SP_ELx } void thread::setup_tcb() diff --git 
a/arch/aarch64/arch-thread-state.hh b/arch/aarch64/arch-thread-state.hh index 6f1b680d..f6a27ff2 100644 --- a/arch/aarch64/arch-thread-state.hh +++ b/arch/aarch64/arch-thread-state.hh @@ -15,6 +15,9 @@ struct thread_state { void* sp; void* pc;
[osv-dev] [PATCH] aarch64: handle system calls
This patch enhances the aarch64 port to support handling system call instruction - SVC. On aarch64 the system calls are handled as synchronous exceptions triggered by executing the SVC instruction. Per syscall specification described in https://man7.org/linux/man-pages/man2/syscall.2.html, the caller needs to pass arguments using the x0-x5 registers, set the syscall number in x8 register and finally execute svc instruction. To handle this on OSv side, this patch enhances existing synchronous exception handler in entry.S to detect if the exception class set in ESR register matches the one for SVC (0x15) and then retrieve the arguments from the registers x0-x5 and syscall number from x8 and invoke the syscall_wrapper function in linux.cc. We also need to enable exceptions before calling syscall_wrapper so that functions called downstream may sleep which is similar to what we do for page faults. Please note the resulting enhancements are enough to make tst-syscall.so pass. Fixes #1156 Signed-off-by: Waldemar Kozaczuk --- arch/aarch64/entry.S | 26 ++ linux.cc | 10 ++ modules/tests/Makefile | 4 ++-- 3 files changed, 38 insertions(+), 2 deletions(-) diff --git a/arch/aarch64/entry.S b/arch/aarch64/entry.S index 03266f9c..25354359 100644 --- a/arch/aarch64/entry.S +++ b/arch/aarch64/entry.S @@ -120,6 +120,7 @@ thread_main: .equ ESR_EC_END,31 // Exception Class field end in ESR .equ ESR_EC_DATA_ABORT,0x25 // Exception Class Data Abort value .equ ESR_EC_INSN_ABORT,0x21 // Exception Class Instruction Abort value +.equ ESR_EC_SVC64,0x15 // Exception Class for SVC (System Call) in 64-bit state .equ ESR_ISS_BEG,0 // Instruction-Specific Syndrome field begin in ESR .equ ESR_ISS_END,23 // Instruction-Specific Syndrome field end in ESR @@ -197,6 +198,8 @@ entry_curr_el_spx_sync: str w1, [sp, #272] // Store Exception Syndrom Register in the frame ubfmx2, x1, #ESR_EC_BEG, #ESR_EC_END // Exception Class -> X2 ubfmx3, x1, #ESR_FLT_BEG, #ESR_FLT_END // FLT -> X3 +cmp x2, #ESR_EC_SVC64 
+b.eqhandle_system_call cmp x2, #ESR_EC_DATA_ABORT b.eqhandle_mem_abort cmp x2, #ESR_EC_INSN_ABORT @@ -211,6 +214,29 @@ handle_mem_abort: pop_state_from_exception_frame eret .cfi_endproc +handle_system_call: +.cfi_startproc +//see https://man7.org/linux/man-pages/man2/syscall.2.html for details +//about calling convention for arm64 + +//because we used x1, x2, x3 and x4 above we need to restore them from the frame +ldp x1, x2, [sp, #8] +ldp x3, x4, [sp, #24] + +mov x6, x8 // copy syscall number passed in x8 to the last 7th argument of the syscall_wrapper + +msr daifclr, #2 // enable interrupts, so that the functions called by syscall_wrapper can sleep +isb + +bl syscall_wrapper + +msr daifset, #2 // disable interrupts +isb + +str x0, [sp, #0] // copy the result in x0 directly into the frame so that it can be restored +pop_state_from_exception_frame +eret +.cfi_endproc unexpected_sync_exception: .cfi_startproc mov x0, sp // save exception_frame to x0 diff --git a/linux.cc b/linux.cc index 1c2eea24..d3823d00 100644 --- a/linux.cc +++ b/linux.cc @@ -498,12 +498,22 @@ OSV_LIBC_API long syscall(long number, ...) } long __syscall(long number, ...) __attribute__((alias("syscall"))); +#ifdef __x86_64__ // In x86-64, a SYSCALL instruction has exactly 6 parameters, because this is the number of registers // alloted for passing them (additional parameters *cannot* be passed on the stack). So we can get // 7 arguments to this function (syscall number plus its 6 parameters). 
Because in the x86-64 ABI the // seventh argument is on the stack, we must pass the arguments explicitly to the syscall() function // and can't just call it without any arguments and hope everything will be passed on extern "C" long syscall_wrapper(long number, long p1, long p2, long p3, long p4, long p5, long p6) +#endif +#ifdef __aarch64__ +// In aarch64, the first 8 parameters to a procedure call are passed in the x0-x7 registers and +// the parameters of syscall call (SVC intruction) in are passed in x0-x5 registers and syscall number +// in x8 register before. To avoid shuffling the arguments around we make syscall_wrapper() +// accept the syscall parameters as is but accept the syscall number as the last 7th argument which +// the code in entry.S arranges. +extern "C" long syscall_wrapper(long p1, long p2, long p3, long p4, long p5, long p6, long number) +#endif { int errno_backup = errno; // syscall and function return value are in rax diff --git a/modules/tests/Makefile b/modules/tests/Makefile index bcf4d609..9ea648f7
[osv-dev] [PATCH] aarch64: enable tst-ifaddrs.so
Signed-off-by: Waldemar Kozaczuk --- modules/tests/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/modules/tests/Makefile b/modules/tests/Makefile index 9ea648f7..ca489341 100644 --- a/modules/tests/Makefile +++ b/modules/tests/Makefile @@ -133,7 +133,7 @@ tests := tst-pthread.so misc-ramdisk.so tst-vblk.so tst-bsd-evh.so \ tst-getopt.so tst-getopt-pie.so tst-non-pie.so tst-semaphore.so \ tst-elf-init.so tst-realloc.so tst-setjmp.so \ libtls.so libtls_gold.so tst-tls.so tst-tls-gold.so tst-tls-pie.so \ - tst-sigaction.so tst-syscall.so + tst-sigaction.so tst-syscall.so tst-ifaddrs.so # libstatic-thread-variable.so tst-static-thread-variable.so \ #TODO For now let us disable these tests for aarch64 until @@ -141,7 +141,7 @@ tests := tst-pthread.so misc-ramdisk.so tst-vblk.so tst-bsd-evh.so \ # The tst-ifaddrs.so is an exception and it does not compile due to some # missing headers ifeq ($(arch),x64) -tests += tst-ifaddrs.so tst-mmx-fpu.so +tests += tst-mmx-fpu.so endif tests += testrunner.so -- 2.27.0 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220429040909.13372-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] aarch64: adapt tst-syscall.cc to work on aarch64
Signed-off-by: Waldemar Kozaczuk --- tests/tst-syscall.cc | 46 +--- 1 file changed, 39 insertions(+), 7 deletions(-) diff --git a/tests/tst-syscall.cc b/tests/tst-syscall.cc index 79332ad6..12722f1b 100644 --- a/tests/tst-syscall.cc +++ b/tests/tst-syscall.cc @@ -40,18 +40,29 @@ bool do_expect(T actual, T expected, const char *actuals, const char *expecteds, int main(int argc, char **argv) { -// Test that the x86 SYSCALL instruction works, and produces the same +// Test that the x86 SYSCALL and aarch64 SVC instructions work, and produce the same // results as the syscall() function (with expected differences in how // errors are returned). unsigned long syscall_nr = __NR_gettid; long tid = 0; -asm ("movq %1, %%rax\n" +#ifdef __x86_64__ +asm ("movq %[syscall_no], %%rax\n" "syscall\n" - "movq %%rax, %0\n" - : "=m" (tid) - : "m" (syscall_nr) + "movq %%rax, %[tid]\n" + : [tid]"=m" (tid) + : [syscall_no]"m" (syscall_nr) : "rax", "rdi"); +#endif + +#ifdef __aarch64__ +asm ("mov x8, %[syscall_no]\n" + "svc #0\n" + "mov %[tid], x0\n" + : [tid]"=r" (tid) + : [syscall_no]"r" (syscall_nr) + : "x0", "x8"); +#endif std::cout << "got tid=" << tid << std::endl; expect(tid >= 0, true); @@ -68,17 +79,38 @@ int main(int argc, char **argv) off_t offset = 0; void* buf = NULL; +syscall_nr = __NR_mmap; + +#ifdef __x86_64__ asm ("movq %[addr], %%rdi\n" "movq %[length], %%rsi\n" "movl %[prot], %%edx\n" "movq %[flags], %%r10\n" "movq %[fd], %%r8\n" "movq %[offset], %%r9\n" - "movq $9, %%rax\n" + "movq %[syscall_no], %%rax\n" "syscall\n" "movq %%rax, %[buf]\n" : [buf] "=m" (buf) - : [addr] "m" (addr), [length] "m" (length), [prot] "m" (prot), [flags] "m" (flags), [fd] "m" (fd), [offset] "m" (offset)); + : [addr] "m" (addr), [length] "m" (length), [prot] "m" (prot), + [flags] "m" (flags), [fd] "m" (fd), [offset] "m" (offset), [syscall_no] "m" (syscall_nr)); +#endif + +#ifdef __aarch64__ +asm ("mov x0, %[addr]\n" + "mov x1, %[length]\n" + "mov x2, %[prot]\n" + "mov x3, %[flags]\n" + "mov x4, 
%[fd]\n" + "mov x5, %[offset]\n" + "mov x8, %[syscall_no]\n" + "svc #0\n" + "mov %[buf], x0\n" + : [buf] "=r" (buf) + : [addr] "r" (addr), [length] "r" (length), [prot] "r" (prot), + [flags] "r" (flags), [fd] "r" (fd), [offset] "r" (offset), [syscall_no] "r" (syscall_nr) + : "x0", "x1", "x2", "x3", "x4", "x5", "x8"); +#endif assert(((long)buf) >= 0); munmap(buf, length); -- 2.27.0 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220429040831.13321-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH V2] aarch64: move kernel to 63rd GB of virtual memory
This patch modifies the aarch64 port to move the kernel from the 2nd to 63rd GB of virtual memory. It also adjusts the early preboot and boot assembly to dynamically adjust the early phys/virt mapping tables to make it work regardless where in physical memory the kernel and DTB is loaded. This allows us to use the same kernel binary on QEMU and Firecracker without having to adjust relevant variables in the makefile and rebuild it to accommodate the fact that each hypervisor would load loader.img in different area of physical memory. Prior to this patch the kernel would be mapped 1:1 in the first 4 GB of phys/virt memory. In essence, this patch enhances the preboot.S to dynamically identify location of start_elf to jump to. Then it modifies boot.S to dynamically calculate the offset between where kernel is located in virtual memory and where it is loaded in physical memory and then adjust the 63rd GB of early boot mapping tables accordingly. Finally it also adjusts the virt/phys and phys/virt translation functions in core/mmu.cc and other aspects in elf.cc and makefile. 
After the patch the virtual memory layout would look like this in QEMU: vaddrpaddr size perm memattr name 800 8001 rwxp dev gic_dist 801 8011 rwxp dev gic_cpu 900 900 1000 rwxp dev pl011 901 901 1000 rwxp dev pl031 1000 1000 2eff rwxp dev pci_mem 3eff 3eff1 rwxp dev pci_io fc000 4000 84e000 rwxp normal kernel 401000 401000 1000 rwxp dev pci_cfg 8a00 a00 200 rwxp normal virtio_mmio_cfg 8a000200 a000200 200 rwxp normal virtio_mmio_cfg 8a000400 a000400 200 rwxp normal virtio_mmio_cfg 8a000600 a000600 200 rwxp normal virtio_mmio_cfg 8a000800 a000800 200 rwxp normal virtio_mmio_cfg 8a000a00 a000a00 200 rwxp normal virtio_mmio_cfg 8a000c00 a000c00 200 rwxp normal virtio_mmio_cfg 8a000e00 a000e00 200 rwxp normal virtio_mmio_cfg 80004084e000 4084e000 7f7b2000 rwxp normal main 90004084e000 4084e000 7f7b2000 rwxp normal page a0004084e000 4084e000 7f7b2000 rwxp normal mempool Fixes #1087 Changes since V1: removed some code from loader.cc that got accidentally added as part of the initial version of the patch. 
Signed-off-by: Waldemar Kozaczuk --- Makefile | 27 +--- arch/aarch64/arch-dtb.cc | 10 +-- arch/aarch64/arch-mmu.hh | 1 + arch/aarch64/arch-setup.cc | 10 ++- arch/aarch64/arch.hh | 2 +- arch/aarch64/boot.S| 138 ++--- arch/aarch64/loader.ld | 5 +- arch/aarch64/preboot.S | 20 -- core/elf.cc| 4 ++ core/mmu.cc| 18 + loader.cc | 11 ++- 11 files changed, 213 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index 82885016..c6dff37f 100644 --- a/Makefile +++ b/Makefile @@ -318,8 +318,12 @@ kernel-defines = -D_KERNEL $(source-dialects) $(cc-hide-flags) $(gc-flags) # To add something that will *not* be part of the main kernel, you can do: # # mydir/*.o EXTRA_FLAGS = +ifeq ($(arch),x64) EXTRA_FLAGS = -D__OSV_CORE__ -DOSV_KERNEL_BASE=$(kernel_base) -DOSV_KERNEL_VM_BASE=$(kernel_vm_base) \ -DOSV_KERNEL_VM_SHIFT=$(kernel_vm_shift) -DOSV_LZKERNEL_BASE=$(lzkernel_base) +else +EXTRA_FLAGS = -D__OSV_CORE__ -DOSV_KERNEL_VM_BASE=$(kernel_vm_base) +endif EXTRA_LIBS = COMMON = $(autodepend) -g -Wall -Wno-pointer-arith $(CFLAGS_WERROR) -Wformat=0 -Wno-format-security \ -D __BSD_VISIBLE=1 -U _FORTIFY_SOURCE -fno-stack-protector $(INCLUDES) \ @@ -497,12 +501,13 @@ acpi = $(patsubst %.c, %.o, $(acpi-source)) $(acpi:%=$(out)/%): CFLAGS += -fno-strict-aliasing -Wno-stringop-truncation +kernel_vm_shift := $(shell printf "0x%X" $(shell expr $$(( $(kernel_vm_base) - $(kernel_base) )) )) + endif # x64 ifeq ($(arch),aarch64) -kernel_base := 0x4008 -kernel_vm_base := $(kernel_base) +kernel_vm_base := 0xfc008 #63GB app_local_exec_tls_size := 0x40 include $(libfdt_base)/Makefile.libfdt @@ -516,7 +521,7 @@ $(out)/preboot.bin: $(out)/preboot.elf $(call quiet, $(OBJCOPY) -O binary $^ $@, OBJCOPY $@) edata = $(shell readelf --syms $(out)/loader.elf | grep "\.edata" | awk '{print "0x" $$2}') -image_size = $$(( $(edata) - $(kernel_base) )) +image_size = $$(( $(edata) - $(kernel_vm_base) )) $(out)/loader.img: $(out)/preboot.bin $(out)/loader-stripped.elf $(call quiet, dd if=$(out)/preboot.bin 
of=$@ > /dev/null 2>&1, DD $@ preboot.bin) @@ -526,8 +531,6 @@ $(out)/loader.i
[osv-dev] [PATCH] aarch64: move kernel to 63rd GB of virtual memory
This patch modifies the aarch64 port to move the kernel from the 2nd to 63rd GB of virtual memory. It also adjusts the early preboot and boot assembly to dynamically adjust the early phys/virt mapping tables to make it work regardless where in physical memory the kernel and DTB is loaded. This allows us to use the same kernel binary on QEMU and Firecracker without having to adjust relevant variables in the makefile and rebuild it to accommodate the fact that each hypervisor would load loader.img in different area of physical memory. Prior to this patch the kernel would be mapped 1:1 in the first 4 GB of phys/virt memory. In essence, this patch enhances the preboot.S to dynamically identify location of start_elf to jump to. Then it modifies boot.S to dynamically calculate the offset between where kernel is located in virtual memory and where it is loaded in physical memory and then adjust the 63rd GB of early boot mapping tables accordingly. Finally it also adjusts the virt/phys and phys/virt translation functions in core/mmu.cc and other aspects in elf.cc and makefile. 
After the patch the virtual memory layout would look like this in QEMU: vaddrpaddr size perm memattr name 800 8001 rwxp dev gic_dist 801 8011 rwxp dev gic_cpu 900 900 1000 rwxp dev pl011 901 901 1000 rwxp dev pl031 1000 1000 2eff rwxp dev pci_mem 3eff 3eff1 rwxp dev pci_io fc000 4000 84e000 rwxp normal kernel 401000 401000 1000 rwxp dev pci_cfg 8a00 a00 200 rwxp normal virtio_mmio_cfg 8a000200 a000200 200 rwxp normal virtio_mmio_cfg 8a000400 a000400 200 rwxp normal virtio_mmio_cfg 8a000600 a000600 200 rwxp normal virtio_mmio_cfg 8a000800 a000800 200 rwxp normal virtio_mmio_cfg 8a000a00 a000a00 200 rwxp normal virtio_mmio_cfg 8a000c00 a000c00 200 rwxp normal virtio_mmio_cfg 8a000e00 a000e00 200 rwxp normal virtio_mmio_cfg 80004084e000 4084e000 7f7b2000 rwxp normal main 90004084e000 4084e000 7f7b2000 rwxp normal page a0004084e000 4084e000 7f7b2000 rwxp normal mempool Fixes #1087 Signed-off-by: Waldemar Kozaczuk --- Makefile | 27 +--- arch/aarch64/arch-dtb.cc | 10 +-- arch/aarch64/arch-mmu.hh | 1 + arch/aarch64/arch-setup.cc | 10 ++- arch/aarch64/arch.hh | 2 +- arch/aarch64/boot.S| 138 ++--- arch/aarch64/loader.ld | 5 +- arch/aarch64/preboot.S | 20 -- core/elf.cc| 4 ++ core/mmu.cc| 18 + loader.cc | 16 - 11 files changed, 218 insertions(+), 33 deletions(-) diff --git a/Makefile b/Makefile index 82885016..c6dff37f 100644 --- a/Makefile +++ b/Makefile @@ -318,8 +318,12 @@ kernel-defines = -D_KERNEL $(source-dialects) $(cc-hide-flags) $(gc-flags) # To add something that will *not* be part of the main kernel, you can do: # # mydir/*.o EXTRA_FLAGS = +ifeq ($(arch),x64) EXTRA_FLAGS = -D__OSV_CORE__ -DOSV_KERNEL_BASE=$(kernel_base) -DOSV_KERNEL_VM_BASE=$(kernel_vm_base) \ -DOSV_KERNEL_VM_SHIFT=$(kernel_vm_shift) -DOSV_LZKERNEL_BASE=$(lzkernel_base) +else +EXTRA_FLAGS = -D__OSV_CORE__ -DOSV_KERNEL_VM_BASE=$(kernel_vm_base) +endif EXTRA_LIBS = COMMON = $(autodepend) -g -Wall -Wno-pointer-arith $(CFLAGS_WERROR) -Wformat=0 -Wno-format-security \ -D __BSD_VISIBLE=1 -U 
_FORTIFY_SOURCE -fno-stack-protector $(INCLUDES) \ @@ -497,12 +501,13 @@ acpi = $(patsubst %.c, %.o, $(acpi-source)) $(acpi:%=$(out)/%): CFLAGS += -fno-strict-aliasing -Wno-stringop-truncation +kernel_vm_shift := $(shell printf "0x%X" $(shell expr $$(( $(kernel_vm_base) - $(kernel_base) )) )) + endif # x64 ifeq ($(arch),aarch64) -kernel_base := 0x4008 -kernel_vm_base := $(kernel_base) +kernel_vm_base := 0xfc008 #63GB app_local_exec_tls_size := 0x40 include $(libfdt_base)/Makefile.libfdt @@ -516,7 +521,7 @@ $(out)/preboot.bin: $(out)/preboot.elf $(call quiet, $(OBJCOPY) -O binary $^ $@, OBJCOPY $@) edata = $(shell readelf --syms $(out)/loader.elf | grep "\.edata" | awk '{print "0x" $$2}') -image_size = $$(( $(edata) - $(kernel_base) )) +image_size = $$(( $(edata) - $(kernel_vm_base) )) $(out)/loader.img: $(out)/preboot.bin $(out)/loader-stripped.elf $(call quiet, dd if=$(out)/preboot.bin of=$@ > /dev/null 2>&1, DD $@ preboot.bin) @@ -526,8 +531,6 @@ $(out)/loader.img: $(out)/preboot.bin $(out)/loader-stripped.elf endif # aarch64 -kernel_vm_shift := $(shell printf "0x%
[osv-dev] [PATCH] aarch64: fix atomic_fetchadd_int and atomic_fetchadd_long
This patch fixes a subtle yet critical bug in the implementation of the atomic_fetchadd_* functions used in the bsd subtree of OSv source code. This bug is a root cause of the issues #1189 and #1190 and affects stability of ZFS and networking stack on aarch64. The atomic_fetchadd_*() are implemented in inlined assembly and provide the functionality to atomically add/subtract to/from a 4- or 8-byte long value in memory and return old value of it before update. The assembly made of four instructions in essence implements simple loop to read a value from memory, add a specified delta, update memory with new value and finally check if the update was successful. The pre-patch version of this code is almost correct and works properly if the atomic update is successful in the 1st attempt. However it works incorrectly if that update fails and it needs to retry which is quite rare. As an example the generated machine code might look like this: 0x10119690 <+0>:ldaxr x2, [x0] 0x10119694 <+4>:add x1, x1, x2 0x10119698 <+8>:stlxr w3, x1, [x0] 0x1011969c <+12>: cbnz w3, 0x10119690 One can eventually notice that the x1 register holding a result (sum) to be updated to memory is re-used across iterations and would behave like an accumulator which is wrong. We have to fix the inline assembly to make sure that separate register is used for that. Rather than trying to fix existing code, this patch updates both atomic_fetchadd_*() functions with the copies of atomic_fetchadd_32 and atomic_fetchadd_64 from recent enough version of FreeBSD code - sys/arm64/include/atomic.h@119a353e3d9d45650e109600160caca173ac8a53 and tweaked to match the types of val, tmp and res variables. Please note the FreeBSD version of the code uses ldxr/stxr instructions instead of ldaxr/stlxr ones with acquire/release memory ordering semantics which are excessive for atomic_fetchadd_*(). 
Fixes #1189 Fixes #1190 Signed-off-by: Waldemar Kozaczuk --- bsd/aarch64/machine/atomic.h | 46 ++-- 1 file changed, 28 insertions(+), 18 deletions(-) diff --git a/bsd/aarch64/machine/atomic.h b/bsd/aarch64/machine/atomic.h index eb447d8e..55532585 100644 --- a/bsd/aarch64/machine/atomic.h +++ b/bsd/aarch64/machine/atomic.h @@ -83,28 +83,38 @@ int atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src); static __inline u_int atomic_fetchadd_int(volatile u_int *p, u_int val) { -u_int result; -u_int status; -__asm __volatile("1: ldaxr %w0, %1 ; " - " add %w2, %w2, %w0 ; " - " stlxr %w3, %w2, %1 ; " - " cbnz %w3, 1b ; " - : "=&r"(result), "+Q"(*p), "+r"(val), "=&r"(status)); - -return result; +u_int tmp, ret; +u_int res; + +__asm __volatile( +"1: ldxr%w2, [%3] \n" +" add %w0, %w2, %w4 \n" +" stxr%w1, %w0, [%3] \n" +" cbnz%w1, 1b\n" +: "=&r"(tmp), "=&r"(res), "=&r"(ret) +: "r" (p), "r" (val) +: "memory" +); + +return ret; } static __inline u_long atomic_fetchadd_long(volatile u_long *p, u_long val) { -u_long result; -u_int status; -__asm __volatile("1: ldaxr %0, %1 ; " - " add %2, %2, %0 ; " - " stlxr %w3, %2, %1 ; " - " cbnz %w3, 1b ; " - : "=&r"(result), "+Q"(*p), "+r"(val), "=&r"(status)); - -return result; +u_long tmp, ret; +u_int res; + +__asm __volatile( +"1: ldxr%2, [%3] \n" +" add %0, %2, %4\n" +" stxr%w1, %0, [%3] \n" +" cbnz%w1, 1b \n" +: "=&r"(tmp), "=&r"(res), "=&r"(ret) +: "r" (p), "r" (val) +: "memory" +); + +return ret; } static __inline void atomic_store_rel_int(volatile u_int *p, u_int val) -- 2.27.0 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220420181404.53392-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] mmap: avoid collisions with linear map
This patch enhances virtual memory mapping code to track both linear (linear_map) and non-linear (mmap()) virtual memory mappings. It does so by adding simple struct - vma_range and vma_range_set collection to store all mappings. It then modifies mmu::find_hole() implementation to use vma_range_set instead of vma_list to find next hole. That way it can avoid collisions described in the issue #1135. Fixes #1135 Signed-off-by: Waldemar Kozaczuk --- core/mmu.cc | 76 + include/osv/mmu.hh | 31 ++ include/osv/prio.hh | 1 + 3 files changed, 102 insertions(+), 6 deletions(-) diff --git a/core/mmu.cc b/core/mmu.cc index a479e5c4..80ce6b63 100644 --- a/core/mmu.cc +++ b/core/mmu.cc @@ -47,6 +47,17 @@ extern const char text_start[], text_end[]; namespace mmu { +struct vma_range_compare { +bool operator()(const vma_range& a, const vma_range& b) { +return a.start() < b.start(); +} +}; + +//Set of all vma ranges - both linear and non-linear ones +__attribute__((init_priority((int)init_prio::vma_range_set))) +std::set vma_range_set; +rwlock_t vma_range_set_mutex; + struct linear_vma_compare { bool operator()(const linear_vma* a, const linear_vma* b) { return a->_virt_addr < b->_virt_addr; @@ -66,6 +77,9 @@ public: } }; +constexpr uintptr_t lower_vma_limit = 0x0; +constexpr uintptr_t upper_vma_limit = 0x8000; + typedef boost::intrusive::set, bi::member_hookstart() <= upper_vma_limit) { //we only go up to the upper mmap vma limit +//See if desired hole fits between p and n vmas if (start >= p->end() && start + size <= n->start()) { return start; } +//See if shifting start to the end of p makes desired hole fit between p and n if (p->end() >= start && n->start() - p->end() >= size) { good_enough = p->end(); if (small) { return good_enough; } +//See if huge hole fits between p and n if (n->start() - align_up(good_enough, huge_page_size) >= size) { return align_up(good_enough, huge_page_size); } } +//If nothing worked move next in the list p = n; ++n; } @@ -999,6 +1033,9 @@ ulong 
evacuate(uintptr_t start, uintptr_t end) memory::stats::on_jvm_heap_free(size); } vma_list.erase(dead); +WITH_LOCK(vma_range_set_mutex.for_write()) { +vma_range_set.erase(vma_range(&dead)); +} delete &dead; } } @@ -1140,6 +1177,9 @@ uintptr_t allocate(vma *v, uintptr_t start, size_t size, bool search) v->set(start, start+size); vma_list.insert(*v); +WITH_LOCK(vma_range_set_mutex.for_write()) { +vma_range_set.insert(vma_range(v)); +} return start; } @@ -1493,6 +1533,9 @@ void anon_vma::split(uintptr_t edge) vma* n = new anon_vma(addr_range(edge, _range.end()), _perm, _flags); set(_range.start(), edge); vma_list.insert(*n); +WITH_LOCK(vma_range_set_mutex.for_write()) { +vma_range_set.insert(vma_range(n)); +} } error anon_vma::sync(uintptr_t start, uintptr_t end) @@ -1600,6 +1643,9 @@ jvm_balloon_vma::~jvm_balloon_vma() // for a dangling mapping representing a balloon that was already moved // out. vma_list.erase(*this); +WITH_LOCK(vma_range_set_mutex.for_write()) { +vma_range_set.erase(vma_range(this)); +} assert(!(_real_flags & mmap_jvm_balloon)); mmu::map_anon(addr(), size(), _real_flags, _real_perm); @@ -1667,6 +1713,9 @@ ulong map_jvm(unsigned char* jvm_addr, size_t size, size_t align, balloon_ptr b) // Since we will change its position in the tree, for the sake of future // lookups we need to reinsert it. vma_list.erase(*jvma); +WITH_LOCK(vma_range_set_mutex.for_write()) { +vma_range_set.erase(vma_range(jvma)); +} if (jvma->start() < start) { assert(jvma->partial() >= (jvma->end() - start)); jvma->set(jvma->start(), start); @@ -1675,11 +1724,17 @@ ulong map_jvm(unsigned char* jvm_addr, size_t size, size_t align, balloon_ptr b) jvma->set(end, jvma->end()); } vma_list.insert(*jvma); +WITH_LOCK(vma_range_set_mutex.for_write()) { +vma_range_set.insert(vma_range(jvma)); +} } else { // Note how v and jvma are different. This is because this one, // we wil
[osv-dev] [PATCH] loader.py: add linear_mmap command
Similarly to the patch that adds new pseudo file /sys/osv/memory/linear_maps, this one adds new loader.py command that can be used with gdb to display same information: (gdb) osv linear_mmap vaddrpaddr size perm memattr name 800 8001 rwxp dev gic_dist 801 8011 rwxp dev gic_cpu 900 900 1000 rwxp dev pl011 901 901 1000 rwxp dev pl031 1000 1000 2eff rwxp dev pci_mem 3eff 3eff1 rwxp dev pci_io 4000 4000 6d3000 rwxp normal kernel 401000 401000 1000 rwxp dev pci_cfg 8a00 a00 200 rwxp normal virtio_mmio_cfg 8a000200 a000200 200 rwxp normal virtio_mmio_cfg 8a000400 a000400 200 rwxp normal virtio_mmio_cfg 8a000600 a000600 200 rwxp normal virtio_mmio_cfg 8a000800 a000800 200 rwxp normal virtio_mmio_cfg 8a000a00 a000a00 200 rwxp normal virtio_mmio_cfg 8a000c00 a000c00 200 rwxp normal virtio_mmio_cfg 8a000e00 a000e00 200 rwxp normal virtio_mmio_cfg 8000406d3000 406d3000 7f92d000 rwxp normal main 9000406d3000 406d3000 7f92d000 rwxp normal page a000406d3000 406d3000 7f92d000 rwxp normal mempool Signed-off-by: Waldemar Kozaczuk --- scripts/loader.py | 29 + 1 file changed, 29 insertions(+) diff --git a/scripts/loader.py b/scripts/loader.py index e39299bf..cfa87e0c 100644 --- a/scripts/loader.py +++ b/scripts/loader.py @@ -1643,11 +1643,40 @@ class osv_percpu(gdb.Command): return gdb.write('%s\n'%target) +class osv_linear_mmap(gdb.Command): +def __init__(self): +gdb.Command.__init__(self, 'osv linear_mmap', + gdb.COMMAND_USER, gdb.COMPLETE_NONE) +def invoke(self, arg, for_tty): +l = str(gdb.lookup_global_symbol('mmu::linear_vma_set').value()) +linear_vmas = re.findall('\[([0-9]+)\] = (0x[0-9a-zA-Z]+)', l) + +gdb.write("%16s %16s %8s %4s %7s %s\n" % ("vaddr", "paddr", "size", "perm", "memattr", "name")) + +char_ptr = gdb.lookup_type('char').pointer() +for desc in linear_vmas: +addr = desc[1] +vma = gdb.parse_and_eval('(struct mmu::linear_vma *)' + addr) + +vaddr = vma['_virt_addr'] +paddr = vma['_phys_addr'] +size = vma['_size'] +if vma['_mem_attr'] == 0: +memattr = 'normal' +else: 
+memattr = 'dev' +name = vma['_name'].cast(char_ptr).string() + +# dispatch time ns ticks callout function +gdb.write("%16x %16x %8x rwxp %7s %s\n" % + (vaddr, paddr, size, memattr, name)) + osv() osv_heap() osv_memory() osv_waiters() osv_mmap() +osv_linear_mmap() osv_vma_find() osv_zfs() osv_syms() -- 2.31.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220317012408.872142-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] sysfs: support linear_maps
This patch builds on a previous one to track linear maps and adds new OSv specific pseudo file /sys/osv/memory/linear_maps: x86_64 example) 0x4020 0x20 7c7434 rwxp n kernel 0x8000 0 4000 rwxp n main 0x800f0xf1 rwxp n dmi 0x800f5a000xf5a00 247 rwxp n smbios 0x80004000 0x4000 3ffdd000 rwxp n main 0x80007fe0 0x7fe0 20 rwxp n acpi 0x8000febd1000 0xfebd1000 1000 rwxp n pci_bar 0x8000febd2000 0xfebd2000 1000 rwxp n pci_bar 0x8000fec0 0xfec0 1000 rwxp n ioapic 0x9000 0 4000 rwxp n page 0x90004000 0x4000 3ffdd000 rwxp n page 0xa000 0 4000 rwxp n mempool 0xa0004000 0x4000 3ffdd000 rwxp n mempool aarch64 example) 0x800 0x8001 rwxp d gic_dist 0x801 0x8011 rwxp d gic_cpu 0x900 0x900 1000 rwxp d pl011 0x901 0x901 1000 rwxp d pl031 0x1000 0x1000 2eff rwxp d pci_mem 0x3eff 0x3eff1 rwxp d pci_io 0x4000 0x4000 6d3000 rwxp n kernel 0x401000 0x401000 1000 rwxp d pci_cfg 0x8a00 0xa00 200 rwxp n virtio_mmio_cfg 0x8a000200 0xa000200 200 rwxp n virtio_mmio_cfg 0x8a000400 0xa000400 200 rwxp n virtio_mmio_cfg 0x8a000600 0xa000600 200 rwxp n virtio_mmio_cfg 0x8a000800 0xa000800 200 rwxp n virtio_mmio_cfg 0x8a000a00 0xa000a00 200 rwxp n virtio_mmio_cfg 0x8a000c00 0xa000c00 200 rwxp n virtio_mmio_cfg 0x8a000e00 0xa000e00 200 rwxp n virtio_mmio_cfg 0x8000406d3000 0x406d3000 7f92d000 rwxp n main 0x9000406d3000 0x406d3000 7f92d000 rwxp n page 0xa000406d3000 0x406d3000 7f92d000 rwxp n mempool Signed-off-by: Waldemar Kozaczuk --- core/mmu.cc | 12 fs/sysfs/sysfs_vnops.cc | 1 + include/osv/mmu.hh | 1 + 3 files changed, 14 insertions(+) diff --git a/core/mmu.cc b/core/mmu.cc index 7c78ac5b..a479e5c4 100644 --- a/core/mmu.cc +++ b/core/mmu.cc @@ -1879,6 +1879,18 @@ linear_vma::linear_vma(void* virt, phys phys, size_t size, mattr mem_attr, const linear_vma::~linear_vma() { } +std::string sysfs_linear_maps() { +std::ostringstream os; +WITH_LOCK(linear_vma_set_mutex.for_read()) { +for(auto *vma : linear_vma_set) { +char mattr = vma->_mem_attr == mmu::mattr::normal ? 
'n' : 'd'; +osv::fprintf(os, "%18x %18x %12x rwxp %c %s\n", +vma->_virt_addr, (void*)vma->_phys_addr, vma->_size, mattr, vma->_name.c_str()); +} +} +return os.str(); +} + void linear_map(void* _virt, phys addr, size_t size, const char* name, size_t slop, mattr mem_attr) { diff --git a/fs/sysfs/sysfs_vnops.cc b/fs/sysfs/sysfs_vnops.cc index 15636f92..248f16f0 100644 --- a/fs/sysfs/sysfs_vnops.cc +++ b/fs/sysfs/sysfs_vnops.cc @@ -95,6 +95,7 @@ sysfs_mount(mount* mp, const char *dev, int flags, const void* data) auto memory = make_shared(inode_count++); memory->add("free_page_ranges", inode_count++, sysfs_free_page_ranges); memory->add("pools", inode_count++, sysfs_memory_pools); +memory->add("linear_maps", inode_count++, mmu::sysfs_linear_maps); auto osv_extension = make_shared(inode_count++); osv_extension->add("memory", memory); diff --git a/include/osv/mmu.hh b/include/osv/mmu.hh index 463f2001..f4bdaa84 100644 --- a/include/osv/mmu.hh +++ b/include/osv/mmu.hh @@ -330,6 +330,7 @@ error advise(void* addr, size_t size, int advice); void vm_fault(uintptr_t addr, exception_frame* ef); std::string procfs_maps(); +std::string sysfs_linear_maps(); unsigned long all_vmas_size(); -- 2.31.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220317012348.871820-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] mmu: track linear mmap
Sometimes while debugging problems related to how kernel and devices are memory-mapped it is helpful to see it in some form in gdb or by reading a procfs file just like we can do with mmap-ed VMAs. In addition we need to know where linear VMAs are located so that we can avoid collisions with mmap() as described by the issue #1135. To that end this patch adds new struct - linear_vma and collection of those - linear_vma_set - to track how memory gets mapped using linear_map(). It also modifies all places calling linear_map() to pass new argument - name. Please note that we can not re-use existing vma class, as it holds much richer information and lots of it is not applicable to linear map which is quite static and is simply a pre-populated mapping between some area of virtual and physical memory. Upcoming patches will add new 'osv linear_mmap' to loader.py and implementation of new sysfs pseudo file. Signed-off-by: Waldemar Kozaczuk --- arch/aarch64/arch-setup.cc | 20 +++- arch/x64/apic.cc | 2 +- arch/x64/arch-setup.cc | 11 --- arch/x64/dmi.cc| 4 ++-- arch/x64/ioapic.cc | 2 +- bsd/porting/mmu.cc | 2 +- core/mmio.cc | 4 ++-- core/mmu.cc| 28 +++- drivers/acpi.cc| 2 +- drivers/hpet.cc| 2 +- drivers/mmio-isa-serial.cc | 2 +- drivers/pci-function.cc| 2 +- drivers/pl031.cc | 2 +- drivers/virtio-mmio.cc | 2 +- drivers/xenconsole.cc | 2 +- include/osv/mmio.hh| 2 +- include/osv/mmu.hh | 16 +++- include/osv/prio.hh| 1 + 18 files changed, 77 insertions(+), 29 deletions(-) diff --git a/arch/aarch64/arch-setup.cc b/arch/aarch64/arch-setup.cc index f9854b6a..622fb75f 100644 --- a/arch/aarch64/arch-setup.cc +++ b/arch/aarch64/arch-setup.cc @@ -61,7 +61,7 @@ void arch_setup_pci() pci::set_pci_cfg(pci_cfg, pci_cfg_len); pci_cfg = pci::get_pci_cfg(&pci_cfg_len); mmu::linear_map((void *)pci_cfg, (mmu::phys)pci_cfg, pci_cfg_len, - mmu::page_size, mmu::mattr::dev); + "pci_cfg", mmu::page_size, mmu::mattr::dev); /* linear_map [TTBR0 - PCI I/O and memory ranges] */ u64 ranges[2]; size_t 
ranges_len[2]; @@ -73,9 +73,9 @@ void arch_setup_pci() ranges[0] = pci::get_pci_io(&ranges_len[0]); ranges[1] = pci::get_pci_mem(&ranges_len[1]); mmu::linear_map((void *)ranges[0], (mmu::phys)ranges[0], ranges_len[0], -mmu::page_size, mmu::mattr::dev); +"pci_io", mmu::page_size, mmu::mattr::dev); mmu::linear_map((void *)ranges[1], (mmu::phys)ranges[1], ranges_len[1], -mmu::page_size, mmu::mattr::dev); +"pci_mem", mmu::page_size, mmu::mattr::dev); } #endif @@ -94,17 +94,19 @@ void arch_setup_free_memory() /* linear_map [TTBR1] */ for (auto&& area : mmu::identity_mapped_areas) { auto base = reinterpret_cast(get_mem_area_base(area)); -mmu::linear_map(base + addr, addr, memory::phys_mem_size); +mmu::linear_map(base + addr, addr, memory::phys_mem_size, +area == mmu::mem_area::main ? "main" : +area == mmu::mem_area::page ? "page" : "mempool"); } /* linear_map [TTBR0 - boot, DTB and ELF] */ mmu::linear_map((void *)mmu::mem_addr, (mmu::phys)mmu::mem_addr, -addr - mmu::mem_addr); +addr - mmu::mem_addr, "kernel"); if (console::PL011_Console::active) { /* linear_map [TTBR0 - UART] */ addr = (mmu::phys)console::aarch64_console.pl011.get_base_addr(); -mmu::linear_map((void *)addr, addr, 0x1000, mmu::page_size, +mmu::linear_map((void *)addr, addr, 0x1000, "pl011", mmu::page_size, mmu::mattr::dev); } @@ -112,7 +114,7 @@ void arch_setup_free_memory() if (console::Cadence_Console::active) { // linear_map [TTBR0 - UART] addr = (mmu::phys)console::aarch64_console.cadence.get_base_addr(); -mmu::linear_map((void *)addr, addr, 0x1000, mmu::page_size, +mmu::linear_map((void *)addr, addr, 0x1000, "cadence", mmu::page_size, mmu::mattr::dev); } #endif @@ -124,9 +126,9 @@ void arch_setup_free_memory() abort("arch-setup: failed to get GICv2 information from dtb.\n"); } gic::gic = new gic::gic_driver(dist, cpu); -mmu::linear_map((void *)dist, (mmu::phys)dist, dist_len, mmu::page_size, +mmu::linear_map((void *)dist, (mmu::phys)dist, dist_len, "gic_dist", mmu::page_size, mmu::mattr::dev); 
-mmu::linear_map((void *)cpu, (mmu::phys)cpu, cpu_len, mmu::page_size, +mmu::linear_map((void *)cpu, (mmu::phys)cpu, cpu_len, "gic_cpu&
[osv-dev] [PATCH] aarch64: improve unexpected exception handling
Debugging scenarios when OSv crashes due to an unexpected exception can be quite tedious given they are handled by entry_invalid which simply makes the kernel "hang" waiting for an interrupt. One needs to connect with gdb and introspect registers to make sense of what happened. This patch improves the unexpected exception handling by defining proper handlers for each exception level and exception type. When an exception is triggered, the corresponding handler prints the exception type, exception level and all registers and aborts, potentially printing a backtrace. Signed-off-by: Waldemar Kozaczuk --- arch/aarch64/entry.S | 125 +++-- arch/aarch64/exceptions.cc | 30 +++-- 2 files changed, 103 insertions(+), 52 deletions(-) diff --git a/arch/aarch64/entry.S b/arch/aarch64/entry.S index 0c2a6e82..03266f9c 100644 --- a/arch/aarch64/entry.S +++ b/arch/aarch64/entry.S @@ -22,10 +22,10 @@ Lower Exception level, from AArch32 0x600 0x680 0x700 0x780 */ -.macro vector_entry label idx +.macro vector_entry level, type /* every entry is at 2^7 bits distance */ .align 7 -b \label +b entry_\level\()_\type .endm .global exception_vectors @@ -34,28 +34,28 @@ .align 12 exception_vectors: /* Current Exception level with SP_EL0 : unused */ -vector_entry entry_invalid 0 // Synchronous -vector_entry entry_invalid 1 // IRQ or vIRQ -vector_entry entry_invalid 2 // FIQ or vFIQ -vector_entry entry_invalid 3 // SError or vSError +vector_entry curr_el_sp0 sync // Synchronous +vector_entry curr_el_sp0 irq// IRQ or vIRQ +vector_entry curr_el_sp0 fiq// FIQ or vFIQ +vector_entry curr_el_sp0 serror // SError or vSError /* Current Exception level with SP_ELx : only actually used */ -vector_entry entry_sync 4 -vector_entry entry_irq 5 -vector_entry entry_fiq 6 -vector_entry entry_serror 7 +vector_entry curr_el_spx sync +vector_entry curr_el_spx irq +vector_entry curr_el_spx fiq +vector_entry curr_el_spx serror /* Lower Exception level in AArch64 : unused since we don't go to EL0 */ -vector_entry entry_invalid 8 
-vector_entry entry_invalid 9 -vector_entry entry_invalid 10 -vector_entry entry_invalid 11 +vector_entry lower_el_aarch64 sync +vector_entry lower_el_aarch64 irq +vector_entry lower_el_aarch64 fiq +vector_entry lower_el_aarch64 serror /* Lower Exception level in AArch32 : no El0, no AArch32 */ -vector_entry entry_invalid 12 -vector_entry entry_invalid 13 -vector_entry entry_invalid 14 -vector_entry entry_invalid 15 +vector_entry lower_el_aarch32 sync +vector_entry lower_el_aarch32 irq +vector_entry lower_el_aarch32 fiq +vector_entry lower_el_aarch32 serror /* keep in sync with the struct in exceptions.hh */ .macro push_state_to_exception_frame @@ -131,24 +131,61 @@ thread_main: .equ ESR_FLT_BEG,2 // we strip LL .equ ESR_FLT_END,5 -.global entry_invalid -.hidden entry_invalid -.type entry_invalid, @function -entry_invalid: -mrs x20, elr_el1 // Exception Link Register -> X20 -mrs x21, spsr_el1 // Saved PSTATE -> X21 -mrs x22, esr_el1 // Exception Syndrome Register -> X22 +.macro entry_unexpected_exception level, type, level_id, type_id +.global entry_\level\()_\type +.hidden entry_\level\()_\type +.type entry_\level\()_\type, @function +entry_\level\()_\type: +.cfi_startproc simple +.cfi_signal_frame +.cfi_def_cfa sp, 0 +.cfi_offset x30, -32 // Point to the elr register located at the -32 offset + // of the exception frame to help gdb link to the + // address when interrupt was raised +push_state_to_exception_frame +mrs x1, esr_el1 +str w1, [sp, #272] // Store Exception Syndrom Register in the frame +mov x0, sp // Save exception_frame to x0 +mov x1, \level_id +mov x2, \type_id +bl handle_unexpected_exception +pop_state_from_exception_frame +bl abort +.cfi_endproc +.endm + +.equ CURR_EL_SP0, 0x0 +.equ CURR_EL_SPX, 0x1 +.equ LOWER_EL_AARCH64, 0x2 +.equ LOWER_EL_AARCH32, 0x3 -ubfmx23, x22, #ESR_EC_BEG, #ESR_EC_END // Exception Class -> X23 -ubfmx24, x22, #ESR_ISS_BEG, #ESR_ISS_END // Instruction-Specific Syndrome -> X24 +.equ EX_TYPE_SYNC, 0x0 +.equ EX_TYPE_IRQ, 0x1 
+.equ EX_TYPE_FIQ, 0x2 +.equ EX_TYPE_SERROR, 0x3 -1: wfi -b 1b +entry_unexpected_exception curr_el_sp0, sync, #CURR_EL_SP0, #EX_TYPE_SYNC +entry_unexpec
[osv-dev] [PATCH] build: support app version script
This patch introduces another new build mechanism that allows creating custom kernel exporting only symbols required by specific application. Such kernel benefits from smaller size and better security as all unneeded code is removed. This patch addresses remaining part of the modularization/librarization functionality as explained by the issue #1110 and this part of the roadmap - https://github.com/cloudius-systems/osv/wiki/Roadmap#modularizationlibrarization. This idea was also mentioned in the P99 OSv presentation - see slide 12. In essence, this patch adds two new scripts that analyse the build manifest, detect ELF files and identify symbols required from OSv kernel and finally produce an application specific version script under build/last/app_version_script: - scripts/list_manifest_files.py - reads build/last/usr.manifest and produces a list of file paths on host filesystem - scripts/generate_app_version_script.sh - iterates over manifest files produced by list_manifest_files.py, identifies undefined symbols in the ELF files using objdump that are also exported by OSv kernel and finally generates build/last/app_version_script This patch also makes some modest changes to the main makefile to support new parameter - conf_version_script - intended to point to a custom version script. Please note that this new functionality only works when building kernel with most symbols hidden (conf_hide_symbols=1). To take advantage of this new feature one would follow these steps: 1. Build image for given application. 2. Run scripts/generate_app_version_script.sh to produce app_version_script. 3. Re-build the image with kernel exporting only symbols needed by an app like so: ./scripts/build fs=rofs conf_hide_symbols=1 image=golang-pie-example \ conf_version_script=build/last/app_version_script The version script generated for the golang ELF list only 30 symbols. My experiments show that for many apps this can reduce kernel size by close to 0.5MB. 
For example the size of the kernel tailored to the golang app above is 3196K vs 3632K for the generic one. Obviously this feature can be used together with the driver profile to further reduce kernel size. The kernel produced with the build command below is only 2688K in size: ./scripts/build fs=rofs conf_hide_symbols=1 image=golang-pie-example \ drivers_profile=virtio-mmio conf_version_script=build/last/app_version_script Please note that some applications use dlsym() to dynamically resolve symbols which would be missed by this technique. In such scenarios such symbols would have to be manually added to app_version_script. Fixes #1110 Signed-off-by: Waldemar Kozaczuk --- Makefile | 31 +++--- scripts/generate_app_version_script.sh | 84 ++ scripts/generate_version_script.sh | 3 + scripts/list_manifest_files.py | 50 +++ 4 files changed, 160 insertions(+), 8 deletions(-) create mode 100755 scripts/generate_app_version_script.sh create mode 100755 scripts/list_manifest_files.py diff --git a/Makefile b/Makefile index c1c0eb84..82885016 100644 --- a/Makefile +++ b/Makefile @@ -2036,7 +2036,7 @@ $(out)/dummy-shlib.so: $(out)/dummy-shlib.o $(call quiet, $(CXX) -nodefaultlibs -shared $(gcc-sysroot) -o $@ $^, LINK $@) stage1_targets = $(out)/arch/$(arch)/boot.o $(out)/loader.o $(out)/runtime.o $(drivers:%=$(out)/%) $(objects:%=$(out)/%) $(out)/dummy-shlib.so -stage1: $(stage1_targets) links $(out)/version_script +stage1: $(stage1_targets) links $(out)/default_version_script .PHONY: stage1 loader_options_dep = $(out)/arch/$(arch)/loader_options.ld @@ -2047,20 +2047,35 @@ $(loader_options_dep): stage1 fi ifeq ($(conf_hide_symbols),1) +version_script_file:=$(out)/version_script +#Detect which version script to be used and copy to $(out)/version_script +#so that loader.elf/kernel.elf is rebuilt accordingly if version script has changed +ifdef conf_version_script +ifeq (,$(wildcard $(conf_version_script))) +$(error Missing version script: $(conf_version_script)) +endif +ifneq ($(shell 
cmp $(out)/version_script $(conf_version_script)),) +$(shell cp $(conf_version_script) $(out)/version_script) +endif +else +ifneq ($(shell cmp $(out)/version_script $(out)/default_version_script),) +$(shell cp $(out)/default_version_script $(out)/version_script) +endif +endif linker_archives_options = --no-whole-archive $(libstdc++.a) $(libgcc.a) $(libgcc_eh.a) $(boost-libs) \ - --exclude-libs libstdc++.a --gc-sections --version-script=$(out)/version_script + --exclude-libs libstdc++.a --gc-sections else linker_archives_options = --whole-archive $(libstdc++.a) $(libgcc_eh.a) $(boost-libs) --no-whole-archive $(libgcc.a) endif -$(out)/version_script: exported_symbols/*.symbols exported_symbols/$(arch)/*.symbols - $(call quiet, scripts/generate_version_script.sh $(out)/version_script, GEN version_script) +$(out)/default_version_script: exported_symbols
[osv-dev] [PATCH V2] build: support driver profiles
V2: Comparing to the previous version this one improves the gen-drivers-config-header by using awk mechanism to evaluate environment variables. It also addresses couple of nitpicks. This patch introduces new build mechanism that allows creating custom kernel with specific list of device drivers intended to target given hypervisor. Such kernel benefits from smaller size and better security as all unneeded code is removed. This patch partially addresses the modularization/librarization functionality as explained by the issue #1110 and this part of the roadmap - https://github.com/cloudius-systems/osv/wiki/Roadmap#modularizationlibrarization. This idea was also mentioned in the P99 OSv presentation - see slide 11. In essence, we introduce new build script and makefile parameter: `drivers_profile`. This new parameter is intended to specify a drivers profile which is simply a list of device drivers to be linked into kernel with some extra functionality like PCI or ACPI these drivers depend on. Each profile is specified in a tiny make include file (*.mk) under new conf/profiles/$(arch) directory and included by the main makefile as requested by drivers_profile parameter. The main makefile has number of new ifeq conditions that add given driver object file to the linked objects list depending on the value (0 or 1) of given conf_drivers_* variable specified in the relevant profile file. Sometimes it is necessary to conditionally enable/disable given code depending on the drivers selected. The good example of it is arch-setup.cc which actually registers individual drivers and this is where we need some kind of #if-way of registering given driver. To that end, this patch adds new script gen-drivers-config-header and new rule to the makefile, which automatically generates driver-config.h header file under build/$(mode)/gen/include/osv. 
The driver-config.h is comprised of the #define CONF_drivers_* macros that specify if given driver is enabled or not (1, 0) and is included by relatively few source files like arch-setup.cc. The extra benefit of this approach is that every time we change value of drivers_profile, all relevant files are recompiled and new kernel linked. Most of the patch consists of changes to the relevant source files to include new #if CONF_drivers_* conditional logic, changes to the main makefile to conditionally link specific object files and new makefile include files under conf/profiles/. The benefits of using drivers are most profound when building kernel with most symbols hidden. Below you can see examples of some build commands along with the kernel size produced: ./scripts/build fs=rofs conf_hide_symbols=1 image=native-example #all 3632K build/release/kernel-stripped.elf ./scripts/build fs=rofs conf_hide_symbols=1 image=native-example drivers_profile=virtio-pci 3380K build/release/kernel-stripped.elf ./scripts/build fs=rofs conf_hide_symbols=1 image=native-example drivers_profile=vmware 3308K build/release/kernel-stripped.elf ./scripts/build fs=rofs conf_hide_symbols=1 image=native-example drivers_profile=virtio-mmio 3120K build/release/kernel-stripped.elf ./scripts/build fs=rofs conf_hide_symbols=1 image=native-example drivers_profile=base #most drivers out 3036K build/release/kernel-stripped.elf It is also possible to enable or disable individual drivers on top of what a given profile defines like so: ./scripts/build fs=rofs conf_hide_symbols=1 image=native-example drivers_profile=base \ conf_drivers_acpi=1 conf_drivers_virtio_fs=1 conf_drivers_virtio_net=1 conf_drivers_pvpanic=1 Partially addresses #1110 Signed-off-by: Waldemar Kozaczuk --- Makefile | 109 -- arch/aarch64/arch-dtb.cc | 7 ++ arch/aarch64/arch-setup.cc| 41 +- arch/aarch64/boot.S | 3 + arch/aarch64/cpuid.cc | 5 ++ arch/aarch64/xen.cc | 1 + arch/x64/apic.cc | 1 + arch/x64/arch-setup.cc| 74 + arch/x64/cpuid.cc | 15 
arch/x64/entry-xen.S | 5 ++ arch/x64/power.cc | 9 +++ arch/x64/smp.cc | 9 +++ arch/x64/xen.cc | 1 + conf/profiles/README.md | 71 + conf/profiles/aarch64/all.mk | 5 ++ conf/profiles/aarch64/base.mk | 26 ++ conf/profiles/aarch64/microvm.mk | 1 + conf/profiles/aarch64/virtio-mmio.mk | 4 + conf/profiles/aarch64/virtio-pci.mk | 6 ++ conf/profiles/aarch64/xen.mk | 2 + conf/profiles/x64/all.mk | 8 ++ conf/profiles/x64/base.mk | 74 + conf/profiles/x64/cloud_hypervisor.mk | 2 + conf/profiles/x64/hyperv.mk | 6 ++ conf/profiles/x64/microvm.mk | 1 + conf/profiles/x64/vbox.mk | 8 ++ conf/profiles/x64/virtio
[osv-dev] [PATCH] build: support driver profiles
This patch introduces new build mechanism that allows creating custom kernel with specific list of device drivers intended to target given hypervisor. Such kernel benefits from smaller size and better security as all unneeded code is removed. This patch partially addresses the modularization/librarization functionality as explained by the issue #1110 and this part of the roadmap - https://github.com/cloudius-systems/osv/wiki/Roadmap#modularizationlibrarization. This idea was also mentioned in the P99 OSv presentation - see slide 11. In essence, we introduce new build script and makefile parameter: `drivers_profile`. This new parameter is intended to specify a drivers profile which is simply a list of device drivers to be linked into kernel with some extra functionality like PCI or ACPI these drivers depend on. Each profile is specified in a tiny make include file (*.mk) under new conf/profiles/$(arch) directory and included by the main makefile as requested by drivers_profile parameter. The main makefile has number of new ifeq conditions that add given driver object file to the linked objects list depending on the value (0 or 1) of given conf_drivers_* variable specified in the relevant profile file. Sometimes it is necessary to conditionally enable/disable given code depending on the drivers selected. The good example of it is arch-setup.cc which actually registers individual drivers and this is where we need some kind of #if-way of registering given driver. To that end, this patch adds new script gen-drivers-config-header and new rule to the makefile, which automatically generates driver-config.h header file under build/$(mode)/gen/include/osv. The driver-config.h is comprised of the #define CONF_drivers_* macros that specify if given driver is enabled or not (1, 0) and is included by relatively few source file like arch-setup.cc. 
The extra benefit of this approach is that every time we change value of drivers_profile, all relevant files are recompiled and new kernel linked. Most of the patch consists of changes to the relevant source files to include new #if CONF_drivers_* conditional logic, changes to the main makefile to conditionally link specific object files and new makefile include files under conf/profiles/. The benefits of using drivers are most profound when building kernel with most symbols hidden. Below you can see examples of some build commands along with the kernel size produced: ./scripts/build fs=rofs conf_hide_symbols=1 image=native-example #all 3632K build/release/kernel-stripped.elf ./scripts/build fs=rofs conf_hide_symbols=1 image=native-example drivers_profile=virtio-pci 3380K build/release/kernel-stripped.elf ./scripts/build fs=rofs conf_hide_symbols=1 image=native-example drivers_profile=vmware 3308K build/release/kernel-stripped.elf ./scripts/build fs=rofs conf_hide_symbols=1 image=native-example drivers_profile=virtio-mmio 3120K build/release/kernel-stripped.elf ./scripts/build fs=rofs conf_hide_symbols=1 image=native-example drivers_profile=base #most drivers out 3036K build/release/kernel-stripped.elf Partially addresses #1110 Signed-off-by: Waldemar Kozaczuk --- Makefile | 109 -- arch/aarch64/arch-dtb.cc | 7 ++ arch/aarch64/arch-setup.cc| 41 +- arch/aarch64/boot.S | 3 + arch/aarch64/cpuid.cc | 5 ++ arch/aarch64/xen.cc | 1 + arch/x64/apic.cc | 1 + arch/x64/arch-setup.cc| 74 + arch/x64/cpuid.cc | 15 arch/x64/entry-xen.S | 5 ++ arch/x64/power.cc | 9 +++ arch/x64/smp.cc | 9 +++ arch/x64/xen.cc | 1 + conf/profiles/README.md | 71 + conf/profiles/aarch64/all.mk | 5 ++ conf/profiles/aarch64/base.mk | 26 ++ conf/profiles/aarch64/microvm.mk | 1 + conf/profiles/aarch64/virtio-mmio.mk | 4 + conf/profiles/aarch64/virtio-pci.mk | 6 ++ conf/profiles/aarch64/xen.mk | 2 + conf/profiles/x64/all.mk | 8 ++ conf/profiles/x64/base.mk | 74 + conf/profiles/x64/cloud_hypervisor.mk | 2 + 
conf/profiles/x64/hyperv.mk | 6 ++ conf/profiles/x64/microvm.mk | 1 + conf/profiles/x64/vbox.mk | 8 ++ conf/profiles/x64/virtio-mmio.mk | 4 + conf/profiles/x64/virtio-pci.mk | 10 +++ conf/profiles/x64/vmware.mk | 10 +++ conf/profiles/x64/xen.mk | 6 ++ core/xen_intr.cc | 1 + drivers/acpi.cc | 9 ++- drivers/hpet.cc | 9 +++ drivers/pci-generic.cc| 5 ++ drivers/virtio-blk.cc | 5 ++ drivers/virtio-fs.cc | 5 ++ drivers/virtio-net.cc
[osv-dev] [PATCH] PVH boot: move the code to a separate file
This patch moves the PVH boot logic handling code out of xen.cc into a new file pvh-boot.cc. This is necessary to support building kernels with a specific set of device drivers. Signed-off-by: Waldemar Kozaczuk --- Makefile | 1 + arch/x64/pvh-boot.cc | 40 arch/x64/xen.cc | 38 -- 3 files changed, 41 insertions(+), 38 deletions(-) create mode 100644 arch/x64/pvh-boot.cc diff --git a/Makefile b/Makefile index 2f249ee4..59cc6de4 100644 --- a/Makefile +++ b/Makefile @@ -916,6 +916,7 @@ objects += arch/x64/apic-clock.o objects += arch/x64/entry-xen.o objects += arch/x64/vmlinux.o objects += arch/x64/vmlinux-boot64.o +objects += arch/x64/pvh-boot.o objects += $(acpi) endif # x64 diff --git a/arch/x64/pvh-boot.cc b/arch/x64/pvh-boot.cc new file mode 100644 index ..406d39f8 --- /dev/null +++ b/arch/x64/pvh-boot.cc @@ -0,0 +1,40 @@ +#include "arch-setup.hh" +#include + +struct hvm_start_info* hvm_xen_start_info __attribute__((section(".data"))); + +#define OSV_MULTI_BOOT_INFO_ADDR 0x1000 +#define OSV_E820_TABLE_ADDR 0x2000 + +extern "C" +void hvm_xen_extract_boot_params() +{ +// Set location of multiboot info struct at arbitrary place in lower memory +// to copy to (happens to be the same as in boot16.S) +osv_multiboot_info_type* mb_info = reinterpret_cast(OSV_MULTI_BOOT_INFO_ADDR); + +// Copy command line pointer from boot params +mb_info->mb.cmdline = hvm_xen_start_info->cmdline_paddr; + +// Copy e820 information from boot params +mb_info->mb.mmap_length = 0; +mb_info->mb.mmap_addr = OSV_E820_TABLE_ADDR; + +struct hvm_memmap_table_entry *source_e820_table = reinterpret_cast(hvm_xen_start_info->memmap_paddr); +struct e820ent *dest_e820_table = reinterpret_cast(mb_info->mb.mmap_addr); + +for (uint32_t e820_index = 0; e820_index < hvm_xen_start_info->memmap_entries; e820_index++) { +dest_e820_table[e820_index].ent_size = 20; +dest_e820_table[e820_index].type = source_e820_table[e820_index].type; +dest_e820_table[e820_index].addr = source_e820_table[e820_index].addr; 
+dest_e820_table[e820_index].size = source_e820_table[e820_index].size; +mb_info->mb.mmap_length += sizeof(e820ent); +} + +// Save ACPI RDSP address in the field of the osv_multiboot_info_type structure +// Ideally, we would wanted to save it under the acpi::pvh_rsdp_paddr but it is +// to early in the boot process as it would have been overwritten later in premain(). +mb_info->pvh_rsdp = hvm_xen_start_info->rsdp_paddr; + +reset_bootchart(mb_info); +} diff --git a/arch/x64/xen.cc b/arch/x64/xen.cc index 4e905653..d642c4fa 100644 --- a/arch/x64/xen.cc +++ b/arch/x64/xen.cc @@ -12,20 +12,17 @@ #include "processor.hh" #include "cpuid.hh" #include "exceptions.hh" -#include "arch-setup.hh" #include #include #include #include #include -#include shared_info_t *HYPERVISOR_shared_info; uint8_t xen_features[XENFEAT_NR_SUBMAPS * 32]; // make sure xen_start_info is not in .bss, or it will be overwritten // by init code, as xen_init() is called before .bss initialization struct start_info* xen_start_info __attribute__((section(".data"))); -struct hvm_start_info* hvm_xen_start_info __attribute__((section(".data"))); namespace xen { @@ -225,39 +222,4 @@ void xen_init(struct start_info* si) xen_start_info = si; } -#define OSV_MULTI_BOOT_INFO_ADDR 0x1000 -#define OSV_E820_TABLE_ADDR 0x2000 - -extern "C" -void hvm_xen_extract_boot_params() -{ -// Set location of multiboot info struct at arbitrary place in lower memory -// to copy to (happens to be the same as in boot16.S) -osv_multiboot_info_type* mb_info = reinterpret_cast(OSV_MULTI_BOOT_INFO_ADDR); - -// Copy command line pointer from boot params -mb_info->mb.cmdline = hvm_xen_start_info->cmdline_paddr; - -// Copy e820 information from boot params -mb_info->mb.mmap_length = 0; -mb_info->mb.mmap_addr = OSV_E820_TABLE_ADDR; - -struct hvm_memmap_table_entry *source_e820_table = reinterpret_cast(hvm_xen_start_info->memmap_paddr); -struct e820ent *dest_e820_table = reinterpret_cast(mb_info->mb.mmap_addr); - -for (uint32_t e820_index = 
0; e820_index < hvm_xen_start_info->memmap_entries; e820_index++) { -dest_e820_table[e820_index].ent_size = 20; -dest_e820_table[e820_index].type = source_e820_table[e820_index].type; -dest_e820_table[e820_index].addr = source_e820_table[e820_index].addr; -dest_e820_table[e820_index].size = source_e820_table[e820_index].size; -mb_info->mb.mmap_length += sizeof(e820ent); -} - -// Save ACPI RDSP address in the field of the osv_multiboot_info_type struct
[osv-dev] [PATCH] cpuid: make internal functions static
Signed-off-by: Waldemar Kozaczuk --- arch/x64/cpuid.cc | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x64/cpuid.cc b/arch/x64/cpuid.cc index 45c0fea4..29bc8d6e 100644 --- a/arch/x64/cpuid.cc +++ b/arch/x64/cpuid.cc @@ -68,7 +68,7 @@ cpuid_bit cpuid_bits[] = { constexpr unsigned nr_cpuid_bits = sizeof(cpuid_bits) / sizeof(*cpuid_bits); -void process_cpuid_bit(features_type& features, const cpuid_bit& b) +static void process_cpuid_bit(features_type& features, const cpuid_bit& b) { bool subleaf = b.leaf == 7; auto base = b.leaf & 0xf000; @@ -96,7 +96,7 @@ void process_cpuid_bit(features_type& features, const cpuid_bit& b) features.*(b.flag) = (w >> b.bit) & 1; } -void process_xen_bits(features_type &features) +static void process_xen_bits(features_type &features) { signature sig = { 0x566e6558, 0x65584d4d, 0x4d4d566e }; @@ -110,13 +110,13 @@ void process_xen_bits(features_type &features) } } -void process_hyperv_bits(features_type &features) { +static void process_hyperv_bits(features_type &features) { if(hyperv_identify() && hyperv_is_timecount_available()) { features.hyperv_clocksource = true; } } -void process_cpuid(features_type& features) +static void process_cpuid(features_type& features) { for (unsigned i = 0; i < nr_cpuid_bits; ++i) { process_cpuid_bit(features, cpuid_bits[i]); -- 2.31.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220209021834.514667-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH 2/2] add module API symbols to exported list
Signed-off-by: Waldemar Kozaczuk --- Makefile| 2 +- exported_symbols/osv_module_api.symbols | 10 ++ 2 files changed, 11 insertions(+), 1 deletion(-) create mode 100644 exported_symbols/osv_module_api.symbols diff --git a/Makefile b/Makefile index a0878787..2f249ee4 100644 --- a/Makefile +++ b/Makefile @@ -1970,7 +1970,7 @@ else linker_archives_options = --whole-archive $(libstdc++.a) $(libgcc_eh.a) $(boost-libs) --no-whole-archive $(libgcc.a) endif -$(out)/version_script: exported_symbols/$(arch)/*.symbols +$(out)/version_script: exported_symbols/*.symbols exported_symbols/$(arch)/*.symbols $(call quiet, scripts/generate_version_script.sh $(out)/version_script, GEN version_script) $(out)/loader.elf: $(stage1_targets) arch/$(arch)/loader.ld $(out)/bootfs.o $(loader_options_dep) diff --git a/exported_symbols/osv_module_api.symbols b/exported_symbols/osv_module_api.symbols new file mode 100644 index ..2ba5050c --- /dev/null +++ b/exported_symbols/osv_module_api.symbols @@ -0,0 +1,10 @@ +osv_cmdline +osv_current_app_on_termination_request +osv_debug_buffer +osv_debug_enabled +osv_firmware_vendor +osv_get_all_app_threads +osv_get_all_threads +osv_hypervisor_name +osv_processor_features +osv_version -- 2.31.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220117055538.139407-2-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] httpserver-monitoring-api: stop using kernel internal C++ API
When OSv kernel is built to hide most symbols but glibc ones, the OSv applications like httpserver monitoring API can not function correctly as they rely on a number of internal C++ APIs. This patch modifies httpserver monitoring API to stop using kernel internal C++ API. It does so by replacing some of the calls to internal C++ symbols with new module C-style API symbols: for example, sched::with_all_threads() with new osv_get_all_threads(). In other scenarios, we fall back to standard glibc API: for example osv::current_mounts() is replaced with getmntent_r() and related functions. Finally, we link httpserver monitoring app with core/options.cc and thus remove need to have those symbols exposed by the kernel. Signed-off-by: Waldemar Kozaczuk --- modules/httpserver-api/api/fs.cc | 42 ++ modules/httpserver-api/api/hardware.cc | 23 +--- modules/httpserver-api/api/network.cc | 19 -- modules/httpserver-api/api/os.cc | 67 +- modules/httpserver-api/global_server.cc| 34 +++ modules/httpserver-api/global_server.hh| 5 ++ modules/httpserver-api/openssl-init.cc | 15 +++-- modules/httpserver-api/ssl_server.cc | 9 ++- modules/httpserver-monitoring-api/Makefile | 6 +- 9 files changed, 150 insertions(+), 70 deletions(-) diff --git a/modules/httpserver-api/api/fs.cc b/modules/httpserver-api/api/fs.cc index 94eec77b..52e58c39 100644 --- a/modules/httpserver-api/api/fs.cc +++ b/modules/httpserver-api/api/fs.cc @@ -6,12 +6,12 @@ */ #include "fs.hh" -#include "osv/mount.h" #include "json/formatter.hh" #include "autogen/fs.json.hh" #include #include #include +#include namespace httpserver { @@ -23,9 +23,9 @@ using namespace std; using namespace json; using namespace fs_json; -static void fill_dfstat(DFStat& dfstat, const osv::mount_desc& mount, const struct statvfs& st) { -dfstat.filesystem = mount.special; -dfstat.mount = mount.path; +static void fill_dfstat(DFStat& dfstat, mntent* mount, const struct statvfs& st) { +dfstat.filesystem = mount->mnt_fsname; +dfstat.mount = mount->mnt_dir; 
dfstat.btotal = st.f_blocks; dfstat.bfree = st.f_bfree; dfstat.ftotal = st.f_files; @@ -46,21 +46,31 @@ void init(routes& routes) { getDFStats.set_handler("json", [](const_req req) { -using namespace osv; const std::string onemount = req.param.at("mount"); struct statvfs st; httpserver::json::DFStat dfstat; vector dfstats; -for (mount_desc mount : osv::current_mounts()) { -if ((mount.type == "zfs" || mount.type == "rofs") && (onemount == "" || onemount == mount.path)) { -if (statvfs(mount.path.c_str(),&st) != 0) { +FILE *mounts_fp = setmntent("/proc/mounts", "r"); +if (!mounts_fp) { +throw server_error_exception("failed to get mounts information"); +} + +struct mntent* mount; +mntent mnt; +char strings[4096]; +while ((mount = getmntent_r(mounts_fp, &mnt, strings, sizeof(strings { +std::string fstype(mount->mnt_type); +if ((fstype == "zfs" || fstype == "rofs") && (onemount == "" || onemount == mount->mnt_dir)) { +if (statvfs(mount->mnt_dir,&st) != 0) { +endmntent(mounts_fp); throw not_found_exception("mount does not exist"); } fill_dfstat(dfstat, mount, st); dfstats.push_back(dfstat); } }; +endmntent(mounts_fp); // checking if a specific file system was requested and if we found it if (onemount != "" && dfstats.size() == 0) { @@ -76,14 +86,24 @@ void init(routes& routes) { httpserver::json::DFStat dfstat; vector res; -for (osv::mount_desc mount : osv::current_mounts()) { -if (mount.type == "zfs" || mount.type == "rofs") { -if (statvfs(mount.path.c_str(),&st) == 0) { +FILE *mounts_fp = setmntent("/proc/mounts", "r"); +if (!mounts_fp) { +throw server_error_exception("failed to get mounts information"); +} + +struct mntent* mount; +mntent mnt; +char strings[4096]; +while ((mount = getmntent_r(mounts_fp, &mnt, strings, sizeof(strings { +std::string fstype(mount->mnt_type); +if (fstype == "zfs" || fstype == "rofs") { +if (statvfs(mount->mnt_dir,&st) =
[osv-dev] [PATCH 1/2] add new C-wrappers to expose module API
The commit af2d371a61f6ab1eb5a066a0c3e93230faf6611c introduced ability to build OSv kernel with most symbols but subset of glibc hidden. The regular Linux glibc apps should run fine on such kernel, but unfortunately many unit tests and various internal OSv apps (so called modules) do not as they have been coded to use many internal API symbols. One such example is httpserver monitoring api module that exposes various monitoring API REST endpoints. At some point XLAB introduced C-wrappers API made of single C-style osv_get_all_app_threads() functions. This patch enhances the C-wrappers API by adding 9 more functions intended to be used by httpserver monitoring api module. Please note that new C-style API will open up access to relevant functionality to new apps/modules implemented in languages different than C++. Signed-off-by: Waldemar Kozaczuk --- core/osv_c_wrappers.cc | 121 +++ include/osv/export.h | 3 + include/osv/osv_c_wrappers.h | 105 +- 3 files changed, 228 insertions(+), 1 deletion(-) diff --git a/core/osv_c_wrappers.cc b/core/osv_c_wrappers.cc index 137f2c6f..dbda0613 100644 --- a/core/osv_c_wrappers.cc +++ b/core/osv_c_wrappers.cc @@ -1,12 +1,27 @@ +/* + * Copyright (C) 2022 Waldemar Kozaczuk + * Copyright (C) 2016 XLAB, d.o.o. + * + * This work is open source software, licensed under the terms of the + * BSD license as described in the LICENSE file in the top-level directory. + */ #include +#include #include #include #include +#include +#include +#include +#include +#include +#include using namespace osv; using namespace sched; +extern "C" OSV_MODULE_API int osv_get_all_app_threads(pid_t tid, pid_t** tid_arr, size_t *len) { thread* app_thread = tid==0? 
thread::current(): thread::find_by_id(tid); if (app_thread == nullptr) { @@ -28,3 +43,109 @@ int osv_get_all_app_threads(pid_t tid, pid_t** tid_arr, size_t *len) { } return 0; } + +static void free_threads_names(std::vector &threads) { +for (auto &t : threads) { +if (t.name) { +free(t.name); +} +} +} + +static char* str_to_c_str(const std::string& str) { +auto len = str.size(); +char *buf = static_cast(malloc(len + 1)); // This will be free()-ed in C world +if (buf) { +std::copy(str.begin(), str.end(), buf); +buf[len] = '\0'; +return buf; +} else { +return nullptr; +} +} + +extern "C" OSV_MODULE_API +int osv_get_all_threads(osv_thread** thread_arr, size_t *len) { +using namespace std::chrono; +std::vector threads; + +osv_thread thread; +bool str_copy_error = false; +sched::with_all_threads([&](sched::thread &t) { +thread.id = t.id(); +auto tcpu = t.tcpu(); +thread.cpu_id = tcpu ? tcpu->id : -1; +thread.cpu_ms = duration_cast(t.thread_clock()).count(); +thread.switches = t.stat_switches.get(); +thread.migrations = t.stat_migrations.get(); +thread.preemptions = t.stat_preemptions.get(); +thread.name = str_to_c_str(t.name()); +if (!thread.name) { +str_copy_error = true; +} +thread.priority = t.priority(); +thread.stack_size = t.get_stack_info().size; +thread.status = static_cast(static_cast(t.get_status())); +threads.push_back(thread); +}); + +if (str_copy_error) { +goto error; +} + +*thread_arr = (osv_thread*)malloc(threads.size()*sizeof(osv_thread)); +if (*thread_arr == nullptr) { +goto error; +} + +std::copy(threads.begin(), threads.end(), *thread_arr); +*len = threads.size(); +return 0; + +error: +free_threads_names(threads); +*len = 0; +return ENOMEM; +} + +extern "C" OSV_MODULE_API +char *osv_version() { +return str_to_c_str(osv::version()); +} + +extern "C" OSV_MODULE_API +char *osv_cmdline() { +return str_to_c_str(osv::getcmdline()); +} + +extern "C" OSV_MODULE_API +char *osv_hypervisor_name() { +return str_to_c_str(osv::hypervisor_name()); +} + +extern "C" 
OSV_MODULE_API +char *osv_firmware_vendor() { +return str_to_c_str(osv::firmware_vendor()); +} + +extern "C" OSV_MODULE_API +char *osv_processor_features() { +return str_to_c_str(processor::features_str()); +} + +extern char debug_buffer[DEBUG_BUFFER_SIZE]; +extern "C" OSV_MODULE_API +const char *osv_debug_buffer() { +return debug_buffer; +} + +extern "C" OSV_MODULE_API +void osv_current_app_on_termination_request(void (*handler)()) { +osv::this_application::on_termination_request(handler); +} + +extern bool verbose; +extern "C" OSV_MODULE_API +bool osv_debug_enabled() { +return verbose; +} diff --git a/include/osv/export.h b/include/osv/export.h index c03659b8..b21ba561 100644 --- a/include/osv/exp
[osv-dev] [PATCH] lua: change build process to download artifacts from lua binaries
As issue #1166 explains, building the lua module does not work on Fedora 33 and up. In short, the lua module depends on a specific version (5.3) of the lua interpreter, library and header files, which may not be available on a given version of Fedora; the same may be the case with other Linux distributions. Issue #1166 describes at least three alternative solutions to the problem, but this patch solves it by changing the makefile to download the lua interpreter (lua executable), library and header files from a well-maintained repository - LuaBinaries at http://luabinaries.sourceforge.net/ - in a similar way to how we download luarocks. LuaBinaries has been in place since 2005, so there is a good chance we can keep relying on it in the foreseeable future. At the moment the makefile downloads the fairly recent version 5.3.6 of the lua binaries, which is compatible with the versions of the lua modules (like socket, etc) and luarocks. In the future we may upgrade all elements needed to build the module as we see fit. As a result of this patch, the lua module should in theory be buildable on any Linux distribution and version. In reality, with newer versions of gcc, one can imagine that the lua modules themselves will stop compiling, at which point we will need to upgrade those and possibly lua and luarocks as well. Also please note that the lua module no longer depends on the version of lua installed on the host, if any. 
Fixes #1166 Signed-off-by: Waldemar Kozaczuk --- modules/cli/Makefile | 15 +-- modules/lua/Makefile | 63 +--- 2 files changed, 53 insertions(+), 25 deletions(-) diff --git a/modules/cli/Makefile b/modules/cli/Makefile index 8a3b037e..ab648879 100644 --- a/modules/cli/Makefile +++ b/modules/cli/Makefile @@ -1,21 +1,18 @@ -LUA_LIB = $(shell pkg-config --libs lua53 2>/dev/null || pkg-config --libs lua || echo 'ERROR: Could not find lua, please run ./scripts/setup.py') -LUA_INCLUDES = $(shell pkg-config --cflags lua53 2>/dev/null || pkg-config --cflags lua || echo 'ERROR: Could not find lua, please run ./scripts/setup.py') +SRC = $(shell readlink -f ../..) + +LUA_DIR = $(SRC)/modules/lua/upstream/lua5.3 CC=gcc -CFLAGS=-O2 -g -Wall -std=gnu99 -LIBS=-ledit -ltinfo $(LUA_LIB) +CFLAGS=-O2 -g -Wall -std=gnu99 -I $(LUA_DIR)/include +LIBS=-ledit -ltinfo -ldl -L$(LUA_DIR) -llua53 SRCS=cli.c MAIN=cli -INCLUDES = $(LUA_INCLUDES) - -SRC = $(shell readlink -f ../..) - module: $(MAIN) $(SRC)/scripts/manifest_from_host.sh $(MAIN) > usr.manifest $(MAIN): $(SRCS) - $(CC) $(CFLAGS) $(INCLUDES) $^ -fPIC -pie -o $@ $(LIBS) + $(CC) $(CFLAGS) $^ -fPIC -pie -o $@ $(LIBS) rpm: $(MAIN) make -C rpmbuild diff --git a/modules/lua/Makefile b/modules/lua/Makefile index e48791af..a2412894 100644 --- a/modules/lua/Makefile +++ b/modules/lua/Makefile @@ -1,62 +1,93 @@ SRC = $(shell readlink -f ../..) + +# This makefile orchestrates building some key lua modules used by the OSv cli +# module. Please note that both lua binaries, header files and luarocks are +# downloaded from internet and lua artifacts if installed on the host are not used. +# This should make maintenance of lua module much less painful as regardless +# of the Linux distribution and version it will use lua 5.3 and luarocks 3.1.1 +# until we specifically upgrade them by modifying this makefile. 
+ +LUA=lua5.3 +LUA_DIR=upstream/$(LUA) LUA_ROCKS=upstream/luarocks-3.1.1-linux-x86_64/luarocks + MODULES_DIR=install/lua_modules +LUA_ROCKS_INSTALL_MODULE := $(LUA_ROCKS) --lua-dir=$(LUA_DIR) install --no-doc --tree $(MODULES_DIR) + LDIR=install/lua_modules/lib/lua/5.3 CDIR=install/lua_modules/share/lua/5.3 +# Set LUAROCKS_CONFIG to make luarocks use lua binaries downloaded in upstream/lua5.3 +export LUAROCKS_CONFIG=$(SRC)/modules/lua/upstream/config.lua + # List of Lua modules, each module has its own target LUA_MODULES=LuaSocket LuaJSON Lua_stdlib LuaFileSystem LuaPath LuaSec -LUA_LIBRARY := $(shell ldconfig -p | grep -Po "liblua*.5\.3.so.0" | head -1) -ifndef LUA_LIBRARY - LUA_LIBRARY := $(shell ldconfig -p | grep -Po "liblua*.5\.3.so" | head -1) -endif - module: $(LUA_MODULES) mkdir -p $(MODULES_DIR) - $(SRC)/scripts/manifest_from_host.sh -l $(LUA_LIBRARY) > usr.manifest - -$(LUA_ROCKS): + echo "/usr/lib/liblua53.so: $(SRC)/modules/lua/$(LUA_DIR)/liblua53.so" > usr.manifest + +# Download lua interpreter from lua binaries +$(LUA_DIR)/lua53: + mkdir -p $(LUA_DIR) + cd upstream && wget -c "https://sourceforge.net/projects/luabinaries/files/5.3.6/Tools%20Executables/lua-5.3.6_Linux54_64_bin.tar.gz"; + cd $(LUA_DIR) && tar xf ../lua-5.3.6_Linux54_64_bin.tar.gz + +# Download lua shared library and header files from lua binaries +$(LUA_DIR)/liblua53.so: + mkdir -p $(LUA_DIR) + cd upstream && wget -c "https://sourceforge.net/projects/luabinarie
[osv-dev] [PATCH] tst-tls-pie.so: use printf in the init functions
According to the GCC documentation (look at https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html) "the order in which constructors for C++ objects with static storage duration and functions decorated with attribute constructor are invoked is unspecified". This means that for example using std::cout in the function with a constructor attribute is dangerous as the std::cout (global object) may have not been initialized yet (see this bug report for better explanation - https://gcc.gnu.org/bugzilla/show_bug.cgi?id=94810). The test tst-tls.cc compiled as PIE includes assertions executed as part of the before_main() function which is annotated with a constructor attribute and gets called before the regular main. The before_main() calls report() function which uses std::cout to print assertion result. This happens to work fine when OSv exports all symbols including the stdc++ ones like std::cout which is initialized by then, but crashes miserably on Linux and OSv with most symbols and stdc++ hidden. This patch fixes the test to make it work correctly in all cases by changing report() function to use std::cout or printf() depending on where in the program lifecycle it is called. Signed-off-by: Waldemar Kozaczuk --- tests/tst-tls.cc | 12 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/tst-tls.cc b/tests/tst-tls.cc index 58154382..da09ca1e 100644 --- a/tests/tst-tls.cc +++ b/tests/tst-tls.cc @@ -50,11 +50,15 @@ __thread int v10 __attribute__ ((tls_model ("local-exec"))) = ; extern void external_library(); -static void report(bool ok, std::string msg) +static void report(bool ok, std::string msg, bool use_printf = false) { ++tests; fails += !ok; -std::cout << (ok ? "PASS" : "FAIL") << ": " << msg << "\n"; +if (use_printf) { +printf("%s: %s\n", ok ? "PASS" : "FAIL", msg.c_str()); +} else { +std::cout << (ok ? 
"PASS" : "FAIL") << ": " << msg << "\n"; +} } int main(int argc, char** argv) @@ -124,8 +128,8 @@ int main(int argc, char** argv) static void before_main(void) __attribute__((constructor)); static void before_main(void) { -report(v7 == 987UL, "v7 in init function"); -report(v9 == 0, "v8 in init function"); +report(v7 == 987UL, "v7 in init function", true); +report(v9 == 0, "v8 in init function", true); } #endif -- 2.31.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220109011732.903007-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] sysconf: support _SC_THREAD_STACK_MIN and _SC_LINE_MAX
The new Java 11 on the latest Ubuntu calls sysconf() to get the value of the _SC_THREAD_STACK_MIN parameter to inquire about the minimal stack size, and crashes when it gets -1 from OSv. This patch makes sysconf() return the same value Linux returns. Similarly, libtsm.so, one of the libraries used by cli, needs the value of _SC_LINE_MAX, and this patch makes sysconf() handle this parameter as well. Signed-off-by: Waldemar Kozaczuk --- runtime.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runtime.cc b/runtime.cc index 64a9b225..10c72cca 100644 --- a/runtime.cc +++ b/runtime.cc @@ -362,6 +362,8 @@ long sysconf(int name) switch (name) { case _SC_CLK_TCK: return CLOCKS_PER_SEC; case _SC_PAGESIZE: return mmu::page_size; +case _SC_THREAD_STACK_MIN: return 16384; +case _SC_LINE_MAX: return 2048; case _SC_THREAD_PROCESS_SHARED: return true; case _SC_NPROCESSORS_ONLN: return sched::cpus.size(); case _SC_NPROCESSORS_CONF: return sched::cpus.size(); -- 2.32.0 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220104031833.83919-3-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] musl: add tfind and tsearch
New version of libtsm.so used by cli app needs tfind() and tsearch() symbols and this patch modifies the Makefile to add their musl implementations. Signed-off-by: Waldemar Kozaczuk --- Makefile | 3 +++ exported_symbols/osv_ld-musl.so.1.symbols | 2 ++ exported_symbols/osv_libc.so.6.symbols| 2 ++ 3 files changed, 7 insertions(+) diff --git a/Makefile b/Makefile index 597854d6..a0878787 100644 --- a/Makefile +++ b/Makefile @@ -1455,6 +1455,9 @@ libc += arch/$(arch)/ucontext/ucontext.o libc += string/memmove.o endif +musl += search/tfind.o +musl += search/tsearch.o + musl += stdio/__fclose_ca.o libc += stdio/__fdopen.o $(out)/libc/stdio/__fdopen.o: CFLAGS += --include libc/syscall_to_function.h diff --git a/exported_symbols/osv_ld-musl.so.1.symbols b/exported_symbols/osv_ld-musl.so.1.symbols index 3c2c5f53..f1c61a3f 100644 --- a/exported_symbols/osv_ld-musl.so.1.symbols +++ b/exported_symbols/osv_ld-musl.so.1.symbols @@ -1104,6 +1104,7 @@ tcsetpgrp telldir tempnam textdomain +tfind tgamma tgammaf tgammal @@ -1140,6 +1141,7 @@ truncate truncate64 truncf truncl +tsearch ttyname ttyname_r __tzname diff --git a/exported_symbols/osv_libc.so.6.symbols b/exported_symbols/osv_libc.so.6.symbols index 8b6c8af4..39e79692 100644 --- a/exported_symbols/osv_libc.so.6.symbols +++ b/exported_symbols/osv_libc.so.6.symbols @@ -906,6 +906,7 @@ tcsetpgrp telldir tempnam textdomain +tfind time timegm timerfd_create @@ -936,6 +937,7 @@ __towupper_l towupper_l truncate truncate64 +tsearch ttyname ttyname_r __tzname -- 2.32.0 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220104031833.83919-4-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] libc: add __sysconf alias
Signed-off-by: Waldemar Kozaczuk --- exported_symbols/osv_libc.so.6.symbols | 1 + libc/aliases.ld| 1 + 2 files changed, 2 insertions(+) diff --git a/exported_symbols/osv_libc.so.6.symbols b/exported_symbols/osv_libc.so.6.symbols index 1854f6a8..8b6c8af4 100644 --- a/exported_symbols/osv_libc.so.6.symbols +++ b/exported_symbols/osv_libc.so.6.symbols @@ -887,6 +887,7 @@ symlink sync syscall sysconf +__sysconf sysctl sys_errlist sysinfo diff --git a/libc/aliases.ld b/libc/aliases.ld index 8c54f1f4..5f3fc744 100644 --- a/libc/aliases.ld +++ b/libc/aliases.ld @@ -75,3 +75,4 @@ _exit = exit; _Exit = exit; __dn_expand = dn_expand; +__sysconf = sysconf; -- 2.32.0 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220104031833.83919-2-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] setup.py: support Ubuntu 21.10
Signed-off-by: Waldemar Kozaczuk --- scripts/setup.py | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/scripts/setup.py b/scripts/setup.py index 43b1b0a6..8cb7ca78 100755 --- a/scripts/setup.py +++ b/scripts/setup.py @@ -272,6 +272,13 @@ class Ubuntu(object): test_packages = ['libssl-dev', 'zip'] ec2_post_install = None +class Ubuntu_21_10(object): +packages = ['bridge-utils', 'libvirt-daemon-system', 'libvirt-clients', 'python3-dpkt'] +ec2_packages = ['ec2-api-tools', 'awscli'] +test_packages = [] +ec2_post_install = None +version = '21.10' + class Ubuntu_21_04(object): packages = ['bridge-utils', 'libvirt-daemon-system', 'libvirt-clients', 'python3-dpkt'] ec2_packages = ['ec2-api-tools', 'awscli'] @@ -335,7 +342,7 @@ class Ubuntu(object): ec2_post_install = None version = '16.04' -versions = [Ubuntu_21_04, Ubuntu_20_10, Ubuntu_20_04, Ubuntu_19_10, Ubuntu_19_04, Ubuntu_18_10, Ubuntu_18_04, Ubuntu_17_04, Ubuntu_16_04] +versions = [Ubuntu_21_10, Ubuntu_21_04, Ubuntu_20_10, Ubuntu_20_04, Ubuntu_19_10, Ubuntu_19_04, Ubuntu_18_10, Ubuntu_18_04, Ubuntu_17_04, Ubuntu_16_04] class LinuxMint(Ubuntu): name = 'LinuxMint' -- 2.32.0 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20220104031833.83919-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] libc: implement posix_madvise
Some apps/runtimes like dotnet call posix_madvise which we do not implement. This patch adds a simple implementation of it based on madvise with a difference that only POSIX_MADV_DONTNEED is supported. On top of this as required posix_madvise() returns an error without setting errno. Signed-off-by: Waldemar Kozaczuk --- .../aarch64/osv_ld-musl-aarch64.so.1.symbols | 1 + exported_symbols/aarch64/osv_libc.so.6.symbols | 1 + .../x64/osv_ld-musl-x86_64.so.1.symbols| 1 + exported_symbols/x64/osv_libc.so.6.symbols | 1 + libc/mman.cc | 14 ++ 5 files changed, 18 insertions(+) diff --git a/exported_symbols/aarch64/osv_ld-musl-aarch64.so.1.symbols b/exported_symbols/aarch64/osv_ld-musl-aarch64.so.1.symbols index c463131a..4c284fba 100644 --- a/exported_symbols/aarch64/osv_ld-musl-aarch64.so.1.symbols +++ b/exported_symbols/aarch64/osv_ld-musl-aarch64.so.1.symbols @@ -739,6 +739,7 @@ posix_fadvise posix_fadvise64 posix_fallocate posix_fallocate64 +posix_madvise posix_memalign pow pow10 diff --git a/exported_symbols/aarch64/osv_libc.so.6.symbols b/exported_symbols/aarch64/osv_libc.so.6.symbols index 000191b7..ea0bfcdc 100644 --- a/exported_symbols/aarch64/osv_libc.so.6.symbols +++ b/exported_symbols/aarch64/osv_libc.so.6.symbols @@ -667,6 +667,7 @@ posix_fadvise posix_fadvise64 posix_fallocate posix_fallocate64 +posix_madvise posix_memalign ppoll prctl diff --git a/exported_symbols/x64/osv_ld-musl-x86_64.so.1.symbols b/exported_symbols/x64/osv_ld-musl-x86_64.so.1.symbols index b3f87859..d88e98ed 100644 --- a/exported_symbols/x64/osv_ld-musl-x86_64.so.1.symbols +++ b/exported_symbols/x64/osv_ld-musl-x86_64.so.1.symbols @@ -721,6 +721,7 @@ posix_fadvise posix_fadvise64 posix_fallocate posix_fallocate64 +posix_madvise __posix_getopt posix_memalign pow diff --git a/exported_symbols/x64/osv_libc.so.6.symbols b/exported_symbols/x64/osv_libc.so.6.symbols index 6635cabb..07b5368b 100644 --- a/exported_symbols/x64/osv_libc.so.6.symbols +++ b/exported_symbols/x64/osv_libc.so.6.symbols 
@@ -596,6 +596,7 @@ posix_fadvise posix_fadvise64 posix_fallocate posix_fallocate64 +posix_madvise __posix_getopt posix_memalign ppoll diff --git a/libc/mman.cc b/libc/mman.cc index 9dd6429a..75a94eb0 100644 --- a/libc/mman.cc +++ b/libc/mman.cc @@ -257,3 +257,17 @@ void *sbrk(intptr_t increment) errno = ENOMEM; return (void *)-1; } + +static unsigned posix_madvise_to_advise(int advice) +{ +if (advice == POSIX_MADV_DONTNEED) { +return mmu::advise_dontneed; +} +return 0; +} + +OSV_LIBC_API +int posix_madvise(void *addr, size_t len, int advice) { +auto err = mmu::advise(addr, len, posix_madvise_to_advise(advice)); +return err.get(); +} -- 2.31.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20211221182614.242226-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH] virtio: fix the bug of reading queue msix vector
This patch fixes a silly bug caused by wrong placement of closing parentheses which led to reading word value at offset 0x1 instead of intended COMMON_CFG_OFFSET_OF(queue_msix_vector). This bug did not cause real damage but made some hypervisors like Intel's cloud hypervisor show this kind of warning: cloud-hypervisor: 55.151472ms: WARN:virtio-devices/src/transport/pci_common_config.rs:169 -- invalid virtio register word read: 0x1 Signed-off-by: Waldemar Kozaczuk --- drivers/virtio-pci-device.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio-pci-device.cc b/drivers/virtio-pci-device.cc index 297758fc..d5b1da9c 100644 --- a/drivers/virtio-pci-device.cc +++ b/drivers/virtio-pci-device.cc @@ -185,7 +185,7 @@ void virtio_modern_pci_device::setup_queue(vring *queue) if (_dev->is_msix()) { // Setup queue_id:entry_id 1:1 correlation... _common_cfg->virtio_conf_writew(COMMON_CFG_OFFSET_OF(queue_msix_vector), queue_index); -if (_common_cfg->virtio_conf_readw(COMMON_CFG_OFFSET_OF(queue_msix_vector) != queue_index)) { +if (_common_cfg->virtio_conf_readw(COMMON_CFG_OFFSET_OF(queue_msix_vector)) != queue_index) { virtio_e("Setting MSIx entry for queue %d failed.", queue_index); return; } -- 2.31.1 -- You received this message because you are subscribed to the Google Groups "OSv Development" group. To unsubscribe from this group and stop receiving emails from it, send an email to osv-dev+unsubscr...@googlegroups.com. To view this discussion on the web visit https://groups.google.com/d/msgid/osv-dev/20211217033600.1407013-1-jwkozaczuk%40gmail.com.
[osv-dev] [PATCH 2/2] zfs: extract zfs code as optional libsolaris.so
Originally I thought that extracting ZFS out of the kernel as a shared library would not be as easy as it has turned out to be - that is, after figuring out a couple of important gotchas, which I describe below and in the code comments. The advantages of moving ZFS to a separate library are the following: - the kernel becomes ~900K smaller - at least 10 fewer threads are needed to run a non-ZFS image (running a ROFS image on 1 cpu requires only 25 threads) I also hope this patch provides a blueprint of how we could implement another ext2/3/4 filesystem driver (see #1179) or other true kernel modules. The essence of this patch is a set of changes to the main makefile to build the new libsolaris.so, and to various ZFS-related parts of the kernel like pagecache, arc_shrinker and the ZFS dev driver so that they call into libsolaris.so through a handful of dynamically registered callbacks. The new libsolaris.so is mainly composed of the solaris and zfs sets as defined in the makefile (and no longer part of the kernel) plus the bsd RPC code (xdr*), kobj and finally the new fs/zfs/zfs_initialize.c which provides the main INIT function - zfs_initialize(). The zfs_initialize() function initializes various ZFS resources like threads and memory and registers various callback functions into the main kernel (see comments in zfs_initialize.c). Two important gotchas I have discovered are: 1) The libsolaris.so needs to be built with BIND_NOW so that all symbols are resolved eagerly; otherwise, page faults taken later to resolve those symbols, while the ZFS code in libsolaris.so is itself being called to resolve other faults, would cause deadlocks. 2) The libsolaris.so needs the osv-mlock note so that the dynamic linker populates the mappings. As above, this avoids later page faults that would lead to deadlocks. Please note that libsolaris.so is built with most symbols hidden and with code garbage collection on, to help minimize its size (804K) and expose the minimum number of symbols (< 100) needed by libzfs.so. 
The latter also helps avoid possible symbol collision with other apps. We also make changes to loader.cc to dlopen("/libsolaris.so") before we mount ZFS filesystem (for that reason libsolaris.so needs to be part of the bootfs for ZFS images). Because ZFS is root filesystem, we cannot use the same approach we used for nfs which is also implemented as a shared library but loaded in pivot_rootfs() which happens much later. In theory we could build mixed disk with two partitions - 1st ROFS one with libsolaris.so on it and the 2nd ZFS one which would be mounted after we mount ROFS and load and initialize libsolaris.so from it. I have tested this patch by running unit tests (all pass) and also using tests/misc-zfs-io.cc as well as running stress test of MySQL on ZFS image. Fixes #1009 Signed-off-by: Waldemar Kozaczuk --- Makefile | 51 bootfs.manifest.skel | 1 + bsd/init.cc | 7 --- bsd/porting/shrinker.cc | 22 +++-- core/pagecache.cc | 45 +- drivers/zfs.cc| 12 - fs/zfs/zfs_initialize.c | 97 +++ fs/zfs/zfs_null_vfsops.cc | 54 ++ libc/misc/uname.c | 2 +- loader.cc | 50 usr.manifest.skel | 1 + 11 files changed, 289 insertions(+), 53 deletions(-) create mode 100644 fs/zfs/zfs_initialize.c create mode 100644 fs/zfs/zfs_null_vfsops.cc diff --git a/Makefile b/Makefile index 7acf130c..d88efdb9 100644 --- a/Makefile +++ b/Makefile @@ -568,7 +568,6 @@ bsd += bsd/porting/kthread.o bsd += bsd/porting/mmu.o bsd += bsd/porting/pcpu.o bsd += bsd/porting/bus_dma.o -bsd += bsd/porting/kobj.o bsd += bsd/sys/netinet/if_ether.o bsd += bsd/sys/compat/linux/linux_socket.o bsd += bsd/sys/compat/linux/linux_ioctl.o @@ -618,9 +617,6 @@ bsd += bsd/sys/netinet/cc/cc_cubic.o bsd += bsd/sys/netinet/cc/cc_htcp.o bsd += bsd/sys/netinet/cc/cc_newreno.o bsd += bsd/sys/netinet/arpcache.o -bsd += bsd/sys/xdr/xdr.o -bsd += bsd/sys/xdr/xdr_array.o -bsd += bsd/sys/xdr/xdr_mem.o bsd += bsd/sys/xen/evtchn.o ifeq ($(arch),x64) @@ -644,6 +640,11 @@ bsd += bsd/sys/dev/random/live_entropy_sources.o 
$(out)/bsd/sys/%.o: COMMON += -Wno-sign-compare -Wno-narrowing -Wno-write-strings -Wno-parentheses -Wno-unused-but-set-variable +xdr := +xdr += bsd/sys/xdr/xdr.o +xdr += bsd/sys/xdr/xdr_array.o +xdr += bsd/sys/xdr/xdr_mem.o + solaris := solaris += bsd/sys/cddl/compat/opensolaris/kern/opensolaris.o solaris += bsd/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.o @@ -799,7 +800,7 @@ libtsm += drivers/libtsm/tsm_screen.o libtsm += drivers/libtsm/tsm_vte.o libtsm += drivers/libtsm/tsm_vte_charsets.o -drivers := $(bsd) $(solaris) +drivers := $(bsd) drivers += core/mmu.o drivers += arch/$(arch)/early-console.o drivers += drivers/console.o @@ -1849,6 +1850,7 @@ fs_objs += virtiofs/virtiofs_vfsops.o \ fs_objs += pseudofs/pseudofs.o fs_o
[osv-dev] [PATCH 1/2] zfs: expose some symbols in solaris part of bsd code
This patch annotates ~90 symbols with explicit public visibility across various parts of ZFS or related code in the bsd/ subtree. This is in preparation for the next patch that extracts ZFS code into a separate libsolaris.so library where all symbols but the ones marked as public here are hidden. These symbols need to be exposed for the main user of libsolaris.so - libzfs.so. Signed-off-by: Waldemar Kozaczuk --- .../opensolaris/kern/opensolaris_kmem.c | 3 + .../opensolaris/kern/opensolaris_taskq.c | 12 ++-- .../cddl/contrib/opensolaris/common/avl/avl.c | 25 +++ .../opensolaris/common/nvpair/fnvpair.c | 12 ++-- .../opensolaris/common/nvpair/nvpair.c| 68 ++- .../opensolaris/common/zfs/zfeature_common.c | 8 ++- .../opensolaris/common/zfs/zfs_comutil.c | 8 ++- .../opensolaris/common/zfs/zfs_namecheck.c| 7 +- .../contrib/opensolaris/common/zfs/zfs_prop.c | 30 .../opensolaris/common/zfs/zpool_prop.c | 30 .../opensolaris/common/zfs/zprop_common.c | 4 +- .../opensolaris/uts/common/fs/zfs/spa.c | 4 +- .../opensolaris/uts/common/fs/zfs/spa_misc.c | 13 ++-- include/osv/export.h | 3 + 14 files changed, 128 insertions(+), 99 deletions(-) diff --git a/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c b/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c index f7c6b53a..aac97ce8 100644 --- a/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c +++ b/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c @@ -35,6 +35,8 @@ #include #include +#include + void * zfs_kmem_alloc(size_t size, int kmflags) { @@ -133,6 +135,7 @@ kmem_debugging(void) return (0); } +OSV_LIB_SOLARIS_API uint64_t kmem_size(void) { return physmem * PAGE_SIZE; diff --git a/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c b/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c index 9711bb4f..3fc69e84 100644 --- a/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c +++ b/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c @@ -35,12 +35,14 @@ #include #include #include +#include static 
uma_zone_t taskq_zone; +OSV_LIB_SOLARIS_API taskq_t *system_taskq = NULL; -void +OSV_LIB_SOLARIS_API void system_taskq_init(void *arg) { taskq_zone = uma_zcreate("taskq_zone", sizeof(struct ostask), @@ -49,7 +51,7 @@ system_taskq_init(void *arg) } SYSINIT(system_taskq_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_init, NULL); -void +OSV_LIB_SOLARIS_API void system_taskq_fini(void *arg) { @@ -58,7 +60,7 @@ system_taskq_fini(void *arg) } SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_fini, NULL); -taskq_t * +OSV_LIB_SOLARIS_API taskq_t * taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __bsd_unused2, int maxalloc __bsd_unused2, uint_t flags) { @@ -83,7 +85,7 @@ taskq_create_proc(const char *name, int nthreads, pri_t pri, int minalloc, return (taskq_create(name, nthreads, pri, minalloc, maxalloc, flags)); } -void +OSV_LIB_SOLARIS_API void taskq_destroy(taskq_t *tq) { @@ -108,7 +110,7 @@ taskq_run(void *arg, int pending __bsd_unused2) uma_zfree(taskq_zone, task); } -taskqid_t +OSV_LIB_SOLARIS_API taskqid_t taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) { struct ostask *task; diff --git a/bsd/sys/cddl/contrib/opensolaris/common/avl/avl.c b/bsd/sys/cddl/contrib/opensolaris/common/avl/avl.c index e5ac2f7e..6413c208 100644 --- a/bsd/sys/cddl/contrib/opensolaris/common/avl/avl.c +++ b/bsd/sys/cddl/contrib/opensolaris/common/avl/avl.c @@ -93,6 +93,7 @@ #include #include #include +#include /* * Small arrays to translate between balance (or diff) values and child indeces. @@ -121,7 +122,7 @@ static const int avl_balance2child[] = {0, 0, 1}; * NULL - if at the end of the nodes * otherwise next node */ -void * +OSV_LIB_SOLARIS_API void * avl_walk(avl_tree_t *tree, void*oldnode, int left) { size_t off = tree->avl_offset; @@ -168,7 +169,7 @@ avl_walk(avl_tree_t *tree, void *oldnode, int left) * Return the lowest valued node in a tree or NULL. 
* (leftmost child from root of tree) */ -void * +OSV_LIB_SOLARIS_API void * avl_first(avl_tree_t *tree) { avl_node_t *node; @@ -187,7 +188,7 @@ avl_first(avl_tree_t *tree) * Return the highest valued node in a tree or NULL. * (rightmost child from root of tree) */ -void * +OSV_LIB_SOLARIS_API void * avl_last(avl_tree_t *tree) { avl_node_t *node; @@ -211,7 +212,7 @@ avl_last(avl_tree_t *tree) * NULL: no node in the given direction * "void *" of the found tree node */ -void * +OSV_LIB_SOLARIS_API void * avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) { int child = AVL_INDEX2CHILD(where); @@ -240,7 +241,7 @@ avl_nearest(avl_tree_t *tree, avl_index_