[osv-dev] [PATCH 2/2] zfs: extract zfs code as optional libsolaris.so
Originally I thought that extracting ZFS out of the kernel as a shared library would not be as easy as it has turned out to be. Obviously, that was only after figuring out a couple of important gotchas, which I describe below and in the code comments. The advantages of moving ZFS to a separate library are the following: - kernel becomes ~900K smaller - there are at least 10 fewer threads needed to run a non-ZFS image (running a ROFS image on 1 cpu requires only 25 threads) I also hope this patch provides a blueprint of how we could implement another ext2/3/4 filesystem driver (see #1179) or other true kernel modules. The essence of this patch is the set of changes to the main makefile to build the new libsolaris.so, and to various ZFS-related parts of the kernel like pagecache, arc_shrinker and the ZFS dev driver to make them call into libsolaris.so upon dynamically registering a handful of callbacks. The new libsolaris.so is mainly composed of the solaris and zfs sets as defined in the makefile (and not part of the kernel anymore) plus the bsd RPC code (xdr*), kobj and finally the new fs/zfs/zfs_initialize.c, which provides the main INIT function - zfs_initialize(). The zfs_initialize() function initializes various ZFS resources like threads and memory and registers various callback functions into the main kernel (see comments in zfs_initialize.c). Two important gotchas I have discovered are: 1) The libsolaris.so needs to be built with BIND_NOW so that all symbols are resolved eagerly. This avoids page faults taken later to resolve those symbols while the ZFS code in libsolaris.so is being called to resolve other faults, which would cause deadlocks. 2) The libsolaris.so needs the osv-mlock note so that the dynamic linker populates the mappings. Similarly to the above, this avoids later page faults that would lead to deadlocks. Please note the libsolaris.so is built with most symbols hidden and code garbage collection on to help minimize its size (804K) and expose the minimum number of symbols (< 100) needed by libzfs.so.
The latter also helps avoid possible symbol collision with other apps. We also make changes to loader.cc to dlopen("/libsolaris.so") before we mount ZFS filesystem (for that reason libsolaris.so needs to be part of the bootfs for ZFS images). Because ZFS is root filesystem, we cannot use the same approach we used for nfs which is also implemented as a shared library but loaded in pivot_rootfs() which happens much later. In theory we could build mixed disk with two partitions - 1st ROFS one with libsolaris.so on it and the 2nd ZFS one which would be mounted after we mount ROFS and load and initialize libsolaris.so from it. I have tested this patch by running unit tests (all pass) and also using tests/misc-zfs-io.cc as well as running stress test of MySQL on ZFS image. Fixes #1009 Signed-off-by: Waldemar Kozaczuk --- Makefile | 51 bootfs.manifest.skel | 1 + bsd/init.cc | 7 --- bsd/porting/shrinker.cc | 22 +++-- core/pagecache.cc | 45 +- drivers/zfs.cc| 12 - fs/zfs/zfs_initialize.c | 97 +++ fs/zfs/zfs_null_vfsops.cc | 54 ++ libc/misc/uname.c | 2 +- loader.cc | 50 usr.manifest.skel | 1 + 11 files changed, 289 insertions(+), 53 deletions(-) create mode 100644 fs/zfs/zfs_initialize.c create mode 100644 fs/zfs/zfs_null_vfsops.cc diff --git a/Makefile b/Makefile index 7acf130c..d88efdb9 100644 --- a/Makefile +++ b/Makefile @@ -568,7 +568,6 @@ bsd += bsd/porting/kthread.o bsd += bsd/porting/mmu.o bsd += bsd/porting/pcpu.o bsd += bsd/porting/bus_dma.o -bsd += bsd/porting/kobj.o bsd += bsd/sys/netinet/if_ether.o bsd += bsd/sys/compat/linux/linux_socket.o bsd += bsd/sys/compat/linux/linux_ioctl.o @@ -618,9 +617,6 @@ bsd += bsd/sys/netinet/cc/cc_cubic.o bsd += bsd/sys/netinet/cc/cc_htcp.o bsd += bsd/sys/netinet/cc/cc_newreno.o bsd += bsd/sys/netinet/arpcache.o -bsd += bsd/sys/xdr/xdr.o -bsd += bsd/sys/xdr/xdr_array.o -bsd += bsd/sys/xdr/xdr_mem.o bsd += bsd/sys/xen/evtchn.o ifeq ($(arch),x64) @@ -644,6 +640,11 @@ bsd += bsd/sys/dev/random/live_entropy_sources.o 
$(out)/bsd/sys/%.o: COMMON += -Wno-sign-compare -Wno-narrowing -Wno-write-strings -Wno-parentheses -Wno-unused-but-set-variable +xdr := +xdr += bsd/sys/xdr/xdr.o +xdr += bsd/sys/xdr/xdr_array.o +xdr += bsd/sys/xdr/xdr_mem.o + solaris := solaris += bsd/sys/cddl/compat/opensolaris/kern/opensolaris.o solaris += bsd/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.o @@ -799,7 +800,7 @@ libtsm += drivers/libtsm/tsm_screen.o libtsm += drivers/libtsm/tsm_vte.o libtsm += drivers/libtsm/tsm_vte_charsets.o -drivers := $(bsd) $(solaris) +drivers := $(bsd) drivers += core/mmu.o drivers += arch/$(arch)/early-console.o drivers += drivers/console.o @@ -1849,6 +1850,7 @@ fs_objs += virtiofs/virtiofs_vfsops.o \ fs_objs += pseudofs/pseudofs.o fs_objs +=
[osv-dev] [PATCH 1/2] zfs: expose some symbols in solaris part of bsd code
This patch annotates ~90 symbols with explicit public visibility across various parts of ZFS or related code in the bsd/ subtree. This is in preparation for the next patch, which extracts ZFS code into a separate libsolaris.so library where all symbols but the ones marked as public here are hidden. These symbols need to be exposed for the main user of libsolaris.so - libzfs.so. Signed-off-by: Waldemar Kozaczuk --- .../opensolaris/kern/opensolaris_kmem.c | 3 + .../opensolaris/kern/opensolaris_taskq.c | 12 ++-- .../cddl/contrib/opensolaris/common/avl/avl.c | 25 +++ .../opensolaris/common/nvpair/fnvpair.c | 12 ++-- .../opensolaris/common/nvpair/nvpair.c| 68 ++- .../opensolaris/common/zfs/zfeature_common.c | 8 ++- .../opensolaris/common/zfs/zfs_comutil.c | 8 ++- .../opensolaris/common/zfs/zfs_namecheck.c| 7 +- .../contrib/opensolaris/common/zfs/zfs_prop.c | 30 .../opensolaris/common/zfs/zpool_prop.c | 30 .../opensolaris/common/zfs/zprop_common.c | 4 +- .../opensolaris/uts/common/fs/zfs/spa.c | 4 +- .../opensolaris/uts/common/fs/zfs/spa_misc.c | 13 ++-- include/osv/export.h | 3 + 14 files changed, 128 insertions(+), 99 deletions(-) diff --git a/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c b/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c index f7c6b53a..aac97ce8 100644 --- a/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c +++ b/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c @@ -35,6 +35,8 @@ #include #include +#include + void * zfs_kmem_alloc(size_t size, int kmflags) { @@ -133,6 +135,7 @@ kmem_debugging(void) return (0); } +OSV_LIB_SOLARIS_API uint64_t kmem_size(void) { return physmem * PAGE_SIZE; diff --git a/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c b/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c index 9711bb4f..3fc69e84 100644 --- a/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c +++ b/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c @@ -35,12 +35,14 @@ #include #include #include +#include static
uma_zone_t taskq_zone; +OSV_LIB_SOLARIS_API taskq_t *system_taskq = NULL; -void +OSV_LIB_SOLARIS_API void system_taskq_init(void *arg) { taskq_zone = uma_zcreate("taskq_zone", sizeof(struct ostask), @@ -49,7 +51,7 @@ system_taskq_init(void *arg) } SYSINIT(system_taskq_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_init, NULL); -void +OSV_LIB_SOLARIS_API void system_taskq_fini(void *arg) { @@ -58,7 +60,7 @@ system_taskq_fini(void *arg) } SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_fini, NULL); -taskq_t * +OSV_LIB_SOLARIS_API taskq_t * taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __bsd_unused2, int maxalloc __bsd_unused2, uint_t flags) { @@ -83,7 +85,7 @@ taskq_create_proc(const char *name, int nthreads, pri_t pri, int minalloc, return (taskq_create(name, nthreads, pri, minalloc, maxalloc, flags)); } -void +OSV_LIB_SOLARIS_API void taskq_destroy(taskq_t *tq) { @@ -108,7 +110,7 @@ taskq_run(void *arg, int pending __bsd_unused2) uma_zfree(taskq_zone, task); } -taskqid_t +OSV_LIB_SOLARIS_API taskqid_t taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) { struct ostask *task; diff --git a/bsd/sys/cddl/contrib/opensolaris/common/avl/avl.c b/bsd/sys/cddl/contrib/opensolaris/common/avl/avl.c index e5ac2f7e..6413c208 100644 --- a/bsd/sys/cddl/contrib/opensolaris/common/avl/avl.c +++ b/bsd/sys/cddl/contrib/opensolaris/common/avl/avl.c @@ -93,6 +93,7 @@ #include #include #include +#include /* * Small arrays to translate between balance (or diff) values and child indeces. @@ -121,7 +122,7 @@ static const int avl_balance2child[] = {0, 0, 1}; * NULL - if at the end of the nodes * otherwise next node */ -void * +OSV_LIB_SOLARIS_API void * avl_walk(avl_tree_t *tree, void*oldnode, int left) { size_t off = tree->avl_offset; @@ -168,7 +169,7 @@ avl_walk(avl_tree_t *tree, void *oldnode, int left) * Return the lowest valued node in a tree or NULL. 
* (leftmost child from root of tree) */ -void * +OSV_LIB_SOLARIS_API void * avl_first(avl_tree_t *tree) { avl_node_t *node; @@ -187,7 +188,7 @@ avl_first(avl_tree_t *tree) * Return the highest valued node in a tree or NULL. * (rightmost child from root of tree) */ -void * +OSV_LIB_SOLARIS_API void * avl_last(avl_tree_t *tree) { avl_node_t *node; @@ -211,7 +212,7 @@ avl_last(avl_tree_t *tree) * NULL: no node in the given direction * "void *" of the found tree node */ -void * +OSV_LIB_SOLARIS_API void * avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) { int child = AVL_INDEX2CHILD(where); @@ -240,7 +241,7 @@ avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) *