[osv-dev] [PATCH 2/2] zfs: extract zfs code as optional libsolaris.so

2021-12-15 Thread Waldemar Kozaczuk
Originally I thought that extracting ZFS out of the kernel
as a shared library would not be as easy as it has turned out to
be. Obviously, that was only after figuring out a couple of important
gotchas, which I describe below and in the code comments.

The advantages of moving ZFS to a separate library are the following:
- the kernel becomes ~900K smaller
- at least 10 fewer threads are needed to run a non-ZFS image
  (running a ROFS image on 1 cpu requires only 25 threads)

I also hope this patch provides a blueprint of how we could implement
another ext2/3/4 filesystem driver (see #1179) or other true kernel modules.

The essence of this patch is the changes to the main makefile to build
the new libsolaris.so, and to various ZFS-related parts of the kernel like
pagecache, arc_shrinker and the ZFS dev driver to make them call into
libsolaris.so through a handful of dynamically registered callbacks.

The new libsolaris.so is mainly composed of the solaris and zfs sets
as defined in the makefile (and not part of the kernel anymore),
plus the bsd RPC code (xdr*), kobj and finally the new
fs/zfs/zfs_initialize.c, which provides the main INIT function -
zfs_initialize(). The zfs_initialize() function initializes various ZFS
resources like threads and memory and registers various callback
functions into the main kernel (see comments in zfs_initialize.c).

Two important gotchas I have discovered are:
1) The libsolaris.so needs to be built with BIND_NOW so that all symbols
   are resolved eagerly. This avoids page faults to resolve those symbols
   later, when the ZFS code in libsolaris.so is called to resolve other
   faults. Such nested faults would cause deadlocks.
2) The libsolaris.so needs the osv-mlock note so that the dynamic linker
   populates the mappings. As above, this avoids later page faults
   that would lead to deadlocks.

Please note that libsolaris.so is built with most symbols hidden
and code garbage collection on, to help minimize its size (804K)
and expose the minimum number of symbols (< 100) needed by libzfs.so.
The latter also helps avoid possible symbol collisions with other apps.

We also make changes to loader.cc to dlopen("/libsolaris.so") before
we mount the ZFS filesystem (for that reason libsolaris.so needs to be part
of the bootfs for ZFS images). Because ZFS is the root filesystem, we cannot
use the same approach we used for nfs, which is also implemented as a
shared library but is loaded in pivot_rootfs(), which happens much later.

In theory we could build a mixed disk with two partitions - the 1st a ROFS
one with libsolaris.so on it and the 2nd a ZFS one, which would be mounted
after we mount ROFS and load and initialize libsolaris.so from it.

I have tested this patch by running unit tests (all pass) and also using
tests/misc-zfs-io.cc as well as running stress test of MySQL on ZFS
image.

Fixes #1009

Signed-off-by: Waldemar Kozaczuk 
---
 Makefile  | 51 
 bootfs.manifest.skel  |  1 +
 bsd/init.cc   |  7 ---
 bsd/porting/shrinker.cc   | 22 +++--
 core/pagecache.cc | 45 +-
 drivers/zfs.cc| 12 -
 fs/zfs/zfs_initialize.c   | 97 +++
 fs/zfs/zfs_null_vfsops.cc | 54 ++
 libc/misc/uname.c |  2 +-
 loader.cc | 50 
 usr.manifest.skel |  1 +
 11 files changed, 289 insertions(+), 53 deletions(-)
 create mode 100644 fs/zfs/zfs_initialize.c
 create mode 100644 fs/zfs/zfs_null_vfsops.cc

diff --git a/Makefile b/Makefile
index 7acf130c..d88efdb9 100644
--- a/Makefile
+++ b/Makefile
@@ -568,7 +568,6 @@ bsd += bsd/porting/kthread.o
 bsd += bsd/porting/mmu.o
 bsd += bsd/porting/pcpu.o
 bsd += bsd/porting/bus_dma.o
-bsd += bsd/porting/kobj.o
 bsd += bsd/sys/netinet/if_ether.o
 bsd += bsd/sys/compat/linux/linux_socket.o
 bsd += bsd/sys/compat/linux/linux_ioctl.o
@@ -618,9 +617,6 @@ bsd += bsd/sys/netinet/cc/cc_cubic.o
 bsd += bsd/sys/netinet/cc/cc_htcp.o
 bsd += bsd/sys/netinet/cc/cc_newreno.o
 bsd += bsd/sys/netinet/arpcache.o
-bsd += bsd/sys/xdr/xdr.o
-bsd += bsd/sys/xdr/xdr_array.o
-bsd += bsd/sys/xdr/xdr_mem.o
 bsd += bsd/sys/xen/evtchn.o
 
 ifeq ($(arch),x64)
@@ -644,6 +640,11 @@ bsd += bsd/sys/dev/random/live_entropy_sources.o
 
 $(out)/bsd/sys/%.o: COMMON += -Wno-sign-compare -Wno-narrowing 
-Wno-write-strings -Wno-parentheses -Wno-unused-but-set-variable
 
+xdr :=
+xdr += bsd/sys/xdr/xdr.o
+xdr += bsd/sys/xdr/xdr_array.o
+xdr += bsd/sys/xdr/xdr_mem.o
+
 solaris :=
 solaris += bsd/sys/cddl/compat/opensolaris/kern/opensolaris.o
 solaris += bsd/sys/cddl/compat/opensolaris/kern/opensolaris_atomic.o
@@ -799,7 +800,7 @@ libtsm += drivers/libtsm/tsm_screen.o
 libtsm += drivers/libtsm/tsm_vte.o
 libtsm += drivers/libtsm/tsm_vte_charsets.o
 
-drivers := $(bsd) $(solaris)
+drivers := $(bsd)
 drivers += core/mmu.o
 drivers += arch/$(arch)/early-console.o
 drivers += drivers/console.o
@@ -1849,6 +1850,7 @@ fs_objs += virtiofs/virtiofs_vfsops.o \
 fs_objs += pseudofs/pseudofs.o
 fs_objs += 

[osv-dev] [PATCH 1/2] zfs: expose some symbols in solaris part of bsd code

2021-12-15 Thread Waldemar Kozaczuk
This patch annotates ~90 symbols with explicit public visibility
across various parts of ZFS or related code in the bsd/ subtree. This is
in preparation for the next patch, which extracts ZFS code into a separate
libsolaris.so library where all symbols but the ones marked as public
here are hidden. These symbols need to be exposed for the main user of
libsolaris.so - libzfs.so.

Signed-off-by: Waldemar Kozaczuk 
---
 .../opensolaris/kern/opensolaris_kmem.c   |  3 +
 .../opensolaris/kern/opensolaris_taskq.c  | 12 ++--
 .../cddl/contrib/opensolaris/common/avl/avl.c | 25 +++
 .../opensolaris/common/nvpair/fnvpair.c   | 12 ++--
 .../opensolaris/common/nvpair/nvpair.c| 68 ++-
 .../opensolaris/common/zfs/zfeature_common.c  |  8 ++-
 .../opensolaris/common/zfs/zfs_comutil.c  |  8 ++-
 .../opensolaris/common/zfs/zfs_namecheck.c|  7 +-
 .../contrib/opensolaris/common/zfs/zfs_prop.c | 30 
 .../opensolaris/common/zfs/zpool_prop.c   | 30 
 .../opensolaris/common/zfs/zprop_common.c |  4 +-
 .../opensolaris/uts/common/fs/zfs/spa.c   |  4 +-
 .../opensolaris/uts/common/fs/zfs/spa_misc.c  | 13 ++--
 include/osv/export.h  |  3 +
 14 files changed, 128 insertions(+), 99 deletions(-)

diff --git a/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c 
b/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
index f7c6b53a..aac97ce8 100644
--- a/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
+++ b/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_kmem.c
@@ -35,6 +35,8 @@
 #include 
 #include 
 
+#include 
+
 void *
 zfs_kmem_alloc(size_t size, int kmflags)
 {
@@ -133,6 +135,7 @@ kmem_debugging(void)
return (0);
 }
 
+OSV_LIB_SOLARIS_API
 uint64_t kmem_size(void)
 {
return physmem * PAGE_SIZE;
diff --git a/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c 
b/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c
index 9711bb4f..3fc69e84 100644
--- a/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c
+++ b/bsd/sys/cddl/compat/opensolaris/kern/opensolaris_taskq.c
@@ -35,12 +35,14 @@
 #include 
 #include 
 #include 
+#include 
 
 static uma_zone_t taskq_zone;
 
+OSV_LIB_SOLARIS_API
 taskq_t *system_taskq = NULL;
 
-void
+OSV_LIB_SOLARIS_API void
 system_taskq_init(void *arg)
 {
taskq_zone = uma_zcreate("taskq_zone", sizeof(struct ostask),
@@ -49,7 +51,7 @@ system_taskq_init(void *arg)
 }
 SYSINIT(system_taskq_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_init, 
NULL);
 
-void
+OSV_LIB_SOLARIS_API void
 system_taskq_fini(void *arg)
 {
 
@@ -58,7 +60,7 @@ system_taskq_fini(void *arg)
 }
 SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, 
system_taskq_fini, NULL);
 
-taskq_t *
+OSV_LIB_SOLARIS_API taskq_t *
 taskq_create(const char *name, int nthreads, pri_t pri, int minalloc 
__bsd_unused2,
 int maxalloc __bsd_unused2, uint_t flags)
 {
@@ -83,7 +85,7 @@ taskq_create_proc(const char *name, int nthreads, pri_t pri, 
int minalloc,
return (taskq_create(name, nthreads, pri, minalloc, maxalloc, flags));
 }
 
-void
+OSV_LIB_SOLARIS_API void
 taskq_destroy(taskq_t *tq)
 {
 
@@ -108,7 +110,7 @@ taskq_run(void *arg, int pending __bsd_unused2)
uma_zfree(taskq_zone, task);
 }
 
-taskqid_t
+OSV_LIB_SOLARIS_API taskqid_t
 taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 {
struct ostask *task;
diff --git a/bsd/sys/cddl/contrib/opensolaris/common/avl/avl.c 
b/bsd/sys/cddl/contrib/opensolaris/common/avl/avl.c
index e5ac2f7e..6413c208 100644
--- a/bsd/sys/cddl/contrib/opensolaris/common/avl/avl.c
+++ b/bsd/sys/cddl/contrib/opensolaris/common/avl/avl.c
@@ -93,6 +93,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
  * Small arrays to translate between balance (or diff) values and child 
indeces.
@@ -121,7 +122,7 @@ static const int  avl_balance2child[]   = {0, 0, 1};
  * NULL - if at the end of the nodes
  * otherwise next node
  */
-void *
+OSV_LIB_SOLARIS_API void *
 avl_walk(avl_tree_t *tree, void*oldnode, int left)
 {
size_t off = tree->avl_offset;
@@ -168,7 +169,7 @@ avl_walk(avl_tree_t *tree, void *oldnode, int left)
  * Return the lowest valued node in a tree or NULL.
  * (leftmost child from root of tree)
  */
-void *
+OSV_LIB_SOLARIS_API void *
 avl_first(avl_tree_t *tree)
 {
avl_node_t *node;
@@ -187,7 +188,7 @@ avl_first(avl_tree_t *tree)
  * Return the highest valued node in a tree or NULL.
  * (rightmost child from root of tree)
  */
-void *
+OSV_LIB_SOLARIS_API void *
 avl_last(avl_tree_t *tree)
 {
avl_node_t *node;
@@ -211,7 +212,7 @@ avl_last(avl_tree_t *tree)
  * NULL: no node in the given direction
  * "void *"  of the found tree node
  */
-void *
+OSV_LIB_SOLARIS_API void *
 avl_nearest(avl_tree_t *tree, avl_index_t where, int direction)
 {
int child = AVL_INDEX2CHILD(where);
@@ -240,7 +241,7 @@ avl_nearest(avl_tree_t *tree, avl_index_t where, int 
direction)
  *