common

Baptiste Daroussin Tue, 26 Feb 2019 08:05:28 -0800

On Tue, Feb 26, 2019 at 07:48:27AM -0800, Cy Schubert wrote:
> On February 26, 2019 12:18:35 AM PST, Baptiste Daroussin <b...@freebsd.org> 
> wrote:
> >Author: bapt
> >Date: Tue Feb 26 08:18:34 2019
> >New Revision: 344569
> >URL: https://svnweb.freebsd.org/changeset/base/344569
> >
> >Log:
> >  Implement parallel mounting for ZFS filesystem
> >  
> >  It was first implemented on Illumos and then ported to ZoL.
> >  This patch is a port to FreeBSD of the ZoL version.
> >  This patch also includes a fix for a race condition that was amended
> >  
> >With such patch Delphix has seen a huge decrease in latency of the
> >mount phase
> >  (https://github.com/openzfs/openzfs/commit/a3f0e2b569 for details).
> >With that current change Gandi has measured improvments that are on par
> >with
> >  those reported by Delphix.
> >  
> >  Zol commits incorporated:
> >https://github.com/zfsonlinux/zfs/commit/a10d50f999511d304f910852c7825c70c9c9e303
> >https://github.com/zfsonlinux/zfs/commit/e63ac16d25fbe991a356489c86d4077567dfea21
> >  
> >  Reviewed by:       avg, sef
> >  Approved by:       avg, sef
> >  Obtained from:     ZoL
> >  MFC after: 1 month
> >  Relnotes:  yes
> >  Sponsored by:      Gandi.net
> >  Differential Revision:     https://reviews.freebsd.org/D19098
> >
> >Modified:
> >  head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
> >  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
> >  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
> >  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
> >  head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c
> >
> >Modified: head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c
> >==============================================================================
> >--- head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c Tue Feb 26
> >06:22:10 2019        (r344568)
> >+++ head/cddl/contrib/opensolaris/cmd/zfs/zfs_main.c Tue Feb 26
> >08:18:34 2019        (r344569)
> >@@ -5812,8 +5812,13 @@ zfs_do_holds(int argc, char **argv)
> > 
> > #define     CHECK_SPINNER 30
> > #define     SPINNER_TIME 3          /* seconds */
> >-#define     MOUNT_TIME 5            /* seconds */
> >+#define     MOUNT_TIME 1            /* seconds */
> > 
> >+typedef struct get_all_state {
> >+    boolean_t       ga_verbose;
> >+    get_all_cb_t    *ga_cbp;
> >+} get_all_state_t;
> >+
> > static int
> > get_one_dataset(zfs_handle_t *zhp, void *data)
> > {
> >@@ -5821,10 +5826,10 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
> >     static int spinval = 0;
> >     static int spincheck = 0;
> >     static time_t last_spin_time = (time_t)0;
> >-    get_all_cb_t *cbp = data;
> >+    get_all_state_t *state = data;
> >     zfs_type_t type = zfs_get_type(zhp);
> > 
> >-    if (cbp->cb_verbose) {
> >+    if (state->ga_verbose) {
> >             if (--spincheck < 0) {
> >                     time_t now = time(NULL);
> >                     if (last_spin_time + SPINNER_TIME < now) {
> >@@ -5850,26 +5855,24 @@ get_one_dataset(zfs_handle_t *zhp, void *data)
> >             zfs_close(zhp);
> >             return (0);
> >     }
> >-    libzfs_add_handle(cbp, zhp);
> >-    assert(cbp->cb_used <= cbp->cb_alloc);
> >+    libzfs_add_handle(state->ga_cbp, zhp);
> >+    assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc);
> > 
> >     return (0);
> > }
> > 
> > static void
> >-get_all_datasets(zfs_handle_t ***dslist, size_t *count, boolean_t
> >verbose)
> >+get_all_datasets(get_all_cb_t *cbp, boolean_t verbose)
> > {
> >-    get_all_cb_t cb = { 0 };
> >-    cb.cb_verbose = verbose;
> >-    cb.cb_getone = get_one_dataset;
> >+    get_all_state_t state = {
> >+            .ga_verbose = verbose,
> >+            .ga_cbp = cbp
> >+    };
> > 
> >     if (verbose)
> >             set_progress_header(gettext("Reading ZFS config"));
> >-    (void) zfs_iter_root(g_zfs, get_one_dataset, &cb);
> >+    (void) zfs_iter_root(g_zfs, get_one_dataset, &state);
> > 
> >-    *dslist = cb.cb_handles;
> >-    *count = cb.cb_used;
> >-
> >     if (verbose)
> >             finish_progress(gettext("done."));
> > }
> >@@ -5879,9 +5882,20 @@ get_all_datasets(zfs_handle_t ***dslist, size_t
> >*count
> >* similar, we have a common function with an extra parameter to
> >determine which
> >  * mode we are using.
> >  */
> >-#define     OP_SHARE        0x1
> >-#define     OP_MOUNT        0x2
> >+typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t;
> > 
> >+typedef struct share_mount_state {
> >+    share_mount_op_t        sm_op;
> >+    boolean_t       sm_verbose;
> >+    int     sm_flags;
> >+    char    *sm_options;
> >+    char    *sm_proto; /* only valid for OP_SHARE */
> >+    pthread_mutex_t sm_lock; /* protects the remaining fields */
> >+    uint_t  sm_total; /* number of filesystems to process */
> >+    uint_t  sm_done; /* number of filesystems processed */
> >+    int     sm_status; /* -1 if any of the share/mount operations failed */
> >+} share_mount_state_t;
> >+
> > /*
> >  * Share or mount a dataset.
> >  */
> >@@ -6122,6 +6136,29 @@ report_mount_progress(int current, int total)
> >             update_progress(info);
> > }
> > 
> >+/*
> >+ * zfs_foreach_mountpoint() callback that mounts or shares on
> >filesystem and
> >+ * updates the progress meter
> >+ */
> >+static int
> >+share_mount_one_cb(zfs_handle_t *zhp, void *arg)
> >+{
> >+    share_mount_state_t *sms = arg;
> >+    int ret;
> >+
> >+    ret = share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto,
> >+        B_FALSE, sms->sm_options);
> >+
> >+    pthread_mutex_lock(&sms->sm_lock);
> >+    if (ret != 0)
> >+            sms->sm_status = ret;
> >+    sms->sm_done++;
> >+    if (sms->sm_verbose)
> >+            report_mount_progress(sms->sm_done, sms->sm_total);
> >+    pthread_mutex_unlock(&sms->sm_lock);
> >+    return (ret);
> >+}
> >+
> > static void
> > append_options(char *mntopts, char *newopts)
> > {
> >@@ -6194,8 +6231,6 @@ share_mount(int op, int argc, char **argv)
> > 
> >     /* check number of arguments */
> >     if (do_all) {
> >-            zfs_handle_t **dslist = NULL;
> >-            size_t i, count = 0;
> >             char *protocol = NULL;
> > 
> >             if (op == OP_SHARE && argc > 0) {
> >@@ -6216,35 +6251,48 @@ share_mount(int op, int argc, char **argv)
> >             }
> > 
> >             start_progress_timer();
> >-            get_all_datasets(&dslist, &count, verbose);
> >+            get_all_cb_t cb = { 0 };
> >+            get_all_datasets(&cb, verbose);
> > 
> >-            if (count == 0)
> >+            if (cb.cb_used == 0) {
> >+                    if (options != NULL)
> >+                            free(options);
> >                     return (0);
> >+            }
> > 
> >-            qsort(dslist, count, sizeof (void *), libzfs_dataset_cmp);
> > #ifdef illumos
> >-            sa_init_selective_arg_t sharearg;
> >-            sharearg.zhandle_arr = dslist;
> >-            sharearg.zhandle_len = count;
> >-            if ((ret = zfs_init_libshare_arg(zfs_get_handle(dslist[0]),
> >-                SA_INIT_SHARE_API_SELECTIVE, &sharearg)) != SA_OK) {
> >-                    (void) fprintf(stderr,
> >-                        gettext("Could not initialize libshare, %d"), ret);
> >-                    return (ret);
> >+            if (op == OP_SHARE) {
> >+                    sa_init_selective_arg_t sharearg;
> >+                    sharearg.zhandle_arr = cb.cb_handles;
> >+                    sharearg.zhandle_len = cb.cb_used;
> >+                    if ((ret = zfs_init_libshare_arg(g_zfs,
> >+                        SA_INIT_SHARE_API_SELECTIVE, &sharearg)) != SA_OK) {
> >+                            (void) fprintf(stderr, gettext(
> >+                                "Could not initialize libshare, %d"), ret);
> >+                            return (ret);
> >+                    }
> >             }
> > #endif
> >+            share_mount_state_t share_mount_state = { 0 };
> >+            share_mount_state.sm_op = op;
> >+            share_mount_state.sm_verbose = verbose;
> >+            share_mount_state.sm_flags = flags;
> >+            share_mount_state.sm_options = options;
> >+            share_mount_state.sm_proto = protocol;
> >+            share_mount_state.sm_total = cb.cb_used;
> >+            pthread_mutex_init(&share_mount_state.sm_lock, NULL);
> > 
> >-            for (i = 0; i < count; i++) {
> >-                    if (verbose)
> >-                            report_mount_progress(i, count);
> >+            /*
> >+             * libshare isn't mt-safe, so only do the operation in parallel
> >+             * if we're mounting.
> >+             */
> >+            zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used,
> >+                share_mount_one_cb, &share_mount_state, op == OP_MOUNT);
> >+            ret = share_mount_state.sm_status;
> > 
> >-                    if (share_mount_one(dslist[i], op, flags, protocol,
> >-                        B_FALSE, options) != 0)
> >-                            ret = 1;
> >-                    zfs_close(dslist[i]);
> >-            }
> >-
> >-            free(dslist);
> >+            for (int i = 0; i < cb.cb_used; i++)
> >+                    zfs_close(cb.cb_handles[i]);
> >+            free(cb.cb_handles);
> >     } else if (argc == 0) {
> >             struct mnttab entry;
> > 
> >
> >Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h
> >==============================================================================
> >--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h Tue Feb 26
> >06:22:10 2019        (r344568)
> >+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h Tue Feb 26
> >08:18:34 2019        (r344569)
> >@@ -579,12 +579,12 @@ typedef struct get_all_cb {
> >     zfs_handle_t    **cb_handles;
> >     size_t          cb_alloc;
> >     size_t          cb_used;
> >-    boolean_t       cb_verbose;
> >-    int             (*cb_getone)(zfs_handle_t *, void *);
> > } get_all_cb_t;
> > 
> >+void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **,
> >size_t,
> >+    zfs_iter_f, void*, boolean_t);
> >+
> > void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *);
> >-int libzfs_dataset_cmp(const void *, const void *);
> > 
> > /*
> >  * Functions to create and destroy datasets.
> >
> >Modified:
> >head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c
> >==============================================================================
> >---
> >head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c     Tue
> >Feb 26 06:22:10 2019 (r344568)
> >+++
> >head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_dataset.c     Tue
> >Feb 26 08:18:34 2019 (r344569)
> >@@ -799,6 +799,7 @@ libzfs_mnttab_cache_compare(const void *arg1, const
> >vo
> > void
> > libzfs_mnttab_init(libzfs_handle_t *hdl)
> > {
> >+    pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL);
> >     assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0);
> >     avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare,
> >         sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node));
> >@@ -839,6 +840,7 @@ libzfs_mnttab_fini(libzfs_handle_t *hdl)
> >             free(mtn);
> >     }
> >     avl_destroy(&hdl->libzfs_mnttab_cache);
> >+    (void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock);
> > }
> > 
> > void
> >@@ -853,6 +855,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char
> >*f
> > {
> >     mnttab_node_t find;
> >     mnttab_node_t *mtn;
> >+    int ret = ENOENT;
> > 
> >     if (!hdl->libzfs_mnttab_enable) {
> >             struct mnttab srch = { 0 };
> >@@ -868,6 +871,7 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const char
> >*f
> >                     return (ENOENT);
> >     }
> > 
> >+    pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
> >     if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
> >             libzfs_mnttab_update(hdl);
> > 
> >@@ -875,9 +879,10 @@ libzfs_mnttab_find(libzfs_handle_t *hdl, const
> >char *f
> >     mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL);
> >     if (mtn) {
> >             *entry = mtn->mtn_mt;
> >-            return (0);
> >+            ret = 0;
> >     }
> >-    return (ENOENT);
> >+    pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
> >+    return (ret);
> > }
> > 
> > void
> >@@ -886,15 +891,17 @@ libzfs_mnttab_add(libzfs_handle_t *hdl, const
> >char *sp
> > {
> >     mnttab_node_t *mtn;
> > 
> >-    if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0)
> >-            return;
> >-    mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
> >-    mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
> >-    mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
> >-    mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
> >-    mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
> >-    avl_add(&hdl->libzfs_mnttab_cache, mtn);
> >-}
> >+    pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
> >+    if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) {
> >+            mtn = zfs_alloc(hdl, sizeof (mnttab_node_t));
> >+            mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special);
> >+            mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp);
> >+            mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS);
> >+            mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts);
> >+            avl_add(&hdl->libzfs_mnttab_cache, mtn);
> >+    }
> >+    pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
> >+}           
> > 
> > void
> > libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname)
> >@@ -902,6 +909,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const
> >char 
> >     mnttab_node_t find;
> >     mnttab_node_t *ret;
> > 
> >+    pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock);
> >     find.mtn_mt.mnt_special = (char *)fsname;
> >     if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL))
> >         != NULL) {
> >@@ -912,6 +920,7 @@ libzfs_mnttab_remove(libzfs_handle_t *hdl, const
> >char 
> >             free(ret->mtn_mt.mnt_mntopts);
> >             free(ret);
> >     }
> >+    pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock);
> > }
> > 
> > int
> >
> >Modified: head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h
> >==============================================================================
> >--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h    Tue
> >Feb 26 06:22:10 2019 (r344568)
> >+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_impl.h    Tue
> >Feb 26 08:18:34 2019 (r344569)
> >@@ -22,7 +22,7 @@
> > /*
> >* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights
> >reserved.
> >  * Copyright (c) 2011 Pawel Jakub Dawidek. All rights reserved.
> >- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
> >+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
> >* Copyright (c) 2013 Martin Matuska <m...@freebsd.org>. All rights
> >reserved.
> >  */
> > 
> >@@ -73,6 +73,13 @@ struct libzfs_handle {
> >     int libzfs_storeerr; /* stuff error messages into buffer */
> >     void *libzfs_sharehdl; /* libshare handle */
> >     boolean_t libzfs_mnttab_enable;
> >+    /*
> >+     * We need a lock to handle the case where parallel mount
> >+     * threads are populating the mnttab cache simultaneously. The
> >+     * lock only protects the integrity of the avl tree, and does
> >+     * not protect the contents of the mnttab entries themselves.
> >+     */
> >+    pthread_mutex_t libzfs_mnttab_cache_lock;
> >     avl_tree_t libzfs_mnttab_cache;
> >     int libzfs_pool_iter;
> >     libzfs_fru_t **libzfs_fru_hash;
> >
> >Modified:
> >head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c
> >==============================================================================
> >--- head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c   Tue
> >Feb 26 06:22:10 2019 (r344568)
> >+++ head/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_mount.c   Tue
> >Feb 26 08:18:34 2019 (r344569)
> >@@ -26,6 +26,7 @@
> >  * Copyright 2016 Igor Kozhukhov <ikozhuk...@gmail.com>
> >  * Copyright 2017 Joyent, Inc.
> >  * Copyright 2017 RackTop Systems.
> >+ * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
> >  */
> > 
> > /*
> >@@ -34,25 +35,25 @@
> >  * they are used by mount and unmount and when changing a filesystem's
> >  * mountpoint.
> >  *
> >- *  zfs_is_mounted()
> >- *  zfs_mount()
> >- *  zfs_unmount()
> >- *  zfs_unmountall()
> >+ *  zfs_is_mounted()
> >+ *  zfs_mount()
> >+ *  zfs_unmount()
> >+ *  zfs_unmountall()
> >  *
> >* This file also contains the functions used to manage sharing
> >filesystems via
> >  * NFS and iSCSI:
> >  *
> >- *  zfs_is_shared()
> >- *  zfs_share()
> >- *  zfs_unshare()
> >+ *  zfs_is_shared()
> >+ *  zfs_share()
> >+ *  zfs_unshare()
> >  *
> >- *  zfs_is_shared_nfs()
> >- *  zfs_is_shared_smb()
> >- *  zfs_share_proto()
> >- *  zfs_shareall();
> >- *  zfs_unshare_nfs()
> >- *  zfs_unshare_smb()
> >- *  zfs_unshareall_nfs()
> >+ *  zfs_is_shared_nfs()
> >+ *  zfs_is_shared_smb()
> >+ *  zfs_share_proto()
> >+ *  zfs_shareall();
> >+ *  zfs_unshare_nfs()
> >+ *  zfs_unshare_smb()
> >+ *  zfs_unshareall_nfs()
> >  *  zfs_unshareall_smb()
> >  *  zfs_unshareall()
> >  *  zfs_unshareall_bypath()
> >@@ -60,8 +61,8 @@
> >  * The following functions are available for pool consumers, and will
> >  * mount/unmount and share/unshare all datasets within pool:
> >  *
> >- *  zpool_enable_datasets()
> >- *  zpool_disable_datasets()
> >+ *  zpool_enable_datasets()
> >+ *  zpool_disable_datasets()
> >  */
> > 
> > #include <dirent.h>
> >@@ -83,10 +84,14 @@
> > #include <libzfs.h>
> > 
> > #include "libzfs_impl.h"
> >+#include <thread_pool.h>
> > 
> > #include <libshare.h>
> > #define     MAXISALEN       257     /* based on sysinfo(2) man page */
> > 
> >+static int mount_tp_nthr = 512; /* tpool threads for multi-threaded
> >mounting */
> >+
> >+static void zfs_mount_task(void *);
> > static int zfs_share_proto(zfs_handle_t *, zfs_share_proto_t *);
> > zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **,
> >     zfs_share_proto_t);
> >@@ -1134,25 +1139,32 @@ remove_mountpoint(zfs_handle_t *zhp)
> >     }
> > }
> > 
> >+/*
> >+ * Add the given zfs handle to the cb_handles array, dynamically
> >reallocating
> >+ * the array if it is out of space
> >+ */
> > void
> > libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp)
> > {
> >     if (cbp->cb_alloc == cbp->cb_used) {
> >             size_t newsz;
> >-            void *ptr;
> >+            zfs_handle_t **newhandles;
> > 
> >-            newsz = cbp->cb_alloc ? cbp->cb_alloc * 2 : 64;
> >-            ptr = zfs_realloc(zhp->zfs_hdl,
> >-                cbp->cb_handles, cbp->cb_alloc * sizeof (void *),
> >-                newsz * sizeof (void *));
> >-            cbp->cb_handles = ptr;
> >+            newsz = cbp->cb_alloc != 0 ? cbp->cb_alloc * 2 : 64;
> >+            newhandles = zfs_realloc(zhp->zfs_hdl,
> >+                cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *),
> >+                newsz * sizeof (zfs_handle_t *));
> >+            cbp->cb_handles = newhandles;
> >             cbp->cb_alloc = newsz;
> >     }
> >     cbp->cb_handles[cbp->cb_used++] = zhp;
> > }
> > 
> >+/*
> >+ * Recursive helper function used during file system enumeration
> >+ */
> > static int
> >-mount_cb(zfs_handle_t *zhp, void *data)
> >+zfs_iter_cb(zfs_handle_t *zhp, void *data)
> > {
> >     get_all_cb_t *cbp = data;
> > 
> >@@ -1178,104 +1190,362 @@ mount_cb(zfs_handle_t *zhp, void *data)
> >     }
> > 
> >     libzfs_add_handle(cbp, zhp);
> >-    if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) {
> >+    if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) != 0) {
> >             zfs_close(zhp);
> >             return (-1);
> >     }
> >     return (0);
> > }
> > 
> >-int
> >-libzfs_dataset_cmp(const void *a, const void *b)
> >+/*
> >+ * Sort comparator that compares two mountpoint paths. We sort these
> >paths so
> >+ * that subdirectories immediately follow their parents. This means
> >that we
> >+ * effectively treat the '/' character as the lowest value non-nul
> >char.
> >+ * Since filesystems from non-global zones can have the same
> >mountpoint
> >+ * as other filesystems, the comparator sorts global zone filesystems
> >to
> >+ * the top of the list. This means that the global zone will traverse
> >the
> >+ * filesystem list in the correct order and can stop when it sees the
> >+ * first zoned filesystem. In a non-global zone, only the delegated
> >+ * filesystems are seen.
> >+ *
> >+ * An example sorted list using this comparator would look like:
> >+ *
> >+ * /foo
> >+ * /foo/bar
> >+ * /foo/bar/baz
> >+ * /foo/baz
> >+ * /foo.bar
> >+ * /foo (NGZ1)
> >+ * /foo (NGZ2)
> >+ *
> >+ * The mount code depend on this ordering to deterministically iterate
> >+ * over filesystems in order to spawn parallel mount tasks.
> >+ */
> >+static int
> >+mountpoint_cmp(const void *arga, const void *argb)
> > {
> >-    zfs_handle_t **za = (zfs_handle_t **)a;
> >-    zfs_handle_t **zb = (zfs_handle_t **)b;
> >+    zfs_handle_t *const *zap = arga;
> >+    zfs_handle_t *za = *zap;
> >+    zfs_handle_t *const *zbp = argb;
> >+    zfs_handle_t *zb = *zbp;
> >     char mounta[MAXPATHLEN];
> >     char mountb[MAXPATHLEN];
> >+    const char *a = mounta;
> >+    const char *b = mountb;
> >     boolean_t gota, gotb;
> >+    uint64_t zoneda, zonedb;
> > 
> >-    if ((gota = (zfs_get_type(*za) == ZFS_TYPE_FILESYSTEM)) != 0)
> >-            verify(zfs_prop_get(*za, ZFS_PROP_MOUNTPOINT, mounta,
> >+    zoneda = zfs_prop_get_int(za, ZFS_PROP_ZONED);
> >+    zonedb = zfs_prop_get_int(zb, ZFS_PROP_ZONED);
> >+    if (zoneda && !zonedb)
> >+            return (1);
> >+    if (!zoneda && zonedb)
> >+            return (-1);
> >+    gota = (zfs_get_type(za) == ZFS_TYPE_FILESYSTEM);
> >+    if (gota)
> >+            verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta,
> >                 sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0);
> >-    if ((gotb = (zfs_get_type(*zb) == ZFS_TYPE_FILESYSTEM)) != 0)
> >-            verify(zfs_prop_get(*zb, ZFS_PROP_MOUNTPOINT, mountb,
> >+    gotb = (zfs_get_type(zb) == ZFS_TYPE_FILESYSTEM);
> >+    if (gotb)
> >+            verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb,
> >                 sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0);
> > 
> >-    if (gota && gotb)
> >-            return (strcmp(mounta, mountb));
> >+    if (gota && gotb) {
> >+            while (*a != '\0' && (*a == *b)) {
> >+                    a++;
> >+                    b++;
> >+            }
> >+            if (*a == *b)
> >+                    return (0);
> >+            if (*a == '\0')
> >+                    return (-1);
> >+            if (*b == '\0')
> >+                    return (-1);
> >+            if (*a == '/')
> >+                    return (-1);
> >+            if (*b == '/')
> >+                    return (-1);
> >+            return (*a < *b ? -1 : *a > *b);
> >+    }
> > 
> >     if (gota)
> >             return (-1);
> >     if (gotb)
> >             return (1);
> > 
> >-    return (strcmp(zfs_get_name(a), zfs_get_name(b)));
> >+    /*
> >+     * If neither filesystem has a mountpoint, revert to sorting by
> >+     * datset name.
> >+     */
> >+    return (strcmp(zfs_get_name(za), zfs_get_name(zb)));
> > }
> > 
> > /*
> >+ * Reutrn true if path2 is a child of path1
> >+ */
> >+static boolean_t
> >+libzfs_path_contains(const char *path1, const char *path2)
> >+{
> >+    return (strstr(path2, path1) == path2 && path2[strlen(path1)] ==
> >'/');
> >+}
> >+
> >+
> >+static int
> >+non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int
> >idx)
> >+{
> >+    char parent[ZFS_MAXPROPLEN];
> >+    char child[ZFS_MAXPROPLEN];
> >+    int i;
> >+
> >+    verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, parent,
> >+        sizeof (parent), NULL, NULL, 0, B_FALSE) == 0);
> >+
> >+    for (i = idx + 1; i < num_handles; i++) {
> >+            verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, child,
> >+                sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
> >+            if (!libzfs_path_contains(parent, child))
> >+                    break;
> >+    }
> >+    return (i);
> >+}
> >+
> >+typedef struct mnt_param {
> >+    libzfs_handle_t *mnt_hdl;
> >+    tpool_t         *mnt_tp;
> >+    zfs_handle_t    **mnt_zhps; /* filesystems to mount */
> >+    size_t          mnt_num_handles;
> >+    int             mnt_idx;        /* Index of selected entry to mount */
> >+    zfs_iter_f      mnt_func;
> >+    void            *mnt_data;
> >+} mnt_param_t;
> >+
> >+/*
> >+ * Allocate and populate the parameter struct for mount function, and
> >+ * schedule mounting of the entry selected by idx.
> >+ */
> >+static void
> >+zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles,
> >+    size_t num_handles, int idx, zfs_iter_f func, void *data, tpool_t
> >*tp)
> >+{
> >+    mnt_param_t *mnt_param = zfs_alloc(hdl, sizeof (mnt_param_t));
> >+
> >+    mnt_param->mnt_hdl = hdl;
> >+    mnt_param->mnt_tp = tp;
> >+    mnt_param->mnt_zhps = handles;
> >+    mnt_param->mnt_num_handles = num_handles;
> >+    mnt_param->mnt_idx = idx;
> >+    mnt_param->mnt_func = func;
> >+    mnt_param->mnt_data = data;
> >+
> >+    (void) tpool_dispatch(tp, zfs_mount_task, (void*)mnt_param);
> >+}
> >+
> >+/*
> >+ * This is the structure used to keep state of mounting or sharing
> >operations
> >+ * during a call to zpool_enable_datasets().
> >+ */
> >+typedef struct mount_state {
> >+    /*
> >+     * ms_mntstatus is set to -1 if any mount fails. While multiple
> >threads
> >+     * could update this variable concurrently, no synchronization is
> >+     * needed as it's only ever set to -1.
> >+     */
> >+    int             ms_mntstatus;
> >+    int             ms_mntflags;
> >+    const char      *ms_mntopts;
> >+} mount_state_t;
> >+
> >+static int
> >+zfs_mount_one(zfs_handle_t *zhp, void *arg)
> >+{
> >+    mount_state_t *ms = arg;
> >+    int ret = 0;
> >+
> >+    if (zfs_mount(zhp, ms->ms_mntopts, ms->ms_mntflags) != 0)
> >+            ret = ms->ms_mntstatus = -1;
> >+    return (ret);
> >+}
> >+
> >+static int
> >+zfs_share_one(zfs_handle_t *zhp, void *arg)
> >+{
> >+    mount_state_t *ms = arg;
> >+    int ret = 0;
> >+
> >+    if (zfs_share(zhp) != 0)
> >+            ret = ms->ms_mntstatus = -1;
> >+    return (ret);
> >+}
> >+
> >+/*
> >+ * Thread pool function to mount one file system. On completion, it
> >finds and
> >+ * schedules its children to be mounted. This depends on the sorting
> >done in
> >+ * zfs_foreach_mountpoint(). Note that the degenerate case (chain of
> >entries
> >+ * each descending from the previous) will have no parallelism since
> >we always
> >+ * have to wait for the parent to finish mounting before we can
> >schedule
> >+ * its children.
> >+ */
> >+static void
> >+zfs_mount_task(void *arg)
> >+{
> >+    mnt_param_t *mp = arg;
> >+    int idx = mp->mnt_idx;
> >+    zfs_handle_t **handles = mp->mnt_zhps;
> >+    size_t num_handles = mp->mnt_num_handles;
> >+    char mountpoint[ZFS_MAXPROPLEN];
> >+
> >+    verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint,
> >+        sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0);
> >+
> >+    if (mp->mnt_func(handles[idx], mp->mnt_data) != 0)
> >+            return;
> >+
> >+    /*
> >+     * We dispatch tasks to mount filesystems with mountpoints underneath
> >+     * this one. We do this by dispatching the next filesystem with a
> >+     * descendant mountpoint of the one we just mounted, then skip all of
> >+     * its descendants, dispatch the next descendant mountpoint, and so
> >on.
> >+     * The non_descendant_idx() function skips over filesystems that are
> >+     * descendants of the filesystem we just dispatched.
> >+     */
> >+    for (int i = idx + 1; i < num_handles;
> >+        i = non_descendant_idx(handles, num_handles, i)) {
> >+            char child[ZFS_MAXPROPLEN];
> >+            verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT,
> >+                child, sizeof (child), NULL, NULL, 0, B_FALSE) == 0);
> >+
> >+            if (!libzfs_path_contains(mountpoint, child))
> >+                    break; /* not a descendant, return */
> >+            zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i,
> >+                mp->mnt_func, mp->mnt_data, mp->mnt_tp);
> >+    }
> >+    free(mp);
> >+}
> >+
> >+/*
> >+ * Issue the func callback for each ZFS handle contained in the
> >handles
> >+ * array. This function is used to mount all datasets, and so this
> >function
> >+ * guarantees that filesystems for parent mountpoints are called
> >before their
> >+ * children. As such, before issuing any callbacks, we first sort the
> >array
> >+ * of handles by mountpoint.
> >+ *
> >+ * Callbacks are issued in one of two ways:
> >+ *
> >+ * 1. Sequentially: If the parallel argument is B_FALSE or the
> >ZFS_SERIAL_MOUNT
> >+ *    environment variable is set, then we issue callbacks
> >sequentially.
> >+ *
> >+ * 2. In parallel: If the parallel argument is B_TRUE and the
> >ZFS_SERIAL_MOUNT
> >+ *    environment variable is not set, then we use a tpool to dispatch
> >threads
> >+ *    to mount filesystems in parallel. This function dispatches tasks
> >to mount
> >+ *    the filesystems at the top-level mountpoints, and these tasks in
> >turn
> >+ *    are responsible for recursively mounting filesystems in their
> >children
> >+ *    mountpoints.
> >+ */
> >+void
> >+zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles,
> >+    size_t num_handles, zfs_iter_f func, void *data, boolean_t
> >parallel)
> >+{
> >+    zoneid_t zoneid = getzoneid();
> >+
> >+    /*
> >+     * The ZFS_SERIAL_MOUNT environment variable is an undocumented
> >+     * variable that can be used as a convenience to do a/b comparison
> >+     * of serial vs. parallel mounting.
> >+     */
> >+    boolean_t serial_mount = !parallel ||
> >+        (getenv("ZFS_SERIAL_MOUNT") != NULL);
> >+
> >+    /*
> >+     * Sort the datasets by mountpoint. See mountpoint_cmp for details
> >+     * of how these are sorted.
> >+     */
> >+    qsort(handles, num_handles, sizeof (zfs_handle_t *), mountpoint_cmp);
> >+
> >+    if (serial_mount) {
> >+            for (int i = 0; i < num_handles; i++) {
> >+                    func(handles[i], data);
> >+            }
> >+            return;
> >+    }
> >+
> >+    /*
> >+     * Issue the callback function for each dataset using a parallel
> >+     * algorithm that uses a thread pool to manage threads.
> >+     */
> >+    tpool_t *tp = tpool_create(1, mount_tp_nthr, 0, NULL);
> >+
> >+    /*
> >+     * There may be multiple "top level" mountpoints outside of the
> >pool's
> >+     * root mountpoint, e.g.: /foo /bar. Dispatch a mount task for each
> >of
> >+     * these.
> >+     */
> >+    for (int i = 0; i < num_handles;
> >+        i = non_descendant_idx(handles, num_handles, i)) {
> >+            /*
> >+             * Since the mountpoints have been sorted so that the zoned
> >+             * filesystems are at the end, a zoned filesystem seen from
> >+             * the global zone means that we're done.
> >+             */
> >+            if (zoneid == GLOBAL_ZONEID &&
> >+                zfs_prop_get_int(handles[i], ZFS_PROP_ZONED))
> >+                    break;
> >+            zfs_dispatch_mount(hdl, handles, num_handles, i, func, data,
> >+                tp);
> >+    }
> >+
> >+    tpool_wait(tp); /* wait for all scheduled mounts to complete */
> >+    tpool_destroy(tp);
> >+}
> >+
> >+/*
> >* Mount and share all datasets within the given pool.  This assumes
> >that no
> >- * datasets within the pool are currently mounted.  Because users can
> >create
> >- * complicated nested hierarchies of mountpoints, we first gather all
> >the
> >- * datasets and mountpoints within the pool, and sort them by
> >mountpoint.  Once
> >- * we have the list of all filesystems, we iterate over them in order
> >and mount
> >- * and/or share each one.
> >+ * datasets within the pool are currently mounted.
> >  */
> > #pragma weak zpool_mount_datasets = zpool_enable_datasets
> > int
> >zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int
> >flags)
> > {
> >     get_all_cb_t cb = { 0 };
> >-    libzfs_handle_t *hdl = zhp->zpool_hdl;
> >+    mount_state_t ms = { 0 };
> >     zfs_handle_t *zfsp;
> >-    int i, ret = -1;
> >-    int *good;
> >+    int ret = 0;
> > 
> >-    /*
> >-     * Gather all non-snap datasets within the pool.
> >-     */
> >-    if ((zfsp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_DATASET)) ==
> >NULL)
> >+    if ((zfsp = zfs_open(zhp->zpool_hdl, zhp->zpool_name,
> >+        ZFS_TYPE_DATASET)) == NULL)
> >             goto out;
> > 
> >-    libzfs_add_handle(&cb, zfsp);
> >-    if (zfs_iter_filesystems(zfsp, mount_cb, &cb) != 0)
> >-            goto out;
> >     /*
> >-     * Sort the datasets by mountpoint.
> >+     * Gather all non-snapshot datasets within the pool. Start by adding
> >+     * the root filesystem for this pool to the list, and then iterate
> >+     * over all child filesystems.
> >      */
> >-    qsort(cb.cb_handles, cb.cb_used, sizeof (void *),
> >-        libzfs_dataset_cmp);
> >+    libzfs_add_handle(&cb, zfsp);
> >+    if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) != 0)
> >+            goto out;
> > 
> >     /*
> >-     * And mount all the datasets, keeping track of which ones
> >-     * succeeded or failed.
> >+     * Mount all filesystems
> >      */
> >-    if ((good = zfs_alloc(zhp->zpool_hdl,
> >-        cb.cb_used * sizeof (int))) == NULL)
> >-            goto out;
> >+    ms.ms_mntopts = mntopts;
> >+    ms.ms_mntflags = flags;
> >+    zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
> >+        zfs_mount_one, &ms, B_TRUE);
> >+    if (ms.ms_mntstatus != 0)
> >+            ret = ms.ms_mntstatus;
> > 
> >-    ret = 0;
> >-    for (i = 0; i < cb.cb_used; i++) {
> >-            if (zfs_mount(cb.cb_handles[i], mntopts, flags) != 0)
> >-                    ret = -1;
> >-            else
> >-                    good[i] = 1;
> >-    }
> >-
> >     /*
> >-     * Then share all the ones that need to be shared. This needs
> >-     * to be a separate pass in order to avoid excessive reloading
> >-     * of the configuration. Good should never be NULL since
> >-     * zfs_alloc is supposed to exit if memory isn't available.
> >+     * Share all filesystems that need to be shared. This needs to be
> >+     * a separate pass because libshare is not mt-safe, and so we need
> >+     * to share serially.
> >      */
> >-    for (i = 0; i < cb.cb_used; i++) {
> >-            if (good[i] && zfs_share(cb.cb_handles[i]) != 0)
> >-                    ret = -1;
> >-    }
> >+    ms.ms_mntstatus = 0;
> >+    zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used,
> >+        zfs_share_one, &ms, B_FALSE);
> >+    if (ms.ms_mntstatus != 0)
> >+            ret = ms.ms_mntstatus;
> > 
> >-    free(good);
> >-
> > out:
> >-    for (i = 0; i < cb.cb_used; i++)
> >+    for (int i = 0; i < cb.cb_used; i++)
> >             zfs_close(cb.cb_handles[i]);
> >     free(cb.cb_handles);
> > 
> 
> This broke my systems, many filesystems fail to mount causing nullfs late 
> mounts to fail. No details now until tonight.
> 
> Suggest we back this out until it is properly tested.
>


What fails to mount? what message? can you provide Gandi folks more informations
so they can fix?

I will revert if we cannot have a quick fix but let s give them a chance to fix
first.

Best regards,
Bapt

signature.asc
Description: PGP signature

Re: svn commit: r344569 - in head/cddl/contrib/opensolaris: cmd/zfs lib/libzfs/common

Reply via email to