Author: avg
Date: Wed Oct 16 06:26:51 2019
New Revision: 353612
URL: https://svnweb.freebsd.org/changeset/base/353612

Log:
  MFC r353611: 10330 merge recent ZoL vdev and metaslab changes
  
  illumos/illumos-gate@a0b03b161c4df3cfc54fbc741db09b3bdc23ffba
  
  https://github.com/illumos/illumos-gate/commit/a0b03b161c4df3cfc54fbc741db09b3bdc23ffba
  
  https://www.illumos.org/issues/10330
    3 recent ZoL changes in the vdev and metaslab code which we can pull over:
    PR 8324 c853f382db 8324 Change target size of metaslabs from 256GB to 16GB
    PR 8290 b194fab0fb 8290 Factor metaslab_load_wait() in metaslab_load()
    PR 8286 419ba59145 8286 Update vdev_is_spacemap_addressable() for new spacemap encoding
  
  Author: Serapheim Dimitropoulos <seraphe...@gmail.com>
  Obtained from:        illumos, ZoL
  MFC after:    2 weeks

Modified:
  head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
Directory Properties:
  head/cddl/contrib/opensolaris/   (props changed)
  head/cddl/contrib/opensolaris/cmd/zdb/   (props changed)
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/cddl/contrib/opensolaris/cmd/zdb/zdb.c
==============================================================================
--- head/cddl/contrib/opensolaris/cmd/zdb/zdb.c Wed Oct 16 06:18:37 2019        (r353611)
+++ head/cddl/contrib/opensolaris/cmd/zdb/zdb.c Wed Oct 16 06:26:51 2019        (r353612)
@@ -901,11 +901,8 @@ dump_metaslab(metaslab_t *msp)
 
        if (dump_opt['m'] > 2 && !dump_opt['L']) {
                mutex_enter(&msp->ms_lock);
-               metaslab_load_wait(msp);
-               if (!msp->ms_loaded) {
-                       VERIFY0(metaslab_load(msp));
-                       range_tree_stat_verify(msp->ms_allocatable);
-               }
+               VERIFY0(metaslab_load(msp));
+               range_tree_stat_verify(msp->ms_allocatable);
                dump_metaslab_stats(msp);
                metaslab_unload(msp);
                mutex_exit(&msp->ms_lock);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c      Wed Oct 16 06:18:37 2019        (r353611)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c      Wed Oct 16 06:26:51 2019        (r353612)
@@ -1468,7 +1468,7 @@ metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
 /*
  * Wait for any in-progress metaslab loads to complete.
  */
-void
+static void
 metaslab_load_wait(metaslab_t *msp)
 {
        ASSERT(MUTEX_HELD(&msp->ms_lock));
@@ -1479,20 +1479,17 @@ metaslab_load_wait(metaslab_t *msp)
        }
 }
 
-int
-metaslab_load(metaslab_t *msp)
+static int
+metaslab_load_impl(metaslab_t *msp)
 {
        int error = 0;
-       boolean_t success = B_FALSE;
 
        ASSERT(MUTEX_HELD(&msp->ms_lock));
-       ASSERT(!msp->ms_loaded);
-       ASSERT(!msp->ms_loading);
+       ASSERT(msp->ms_loading);
 
-       msp->ms_loading = B_TRUE;
        /*
         * Nobody else can manipulate a loading metaslab, so it's now safe
-        * to drop the lock.  This way we don't have to hold the lock while
+        * to drop the lock. This way we don't have to hold the lock while
         * reading the spacemap from disk.
         */
        mutex_exit(&msp->ms_lock);
@@ -1509,29 +1506,49 @@ metaslab_load(metaslab_t *msp)
                    msp->ms_start, msp->ms_size);
        }
 
-       success = (error == 0);
-
        mutex_enter(&msp->ms_lock);
-       msp->ms_loading = B_FALSE;
 
-       if (success) {
-               ASSERT3P(msp->ms_group, !=, NULL);
-               msp->ms_loaded = B_TRUE;
+       if (error != 0)
+               return (error);
 
-               /*
-                * If the metaslab already has a spacemap, then we need to
-                * remove all segments from the defer tree; otherwise, the
-                * metaslab is completely empty and we can skip this.
-                */
-               if (msp->ms_sm != NULL) {
-                       for (int t = 0; t < TXG_DEFER_SIZE; t++) {
-                               range_tree_walk(msp->ms_defer[t],
-                                   range_tree_remove, msp->ms_allocatable);
-                       }
+       ASSERT3P(msp->ms_group, !=, NULL);
+       msp->ms_loaded = B_TRUE;
+
+       /*
+        * If the metaslab already has a spacemap, then we need to
+        * remove all segments from the defer tree; otherwise, the
+        * metaslab is completely empty and we can skip this.
+        */
+       if (msp->ms_sm != NULL) {
+               for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+                       range_tree_walk(msp->ms_defer[t],
+                           range_tree_remove, msp->ms_allocatable);
                }
-               msp->ms_max_size = metaslab_block_maxsize(msp);
        }
+       msp->ms_max_size = metaslab_block_maxsize(msp);
+
+       return (0);
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+       ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+       /*
+        * There may be another thread loading the same metaslab, if that's
+        * the case just wait until the other thread is done and return.
+        */
+       metaslab_load_wait(msp);
+       if (msp->ms_loaded)
+               return (0);
+       VERIFY(!msp->ms_loading);
+
+       msp->ms_loading = B_TRUE;
+       int error = metaslab_load_impl(msp);
+       msp->ms_loading = B_FALSE;
        cv_broadcast(&msp->ms_load_cv);
+
        return (error);
 }
 
@@ -2091,13 +2108,10 @@ metaslab_activate(metaslab_t *msp, int allocator, uint
        ASSERT(MUTEX_HELD(&msp->ms_lock));
 
        if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
-               int error = 0;
-               metaslab_load_wait(msp);
-               if (!msp->ms_loaded) {
-                       if ((error = metaslab_load(msp)) != 0) {
-                               metaslab_group_sort(msp->ms_group, msp, 0);
-                               return (error);
-                       }
+               int error = metaslab_load(msp);
+               if (error != 0) {
+                       metaslab_group_sort(msp->ms_group, msp, 0);
+                       return (error);
                }
                if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) {
                        /*
@@ -2209,9 +2223,7 @@ metaslab_preload(void *arg)
        ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
 
        mutex_enter(&msp->ms_lock);
-       metaslab_load_wait(msp);
-       if (!msp->ms_loaded)
-               (void) metaslab_load(msp);
+       (void) metaslab_load(msp);
        msp->ms_selected_txg = spa_syncing_txg(spa);
        mutex_exit(&msp->ms_lock);
 }

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h  Wed Oct 16 06:18:37 2019        (r353611)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h  Wed Oct 16 06:26:51 2019        (r353612)
@@ -48,7 +48,6 @@ int metaslab_init(metaslab_group_t *, uint64_t, uint64
     metaslab_t **);
 void metaslab_fini(metaslab_t *);
 
-void metaslab_load_wait(metaslab_t *);
 int metaslab_load(metaslab_t *);
 void metaslab_unload(metaslab_t *);
 

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h     Wed Oct 16 06:18:37 2019        (r353611)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h     Wed Oct 16 06:26:51 2019        (r353612)
@@ -370,8 +370,8 @@ struct metaslab {
        uint64_t        ms_initializing; /* leaves initializing this ms */
 
        /*
-        * We must hold both ms_lock and ms_group->mg_lock in order to
-        * modify ms_loaded.
+        * We must always hold the ms_lock when modifying ms_loaded
+        * and ms_loading.
         */
        boolean_t       ms_loaded;
        boolean_t       ms_loading;

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c  Wed Oct 16 06:18:37 2019        (r353611)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c  Wed Oct 16 06:26:51 2019        (r353612)
@@ -163,34 +163,34 @@ static vdev_ops_t *vdev_ops_table[] = {
 };
 
 
-/* target number of metaslabs per top-level vdev */
-int vdev_max_ms_count = 200;
-SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count, CTLFLAG_RWTUN,
-    &vdev_max_ms_count, 0,
+/* default target for number of metaslabs per top-level vdev */
+int zfs_vdev_default_ms_count = 200;
+SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_count, CTLFLAG_RWTUN,
+    &zfs_vdev_default_ms_count, 0,
     "Target number of metaslabs per top-level vdev");
 
 /* minimum number of metaslabs per top-level vdev */
-int vdev_min_ms_count = 16;
+int zfs_vdev_min_ms_count = 16;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, min_ms_count, CTLFLAG_RWTUN,
-    &vdev_min_ms_count, 0,
+    &zfs_vdev_min_ms_count, 0,
     "Minimum number of metaslabs per top-level vdev");
 
 /* practical upper limit of total metaslabs per top-level vdev */
-int vdev_ms_count_limit = 1ULL << 17;
+int zfs_vdev_ms_count_limit = 1ULL << 17;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_count_limit, CTLFLAG_RWTUN,
-    &vdev_ms_count_limit, 0,
+    &zfs_vdev_ms_count_limit, 0,
     "Maximum number of metaslabs per top-level vdev");
 
 /* lower limit for metaslab size (512M) */
-int vdev_default_ms_shift = 29;
+int zfs_vdev_default_ms_shift = 29;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, default_ms_shift, CTLFLAG_RWTUN,
-    &vdev_default_ms_shift, 0,
+    &zfs_vdev_default_ms_shift, 0,
     "Default shift between vdev size and number of metaslabs");
 
-/* upper limit for metaslab size (256G) */
-int vdev_max_ms_shift = 38;
+/* upper limit for metaslab size (16G) */
+int zfs_vdev_max_ms_shift = 34;
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, max_ms_shift, CTLFLAG_RWTUN,
-    &vdev_max_ms_shift, 0,
+    &zfs_vdev_max_ms_shift, 0,
     "Maximum shift between vdev size and number of metaslabs");
 
 boolean_t vdev_validate_skip = B_FALSE;
@@ -2205,16 +2205,24 @@ void
 vdev_metaslab_set_size(vdev_t *vd)
 {
        uint64_t asize = vd->vdev_asize;
-       uint64_t ms_count = asize >> vdev_default_ms_shift;
+       uint64_t ms_count = asize >> zfs_vdev_default_ms_shift;
        uint64_t ms_shift;
 
        /*
         * There are two dimensions to the metaslab sizing calculation:
         * the size of the metaslab and the count of metaslabs per vdev.
-        * In general, we aim for vdev_max_ms_count (200) metaslabs. The
-        * range of the dimensions are as follows:
         *
-        *      2^29 <= ms_size  <= 2^38
+        * The default values used below are a good balance between memory
+        * usage (larger metaslab size means more memory needed for loaded
+        * metaslabs; more metaslabs means more memory needed for the
+        * metaslab_t structs), metaslab load time (larger metaslabs take
+        * longer to load), and metaslab sync time (more metaslabs means
+        * more time spent syncing all of them).
+        *
+        * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
+        * The range of the dimensions are as follows:
+        *
+        *      2^29 <= ms_size  <= 2^34
         *        16 <= ms_count <= 131,072
         *
         * On the lower end of vdev sizes, we aim for metaslabs sizes of
@@ -2223,35 +2231,41 @@ vdev_metaslab_set_size(vdev_t *vd)
         * of at least 16 metaslabs will override this minimum size goal.
         *
         * On the upper end of vdev sizes, we aim for a maximum metaslab
-        * size of 256GB.  However, we will cap the total count to 2^17
-        * metaslabs to keep our memory footprint in check.
+        * size of 16GB.  However, we will cap the total count to 2^17
+        * metaslabs to keep our memory footprint in check and let the
+        * metaslab size grow from there if that limit is hit.
         *
         * The net effect of applying above constrains is summarized below.
         *
-        *      vdev size       metaslab count
-        *      -------------|-----------------
-        *      < 8GB           ~16
-        *      8GB - 100GB     one per 512MB
-        *      100GB - 50TB    ~200
-        *      50TB - 32PB     one per 256GB
-        *      > 32PB          ~131,072
-        *      -------------------------------
+        *   vdev size       metaslab count
+        *  --------------|-----------------
+        *      < 8GB        ~16
+        *  8GB   - 100GB   one per 512MB
+        *  100GB - 3TB     ~200
+        *  3TB   - 2PB     one per 16GB
+        *      > 2PB       ~131,072
+        *  --------------------------------
+        *
+        *  Finally, note that all of the above calculate the initial
+        *  number of metaslabs. Expanding a top-level vdev will result
+        *  in additional metaslabs being allocated making it possible
+        *  to exceed the zfs_vdev_ms_count_limit.
         */
 
-       if (ms_count < vdev_min_ms_count)
-               ms_shift = highbit64(asize / vdev_min_ms_count);
-       else if (ms_count > vdev_max_ms_count)
-               ms_shift = highbit64(asize / vdev_max_ms_count);
+       if (ms_count < zfs_vdev_min_ms_count)
+               ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
+       else if (ms_count > zfs_vdev_default_ms_count)
+               ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
        else
-               ms_shift = vdev_default_ms_shift;
+               ms_shift = zfs_vdev_default_ms_shift;
 
        if (ms_shift < SPA_MAXBLOCKSHIFT) {
                ms_shift = SPA_MAXBLOCKSHIFT;
-       } else if (ms_shift > vdev_max_ms_shift) {
-               ms_shift = vdev_max_ms_shift;
+       } else if (ms_shift > zfs_vdev_max_ms_shift) {
+               ms_shift = zfs_vdev_max_ms_shift;
                /* cap the total count to constrain memory footprint */
-               if ((asize >> ms_shift) > vdev_ms_count_limit)
-                       ms_shift = highbit64(asize / vdev_ms_count_limit);
+               if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
+                       ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
        }
 
        vd->vdev_ms_shift = ms_shift;
@@ -3611,13 +3625,17 @@ vdev_accessible(vdev_t *vd, zio_t *zio)
 boolean_t
 vdev_is_spacemap_addressable(vdev_t *vd)
 {
+       if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2))
+               return (B_TRUE);
+
        /*
-        * Assuming 47 bits of the space map entry dedicated for the entry's
-        * offset (see description in space_map.h), we calculate the maximum
-        * address that can be described by a space map entry for the given
-        * device.
+        * If double-word space map entries are not enabled we assume
+        * 47 bits of the space map entry are dedicated to the entry's
+        * offset (see SM_OFFSET_BITS in space_map.h). We then use that
+        * to calculate the maximum address that can be described by a
+        * space map entry for the given device.
         */
-       uint64_t shift = vd->vdev_ashift + 47;
+       uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS;
 
        if (shift >= 63) /* detect potential overflow */
                return (B_TRUE);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c       Wed Oct 16 06:18:37 2019        (r353611)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_initialize.c       Wed Oct 16 06:26:51 2019        (r353612)
@@ -353,16 +353,6 @@ vdev_initialize_ranges(vdev_t *vd, abd_t *data)
 }
 
 static void
-vdev_initialize_ms_load(metaslab_t *msp)
-{
-       ASSERT(MUTEX_HELD(&msp->ms_lock));
-
-       metaslab_load_wait(msp);
-       if (!msp->ms_loaded)
-               VERIFY0(metaslab_load(msp));
-}
-
-static void
 vdev_initialize_mg_wait(metaslab_group_t *mg)
 {
        ASSERT(MUTEX_HELD(&mg->mg_ms_initialize_lock));
@@ -484,10 +474,10 @@ vdev_initialize_calculate_progress(vdev_t *vd)
                 * metaslab. Load it and walk the free tree for more accurate
                 * progress estimation.
                 */
-               vdev_initialize_ms_load(msp);
+               VERIFY0(metaslab_load(msp));
 
-               for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root); rs;
-                   rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
+               for (range_seg_t *rs = avl_first(&msp->ms_allocatable->rt_root);
+                   rs; rs = AVL_NEXT(&msp->ms_allocatable->rt_root, rs)) {
                        logical_rs.rs_start = rs->rs_start;
                        logical_rs.rs_end = rs->rs_end;
                        vdev_xlate(vd, &logical_rs, &physical_rs);
@@ -615,7 +605,7 @@ vdev_initialize_thread(void *arg)
 
                vdev_initialize_ms_mark(msp);
                mutex_enter(&msp->ms_lock);
-               vdev_initialize_ms_load(msp);
+               VERIFY0(metaslab_load(msp));
 
                range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add,
                    vd);
_______________________________________________
svn-src-head@freebsd.org mailing list
https://lists.freebsd.org/mailman/listinfo/svn-src-head
To unsubscribe, send any mail to "svn-src-head-unsubscr...@freebsd.org"

Reply via email to