Author: mav
Date: Thu Aug  2 21:59:46 2018
New Revision: 337191
URL: https://svnweb.freebsd.org/changeset/base/337191

Log:
  MFV r337190: 9486 reduce memory used by device removal on fragmented pools
  
  In the most fragmented real-world cases, this reduces memory used by the
  mapping from ~1GB to ~50MB of RAM per 1TB of storage removed. Less
  fragmented cases will typically also see around 50-100MB of RAM per 1TB
  of storage.
  
  illumos/illumos-gate@cfd63e1b1bcf7ba4bf72f55ddbd87ce008d2986d
  
  Reviewed by: George Wilson <george.wil...@delphix.com>
  Reviewed by: Serapheim Dimitropoulos <serapheim.dimi...@delphix.com>
  Reviewed by: Brian Behlendorf <behlendo...@llnl.gov>
  Reviewed by: Tim Chase <t...@chase2k.com>
  Approved by: Robert Mustacchi <r...@joyent.com>
  Author:     Matthew Ahrens <mahr...@delphix.com>

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
Directory Properties:
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c	Thu Aug  2 21:57:59 2018	(r337190)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c	Thu Aug  2 21:59:46 2018	(r337191)
@@ -491,7 +491,6 @@ range_tree_resize_segment(range_tree_t *rt, range_seg_
 static range_seg_t *
 range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
 {
-       avl_index_t where;
        range_seg_t rsearch;
        uint64_t end = start + size;
 
@@ -499,7 +498,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start,
 
        rsearch.rs_start = start;
        rsearch.rs_end = end;
-       return (avl_find(&rt->rt_root, &rsearch, &where));
+       return (avl_find(&rt->rt_root, &rsearch, NULL));
 }
 
 range_seg_t *
@@ -650,4 +649,24 @@ range_tree_is_empty(range_tree_t *rt)
 {
        ASSERT(rt != NULL);
        return (range_tree_space(rt) == 0);
+}
+
+uint64_t
+range_tree_min(range_tree_t *rt)
+{
+       range_seg_t *rs = avl_first(&rt->rt_root);
+       return (rs != NULL ? rs->rs_start : 0);
+}
+
+uint64_t
+range_tree_max(range_tree_t *rt)
+{
+       range_seg_t *rs = avl_last(&rt->rt_root);
+       return (rs != NULL ? rs->rs_end : 0);
+}
+
+uint64_t
+range_tree_span(range_tree_t *rt)
+{
+       return (range_tree_max(rt) - range_tree_min(rt));
 }
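
A note on the new accessors: range_tree_span() is the distance from the
lowest rs_start to the highest rs_end, so it includes any holes between
segments, while the existing range_tree_space() sums only the bytes the
segments actually cover.  A minimal standalone sketch of the same
semantics, using a sorted array in place of the AVL-backed tree (the
names here are illustrative, not the ZFS API):

#include <stdio.h>
#include <stdint.h>

/* Simplified stand-in for range_seg_t: the half-open range [rs_start, rs_end). */
typedef struct {
	uint64_t rs_start;
	uint64_t rs_end;
} seg_t;

/* Segments are kept sorted and non-overlapping, as the range tree guarantees. */
static uint64_t
segs_min(const seg_t *segs, int n)
{
	return (n > 0 ? segs[0].rs_start : 0);
}

static uint64_t
segs_max(const seg_t *segs, int n)
{
	return (n > 0 ? segs[n - 1].rs_end : 0);
}

/* Span covers the holes between segments; space does not. */
static uint64_t
segs_span(const seg_t *segs, int n)
{
	return (segs_max(segs, n) - segs_min(segs, n));
}

static uint64_t
segs_space(const seg_t *segs, int n)
{
	uint64_t sum = 0;

	for (int i = 0; i < n; i++)
		sum += segs[i].rs_end - segs[i].rs_start;
	return (sum);
}

int
main(void)
{
	/* Two 4K segments separated by an 8K hole. */
	seg_t segs[] = { { 0, 4096 }, { 12288, 16384 } };

	printf("span  = %ju\n", (uintmax_t)segs_span(segs, 2));   /* 16384 */
	printf("space = %ju\n", (uintmax_t)segs_space(segs, 2));  /* 8192 */
	return (0);
}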

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h	Thu Aug  2 21:57:59 2018	(r337190)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h	Thu Aug  2 21:59:46 2018	(r337191)
@@ -95,6 +95,9 @@ boolean_t range_tree_is_empty(range_tree_t *rt);
 void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
 void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
 void range_tree_stat_verify(range_tree_t *rt);
+uint64_t range_tree_min(range_tree_t *rt);
+uint64_t range_tree_max(range_tree_t *rt);
+uint64_t range_tree_span(range_tree_t *rt);
 
 void range_tree_add(void *arg, uint64_t start, uint64_t size);
 void range_tree_remove(void *arg, uint64_t start, uint64_t size);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h	Thu Aug  2 21:57:59 2018	(r337190)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h	Thu Aug  2 21:59:46 2018	(r337191)
@@ -86,6 +86,9 @@ extern void spa_vdev_remove_suspend(spa_t *);
 extern int spa_vdev_remove_cancel(spa_t *);
 extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);
 
+extern int vdev_removal_max_span;
+extern int zfs_remove_max_segment;
+
 #ifdef __cplusplus
 }
 #endif

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c	Thu Aug  2 21:57:59 2018	(r337190)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c	Thu Aug  2 21:59:46 2018	(r337191)
@@ -33,15 +33,15 @@
  *     1. Uniquely identify this device as part of a ZFS pool and confirm its
  *        identity within the pool.
  *
- *     2. Verify that all the devices given in a configuration are present
+ *     2. Verify that all the devices given in a configuration are present
  *         within the pool.
  *
- *     3. Determine the uberblock for the pool.
+ *     3. Determine the uberblock for the pool.
  *
- *     4. In case of an import operation, determine the configuration of the
+ *     4. In case of an import operation, determine the configuration of the
  *         toplevel vdev of which it is a part.
  *
- *     5. If an import operation cannot find all the devices in the pool,
+ *     5. If an import operation cannot find all the devices in the pool,
  *         provide enough information to the administrator to determine which
  *         devices are missing.
  *
@@ -77,9 +77,9 @@
  * In order to identify which labels are valid, the labels are written in the
  * following manner:
  *
- *     1. For each vdev, update 'L1' to the new label
- *     2. Update the uberblock
- *     3. For each vdev, update 'L2' to the new label
+ *     1. For each vdev, update 'L1' to the new label
+ *     2. Update the uberblock
+ *     3. For each vdev, update 'L2' to the new label
  *
  * Given arbitrary failure, we can determine the correct label to use based on
  * the transaction group.  If we fail after updating L1 but before updating the
@@ -117,19 +117,19 @@
  *
  * The nvlist describing the pool and vdev contains the following elements:
  *
- *     version         ZFS on-disk version
- *     name            Pool name
- *     state           Pool state
- *     txg             Transaction group in which this label was written
- *     pool_guid       Unique identifier for this pool
- *     vdev_tree       An nvlist describing vdev tree.
+ *     version         ZFS on-disk version
+ *     name            Pool name
+ *     state           Pool state
+ *     txg             Transaction group in which this label was written
+ *     pool_guid       Unique identifier for this pool
+ *     vdev_tree       An nvlist describing vdev tree.
  *     features_for_read
  *                     An nvlist of the features necessary for reading the MOS.
  *
  * Each leaf device label also contains the following:
  *
- *     top_guid        Unique ID for top-level vdev in which this is contained
- *     guid            Unique ID for the leaf vdev
+ *     top_guid        Unique ID for top-level vdev in which this is contained
+ *     guid            Unique ID for the leaf vdev
  *
  * The 'vs' configuration follows the format described in 'spa_config.c'.
  */
@@ -396,22 +396,33 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t
                         * histograms.
                         */
                        uint64_t seg_count = 0;
+                       uint64_t to_alloc = vd->vdev_stat.vs_alloc;
 
                        /*
                         * There are the same number of allocated segments
                         * as free segments, so we will have at least one
-                        * entry per free segment.
+                        * entry per free segment.  However, small free
+                        * segments (smaller than vdev_removal_max_span)
+                        * will be combined with adjacent allocated segments
+                        * as a single mapping.
                         */
                        for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
-                               seg_count += vd->vdev_mg->mg_histogram[i];
+                               if (1ULL << (i + 1) < vdev_removal_max_span) {
+                                       to_alloc +=
+                                           vd->vdev_mg->mg_histogram[i] <<
+                                           i + 1;
+                               } else {
+                                       seg_count +=
+                                           vd->vdev_mg->mg_histogram[i];
+                               }
                        }
 
                        /*
-                        * The maximum length of a mapping is SPA_MAXBLOCKSIZE,
-                        * so we need at least one entry per SPA_MAXBLOCKSIZE
-                        * of allocated data.
+                        * The maximum length of a mapping is
+                        * zfs_remove_max_segment, so we need at least one entry
+                        * per zfs_remove_max_segment of allocated data.
                         */
-                       seg_count += vd->vdev_stat.vs_alloc / SPA_MAXBLOCKSIZE;
+                       seg_count += to_alloc / zfs_remove_max_segment;
 
                        fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
                            seg_count *
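
To see what the new estimate changes, a hedged worked example (the
histogram values are made up): with the defaults vdev_removal_max_span
= 32K and zfs_remove_max_segment = 1M, free-segment buckets that are
certainly smaller than 32K (2^(i+1) < 32768) no longer cost one mapping
entry per free segment; their bytes are folded into to_alloc and
amortized at one entry per 1MB instead.  A standalone sketch of the
arithmetic, mirroring the hunk above:

#include <stdio.h>
#include <stdint.h>

#define	RANGE_TREE_HISTOGRAM_SIZE	64

/* Defaults from vdev_removal.c. */
static int vdev_removal_max_span = 32 * 1024;
static int zfs_remove_max_segment = 1024 * 1024;

/*
 * Estimate the indirect mapping size as vdev_config_generate() now does:
 * buckets of small free segments are folded into to_alloc, larger ones
 * count as one entry apiece.
 */
static uint64_t
estimate_seg_count(const uint64_t *hist, uint64_t vs_alloc)
{
	uint64_t seg_count = 0;
	uint64_t to_alloc = vs_alloc;

	for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
		if (1ULL << (i + 1) < (uint64_t)vdev_removal_max_span)
			to_alloc += hist[i] << (i + 1);
		else
			seg_count += hist[i];
	}
	return (seg_count + to_alloc / zfs_remove_max_segment);
}

int
main(void)
{
	uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 };

	hist[13] = 1000000;	/* hypothetical: 1M free segments of 8K-16K */
	hist[20] = 1000;	/* hypothetical: 1K free segments of 1M-2M */

	/* 100GB allocated on the removing vdev. */
	printf("estimated entries: %ju\n",
	    (uintmax_t)estimate_seg_count(hist, 100ULL << 30));
	return (0);
}

Under the old estimate, the million small free segments alone would
have contributed a million mapping entries; here they are absorbed into
the per-1MB term.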

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c	Thu Aug  2 21:57:59 2018	(r337190)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c	Thu Aug  2 21:59:46 2018	(r337191)
@@ -106,6 +106,24 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
 int zfs_remove_max_segment = 1024 * 1024;
 
 /*
+ * Allow a remap segment to span free chunks of at most this size. The main
+ * impact of a larger span is that we will read and write larger, more
+ * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
+ * for iops.  The value here was chosen to align with
+ * zfs_vdev_read_gap_limit, which is a similar concept when doing regular
+ * reads (but there's no reason it has to be the same).
+ *
+ * Additionally, a higher span will have the following relatively minor
+ * effects:
+ *  - the mapping will be smaller, since one entry can cover more allocated
+ *    segments
+ *  - more of the fragmentation in the removing device will be preserved
+ *  - we'll do larger allocations, which may fail and fall back on smaller
+ *    allocations
+ */
+int vdev_removal_max_span = 32 * 1024;
+
+/*
  * This is used by the test suite so that it can ensure that certain
  * actions happen while in the middle of a removal.
  */
@@ -726,13 +744,52 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
        spa_sync_removing_state(spa, tx);
 }
 
+typedef struct vdev_copy_segment_arg {
+       spa_t *vcsa_spa;
+       dva_t *vcsa_dest_dva;
+       uint64_t vcsa_txg;
+       range_tree_t *vcsa_obsolete_segs;
+} vdev_copy_segment_arg_t;
+
+static void
+unalloc_seg(void *arg, uint64_t start, uint64_t size)
+{
+       vdev_copy_segment_arg_t *vcsa = arg;
+       spa_t *spa = vcsa->vcsa_spa;
+       blkptr_t bp = { 0 };
+
+       BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL);
+       BP_SET_LSIZE(&bp, size);
+       BP_SET_PSIZE(&bp, size);
+       BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+       BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF);
+       BP_SET_TYPE(&bp, DMU_OT_NONE);
+       BP_SET_LEVEL(&bp, 0);
+       BP_SET_DEDUP(&bp, 0);
+       BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
+
+       DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva));
+       DVA_SET_OFFSET(&bp.blk_dva[0],
+           DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start);
+       DVA_SET_ASIZE(&bp.blk_dva[0], size);
+
+       zio_free(spa, vcsa->vcsa_txg, &bp);
+}
+
 /*
  * All reads and writes associated with a call to spa_vdev_copy_segment()
  * are done.
  */
 static void
-spa_vdev_copy_nullzio_done(zio_t *zio)
+spa_vdev_copy_segment_done(zio_t *zio)
 {
+       vdev_copy_segment_arg_t *vcsa = zio->io_private;
+
+       range_tree_vacate(vcsa->vcsa_obsolete_segs,
+           unalloc_seg, vcsa);
+       range_tree_destroy(vcsa->vcsa_obsolete_segs);
+       kmem_free(vcsa, sizeof (*vcsa));
+
        spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
 }
 
@@ -849,7 +906,8 @@ spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *n
  * read from the old location and write to the new location.
  */
 static int
-spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
+spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
+    uint64_t maxalloc, uint64_t txg,
     vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
 {
        metaslab_group_t *mg = vd->vdev_mg;
@@ -857,9 +915,40 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint
        spa_vdev_removal_t *svr = spa->spa_vdev_removal;
        vdev_indirect_mapping_entry_t *entry;
        dva_t dst = { 0 };
+       uint64_t start = range_tree_min(segs);
 
-       ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+       ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
 
+       uint64_t size = range_tree_span(segs);
+       if (range_tree_span(segs) > maxalloc) {
+               /*
+                * We can't allocate all the segments.  Prefer to end
+                * the allocation at the end of a segment, thus avoiding
+                * additional split blocks.
+                */
+               range_seg_t search;
+               avl_index_t where;
+               search.rs_start = start + maxalloc;
+               search.rs_end = search.rs_start;
+               range_seg_t *rs = avl_find(&segs->rt_root, &search, &where);
+               if (rs == NULL) {
+                       rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE);
+               } else {
+                       rs = AVL_PREV(&segs->rt_root, rs);
+               }
+               if (rs != NULL) {
+                       size = rs->rs_end - start;
+               } else {
+                       /*
+                        * There are no segments that end before maxalloc.
+                        * I.e. the first segment is larger than maxalloc,
+                        * so we must split it.
+                        */
+                       size = maxalloc;
+               }
+       }
+       ASSERT3U(size, <=, maxalloc);
+
        /*
         * We use allocator 0 for this I/O because we don't expect device remap
         * to be the steady state of the system, so parallelizing is not as
@@ -873,6 +962,31 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint
                return (error);
 
        /*
+        * Determine the ranges that are not actually needed.  Offsets are
+        * relative to the start of the range to be copied (i.e. relative to the
+        * local variable "start").
+        */
+       range_tree_t *obsolete_segs = range_tree_create(NULL, NULL);
+
+       range_seg_t *rs = avl_first(&segs->rt_root);
+       ASSERT3U(rs->rs_start, ==, start);
+       uint64_t prev_seg_end = rs->rs_end;
+       while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) {
+               if (rs->rs_start >= start + size) {
+                       break;
+               } else {
+                       range_tree_add(obsolete_segs,
+                           prev_seg_end - start,
+                           rs->rs_start - prev_seg_end);
+               }
+               prev_seg_end = rs->rs_end;
+       }
+       /* We don't end in the middle of an obsolete range */
+       ASSERT3U(start + size, <=, prev_seg_end);
+
+       range_tree_clear(segs, start, size);
+
+       /*
         * We can't have any padding of the allocated size, otherwise we will
         * misunderstand what's allocated, and the size of the mapping.
         * The caller ensures this will be true by passing in a size that is
@@ -883,13 +997,22 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint
        entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
        DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
        entry->vime_mapping.vimep_dst = dst;
+       if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+               entry->vime_obsolete_count = range_tree_space(obsolete_segs);
+       }
 
+       vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
+       vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
+       vcsa->vcsa_obsolete_segs = obsolete_segs;
+       vcsa->vcsa_spa = spa;
+       vcsa->vcsa_txg = txg;
+
        /*
         * See comment before spa_vdev_copy_one_child().
         */
        spa_config_enter(spa, SCL_STATE, spa, RW_READER);
        zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
-           spa_vdev_copy_nullzio_done, NULL, 0);
+           spa_vdev_copy_segment_done, vcsa, 0);
        vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
        if (dest_vd->vdev_ops == &vdev_mirror_ops) {
                for (int i = 0; i < dest_vd->vdev_children; i++) {
@@ -1092,39 +1215,78 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr
 
        mutex_enter(&svr->svr_lock);
 
-       range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
-       if (rs == NULL) {
+       /*
+        * Determine how big of a chunk to copy.  We can allocate up
+        * to max_alloc bytes, and we can span up to vdev_removal_max_span
+        * bytes of unallocated space at a time.  "segs" will track the
+        * allocated segments that we are copying.  We may also be copying
+        * free segments (of up to vdev_removal_max_span bytes).
+        */
+       range_tree_t *segs = range_tree_create(NULL, NULL);
+       for (;;) {
+               range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
+               if (rs == NULL)
+                       break;
+
+               uint64_t seg_length;
+
+               if (range_tree_is_empty(segs)) {
+                       /* need to truncate the first seg based on max_alloc */
+                       seg_length =
+                           MIN(rs->rs_end - rs->rs_start, *max_alloc);
+               } else {
+                       if (rs->rs_start - range_tree_max(segs) >
+                           vdev_removal_max_span) {
+                               /*
+                                * Including this segment would cause us to
+                                * copy a larger unneeded chunk than is allowed.
+                                */
+                               break;
+                       } else if (rs->rs_end - range_tree_min(segs) >
+                           *max_alloc) {
+                               /*
+                                * This additional segment would extend past
+                                * max_alloc. Rather than splitting this
+                                * segment, leave it for the next mapping.
+                                */
+                               break;
+                       } else {
+                               seg_length = rs->rs_end - rs->rs_start;
+                       }
+               }
+
+               range_tree_add(segs, rs->rs_start, seg_length);
+               range_tree_remove(svr->svr_allocd_segs,
+                   rs->rs_start, seg_length);
+       }
+
+       if (range_tree_is_empty(segs)) {
                mutex_exit(&svr->svr_lock);
+               range_tree_destroy(segs);
                return;
        }
-       uint64_t offset = rs->rs_start;
-       uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc);
 
-       range_tree_remove(svr->svr_allocd_segs, offset, length);
-
        if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
                dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
                    svr, 0, ZFS_SPACE_CHECK_NONE, tx);
        }
 
-       svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length;
+       svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
 
        /*
         * Note: this is the amount of *allocated* space
         * that we are taking care of each txg.
         */
-       svr->svr_bytes_done[txg & TXG_MASK] += length;
+       svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs);
 
        mutex_exit(&svr->svr_lock);
 
        zio_alloc_list_t zal;
        metaslab_trace_init(&zal);
-       uint64_t thismax = *max_alloc;
-       while (length > 0) {
-               uint64_t mylen = MIN(length, thismax);
-
+       uint64_t thismax = SPA_MAXBLOCKSIZE;
+       while (!range_tree_is_empty(segs)) {
                int error = spa_vdev_copy_segment(vd,
-                   offset, mylen, txg, vca, &zal);
+                   segs, thismax, txg, vca, &zal);
 
                if (error == ENOSPC) {
                        /*
@@ -1138,18 +1300,17 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr
                         */
                        ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
                        ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
-                       thismax = P2ROUNDUP(mylen / 2,
+                       uint64_t attempted =
+                           MIN(range_tree_span(segs), thismax);
+                       thismax = P2ROUNDUP(attempted / 2,
                            1 << spa->spa_max_ashift);
-                       ASSERT3U(thismax, <, mylen);
                        /*
                         * The minimum-size allocation can not fail.
                         */
-                       ASSERT3U(mylen, >, 1 << spa->spa_max_ashift);
-                       *max_alloc = mylen - (1 << spa->spa_max_ashift);
+                       ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
+                       *max_alloc = attempted - (1 << spa->spa_max_ashift);
                } else {
                        ASSERT0(error);
-                       length -= mylen;
-                       offset += mylen;
 
                        /*
                         * We've performed an allocation, so reset the
@@ -1160,6 +1321,7 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr
                }
        }
        metaslab_trace_fini(&zal);
+       range_tree_destroy(segs);
 }
 
 /*
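
The heart of the change is the gathering loop in spa_vdev_copy_impl():
instead of turning every allocated segment into its own mapping entry,
it accumulates consecutive allocated segments into one chunk as long as
the hole before the next segment is at most vdev_removal_max_span and
the chunk's total span stays within max_alloc.  The holes are read and
written along with the data and then freed again via the obsolete_segs
tree, which is the bandwidth-for-iops (and mapping-memory) trade
described in the tunable's comment.  A simplified, self-contained
sketch of the gathering policy, using a sorted array in place of
svr_allocd_segs (illustrative only; the kernel also keeps the remainder
of an oversized first segment, which this sketch omits):

#include <stdio.h>
#include <stdint.h>

static int vdev_removal_max_span = 32 * 1024;	/* default from the patch */

typedef struct {
	uint64_t rs_start;
	uint64_t rs_end;
} seg_t;

/*
 * Gather segments starting at *idx into one chunk.  Stop when the hole
 * before the next segment exceeds vdev_removal_max_span, or when taking
 * it would push the chunk's span past max_alloc.  Returns the span of
 * the chunk and advances *idx past the segments consumed.
 */
static uint64_t
gather_chunk(const seg_t *segs, int nsegs, int *idx, uint64_t max_alloc)
{
	if (*idx >= nsegs)
		return (0);

	uint64_t chunk_start = segs[*idx].rs_start;
	uint64_t chunk_end = segs[*idx].rs_end;
	(*idx)++;

	/*
	 * An oversized first segment is truncated to max_alloc; the
	 * kernel splits it and re-queues the rest, this sketch does not.
	 */
	if (chunk_end - chunk_start > max_alloc)
		return (max_alloc);

	while (*idx < nsegs) {
		const seg_t *rs = &segs[*idx];

		if (rs->rs_start - chunk_end > (uint64_t)vdev_removal_max_span)
			break;	/* hole too big: too much dead data to copy */
		if (rs->rs_end - chunk_start > max_alloc)
			break;	/* would exceed one allocation; next chunk */
		chunk_end = rs->rs_end;
		(*idx)++;
	}
	return (chunk_end - chunk_start);
}

int
main(void)
{
	/* Three 16K allocations with an 8K and then a 64K hole between. */
	seg_t segs[] = {
		{ 0, 16384 },
		{ 24576, 40960 },
		{ 106496, 122880 },
	};
	int i = 0;
	uint64_t span;

	while ((span = gather_chunk(segs, 3, &i, 1024 * 1024)) != 0)
		printf("chunk span %ju\n", (uintmax_t)span);
	/* Prints 40960 (first two merged across the 8K hole), then 16384. */
	return (0);
}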