Author: mav
Date: Thu Aug 2 21:59:46 2018
New Revision: 337191
URL: https://svnweb.freebsd.org/changeset/base/337191
Log:
  MFV r337190:
  9486 reduce memory used by device removal on fragmented pools

  In the most fragmented real-world cases, this reduces memory used by the
  mapping from ~1GB to ~50MB of RAM per 1TB of storage removed. Less
  fragmented cases will typically also see around 50-100MB of RAM per 1TB
  of storage.

  illumos/illumos-gate@cfd63e1b1bcf7ba4bf72f55ddbd87ce008d2986d

  Reviewed by: George Wilson <george.wil...@delphix.com>
  Reviewed by: Serapheim Dimitropoulos <serapheim.dimi...@delphix.com>
  Reviewed by: Brian Behlendorf <behlendo...@llnl.gov>
  Reviewed by: Tim Chase <t...@chase2k.com>
  Approved by: Robert Mustacchi <r...@joyent.com>
  Author: Matthew Ahrens <mahr...@delphix.com>

Modified:
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
  head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
Directory Properties:
  head/sys/cddl/contrib/opensolaris/   (props changed)

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c	Thu Aug 2 21:57:59 2018	(r337190)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c	Thu Aug 2 21:59:46 2018	(r337191)
@@ -491,7 +491,6 @@ range_tree_resize_segment(range_tree_t *rt, range_seg_
 static range_seg_t *
 range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
 {
-    avl_index_t where;
     range_seg_t rsearch;
     uint64_t end = start + size;
 
@@ -499,7 +498,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start,
     rsearch.rs_start = start;
     rsearch.rs_end = end;
 
-    return (avl_find(&rt->rt_root, &rsearch, &where));
+    return (avl_find(&rt->rt_root, &rsearch, NULL));
 }
 
 range_seg_t *
@@ -650,4 +649,24 @@ range_tree_is_empty(range_tree_t *rt)
 {
     ASSERT(rt != NULL);
     return (range_tree_space(rt) == 0);
+}
+
+uint64_t
+range_tree_min(range_tree_t *rt)
+{
+    range_seg_t *rs = avl_first(&rt->rt_root);
+    return (rs != NULL ? rs->rs_start : 0);
+}
+
+uint64_t
+range_tree_max(range_tree_t *rt)
+{
+    range_seg_t *rs = avl_last(&rt->rt_root);
+    return (rs != NULL ? rs->rs_end : 0);
+}
+
+uint64_t
+range_tree_span(range_tree_t *rt)
+{
+    return (range_tree_max(rt) - range_tree_min(rt));
 }
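The three helpers added above give cheap access to the lowest offset, highest
offset, and overall extent covered by a range tree. A minimal usage sketch
with hypothetical offsets; it assumes only the range_tree_create()/
range_tree_add()/range_tree_vacate() API already used elsewhere in this
change:

    range_tree_t *rt = range_tree_create(NULL, NULL);
    range_tree_add(rt, 0x1000, 0x800);    /* segment [0x1000, 0x1800) */
    range_tree_add(rt, 0x4000, 0x400);    /* segment [0x4000, 0x4400) */

    /*
     * range_tree_min(rt) == 0x1000 and range_tree_max(rt) == 0x4400,
     * so range_tree_span(rt) == 0x3400. Unlike range_tree_space(),
     * which returns 0xc00 here, the span includes the 0x2800-byte hole
     * between the two segments -- exactly the quantity the removal
     * code needs when deciding how large a chunk to copy at once.
     */
    uint64_t span = range_tree_span(rt);

    range_tree_vacate(rt, NULL, NULL);    /* tree must be empty ... */
    range_tree_destroy(rt);               /* ... before destroying it */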

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h	Thu Aug 2 21:57:59 2018	(r337190)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/range_tree.h	Thu Aug 2 21:59:46 2018	(r337191)
@@ -95,6 +95,9 @@ boolean_t range_tree_is_empty(range_tree_t *rt);
 void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
 void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
 void range_tree_stat_verify(range_tree_t *rt);
+uint64_t range_tree_min(range_tree_t *rt);
+uint64_t range_tree_max(range_tree_t *rt);
+uint64_t range_tree_span(range_tree_t *rt);
 
 void range_tree_add(void *arg, uint64_t start, uint64_t size);
 void range_tree_remove(void *arg, uint64_t start, uint64_t size);

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h	Thu Aug 2 21:57:59 2018	(r337190)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_removal.h	Thu Aug 2 21:59:46 2018	(r337191)
@@ -86,6 +86,9 @@ extern void spa_vdev_remove_suspend(spa_t *);
 extern int spa_vdev_remove_cancel(spa_t *);
 extern void spa_vdev_removal_destroy(spa_vdev_removal_t *svr);
 
+extern int vdev_removal_max_span;
+extern int zfs_remove_max_segment;
+
 #ifdef __cplusplus
 }
 #endif

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c	Thu Aug 2 21:57:59 2018	(r337190)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c	Thu Aug 2 21:59:46 2018	(r337191)
@@ -33,15 +33,15 @@
  * 1. Uniquely identify this device as part of a ZFS pool and confirm its
  *    identity within the pool.
  *
- * 2. Verify that all the devices given in a configuration are present
+ * 2. Verify that all the devices given in a configuration are present
  *    within the pool.
  *
- * 3. Determine the uberblock for the pool.
+ * 3. Determine the uberblock for the pool.
  *
- * 4. In case of an import operation, determine the configuration of the
+ * 4. In case of an import operation, determine the configuration of the
  *    toplevel vdev of which it is a part.
  *
- * 5. If an import operation cannot find all the devices in the pool,
+ * 5. If an import operation cannot find all the devices in the pool,
  *    provide enough information to the administrator to determine which
  *    devices are missing.
  *
@@ -77,9 +77,9 @@
  * In order to identify which labels are valid, the labels are written in the
  * following manner:
  *
- * 1. For each vdev, update 'L1' to the new label
- * 2. Update the uberblock
- * 3. For each vdev, update 'L2' to the new label
+ * 1. For each vdev, update 'L1' to the new label
+ * 2. Update the uberblock
+ * 3. For each vdev, update 'L2' to the new label
  *
  * Given arbitrary failure, we can determine the correct label to use based on
 * the transaction group. If we fail after updating L1 but before updating the
@@ -117,19 +117,19 @@
 *
 * The nvlist describing the pool and vdev contains the following elements:
 *
- *	version		ZFS on-disk version
- *	name		Pool name
- *	state		Pool state
- *	txg		Transaction group in which this label was written
- *	pool_guid	Unique identifier for this pool
- *	vdev_tree	An nvlist describing vdev tree.
+ *	version		ZFS on-disk version
+ *	name		Pool name
+ *	state		Pool state
+ *	txg		Transaction group in which this label was written
+ *	pool_guid	Unique identifier for this pool
+ *	vdev_tree	An nvlist describing vdev tree.
 *	features_for_read
 *		An nvlist of the features necessary for reading the MOS.
 *
 * Each leaf device label also contains the following:
 *
- *	top_guid	Unique ID for top-level vdev in which this is contained
- *	guid		Unique ID for the leaf vdev
+ *	top_guid	Unique ID for top-level vdev in which this is contained
+ *	guid		Unique ID for the leaf vdev
 *
 * The 'vs' configuration follows the format described in 'spa_config.c'.
 */
@@ -396,22 +396,33 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t
             * histograms.
             */
            uint64_t seg_count = 0;
+           uint64_t to_alloc = vd->vdev_stat.vs_alloc;
 
            /*
             * There are the same number of allocated segments
             * as free segments, so we will have at least one
-            * entry per free segment.
+            * entry per free segment. However, small free
+            * segments (smaller than vdev_removal_max_span)
+            * will be combined with adjacent allocated segments
+            * as a single mapping.
             */
            for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {
-               seg_count += vd->vdev_mg->mg_histogram[i];
+               if (1ULL << (i + 1) < vdev_removal_max_span) {
+                   to_alloc +=
+                       vd->vdev_mg->mg_histogram[i] <<
+                       i + 1;
+               } else {
+                   seg_count +=
+                       vd->vdev_mg->mg_histogram[i];
+               }
            }
 
            /*
-            * The maximum length of a mapping is SPA_MAXBLOCKSIZE,
-            * so we need at least one entry per SPA_MAXBLOCKSIZE
-            * of allocated data.
+            * The maximum length of a mapping is
+            * zfs_remove_max_segment, so we need at least one entry
+            * per zfs_remove_max_segment of allocated data.
             */
-           seg_count += vd->vdev_stat.vs_alloc / SPA_MAXBLOCKSIZE;
+           seg_count += to_alloc / zfs_remove_max_segment;
 
            fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE,
               seg_count *
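A worked example of the new indirect-size estimate, with made-up numbers:
mg_histogram[i] counts free segments of size [2^i, 2^(i+1)), so with the
default vdev_removal_max_span of 32K, buckets 0 through 13 (free segments
smaller than 16K) are assumed to be absorbed into neighboring mappings and
are charged to to_alloc at their worst-case size (count << (i + 1), because
<< binds looser than +); buckets 14 and up contribute one mapping entry per
free segment. If to_alloc then totals 100GB with the default 1MB
zfs_remove_max_segment, that adds 100GB / 1MB = 102400 more entries to
seg_count.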

Modified: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c
==============================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c	Thu Aug 2 21:57:59 2018	(r337190)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_removal.c	Thu Aug 2 21:59:46 2018	(r337191)
@@ -106,6 +106,24 @@ int zfs_remove_max_copy_bytes = 64 * 1024 * 1024;
 int zfs_remove_max_segment = 1024 * 1024;
 
 /*
+ * Allow a remap segment to span free chunks of at most this size. The main
+ * impact of a larger span is that we will read and write larger, more
+ * contiguous chunks, with more "unnecessary" data -- trading off bandwidth
+ * for iops. The value here was chosen to align with
+ * zfs_vdev_read_gap_limit, which is a similar concept when doing regular
+ * reads (but there's no reason it has to be the same).
+ *
+ * Additionally, a higher span will have the following relatively minor
+ * effects:
+ *  - the mapping will be smaller, since one entry can cover more allocated
+ *    segments
+ *  - more of the fragmentation in the removing device will be preserved
+ *  - we'll do larger allocations, which may fail and fall back on smaller
+ *    allocations
+ */
+int vdev_removal_max_span = 32 * 1024;
+
+/*
  * This is used by the test suite so that it can ensure that certain
  * actions happen while in the middle of a removal.
 */
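A hypothetical layout makes the tradeoff concrete: if the removing vdev has
allocated segments [0, 100K) and [116K, 180K) separated by a 16K hole, the
hole is smaller than vdev_removal_max_span, so the copy code may carry it
along and remap [0, 180K) as one chunk -- one larger read, one larger write,
and a single mapping entry, at the cost of copying 16K of "unnecessary" data
(which is freed again on the destination; see unalloc_seg() below). Were the
hole 64K, the two segments would instead get separate mapping entries.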
@@ -726,13 +744,52 @@ vdev_mapping_sync(void *arg, dmu_tx_t *tx)
     spa_sync_removing_state(spa, tx);
 }
 
+typedef struct vdev_copy_segment_arg {
+    spa_t *vcsa_spa;
+    dva_t *vcsa_dest_dva;
+    uint64_t vcsa_txg;
+    range_tree_t *vcsa_obsolete_segs;
+} vdev_copy_segment_arg_t;
+
+static void
+unalloc_seg(void *arg, uint64_t start, uint64_t size)
+{
+    vdev_copy_segment_arg_t *vcsa = arg;
+    spa_t *spa = vcsa->vcsa_spa;
+    blkptr_t bp = { 0 };
+
+    BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL);
+    BP_SET_LSIZE(&bp, size);
+    BP_SET_PSIZE(&bp, size);
+    BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF);
+    BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF);
+    BP_SET_TYPE(&bp, DMU_OT_NONE);
+    BP_SET_LEVEL(&bp, 0);
+    BP_SET_DEDUP(&bp, 0);
+    BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER);
+
+    DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva));
+    DVA_SET_OFFSET(&bp.blk_dva[0],
+        DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start);
+    DVA_SET_ASIZE(&bp.blk_dva[0], size);
+
+    zio_free(spa, vcsa->vcsa_txg, &bp);
+}
+
 /*
  * All reads and writes associated with a call to spa_vdev_copy_segment()
  * are done.
  */
 static void
-spa_vdev_copy_nullzio_done(zio_t *zio)
+spa_vdev_copy_segment_done(zio_t *zio)
 {
+    vdev_copy_segment_arg_t *vcsa = zio->io_private;
+
+    range_tree_vacate(vcsa->vcsa_obsolete_segs,
+        unalloc_seg, vcsa);
+    range_tree_destroy(vcsa->vcsa_obsolete_segs);
+    kmem_free(vcsa, sizeof (*vcsa));
+
     spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
 }
 
@@ -849,7 +906,8 @@ spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *n
  * read from the old location and write to the new location.
  */
 static int
-spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
+spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs,
+    uint64_t maxalloc, uint64_t txg,
     vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
 {
     metaslab_group_t *mg = vd->vdev_mg;
@@ -857,9 +915,40 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint
     spa_vdev_removal_t *svr = spa->spa_vdev_removal;
     vdev_indirect_mapping_entry_t *entry;
     dva_t dst = { 0 };
+    uint64_t start = range_tree_min(segs);
 
-    ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+    ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE);
 
+    uint64_t size = range_tree_span(segs);
+    if (range_tree_span(segs) > maxalloc) {
+        /*
+         * We can't allocate all the segments. Prefer to end
+         * the allocation at the end of a segment, thus avoiding
+         * additional split blocks.
+         */
+        range_seg_t search;
+        avl_index_t where;
+        search.rs_start = start + maxalloc;
+        search.rs_end = search.rs_start;
+        range_seg_t *rs = avl_find(&segs->rt_root, &search, &where);
+        if (rs == NULL) {
+            rs = avl_nearest(&segs->rt_root, where, AVL_BEFORE);
+        } else {
+            rs = AVL_PREV(&segs->rt_root, rs);
+        }
+        if (rs != NULL) {
+            size = rs->rs_end - start;
+        } else {
+            /*
+             * There are no segments that end before maxalloc.
+             * I.e. the first segment is larger than maxalloc,
+             * so we must split it.
+             */
+            size = maxalloc;
+        }
+    }
+    ASSERT3U(size, <=, maxalloc);
+
     /*
      * We use allocator 0 for this I/O because we don't expect device remap
      * to be the steady state of the system, so parallelizing is not as
@@ -873,6 +962,31 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint
         return (error);
 
     /*
+     * Determine the ranges that are not actually needed. Offsets are
+     * relative to the start of the range to be copied (i.e. relative to the
+     * local variable "start").
+     */
+    range_tree_t *obsolete_segs = range_tree_create(NULL, NULL);
+
+    range_seg_t *rs = avl_first(&segs->rt_root);
+    ASSERT3U(rs->rs_start, ==, start);
+    uint64_t prev_seg_end = rs->rs_end;
+    while ((rs = AVL_NEXT(&segs->rt_root, rs)) != NULL) {
+        if (rs->rs_start >= start + size) {
+            break;
+        } else {
+            range_tree_add(obsolete_segs,
+                prev_seg_end - start,
+                rs->rs_start - prev_seg_end);
+        }
+        prev_seg_end = rs->rs_end;
+    }
+    /* We don't end in the middle of an obsolete range */
+    ASSERT3U(start + size, <=, prev_seg_end);
+
+    range_tree_clear(segs, start, size);
+
+    /*
      * We can't have any padding of the allocated size, otherwise we will
      * misunderstand what's allocated, and the size of the mapping.
      * The caller ensures this will be true by passing in a size that is
@@ -883,13 +997,22 @@ spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint
     entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
     DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
     entry->vime_mapping.vimep_dst = dst;
+    if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
+        entry->vime_obsolete_count = range_tree_space(obsolete_segs);
+    }
 
+    vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP);
+    vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
+    vcsa->vcsa_obsolete_segs = obsolete_segs;
+    vcsa->vcsa_spa = spa;
+    vcsa->vcsa_txg = txg;
+
     /*
      * See comment before spa_vdev_copy_one_child().
      */
     spa_config_enter(spa, SCL_STATE, spa, RW_READER);
     zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL,
-        spa_vdev_copy_nullzio_done, NULL, 0);
+        spa_vdev_copy_segment_done, vcsa, 0);
     vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst));
     if (dest_vd->vdev_ops == &vdev_mirror_ops) {
         for (int i = 0; i < dest_vd->vdev_children; i++) {
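Continuing the hypothetical layout above: for a chunk covering [0, 180K) with
the hole at [100K, 116K), obsolete_segs ends up holding the single range
[100K, 116K) (offsets relative to "start"), vime_obsolete_count is 16K when
the obsolete_counts feature is enabled, and once every child read and write
completes, spa_vdev_copy_segment_done() vacates obsolete_segs through
unalloc_seg(), which builds a throwaway blkptr for the destination DVA and
zio_free()s the 16K that was copied but never needed.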
@@ -1092,39 +1215,78 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr
 
     mutex_enter(&svr->svr_lock);
 
-    range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
-    if (rs == NULL) {
+    /*
+     * Determine how big of a chunk to copy. We can allocate up
+     * to max_alloc bytes, and we can span up to vdev_removal_max_span
+     * bytes of unallocated space at a time. "segs" will track the
+     * allocated segments that we are copying. We may also be copying
+     * free segments (of up to vdev_removal_max_span bytes).
+     */
+    range_tree_t *segs = range_tree_create(NULL, NULL);
+    for (;;) {
+        range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
+        if (rs == NULL)
+            break;
+
+        uint64_t seg_length;
+
+        if (range_tree_is_empty(segs)) {
+            /* need to truncate the first seg based on max_alloc */
+            seg_length =
+                MIN(rs->rs_end - rs->rs_start, *max_alloc);
+        } else {
+            if (rs->rs_start - range_tree_max(segs) >
+                vdev_removal_max_span) {
+                /*
+                 * Including this segment would cause us to
+                 * copy a larger unneeded chunk than is allowed.
+                 */
+                break;
+            } else if (rs->rs_end - range_tree_min(segs) >
+                *max_alloc) {
+                /*
+                 * This additional segment would extend past
+                 * max_alloc. Rather than splitting this
+                 * segment, leave it for the next mapping.
+                 */
+                break;
+            } else {
+                seg_length = rs->rs_end - rs->rs_start;
+            }
+        }
+
+        range_tree_add(segs, rs->rs_start, seg_length);
+        range_tree_remove(svr->svr_allocd_segs,
+            rs->rs_start, seg_length);
+    }
+
+    if (range_tree_is_empty(segs)) {
         mutex_exit(&svr->svr_lock);
+        range_tree_destroy(segs);
         return;
     }
 
-    uint64_t offset = rs->rs_start;
-    uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc);
-    range_tree_remove(svr->svr_allocd_segs, offset, length);
-
     if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
         dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
             svr, 0, ZFS_SPACE_CHECK_NONE, tx);
     }
 
-    svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length;
+    svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs);
 
     /*
      * Note: this is the amount of *allocated* space
     * that we are taking care of each txg.
     */
-    svr->svr_bytes_done[txg & TXG_MASK] += length;
+    svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs);
 
     mutex_exit(&svr->svr_lock);
 
     zio_alloc_list_t zal;
     metaslab_trace_init(&zal);
-    uint64_t thismax = *max_alloc;
-    while (length > 0) {
-        uint64_t mylen = MIN(length, thismax);
-
+    uint64_t thismax = SPA_MAXBLOCKSIZE;
+    while (!range_tree_is_empty(segs)) {
         int error = spa_vdev_copy_segment(vd,
-            offset, mylen, txg, vca, &zal);
+            segs, thismax, txg, vca, &zal);
 
         if (error == ENOSPC) {
             /*
@@ -1138,18 +1300,17 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr
             */
             ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
             ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
-            thismax = P2ROUNDUP(mylen / 2,
+            uint64_t attempted =
+                MIN(range_tree_span(segs), thismax);
+            thismax = P2ROUNDUP(attempted / 2,
                 1 << spa->spa_max_ashift);
-            ASSERT3U(thismax, <, mylen);
 
             /*
             * The minimum-size allocation can not fail.
             */
-            ASSERT3U(mylen, >, 1 << spa->spa_max_ashift);
-            *max_alloc = mylen - (1 << spa->spa_max_ashift);
+            ASSERT3U(attempted, >, 1 << spa->spa_max_ashift);
+            *max_alloc = attempted - (1 << spa->spa_max_ashift);
         } else {
             ASSERT0(error);
-            length -= mylen;
-            offset += mylen;
 
             /*
             * We've performed an allocation, so reset the
@@ -1160,6 +1321,7 @@ spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr
         }
     }
     metaslab_trace_fini(&zal);
+    range_tree_destroy(segs);
 }
 
 /*
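A worked pass through the new ENOSPC backoff, again with made-up numbers: on
a pool with 4K sectors (spa_max_ashift == 12), if a chunk spanning 1M fails
to allocate, attempted is MIN(1M, SPA_MAXBLOCKSIZE) = 1M, the retry ceiling
thismax becomes P2ROUNDUP(512K, 4K) = 512K, and *max_alloc drops to 1M - 4K
so the next chunk assembled from svr_allocd_segs is built smaller. Note that
thismax is now seeded with SPA_MAXBLOCKSIZE rather than *max_alloc; repeated
halving bottoms out just above 1 << spa_max_ashift, where the minimum-size
allocation is asserted not to fail.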