On Thu, Oct 10, 2019 at 12:51:01PM -0400, Josef Bacik wrote: > On Mon, Oct 07, 2019 at 04:17:44PM -0400, Dennis Zhou wrote: > > Non-block group destruction discarding currently only has a single list > > with no minimum discard length. This can lead to caravaning more > > meaningful discards behind a heavily fragmented block group. > > > > This adds support for multiple lists with minimum discard lengths to > > prevent the caravan effect. We promote block groups back up when we > > exceed the BTRFS_DISCARD_MAX_FILTER size, currently we support only 2 > > lists with filters of 1MB and 32KB respectively. > > > > Signed-off-by: Dennis Zhou <den...@kernel.org> > > --- > > fs/btrfs/ctree.h | 2 +- > > fs/btrfs/discard.c | 60 +++++++++++++++++++++++++++++++++---- > > fs/btrfs/discard.h | 4 +++ > > fs/btrfs/free-space-cache.c | 37 +++++++++++++++-------- > > fs/btrfs/free-space-cache.h | 2 +- > > 5 files changed, 85 insertions(+), 20 deletions(-) > > > > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h > > index e81f699347e0..b5608f8dc41a 100644 > > --- a/fs/btrfs/ctree.h > > +++ b/fs/btrfs/ctree.h > > @@ -439,7 +439,7 @@ struct btrfs_full_stripe_locks_tree { > > }; > > > > /* discard control */ > > -#define BTRFS_NR_DISCARD_LISTS 2 > > +#define BTRFS_NR_DISCARD_LISTS 3 > > > > struct btrfs_discard_ctl { > > struct workqueue_struct *discard_workers; > > diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c > > index 072c73f48297..296cbffc5957 100644 > > --- a/fs/btrfs/discard.c > > +++ b/fs/btrfs/discard.c > > @@ -20,6 +20,10 @@ > > #define BTRFS_DISCARD_MAX_DELAY (10000UL) > > #define BTRFS_DISCARD_MAX_IOPS (10UL) > > > > +/* monotonically decreasing filters after 0 */ > > +static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {0, > > + BTRFS_DISCARD_MAX_FILTER, BTRFS_DISCARD_MIN_FILTER}; > > + > > static struct list_head * > > btrfs_get_discard_list(struct btrfs_discard_ctl *discard_ctl, > > struct btrfs_block_group_cache *cache) > > @@ -120,7 +124,7 @@ 
find_next_cache(struct btrfs_discard_ctl *discard_ctl, > > u64 now) > > } > > > > static struct btrfs_block_group_cache * > > -peek_discard_list(struct btrfs_discard_ctl *discard_ctl) > > +peek_discard_list(struct btrfs_discard_ctl *discard_ctl, int > > *discard_index) > > { > > struct btrfs_block_group_cache *cache; > > u64 now = ktime_get_ns(); > > @@ -132,6 +136,7 @@ peek_discard_list(struct btrfs_discard_ctl *discard_ctl) > > > > if (cache && now > cache->discard_delay) { > > discard_ctl->cache = cache; > > + *discard_index = cache->discard_index; > > if (cache->discard_index == 0 && > > cache->free_space_ctl->free_space != cache->key.offset) { > > __btrfs_add_to_discard_list(discard_ctl, cache); > > @@ -150,6 +155,36 @@ peek_discard_list(struct btrfs_discard_ctl > > *discard_ctl) > > return cache; > > } > > > > +void btrfs_discard_check_filter(struct btrfs_block_group_cache *cache, > > + u64 bytes) > > +{ > > + struct btrfs_discard_ctl *discard_ctl; > > + > > + if (!cache || !btrfs_test_opt(cache->fs_info, DISCARD_ASYNC)) > > + return; > > + > > + discard_ctl = &cache->fs_info->discard_ctl; > > + > > + if (cache && cache->discard_index > 1 && > > + bytes >= BTRFS_DISCARD_MAX_FILTER) { > > + remove_from_discard_list(discard_ctl, cache); > > + cache->discard_index = 1; > > Really need names here, I have no idea what 1 is. >
Yep, done. > > + btrfs_add_to_discard_list(discard_ctl, cache); > > + } > > +} > > + > > +static void btrfs_update_discard_index(struct btrfs_discard_ctl > > *discard_ctl, > > + struct btrfs_block_group_cache *cache) > > +{ > > + cache->discard_index++; > > + if (cache->discard_index == BTRFS_NR_DISCARD_LISTS) { > > + cache->discard_index = 1; > > + return; > > + } > > + > > + btrfs_add_to_discard_list(discard_ctl, cache); > > +} > > + > > void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, > > struct btrfs_block_group_cache *cache) > > { > > @@ -202,23 +237,34 @@ static void btrfs_discard_workfn(struct work_struct > > *work) > > { > > struct btrfs_discard_ctl *discard_ctl; > > struct btrfs_block_group_cache *cache; > > + int discard_index = 0; > > u64 trimmed = 0; > > + u64 minlen = 0; > > > > discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work); > > > > again: > > - cache = peek_discard_list(discard_ctl); > > + cache = peek_discard_list(discard_ctl, &discard_index); > > if (!cache || !btrfs_run_discard_work(discard_ctl)) > > return; > > > > - if (btrfs_discard_bitmaps(cache)) > > + minlen = discard_minlen[discard_index]; > > + > > + if (btrfs_discard_bitmaps(cache)) { > > + u64 maxlen = 0; > > + > > + if (discard_index) > > + maxlen = discard_minlen[discard_index - 1]; > > + > > btrfs_trim_block_group_bitmaps(cache, &trimmed, > > cache->discard_cursor, > > btrfs_block_group_end(cache), > > - 0, true); > > - else > > + minlen, maxlen, true); > > + } else { > > btrfs_trim_block_group(cache, &trimmed, cache->discard_cursor, > > - btrfs_block_group_end(cache), 0, true); > > + btrfs_block_group_end(cache), > > + minlen, true); > > + } > > > > discard_ctl->prev_discard = trimmed; > > > > @@ -231,6 +277,8 @@ static void btrfs_discard_workfn(struct work_struct > > *work) > > cache->key.offset) > > btrfs_add_to_discard_free_list(discard_ctl, > > cache); > > + else > > + btrfs_update_discard_index(discard_ctl, cache); > > } else { > > 
cache->discard_cursor = cache->key.objectid; > > cache->discard_flags |= BTRFS_DISCARD_BITMAPS; > > diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h > > index 898dd92dbf8f..1daa8da4a1b5 100644 > > --- a/fs/btrfs/discard.h > > +++ b/fs/btrfs/discard.h > > @@ -18,6 +18,8 @@ > > > > /* discard size limits */ > > #define BTRFS_DISCARD_MAX_SIZE (SZ_64M) > > +#define BTRFS_DISCARD_MAX_FILTER (SZ_1M) > > +#define BTRFS_DISCARD_MIN_FILTER (SZ_32K) > > > > /* discard flags */ > > #define BTRFS_DISCARD_RESET_CURSOR (1UL << 0) > > @@ -39,6 +41,8 @@ void btrfs_add_to_discard_list(struct btrfs_discard_ctl > > *discard_ctl, > > struct btrfs_block_group_cache *cache); > > void btrfs_add_to_discard_free_list(struct btrfs_discard_ctl *discard_ctl, > > struct btrfs_block_group_cache *cache); > > +void btrfs_discard_check_filter(struct btrfs_block_group_cache *cache, > > + u64 bytes); > > void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info); > > > > void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl, > > diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c > > index ce33803a45b2..ed35dc090df6 100644 > > --- a/fs/btrfs/free-space-cache.c > > +++ b/fs/btrfs/free-space-cache.c > > @@ -2471,6 +2471,7 @@ int __btrfs_add_free_space(struct btrfs_fs_info > > *fs_info, > > if (ret) > > kmem_cache_free(btrfs_free_space_cachep, info); > > out: > > + btrfs_discard_check_filter(cache, bytes); > > So we're only accounting the new space? What if we merge with a larger area > here? We should probably make our decision based on the actual trimmable area. > Thanks, that's a good catch. I've updated it to be info->bytes if possible. 
> > btrfs_discard_update_discardable(cache, ctl); > > spin_unlock(&ctl->tree_lock); > > > > @@ -3409,7 +3410,13 @@ static int trim_no_bitmap(struct > > btrfs_block_group_cache *block_group, > > goto next; > > } > > unlink_free_space(ctl, entry); > > - if (bytes > BTRFS_DISCARD_MAX_SIZE) { > > + /* > > + * Let bytes = BTRFS_DISCARD_MAX_SIZE + X. > > + * If X < BTRFS_DISCARD_MIN_FILTER, we won't trim X when > > + * we come back around. So trim it now. > > + */ > > + if (bytes > (BTRFS_DISCARD_MAX_SIZE + > > + BTRFS_DISCARD_MIN_FILTER)) { > > bytes = extent_bytes = BTRFS_DISCARD_MAX_SIZE; > > entry->offset += BTRFS_DISCARD_MAX_SIZE; > > entry->bytes -= BTRFS_DISCARD_MAX_SIZE; > > @@ -3510,7 +3517,7 @@ static void end_trimming_bitmap(struct > > btrfs_free_space_ctl *ctl, > > > > static int trim_bitmaps(struct btrfs_block_group_cache *block_group, > > u64 *total_trimmed, u64 start, u64 end, u64 minlen, > > - bool async) > > + u64 maxlen, bool async) > > { > > struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; > > struct btrfs_free_space *entry; > > @@ -3535,7 +3542,7 @@ static int trim_bitmaps(struct > > btrfs_block_group_cache *block_group, > > } > > > > entry = tree_search_offset(ctl, offset, 1, 0); > > - if (!entry || (async && minlen && start == offset && > > btrfs_free_space_trimmed(entry))) { > > Huh? Why do we care if minlen is set if our entry is already trimmed? If > we're > already trimmed we should just skip it even with minlen set, right? Thanks, > Yeah this definitely needs a comment. The reason for this is minlen is used to check if we're in discard_index 0 which is the free path discarding. In this case, because bitmaps are lossily marked trimmed, (we skip lone pages less than BTRFS_DISCARD_MIN_FILTER), we need to just go back and double check. The goal is to maintain the invariant everything is discarded when we forget about a block group. 
I think in practice, with the FORCE_EXTENT_THRESHOLD patch later on, this shouldn't really be common case as it should have been picked out by some coalescing. Thanks, Dennis