On Mon, Oct 07, 2019 at 04:17:38PM -0400, Dennis Zhou wrote:
> The prior two patches added discarding via a background workqueue. This
> just piggybacked off of the fstrim code to trim the whole block at once.
> Well inevitably this is worse performance wise and will aggressively
> overtrim. But it was nice to plumb the other infrastructure to keep the
> patches easier to review.
>
> This adds the real goal of this series which is discarding slowly (ie a
> slow long running fstrim). The discarding is split into two phases,
> extents and then bitmaps. The reason for this is two fold. First, the
> bitmap regions overlap the extent regions. Second, discarding the
> extents first will let the newly trimmed bitmaps have the highest chance
> of coalescing when being readded to the free space cache.
>
> Signed-off-by: Dennis Zhou <den...@kernel.org>
> ---
>  fs/btrfs/block-group.h      |   2 +
>  fs/btrfs/discard.c          |  73 ++++++++++++++++++++-----
>  fs/btrfs/discard.h          |  16 ++++++
>  fs/btrfs/extent-tree.c      |   3 +-
>  fs/btrfs/free-space-cache.c | 106 ++++++++++++++++++++++++++----------
>  fs/btrfs/free-space-cache.h |   6 +-
>  6 files changed, 159 insertions(+), 47 deletions(-)
>
> diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
> index 0f9a1c91753f..b59e6a8ed73d 100644
> --- a/fs/btrfs/block-group.h
> +++ b/fs/btrfs/block-group.h
> @@ -120,6 +120,8 @@ struct btrfs_block_group_cache {
>  	struct list_head discard_list;
>  	int discard_index;
>  	u64 discard_delay;
> +	u64 discard_cursor;
> +	u32 discard_flags;
>

Same comment as for the free space flags: this is just a state holder and
never has more than one bit set, so switch it to an enum and treat it like
a state.
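Completely untested, and the names below are just made up for illustration,
but something along these lines is what I have in mind; the reset_cursor and
bitmaps helpers then become simple comparisons (or go away entirely):

enum btrfs_discard_state {
	BTRFS_DISCARD_EXTENTS,		/* trimming the free space extents */
	BTRFS_DISCARD_BITMAPS,		/* trimming the free space bitmaps */
	BTRFS_DISCARD_RESET_CURSOR,	/* restart from the block group start */
};

and in struct btrfs_block_group_cache:

	/* Which phase of async discard this block group is in. */
	enum btrfs_discard_state discard_state;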
>  	/* For dirty block groups */
>  	struct list_head dirty_list;
> diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c
> index fb92b888774d..26a1e44b4bfa 100644
> --- a/fs/btrfs/discard.c
> +++ b/fs/btrfs/discard.c
> @@ -22,21 +22,28 @@ btrfs_get_discard_list(struct btrfs_discard_ctl *discard_ctl,
>  	return &discard_ctl->discard_list[cache->discard_index];
>  }
>
> -void btrfs_add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
> -			       struct btrfs_block_group_cache *cache)
> +static void __btrfs_add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
> +					struct btrfs_block_group_cache *cache)
>  {
>  	u64 now = ktime_get_ns();
>
> -	spin_lock(&discard_ctl->lock);
> -
>  	if (list_empty(&cache->discard_list) || !cache->discard_index) {
>  		if (!cache->discard_index)
>  			cache->discard_index = 1;
>  		cache->discard_delay = now + BTRFS_DISCARD_DELAY;
> +		cache->discard_flags |= BTRFS_DISCARD_RESET_CURSOR;
>  	}
>
>  	list_move_tail(&cache->discard_list,
>  		       btrfs_get_discard_list(discard_ctl, cache));
> +}
> +
> +void btrfs_add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
> +			       struct btrfs_block_group_cache *cache)
> +{
> +	spin_lock(&discard_ctl->lock);
> +
> +	__btrfs_add_to_discard_list(discard_ctl, cache);
>
>  	spin_unlock(&discard_ctl->lock);
>  }
> @@ -53,6 +60,7 @@ void btrfs_add_to_discard_free_list(struct btrfs_discard_ctl *discard_ctl,
>
>  	cache->discard_index = 0;
>  	cache->discard_delay = now;
> +	cache->discard_flags |= BTRFS_DISCARD_RESET_CURSOR;
>  	list_add_tail(&cache->discard_list, &discard_ctl->discard_list[0]);
>
>  	spin_unlock(&discard_ctl->lock);
> @@ -114,13 +122,24 @@ peek_discard_list(struct btrfs_discard_ctl *discard_ctl)
>
>  	spin_lock(&discard_ctl->lock);
>
> +again:
>  	cache = find_next_cache(discard_ctl, now);
>
> -	if (cache && now < cache->discard_delay)
> +	if (cache && now > cache->discard_delay) {
> +		discard_ctl->cache = cache;
> +		if (cache->discard_index == 0 &&
> +		    cache->free_space_ctl->free_space != cache->key.offset) {
> +			__btrfs_add_to_discard_list(discard_ctl, cache);
> +			goto again;

The magic number thing again: this needs to be discard_index == UNUSED_DISCARD
or some such descriptive name.  It also needs to be
btrfs_block_group_used(cache) == 0.
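To show what I mean (the names here are invented for the example, and note
the check ends up inverted because this branch handles the case where the
block group still has used space; btrfs_block_group_used() takes
&cache->item in the current tree):

/* Named indices into discard_ctl->discard_list[]. */
#define BTRFS_DISCARD_INDEX_UNUSED	0
#define BTRFS_DISCARD_INDEX_START	1

		if (cache->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
		    btrfs_block_group_used(&cache->item) != 0) {
			/* Not actually empty, put it on the normal list. */
			__btrfs_add_to_discard_list(discard_ctl, cache);
			goto again;
		}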
> +		}
> +		if (btrfs_discard_reset_cursor(cache)) {
> +			cache->discard_cursor = cache->key.objectid;
> +			cache->discard_flags &= ~(BTRFS_DISCARD_RESET_CURSOR |
> +						  BTRFS_DISCARD_BITMAPS);
> +		}
> +	} else {
>  		cache = NULL;
> -
> -	discard_ctl->cache = cache;
> -
> +	}
>  	spin_unlock(&discard_ctl->lock);
>
>  	return cache;
> @@ -173,18 +192,42 @@ static void btrfs_discard_workfn(struct work_struct *work)
>
>  	discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
>
> +again:
>  	cache = peek_discard_list(discard_ctl);
>  	if (!cache || !btrfs_run_discard_work(discard_ctl))
>  		return;
>
> -	btrfs_trim_block_group(cache, &trimmed, cache->key.objectid,
> -			       btrfs_block_group_end(cache), 0);
> +	if (btrfs_discard_bitmaps(cache))
> +		btrfs_trim_block_group_bitmaps(cache, &trimmed,
> +					       cache->discard_cursor,
> +					       btrfs_block_group_end(cache),
> +					       0, true);
> +	else
> +		btrfs_trim_block_group(cache, &trimmed, cache->discard_cursor,
> +				       btrfs_block_group_end(cache), 0, true);
> +
> +	if (cache->discard_cursor >= btrfs_block_group_end(cache)) {
> +		if (btrfs_discard_bitmaps(cache)) {
> +			remove_from_discard_list(discard_ctl, cache);
> +			if (btrfs_is_free_space_trimmed(cache))
> +				btrfs_mark_bg_unused(cache);
> +			else if (cache->free_space_ctl->free_space ==
> +				 cache->key.offset)

Same here: btrfs_block_group_used(cache) == 0.

> +				btrfs_add_to_discard_free_list(discard_ctl,
> +							       cache);
> +		} else {
> +			cache->discard_cursor = cache->key.objectid;
> +			cache->discard_flags |= BTRFS_DISCARD_BITMAPS;
> +		}
> +	}
> +
> +	spin_lock(&discard_ctl->lock);
> +	discard_ctl->cache = NULL;
> +	spin_unlock(&discard_ctl->lock);
>
> -	remove_from_discard_list(discard_ctl, cache);
> -	if (btrfs_is_free_space_trimmed(cache))
> -		btrfs_mark_bg_unused(cache);
> -	else if (cache->free_space_ctl->free_space == cache->key.offset)
> -		btrfs_add_to_discard_free_list(discard_ctl, cache);
> +	/* we didn't trim anything but we really ought to so try again */
> +	if (trimmed == 0)
> +		goto again;

Why? We'll reschedule if we need to. Since we do this unconditionally, I
feel like there's going to be some corner case where we end up seeing this
workfn eating 100% of a cpu on a bunch of sandcastle boxes and we'll all be
super sad.
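I'd much rather drop the retry and lean on the existing scheduling,
something like (untested):

	spin_lock(&discard_ctl->lock);
	discard_ctl->cache = NULL;
	spin_unlock(&discard_ctl->lock);

	/*
	 * If nothing was trimmed, don't loop back to the top; let
	 * btrfs_discard_schedule_work() pick the next block group and its
	 * delay so an untrimmable block group can't pin this worker.
	 */
	btrfs_discard_schedule_work(discard_ctl, false);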
>
>  	btrfs_discard_schedule_work(discard_ctl, false);
>  }
> diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h
> index 55f79b624943..22cfa7e401bb 100644
> --- a/fs/btrfs/discard.h
> +++ b/fs/btrfs/discard.h
> @@ -13,6 +13,22 @@
>  #include "block-group.h"
>  #include "free-space-cache.h"
>
> +/* discard flags */
> +#define BTRFS_DISCARD_RESET_CURSOR	(1UL << 0)
> +#define BTRFS_DISCARD_BITMAPS		(1UL << 1)
> +
> +static inline
> +bool btrfs_discard_reset_cursor(struct btrfs_block_group_cache *cache)
> +{
> +	return (cache->discard_flags & BTRFS_DISCARD_RESET_CURSOR);
> +}
> +
> +static inline
> +bool btrfs_discard_bitmaps(struct btrfs_block_group_cache *cache)
> +{
> +	return (cache->discard_flags & BTRFS_DISCARD_BITMAPS);
> +}
> +
>  void btrfs_add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
>  			       struct btrfs_block_group_cache *cache);
>  void btrfs_add_to_discard_free_list(struct btrfs_discard_ctl *discard_ctl,
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index d69ee5f51b38..ff42e4abb01d 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -5683,7 +5683,8 @@ int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
>  						     &group_trimmed,
>  						     start,
>  						     end,
> -						     range->minlen);
> +						     range->minlen,
> +						     false);
>
>  			trimmed += group_trimmed;
>  			if (ret) {
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index ed0e7ee4c78d..97b3074e83c0 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -3267,7 +3267,8 @@ static int do_trimming(struct btrfs_block_group_cache *block_group,
>  }
>
>  static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
> -			  u64 *total_trimmed, u64 start, u64 end, u64 minlen)
> +			  u64 *total_trimmed, u64 start, u64 end, u64 minlen,
> +			  bool async)
>  {
>  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
>  	struct btrfs_free_space *entry;
> @@ -3284,36 +3285,25 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
>  		mutex_lock(&ctl->cache_writeout_mutex);
>  		spin_lock(&ctl->tree_lock);
>
> -		if (ctl->free_space < minlen) {
> -			spin_unlock(&ctl->tree_lock);
> -			mutex_unlock(&ctl->cache_writeout_mutex);
> -			break;
> -		}
> +		if (ctl->free_space < minlen)
> +			goto out_unlock;
>
>  		entry = tree_search_offset(ctl, start, 0, 1);
> -		if (!entry) {
> -			spin_unlock(&ctl->tree_lock);
> -			mutex_unlock(&ctl->cache_writeout_mutex);
> -			break;
> -		}
> +		if (!entry)
> +			goto out_unlock;
>
>  		/* skip bitmaps */
> -		while (entry->bitmap) {
> +		while (entry->bitmap || (async &&
> +					 btrfs_free_space_trimmed(entry))) {

Please update the comment to say we're skipping already trimmed entries as
well.
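Something like:

		/*
		 * Skip bitmaps and, for an async discard, entries that have
		 * already been trimmed.
		 */
		while (entry->bitmap || (async &&
					 btrfs_free_space_trimmed(entry))) {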
>  			node = rb_next(&entry->offset_index);
> -			if (!node) {
> -				spin_unlock(&ctl->tree_lock);
> -				mutex_unlock(&ctl->cache_writeout_mutex);
> -				goto out;
> -			}
> +			if (!node)
> +				goto out_unlock;
>  			entry = rb_entry(node, struct btrfs_free_space,
>  					 offset_index);
>  		}
>
> -		if (entry->offset >= end) {
> -			spin_unlock(&ctl->tree_lock);
> -			mutex_unlock(&ctl->cache_writeout_mutex);
> -			break;
> -		}
> +		if (entry->offset >= end)
> +			goto out_unlock;
>
>  		extent_start = entry->offset;
>  		extent_bytes = entry->bytes;
> @@ -3338,10 +3328,15 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
>  		ret = do_trimming(block_group, total_trimmed, start, bytes,
>  				  extent_start, extent_bytes, extent_flags,
>  				  &trim_entry);
> -		if (ret)
> +		if (ret) {
> +			block_group->discard_cursor = start + bytes;
>  			break;
> +		}
> next:
>  		start += bytes;
> +		block_group->discard_cursor = start;
> +		if (async && *total_trimmed)
> +			break;

Alright, so this means we'll only trim one entry and then return if we're
async? It seems to be the same below for bitmaps. This deserves a comment
on these functions, since it fundamentally changes their behavior in the
async case.
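Something along these lines above trim_no_bitmap() (and similar for
trim_bitmaps()), exact wording up to you:

/*
 * If @async is set we are being called from the background discard worker:
 * trim at most one currently untrimmed extent, record where we stopped in
 * block_group->discard_cursor so the next pass can resume, and return.
 * Otherwise (fstrim) trim everything in [start, end).
 */
static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
			  u64 *total_trimmed, u64 start, u64 end, u64 minlen,
			  bool async)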
>
>  		if (fatal_signal_pending(current)) {
>  			ret = -ERESTARTSYS;
> @@ -3350,7 +3345,14 @@ static int trim_no_bitmap(struct btrfs_block_group_cache *block_group,
>  			break;
>  		}
>
>  		cond_resched();
>  	}
> -out:
> +
> +	return ret;
> +
> +out_unlock:
> +	block_group->discard_cursor = btrfs_block_group_end(block_group);
> +	spin_unlock(&ctl->tree_lock);
> +	mutex_unlock(&ctl->cache_writeout_mutex);
> +
>  	return ret;
>  }
>
> @@ -3390,7 +3392,8 @@ static void end_trimming_bitmap(struct btrfs_free_space *entry)
>  }
>
>  static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
> -			u64 *total_trimmed, u64 start, u64 end, u64 minlen)
> +			u64 *total_trimmed, u64 start, u64 end, u64 minlen,
> +			bool async)
>  {
>  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
>  	struct btrfs_free_space *entry;
> @@ -3407,13 +3410,16 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
>  		spin_lock(&ctl->tree_lock);
>
>  		if (ctl->free_space < minlen) {
> +			block_group->discard_cursor =
> +				btrfs_block_group_end(block_group);
>  			spin_unlock(&ctl->tree_lock);
>  			mutex_unlock(&ctl->cache_writeout_mutex);
>  			break;
>  		}
>
>  		entry = tree_search_offset(ctl, offset, 1, 0);
> -		if (!entry) {
> +		if (!entry || (async && start == offset &&
> +			       btrfs_free_space_trimmed(entry))) {
>  			spin_unlock(&ctl->tree_lock);
>  			mutex_unlock(&ctl->cache_writeout_mutex);
>  			next_bitmap = true;
> @@ -3446,6 +3452,16 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
>  			goto next;
>  		}
>
> +		/*
> +		 * We already trimmed a region, but are using the locking above
> +		 * to reset the BTRFS_FSC_TRIMMING_BITMAP flag.
> +		 */
> +		if (async && *total_trimmed) {
> +			spin_unlock(&ctl->tree_lock);
> +			mutex_unlock(&ctl->cache_writeout_mutex);
> +			return ret;
> +		}
> +
>  		bytes = min(bytes, end - start);
>  		if (bytes < minlen) {
>  			entry->flags &= ~BTRFS_FSC_TRIMMING_BITMAP;
> @@ -3468,6 +3484,8 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
>  					  start, bytes, 0, &trim_entry);
>  		if (ret) {
>  			reset_trimming_bitmap(ctl, offset);
> +			block_group->discard_cursor =
> +				btrfs_block_group_end(block_group);
>  			break;
>  		}
> next:
> @@ -3477,6 +3495,7 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
>  		} else {
>  			start += bytes;
>  		}
> +		block_group->discard_cursor = start;
>
>  		if (fatal_signal_pending(current)) {
>  			if (start != offset)
> @@ -3488,6 +3507,9 @@ static int trim_bitmaps(struct btrfs_block_group_cache *block_group,
>  		cond_resched();
>  	}
>
> +	if (offset >= end)
> +		block_group->discard_cursor = end;
> +
>  	return ret;
>  }
>
> @@ -3532,7 +3554,8 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
>  }
>
>  int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
> -			   u64 *trimmed, u64 start, u64 end, u64 minlen)
> +			   u64 *trimmed, u64 start, u64 end, u64 minlen,
> +			   bool async)
>  {
>  	struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
>  	int ret;
> @@ -3547,11 +3570,11 @@ int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
>  	btrfs_get_block_group_trimming(block_group);
>  	spin_unlock(&block_group->lock);
>
> -	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen);
> -	if (ret)
> +	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);
> +	if (ret || async)
>  		goto out;
>

You already separate out btrfs_trim_block_group_bitmaps(), so this function
really only trims the whole block group when !async. Make a separate helper
for trimming only the extents so it's clear what the async path is doing,
and we're not relying on the async flag to change the behavior of
btrfs_trim_block_group().
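As a sketch of what I mean (untested, and btrfs_trim_block_group_extents()
is just a name suggestion), the workfn would call this for the extent phase
and btrfs_trim_block_group() would stay the plain synchronous "trim
everything" path for fstrim:

int btrfs_trim_block_group_extents(struct btrfs_block_group_cache *block_group,
				   u64 *trimmed, u64 start, u64 end,
				   u64 minlen, bool async)
{
	int ret;

	*trimmed = 0;

	spin_lock(&block_group->lock);
	if (block_group->removed) {
		spin_unlock(&block_group->lock);
		return 0;
	}
	btrfs_get_block_group_trimming(block_group);
	spin_unlock(&block_group->lock);

	/* Extent phase only; bitmaps stay in btrfs_trim_block_group_bitmaps(). */
	ret = trim_no_bitmap(block_group, trimmed, start, end, minlen, async);

	btrfs_put_block_group_trimming(block_group);

	return ret;
}

Thanks,

Josef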