On Mon, Mar 25, 2019 at 02:31:27PM +0200, Nikolay Borisov wrote:
> From: Jeff Mahoney <je...@suse.com>
> 
> The pending chunks list contains chunks that are allocated in the
> current transaction but haven't been created yet. The pinned chunks
> list contains chunks that are being released in the current transaction.
> Both describe chunks that are not reflected on disk as in use but are
> unavailable just the same.
> 
> The pending chunks list is anchored by the transaction handle, which
> means that we need to hold a reference to a transaction when working
> with the list.
> 
> We use these lists to ensure that we don't end up discarding chunks
> that are allocated or released in the current transaction.  What we r

unfinished sentence?

> 
> The way we use them is by iterating over both lists to perform
> comparisons on the stripes they describe for each device. This is
> backwards and requires that we keep a transaction handle open while
> we're trimming.
> 
> This patchset adds an extent_io_tree to btrfs_device that maintains
> the allocation state of the device.  Extents are set dirty when
> chunks are first allocated -- when the extent maps are added to the
> mapping tree. They're cleared when last removed -- when the extent
> maps are removed from the mapping tree. This matches the lifespan
> of the pending and pinned chunks list and allows us to do trims
> on unallocated space safely without pinning the transaction for what
> may be a lengthy operation. We can also use this io tree to mark
> which chunks have already been trimmed so we don't repeat the operation.
> 
> Signed-off-by: Nikolay Borisov <nbori...@suse.com>
> ---
>  fs/btrfs/ctree.h            |  6 ---
>  fs/btrfs/disk-io.c          | 11 -----
>  fs/btrfs/extent-tree.c      | 28 -----------
>  fs/btrfs/extent_io.c        |  2 +-
>  fs/btrfs/extent_io.h        |  6 ++-
>  fs/btrfs/extent_map.c       | 36 ++++++++++++++
>  fs/btrfs/extent_map.h       |  1 -
>  fs/btrfs/free-space-cache.c |  4 --
>  fs/btrfs/transaction.c      |  9 ----
>  fs/btrfs/transaction.h      |  1 -
>  fs/btrfs/volumes.c          | 96 +++++++++++++------------------------
>  fs/btrfs/volumes.h          |  2 +
>  12 files changed, 76 insertions(+), 126 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 880b5abd8ecc..0b25c2f1b77d 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1152,12 +1152,6 @@ struct btrfs_fs_info {
>       struct mutex unused_bg_unpin_mutex;
>       struct mutex delete_unused_bgs_mutex;
>  
> -     /*
> -      * Chunks that can't be freed yet (under a trim/discard operation)
> -      * and will be latter freed. Protected by fs_info->chunk_mutex.
> -      */
> -     struct list_head pinned_chunks;
> -
>       /* Cached block sizes */
>       u32 nodesize;
>       u32 sectorsize;
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 81183694f8ea..813fc21b2b0b 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2789,8 +2789,6 @@ int open_ctree(struct super_block *sb,
>       init_waitqueue_head(&fs_info->async_submit_wait);
>       init_waitqueue_head(&fs_info->delayed_iputs_wait);
>  
> -     INIT_LIST_HEAD(&fs_info->pinned_chunks);
> -
>       /* Usable values until the real ones are cached from the superblock */
>       fs_info->nodesize = 4096;
>       fs_info->sectorsize = 4096;
> @@ -4065,15 +4063,6 @@ void close_ctree(struct btrfs_fs_info *fs_info)
>  
>       btrfs_free_stripe_hash_table(fs_info);
>       btrfs_free_ref_cache(fs_info);
> -
> -     while (!list_empty(&fs_info->pinned_chunks)) {
> -             struct extent_map *em;
> -
> -             em = list_first_entry(&fs_info->pinned_chunks,
> -                                   struct extent_map, list);
> -             list_del_init(&em->list);
> -             free_extent_map(em);
> -     }
>  }
>  
>  int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index dcda3c4de240..eb4b7f2b10a1 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -10946,10 +10946,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>       memcpy(&key, &block_group->key, sizeof(key));
>  
>       mutex_lock(&fs_info->chunk_mutex);
> -     if (!list_empty(&em->list)) {
> -             /* We're in the transaction->pending_chunks list. */
> -             free_extent_map(em);
> -     }
>       spin_lock(&block_group->lock);
>       block_group->removed = 1;
>       /*
> @@ -10976,25 +10972,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>        * the transaction commit has completed.
>        */
>       remove_em = (atomic_read(&block_group->trimming) == 0);
> -     /*
> -      * Make sure a trimmer task always sees the em in the pinned_chunks list
> -      * if it sees block_group->removed == 1 (needs to lock block_group->lock
> -      * before checking block_group->removed).
> -      */
> -     if (!remove_em) {
> -             /*
> -              * Our em might be in trans->transaction->pending_chunks which
> -              * is protected by fs_info->chunk_mutex ([lock|unlock]_chunks),
> -              * and so is the fs_info->pinned_chunks list.
> -              *
> -              * So at this point we must be holding the chunk_mutex to avoid
> -              * any races with chunk allocation (more specifically at
> -              * volumes.c:contains_pending_extent()), to ensure it always
> -              * sees the em, either in the pending_chunks list or in the
> -              * pinned_chunks list.
> -              */
> -             list_move_tail(&em->list, &fs_info->pinned_chunks);
> -     }
>       spin_unlock(&block_group->lock);
>  
>       if (remove_em) {
> @@ -11002,11 +10979,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>  
>               em_tree = &fs_info->mapping_tree.map_tree;
>               write_lock(&em_tree->lock);
> -             /*
> -              * The em might be in the pending_chunks list, so make sure the
> -              * chunk mutex is locked, since remove_extent_mapping() will
> -              * delete us from that list.
> -              */
>               remove_extent_mapping(em_tree, em);
>               write_unlock(&em_tree->lock);
>               /* once for the tree */
> diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
> index 0e953c469224..ff3f8443e180 100644
> --- a/fs/btrfs/extent_io.c
> +++ b/fs/btrfs/extent_io.c
> @@ -918,7 +918,7 @@ static void cache_state(struct extent_state *state,
>   * [start, end] is inclusive This takes the tree lock.
>   */
>  
> -static int __must_check
> +int __must_check
>  __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
>                unsigned bits, unsigned exclusive_bits,
>                u64 *failed_start, struct extent_state **cached_state,
> diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
> index 0fda249e5982..4bcc203b5431 100644
> --- a/fs/btrfs/extent_io.h
> +++ b/fs/btrfs/extent_io.h
> @@ -329,7 +329,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
>  int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
>                  unsigned bits, u64 *failed_start,
>                  struct extent_state **cached_state, gfp_t mask);
> -
> +int
> +__set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
> +              unsigned bits, unsigned exclusive_bits,
> +              u64 *failed_start, struct extent_state **cached_state,
> +              gfp_t mask, struct extent_changeset *changeset);
>  static inline int set_extent_bits(struct extent_io_tree *tree, u64 start,
>               u64 end, unsigned bits)
>  {
> diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
> index 928f729c55ba..0820f6fcf3a6 100644
> --- a/fs/btrfs/extent_map.c
> +++ b/fs/btrfs/extent_map.c
> @@ -4,6 +4,7 @@
>  #include <linux/slab.h>
>  #include <linux/spinlock.h>
>  #include "ctree.h"
> +#include "volumes.h"
>  #include "extent_map.h"
>  #include "compression.h"
>  
> @@ -336,6 +337,37 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree,
>       else
>               try_merge_map(tree, em);
>  }
> +static void extent_map_device_set_bits(struct extent_map *em, unsigned bits)
> +{
> +     struct map_lookup *map = em->map_lookup;
> +     u64 stripe_size = em->orig_block_len;
> +     int i;
> +
> +     for (i = 0; i < map->num_stripes; i++) {
> +             struct btrfs_bio_stripe *stripe = &map->stripes[i];
> +             struct btrfs_device *device = stripe->dev;
> +
> +             __set_extent_bit(&device->alloc_state, stripe->physical,
> +                              stripe->physical + stripe_size - 1, bits,
> +                              0, NULL, NULL, GFP_NOWAIT, NULL);
> +     }
> +}
> +
> +static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
> +{
> +     struct map_lookup *map = em->map_lookup;
> +     u64 stripe_size = em->orig_block_len;
> +     int i;
> +
> +     for (i = 0; i < map->num_stripes; i++) {
> +             struct btrfs_bio_stripe *stripe = &map->stripes[i];
> +             struct btrfs_device *device = stripe->dev;
> +
> +             __clear_extent_bit(&device->alloc_state, stripe->physical,
> +                                stripe->physical + stripe_size - 1, bits,
> +                                0, 0, NULL, GFP_NOWAIT, NULL);
> +     }
> +}
>  
>  /**
>   * add_extent_mapping - add new extent map to the extent tree
> @@ -357,6 +389,8 @@ int add_extent_mapping(struct extent_map_tree *tree,
>               goto out;
>  
>       setup_extent_mapping(tree, em, modified);
> +     if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
> +             extent_map_device_set_bits(em, CHUNK_ALLOCATED);
>  out:
>       return ret;
>  }
> @@ -438,6 +472,8 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
>       rb_erase_cached(&em->rb_node, &tree->map);
>       if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags))
>               list_del_init(&em->list);
> +     if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags))
> +             extent_map_device_clear_bits(em, CHUNK_ALLOCATED);
>       RB_CLEAR_NODE(&em->rb_node);
>  }
>  
> diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h
> index 473f039fcd7c..72b46833f236 100644
> --- a/fs/btrfs/extent_map.h
> +++ b/fs/btrfs/extent_map.h
> @@ -91,7 +91,6 @@ void replace_extent_mapping(struct extent_map_tree *tree,
>                           struct extent_map *cur,
>                           struct extent_map *new,
>                           int modified);
> -
>  struct extent_map *alloc_extent_map(void);
>  void free_extent_map(struct extent_map *em);
>  int __init extent_map_init(void);
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index 74aa552f4793..207fb50dcc7a 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -3366,10 +3366,6 @@ void btrfs_put_block_group_trimming(struct btrfs_block_group_cache *block_group)
>               em = lookup_extent_mapping(em_tree, block_group->key.objectid,
>                                          1);
>               BUG_ON(!em); /* logic error, can't happen */
> -             /*
> -              * remove_extent_mapping() will delete us from the pinned_chunks
> -              * list, which is protected by the chunk mutex.
> -              */
>               remove_extent_mapping(em_tree, em);
>               write_unlock(&em_tree->lock);
>               mutex_unlock(&fs_info->chunk_mutex);
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index b32769998bbb..e5404326fc55 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -50,14 +50,6 @@ void btrfs_put_transaction(struct btrfs_transaction *transaction)
>                       btrfs_err(transaction->fs_info,
>                                 "pending csums is %llu",
>                                 transaction->delayed_refs.pending_csums);
> -             while (!list_empty(&transaction->pending_chunks)) {
> -                     struct extent_map *em;
> -
> -                     em = list_first_entry(&transaction->pending_chunks,
> -                                           struct extent_map, list);
> -                     list_del_init(&em->list);
> -                     free_extent_map(em);
> -             }
>               /*
>                * If any block groups are found in ->deleted_bgs then it's
>                * because the transaction was aborted and a commit did not
> @@ -235,7 +227,6 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info,
>       spin_lock_init(&cur_trans->delayed_refs.lock);
>  
>       INIT_LIST_HEAD(&cur_trans->pending_snapshots);
> -     INIT_LIST_HEAD(&cur_trans->pending_chunks);
>       INIT_LIST_HEAD(&cur_trans->dev_update_list);
>       INIT_LIST_HEAD(&cur_trans->switch_commits);
>       INIT_LIST_HEAD(&cur_trans->dirty_bgs);
> diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
> index 2bd76f681520..4419a4a0294b 100644
> --- a/fs/btrfs/transaction.h
> +++ b/fs/btrfs/transaction.h
> @@ -51,7 +51,6 @@ struct btrfs_transaction {
>       wait_queue_head_t writer_wait;
>       wait_queue_head_t commit_wait;
>       struct list_head pending_snapshots;
> -     struct list_head pending_chunks;
>       struct list_head dev_update_list;
>       struct list_head switch_commits;
>       struct list_head dirty_bgs;
> diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
> index 39a107bdde06..a06f1e3c63b9 100644
> --- a/fs/btrfs/volumes.c
> +++ b/fs/btrfs/volumes.c
> @@ -335,6 +335,8 @@ void btrfs_free_device(struct btrfs_device *device)
>  {
>       WARN_ON(!list_empty(&device->post_commit_list));
>       rcu_string_free(device->name);
> +     if (!in_softirq())
> +             extent_io_tree_release(&device->alloc_state);
>       bio_put(device->flush_bio);
>       kfree(device);
>  }
> @@ -411,6 +413,7 @@ static struct btrfs_device *__alloc_device(void)
>       btrfs_device_data_ordered_init(dev);
>       INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
>       INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
> +     extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
>  
>       return dev;
>  }
> @@ -1269,6 +1272,9 @@ static void btrfs_close_one_device(struct btrfs_device *device)
>       if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
>               fs_devices->missing_devices--;
>  
> +     /* Remove alloc state now since it cannot be done in RCU context */
> +     extent_io_tree_release(&device->alloc_state);
> +
>       btrfs_close_bdev(device);
>  
>       new_device = btrfs_alloc_device(NULL, &device->devid,
> @@ -1505,58 +1511,29 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
>       return device;
>  }
>  
> -static int contains_pending_extent(struct btrfs_transaction *transaction,
> -                                struct btrfs_device *device,
> -                                u64 *start, u64 len)
> -{
> -     struct btrfs_fs_info *fs_info = device->fs_info;
> -     struct extent_map *em;
> -     struct list_head *search_list = &fs_info->pinned_chunks;
> -     int ret = 0;
> -     u64 physical_start = *start;
> -
> -     if (transaction)
> -             search_list = &transaction->pending_chunks;
> -again:
> -     list_for_each_entry(em, search_list, list) {
> -             struct map_lookup *map;
> -             int i;
> -
> -             map = em->map_lookup;
> -             for (i = 0; i < map->num_stripes; i++) {
> -                     u64 end;
> -
> -                     if (map->stripes[i].dev != device)
> -                             continue;
> -                     if (map->stripes[i].physical >= physical_start + len ||
> -                         map->stripes[i].physical + em->orig_block_len <=
> -                         physical_start)
> -                             continue;
> -                     /*
> -                      * Make sure that while processing the pinned list we do
> -                      * not override our *start with a lower value, because
> -                      * we can have pinned chunks that fall within this
> -                      * device hole and that have lower physical addresses
> -                      * than the pending chunks we processed before. If we
> -                      * do not take this special care we can end up getting
> -                      * 2 pending chunks that start at the same physical
> -                      * device offsets because the end offset of a pinned
> -                      * chunk can be equal to the start offset of some
> -                      * pending chunk.
> -                      */
> -                     end = map->stripes[i].physical + em->orig_block_len;
> -                     if (end > *start) {
> -                             *start = end;
> -                             ret = 1;
> -                     }
> +/*
> + * Tries to find a chunk that intersects [start, start +len] range and when one
> + * such is found, records the end of it in *start
> + */
> +#define in_range(b, first, len)        ((b) >= (first) && (b) < (first) + (len))
> +static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
> +                                 u64 len)
> +{
> +     u64 physical_start, physical_end;
> +     lockdep_assert_held(&device->fs_info->chunk_mutex);
> +
> +     if (!find_first_extent_bit(&device->alloc_state, *start,
> +                                &physical_start, &physical_end,
> +                                CHUNK_ALLOCATED, NULL)) {
> +
> +             if (in_range(physical_start, *start, len) ||
> +                 in_range(*start, physical_start,
> +                          physical_end - physical_start)) {
> +                     *start = physical_end + 1;
> +                     return true;
>               }
>       }
> -     if (search_list != &fs_info->pinned_chunks) {
> -             search_list = &fs_info->pinned_chunks;
> -             goto again;
> -     }
> -
> -     return ret;
> +     return false;
>  }
>  
>  
> @@ -1667,15 +1644,12 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
>                        * Have to check before we set max_hole_start, otherwise
>                        * we could end up sending back this offset anyway.
>                        */
> -                     if (contains_pending_extent(transaction, device,
> -                                                 &search_start,
> +                     if (contains_pending_extent(device, &search_start,
>                                                   hole_size)) {
> -                             if (key.offset >= search_start) {
> +                             if (key.offset >= search_start)
>                                       hole_size = key.offset - search_start;
> -                             } else {
> -                                     WARN_ON_ONCE(1);
> +                             else
>                                       hole_size = 0;
> -                             }
>                       }
>  
>                       if (hole_size > max_hole_size) {
> @@ -1716,8 +1690,7 @@ int find_free_dev_extent_start(struct btrfs_transaction *transaction,
>       if (search_end > search_start) {
>               hole_size = search_end - search_start;
>  
> -             if (contains_pending_extent(transaction, device, &search_start,
> -                                         hole_size)) {
> +             if (contains_pending_extent(device, &search_start, hole_size)) {
>                       btrfs_release_path(path);
>                       goto again;
>               }
> @@ -4759,7 +4732,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
>        * in-memory chunks are synced to disk so that the loop below sees them
>        * and relocates them accordingly.
>        */
> -     if (contains_pending_extent(trans->transaction, device, &start, diff)) {
> +     if (contains_pending_extent(device, &start, diff)) {
>               mutex_unlock(&fs_info->chunk_mutex);
>               ret = btrfs_commit_transaction(trans);
>               if (ret)
> @@ -5201,9 +5174,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
>               free_extent_map(em);
>               goto error;
>       }
> -
> -     list_add_tail(&em->list, &trans->transaction->pending_chunks);
> -     refcount_inc(&em->refs);
>       write_unlock(&em_tree->lock);
>  
>       ret = btrfs_make_block_group(trans, 0, type, start, chunk_size);
> @@ -5236,8 +5206,6 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
>       free_extent_map(em);
>       /* One for the tree reference */
>       free_extent_map(em);
> -     /* One for the pending_chunks list reference */
> -     free_extent_map(em);
>  error:
>       kfree(devices_info);
>       return ret;
> diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
> index a0f09aad3770..49dc737d8a54 100644
> --- a/fs/btrfs/volumes.h
> +++ b/fs/btrfs/volumes.h
> @@ -134,6 +134,8 @@ struct btrfs_device {
>       /* Counter to record the change of device stats */
>       atomic_t dev_stats_ccnt;
>       atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX];
> +
> +     struct extent_io_tree alloc_state;
>  };
>  
>  /*
> -- 
> 2.17.1

Reply via email to