On Wed, Jan 15, 2014 at 08:00:58PM +0800, Miao Xie wrote:
> When we mounted the filesystem after the crash, we got the following
> message:
>   BTRFS error (device xxx): block group 4315938816 has wrong amount of free 
> space
>   BTRFS error (device xxx): failed to load free space cache for block group 
> 4315938816

The code itself is fine.
But I'm not sure if it's worth doing so, I mean those memory allocation and lock
acquire and release in our core endio path.

To fallback to rebuild space cache is fairly acceptable to me in the case of
crash.

-liubo

> 
> It is because we didn't update the metadata of the allocated space until
> the file data was written into the disk. During this time, there was no
> information about the allocated spaces in either the extent tree nor the
> free space cache. when we wrote out the free space cache at this time, those
> spaces were lost.
> 
> In ordered to fix this problem, I use a state tree for every block group
> to record those allocated spaces. We record the information when they are
> allocated, and clean up the information after the metadata update. Besides
> that, we also introduce a read-write semaphore to avoid the race between
> the allocation and the free space cache write out.
> 
> Only data block groups had this problem, so the above change is just
> for data space allocation.
> 
> Signed-off-by: Miao Xie <mi...@cn.fujitsu.com>
> ---
>  fs/btrfs/ctree.h            | 15 ++++++++++++++-
>  fs/btrfs/disk-io.c          |  2 +-
>  fs/btrfs/extent-tree.c      | 24 ++++++++++++++++++++----
>  fs/btrfs/free-space-cache.c | 42 ++++++++++++++++++++++++++++++++++++++----
>  fs/btrfs/inode.c            | 42 +++++++++++++++++++++++++++++++++++-------
>  5 files changed, 108 insertions(+), 17 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 1667c9a..f58e1f7 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1244,6 +1244,12 @@ struct btrfs_block_group_cache {
>       /* free space cache stuff */
>       struct btrfs_free_space_ctl *free_space_ctl;
>  
> +     /*
> +      * It is used to record the extents that are allocated for
> +      * the data, but don/t update its metadata.
> +      */
> +     struct extent_io_tree pinned_extents;
> +
>       /* block group cache stuff */
>       struct rb_node cache_node;
>  
> @@ -1540,6 +1546,13 @@ struct btrfs_fs_info {
>        */
>       struct list_head space_info;
>  
> +     /*
> +      * It is just used for the delayed data space allocation
> +      * because only the data space allocation can be done during
> +      * we write out the free space cache.
> +      */
> +     struct rw_semaphore data_rwsem;
> +
>       struct btrfs_space_info *data_sinfo;
>  
>       struct reloc_control *reloc_ctl;
> @@ -3183,7 +3196,7 @@ int btrfs_alloc_logged_file_extent(struct 
> btrfs_trans_handle *trans,
>                                  struct btrfs_key *ins);
>  int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
>                        u64 min_alloc_size, u64 empty_size, u64 hint_byte,
> -                      struct btrfs_key *ins, int is_data);
> +                      struct btrfs_key *ins, int is_data, bool need_pin);
>  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>                 struct extent_buffer *buf, int full_backref, int for_cow);
>  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 8072cfa..426b558 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2276,7 +2276,6 @@ int open_ctree(struct super_block *sb,
>       fs_info->pinned_extents = &fs_info->freed_extents[0];
>       fs_info->do_barriers = 1;
>  
> -
>       mutex_init(&fs_info->ordered_operations_mutex);
>       mutex_init(&fs_info->ordered_extent_flush_mutex);
>       mutex_init(&fs_info->tree_log_mutex);
> @@ -2287,6 +2286,7 @@ int open_ctree(struct super_block *sb,
>       init_rwsem(&fs_info->extent_commit_sem);
>       init_rwsem(&fs_info->cleanup_work_sem);
>       init_rwsem(&fs_info->subvol_sem);
> +     init_rwsem(&fs_info->data_rwsem);
>       sema_init(&fs_info->uuid_tree_rescan_sem, 1);
>       fs_info->dev_replace.lock_owner = 0;
>       atomic_set(&fs_info->dev_replace.nesting_level, 0);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 3664cfb..7b07876 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -6173,7 +6173,7 @@ enum btrfs_loop_type {
>  static noinline int find_free_extent(struct btrfs_root *orig_root,
>                                    u64 num_bytes, u64 empty_size,
>                                    u64 hint_byte, struct btrfs_key *ins,
> -                                  u64 flags)
> +                                  u64 flags, bool need_pin)
>  {
>       int ret = 0;
>       struct btrfs_root *root = orig_root->fs_info->extent_root;
> @@ -6502,6 +6502,16 @@ checks:
>               ins->objectid = search_start;
>               ins->offset = num_bytes;
>  
> +             if (need_pin) {
> +                     ASSERT(search_start >= block_group->key.objectid &&
> +                            search_start < block_group->key.objectid +
> +                                           block_group->key.offset);
> +                     set_extent_dirty(&block_group->pinned_extents,
> +                                      search_start,
> +                                      search_start + num_bytes - 1,
> +                                      GFP_NOFS);
> +             }
> +
>               trace_btrfs_reserve_extent(orig_root, block_group,
>                                          search_start, num_bytes);
>               btrfs_put_block_group(block_group);
> @@ -6614,17 +6624,20 @@ again:
>  int btrfs_reserve_extent(struct btrfs_root *root,
>                        u64 num_bytes, u64 min_alloc_size,
>                        u64 empty_size, u64 hint_byte,
> -                      struct btrfs_key *ins, int is_data)
> +                      struct btrfs_key *ins, int is_data, bool need_pin)
>  {
>       bool final_tried = false;
>       u64 flags;
>       int ret;
>  
>       flags = btrfs_get_alloc_profile(root, is_data);
> +
> +     if (need_pin)
> +             down_read(&root->fs_info->data_rwsem);
>  again:
>       WARN_ON(num_bytes < root->sectorsize);
>       ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
> -                            flags);
> +                            flags, need_pin);
>  
>       if (ret == -ENOSPC) {
>               if (!final_tried && ins->offset) {
> @@ -6645,6 +6658,8 @@ again:
>               }
>       }
>  
> +     if (need_pin)
> +             up_read(&root->fs_info->data_rwsem);
>       return ret;
>  }
>  
> @@ -7016,7 +7031,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct 
> btrfs_trans_handle *trans,
>               return ERR_CAST(block_rsv);
>  
>       ret = btrfs_reserve_extent(root, blocksize, blocksize,
> -                                empty_size, hint, &ins, 0);
> +                                empty_size, hint, &ins, 0, false);
>       if (ret) {
>               unuse_block_rsv(root->fs_info, block_rsv, blocksize);
>               return ERR_PTR(ret);
> @@ -8387,6 +8402,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, 
> u64 start, u64 size)
>       INIT_LIST_HEAD(&cache->cluster_list);
>       INIT_LIST_HEAD(&cache->new_bg_list);
>       btrfs_init_free_space_ctl(cache);
> +     extent_io_tree_init(&cache->pinned_extents, NULL);
>  
>       return cache;
>  }
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index 057be95..486e12a3 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -875,6 +875,7 @@ static int __btrfs_write_out_cache(struct btrfs_root 
> *root, struct inode *inode,
>       struct rb_node *node;
>       struct list_head *pos, *n;
>       struct extent_state *cached_state = NULL;
> +     struct extent_state *pinned_extent = NULL;
>       struct btrfs_free_cluster *cluster = NULL;
>       struct extent_io_tree *unpin = NULL;
>       struct io_ctl io_ctl;
> @@ -948,17 +949,17 @@ static int __btrfs_write_out_cache(struct btrfs_root 
> *root, struct inode *inode,
>        * so we don't leak the space
>        */
>  
> +     if (!block_group)
> +             goto bitmap;
>       /*
>        * We shouldn't have switched the pinned extents yet so this is the
>        * right one
>        */
>       unpin = root->fs_info->pinned_extents;
>  
> -     if (block_group)
> -             start = block_group->key.objectid;
> +     start = block_group->key.objectid;
>  
> -     while (block_group && (start < block_group->key.objectid +
> -                            block_group->key.offset)) {
> +     while (start < block_group->key.objectid + block_group->key.offset) {
>               ret = find_first_extent_bit(unpin, start,
>                                           &extent_start, &extent_end,
>                                           EXTENT_DIRTY, NULL);
> @@ -985,6 +986,33 @@ static int __btrfs_write_out_cache(struct btrfs_root 
> *root, struct inode *inode,
>               start = extent_end;
>       }
>  
> +     if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
> +             goto bitmap;
> +
> +     start = block_group->key.objectid;
> +     unpin = &block_group->pinned_extents;
> +     while (1) {
> +             ret = find_first_extent_bit(unpin, start,
> +                                         &extent_start, &extent_end,
> +                                         EXTENT_DIRTY, &pinned_extent);
> +             if (ret) {
> +                     ret = 0;
> +                     break;
> +             }
> +
> +             len = extent_end - extent_start + 1;
> +
> +             entries++;
> +             ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
> +             if (ret) {
> +                     free_extent_state(pinned_extent);
> +                     goto out_nospc;
> +             }
> +
> +             start = extent_end + 1;
> +     }
> +     free_extent_state(pinned_extent);
> +bitmap:
>       /* Write out the bitmaps */
>       list_for_each_safe(pos, n, &bitmap_list) {
>               struct btrfs_free_space *entry =
> @@ -1097,6 +1125,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>       if (IS_ERR(inode))
>               return 0;
>  
> +     if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
> +             down_write(&root->fs_info->data_rwsem);
> +
>       ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
>                                     path, block_group->key.objectid);
>       if (ret) {
> @@ -1111,6 +1142,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>  #endif
>       }
>  
> +     if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
> +             up_write(&root->fs_info->data_rwsem);
> +
>       iput(inode);
>       return ret;
>  }
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index f1a7744..8172ca6 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -592,6 +592,28 @@ free_pages_out:
>       goto out;
>  }
>  
> +static void btrfs_unpin_data_extent(struct btrfs_root *root, u64 start,
> +                                 u64 len)
> +{
> +     struct btrfs_block_group_cache *cache;
> +
> +     cache = btrfs_lookup_block_group(root->fs_info, start);
> +     BUG_ON(!cache);
> +     clear_extent_dirty(&cache->pinned_extents, start, start + len - 1,
> +                        GFP_NOFS);
> +     btrfs_put_block_group(cache);
> +}
> +
> +/* it is not used for free space cache file */
> +static void btrfs_free_reserved_data_extent(struct btrfs_root *root, u64 
> start,
> +                                         u64 len)
> +{
> +     down_read(&root->fs_info->data_rwsem);
> +     btrfs_unpin_data_extent(root, start, len);
> +     btrfs_free_reserved_extent(root, start, len);
> +     up_read(&root->fs_info->data_rwsem);
> +}
> +
>  /*
>   * phase two of compressed writeback.  This is the ordered portion
>   * of the code, which only gets called in the order the work was
> @@ -666,7 +688,7 @@ retry:
>               ret = btrfs_reserve_extent(root,
>                                          async_extent->compressed_size,
>                                          async_extent->compressed_size,
> -                                        0, alloc_hint, &ins, 1);
> +                                        0, alloc_hint, &ins, 1, true);
>               if (ret) {
>                       int i;
>  
> @@ -767,7 +789,7 @@ retry:
>  out:
>       return ret;
>  out_free_reserve:
> -     btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
> +     btrfs_free_reserved_data_extent(root, ins.objectid, ins.offset);
>  out_free:
>       extent_clear_unlock_delalloc(inode, async_extent->start,
>                                    async_extent->start +
> @@ -889,7 +911,7 @@ static noinline int cow_file_range(struct inode *inode,
>               cur_alloc_size = disk_num_bytes;
>               ret = btrfs_reserve_extent(root, cur_alloc_size,
>                                          root->sectorsize, 0, alloc_hint,
> -                                        &ins, 1);
> +                                        &ins, 1, true);
>               if (ret < 0)
>                       goto out_unlock;
>  
> @@ -967,6 +989,7 @@ out:
>       return ret;
>  
>  out_reserve:
> +     btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
>       btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>  out_unlock:
>       extent_clear_unlock_delalloc(inode, start, end, locked_page,
> @@ -2647,6 +2670,9 @@ static int btrfs_finish_ordered_io(struct 
> btrfs_ordered_extent *ordered_extent)
>                                               logical_len, logical_len,
>                                               compress_type, 0, 0,
>                                               BTRFS_FILE_EXTENT_REG);
> +             BUG_ON(nolock);
> +             btrfs_unpin_data_extent(root, ordered_extent->start,
> +                                     ordered_extent->disk_len);
>       }
>       unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
>                          ordered_extent->file_offset, ordered_extent->len,
> @@ -2698,8 +2724,9 @@ out:
>               if ((ret || !logical_len) &&
>                   !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
>                   !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
> -                     btrfs_free_reserved_extent(root, ordered_extent->start,
> -                                                ordered_extent->disk_len);
> +                     btrfs_free_reserved_data_extent(root,
> +                                                     ordered_extent->start,
> +                                                     
> ordered_extent->disk_len);
>       }
>  
>  
> @@ -6342,7 +6369,7 @@ static struct extent_map 
> *btrfs_new_extent_direct(struct inode *inode,
>  
>       alloc_hint = get_extent_allocation_hint(inode, start, len);
>       ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
> -                                alloc_hint, &ins, 1);
> +                                alloc_hint, &ins, 1, true);
>       if (ret)
>               return ERR_PTR(ret);
>  
> @@ -6356,6 +6383,7 @@ static struct extent_map 
> *btrfs_new_extent_direct(struct inode *inode,
>       ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
>                                          ins.offset, ins.offset, 0);
>       if (ret) {
> +             btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
>               btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>               free_extent_map(em);
>               return ERR_PTR(ret);
> @@ -8507,7 +8535,7 @@ static int __btrfs_prealloc_file_range(struct inode 
> *inode, int mode,
>               cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
>               cur_bytes = max(cur_bytes, min_size);
>               ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
> -                                        *alloc_hint, &ins, 1);
> +                                        *alloc_hint, &ins, 1, false);
>               if (ret) {
>                       if (own_trans)
>                               btrfs_end_transaction(trans, root);
> -- 
> 1.8.3.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to