Hi Miao,

On Wed, Jan 15, 2014 at 2:00 PM, Miao Xie <mi...@cn.fujitsu.com> wrote:
> When we mounted the filesystem after the crash, we got the following
> message:
>   BTRFS error (device xxx): block group 4315938816 has wrong amount of free 
> space
>   BTRFS error (device xxx): failed to load free space cache for block group 
> 4315938816
>
> It is because we didn't update the metadata of the allocated space until
> the file data was written into the disk. During this time, there was no
> information about the allocated spaces in either the extent tree nor the
> free space cache. when we wrote out the free space cache at this time, those
> spaces were lost.
Can you please clarify more about the problem?
So I understand that we allocate something from the free space cache.
So after that, the free space cache does not account for this extent
anymore. On the other hand the extent tree also does not account for
it (yet). We need to add a delayed reference, which will be run at
transaction commit and update the extent tree. But free-space cache is
also written out during transaction commit. So how the issue happens?
Can you perhaps post a flow of events?

Thanks.
Alex.


>
> In ordered to fix this problem, I use a state tree for every block group
> to record those allocated spaces. We record the information when they are
> allocated, and clean up the information after the metadata update. Besides
> that, we also introduce a read-write semaphore to avoid the race between
> the allocation and the free space cache write out.
>
> Only data block groups had this problem, so the above change is just
> for data space allocation.
>
> Signed-off-by: Miao Xie <mi...@cn.fujitsu.com>
> ---
>  fs/btrfs/ctree.h            | 15 ++++++++++++++-
>  fs/btrfs/disk-io.c          |  2 +-
>  fs/btrfs/extent-tree.c      | 24 ++++++++++++++++++++----
>  fs/btrfs/free-space-cache.c | 42 ++++++++++++++++++++++++++++++++++++++----
>  fs/btrfs/inode.c            | 42 +++++++++++++++++++++++++++++++++++-------
>  5 files changed, 108 insertions(+), 17 deletions(-)
>
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 1667c9a..f58e1f7 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -1244,6 +1244,12 @@ struct btrfs_block_group_cache {
>         /* free space cache stuff */
>         struct btrfs_free_space_ctl *free_space_ctl;
>
> +       /*
> +        * It is used to record the extents that are allocated for
> +        * the data, but don/t update its metadata.
> +        */
> +       struct extent_io_tree pinned_extents;
> +
>         /* block group cache stuff */
>         struct rb_node cache_node;
>
> @@ -1540,6 +1546,13 @@ struct btrfs_fs_info {
>          */
>         struct list_head space_info;
>
> +       /*
> +        * It is just used for the delayed data space allocation
> +        * because only the data space allocation can be done during
> +        * we write out the free space cache.
> +        */
> +       struct rw_semaphore data_rwsem;
> +
>         struct btrfs_space_info *data_sinfo;
>
>         struct reloc_control *reloc_ctl;
> @@ -3183,7 +3196,7 @@ int btrfs_alloc_logged_file_extent(struct 
> btrfs_trans_handle *trans,
>                                    struct btrfs_key *ins);
>  int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
>                          u64 min_alloc_size, u64 empty_size, u64 hint_byte,
> -                        struct btrfs_key *ins, int is_data);
> +                        struct btrfs_key *ins, int is_data, bool need_pin);
>  int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
>                   struct extent_buffer *buf, int full_backref, int for_cow);
>  int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 8072cfa..426b558 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2276,7 +2276,6 @@ int open_ctree(struct super_block *sb,
>         fs_info->pinned_extents = &fs_info->freed_extents[0];
>         fs_info->do_barriers = 1;
>
> -
>         mutex_init(&fs_info->ordered_operations_mutex);
>         mutex_init(&fs_info->ordered_extent_flush_mutex);
>         mutex_init(&fs_info->tree_log_mutex);
> @@ -2287,6 +2286,7 @@ int open_ctree(struct super_block *sb,
>         init_rwsem(&fs_info->extent_commit_sem);
>         init_rwsem(&fs_info->cleanup_work_sem);
>         init_rwsem(&fs_info->subvol_sem);
> +       init_rwsem(&fs_info->data_rwsem);
>         sema_init(&fs_info->uuid_tree_rescan_sem, 1);
>         fs_info->dev_replace.lock_owner = 0;
>         atomic_set(&fs_info->dev_replace.nesting_level, 0);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 3664cfb..7b07876 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -6173,7 +6173,7 @@ enum btrfs_loop_type {
>  static noinline int find_free_extent(struct btrfs_root *orig_root,
>                                      u64 num_bytes, u64 empty_size,
>                                      u64 hint_byte, struct btrfs_key *ins,
> -                                    u64 flags)
> +                                    u64 flags, bool need_pin)
>  {
>         int ret = 0;
>         struct btrfs_root *root = orig_root->fs_info->extent_root;
> @@ -6502,6 +6502,16 @@ checks:
>                 ins->objectid = search_start;
>                 ins->offset = num_bytes;
>
> +               if (need_pin) {
> +                       ASSERT(search_start >= block_group->key.objectid &&
> +                              search_start < block_group->key.objectid +
> +                                             block_group->key.offset);
> +                       set_extent_dirty(&block_group->pinned_extents,
> +                                        search_start,
> +                                        search_start + num_bytes - 1,
> +                                        GFP_NOFS);
> +               }
> +
>                 trace_btrfs_reserve_extent(orig_root, block_group,
>                                            search_start, num_bytes);
>                 btrfs_put_block_group(block_group);
> @@ -6614,17 +6624,20 @@ again:
>  int btrfs_reserve_extent(struct btrfs_root *root,
>                          u64 num_bytes, u64 min_alloc_size,
>                          u64 empty_size, u64 hint_byte,
> -                        struct btrfs_key *ins, int is_data)
> +                        struct btrfs_key *ins, int is_data, bool need_pin)
>  {
>         bool final_tried = false;
>         u64 flags;
>         int ret;
>
>         flags = btrfs_get_alloc_profile(root, is_data);
> +
> +       if (need_pin)
> +               down_read(&root->fs_info->data_rwsem);
>  again:
>         WARN_ON(num_bytes < root->sectorsize);
>         ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
> -                              flags);
> +                              flags, need_pin);
>
>         if (ret == -ENOSPC) {
>                 if (!final_tried && ins->offset) {
> @@ -6645,6 +6658,8 @@ again:
>                 }
>         }
>
> +       if (need_pin)
> +               up_read(&root->fs_info->data_rwsem);
>         return ret;
>  }
>
> @@ -7016,7 +7031,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct 
> btrfs_trans_handle *trans,
>                 return ERR_CAST(block_rsv);
>
>         ret = btrfs_reserve_extent(root, blocksize, blocksize,
> -                                  empty_size, hint, &ins, 0);
> +                                  empty_size, hint, &ins, 0, false);
>         if (ret) {
>                 unuse_block_rsv(root->fs_info, block_rsv, blocksize);
>                 return ERR_PTR(ret);
> @@ -8387,6 +8402,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, 
> u64 start, u64 size)
>         INIT_LIST_HEAD(&cache->cluster_list);
>         INIT_LIST_HEAD(&cache->new_bg_list);
>         btrfs_init_free_space_ctl(cache);
> +       extent_io_tree_init(&cache->pinned_extents, NULL);
>
>         return cache;
>  }
> diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
> index 057be95..486e12a3 100644
> --- a/fs/btrfs/free-space-cache.c
> +++ b/fs/btrfs/free-space-cache.c
> @@ -875,6 +875,7 @@ static int __btrfs_write_out_cache(struct btrfs_root 
> *root, struct inode *inode,
>         struct rb_node *node;
>         struct list_head *pos, *n;
>         struct extent_state *cached_state = NULL;
> +       struct extent_state *pinned_extent = NULL;
>         struct btrfs_free_cluster *cluster = NULL;
>         struct extent_io_tree *unpin = NULL;
>         struct io_ctl io_ctl;
> @@ -948,17 +949,17 @@ static int __btrfs_write_out_cache(struct btrfs_root 
> *root, struct inode *inode,
>          * so we don't leak the space
>          */
>
> +       if (!block_group)
> +               goto bitmap;
>         /*
>          * We shouldn't have switched the pinned extents yet so this is the
>          * right one
>          */
>         unpin = root->fs_info->pinned_extents;
>
> -       if (block_group)
> -               start = block_group->key.objectid;
> +       start = block_group->key.objectid;
>
> -       while (block_group && (start < block_group->key.objectid +
> -                              block_group->key.offset)) {
> +       while (start < block_group->key.objectid + block_group->key.offset) {
>                 ret = find_first_extent_bit(unpin, start,
>                                             &extent_start, &extent_end,
>                                             EXTENT_DIRTY, NULL);
> @@ -985,6 +986,33 @@ static int __btrfs_write_out_cache(struct btrfs_root 
> *root, struct inode *inode,
>                 start = extent_end;
>         }
>
> +       if (!(block_group->flags & BTRFS_BLOCK_GROUP_DATA))
> +               goto bitmap;
> +
> +       start = block_group->key.objectid;
> +       unpin = &block_group->pinned_extents;
> +       while (1) {
> +               ret = find_first_extent_bit(unpin, start,
> +                                           &extent_start, &extent_end,
> +                                           EXTENT_DIRTY, &pinned_extent);
> +               if (ret) {
> +                       ret = 0;
> +                       break;
> +               }
> +
> +               len = extent_end - extent_start + 1;
> +
> +               entries++;
> +               ret = io_ctl_add_entry(&io_ctl, extent_start, len, NULL);
> +               if (ret) {
> +                       free_extent_state(pinned_extent);
> +                       goto out_nospc;
> +               }
> +
> +               start = extent_end + 1;
> +       }
> +       free_extent_state(pinned_extent);
> +bitmap:
>         /* Write out the bitmaps */
>         list_for_each_safe(pos, n, &bitmap_list) {
>                 struct btrfs_free_space *entry =
> @@ -1097,6 +1125,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>         if (IS_ERR(inode))
>                 return 0;
>
> +       if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
> +               down_write(&root->fs_info->data_rwsem);
> +
>         ret = __btrfs_write_out_cache(root, inode, ctl, block_group, trans,
>                                       path, block_group->key.objectid);
>         if (ret) {
> @@ -1111,6 +1142,9 @@ int btrfs_write_out_cache(struct btrfs_root *root,
>  #endif
>         }
>
> +       if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
> +               up_write(&root->fs_info->data_rwsem);
> +
>         iput(inode);
>         return ret;
>  }
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index f1a7744..8172ca6 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -592,6 +592,28 @@ free_pages_out:
>         goto out;
>  }
>
> +static void btrfs_unpin_data_extent(struct btrfs_root *root, u64 start,
> +                                   u64 len)
> +{
> +       struct btrfs_block_group_cache *cache;
> +
> +       cache = btrfs_lookup_block_group(root->fs_info, start);
> +       BUG_ON(!cache);
> +       clear_extent_dirty(&cache->pinned_extents, start, start + len - 1,
> +                          GFP_NOFS);
> +       btrfs_put_block_group(cache);
> +}
> +
> +/* it is not used for free space cache file */
> +static void btrfs_free_reserved_data_extent(struct btrfs_root *root, u64 
> start,
> +                                           u64 len)
> +{
> +       down_read(&root->fs_info->data_rwsem);
> +       btrfs_unpin_data_extent(root, start, len);
> +       btrfs_free_reserved_extent(root, start, len);
> +       up_read(&root->fs_info->data_rwsem);
> +}
> +
>  /*
>   * phase two of compressed writeback.  This is the ordered portion
>   * of the code, which only gets called in the order the work was
> @@ -666,7 +688,7 @@ retry:
>                 ret = btrfs_reserve_extent(root,
>                                            async_extent->compressed_size,
>                                            async_extent->compressed_size,
> -                                          0, alloc_hint, &ins, 1);
> +                                          0, alloc_hint, &ins, 1, true);
>                 if (ret) {
>                         int i;
>
> @@ -767,7 +789,7 @@ retry:
>  out:
>         return ret;
>  out_free_reserve:
> -       btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
> +       btrfs_free_reserved_data_extent(root, ins.objectid, ins.offset);
>  out_free:
>         extent_clear_unlock_delalloc(inode, async_extent->start,
>                                      async_extent->start +
> @@ -889,7 +911,7 @@ static noinline int cow_file_range(struct inode *inode,
>                 cur_alloc_size = disk_num_bytes;
>                 ret = btrfs_reserve_extent(root, cur_alloc_size,
>                                            root->sectorsize, 0, alloc_hint,
> -                                          &ins, 1);
> +                                          &ins, 1, true);
>                 if (ret < 0)
>                         goto out_unlock;
>
> @@ -967,6 +989,7 @@ out:
>         return ret;
>
>  out_reserve:
> +       btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
>         btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>  out_unlock:
>         extent_clear_unlock_delalloc(inode, start, end, locked_page,
> @@ -2647,6 +2670,9 @@ static int btrfs_finish_ordered_io(struct 
> btrfs_ordered_extent *ordered_extent)
>                                                 logical_len, logical_len,
>                                                 compress_type, 0, 0,
>                                                 BTRFS_FILE_EXTENT_REG);
> +               BUG_ON(nolock);
> +               btrfs_unpin_data_extent(root, ordered_extent->start,
> +                                       ordered_extent->disk_len);
>         }
>         unpin_extent_cache(&BTRFS_I(inode)->extent_tree,
>                            ordered_extent->file_offset, ordered_extent->len,
> @@ -2698,8 +2724,9 @@ out:
>                 if ((ret || !logical_len) &&
>                     !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
>                     !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
> -                       btrfs_free_reserved_extent(root, 
> ordered_extent->start,
> -                                                  ordered_extent->disk_len);
> +                       btrfs_free_reserved_data_extent(root,
> +                                                       ordered_extent->start,
> +                                                       
> ordered_extent->disk_len);
>         }
>
>
> @@ -6342,7 +6369,7 @@ static struct extent_map 
> *btrfs_new_extent_direct(struct inode *inode,
>
>         alloc_hint = get_extent_allocation_hint(inode, start, len);
>         ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
> -                                  alloc_hint, &ins, 1);
> +                                  alloc_hint, &ins, 1, true);
>         if (ret)
>                 return ERR_PTR(ret);
>
> @@ -6356,6 +6383,7 @@ static struct extent_map 
> *btrfs_new_extent_direct(struct inode *inode,
>         ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
>                                            ins.offset, ins.offset, 0);
>         if (ret) {
> +               btrfs_unpin_data_extent(root, ins.objectid, ins.offset);
>                 btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
>                 free_extent_map(em);
>                 return ERR_PTR(ret);
> @@ -8507,7 +8535,7 @@ static int __btrfs_prealloc_file_range(struct inode 
> *inode, int mode,
>                 cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
>                 cur_bytes = max(cur_bytes, min_size);
>                 ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
> -                                          *alloc_hint, &ins, 1);
> +                                          *alloc_hint, &ins, 1, false);
>                 if (ret) {
>                         if (own_trans)
>                                 btrfs_end_transaction(trans, root);
> --
> 1.8.3.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to