On Mon, Aug 21, 2017 at 03:48:51PM +0300, Nikolay Borisov wrote:
> 
> 
> On 18.08.2017 00:23, jo...@toxicpanda.com wrote:
> > From: Josef Bacik <jba...@fb.com>
> > 
> > The way we handle delalloc metadata reservations has gotten
> > progressively more complicated over the years.  There is so much cruft
> > and weirdness around keeping the reserved count and outstanding counters
> > consistent and handling the error cases that it's impossible to
> > understand.
> > 
> > Fix this by making the delalloc block rsv per-inode.  This way we can
> > calculate the actual size of the outstanding metadata reservations every
> > time we make a change, and then reserve the delta based on that amount.
> > This greatly simplifies the code everywhere, and makes the error
> > handling in btrfs_delalloc_reserve_metadata far less terrifying.
> > 
> > Signed-off-by: Josef Bacik <jba...@fb.com>
> > ---
> >  fs/btrfs/btrfs_inode.h       |  29 ++--
> >  fs/btrfs/ctree.h             |   5 +-
> >  fs/btrfs/delayed-inode.c     |  46 +-----
> >  fs/btrfs/disk-io.c           |  18 ++-
> >  fs/btrfs/extent-tree.c       | 330 
> > +++++++++++++++----------------------------
> >  fs/btrfs/inode.c             |  18 ++-
> >  include/trace/events/btrfs.h |  22 ---
> >  7 files changed, 147 insertions(+), 321 deletions(-)
> > 
> > diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
> > index 2894ad6..71a4ded 100644
> > --- a/fs/btrfs/btrfs_inode.h
> > +++ b/fs/btrfs/btrfs_inode.h
> > @@ -36,14 +36,13 @@
> >  #define BTRFS_INODE_ORPHAN_META_RESERVED   1
> >  #define BTRFS_INODE_DUMMY                  2
> >  #define BTRFS_INODE_IN_DEFRAG                      3
> > -#define BTRFS_INODE_DELALLOC_META_RESERVED 4
> > -#define BTRFS_INODE_HAS_ORPHAN_ITEM                5
> > -#define BTRFS_INODE_HAS_ASYNC_EXTENT               6
> > -#define BTRFS_INODE_NEEDS_FULL_SYNC                7
> > -#define BTRFS_INODE_COPY_EVERYTHING                8
> > -#define BTRFS_INODE_IN_DELALLOC_LIST               9
> > -#define BTRFS_INODE_READDIO_NEED_LOCK              10
> > -#define BTRFS_INODE_HAS_PROPS                      11
> > +#define BTRFS_INODE_HAS_ORPHAN_ITEM                4
> > +#define BTRFS_INODE_HAS_ASYNC_EXTENT               5
> > +#define BTRFS_INODE_NEEDS_FULL_SYNC                6
> > +#define BTRFS_INODE_COPY_EVERYTHING                7
> > +#define BTRFS_INODE_IN_DELALLOC_LIST               8
> > +#define BTRFS_INODE_READDIO_NEED_LOCK              9
> > +#define BTRFS_INODE_HAS_PROPS                      10
> >  
> >  /* in memory btrfs inode */
> >  struct btrfs_inode {
> > @@ -176,7 +175,8 @@ struct btrfs_inode {
> >      * of extent items we've reserved metadata for.
> >      */
> >     unsigned outstanding_extents;
> > -   unsigned reserved_extents;
> > +
> > +   struct btrfs_block_rsv block_rsv;
> >  
> >     /*
> >      * Cached values if inode properties
> > @@ -278,17 +278,6 @@ static inline void 
> > btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
> >                                               mod);
> >  }
> >  
> > -static inline void btrfs_mod_reserved_extents(struct btrfs_inode *inode,
> > -                                         int mod)
> > -{
> > -   ASSERT(spin_is_locked(&inode->lock));
> > -   inode->reserved_extents += mod;
> > -   if (btrfs_is_free_space_inode(inode))
> > -           return;
> > -   trace_btrfs_inode_mod_reserved_extents(inode->root, btrfs_ino(inode),
> > -                                          mod);
> > -}
> > -
> >  static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 
> > generation)
> >  {
> >     int ret = 0;
> > diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> > index b43114d..a4ee115 100644
> > --- a/fs/btrfs/ctree.h
> > +++ b/fs/btrfs/ctree.h
> > @@ -763,8 +763,6 @@ struct btrfs_fs_info {
> >      * delayed dir index item
> >      */
> >     struct btrfs_block_rsv global_block_rsv;
> > -   /* block reservation for delay allocation */
> > -   struct btrfs_block_rsv delalloc_block_rsv;
> >     /* block reservation for metadata operations */
> >     struct btrfs_block_rsv trans_block_rsv;
> >     /* block reservation for chunk tree */
> > @@ -2744,6 +2742,9 @@ int btrfs_delalloc_reserve_space(struct inode *inode,
> >  void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short 
> > type);
> >  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info 
> > *fs_info,
> >                                           unsigned short type);
> > +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
> > +                              struct btrfs_block_rsv *rsv,
> > +                              unsigned short type);
> >  void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
> >                       struct btrfs_block_rsv *rsv);
> >  void __btrfs_free_block_rsv(struct btrfs_block_rsv *rsv);
> > diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
> > index 19e4ad2..5d73f79 100644
> > --- a/fs/btrfs/delayed-inode.c
> > +++ b/fs/btrfs/delayed-inode.c
> > @@ -581,7 +581,6 @@ static int btrfs_delayed_inode_reserve_metadata(
> >     struct btrfs_block_rsv *dst_rsv;
> >     u64 num_bytes;
> >     int ret;
> > -   bool release = false;
> >  
> >     src_rsv = trans->block_rsv;
> >     dst_rsv = &fs_info->delayed_block_rsv;
> > @@ -589,36 +588,13 @@ static int btrfs_delayed_inode_reserve_metadata(
> >     num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
> >  
> >     /*
> > -    * If our block_rsv is the delalloc block reserve then check and see if
> > -    * we have our extra reservation for updating the inode.  If not fall
> > -    * through and try to reserve space quickly.
> > -    *
> > -    * We used to try and steal from the delalloc block rsv or the global
> > -    * reserve, but we'd steal a full reservation, which isn't kind.  We are
> > -    * here through delalloc which means we've likely just cowed down close
> > -    * to the leaf that contains the inode, so we would steal less just
> > -    * doing the fallback inode update, so if we do end up having to steal
> > -    * from the global block rsv we hopefully only steal one or two blocks
> > -    * worth which is less likely to hurt us.
> > -    */
> > -   if (src_rsv && src_rsv->type == BTRFS_BLOCK_RSV_DELALLOC) {
> > -           spin_lock(&inode->lock);
> > -           if (test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
> > -                                  &inode->runtime_flags))
> > -                   release = true;
> > -           else
> > -                   src_rsv = NULL;
> > -           spin_unlock(&inode->lock);
> > -   }
> > -
> > -   /*
> >      * btrfs_dirty_inode will update the inode under btrfs_join_transaction
> >      * which doesn't reserve space for speed.  This is a problem since we
> >      * still need to reserve space for this update, so try to reserve the
> >      * space.
> >      *
> >      * Now if src_rsv == delalloc_block_rsv we'll let it just steal since
> > -    * we're accounted for.
> > +    * we always reserve enough to update the inode item.
> >      */
> >     if (!src_rsv || (!trans->bytes_reserved &&
> >                      src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) {
> > @@ -643,32 +619,12 @@ static int btrfs_delayed_inode_reserve_metadata(
> >     }
> >  
> >     ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, 1);
> > -
> > -   /*
> > -    * Migrate only takes a reservation, it doesn't touch the size of the
> > -    * block_rsv.  This is to simplify people who don't normally have things
> > -    * migrated from their block rsv.  If they go to release their
> > -    * reservation, that will decrease the size as well, so if migrate
> > -    * reduced size we'd end up with a negative size.  But for the
> > -    * delalloc_meta_reserved stuff we will only know to drop 1 reservation,
> > -    * but we could in fact do this reserve/migrate dance several times
> > -    * between the time we did the original reservation and we'd clean it
> > -    * up.  So to take care of this, release the space for the meta
> > -    * reservation here.  I think it may be time for a documentation page on
> > -    * how block rsvs. work.
> > -    */
> >     if (!ret) {
> >             trace_btrfs_space_reservation(fs_info, "delayed_inode",
> >                                           btrfs_ino(inode), num_bytes, 1);
> >             node->bytes_reserved = num_bytes;
> >     }
> >  
> > -   if (release) {
> > -           trace_btrfs_space_reservation(fs_info, "delalloc",
> > -                                         btrfs_ino(inode), num_bytes, 0);
> > -           btrfs_block_rsv_release(fs_info, src_rsv, num_bytes);
> > -   }
> > -
> >     return ret;
> >  }
> >  
> > diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> > index 9283e28..5681f8f 100644
> > --- a/fs/btrfs/disk-io.c
> > +++ b/fs/btrfs/disk-io.c
> > @@ -2607,14 +2607,6 @@ int open_ctree(struct super_block *sb,
> >             goto fail_delalloc_bytes;
> >     }
> >  
> > -   fs_info->btree_inode = new_inode(sb);
> > -   if (!fs_info->btree_inode) {
> > -           err = -ENOMEM;
> > -           goto fail_bio_counter;
> > -   }
> > -
> > -   mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
> > -
> >     INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
> >     INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
> >     INIT_LIST_HEAD(&fs_info->trans_list);
> > @@ -2647,8 +2639,6 @@ int open_ctree(struct super_block *sb,
> >     btrfs_mapping_init(&fs_info->mapping_tree);
> >     btrfs_init_block_rsv(&fs_info->global_block_rsv,
> >                          BTRFS_BLOCK_RSV_GLOBAL);
> > -   btrfs_init_block_rsv(&fs_info->delalloc_block_rsv,
> > -                        BTRFS_BLOCK_RSV_DELALLOC);
> >     btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
> >     btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK);
> >     btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
> > @@ -2679,6 +2669,14 @@ int open_ctree(struct super_block *sb,
> >  
> >     INIT_LIST_HEAD(&fs_info->ordered_roots);
> >     spin_lock_init(&fs_info->ordered_root_lock);
> > +
> > +   fs_info->btree_inode = new_inode(sb);
> > +   if (!fs_info->btree_inode) {
> > +           err = -ENOMEM;
> > +           goto fail_bio_counter;
> > +   }
> > +   mapping_set_gfp_mask(fs_info->btree_inode->i_mapping, GFP_NOFS);
> > +
> >     fs_info->delayed_root = kmalloc(sizeof(struct btrfs_delayed_root),
> >                                     GFP_KERNEL);
> >     if (!fs_info->delayed_root) {
> 
> This hunk seem to be a leftover from some other work, so you can drop it
> for v2. Of course the the removal of the call to btrfs_init_block_rsv
> should stay.
> 

No this is actually needed.  We do a __find_space_info from
btrfs_init_metadata_block_rsv() for the btree_inode, and if the fs_info isn't
properly zero'ed out it pukes on whatever fs_info->space_info is pointing at.

> > diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> > index a313487..8484e0f 100644
> > --- a/fs/btrfs/extent-tree.c
> > +++ b/fs/btrfs/extent-tree.c
> > @@ -4761,7 +4761,6 @@ static inline u64 calc_reclaim_items_nr(struct 
> > btrfs_fs_info *fs_info,
> >  static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim,
> >                         u64 orig, bool wait_ordered)
> >  {
> > -   struct btrfs_block_rsv *block_rsv;
> >     struct btrfs_space_info *space_info;
> >     struct btrfs_trans_handle *trans;
> >     u64 delalloc_bytes;
> > @@ -4777,8 +4776,7 @@ static void shrink_delalloc(struct btrfs_fs_info 
> > *fs_info, u64 to_reclaim,
> >     to_reclaim = items * EXTENT_SIZE_PER_ITEM;
> >  
> >     trans = (struct btrfs_trans_handle *)current->journal_info;
> > -   block_rsv = &fs_info->delalloc_block_rsv;
> > -   space_info = block_rsv->space_info;
> > +   space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
> >  
> >     delalloc_bytes = percpu_counter_sum_positive(
> >                                             &fs_info->delalloc_bytes);
> > @@ -5502,7 +5500,8 @@ static void space_info_add_new_bytes(struct 
> > btrfs_fs_info *fs_info,
> >  
> >  static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
> >                                 struct btrfs_block_rsv *block_rsv,
> > -                               struct btrfs_block_rsv *dest, u64 num_bytes)
> > +                               struct btrfs_block_rsv *dest, u64 num_bytes,
> > +                               u64 *released_bytes)
> >  {
> 
> Instead of adding yet another parameter, why not just return the value?
> That way callers can handle it/check on their own accord. It seems this
> is only ever used for debugging (tracepoint) purposes anyway
> 

Yeah that sounds reasonable, I'll do that.

> >     struct btrfs_space_info *space_info = block_rsv->space_info;
> >  
> > @@ -5520,6 +5519,8 @@ static void block_rsv_release_bytes(struct 
> > btrfs_fs_info *fs_info,
> >     spin_unlock(&block_rsv->lock);
> >  
> >     if (num_bytes > 0) {
> > +           if (released_bytes)
> > +                   *released_bytes = num_bytes;
> >             if (dest) {
> >                     spin_lock(&dest->lock);
> >                     if (!dest->full) {
> > @@ -5561,6 +5562,15 @@ void btrfs_init_block_rsv(struct btrfs_block_rsv 
> > *rsv, unsigned short type)
> >     rsv->type = type;
> >  }
> >  
> > +void btrfs_init_metadata_block_rsv(struct btrfs_fs_info *fs_info,
> > +                              struct btrfs_block_rsv *rsv,
> > +                              unsigned short type)
> > +{
> > +   btrfs_init_block_rsv(rsv, type);
> > +   rsv->space_info = __find_space_info(fs_info,
> > +                                       BTRFS_BLOCK_GROUP_METADATA);
> > +}
> > +
> >  struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_fs_info 
> > *fs_info,
> >                                           unsigned short type)
> >  {
> > @@ -5570,9 +5580,7 @@ struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct 
> > btrfs_fs_info *fs_info,
> >     if (!block_rsv)
> >             return NULL;
> >  
> > -   btrfs_init_block_rsv(block_rsv, type);
> > -   block_rsv->space_info = __find_space_info(fs_info,
> > -                                             BTRFS_BLOCK_GROUP_METADATA);
> > +   btrfs_init_metadata_block_rsv(fs_info, block_rsv, type);
> >     return block_rsv;
> >  }
> >  
> > @@ -5655,6 +5663,68 @@ int btrfs_block_rsv_refill(struct btrfs_root *root,
> >     return ret;
> >  }
> >  
> > +/**
> > + * btrfs_inode_rsv_refill - refill the inode block rsv.
> > + * @inode - the inode we are refilling.
> > + * @flush - the flusing restriction.
> > + *
> > + * Essentially the same as btrfs_block_rsv_refill, except it uses the
> > + * block_rsv->size as the minimum size.  We'll either refill the missing 
> > amount
> > + * or return if we already have enough space.  This will also handle the 
> > resreve
> > + * tracepoint for the reserved amount.
> > + */
> > +int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
> > +                      enum btrfs_reserve_flush_enum flush)
> > +{
> > +   struct btrfs_root *root = inode->root;
> > +   struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
> > +   u64 num_bytes = 0;
> > +   int ret = -ENOSPC;
> > +
> > +   spin_lock(&block_rsv->lock);
> > +   if (block_rsv->reserved < block_rsv->size)
> > +           num_bytes = block_rsv->size - block_rsv->reserved;
> > +   spin_unlock(&block_rsv->lock);
> > +
> > +   if (num_bytes == 0)
> > +           return 0;
> > +
> > +   ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
> > +   if (!ret) {
> > +           block_rsv_add_bytes(block_rsv, num_bytes, 0);
> > +           trace_btrfs_space_reservation(root->fs_info, "delalloc",
> > +                                         btrfs_ino(inode), num_bytes, 1);
> > +   }
> > +   return ret;
> > +}
> > +
> > +/**
> > + * btrfs_inode_rsv_release - release any excessive reservation.
> > + * @inode - the inode we need to release from.
> > + *
> > + * This is the same as btrfs_block_rsv_release, except that it handles the
> > + * tracepoint for the reservation.
> > + */
> > +void btrfs_inode_rsv_release(struct btrfs_inode *inode)
> > +{
> > +   struct btrfs_fs_info *fs_info = inode->root->fs_info;
> > +   struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
> > +   struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
> > +   u64 release_bytes = 0;
> > +
> > +   /*
> > +    * Since we statically set the block_rsv->size we just want to say we
> > +    * are releasing 0 bytes, and then we'll just get the reservation over
> > +    * the size free'd.
> > +    */
> > +   block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
> > +                           &release_bytes);
> > +   if (release_bytes > 0)
> > +           trace_btrfs_space_reservation(fs_info, "delalloc",
> > +                                         btrfs_ino(inode), release_bytes,
> > +                                         0);
> > +}
> > +
> >  void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
> >                          struct btrfs_block_rsv *block_rsv,
> >                          u64 num_bytes)
> > @@ -5664,7 +5734,8 @@ void btrfs_block_rsv_release(struct btrfs_fs_info 
> > *fs_info,
> >     if (global_rsv == block_rsv ||
> >         block_rsv->space_info != global_rsv->space_info)
> >             global_rsv = NULL;
> > -   block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes);
> > +   block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes,
> > +                           NULL);
> >  }
> >  
> >  static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
> > @@ -5726,7 +5797,6 @@ static void init_global_block_rsv(struct 
> > btrfs_fs_info *fs_info)
> >  
> >     space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
> >     fs_info->global_block_rsv.space_info = space_info;
> > -   fs_info->delalloc_block_rsv.space_info = space_info;
> >     fs_info->trans_block_rsv.space_info = space_info;
> >     fs_info->empty_block_rsv.space_info = space_info;
> >     fs_info->delayed_block_rsv.space_info = space_info;
> > @@ -5745,9 +5815,7 @@ static void init_global_block_rsv(struct 
> > btrfs_fs_info *fs_info)
> >  static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
> >  {
> >     block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
> > -                           (u64)-1);
> > -   WARN_ON(fs_info->delalloc_block_rsv.size > 0);
> > -   WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
> > +                           (u64)-1, NULL);
> >     WARN_ON(fs_info->trans_block_rsv.size > 0);
> >     WARN_ON(fs_info->trans_block_rsv.reserved > 0);
> >     WARN_ON(fs_info->chunk_block_rsv.size > 0);
> > @@ -5786,7 +5854,7 @@ void btrfs_trans_release_chunk_metadata(struct 
> > btrfs_trans_handle *trans)
> >     WARN_ON_ONCE(!list_empty(&trans->new_bgs));
> >  
> >     block_rsv_release_bytes(fs_info, &fs_info->chunk_block_rsv, NULL,
> > -                           trans->chunk_bytes_reserved);
> > +                           trans->chunk_bytes_reserved, NULL);
> >     trans->chunk_bytes_reserved = 0;
> >  }
> >  
> > @@ -5886,95 +5954,37 @@ void btrfs_subvolume_release_metadata(struct 
> > btrfs_fs_info *fs_info,
> >     btrfs_block_rsv_release(fs_info, rsv, (u64)-1);
> >  }
> >  
> > -/**
> > - * drop_over_reserved_extents - drop our extra extent reservations
> > - * @inode: the inode we're dropping the extent for
> > - *
> > - * We reserve extents we may use, but they may have been merged with other
> > - * extents and we may not need the extra reservation.
> > - *
> > - * We also call this when we've completed io to an extent or had an error 
> > and
> > - * cleared the outstanding extent, in either case we no longer need our
> > - * reservation and can drop the excess.
> > - */
> > -static unsigned drop_over_reserved_extents(struct btrfs_inode *inode)
> > -{
> > -   unsigned num_extents = 0;
> > -
> > -   if (inode->reserved_extents > inode->outstanding_extents) {
> > -           num_extents = inode->reserved_extents -
> > -                   inode->outstanding_extents;
> > -           btrfs_mod_reserved_extents(inode, -num_extents);
> > -   }
> > -
> > -   if (inode->outstanding_extents == 0 &&
> > -       test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
> > -                          &inode->runtime_flags))
> > -           num_extents++;
> > -   return num_extents;
> > -}
> > -
> > -/**
> > - * calc_csum_metadata_size - return the amount of metadata space that must 
> > be
> > - * reserved/freed for the given bytes.
> > - * @inode: the inode we're manipulating
> > - * @num_bytes: the number of bytes in question
> > - * @reserve: 1 if we are reserving space, 0 if we are freeing space
> > - *
> > - * This adjusts the number of csum_bytes in the inode and then returns the
> > - * correct amount of metadata that must either be reserved or freed.  We
> > - * calculate how many checksums we can fit into one leaf and then divide 
> > the
> > - * number of bytes that will need to be checksumed by this value to figure 
> > out
> > - * how many checksums will be required.  If we are adding bytes then the 
> > number
> > - * may go up and we will return the number of additional bytes that must be
> > - * reserved.  If it is going down we will return the number of bytes that 
> > must
> > - * be freed.
> > - *
> > - * This must be called with BTRFS_I(inode)->lock held.
> > - */
> > -static u64 calc_csum_metadata_size(struct btrfs_inode *inode, u64 
> > num_bytes,
> > -                              int reserve)
> > +static void btrfs_calculate_inode_block_rsv_size(struct btrfs_fs_info 
> > *fs_info,
> > +                                            struct btrfs_inode *inode)
> >  {
> > -   struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
> > -   u64 old_csums, num_csums;
> > +   struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
> > +   u64 reserve_size = 0;
> > +   u64 csum_leaves;
> > +   unsigned outstanding_extents;
> >  
> > -   if (inode->flags & BTRFS_INODE_NODATASUM && inode->csum_bytes == 0)
> > -           return 0;
> > +   ASSERT(spin_is_locked(&inode->lock));
> 
> How about using lockdep_is_held so that this is caught even if btrfs is
> compiled without assertions enabled ?
> 

Yup can do.

> > +   outstanding_extents = inode->outstanding_extents;
> > +   if (outstanding_extents)
> > +           reserve_size = btrfs_calc_trans_metadata_size(fs_info,
> > +                                           outstanding_extents + 1);
> > +   csum_leaves = btrfs_csum_bytes_to_leaves(fs_info,
> > +                                            inode->csum_bytes);
> > +   reserve_size += btrfs_calc_trans_metadata_size(fs_info,
> > +                                                  csum_leaves);
> >  
> > -   old_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
> > -   if (reserve)
> > -           inode->csum_bytes += num_bytes;
> > -   else
> > -           inode->csum_bytes -= num_bytes;
> > -   num_csums = btrfs_csum_bytes_to_leaves(fs_info, inode->csum_bytes);
> > -
> > -   /* No change, no need to reserve more */
> > -   if (old_csums == num_csums)
> > -           return 0;
> > -
> > -   if (reserve)
> > -           return btrfs_calc_trans_metadata_size(fs_info,
> > -                                                 num_csums - old_csums);
> > -
> > -   return btrfs_calc_trans_metadata_size(fs_info, old_csums - num_csums);
> > +   spin_lock(&block_rsv->lock);
> > +   block_rsv->size = reserve_size;
> > +   spin_unlock(&block_rsv->lock);
> >  }
> >  
> >  int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 
> > num_bytes)
> >  {
> >     struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
> >     struct btrfs_root *root = inode->root;
> > -   struct btrfs_block_rsv *block_rsv = &fs_info->delalloc_block_rsv;
> > -   u64 to_reserve = 0;
> > -   u64 csum_bytes;
> > -   unsigned nr_extents, reserve_extents;
> > +   unsigned nr_extents;
> >     enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
> >     int ret = 0;
> >     bool delalloc_lock = true;
> > -   u64 to_free = 0;
> > -   unsigned dropped;
> > -   bool release_extra = false;
> > -   bool underflow = false;
> > -   bool did_retry = false;
> >  
> >     WARN_ON(btrfs_is_free_space_inode(inode));
> >  
> > @@ -6001,33 +6011,13 @@ int btrfs_delalloc_reserve_metadata(struct 
> > btrfs_inode *inode, u64 num_bytes)
> >             mutex_lock(&inode->delalloc_mutex);
> >  
> >     num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
> > -retry:
> > +
> > +   /* Add our new extents and calculate the new rsv size. */
> >     spin_lock(&inode->lock);
> > -   reserve_extents = nr_extents = count_max_extents(num_bytes);
> > +   nr_extents = count_max_extents(num_bytes);
> >     btrfs_mod_outstanding_extents(inode, nr_extents);
> > -
> > -   /*
> > -    * Because we add an outstanding extent for ordered before we clear
> > -    * delalloc we will double count our outstanding extents slightly.  This
> > -    * could mean that we transiently over-reserve, which could result in an
> > -    * early ENOSPC if our timing is unlucky.  Keep track of the case that
> > -    * we had a reservation underflow so we can retry if we fail.
> > -    *
> > -    * Keep in mind we can legitimately have more outstanding extents than
> > -    * reserved because of fragmentation, so only allow a retry once.
> > -    */
> > -   if (inode->outstanding_extents >
> > -       inode->reserved_extents + nr_extents) {
> > -           reserve_extents = inode->outstanding_extents -
> > -                   inode->reserved_extents;
> > -           underflow = true;
> > -   }
> > -
> > -   /* We always want to reserve a slot for updating the inode. */
> > -   to_reserve = btrfs_calc_trans_metadata_size(fs_info,
> > -                                               reserve_extents + 1);
> > -   to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
> > -   csum_bytes = inode->csum_bytes;
> > +   inode->csum_bytes += num_bytes;
> > +   btrfs_calculate_inode_block_rsv_size(fs_info, inode);
> >     spin_unlock(&inode->lock);
> >  
> >     if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)) {
> > @@ -6037,100 +6027,26 @@ int btrfs_delalloc_reserve_metadata(struct 
> > btrfs_inode *inode, u64 num_bytes)
> >                     goto out_fail;
> >     }
> >  
> > -   ret = btrfs_block_rsv_add(root, block_rsv, to_reserve, flush);
> > +   ret = btrfs_inode_rsv_refill(inode, flush);
> >     if (unlikely(ret)) {
> >             btrfs_qgroup_free_meta(root,
> >                                    nr_extents * fs_info->nodesize);
> >             goto out_fail;
> >     }
> >  
> > -   spin_lock(&inode->lock);
> > -   if (test_and_set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
> > -                        &inode->runtime_flags)) {
> > -           to_reserve -= btrfs_calc_trans_metadata_size(fs_info, 1);
> > -           release_extra = true;
> > -   }
> > -   btrfs_mod_reserved_extents(inode, reserve_extents);
> > -   spin_unlock(&inode->lock);
> > -
> >     if (delalloc_lock)
> >             mutex_unlock(&inode->delalloc_mutex);
> > -
> > -   if (to_reserve)
> > -           trace_btrfs_space_reservation(fs_info, "delalloc",
> > -                                         btrfs_ino(inode), to_reserve, 1);
> > -   if (release_extra)
> > -           btrfs_block_rsv_release(fs_info, block_rsv,
> > -                           btrfs_calc_trans_metadata_size(fs_info, 1));
> >     return 0;
> >  
> >  out_fail:
> >     spin_lock(&inode->lock);
> >     nr_extents = count_max_extents(num_bytes);
> >     btrfs_mod_outstanding_extents(inode, -nr_extents);
> > -
> > -   dropped = drop_over_reserved_extents(inode);
> > -   /*
> > -    * If the inodes csum_bytes is the same as the original
> > -    * csum_bytes then we know we haven't raced with any free()ers
> > -    * so we can just reduce our inodes csum bytes and carry on.
> > -    */
> > -   if (inode->csum_bytes == csum_bytes) {
> > -           calc_csum_metadata_size(inode, num_bytes, 0);
> > -   } else {
> > -           u64 orig_csum_bytes = inode->csum_bytes;
> > -           u64 bytes;
> > -
> > -           /*
> > -            * This is tricky, but first we need to figure out how much we
> > -            * freed from any free-ers that occurred during this
> > -            * reservation, so we reset ->csum_bytes to the csum_bytes
> > -            * before we dropped our lock, and then call the free for the
> > -            * number of bytes that were freed while we were trying our
> > -            * reservation.
> > -            */
> > -           bytes = csum_bytes - inode->csum_bytes;
> > -           inode->csum_bytes = csum_bytes;
> > -           to_free = calc_csum_metadata_size(inode, bytes, 0);
> > -
> > -
> > -           /*
> > -            * Now we need to see how much we would have freed had we not
> > -            * been making this reservation and our ->csum_bytes were not
> > -            * artificially inflated.
> > -            */
> > -           inode->csum_bytes = csum_bytes - num_bytes;
> > -           bytes = csum_bytes - orig_csum_bytes;
> > -           bytes = calc_csum_metadata_size(inode, bytes, 0);
> > -
> > -           /*
> > -            * Now reset ->csum_bytes to what it should be.  If bytes is
> > -            * more than to_free then we would have freed more space had we
> > -            * not had an artificially high ->csum_bytes, so we need to free
> > -            * the remainder.  If bytes is the same or less then we don't
> > -            * need to do anything, the other free-ers did the correct
> > -            * thing.
> > -            */
> > -           inode->csum_bytes = orig_csum_bytes - num_bytes;
> > -           if (bytes > to_free)
> > -                   to_free = bytes - to_free;
> > -           else
> > -                   to_free = 0;
> > -   }
> > +   inode->csum_bytes -= num_bytes;
> > +   btrfs_calculate_inode_block_rsv_size(fs_info, inode);
> >     spin_unlock(&inode->lock);
> > -   if (dropped)
> > -           to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
> >  
> > -   if (to_free) {
> > -           btrfs_block_rsv_release(fs_info, block_rsv, to_free);
> > -           trace_btrfs_space_reservation(fs_info, "delalloc",
> > -                                         btrfs_ino(inode), to_free, 0);
> > -   }
> > -   if (underflow && !did_retry) {
> > -           did_retry = true;
> > -           underflow = false;
> > -           goto retry;
> > -   }
> > +   btrfs_inode_rsv_release(inode);
> >     if (delalloc_lock)
> >             mutex_unlock(&inode->delalloc_mutex);
> >     return ret;
> > @@ -6148,25 +6064,17 @@ int btrfs_delalloc_reserve_metadata(struct 
> > btrfs_inode *inode, u64 num_bytes)
> >  void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 
> > num_bytes)
> >  {
> >     struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
> > -   u64 to_free = 0;
> > -   unsigned dropped;
> >  
> >     num_bytes = ALIGN(num_bytes, fs_info->sectorsize);
> >     spin_lock(&inode->lock);
> > -   dropped = drop_over_reserved_extents(inode);
> > -   if (num_bytes)
> > -           to_free = calc_csum_metadata_size(inode, num_bytes, 0);
> > +   inode->csum_bytes -= num_bytes;
> > +   btrfs_calculate_inode_block_rsv_size(fs_info, inode);
> >     spin_unlock(&inode->lock);
> > -   if (dropped > 0)
> > -           to_free += btrfs_calc_trans_metadata_size(fs_info, dropped);
> >  
> >     if (btrfs_is_testing(fs_info))
> >             return;
> >  
> > -   trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
> > -                                 to_free, 0);
> > -
> > -   btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
> > +   btrfs_inode_rsv_release(inode);
> >  }
> >  
> >  /**
> > @@ -6184,25 +6092,17 @@ void btrfs_delalloc_release_extents(struct 
> > btrfs_inode *inode, u64 num_bytes)
> >  {
> >     struct btrfs_fs_info *fs_info = btrfs_sb(inode->vfs_inode.i_sb);
> >     unsigned num_extents;
> > -   u64 to_free;
> > -   unsigned dropped;
> >  
> >     spin_lock(&inode->lock);
> >     num_extents = count_max_extents(num_bytes);
> >     btrfs_mod_outstanding_extents(inode, -num_extents);
> > -   dropped = drop_over_reserved_extents(inode);
> > +   btrfs_calculate_inode_block_rsv_size(fs_info, inode);
> >     spin_unlock(&inode->lock);
> >  
> > -   if (!dropped)
> > -           return;
> > -
> >     if (btrfs_is_testing(fs_info))
> >             return;
> >  
> > -   to_free = btrfs_calc_trans_metadata_size(fs_info, dropped);
> > -   trace_btrfs_space_reservation(fs_info, "delalloc", btrfs_ino(inode),
> > -                                 to_free, 0);
> > -   btrfs_block_rsv_release(fs_info, &fs_info->delalloc_block_rsv, to_free);
> > +   btrfs_inode_rsv_release(inode);
> >  }
> >  
> >  /**
> > @@ -8427,7 +8327,7 @@ static void unuse_block_rsv(struct btrfs_fs_info 
> > *fs_info,
> >                         struct btrfs_block_rsv *block_rsv, u32 blocksize)
> >  {
> >     block_rsv_add_bytes(block_rsv, blocksize, 0);
> > -   block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
> > +   block_rsv_release_bytes(fs_info, block_rsv, NULL, 0, NULL);
> >  }
> >  
> >  /*
> > diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> > index 912417b..b0d1e94 100644
> > --- a/fs/btrfs/inode.c
> > +++ b/fs/btrfs/inode.c
> > @@ -42,6 +42,7 @@
> >  #include <linux/blkdev.h>
> >  #include <linux/posix_acl_xattr.h>
> >  #include <linux/uio.h>
> > +#include <linux/magic.h>
> >  #include "ctree.h"
> >  #include "disk-io.h"
> >  #include "transaction.h"
> > @@ -303,7 +304,7 @@ static noinline int cow_file_range_inline(struct 
> > btrfs_root *root,
> >             btrfs_free_path(path);
> >             return PTR_ERR(trans);
> >     }
> > -   trans->block_rsv = &fs_info->delalloc_block_rsv;
> > +   trans->block_rsv = &BTRFS_I(inode)->block_rsv;
> >  
> >     if (compressed_size && compressed_pages)
> >             extent_item_size = btrfs_file_extent_calc_inline_size(
> > @@ -2949,7 +2950,7 @@ static int btrfs_finish_ordered_io(struct 
> > btrfs_ordered_extent *ordered_extent)
> >                     trans = NULL;
> >                     goto out;
> >             }
> > -           trans->block_rsv = &fs_info->delalloc_block_rsv;
> > +           trans->block_rsv = &BTRFS_I(inode)->block_rsv;
> >             ret = btrfs_update_inode_fallback(trans, root, inode);
> >             if (ret) /* -ENOMEM or corruption */
> >                     btrfs_abort_transaction(trans, ret);
> > @@ -2985,7 +2986,7 @@ static int btrfs_finish_ordered_io(struct 
> > btrfs_ordered_extent *ordered_extent)
> >             goto out;
> >     }
> >  
> > -   trans->block_rsv = &fs_info->delalloc_block_rsv;
> > +   trans->block_rsv = &BTRFS_I(inode)->block_rsv;
> >  
> >     if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags))
> >             compress_type = ordered_extent->compress_type;
> > @@ -8834,7 +8835,6 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, 
> > struct iov_iter *iter)
> >     if (iov_iter_rw(iter) == WRITE) {
> >             up_read(&BTRFS_I(inode)->dio_sem);
> >             current->journal_info = NULL;
> > -           btrfs_delalloc_release_extents(BTRFS_I(inode), count);
> >             if (ret < 0 && ret != -EIOCBQUEUED) {
> >                     if (dio_data.reserve)
> >                             btrfs_delalloc_release_space(inode, 
> > data_reserved,
> > @@ -8855,6 +8855,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, 
> > struct iov_iter *iter)
> >             } else if (ret >= 0 && (size_t)ret < count)
> >                     btrfs_delalloc_release_space(inode, data_reserved,
> >                                     offset, count - (size_t)ret);
> > +           btrfs_delalloc_release_extents(BTRFS_I(inode), count);
> >     }
> >  out:
> >     if (wakeup)
> > @@ -9419,6 +9420,7 @@ int btrfs_create_subvol_root(struct 
> > btrfs_trans_handle *trans,
> >  
> >  struct inode *btrfs_alloc_inode(struct super_block *sb)
> >  {
> > +   struct btrfs_fs_info *fs_info = btrfs_sb(sb);
> >     struct btrfs_inode *ei;
> >     struct inode *inode;
> >  
> > @@ -9445,8 +9447,9 @@ struct inode *btrfs_alloc_inode(struct super_block 
> > *sb)
> >  
> >     spin_lock_init(&ei->lock);
> >     ei->outstanding_extents = 0;
> > -   ei->reserved_extents = 0;
> > -
> > +   if (sb->s_magic != BTRFS_TEST_MAGIC)
> > +           btrfs_init_metadata_block_rsv(fs_info, &ei->block_rsv,
> > +                                         BTRFS_BLOCK_RSV_DELALLOC);
> 
> Why the dependendance on BTRFS_TEST_MAGIC here?
> 

Because we allocate test inodes without a fs_info sometimes, this was the most
expedient way to deal with that.  Thanks,

Josef
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to