On 28.09.2018 14:17, Josef Bacik wrote:
> From: Josef Bacik <jba...@fb.com>
> 
> Traditionally we've had voodoo in btrfs to account for the space that
> delayed refs may take up by having a global_block_rsv.  This works most
> of the time, except when it doesn't.  We've had issues reported and seen
> in production where sometimes the global reserve is exhausted during
> transaction commit before we can run all of our delayed refs, resulting
> in an aborted transaction.  Because of this voodoo we have equally
> dubious flushing semantics around throttling delayed refs which we often
> get wrong.
> 
> So instead give them their own block_rsv.  This way we can always know
> exactly how much outstanding space we need for delayed refs.  This
> allows us to make sure we are constantly filling that reservation up
> with space, and allows us to put more precise pressure on the enospc
> system.  Instead of doing math to see if it's a good time to throttle,
> the normal enospc code will be invoked if we have a lot of delayed refs
> pending, and they will be run via the normal flushing mechanism.
> 
> For now the delayed_refs_rsv will hold the reservations for the delayed
> refs, the block group updates, and deleting csums.  We could have a
> separate rsv for the block group updates, but the csum deletion stuff is
> still handled via the delayed_refs so that will stay there.
> 
> Signed-off-by: Josef Bacik <jba...@fb.com>
> ---
>  fs/btrfs/ctree.h             |  27 +++--
>  fs/btrfs/delayed-ref.c       |  28 ++++-
>  fs/btrfs/disk-io.c           |   4 +
>  fs/btrfs/extent-tree.c       | 279 +++++++++++++++++++++++++++++++++++--------
>  fs/btrfs/inode.c             |   2 +-
>  fs/btrfs/transaction.c       |  77 ++++++------
>  include/trace/events/btrfs.h |   2 +
>  7 files changed, 312 insertions(+), 107 deletions(-)
> 
> diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
> index 66f1d3895bca..1a2c3b629af2 100644
> --- a/fs/btrfs/ctree.h
> +++ b/fs/btrfs/ctree.h
> @@ -452,8 +452,9 @@ struct btrfs_space_info {
>  #define      BTRFS_BLOCK_RSV_TRANS           3
>  #define      BTRFS_BLOCK_RSV_CHUNK           4
>  #define      BTRFS_BLOCK_RSV_DELOPS          5
> -#define      BTRFS_BLOCK_RSV_EMPTY           6
> -#define      BTRFS_BLOCK_RSV_TEMP            7
> +#define BTRFS_BLOCK_RSV_DELREFS              6
> +#define      BTRFS_BLOCK_RSV_EMPTY           7
> +#define      BTRFS_BLOCK_RSV_TEMP            8
>  
>  struct btrfs_block_rsv {
>       u64 size;
> @@ -794,6 +795,8 @@ struct btrfs_fs_info {
>       struct btrfs_block_rsv chunk_block_rsv;
>       /* block reservation for delayed operations */
>       struct btrfs_block_rsv delayed_block_rsv;
> +     /* block reservation for delayed refs */
> +     struct btrfs_block_rsv delayed_refs_rsv;
>  
>       struct btrfs_block_rsv empty_block_rsv;
>  
> @@ -2608,8 +2611,7 @@ static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_fs_info *fs_info,
>  
>  int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
>                                      struct btrfs_fs_info *fs_info);
> -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
> -                                    struct btrfs_fs_info *fs_info);
> +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info);
>  void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
>                                        const u64 start);
>  void btrfs_wait_block_group_reservations(struct btrfs_block_group_cache *bg);
> @@ -2723,10 +2725,12 @@ enum btrfs_reserve_flush_enum {
>  enum btrfs_flush_state {
>       FLUSH_DELAYED_ITEMS_NR  =       1,
>       FLUSH_DELAYED_ITEMS     =       2,
> -     FLUSH_DELALLOC          =       3,
> -     FLUSH_DELALLOC_WAIT     =       4,
> -     ALLOC_CHUNK             =       5,
> -     COMMIT_TRANS            =       6,
> +     FLUSH_DELAYED_REFS_NR   =       3,
> +     FLUSH_DELAYED_REFS      =       4,
> +     FLUSH_DELALLOC          =       5,
> +     FLUSH_DELALLOC_WAIT     =       6,
> +     ALLOC_CHUNK             =       7,
> +     COMMIT_TRANS            =       8,
>  };
>  
>  int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes);
> @@ -2777,6 +2781,13 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
>  void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
>                            struct btrfs_block_rsv *block_rsv,
>                            u64 num_bytes);
> +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr);
> +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans);
> +int btrfs_throttle_delayed_refs(struct btrfs_fs_info *fs_info,
> +                             enum btrfs_reserve_flush_enum flush);
> +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
> +                                    struct btrfs_block_rsv *src,
> +                                    u64 num_bytes);
>  int btrfs_inc_block_group_ro(struct btrfs_block_group_cache *cache);
>  void btrfs_dec_block_group_ro(struct btrfs_block_group_cache *cache);
>  void btrfs_put_block_group_cache(struct btrfs_fs_info *info);
> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
> index 27f7dd4e3d52..96ce087747b2 100644
> --- a/fs/btrfs/delayed-ref.c
> +++ b/fs/btrfs/delayed-ref.c
> @@ -467,11 +467,14 @@ static int insert_delayed_ref(struct btrfs_trans_handle *trans,
>   * existing and update must have the same bytenr
>   */
>  static noinline void
> -update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
> +update_existing_head_ref(struct btrfs_trans_handle *trans,
>                        struct btrfs_delayed_ref_head *existing,
>                        struct btrfs_delayed_ref_head *update,
>                        int *old_ref_mod_ret)
>  {
> +     struct btrfs_delayed_ref_root *delayed_refs =
> +             &trans->transaction->delayed_refs;
> +     struct btrfs_fs_info *fs_info = trans->fs_info;
>       int old_ref_mod;
>  
>       BUG_ON(existing->is_data != update->is_data);
> @@ -529,10 +532,18 @@ update_existing_head_ref(struct btrfs_delayed_ref_root *delayed_refs,
>        * versa we need to make sure to adjust pending_csums accordingly.
>        */
>       if (existing->is_data) {
> -             if (existing->total_ref_mod >= 0 && old_ref_mod < 0)
> +             u64 csum_items =
> +                     btrfs_csum_bytes_to_leaves(fs_info,
> +                                                existing->num_bytes);
> +
> +             if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
>                       delayed_refs->pending_csums -= existing->num_bytes;
> -             if (existing->total_ref_mod < 0 && old_ref_mod >= 0)
> +                     btrfs_delayed_refs_rsv_release(fs_info, csum_items);
> +             }
> +             if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
>                       delayed_refs->pending_csums += existing->num_bytes;
> +                     trans->delayed_ref_updates += csum_items;
> +             }
>       }
>       spin_unlock(&existing->lock);
>  }
> @@ -638,7 +649,7 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
>                       && head_ref->qgroup_reserved
>                       && existing->qgroup_ref_root
>                       && existing->qgroup_reserved);
> -             update_existing_head_ref(delayed_refs, existing, head_ref,
> +             update_existing_head_ref(trans, existing, head_ref,
>                                        old_ref_mod);
>               /*
>                * we've updated the existing ref, free the newly
> @@ -649,8 +660,12 @@ add_delayed_ref_head(struct btrfs_trans_handle *trans,
>       } else {
>               if (old_ref_mod)
>                       *old_ref_mod = 0;
> -             if (head_ref->is_data && head_ref->ref_mod < 0)
> +             if (head_ref->is_data && head_ref->ref_mod < 0) {
>                       delayed_refs->pending_csums += head_ref->num_bytes;
> +                     trans->delayed_ref_updates +=
> +                             btrfs_csum_bytes_to_leaves(trans->fs_info,
> +                                                        head_ref->num_bytes);
> +             }
>               delayed_refs->num_heads++;
>               delayed_refs->num_heads_ready++;
>               atomic_inc(&delayed_refs->num_entries);
> @@ -785,6 +800,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
>  
>       ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
>       spin_unlock(&delayed_refs->lock);
> +     btrfs_update_delayed_refs_rsv(trans);

You haven't addressed my initial point about merging the modification of
delayed_ref_updates and the call to btrfs_update_delayed_refs_rsv into
one function; as it stands this seems error-prone. I don't see why this
can't be done, and if there is some reason I'm missing then please
explain it.

As it stands, this btrfs_update_delayed_refs_rsv call is paired with the
modifications made in one of the 2nd-level callees:

 btrfs_add_delayed_tree_ref
   add_delayed_ref_head
    update_existing_head_ref

I'd rather have btrfs_update_delayed_refs_rsv renamed to something with
'inc' in its name and called every time we modify delayed_ref_updates.
I'm willing to bet 50 bucks that in 6 months' time someone will change
delayed_ref_updates and forget to call btrfs_update_delayed_refs_rsv.
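
Something along these lines is what I have in mind (a rough, untested
sketch; the 'inc' name is made up):

static void btrfs_inc_delayed_refs_rsv(struct btrfs_trans_handle *trans,
				       int nr_items)
{
	struct btrfs_block_rsv *delayed_rsv = &trans->fs_info->delayed_refs_rsv;
	u64 num_bytes = btrfs_calc_trans_metadata_size(trans->fs_info,
						       nr_items);

	/* Account the new items and grow the rsv in a single step. */
	spin_lock(&delayed_rsv->lock);
	delayed_rsv->size += num_bytes;
	delayed_rsv->full = 0;
	spin_unlock(&delayed_rsv->lock);
}

Then e.g. update_existing_head_ref would do
btrfs_inc_delayed_refs_rsv(trans, csum_items) instead of open-coding
"trans->delayed_ref_updates += csum_items", and there would be no
separate call left to forget.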


WRT locking, in update_existing_head_ref we are guaranteed to hold
delayed_refs->lock, and the same is true in add_delayed_extent_op. The
only places where we don't hold it are the bg-related paths. But that's
easily solvable by simply breaking the function down into an internal
helper that does the actual work, with
lockdep_assert_held(&delayed_refs->lock) at the top, and a "public" API
which takes the lock and calls the helper, as sketched below. WRT
performance, you would not be putting that much extra code in the
critical section, i.e. just the check plus the arithmetic of
btrfs_calc_trans_metadata_size.
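
I.e. roughly the following (again untested; the __-prefixed helper name
is made up):

/* Internal helper, callers must hold delayed_refs->lock. */
static void __btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
	u64 num_bytes;

	lockdep_assert_held(&trans->transaction->delayed_refs.lock);

	if (!trans->delayed_ref_updates)
		return;

	num_bytes = btrfs_calc_trans_metadata_size(fs_info,
						   trans->delayed_ref_updates);
	spin_lock(&delayed_rsv->lock);
	delayed_rsv->size += num_bytes;
	delayed_rsv->full = 0;
	spin_unlock(&delayed_rsv->lock);
	trans->delayed_ref_updates = 0;
}

/* "Public" version for the bg-related paths which don't hold the lock. */
void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
{
	struct btrfs_delayed_ref_root *delayed_refs =
		&trans->transaction->delayed_refs;

	spin_lock(&delayed_refs->lock);
	__btrfs_update_delayed_refs_rsv(trans);
	spin_unlock(&delayed_refs->lock);
}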


>  
>       trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
>                                  action == BTRFS_ADD_DELAYED_EXTENT ?
> @@ -866,6 +882,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
>  
>       ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
>       spin_unlock(&delayed_refs->lock);
> +     btrfs_update_delayed_refs_rsv(trans);
>  
>       trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
>                                  action == BTRFS_ADD_DELAYED_EXTENT ?
> @@ -903,6 +920,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_fs_info *fs_info,
>                            NULL, NULL, NULL);
>  
>       spin_unlock(&delayed_refs->lock);
> +     btrfs_update_delayed_refs_rsv(trans);
>       return 0;
>  }
>  
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 5124c15705ce..377ad9c1cb17 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -2692,6 +2692,9 @@ int open_ctree(struct super_block *sb,
>       btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY);
>       btrfs_init_block_rsv(&fs_info->delayed_block_rsv,
>                            BTRFS_BLOCK_RSV_DELOPS);
> +     btrfs_init_block_rsv(&fs_info->delayed_refs_rsv,
> +                          BTRFS_BLOCK_RSV_DELREFS);
> +
>       atomic_set(&fs_info->async_delalloc_pages, 0);
>       atomic_set(&fs_info->defrag_running, 0);
>       atomic_set(&fs_info->qgroup_op_seq, 0);
> @@ -4419,6 +4422,7 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
>  
>               spin_unlock(&cur_trans->dirty_bgs_lock);
>               btrfs_put_block_group(cache);
> +             btrfs_delayed_refs_rsv_release(fs_info, 1);
>               spin_lock(&cur_trans->dirty_bgs_lock);
>       }
>       spin_unlock(&cur_trans->dirty_bgs_lock);
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index b32bd38390dd..1213f573eea2 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2481,6 +2481,7 @@ static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans,
>       struct btrfs_fs_info *fs_info = trans->fs_info;
>       struct btrfs_delayed_ref_root *delayed_refs =
>               &trans->transaction->delayed_refs;
> +     int nr_items = 1;
>  
>       if (head->total_ref_mod < 0) {
>               struct btrfs_space_info *space_info;
> @@ -2502,12 +2503,15 @@ static void cleanup_ref_head_accounting(struct btrfs_trans_handle *trans,
>                       spin_lock(&delayed_refs->lock);
>                       delayed_refs->pending_csums -= head->num_bytes;
>                       spin_unlock(&delayed_refs->lock);
> +                     nr_items += btrfs_csum_bytes_to_leaves(fs_info,
> +                             head->num_bytes);
>               }
>       }
>  
>       /* Also free its reserved qgroup space */
>       btrfs_qgroup_free_delayed_ref(fs_info, head->qgroup_ref_root,
>                                     head->qgroup_reserved);
> +     btrfs_delayed_refs_rsv_release(fs_info, nr_items);
>  }
>  
>  static int cleanup_ref_head(struct btrfs_trans_handle *trans,
> @@ -2802,40 +2806,22 @@ u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
>       return num_csums;
>  }
>  
> -int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans,
> -                                    struct btrfs_fs_info *fs_info)
> +bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
>  {
> -     struct btrfs_block_rsv *global_rsv;
> -     u64 num_heads = trans->transaction->delayed_refs.num_heads_ready;
> -     u64 csum_bytes = trans->transaction->delayed_refs.pending_csums;
> -     unsigned int num_dirty_bgs = trans->transaction->num_dirty_bgs;
> -     u64 num_bytes, num_dirty_bgs_bytes;
> -     int ret = 0;
> -
> -     num_bytes = btrfs_calc_trans_metadata_size(fs_info, 1);
> -     num_heads = heads_to_leaves(fs_info, num_heads);
> -     if (num_heads > 1)
> -             num_bytes += (num_heads - 1) * fs_info->nodesize;
> -     num_bytes <<= 1;
> -     num_bytes += btrfs_csum_bytes_to_leaves(fs_info, csum_bytes) *
> -                                                     fs_info->nodesize;
> -     num_dirty_bgs_bytes = btrfs_calc_trans_metadata_size(fs_info,
> -                                                          num_dirty_bgs);
> -     global_rsv = &fs_info->global_block_rsv;
> -
> -     /*
> -      * If we can't allocate any more chunks lets make sure we have _lots_ of
> -      * wiggle room since running delayed refs can create more delayed refs.
> -      */
> -     if (global_rsv->space_info->full) {
> -             num_dirty_bgs_bytes <<= 1;
> -             num_bytes <<= 1;
> -     }
> +     struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
> +     struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
> +     u64 reserved;
> +     bool ret = false;
>  
>       spin_lock(&global_rsv->lock);
> -     if (global_rsv->reserved <= num_bytes + num_dirty_bgs_bytes)
> -             ret = 1;
> +     reserved = global_rsv->reserved;
>       spin_unlock(&global_rsv->lock);
> +
> +     spin_lock(&delayed_refs_rsv->lock);
> +     reserved += delayed_refs_rsv->reserved;
> +     if (delayed_refs_rsv->size >= reserved)
> +             ret = true;
> +     spin_unlock(&delayed_refs_rsv->lock);
>       return ret;
>  }
>  
> @@ -2855,7 +2841,7 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans,
>       if (val >= NSEC_PER_SEC / 2)
>               return 2;
>  
> -     return btrfs_check_space_for_delayed_refs(trans, fs_info);
> +     return btrfs_check_space_for_delayed_refs(fs_info) ? 1 : 0;
>  }
>  
>  struct async_delayed_refs {
> @@ -3610,6 +3596,8 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
>        */
>       mutex_lock(&trans->transaction->cache_write_mutex);
>       while (!list_empty(&dirty)) {
> +             bool drop_reserve = true;
> +
>               cache = list_first_entry(&dirty,
>                                        struct btrfs_block_group_cache,
>                                        dirty_list);
> @@ -3682,6 +3670,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
>                                       list_add_tail(&cache->dirty_list,
>                                                     &cur_trans->dirty_bgs);
>                                       btrfs_get_block_group(cache);
> +                                     drop_reserve = false;
>                               }
>                               spin_unlock(&cur_trans->dirty_bgs_lock);
>                       } else if (ret) {
> @@ -3692,6 +3681,8 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans)
>               /* if its not on the io list, we need to put the block group */
>               if (should_put)
>                       btrfs_put_block_group(cache);
> +             if (drop_reserve)
> +                     btrfs_delayed_refs_rsv_release(fs_info, 1);
>  
>               if (ret)
>                       break;
> @@ -3840,6 +3831,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
>               /* if its not on the io list, we need to put the block group */
>               if (should_put)
>                       btrfs_put_block_group(cache);
> +             btrfs_delayed_refs_rsv_release(fs_info, 1);
>               spin_lock(&cur_trans->dirty_bgs_lock);
>       }
>       spin_unlock(&cur_trans->dirty_bgs_lock);
> @@ -4816,8 +4808,10 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
>  {
>       struct reserve_ticket *ticket = NULL;
>       struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv;
> +     struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
>       struct btrfs_trans_handle *trans;
>       u64 bytes;
> +     u64 reclaim_bytes = 0;
>  
>       trans = (struct btrfs_trans_handle *)current->journal_info;
>       if (trans)
> @@ -4850,12 +4844,16 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info,
>               return -ENOSPC;
>  
>       spin_lock(&delayed_rsv->lock);
> -     if (delayed_rsv->size > bytes)
> -             bytes = 0;
> -     else
> -             bytes -= delayed_rsv->size;
> +     reclaim_bytes += delayed_rsv->reserved;
>       spin_unlock(&delayed_rsv->lock);
>  
> +     spin_lock(&delayed_refs_rsv->lock);
> +     reclaim_bytes += delayed_refs_rsv->reserved;
> +     spin_unlock(&delayed_refs_rsv->lock);
> +     if (reclaim_bytes >= bytes)
> +             goto commit;
> +     bytes -= reclaim_bytes;
> +
>       if (__percpu_counter_compare(&space_info->total_bytes_pinned,
>                                  bytes,
>                                  BTRFS_TOTAL_BYTES_PINNED_BATCH) < 0) {
> @@ -4905,6 +4903,20 @@ static void flush_space(struct btrfs_fs_info *fs_info,
>               shrink_delalloc(fs_info, num_bytes * 2, num_bytes,
>                               state == FLUSH_DELALLOC_WAIT);
>               break;
> +     case FLUSH_DELAYED_REFS_NR:
> +     case FLUSH_DELAYED_REFS:
> +             trans = btrfs_join_transaction(root);
> +             if (IS_ERR(trans)) {
> +                     ret = PTR_ERR(trans);
> +                     break;
> +             }
> +             if (state == FLUSH_DELAYED_REFS_NR)
> +                     nr = calc_reclaim_items_nr(fs_info, num_bytes);
> +             else
> +                     nr = 0;
> +             btrfs_run_delayed_refs(trans, nr);
> +             btrfs_end_transaction(trans);
> +             break;
>       case ALLOC_CHUNK:
>               trans = btrfs_join_transaction(root);
>               if (IS_ERR(trans)) {
> @@ -5377,6 +5389,91 @@ int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
>       return 0;
>  }
>  
> +/**
> + * btrfs_migrate_to_delayed_refs_rsv - transfer bytes to our delayed refs rsv.
> + * @fs_info - the fs info for our fs.
> + * @src - the source block rsv to transfer from.
> + * @num_bytes - the number of bytes to transfer.
> + *
> + * This transfers up to the num_bytes amount from the src rsv to the
> + * delayed_refs_rsv.  Any extra bytes are returned to the space info.
> + */
> +void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
> +                                    struct btrfs_block_rsv *src,
> +                                    u64 num_bytes)
> +{
> +     struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
> +     u64 to_free = 0;
> +
> +     spin_lock(&src->lock);
> +     src->reserved -= num_bytes;
> +     src->size -= num_bytes;
> +     spin_unlock(&src->lock);
> +
> +     spin_lock(&delayed_refs_rsv->lock);
> +     if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
> +             u64 delta = delayed_refs_rsv->size -
> +                     delayed_refs_rsv->reserved;
> +             if (num_bytes > delta) {
> +                     to_free = num_bytes - delta;
> +                     num_bytes = delta;
> +             }
> +     } else {
> +             to_free = num_bytes;
> +             num_bytes = 0;
> +     }
> +
> +     if (num_bytes)
> +             delayed_refs_rsv->reserved += num_bytes;
> +     if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
> +             delayed_refs_rsv->full = 1;
> +     spin_unlock(&delayed_refs_rsv->lock);
> +
> +     if (num_bytes)
> +             trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
> +                                           0, num_bytes, 1);
> +     if (to_free)
> +             space_info_add_old_bytes(fs_info, delayed_refs_rsv->space_info,
> +                                      to_free);
> +}
> +
> +/**
> + * btrfs_throttle_delayed_refs - throttle based on our delayed refs usage.
> + * @fs_info - the fs_info for our fs.
> + * @flush - control how we can flush for this reservation.
> + *
> + * This will refill the delayed block_rsv up to 1 items size worth of space and
> + * will return -ENOSPC if we can't make the reservation.
> + */
> +int btrfs_throttle_delayed_refs(struct btrfs_fs_info *fs_info,
> +                             enum btrfs_reserve_flush_enum flush)
> +{
> +     struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
> +     u64 limit = btrfs_calc_trans_metadata_size(fs_info, 1);
> +     u64 num_bytes = 0;
> +     int ret = -ENOSPC;
> +
> +     spin_lock(&block_rsv->lock);
> +     if (block_rsv->reserved < block_rsv->size) {
> +             num_bytes = block_rsv->size - block_rsv->reserved;
> +             num_bytes = min(num_bytes, limit);
> +     }
> +     spin_unlock(&block_rsv->lock);
> +
> +     if (!num_bytes)
> +             return 0;
> +
> +     ret = reserve_metadata_bytes(fs_info->extent_root, block_rsv,
> +                                  num_bytes, flush);
> +     if (ret)
> +             return ret;
> +     block_rsv_add_bytes(block_rsv, num_bytes, 0);
> +     trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
> +                                   0, num_bytes, 1);
> +     return 0;
> +}
> +
> +
>  /*
>   * This is for space we already have accounted in space_info->bytes_may_use, 
> so
>   * basically when we're returning space from block_rsv's.
> @@ -5699,6 +5796,31 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
>       return ret;
>  }
>  
> +static u64 __btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
> +                                  struct btrfs_block_rsv *block_rsv,
> +                                  u64 num_bytes, u64 *qgroup_to_release)
> +{
> +     struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
> +     struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
> +     struct btrfs_block_rsv *target = delayed_rsv;
> +
> +     if (target->full || target == block_rsv)
> +             target = global_rsv;
> +
> +     if (block_rsv->space_info != target->space_info)
> +             target = NULL;
> +
> +     return block_rsv_release_bytes(fs_info, block_rsv, target, num_bytes,
> +                                    qgroup_to_release);
> +}
> +
> +void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
> +                          struct btrfs_block_rsv *block_rsv,
> +                          u64 num_bytes)
> +{
> +     __btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
> +}
> +
>  /**
>   * btrfs_inode_rsv_release - release any excessive reservation.
>   * @inode - the inode we need to release from.
> @@ -5713,7 +5835,6 @@ static int btrfs_inode_rsv_refill(struct btrfs_inode *inode,
>  static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
>  {
>       struct btrfs_fs_info *fs_info = inode->root->fs_info;
> -     struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
>       struct btrfs_block_rsv *block_rsv = &inode->block_rsv;
>       u64 released = 0;
>       u64 qgroup_to_release = 0;
> @@ -5723,8 +5844,8 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
>        * are releasing 0 bytes, and then we'll just get the reservation over
>        * the size free'd.
>        */
> -     released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv, 0,
> -                                        &qgroup_to_release);
> +     released = __btrfs_block_rsv_release(fs_info, block_rsv, 0,
> +                                          &qgroup_to_release);
>       if (released > 0)
>               trace_btrfs_space_reservation(fs_info, "delalloc",
>                                             btrfs_ino(inode), released, 0);
> @@ -5735,16 +5856,26 @@ static void btrfs_inode_rsv_release(struct btrfs_inode *inode, bool qgroup_free)
>                                                  qgroup_to_release);
>  }
>  
> -void btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,
> -                          struct btrfs_block_rsv *block_rsv,
> -                          u64 num_bytes)
> +/**
> + * btrfs_delayed_refs_rsv_release - release a ref head's reservation.
> + * @fs_info - the fs_info for our fs.
> + * @nr - the number of items to drop.
> + *
> + * This drops the delayed ref head's count from the delayed refs rsv and frees
> + * any excess reservation we had.
> + */
> +void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
>  {
> +     struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
>       struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
> +     u64 num_bytes = btrfs_calc_trans_metadata_size(fs_info, nr);
> +     u64 released = 0;
>  
> -     if (global_rsv == block_rsv ||
> -         block_rsv->space_info != global_rsv->space_info)
> -             global_rsv = NULL;
> -     block_rsv_release_bytes(fs_info, block_rsv, global_rsv, num_bytes, NULL);
> +     released = block_rsv_release_bytes(fs_info, block_rsv, global_rsv,
> +                                        num_bytes, NULL);
> +     if (released)
> +             trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
> +                                           0, released, 0);
>  }
>  
>  static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
> @@ -5809,9 +5940,10 @@ static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
>       fs_info->trans_block_rsv.space_info = space_info;
>       fs_info->empty_block_rsv.space_info = space_info;
>       fs_info->delayed_block_rsv.space_info = space_info;
> +     fs_info->delayed_refs_rsv.space_info = space_info;
>  
> -     fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
> -     fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
> +     fs_info->extent_root->block_rsv = &fs_info->delayed_refs_rsv;
> +     fs_info->csum_root->block_rsv = &fs_info->delayed_refs_rsv;
>       fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
>       fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
>       if (fs_info->quota_root)
> @@ -5831,8 +5963,34 @@ static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
>       WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
>       WARN_ON(fs_info->delayed_block_rsv.size > 0);
>       WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
> +     WARN_ON(fs_info->delayed_refs_rsv.reserved > 0);
> +     WARN_ON(fs_info->delayed_refs_rsv.size > 0);
>  }
>  
> +/*
> + * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
> + * @trans - the trans that may have generated delayed refs
> + *
> + * This is to be called anytime we may have adjusted trans->delayed_ref_updates,
> + * it'll calculate the additional size and add it to the delayed_refs_rsv.
> + */
> +void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
> +{
> +     struct btrfs_fs_info *fs_info = trans->fs_info;
> +     struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
> +     u64 num_bytes;
> +
> +     if (!trans->delayed_ref_updates)
> +             return;
> +
> +     num_bytes = btrfs_calc_trans_metadata_size(fs_info,
> +                                                trans->delayed_ref_updates);
> +     spin_lock(&delayed_rsv->lock);
> +     delayed_rsv->size += num_bytes;
> +     delayed_rsv->full = 0;
> +     spin_unlock(&delayed_rsv->lock);
> +     trans->delayed_ref_updates = 0;
> +}
>  
>  /*
>   * To be called after all the new block groups attached to the transaction
> @@ -6126,6 +6284,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
>       u64 old_val;
>       u64 byte_in_group;
>       int factor;
> +     int ret = 0;
>  
>       /* block accounting for super block */
>       spin_lock(&info->delalloc_root_lock);
> @@ -6139,8 +6298,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
>  
>       while (total) {
>               cache = btrfs_lookup_block_group(info, bytenr);
> -             if (!cache)
> -                     return -ENOENT;
> +             if (!cache) {
> +                     ret = -ENOENT;
> +                     break;
> +             }
>               factor = btrfs_bg_type_to_factor(cache->flags);
>  
>               /*
> @@ -6199,6 +6360,7 @@ static int update_block_group(struct btrfs_trans_handle *trans,
>                       list_add_tail(&cache->dirty_list,
>                                     &trans->transaction->dirty_bgs);
>                       trans->transaction->num_dirty_bgs++;
> +                     trans->delayed_ref_updates++;
>                       btrfs_get_block_group(cache);
>               }
>               spin_unlock(&trans->transaction->dirty_bgs_lock);
> @@ -6216,7 +6378,10 @@ static int update_block_group(struct btrfs_trans_handle *trans,
>               total -= num_bytes;
>               bytenr += num_bytes;
>       }
> -     return 0;
> +
> +     /* Modified block groups are accounted for in the delayed_refs_rsv. */
> +     btrfs_update_delayed_refs_rsv(trans);
> +     return ret;
>  }
>  
>  static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
> @@ -8230,7 +8395,12 @@ use_block_rsv(struct btrfs_trans_handle *trans,
>               goto again;
>       }
>  
> -     if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
> +     /*
> +      * The global reserve still exists to save us from ourselves, so don't
> +      * warn_on if we are short on our delayed refs reserve.
> +      */
> +     if (block_rsv->type != BTRFS_BLOCK_RSV_DELREFS &&
> +         btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
>               static DEFINE_RATELIMIT_STATE(_rs,
>                               DEFAULT_RATELIMIT_INTERVAL * 10,
>                               /*DEFAULT_RATELIMIT_BURST*/ 1);
> @@ -10146,6 +10316,7 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans)
>               add_block_group_free_space(trans, block_group);
>               /* already aborted the transaction if it failed. */
>  next:
> +             btrfs_delayed_refs_rsv_release(fs_info, 1);
>               list_del_init(&block_group->bg_list);
>       }
>       trans->can_flush_pending_bgs = can_flush_pending_bgs;
> @@ -10223,6 +10394,8 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
>       link_block_group(cache);
>  
>       list_add_tail(&cache->bg_list, &trans->new_bgs);
> +     trans->delayed_ref_updates++;
> +     btrfs_update_delayed_refs_rsv(trans);
>  
>       set_avail_alloc_bits(fs_info, type);
>       return 0;
> @@ -10260,6 +10433,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>       int factor;
>       struct btrfs_caching_control *caching_ctl = NULL;
>       bool remove_em;
> +     bool remove_rsv = false;
>  
>       block_group = btrfs_lookup_block_group(fs_info, group_start);
>       BUG_ON(!block_group);
> @@ -10324,6 +10498,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>  
>       if (!list_empty(&block_group->dirty_list)) {
>               list_del_init(&block_group->dirty_list);
> +             remove_rsv = true;
>               btrfs_put_block_group(block_group);
>       }
>       spin_unlock(&trans->transaction->dirty_bgs_lock);
> @@ -10533,6 +10708,8 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
>  
>       ret = btrfs_del_item(trans, root, path);
>  out:
> +     if (remove_rsv)
> +             btrfs_delayed_refs_rsv_release(fs_info, 1);
>       btrfs_free_path(path);
>       return ret;
>  }
> diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
> index 212fa71317d6..cd00ec869c96 100644
> --- a/fs/btrfs/inode.c
> +++ b/fs/btrfs/inode.c
> @@ -5382,7 +5382,7 @@ static struct btrfs_trans_handle *evict_refill_and_join(struct btrfs_root *root,
>                * Try to steal from the global reserve if there is space for
>                * it.
>                */
> -             if (!btrfs_check_space_for_delayed_refs(trans, fs_info) &&
> +             if (!btrfs_check_space_for_delayed_refs(fs_info) &&
>                   !btrfs_block_rsv_migrate(global_rsv, rsv, min_size, 0))
>                       return trans;
>  
> diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
> index 3b84f5015029..117e0c4a914a 100644
> --- a/fs/btrfs/transaction.c
> +++ b/fs/btrfs/transaction.c
> @@ -455,7 +455,7 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
>                 bool enforce_qgroups)
>  {
>       struct btrfs_fs_info *fs_info = root->fs_info;
> -
> +     struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
>       struct btrfs_trans_handle *h;
>       struct btrfs_transaction *cur_trans;
>       u64 num_bytes = 0;
> @@ -484,13 +484,28 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
>        * the appropriate flushing if need be.
>        */
>       if (num_items && root != fs_info->chunk_root) {
> +             struct btrfs_block_rsv *rsv = &fs_info->trans_block_rsv;
> +             u64 delayed_refs_bytes = 0;
> +
>               qgroup_reserved = num_items * fs_info->nodesize;
>               ret = btrfs_qgroup_reserve_meta_pertrans(root, qgroup_reserved,
>                               enforce_qgroups);
>               if (ret)
>                       return ERR_PTR(ret);
>  
> +             /*
> +              * We want to reserve all the bytes we may need all at once, so
> +              * we only do 1 enospc flushing cycle per transaction start.  We
> +              * accomplish this by simply assuming we'll do 2 x num_items
> +              * worth of delayed refs updates in this trans handle, and
> +              * refill that amount for whatever is missing in the reserve.
> +              */
>               num_bytes = btrfs_calc_trans_metadata_size(fs_info, num_items);
> +             if (delayed_refs_rsv->full == 0) {
> +                     delayed_refs_bytes = num_bytes;
> +                     num_bytes <<= 1;
> +             }
> +
>               /*
>                * Do the reservation for the relocation root creation
>                */
> @@ -499,8 +514,24 @@ start_transaction(struct btrfs_root *root, unsigned int num_items,
>                       reloc_reserved = true;
>               }
>  
> -             ret = btrfs_block_rsv_add(root, &fs_info->trans_block_rsv,
> -                                       num_bytes, flush);
> +             ret = btrfs_block_rsv_add(root, rsv, num_bytes, flush);
> +             if (ret)
> +                     goto reserve_fail;
> +             if (delayed_refs_bytes) {
> +                     btrfs_migrate_to_delayed_refs_rsv(fs_info, rsv,
> +                                                       delayed_refs_bytes);
> +                     num_bytes -= delayed_refs_bytes;
> +             }
> +     } else if (num_items == 0 && flush == BTRFS_RESERVE_FLUSH_ALL &&
> +                !delayed_refs_rsv->full) {
> +             /*
> +              * Some people call with btrfs_start_transaction(root, 0)
> +              * because they can be throttled, but have some other mechanism
> +              * for reserving space.  We still want these guys to refill the
> +              * delayed block_rsv so just add 1 items worth of reservation
> +              * here.
> +              */
> +             ret = btrfs_throttle_delayed_refs(fs_info, flush);
>               if (ret)
>                       goto reserve_fail;
>       }
> @@ -759,7 +790,7 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
>  {
>       struct btrfs_fs_info *fs_info = trans->fs_info;
>  
> -     if (btrfs_check_space_for_delayed_refs(trans, fs_info))
> +     if (btrfs_check_space_for_delayed_refs(fs_info))
>               return 1;
>  
>       return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
> @@ -768,22 +799,12 @@ static int should_end_transaction(struct btrfs_trans_handle *trans)
>  int btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
>  {
>       struct btrfs_transaction *cur_trans = trans->transaction;
> -     int updates;
> -     int err;
>  
>       smp_mb();
>       if (cur_trans->state >= TRANS_STATE_BLOCKED ||
>           cur_trans->delayed_refs.flushing)
>               return 1;
>  
> -     updates = trans->delayed_ref_updates;
> -     trans->delayed_ref_updates = 0;
> -     if (updates) {
> -             err = btrfs_run_delayed_refs(trans, updates * 2);
> -             if (err) /* Error code will also eval true */
> -                     return err;
> -     }
> -
>       return should_end_transaction(trans);
>  }
>  
> @@ -813,11 +834,8 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
>  {
>       struct btrfs_fs_info *info = trans->fs_info;
>       struct btrfs_transaction *cur_trans = trans->transaction;
> -     u64 transid = trans->transid;
> -     unsigned long cur = trans->delayed_ref_updates;
>       int lock = (trans->type != TRANS_JOIN_NOLOCK);
>       int err = 0;
> -     int must_run_delayed_refs = 0;
>  
>       if (refcount_read(&trans->use_count) > 1) {
>               refcount_dec(&trans->use_count);
> @@ -828,27 +846,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
>       btrfs_trans_release_metadata(trans);
>       trans->block_rsv = NULL;
>  
> -     if (!list_empty(&trans->new_bgs))
> -             btrfs_create_pending_block_groups(trans);
> -
> -     trans->delayed_ref_updates = 0;
> -     if (!trans->sync) {
> -             must_run_delayed_refs =
> -                     btrfs_should_throttle_delayed_refs(trans, info);
> -             cur = max_t(unsigned long, cur, 32);
> -
> -             /*
> -              * don't make the caller wait if they are from a NOLOCK
> -              * or ATTACH transaction, it will deadlock with commit
> -              */
> -             if (must_run_delayed_refs == 1 &&
> -                 (trans->type & (__TRANS_JOIN_NOLOCK | __TRANS_ATTACH)))
> -                     must_run_delayed_refs = 2;
> -     }
> -
> -     btrfs_trans_release_metadata(trans);
> -     trans->block_rsv = NULL;
> -
>       if (!list_empty(&trans->new_bgs))
>               btrfs_create_pending_block_groups(trans);
>  
> @@ -893,10 +890,6 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
>       }
>  
>       kmem_cache_free(btrfs_trans_handle_cachep, trans);
> -     if (must_run_delayed_refs) {
> -             btrfs_async_run_delayed_refs(info, cur, transid,
> -                                          must_run_delayed_refs == 1);
> -     }
>       return err;
>  }
>  
> diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
> index b401c4e36394..7d205e50b09c 100644
> --- a/include/trace/events/btrfs.h
> +++ b/include/trace/events/btrfs.h
> @@ -1048,6 +1048,8 @@ TRACE_EVENT(btrfs_trigger_flush,
>               { FLUSH_DELAYED_ITEMS,          "FLUSH_DELAYED_ITEMS"},         \
>               { FLUSH_DELALLOC,               "FLUSH_DELALLOC"},              \
>               { FLUSH_DELALLOC_WAIT,          "FLUSH_DELALLOC_WAIT"},         \
> +             { FLUSH_DELAYED_REFS_NR,        "FLUSH_DELAYED_REFS_NR"},       \
> +             { FLUSH_DELAYED_REFS,           "FLUSH_DELAYED_REFS"},          \
>               { ALLOC_CHUNK,                  "ALLOC_CHUNK"},                 \
>               { COMMIT_TRANS,                 "COMMIT_TRANS"})
>  
> 
