Hi Josef,

Are you fine with V2?

On Thu, Oct 13, 2016 at 05:31:25PM +0800, Wang Xiaoguang wrote:
> Since commit b02441999efcc6152b87cd58e7970bb7843f76cf, we no longer wait for
> all ordered extents. However, I run into ENOSPC errors when doing large file
> create and delete tests, because shrink_delalloc() does not write enough
> delalloc bytes and wait for them to finish. That commit is:
>     From: Miao Xie <mi...@cn.fujitsu.com>
>     Date: Mon, 4 Nov 2013 23:13:25 +0800
>     Subject: [PATCH] Btrfs: don't wait for the completion of all the ordered extents
> 
>     It is very likely that there are lots of ordered extents in the filesystem,
>     if we wait for the completion of all of them when we want to reclaim some
>     space for the metadata space reservation, we would be blocked for a long
>     time. The performance would drop down suddenly for a long time.
> 
> Here we introduce a simple reclaim_priority variable: the lower the
> value, the higher the priority, with 0 being the maximum priority. The core
> idea is:
>     delalloc_bytes = percpu_counter_sum_positive(&root->fs_info->delalloc_bytes);
>     if (reclaim_priority)
>         to_reclaim = orig * (2 << (BTRFS_DEFAULT_FLUSH_PRIORITY - reclaim_priority));
>     else
>         to_reclaim = delalloc_bytes;
> 
> Here 'orig' is the amount of metadata we want to reserve. As the priority
> increases (i.e. as reclaim_priority decreases), we will try to write more
> delalloc bytes; when reclaim_priority == 0, we'll also wait for all current
> ordered extents to finish.
> 
> Signed-off-by: Wang Xiaoguang <wangxg.f...@cn.fujitsu.com>
> ---
>  fs/btrfs/extent-tree.c       | 63 
> ++++++++++++++++++++++++++------------------
>  include/trace/events/btrfs.h | 11 +++-----
>  2 files changed, 42 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index e08791d..7477c25 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -4686,16 +4686,18 @@ static inline int calc_reclaim_items_nr(struct 
> btrfs_root *root, u64 to_reclaim)
>  }
>  
>  #define EXTENT_SIZE_PER_ITEM SZ_256K
> +#define      BTRFS_DEFAULT_FLUSH_PRIORITY    3
>  
>  /*
>   * shrink metadata reservation for delalloc
>   */
> -static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 
> orig,
> -                         bool wait_ordered)
> +static void shrink_delalloc(struct btrfs_root *root, u64 orig,
> +                         bool wait_ordered, int reclaim_priority)
>  {
>       struct btrfs_block_rsv *block_rsv;
>       struct btrfs_space_info *space_info;
>       struct btrfs_trans_handle *trans;
> +     u64 to_reclaim;
>       u64 delalloc_bytes;
>       u64 max_reclaim;
>       long time_left;
> @@ -4703,22 +4705,36 @@ static void shrink_delalloc(struct btrfs_root *root, 
> u64 to_reclaim, u64 orig,
>       int loops;
>       int items;
>       enum btrfs_reserve_flush_enum flush;
> +     int items_to_wait;
> +
> +     delalloc_bytes = percpu_counter_sum_positive(
> +                             &root->fs_info->delalloc_bytes);
> +     if (reclaim_priority < 0)
> +             reclaim_priority = 0;
> +
> +     if (reclaim_priority)
> +             to_reclaim = orig * (2 << (BTRFS_DEFAULT_FLUSH_PRIORITY -
> +                                     reclaim_priority));
> +     else
> +             to_reclaim = delalloc_bytes;
>  
>       /* Calc the number of the pages we need flush for space reservation */
>       items = calc_reclaim_items_nr(root, to_reclaim);
>       to_reclaim = (u64)items * EXTENT_SIZE_PER_ITEM;
> +     if (reclaim_priority)
> +             items_to_wait = items;
> +     else
> +             items_to_wait = -1;
>  
>       trans = (struct btrfs_trans_handle *)current->journal_info;
>       block_rsv = &root->fs_info->delalloc_block_rsv;
>       space_info = block_rsv->space_info;
>  
> -     delalloc_bytes = percpu_counter_sum_positive(
> -                                             &root->fs_info->delalloc_bytes);
>       if (delalloc_bytes == 0) {
>               if (trans)
>                       return;
>               if (wait_ordered)
> -                     btrfs_wait_ordered_roots(root->fs_info, items,
> +                     btrfs_wait_ordered_roots(root->fs_info, items_to_wait,
>                                                0, (u64)-1);
>               return;
>       }
> @@ -4763,7 +4779,7 @@ static void shrink_delalloc(struct btrfs_root *root, 
> u64 to_reclaim, u64 orig,
>  
>               loops++;
>               if (wait_ordered && !trans) {
> -                     btrfs_wait_ordered_roots(root->fs_info, items,
> +                     btrfs_wait_ordered_roots(root->fs_info, items_to_wait,
>                                                0, (u64)-1);
>               } else {
>                       time_left = schedule_timeout_killable(1);
> @@ -4836,7 +4852,7 @@ struct reserve_ticket {
>  
>  static int flush_space(struct btrfs_root *root,
>                      struct btrfs_space_info *space_info, u64 num_bytes,
> -                    u64 orig_bytes, int state)
> +                    int state, int reclaim_priority)
>  {
>       struct btrfs_trans_handle *trans;
>       int nr;
> @@ -4860,8 +4876,8 @@ static int flush_space(struct btrfs_root *root,
>               break;
>       case FLUSH_DELALLOC:
>       case FLUSH_DELALLOC_WAIT:
> -             shrink_delalloc(root, num_bytes * 2, orig_bytes,
> -                             state == FLUSH_DELALLOC_WAIT);
> +             shrink_delalloc(root, num_bytes, state == FLUSH_DELALLOC_WAIT,
> +                             reclaim_priority);
>               break;
>       case ALLOC_CHUNK:
>               trans = btrfs_join_transaction(root);
> @@ -4877,7 +4893,7 @@ static int flush_space(struct btrfs_root *root,
>                       ret = 0;
>               break;
>       case COMMIT_TRANS:
> -             ret = may_commit_transaction(root, space_info, orig_bytes, 0);
> +             ret = may_commit_transaction(root, space_info, num_bytes, 0);
>               break;
>       default:
>               ret = -ENOSPC;
> @@ -4885,7 +4901,7 @@ static int flush_space(struct btrfs_root *root,
>       }
>  
>       trace_btrfs_flush_space(root->fs_info, space_info->flags, num_bytes,
> -                             orig_bytes, state, ret);
> +                             state, ret);
>       return ret;
>  }
>  
> @@ -5008,8 +5024,8 @@ static void btrfs_async_reclaim_metadata_space(struct 
> work_struct *work)
>       struct btrfs_space_info *space_info;
>       u64 to_reclaim;
>       int flush_state;
> -     int commit_cycles = 0;
>       u64 last_tickets_id;
> +     int reclaim_priority = BTRFS_DEFAULT_FLUSH_PRIORITY;
>  
>       fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work);
>       space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
> @@ -5030,8 +5046,11 @@ static void btrfs_async_reclaim_metadata_space(struct 
> work_struct *work)
>               struct reserve_ticket *ticket;
>               int ret;
>  
> +             if (flush_state > COMMIT_TRANS)
> +                     flush_state = FLUSH_DELAYED_ITEMS_NR;
>               ret = flush_space(fs_info->fs_root, space_info, to_reclaim,
> -                         to_reclaim, flush_state);
> +                               flush_state, reclaim_priority);
> +
>               spin_lock(&space_info->lock);
>               if (!ret)
>                       try_to_wake_tickets(fs_info->fs_root, space_info);
> @@ -5049,21 +5068,15 @@ static void btrfs_async_reclaim_metadata_space(struct 
> work_struct *work)
>               } else {
>                       last_tickets_id = space_info->tickets_id;
>                       flush_state = FLUSH_DELAYED_ITEMS_NR;
> -                     if (commit_cycles)
> -                             commit_cycles--;
> +                     reclaim_priority = BTRFS_DEFAULT_FLUSH_PRIORITY;
>               }
>  
> -             if (flush_state > COMMIT_TRANS) {
> -                     commit_cycles++;
> -                     if (commit_cycles > 2) {
> -                             wake_all_tickets(&space_info->tickets);
> -                             space_info->flush = 0;
> -                     } else {
> -                             flush_state = FLUSH_DELAYED_ITEMS_NR;
> -                     }
> +             if (flush_state > COMMIT_TRANS && reclaim_priority == 0) {
> +                     wake_all_tickets(&space_info->tickets);
> +                     space_info->flush = 0;
>               }
>               spin_unlock(&space_info->lock);
> -     } while (flush_state <= COMMIT_TRANS);
> +     } while ((flush_state <= COMMIT_TRANS) || (--reclaim_priority >= 0));
>  }
>  
>  void btrfs_init_async_reclaim_work(struct work_struct *work)
> @@ -5089,7 +5102,7 @@ static void priority_reclaim_metadata_space(struct 
> btrfs_fs_info *fs_info,
>  
>       do {
>               flush_space(fs_info->fs_root, space_info, to_reclaim,
> -                         to_reclaim, flush_state);
> +                         flush_state, 1);
>               flush_state++;
>               spin_lock(&space_info->lock);
>               if (ticket->bytes == 0) {
> diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h
> index e030d6f..7d953a6 100644
> --- a/include/trace/events/btrfs.h
> +++ b/include/trace/events/btrfs.h
> @@ -857,15 +857,14 @@ TRACE_EVENT(btrfs_trigger_flush,
>  TRACE_EVENT(btrfs_flush_space,
>  
>       TP_PROTO(struct btrfs_fs_info *fs_info, u64 flags, u64 num_bytes,
> -              u64 orig_bytes, int state, int ret),
> +              int state, int ret),
>  
> -     TP_ARGS(fs_info, flags, num_bytes, orig_bytes, state, ret),
> +     TP_ARGS(fs_info, flags, num_bytes, state, ret),
>  
>       TP_STRUCT__entry(
>               __array(        u8,     fsid,   BTRFS_UUID_SIZE )
>               __field(        u64,    flags                   )
>               __field(        u64,    num_bytes               )
> -             __field(        u64,    orig_bytes              )
>               __field(        int,    state                   )
>               __field(        int,    ret                     )
>       ),
> @@ -874,19 +873,17 @@ TRACE_EVENT(btrfs_flush_space,
>               memcpy(__entry->fsid, fs_info->fsid, BTRFS_UUID_SIZE);
>               __entry->flags          =       flags;
>               __entry->num_bytes      =       num_bytes;
> -             __entry->orig_bytes     =       orig_bytes;
>               __entry->state          =       state;
>               __entry->ret            =       ret;
>       ),
>  
>       TP_printk("%pU: state = %d(%s), flags = %llu(%s), num_bytes = %llu, "
> -               "orig_bytes = %llu, ret = %d", __entry->fsid, __entry->state,
> +               "ret = %d", __entry->fsid, __entry->state,
>                 show_flush_state(__entry->state),
>                 (unsigned long long)__entry->flags,
>                 __print_flags((unsigned long)__entry->flags, "|",
>                               BTRFS_GROUP_FLAGS),
> -               (unsigned long long)__entry->num_bytes,
> -               (unsigned long long)__entry->orig_bytes, __entry->ret)
> +               (unsigned long long)__entry->num_bytes, __entry->ret)
>  );
>  
>  DECLARE_EVENT_CLASS(btrfs__reserved_extent,
> -- 
> 2.7.4
> 
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe linux-btrfs" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to