On Fri, Oct 21, 2016 at 05:05:07PM +0800, Wang Xiaoguang wrote:
> This issue was found when I tried to delete a heavily reflinked file.
> While deleting such a file, other transaction operations do not get a
> chance to make progress; for example, start_transaction() stays blocked
> in wait_current_trans(root) for a long time and sometimes even triggers
> soft lockups. The time taken to delete such a heavily reflinked file is
> also very large, often hundreds of seconds. perf top reports:
> 
> PerfTop:    7416 irqs/sec  kernel:99.8%  exact:  0.0% [4000Hz cpu-clock],  (all, 4 CPUs)
> ---------------------------------------------------------------------------------------
>     84.37%  [btrfs]             [k] __btrfs_run_delayed_refs.constprop.80
>     11.02%  [kernel]            [k] delay_tsc
>      0.79%  [kernel]            [k] _raw_spin_unlock_irq
>      0.78%  [kernel]            [k] _raw_spin_unlock_irqrestore
>      0.45%  [kernel]            [k] do_raw_spin_lock
>      0.18%  [kernel]            [k] __slab_alloc
> __btrfs_run_delayed_refs() takes most of the CPU time. After some
> debugging, I found that select_delayed_ref() causes this issue: in our
> case the delayed ref head's list is full of BTRFS_DROP_DELAYED_REF
> nodes, but select_delayed_ref() first iterates the whole node list
> looking for BTRFS_ADD_DELAYED_REF nodes, which in this case is a
> disaster and wastes a lot of time.
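
To make the cost concrete: the pre-patch loop in select_delayed_ref()
(also visible in the extent-tree.c hunk further down) is just a linear
scan:

	list_for_each_entry(ref, &head->ref_list, list) {
		if (ref->action == BTRFS_ADD_DELAYED_REF)
			return ref;
	}

For a head whose ref_list holds N BTRFS_DROP_DELAYED_REF nodes and no
ADD nodes, every call walks all N entries before falling back to the
list head, and __btrfs_run_delayed_refs() calls it roughly once per
node, so draining the head costs on the order of N^2/2 list steps.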
> 
> To fix this issue, we introduce a new ref_add_list in struct
> btrfs_delayed_ref_head; then in select_delayed_ref(), if this list is
> not empty, we can directly use the nodes in it. With this patch,
> deleting the same file takes only about 10~15 seconds. perf top now
> reports:
> 
> PerfTop:    2734 irqs/sec  kernel:99.5%  exact:  0.0% [4000Hz cpu-clock],  (all, 4 CPUs)
> ----------------------------------------------------------------------------------------
> 
>     20.74%  [kernel]          [k] _raw_spin_unlock_irqrestore
>     16.33%  [kernel]          [k] __slab_alloc
>      5.41%  [kernel]          [k] lock_acquired
>      4.42%  [kernel]          [k] lock_acquire
>      4.05%  [kernel]          [k] lock_release
>      3.37%  [kernel]          [k] _raw_spin_unlock_irq
> 
> This patch also helps for normal files: at the very least we no longer
> need to iterate the whole list to find BTRFS_ADD_DELAYED_REF nodes.
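
A side note on the data-structure trick here: the patch keeps a second
list_head embedded in each node, leaves it initialized-empty while the
node is not an ADD ref, and tests membership with list_empty() on the
node's own head before list_del(). A minimal userspace sketch of that
idiom (hand-rolled list_head for illustration only, not the kernel
code):

	#include <stdio.h>

	/* minimal doubly linked list, same shape as the kernel's list_head */
	struct list_head { struct list_head *next, *prev; };

	static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }
	static int list_empty(const struct list_head *h) { return h->next == h; }

	static void list_add_tail(struct list_head *n, struct list_head *h)
	{
		n->prev = h->prev; n->next = h;
		h->prev->next = n; h->prev = n;
	}

	static void list_del_init(struct list_head *n)
	{
		n->prev->next = n->next; n->next->prev = n->prev;
		INIT_LIST_HEAD(n);	/* re-init so list_empty() stays meaningful */
	}

	struct node {
		struct list_head list;		/* always on the main list */
		struct list_head add_list;	/* linked only for ADD refs */
	};

	int main(void)
	{
		struct list_head ref_list, ref_add_list;
		struct node drop, add;

		INIT_LIST_HEAD(&ref_list);
		INIT_LIST_HEAD(&ref_add_list);
		INIT_LIST_HEAD(&drop.add_list);	/* empty == "not an ADD ref" */
		INIT_LIST_HEAD(&add.add_list);

		list_add_tail(&drop.list, &ref_list);
		list_add_tail(&add.list, &ref_list);
		list_add_tail(&add.add_list, &ref_add_list);

		/* O(1) instead of scanning ref_list for an ADD node */
		printf("ADD ref present: %d\n", !list_empty(&ref_add_list));

		if (!list_empty(&add.add_list))	/* membership test on the node */
			list_del_init(&add.add_list);
		printf("ADD ref present: %d\n", !list_empty(&ref_add_list));
		return 0;
	}

Compiled with gcc this prints 1 then 0: the empty/non-empty state of
the node's own add_list doubles as an O(1) membership flag, which is
what the list_empty() checks in the hunks below rely on.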
> 
> Signed-off-by: Wang Xiaoguang <wangxg.f...@cn.fujitsu.com>
> ---
>  fs/btrfs/delayed-ref.c | 14 ++++++++++++++
>  fs/btrfs/delayed-ref.h |  8 ++++++++
>  fs/btrfs/disk-io.c     |  2 ++
>  fs/btrfs/extent-tree.c | 15 +++++++++------
>  4 files changed, 33 insertions(+), 6 deletions(-)
> 
> diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
> index 8d93854..39c28e0 100644
> --- a/fs/btrfs/delayed-ref.c
> +++ b/fs/btrfs/delayed-ref.c
> @@ -189,6 +189,8 @@ static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
>       } else {
>               assert_spin_locked(&head->lock);
>               list_del(&ref->list);
> +             if (!list_empty(&ref->add_list))
> +                     list_del(&ref->add_list);
>       }
>       ref->in_tree = 0;
>       btrfs_put_delayed_ref(ref);
> @@ -431,6 +433,11 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
>                       exist->action = ref->action;
>                       mod = -exist->ref_mod;
>                       exist->ref_mod = ref->ref_mod;
> +                     if (ref->action == BTRFS_ADD_DELAYED_REF)
> +                             list_add_tail(&exist->add_list,
> +                                           &href->ref_add_list);
> +                     else if (!list_empty(&exist->add_list))
> +                             list_del(&exist->add_list);

->action is either BTRFS_ADD_DELAYED_REF or BTRFS_DROP_DELAYED_REF, and
this 'else' branch is only reached when the action flips to DROP, i.e.
exist was an ADD ref and is therefore on the add_list, so
(!list_empty(&exist->add_list)) is always true here and the check is
redundant.

>               } else
>                       mod = -ref->ref_mod;
>       }
> @@ -444,6 +451,8 @@ add_delayed_ref_tail_merge(struct btrfs_trans_handle *trans,
>  
>  add_tail:
>       list_add_tail(&ref->list, &href->ref_list);
> +     if (ref->action == BTRFS_ADD_DELAYED_REF)
> +             list_add_tail(&ref->add_list, &href->ref_add_list);
>       atomic_inc(&root->num_entries);
>       trans->delayed_ref_updates++;
>       spin_unlock(&href->lock);
> @@ -590,6 +599,7 @@ add_delayed_ref_head(struct btrfs_fs_info *fs_info,
>       head_ref->must_insert_reserved = must_insert_reserved;
>       head_ref->is_data = is_data;
>       INIT_LIST_HEAD(&head_ref->ref_list);
> +     INIT_LIST_HEAD(&head_ref->ref_add_list);
>       head_ref->processing = 0;
>       head_ref->total_ref_mod = count_mod;
>       head_ref->qgroup_reserved = 0;
> @@ -671,6 +681,8 @@ add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
>       ref->is_head = 0;
>       ref->in_tree = 1;
>       ref->seq = seq;
> +     INIT_LIST_HEAD(&ref->list);
> +     INIT_LIST_HEAD(&ref->add_list);
>  
>       full_ref = btrfs_delayed_node_to_tree_ref(ref);
>       full_ref->parent = parent;
> @@ -726,6 +738,8 @@ add_delayed_data_ref(struct btrfs_fs_info *fs_info,
>       ref->is_head = 0;
>       ref->in_tree = 1;
>       ref->seq = seq;
> +     INIT_LIST_HEAD(&ref->list);
> +     INIT_LIST_HEAD(&ref->add_list);
>  
>       full_ref = btrfs_delayed_node_to_data_ref(ref);
>       full_ref->parent = parent;
> diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
> index 43f3629..dba9784 100644
> --- a/fs/btrfs/delayed-ref.h
> +++ b/fs/btrfs/delayed-ref.h
> @@ -42,6 +42,12 @@ struct btrfs_delayed_ref_node {
>  
>       /*data/tree ref use list, stored in ref_head->ref_list. */
>       struct list_head list;
> +     /*
> +      * If action is BTRFS_ADD_DELAYED_REF, also link this node to
> +      * ref_head->ref_add_list, then we do not need to iterate the
> +      * whole ref_head->ref_list to find BTRFS_ADD_DELAYED_REF nodes.
> +      */
> +     struct list_head add_list;
>  
>       /* the starting bytenr of the extent */
>       u64 bytenr;
> @@ -99,6 +105,8 @@ struct btrfs_delayed_ref_head {
>  
>       spinlock_t lock;
>       struct list_head ref_list;
> +     /* accumulate BTRFS_ADD_DELAYED_REF nodes to this ref_add_list. */
> +     struct list_head ref_add_list;
>  
>       struct rb_node href_node;
>  
> diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
> index 3a57f99..bc2edaf 100644
> --- a/fs/btrfs/disk-io.c
> +++ b/fs/btrfs/disk-io.c
> @@ -4354,6 +4354,8 @@ static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
>                                                list) {
>                       ref->in_tree = 0;
>                       list_del(&ref->list);
> +                     if (!list_empty(&ref->add_list))
> +                             list_del(&ref->add_list);
>                       atomic_dec(&delayed_refs->num_entries);
>                       btrfs_put_delayed_ref(ref);
>               }
> diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
> index 210c94a..1284222 100644
> --- a/fs/btrfs/extent-tree.c
> +++ b/fs/btrfs/extent-tree.c
> @@ -2454,13 +2454,14 @@ select_delayed_ref(struct btrfs_delayed_ref_head *head)
>        * the extent item from the extent tree, when there still are references
>        * to add, which would fail because they would not find the extent item.
>        */
> -     list_for_each_entry(ref, &head->ref_list, list) {
> -             if (ref->action == BTRFS_ADD_DELAYED_REF)
> -                     return ref;
> -     }
> +     if (!list_empty(&head->ref_add_list))
> +             return list_entry(head->ref_add_list.next,
> +                             struct btrfs_delayed_ref_node, add_list);
>  
> -     return list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
> -                       list);
> +     ref = list_entry(head->ref_list.next, struct btrfs_delayed_ref_node,
> +                      list);
> +     WARN_ON(!list_empty(&ref->add_list));

I'd prefer ASSERT() here, so that it only fires for developers doing
troubleshooting (it's compiled out without CONFIG_BTRFS_ASSERT), e.g.
something like:
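
	ASSERT(list_empty(&ref->add_list));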

Others look good to me.

Reviewed-by: Liu Bo <bo.li....@oracle.com>

While working on dedupe back then, I had a patch[1] that tried to
resolve the same problem; somehow it didn't make it into this retry of
dedupe.

[1]: https://patchwork.kernel.org/patch/3959021/


Thanks,

-liubo
> +     return ref;
>  }
>  
>  /*
> @@ -2620,6 +2621,8 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
>                       actual_count++;
>                       ref->in_tree = 0;
>                       list_del(&ref->list);
> +                     if (!list_empty(&ref->add_list))
> +                             list_del(&ref->add_list);
>               }
>               atomic_dec(&delayed_refs->num_entries);
>  
> -- 
> 2.9.0