Hello, Artem.

Thanks a lot for the debug dump.  Can you please test whether the
below patch fixes the issue?

Index: work/mm/page-writeback.c
===================================================================
--- work.orig/mm/page-writeback.c
+++ work/mm/page-writeback.c
@@ -1956,7 +1956,6 @@ void laptop_mode_timer_fn(unsigned long
        int nr_pages = global_page_state(NR_FILE_DIRTY) +
                global_page_state(NR_UNSTABLE_NFS);
        struct bdi_writeback *wb;
-       struct wb_iter iter;
 
        /*
         * We want to write everything out, not just down to the dirty
@@ -1965,10 +1964,12 @@ void laptop_mode_timer_fn(unsigned long
        if (!bdi_has_dirty_io(&q->backing_dev_info))
                return;
 
-       bdi_for_each_wb(wb, &q->backing_dev_info, &iter, 0)
+       rcu_read_lock();
+       list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node)
                if (wb_has_dirty_io(wb))
                        wb_start_writeback(wb, nr_pages, true,
                                           WB_REASON_LAPTOP_TIMER);
+       rcu_read_unlock();
 }
 
 /*
Index: work/fs/fs-writeback.c
===================================================================
--- work.orig/fs/fs-writeback.c
+++ work/fs/fs-writeback.c
@@ -778,19 +778,24 @@ static void bdi_split_work_to_wbs(struct
                                  struct wb_writeback_work *base_work,
                                  bool skip_if_busy)
 {
-       int next_memcg_id = 0;
-       struct bdi_writeback *wb;
-       struct wb_iter iter;
+       struct bdi_writeback *last_wb = NULL;
+       struct bdi_writeback *wb = list_entry_rcu(&bdi->wb_list,
+                                               struct bdi_writeback, bdi_node);
 
        might_sleep();
 restart:
        rcu_read_lock();
-       bdi_for_each_wb(wb, bdi, &iter, next_memcg_id) {
+       list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
                DEFINE_WB_COMPLETION_ONSTACK(fallback_work_done);
                struct wb_writeback_work fallback_work;
                struct wb_writeback_work *work;
                long nr_pages;
 
+               if (last_wb) {
+                       wb_put(last_wb);
+                       last_wb = NULL;
+               }
+
                /* SYNC_ALL writes out I_DIRTY_TIME too */
                if (!wb_has_dirty_io(wb) &&
                    (base_work->sync_mode == WB_SYNC_NONE ||
@@ -819,7 +824,14 @@ restart:
 
                wb_queue_work(wb, work);
 
-               next_memcg_id = wb->memcg_css->id + 1;
+               /*
+                * Pin @wb so that it stays on @bdi->wb_list.  This allows
+                * continuing iteration from @wb after dropping regrabbing
+                * rcu read lock.
+                */
+               wb_get(wb);
+               last_wb = wb;
+
                rcu_read_unlock();
                wb_wait_for_completion(bdi, &fallback_work_done);
                goto restart;
@@ -1857,12 +1869,11 @@ void wakeup_flusher_threads(long nr_page
        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                struct bdi_writeback *wb;
-               struct wb_iter iter;
 
                if (!bdi_has_dirty_io(bdi))
                        continue;
 
-               bdi_for_each_wb(wb, bdi, &iter, 0)
+               list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
                        wb_start_writeback(wb, wb_split_bdi_pages(wb, nr_pages),
                                           false, reason);
        }
@@ -1894,11 +1905,10 @@ static void wakeup_dirtytime_writeback(s
        rcu_read_lock();
        list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
                struct bdi_writeback *wb;
-               struct wb_iter iter;
 
-               bdi_for_each_wb(wb, bdi, &iter, 0)
-                       if (!list_empty(&bdi->wb.b_dirty_time))
-                               wb_wakeup(&bdi->wb);
+               list_for_each_entry_rcu(wb, &bdi->wb_list, bdi_node)
+                       if (!list_empty(&wb->b_dirty_time))
+                               wb_wakeup(wb);
        }
        rcu_read_unlock();
        schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
Index: work/include/linux/backing-dev-defs.h
===================================================================
--- work.orig/include/linux/backing-dev-defs.h
+++ work/include/linux/backing-dev-defs.h
@@ -116,6 +116,8 @@ struct bdi_writeback {
        struct list_head work_list;
        struct delayed_work dwork;      /* work item used for writeback */
 
+       struct list_head bdi_node;      /* anchored at bdi->wb_list */
+
 #ifdef CONFIG_CGROUP_WRITEBACK
        struct percpu_ref refcnt;       /* used only for !root wb's */
        struct fprop_local_percpu memcg_completions;
@@ -150,6 +152,7 @@ struct backing_dev_info {
        atomic_long_t tot_write_bandwidth;
 
        struct bdi_writeback wb;  /* the root writeback info for this bdi */
+       struct list_head wb_list; /* list of all wbs */
 #ifdef CONFIG_CGROUP_WRITEBACK
        struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
        struct rb_root cgwb_congested_tree; /* their congested states */
Index: work/include/linux/backing-dev.h
===================================================================
--- work.orig/include/linux/backing-dev.h
+++ work/include/linux/backing-dev.h
@@ -401,61 +401,6 @@ static inline void unlocked_inode_to_wb_
        rcu_read_unlock();
 }
 
-struct wb_iter {
-       int                     start_memcg_id;
-       struct radix_tree_iter  tree_iter;
-       void                    **slot;
-};
-
-static inline struct bdi_writeback *__wb_iter_next(struct wb_iter *iter,
-                                                  struct backing_dev_info *bdi)
-{
-       struct radix_tree_iter *titer = &iter->tree_iter;
-
-       WARN_ON_ONCE(!rcu_read_lock_held());
-
-       if (iter->start_memcg_id >= 0) {
-               iter->slot = radix_tree_iter_init(titer, iter->start_memcg_id);
-               iter->start_memcg_id = -1;
-       } else {
-               iter->slot = radix_tree_next_slot(iter->slot, titer, 0);
-       }
-
-       if (!iter->slot)
-               iter->slot = radix_tree_next_chunk(&bdi->cgwb_tree, titer, 0);
-       if (iter->slot)
-               return *iter->slot;
-       return NULL;
-}
-
-static inline struct bdi_writeback *__wb_iter_init(struct wb_iter *iter,
-                                                  struct backing_dev_info *bdi,
-                                                  int start_memcg_id)
-{
-       iter->start_memcg_id = start_memcg_id;
-
-       if (start_memcg_id)
-               return __wb_iter_next(iter, bdi);
-       else
-               return &bdi->wb;
-}
-
-/**
- * bdi_for_each_wb - walk all wb's of a bdi in ascending memcg ID order
- * @wb_cur: cursor struct bdi_writeback pointer
- * @bdi: bdi to walk wb's of
- * @iter: pointer to struct wb_iter to be used as iteration buffer
- * @start_memcg_id: memcg ID to start iteration from
- *
- * Iterate @wb_cur through the wb's (bdi_writeback's) of @bdi in ascending
- * memcg ID order starting from @start_memcg_id.  @iter is struct wb_iter
- * to be used as temp storage during iteration.  rcu_read_lock() must be
- * held throughout iteration.
- */
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_memcg_id)             \
-       for ((wb_cur) = __wb_iter_init(iter, bdi, start_memcg_id);      \
-            (wb_cur); (wb_cur) = __wb_iter_next(iter, bdi))
-
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
 static inline bool inode_cgwb_enabled(struct inode *inode)
@@ -515,14 +460,6 @@ static inline void wb_blkcg_offline(stru
 {
 }
 
-struct wb_iter {
-       int             next_id;
-};
-
-#define bdi_for_each_wb(wb_cur, bdi, iter, start_blkcg_id)             \
-       for ((iter)->next_id = (start_blkcg_id);                        \
-            ({ (wb_cur) = !(iter)->next_id++ ? &(bdi)->wb : NULL; }); )
-
 static inline int inode_congested(struct inode *inode, int cong_bits)
 {
        return wb_congested(&inode_to_bdi(inode)->wb, cong_bits);
Index: work/mm/backing-dev.c
===================================================================
--- work.orig/mm/backing-dev.c
+++ work/mm/backing-dev.c
@@ -480,6 +480,10 @@ static void cgwb_release_workfn(struct w
                                                release_work);
        struct backing_dev_info *bdi = wb->bdi;
 
+       spin_lock_irq(&cgwb_lock);
+       list_del_rcu(&wb->bdi_node);
+       spin_unlock_irq(&cgwb_lock);
+
        wb_shutdown(wb);
 
        css_put(wb->memcg_css);
@@ -575,6 +579,7 @@ static int cgwb_create(struct backing_de
                ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb);
                if (!ret) {
                        atomic_inc(&bdi->usage_cnt);
+                       list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list);
                        list_add(&wb->memcg_node, memcg_cgwb_list);
                        list_add(&wb->blkcg_node, blkcg_cgwb_list);
                        css_get(memcg_css);
@@ -669,6 +674,7 @@ static int cgwb_bdi_init(struct backing_
        if (!ret) {
                bdi->wb.memcg_css = mem_cgroup_root_css;
                bdi->wb.blkcg_css = blkcg_root_css;
+               list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
        }
        return ret;
 }
@@ -686,6 +692,9 @@ static void cgwb_bdi_destroy(struct back
        radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
                cgwb_kill(*slot);
 
+       /* release of wb's may happen after @bdi is freed, sever list head */
+       list_del(&bdi->wb_list);
+
        rbtree_postorder_for_each_entry_safe(congested, congested_n,
                                        &bdi->cgwb_congested_tree, rb_node) {
                rb_erase(&congested->rb_node, &bdi->cgwb_congested_tree);
@@ -755,6 +764,9 @@ static int cgwb_bdi_init(struct backing_
                kfree(bdi->wb_congested);
                return err;
        }
+
+       list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
+
        return 0;
 }
 
@@ -770,6 +782,7 @@ int bdi_init(struct backing_dev_info *bd
        bdi->max_ratio = 100;
        bdi->max_prop_frac = FPROP_FRAC_BASE;
        INIT_LIST_HEAD(&bdi->bdi_list);
+       INIT_LIST_HEAD(&bdi->wb_list);
        init_waitqueue_head(&bdi->wb_waitq);
 
        return cgwb_bdi_init(bdi);
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to