Hello, Artem.

On Fri, Sep 25, 2015 at 01:50:22PM +0300, Artem Bityutskiy wrote:
> > Does not compile with multiple errors like
> > 
> > linux/fs/fs-writeback.c:799:10: error: ‘struct bdi_writeback’ has no
> > member named ‘last_comp_gen’
> >    bdi->wb.last_comp_gen = bdi->wb.comp_gen;
> 
> I tried to extend your patch with these fields, but I am not sure I got
> it right, so please, send a new patch, I'll run the reboot corruption
> test with your patch.

Oops, sorry, I'm an idiot.

> Please, note, because this test is about reboots, I'll probably output
> everything to the serial console. Therefore, please, do not print too
> much data. Otherwise I'd have to modify my scripts to collect dmesg
> before restarting, which is more work.

Hmmm... I'm gonna add ratelimit on inode printouts; other than that,
it shouldn't print too much.

Thanks.

---
 fs/fs-writeback.c                |  154 +++++++++++++++++++++++++++++++++++++--
 fs/inode.c                       |    1 
 include/linux/backing-dev-defs.h |    2 
 include/linux/backing-dev.h      |   20 ++++-
 include/linux/fs.h               |    2 
 mm/backing-dev.c                 |    2 
 6 files changed, 173 insertions(+), 8 deletions(-)

--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -101,7 +101,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(wbc_writepa
 
 static bool wb_io_lists_populated(struct bdi_writeback *wb)
 {
-       if (wb_has_dirty_io(wb)) {
+       if (test_bit(WB_has_dirty_io, &wb->state)) {
                return false;
        } else {
                set_bit(WB_has_dirty_io, &wb->state);
@@ -763,6 +763,15 @@ static long wb_split_bdi_pages(struct bd
                return DIV_ROUND_UP_ULL((u64)nr_pages * this_bw, tot_bw);
 }
 
+extern spinlock_t cgwb_lock;
+
+struct split_work_dbg {
+       DECLARE_BITMAP(all_wbs, 8192);
+       DECLARE_BITMAP(iterated_wbs, 8192);
+       DECLARE_BITMAP(written_wbs, 8192);
+       DECLARE_BITMAP(sync_wbs, 8192);
+};
+
 /**
  * bdi_split_work_to_wbs - split a wb_writeback_work to all wb's of a bdi
  * @bdi: target backing_dev_info
@@ -776,11 +785,25 @@ static long wb_split_bdi_pages(struct bd
  */
 static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
                                  struct wb_writeback_work *base_work,
-                                 bool skip_if_busy)
+                                 bool skip_if_busy, struct split_work_dbg *dbg)
 {
        int next_memcg_id = 0;
        struct bdi_writeback *wb;
        struct wb_iter iter;
+       struct radix_tree_iter riter;
+       void **slot;
+
+       if (dbg) {
+               spin_lock_irq(&cgwb_lock);
+               set_bit(bdi->wb.memcg_css->id, dbg->all_wbs);
+               bdi->wb.last_comp_gen = bdi->wb.comp_gen;
+               radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &riter, 0) {
+                       wb = *slot;
+                       set_bit(wb->memcg_css->id, dbg->all_wbs);
+                       wb->last_comp_gen = wb->comp_gen;
+               }
+               spin_unlock_irq(&cgwb_lock);
+       }
 
        might_sleep();
 restart:
@@ -791,6 +814,9 @@ restart:
                struct wb_writeback_work *work;
                long nr_pages;
 
+               if (dbg)
+                       set_bit(wb->memcg_css->id, dbg->iterated_wbs);
+
                /* SYNC_ALL writes out I_DIRTY_TIME too */
                if (!wb_has_dirty_io(wb) &&
                    (base_work->sync_mode == WB_SYNC_NONE ||
@@ -799,6 +825,9 @@ restart:
                if (skip_if_busy && writeback_in_progress(wb))
                        continue;
 
+               if (dbg)
+                       set_bit(wb->memcg_css->id, dbg->written_wbs);
+
                nr_pages = wb_split_bdi_pages(wb, base_work->nr_pages);
 
                work = kmalloc(sizeof(*work), GFP_ATOMIC);
@@ -817,6 +846,8 @@ restart:
                work->auto_free = 0;
                work->done = &fallback_work_done;
 
+               if (dbg)
+                       set_bit(wb->memcg_css->id, dbg->sync_wbs);
                wb_queue_work(wb, work);
 
                next_memcg_id = wb->memcg_css->id + 1;
@@ -1425,6 +1456,9 @@ static long writeback_sb_inodes(struct s
                        break;
                }
 
+               inode->i_dbg_marker = 0;
+               inode->i_dbg_marker2 = 0;
+
                /*
                 * Don't bother with new inodes or inodes being freed, first
                 * kind does not need periodic writeout yet, and for the latter
@@ -1515,6 +1549,7 @@ static long writeback_sb_inodes(struct s
                                break;
                }
        }
+
        return wrote;
 }
 
@@ -1574,6 +1609,28 @@ static long writeback_inodes_wb(struct b
        return nr_pages - work.nr_pages;
 }
 
+static int inode_which_wb_io_list(struct inode *inode, struct backing_dev_info *bdi)
+{
+       struct bdi_writeback *wb = inode->i_wb ?: &bdi->wb;
+       struct inode *pos;
+
+       if (list_empty(&inode->i_io_list))
+               return 0;
+       list_for_each_entry(pos, &wb->b_dirty, i_io_list)
+               if (pos == inode)
+                       return 1;
+       list_for_each_entry(pos, &wb->b_io, i_io_list)
+               if (pos == inode)
+                       return 2;
+       list_for_each_entry(pos, &wb->b_more_io, i_io_list)
+               if (pos == inode)
+                       return 3;
+       list_for_each_entry(pos, &wb->b_dirty_time, i_io_list)
+               if (pos == inode)
+                       return 4;
+       return 5;
+}
+
 /*
  * Explicit flushing or periodic writeback of "old" data.
  *
@@ -1604,6 +1661,16 @@ static long wb_writeback(struct bdi_writ
 
        blk_start_plug(&plug);
        spin_lock(&wb->list_lock);
+
+       list_for_each_entry(inode, &wb->b_dirty, i_io_list)
+               inode->i_dbg_marker2 = 1;
+       list_for_each_entry(inode, &wb->b_io, i_io_list)
+               inode->i_dbg_marker2 = 2;
+       list_for_each_entry(inode, &wb->b_more_io, i_io_list)
+               inode->i_dbg_marker2 = 3;
+       list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
+               inode->i_dbg_marker2 = 4;
+
        for (;;) {
                /*
                 * Stop writeback when nr_pages has been consumed
@@ -1681,6 +1748,24 @@ static long wb_writeback(struct bdi_writ
                        spin_lock(&wb->list_lock);
                }
        }
+
+       if (work->sync_mode == WB_SYNC_ALL) {
+               list_for_each_entry(inode, &wb->b_dirty, i_io_list)
+                       if (inode->i_dbg_marker2)
+                               printk("XXX wb_writeback: inode %lu marker2=%d on b_dirty\n",
+                                      inode->i_ino, inode->i_dbg_marker2);
+               list_for_each_entry(inode, &wb->b_io, i_io_list)
+                       printk("XXX wb_writeback: inode %lu marker2=%d on b_io\n",
+                              inode->i_ino, inode->i_dbg_marker2);
+               list_for_each_entry(inode, &wb->b_more_io, i_io_list)
+                       printk("XXX wb_writeback: inode %lu marker2=%d on b_more_io\n",
+                              inode->i_ino, inode->i_dbg_marker2);
+               list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
+                       if (inode->i_dbg_marker2)
+                               printk("XXX wb_writeback: inode %lu marker2=%d on b_dirty_time\n",
+                                      inode->i_ino, inode->i_dbg_marker2);
+       }
+
        spin_unlock(&wb->list_lock);
        blk_finish_plug(&plug);
 
@@ -1785,8 +1870,11 @@ static long wb_do_writeback(struct bdi_w
 
                if (work->auto_free)
                        kfree(work);
-               if (done && atomic_dec_and_test(&done->cnt))
-                       wake_up_all(&wb->bdi->wb_waitq);
+               if (done) {
+                       wb->comp_gen++;
+                       if (atomic_dec_and_test(&done->cnt))
+                               wake_up_all(&wb->bdi->wb_waitq);
+               }
        }
 
        /*
@@ -1976,6 +2064,9 @@ void __mark_inode_dirty(struct inode *in
 
        trace_writeback_mark_inode_dirty(inode, flags);
 
+       WARN_ON_ONCE(!(sb->s_flags & MS_LAZYTIME) &&
+                    !list_empty(&inode_to_bdi(inode)->wb.b_dirty_time));
+
        /*
         * Don't do this for I_DIRTY_PAGES - that doesn't actually
         * dirty the inode itself
@@ -2165,7 +2256,7 @@ static void __writeback_inodes_sb_nr(str
                return;
        WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-       bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy);
+       bdi_split_work_to_wbs(sb->s_bdi, &work, skip_if_busy, NULL);
        wb_wait_for_completion(bdi, &done);
 }
 
@@ -2257,6 +2348,10 @@ void sync_inodes_sb(struct super_block *
                .for_sync       = 1,
        };
        struct backing_dev_info *bdi = sb->s_bdi;
+       static DEFINE_MUTEX(dbg_mutex);
+       static struct split_work_dbg dbg;
+       static DECLARE_BITMAP(tmp_bitmap, 8192);
+       struct inode *inode;
 
        /*
         * Can't skip on !bdi_has_dirty() because we should wait for !dirty
@@ -2267,9 +2362,56 @@ void sync_inodes_sb(struct super_block *
                return;
        WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
-       bdi_split_work_to_wbs(bdi, &work, false);
+       mutex_lock(&dbg_mutex);
+
+       printk("XXX SYNCING %d:%d\n", MAJOR(sb->s_dev), MINOR(sb->s_dev));
+
+       bitmap_zero(dbg.all_wbs, 8192);
+       bitmap_zero(dbg.iterated_wbs, 8192);
+       bitmap_zero(dbg.written_wbs, 8192);
+       bitmap_zero(dbg.sync_wbs, 8192);
+
+       spin_lock(&sb->s_inode_list_lock);
+       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+               spin_lock(&inode->i_lock);
+               inode->i_dbg_marker = inode_which_wb_io_list(inode, bdi);
+               spin_unlock(&inode->i_lock);
+       }
+       spin_unlock(&sb->s_inode_list_lock);
+
+       bdi_split_work_to_wbs(bdi, &work, false, &dbg);
        wb_wait_for_completion(bdi, &done);
 
+       spin_lock(&sb->s_inode_list_lock);
+       list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+               struct bdi_writeback *wb = inode->i_wb ?: &bdi->wb;
+
+               if (!inode->i_dbg_marker)
+                       continue;
+
+               spin_lock_irq(&wb->list_lock);
+               if (inode->i_state & I_DIRTY_ALL)
+                       printk_ratelimited("XXX sync_inodes_sb(%d:%d): dirty inode %lu skipped, wb=%d comp_gen=%d->%d which=%d->%d i_state=0x%lx\n",
+                              MAJOR(sb->s_dev), MINOR(sb->s_dev), inode->i_ino,
+                              wb->memcg_css->id, wb->last_comp_gen, wb->comp_gen,
+                              inode->i_dbg_marker, inode_which_wb_io_list(inode, bdi),
+                              inode->i_state);
+               spin_unlock_irq(&wb->list_lock);
+       }
+       spin_unlock(&sb->s_inode_list_lock);
+
+       bitmap_andnot(tmp_bitmap, dbg.all_wbs, dbg.iterated_wbs, 8192);
+       if (!bitmap_empty(tmp_bitmap, 8192))
+               printk("XXX sync_inodes_sb(%d:%d): iteration skipped %8192pbl\n",
+                      MAJOR(sb->s_dev), MINOR(sb->s_dev), tmp_bitmap);
+
+       printk("XXX all_wbs      = %8192pbl\n", dbg.all_wbs);
+       printk("XXX iterated_wbs = %8192pbl\n", dbg.iterated_wbs);
+       printk("XXX written_wbs  = %8192pbl\n", dbg.written_wbs);
+       printk("XXX sync_wbs     = %8192pbl\n", dbg.sync_wbs);
+
+       mutex_unlock(&dbg_mutex);
+
        wait_sb_inodes(sb);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -183,6 +183,7 @@ int inode_init_always(struct super_block
 #endif
        inode->i_flctx = NULL;
        this_cpu_inc(nr_inodes);
+       inode->i_dbg_marker = 0;
 
        return 0;
 out:
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -129,6 +129,8 @@ struct bdi_writeback {
                struct rcu_head rcu;
        };
 #endif
+       int comp_gen;
+       int last_comp_gen;
 };
 
 struct backing_dev_info {
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -38,7 +38,25 @@ extern struct workqueue_struct *bdi_wq;
 
 static inline bool wb_has_dirty_io(struct bdi_writeback *wb)
 {
-       return test_bit(WB_has_dirty_io, &wb->state);
+       bool ret = test_bit(WB_has_dirty_io, &wb->state);
+       long tot_write_bw = atomic_long_read(&wb->bdi->tot_write_bandwidth);
+
+       if (!ret && (!list_empty(&wb->b_dirty) || !list_empty(&wb->b_io) ||
+                    !list_empty(&wb->b_more_io))) {
+               const char *name = wb->bdi->dev ? dev_name(wb->bdi->dev) : "UNK";
+
+               pr_err("wb_has_dirty_io: ERR %s has_dirty=%d b_dirty=%d b_io=%d b_more_io=%d\n",
+                      name, ret, !list_empty(&wb->b_dirty), !list_empty(&wb->b_io), !list_empty(&wb->b_more_io));
+               WARN_ON(1);
+       }
+       if (ret && !tot_write_bw) {
+               const char *name = wb->bdi->dev ? dev_name(wb->bdi->dev) : "UNK";
+
+               pr_err("wb_has_dirty_io: ERR %s has_dirty=%d but tot_write_bw=%ld\n",
+                      name, ret, tot_write_bw);
+               WARN_ON(1);
+       }
+       return ret;
 }
 
 static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -677,6 +677,8 @@ struct inode {
 #endif
 
        void                    *i_private; /* fs or device private pointer */
+       unsigned                i_dbg_marker;
+       unsigned                i_dbg_marker2;
 };
 
 static inline int inode_unhashed(struct inode *inode)
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -382,7 +382,7 @@ static void wb_exit(struct bdi_writeback
  * protected.  cgwb_release_wait is used to wait for the completion of cgwb
  * releases from bdi destruction path.
  */
-static DEFINE_SPINLOCK(cgwb_lock);
+DEFINE_SPINLOCK(cgwb_lock);
 static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait);
 
 /**
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to