On Fri, Dec 28, 2007 at 03:01:11PM -0800, Andrew Morton wrote:
> On Thu, 27 Dec 2007 23:08:40 +0100 Sascha Warner <[EMAIL PROTECTED]> wrote:
> 
> > Hi,
> > 
> > I applied your patches to 2.6.24-rc6-mm1, but now I am faced with one
> > pdflush often using 100% CPU for a long time. There seem to be some rare
> > pauses from its 100% usage, however.
> > 
> > On ~23 minutes uptime i have ~19 minutes pdflush runtime.
> > 
> > This is on E6600, x86_64, 2 Gig RAM, SATA HDD, running on gentoo ~x64_64
> > 
> > Let me know if you need more info.

Sascha, Thank you for the testing.

Replacing the first patch with this one should help:

---
Subject: writeback: introduce s_more_io_wait for inodes to be re-synced after a while

Introduce super_block.s_more_io_wait and writeback_control.more_io_wait.
They are for inodes that for some reason cannot be synced immediately.

These inodes will be moved to s_more_io_wait and retried after a while (waiting
up to 0.1s).  The normal lots-of-dirty-pages inodes will be moved to s_more_io
and be synced after other inodes in s_io have been serviced (no sleep).

The data flow is made simpler:

s_dirty --> s_io --> s_more_io/s_more_io_wait --+
             ^                                  |
             |                                  |
             +----------------------------------+

- to fill s_io:
                s_more_io +
                s_dirty(expired) +
                s_more_io_wait
                                ---> s_io
- to drain s_io:
                s_io -+--> clean inodes goto inode_in_use/inode_unused
                      |
                      +--> s_more_io
                      |
                      +--> s_more_io_wait

Obviously there are no ordering or starvation problems in the queues:
- s_dirty is now a strict FIFO queue
- inode.dirtied_when now really means the first dirty time
- once expired, the dirty inode will stay in s_*io* queues until made clean
- the dirty inodes in s_*io* queues will be revisited in order: no starvation

Cc: Michael Rubin <[EMAIL PROTECTED]>
Cc: Peter Zijlstra <[EMAIL PROTECTED]>
Signed-off-by: Fengguang Wu <[EMAIL PROTECTED]>
---
 fs/fs-writeback.c         |   26 ++++++++++++----
 fs/super.c                |    1 
 include/linux/fs.h        |    1 
 include/linux/writeback.h |    1 
 mm/page-writeback.c       |   56 ++++++++++++++++++++----------------
 5 files changed, 55 insertions(+), 30 deletions(-)

--- linux-2.6.24-rc6-mm1.orig/fs/fs-writeback.c
+++ linux-2.6.24-rc6-mm1/fs/fs-writeback.c
@@ -172,6 +172,14 @@ static void requeue_io(struct inode *ino
        list_move(&inode->i_list, &inode->i_sb->s_more_io);
 }
 
+/*
+ * The inode should be retried after _sleeping_ for a while.
+ */
+static void requeue_io_wait(struct inode *inode)
+{
+       list_move(&inode->i_list, &inode->i_sb->s_more_io_wait);
+}
+
 static void inode_sync_complete(struct inode *inode)
 {
        /*
@@ -206,13 +214,15 @@ static void queue_io(struct super_block 
 {
        list_splice_init(&sb->s_more_io, sb->s_io.prev);
        move_expired_inodes(&sb->s_dirty, &sb->s_io, older_than_this);
+       list_splice_init(&sb->s_more_io_wait, sb->s_io.prev);
 }
 
 int sb_has_dirty_inodes(struct super_block *sb)
 {
-       return !list_empty(&sb->s_dirty) ||
-              !list_empty(&sb->s_io) ||
-              !list_empty(&sb->s_more_io);
+       return !list_empty(&sb->s_dirty)   ||
+              !list_empty(&sb->s_io)      ||
+              !list_empty(&sb->s_more_io) ||
+              !list_empty(&sb->s_more_io_wait);
 }
 EXPORT_SYMBOL(sb_has_dirty_inodes);
 
@@ -472,11 +482,15 @@ int generic_sync_sb_inodes(struct super_
                iput(inode);
                cond_resched();
                spin_lock(&inode_lock);
-               if (wbc->nr_to_write <= 0)
+               if (wbc->nr_to_write <= 0) {
+                       wbc->more_io = 1;
                        break;
+               }
+               if (!list_empty(&sb->s_more_io))
+                       wbc->more_io = 1;
+               if (!list_empty(&sb->s_more_io_wait))
+                       wbc->more_io_wait = 1;
        }
-       if (!list_empty(&sb->s_more_io))
-               wbc->more_io = 1;
        spin_unlock(&inode_lock);
        return ret;             /* Leave any unwritten inodes on s_io */
 }
--- linux-2.6.24-rc6-mm1.orig/mm/page-writeback.c
+++ linux-2.6.24-rc6-mm1/mm/page-writeback.c
@@ -543,6 +543,34 @@ void throttle_vm_writeout(gfp_t gfp_mask
 }
 
 /*
+ * Write back up to @nr pages.
+ * Return true if there's no more work.
+ */
+static int writeback_some_pages(struct writeback_control *wbc, int nr)
+{
+       int all_done = 0;
+
+       wbc->more_io = 0;
+       wbc->more_io_wait = 0;
+       wbc->encountered_congestion = 0;
+       wbc->nr_to_write = nr;
+
+       writeback_inodes(wbc);
+
+       if (wbc->encountered_congestion)
+               congestion_wait(WRITE, HZ/10);
+
+       if (wbc->more_io)
+               ;
+       else if (wbc->more_io_wait)
+               congestion_wait(WRITE, HZ/10);
+       else
+               all_done = 1;
+
+       return all_done;
+}
+
+/*
  * writeback at least _min_pages, and keep writing until the amount of dirty
  * memory is less than the background threshold, or until we're all clean.
  */
@@ -553,7 +581,6 @@ static void background_writeout(unsigned
                .bdi            = NULL,
                .sync_mode      = WB_SYNC_NONE,
                .older_than_this = NULL,
-               .nr_to_write    = 0,
                .nonblocking    = 1,
                .range_cyclic   = 1,
        };
@@ -567,19 +594,9 @@ static void background_writeout(unsigned
                        global_page_state(NR_UNSTABLE_NFS) < background_thresh
                                && min_pages <= 0)
                        break;
-               wbc.more_io = 0;
-               wbc.encountered_congestion = 0;
-               wbc.nr_to_write = MAX_WRITEBACK_PAGES;
-               wbc.pages_skipped = 0;
-               writeback_inodes(&wbc);
+               if (writeback_some_pages(&wbc, MAX_WRITEBACK_PAGES))
+                       break;
                min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
-               if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
-                       /* Wrote less than expected */
-                       if (wbc.encountered_congestion || wbc.more_io)
-                               congestion_wait(WRITE, HZ/10);
-                       else
-                               break;
-               }
        }
 }
 
@@ -627,7 +644,6 @@ static void wb_kupdate(unsigned long arg
                .bdi            = NULL,
                .sync_mode      = WB_SYNC_NONE,
                .older_than_this = &oldest_jif,
-               .nr_to_write    = 0,
                .nonblocking    = 1,
                .for_kupdate    = 1,
                .range_cyclic   = 1,
@@ -642,16 +658,8 @@ static void wb_kupdate(unsigned long arg
                        global_page_state(NR_UNSTABLE_NFS) +
                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
        while (nr_to_write > 0) {
-               wbc.more_io = 0;
-               wbc.encountered_congestion = 0;
-               wbc.nr_to_write = MAX_WRITEBACK_PAGES;
-               writeback_inodes(&wbc);
-               if (wbc.nr_to_write > 0) {
-                       if (wbc.encountered_congestion || wbc.more_io)
-                               congestion_wait(WRITE, HZ/10);
-                       else
-                               break;  /* All the old data is written */
-               }
+               if (writeback_some_pages(&wbc, MAX_WRITEBACK_PAGES))
+                       break;
                nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
        }
        if (time_before(next_jif, jiffies + HZ))
--- linux-2.6.24-rc6-mm1.orig/fs/super.c
+++ linux-2.6.24-rc6-mm1/fs/super.c
@@ -64,6 +64,7 @@ static struct super_block *alloc_super(s
                INIT_LIST_HEAD(&s->s_dirty);
                INIT_LIST_HEAD(&s->s_io);
                INIT_LIST_HEAD(&s->s_more_io);
+               INIT_LIST_HEAD(&s->s_more_io_wait);
                INIT_LIST_HEAD(&s->s_files);
                INIT_LIST_HEAD(&s->s_instances);
                INIT_HLIST_HEAD(&s->s_anon);
--- linux-2.6.24-rc6-mm1.orig/include/linux/fs.h
+++ linux-2.6.24-rc6-mm1/include/linux/fs.h
@@ -1011,6 +1011,7 @@ struct super_block {
        struct list_head        s_dirty;        /* dirty inodes */
        struct list_head        s_io;           /* parked for writeback */
        struct list_head        s_more_io;      /* parked for more writeback */
+       struct list_head        s_more_io_wait; /* parked for sleep-then-retry */
        struct hlist_head       s_anon;         /* anonymous dentries for (nfs) exporting */
        struct list_head        s_files;
 
--- linux-2.6.24-rc6-mm1.orig/include/linux/writeback.h
+++ linux-2.6.24-rc6-mm1/include/linux/writeback.h
@@ -63,6 +63,7 @@ struct writeback_control {
        unsigned for_writepages:1;      /* This is a writepages() call */
        unsigned range_cyclic:1;        /* range_start is cyclic */
        unsigned more_io:1;             /* more io to be dispatched */
+       unsigned more_io_wait:1;        /* more io to be dispatched after a while */
 };
 
 /*

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to