Use per-cgroup memory statistics to evaluate dirty limits and dirtyable memory,
and to start background writeout via pdflush.

Also add an argument to pdflush_operation() to pass the memory cgroup that
requested the background writeout. This lets pdflush check the dirty limits of
that cgroup against its own per-cgroup statistics.

Signed-off-by: Andrea Righi <[EMAIL PROTECTED]>
---
 fs/super.c                |    4 +-
 fs/sync.c                 |    7 ++-
 include/linux/writeback.h |   11 +++--
 kernel/trace/trace.c      |    2 +-
 mm/backing-dev.c          |    3 +-
 mm/page-writeback.c       |  115 +++++++++++++++++++++++++++-----------------
 mm/pdflush.c              |   10 +++-
 7 files changed, 95 insertions(+), 57 deletions(-)

diff --git a/fs/super.c b/fs/super.c
index f31ef82..33fbcaa 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -646,7 +646,7 @@ int do_remount_sb(struct super_block *sb, int flags, void 
*data, int force)
        return 0;
 }
 
-static void do_emergency_remount(unsigned long foo)
+static void do_emergency_remount(struct mem_cgroup *unused, unsigned long foo)
 {
        struct super_block *sb;
 
@@ -674,7 +674,7 @@ static void do_emergency_remount(unsigned long foo)
 
 void emergency_remount(void)
 {
-       pdflush_operation(do_emergency_remount, 0);
+       pdflush_operation(do_emergency_remount, NULL, 0);
 }
 
 /*
diff --git a/fs/sync.c b/fs/sync.c
index 2967562..aac77c3 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -42,9 +42,14 @@ asmlinkage long sys_sync(void)
        return 0;
 }
 
+static void memcg_do_sync(struct mem_cgroup *unused, unsigned long wait)
+{
+       do_sync(wait);
+}
+
 void emergency_sync(void)
 {
-       pdflush_operation(do_sync, 0);
+       pdflush_operation(memcg_do_sync, NULL, 0);
 }
 
 /*
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 12b15c5..dd5bc8a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -5,6 +5,7 @@
 #define WRITEBACK_H
 
 #include <linux/sched.h>
+#include <linux/memcontrol.h>
 #include <linux/fs.h>
 
 struct backing_dev_info;
@@ -106,7 +107,7 @@ extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
 
-extern unsigned long determine_dirtyable_memory(void);
+extern unsigned long determine_dirtyable_memory(struct mem_cgroup *mem);
 
 extern int dirty_ratio_handler(struct ctl_table *table, int write,
                struct file *filp, void __user *buffer, size_t *lenp,
@@ -117,8 +118,9 @@ struct file;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int, struct file *,
                                      void __user *, size_t *, loff_t *);
 
-void get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
-                struct backing_dev_info *bdi);
+void get_dirty_limits(struct mem_cgroup *mem, long *pbackground,
+               long *pdirty, long *pbdi_dirty,
+               struct backing_dev_info *bdi);
 
 void page_writeback_init(void);
 void balance_dirty_pages_ratelimited_nr(struct address_space *mapping,
@@ -133,7 +135,8 @@ balance_dirty_pages_ratelimited(struct address_space 
*mapping)
 typedef int (*writepage_t)(struct page *page, struct writeback_control *wbc,
                                void *data);
 
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0);
+int pdflush_operation(void (*fn)(struct mem_cgroup *, unsigned long),
+                       struct mem_cgroup *mem, unsigned long arg0);
 int generic_writepages(struct address_space *mapping,
                       struct writeback_control *wbc);
 int write_cache_pages(struct address_space *mapping,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bc6a22a..ec64004 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2877,7 +2877,7 @@ tracing_entries_write(struct file *filp, const char 
__user *ubuf,
                        goto out;
                }
 
-               freeable_pages = determine_dirtyable_memory();
+               freeable_pages = determine_dirtyable_memory(NULL);
 
                /* we only allow to request 1/4 of useable memory */
                if (pages_requested >
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index f2e574d..df6a01c 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -28,7 +28,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
        long dirty_thresh;
        long bdi_thresh;
 
-       get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
+       get_dirty_limits(NULL, &background_thresh, &dirty_thresh,
+                       &bdi_thresh, bdi);
 
 #define K(x) ((x) << (PAGE_SHIFT - 10))
        seq_printf(m,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 17c6141..1a9b602 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -106,7 +106,8 @@ EXPORT_SYMBOL(laptop_mode);
 /* End of sysctl-exported parameters */
 
 
-static void background_writeout(unsigned long _min_pages);
+static void background_writeout(struct mem_cgroup *mem,
+                               unsigned long _min_pages);
 
 /*
  * Scale the writeback cache size proportional to the relative writeout speeds.
@@ -136,7 +137,9 @@ static int calc_period_shift(void)
 {
        unsigned long dirty_total;
 
-       dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / 100;
+       dirty_total = (mem_cgroup_dirty_ratio(NULL)
+                       * determine_dirtyable_memory(NULL))
+                       / 100;
        return 2 + ilog2(dirty_total - 1);
 }
 
@@ -147,9 +150,9 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
                struct file *filp, void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
-       int old_ratio = vm_dirty_ratio;
+       int old_ratio = mem_cgroup_dirty_ratio(NULL);
        int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
-       if (ret == 0 && write && vm_dirty_ratio != old_ratio) {
+       if (ret == 0 && write && mem_cgroup_dirty_ratio(NULL) != old_ratio) {
                int shift = calc_period_shift();
                prop_change_shift(&vm_completions, shift);
                prop_change_shift(&vm_dirties, shift);
@@ -350,30 +353,35 @@ static unsigned long highmem_dirtyable_memory(unsigned 
long total)
  * Returns the numebr of pages that can currently be freed and used
  * by the kernel for direct mappings.
  */
-unsigned long determine_dirtyable_memory(void)
+unsigned long determine_dirtyable_memory(struct mem_cgroup *memcg)
 {
-       unsigned long x;
+       unsigned long mem_memory, memcg_memory;
 
-       x = global_page_state(NR_FREE_PAGES) + global_lru_pages();
+       memcg_memory = mem_cgroup_get_free_pages(memcg) +
+                       mem_cgroup_global_lru_pages(memcg);
+       mem_memory = global_page_state(NR_FREE_PAGES) + global_lru_pages();
+       if (memcg_memory && (memcg_memory < mem_memory))
+               return memcg_memory;
 
        if (!vm_highmem_is_dirtyable)
-               x -= highmem_dirtyable_memory(x);
+               mem_memory -= highmem_dirtyable_memory(mem_memory);
 
-       return x + 1;   /* Ensure that we never return 0 */
+       return mem_memory + 1;  /* Ensure that we never return 0 */
 }
 
 void
-get_dirty_limits(long *pbackground, long *pdirty, long *pbdi_dirty,
+get_dirty_limits(struct mem_cgroup *mem, long *pbackground,
+               long *pdirty, long *pbdi_dirty,
                 struct backing_dev_info *bdi)
 {
        int background_ratio;           /* Percentages */
        int dirty_ratio;
        long background;
        long dirty;
-       unsigned long available_memory = determine_dirtyable_memory();
+       unsigned long available_memory = determine_dirtyable_memory(mem);
        struct task_struct *tsk;
 
-       dirty_ratio = vm_dirty_ratio;
+       dirty_ratio = mem_cgroup_dirty_ratio(mem);
        if (dirty_ratio < 5)
                dirty_ratio = 5;
 
@@ -383,10 +391,12 @@ get_dirty_limits(long *pbackground, long *pdirty, long 
*pbdi_dirty,
 
        background = (background_ratio * available_memory) / 100;
        dirty = (dirty_ratio * available_memory) / 100;
-       tsk = current;
-       if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
-               background += background / 4;
-               dirty += dirty / 4;
+       if (mem == NULL) {
+               tsk = current;
+               if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+                       background += background / 4;
+                       dirty += dirty / 4;
+               }
        }
        *pbackground = background;
        *pdirty = dirty;
@@ -409,16 +419,17 @@ get_dirty_limits(long *pbackground, long *pdirty, long 
*pbdi_dirty,
 
                *pbdi_dirty = bdi_dirty;
                clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
-               task_dirty_limit(current, pbdi_dirty);
+               if (mem == NULL)
+                       task_dirty_limit(current, pbdi_dirty);
        }
 }
 
 /*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data.  It looks at the number of dirty pages in the machine and will force
- * the caller to perform writeback if the system is over `vm_dirty_ratio'.
- * If we're over `background_thresh' then pdflush is woken to perform some
- * writeout.
+ * the caller to perform writeback if the system is over
+ * `mem_cgroup_dirty_ratio()'.  If we're over `background_thresh' then pdflush
+ * is woken to perform some writeout.
  */
 static void balance_dirty_pages(struct address_space *mapping)
 {
@@ -441,12 +452,11 @@ static void balance_dirty_pages(struct address_space 
*mapping)
                        .range_cyclic   = 1,
                };
 
-               get_dirty_limits(&background_thresh, &dirty_thresh,
+               get_dirty_limits(NULL, &background_thresh, &dirty_thresh,
                                &bdi_thresh, bdi);
 
-               nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
-                                       global_page_state(NR_UNSTABLE_NFS);
-               nr_writeback = global_page_state(NR_WRITEBACK);
+               nr_reclaimable = mem_cgroup_nr_file_dirty(NULL);
+               nr_writeback = mem_cgroup_nr_writeback(NULL);
 
                bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
                bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
@@ -475,8 +485,9 @@ static void balance_dirty_pages(struct address_space 
*mapping)
                if (bdi_nr_reclaimable) {
                        writeback_inodes(&wbc);
                        pages_written += write_chunk - wbc.nr_to_write;
-                       get_dirty_limits(&background_thresh, &dirty_thresh,
-                                      &bdi_thresh, bdi);
+                       get_dirty_limits(NULL,
+                                       &background_thresh, &dirty_thresh,
+                                       &bdi_thresh, bdi);
                }
 
                /*
@@ -521,10 +532,13 @@ static void balance_dirty_pages(struct address_space 
*mapping)
         * background_thresh, to keep the amount of dirty memory low.
         */
        if ((laptop_mode && pages_written) ||
-                       (!laptop_mode && (global_page_state(NR_FILE_DIRTY)
-                                         + global_page_state(NR_UNSTABLE_NFS)
-                                         > background_thresh)))
-               pdflush_operation(background_writeout, 0);
+               (!laptop_mode &&
+                       (mem_cgroup_nr_file_dirty(NULL) > background_thresh))) {
+               struct mem_cgroup *mem = get_current_mem_cgroup();
+
+               if (pdflush_operation(background_writeout, mem, 0))
+                       put_mem_cgroup(mem);
+       }
 }
 
 void set_page_dirty_balance(struct page *page, int page_mkwrite)
@@ -585,8 +599,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
        long dirty_thresh;
 
         for ( ; ; ) {
-               get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
-
+               get_dirty_limits(NULL, &background_thresh, &dirty_thresh,
+                               NULL, NULL);
                 /*
                  * Boost the allowable dirty threshold a bit for page
                  * allocators so they don't get DoS'ed by heavy writers
@@ -612,7 +626,8 @@ void throttle_vm_writeout(gfp_t gfp_mask)
  * writeback at least _min_pages, and keep writing until the amount of dirty
  * memory is less than the background threshold, or until we're all clean.
  */
-static void background_writeout(unsigned long _min_pages)
+static void background_writeout(struct mem_cgroup *mem,
+                               unsigned long _min_pages)
 {
        long min_pages = _min_pages;
        struct writeback_control wbc = {
@@ -628,9 +643,9 @@ static void background_writeout(unsigned long _min_pages)
                long background_thresh;
                long dirty_thresh;
 
-               get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
-               if (global_page_state(NR_FILE_DIRTY) +
-                       global_page_state(NR_UNSTABLE_NFS) < background_thresh
+               get_dirty_limits(mem, &background_thresh, &dirty_thresh,
+                               NULL, NULL);
+               if (mem_cgroup_nr_file_dirty(mem) < background_thresh
                                && min_pages <= 0)
                        break;
                wbc.more_io = 0;
@@ -647,6 +662,7 @@ static void background_writeout(unsigned long _min_pages)
                                break;
                }
        }
+       put_mem_cgroup(mem);
 }
 
 /*
@@ -656,10 +672,15 @@ static void background_writeout(unsigned long _min_pages)
  */
 int wakeup_pdflush(long nr_pages)
 {
+       struct mem_cgroup *mem = get_current_mem_cgroup();
+       int ret;
+
        if (nr_pages == 0)
-               nr_pages = global_page_state(NR_FILE_DIRTY) +
-                               global_page_state(NR_UNSTABLE_NFS);
-       return pdflush_operation(background_writeout, nr_pages);
+               nr_pages = mem_cgroup_nr_file_dirty(NULL);
+       ret = pdflush_operation(background_writeout, mem, nr_pages);
+       if (ret)
+               put_mem_cgroup(mem);
+       return ret;
 }
 
 static void wb_timer_fn(unsigned long unused);
@@ -683,7 +704,7 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 
0, 0);
  * older_than_this takes precedence over nr_to_write.  So we'll only write back
  * all dirty pages if they are all attached to "old" mappings.
  */
-static void wb_kupdate(unsigned long arg)
+static void wb_kupdate(struct mem_cgroup *mem, unsigned long arg)
 {
        unsigned long oldest_jif;
        unsigned long start_jif;
@@ -704,8 +725,7 @@ static void wb_kupdate(unsigned long arg)
        oldest_jif = jiffies - dirty_expire_interval;
        start_jif = jiffies;
        next_jif = start_jif + dirty_writeback_interval;
-       nr_to_write = global_page_state(NR_FILE_DIRTY) +
-                       global_page_state(NR_UNSTABLE_NFS) +
+       nr_to_write = mem_cgroup_nr_file_dirty(mem) +
                        (inodes_stat.nr_inodes - inodes_stat.nr_unused);
        while (nr_to_write > 0) {
                wbc.more_io = 0;
@@ -724,6 +744,7 @@ static void wb_kupdate(unsigned long arg)
                next_jif = jiffies + HZ;
        if (dirty_writeback_interval)
                mod_timer(&wb_timer, next_jif);
+       put_mem_cgroup(mem);
 }
 
 /*
@@ -742,18 +763,22 @@ int dirty_writeback_centisecs_handler(ctl_table *table, 
int write,
 
 static void wb_timer_fn(unsigned long unused)
 {
-       if (pdflush_operation(wb_kupdate, 0) < 0)
+       struct mem_cgroup *mem = get_current_mem_cgroup();
+
+       if (pdflush_operation(wb_kupdate, mem, 0) < 0) {
+               put_mem_cgroup(mem);
                mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
+       }
 }
 
-static void laptop_flush(unsigned long unused)
+static void laptop_flush(struct mem_cgroup *mem, unsigned long unused)
 {
        sys_sync();
 }
 
 static void laptop_timer_fn(unsigned long unused)
 {
-       pdflush_operation(laptop_flush, 0);
+       pdflush_operation(laptop_flush, NULL, 0);
 }
 
 /*
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 0cbe0c6..27f05b6 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -83,7 +83,9 @@ static unsigned long last_empty_jifs;
  */
 struct pdflush_work {
        struct task_struct *who;        /* The thread */
-       void (*fn)(unsigned long);      /* A callback function */
+       void (*fn)(struct mem_cgroup *,
+                       unsigned long); /* A callback function */
+       struct mem_cgroup *mem;         /* callback memory cgroup argument */
        unsigned long arg0;             /* An argument to the callback */
        struct list_head list;          /* On pdflush_list, when idle */
        unsigned long when_i_went_to_sleep;
@@ -124,7 +126,7 @@ static int __pdflush(struct pdflush_work *my_work)
                }
                spin_unlock_irq(&pdflush_lock);
 
-               (*my_work->fn)(my_work->arg0);
+               (*my_work->fn)(my_work->mem, my_work->arg0);
 
                /*
                 * Thread creation: For how long have there been zero
@@ -198,7 +200,8 @@ static int pdflush(void *dummy)
  * Returns zero if it indeed managed to find a worker thread, and passed your
  * payload to it.
  */
-int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0)
+int pdflush_operation(void (*fn)(struct mem_cgroup *, unsigned long),
+               struct mem_cgroup *mem, unsigned long arg0)
 {
        unsigned long flags;
        int ret = 0;
@@ -216,6 +219,7 @@ int pdflush_operation(void (*fn)(unsigned long), unsigned 
long arg0)
                if (list_empty(&pdflush_list))
                        last_empty_jifs = jiffies;
                pdf->fn = fn;
+               pdf->mem = mem;
                pdf->arg0 = arg0;
                wake_up_process(pdf->who);
        }
-- 
1.5.4.3

_______________________________________________
Containers mailing list
[EMAIL PROTECTED]
https://lists.linux-foundation.org/mailman/listinfo/containers

_______________________________________________
Devel mailing list
[email protected]
https://openvz.org/mailman/listinfo/devel

Reply via email to