[PATCH 2/6] memcg: dirty-set limiting and filtered writeback

2015-01-15 Thread Konstantin Khlebnikov
From: Konstantin Khlebnikov <khlebni...@yandex-team.ru>

mem_cgroup_dirty_limits() checks thresholds and schedules per-bdi
writeback work (where ->for_memcg is set) which writes only inodes
where dirty limit is exceeded for owner memcg or for whole bdi.

Interface: memory.dirty_ratio percent of memory limit used as threshold
(0 = unlimited, default 50). Background threshold is a half of that.
And fs_dirty_threshold line in memory.stat shows current threshold.

Signed-off-by: Konstantin Khlebnikov <khlebni...@yandex-team.ru>
---
 fs/fs-writeback.c|   18 -
 include/linux/backing-dev.h  |1 
 include/linux/memcontrol.h   |6 ++
 include/linux/writeback.h|1 
 include/trace/events/writeback.h |1 
 mm/memcontrol.c  |  145 ++
 mm/page-writeback.c  |   25 ++-
 7 files changed, 190 insertions(+), 7 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2d609a5..9034768 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -20,6 +20,7 @@
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/memcontrol.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
 #include <linux/writeback.h>
@@ -47,6 +48,7 @@ struct wb_writeback_work {
unsigned int range_cyclic:1;
unsigned int for_background:1;
unsigned int for_sync:1;/* sync(2) WB_SYNC_ALL writeback */
+   unsigned int for_memcg:1;
enum wb_reason reason;  /* why was writeback initiated? */
 
struct list_head list;  /* pending work list */
@@ -137,6 +139,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long 
nr_pages,
work->nr_pages  = nr_pages;
work->range_cyclic = range_cyclic;
work->reason= reason;
+   work->for_memcg = reason == WB_REASON_FOR_MEMCG;
 
bdi_queue_work(bdi, work);
 }
@@ -258,15 +261,16 @@ static int move_expired_inodes(struct list_head 
*delaying_queue,
LIST_HEAD(tmp);
struct list_head *pos, *node;
struct super_block *sb = NULL;
-   struct inode *inode;
+   struct inode *inode, *next;
int do_sb_sort = 0;
int moved = 0;
 
-   while (!list_empty(delaying_queue)) {
-   inode = wb_inode(delaying_queue->prev);
+   list_for_each_entry_safe(inode, next, delaying_queue, i_wb_list) {
if (work->older_than_this &&
inode_dirtied_after(inode, *work->older_than_this))
break;
+   if (work->for_memcg && !mem_cgroup_dirty_exceeded(inode))
+   continue;
		list_move(&inode->i_wb_list, &tmp);
moved++;
if (sb_is_blkdev_sb(inode->i_sb))
@@ -650,6 +654,11 @@ static long writeback_sb_inodes(struct super_block *sb,
break;
}
 
+   if (work->for_memcg && !mem_cgroup_dirty_exceeded(inode)) {
+   redirty_tail(inode, wb);
+   continue;
+   }
+
/*
 * Don't bother with new inodes or inodes being freed, first
 * kind does not need periodic writeout yet, and for the latter
@@ -1014,6 +1023,9 @@ static long wb_do_writeback(struct bdi_writeback *wb)
 
wrote += wb_writeback(wb, work);
 
+   if (work->for_memcg)
+			clear_bit(BDI_memcg_writeback_running, &bdi->state);
+
/*
 * Notify the caller of completion if this is a synchronous
 * work item, otherwise just free it.
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 5da6012..91b55d8 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -32,6 +32,7 @@ enum bdi_state {
BDI_sync_congested, /* The sync queue is getting full */
BDI_registered, /* bdi_register() was done */
BDI_writeback_running,  /* Writeback is in progress */
+   BDI_memcg_writeback_running,
 };
 
 typedef int (congested_fn)(void *, int);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b281333..ae05563 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -178,6 +178,9 @@ void mem_cgroup_dec_page_dirty(struct address_space 
*mapping);
 void mem_cgroup_inc_page_writeback(struct address_space *mapping);
 void mem_cgroup_dec_page_writeback(struct address_space *mapping);
 void mem_cgroup_forget_mapping(struct address_space *mapping);
+bool mem_cgroup_dirty_limits(struct address_space *mapping, unsigned long 
*dirty,
+unsigned long *thresh, unsigned long *bg_thresh);
+bool mem_cgroup_dirty_exceeded(struct inode *inode);
 
 #else /* CONFIG_MEMCG */
 struct mem_cgroup;
@@ -352,6 +355,9 @@ static inline void mem_cgroup_dec_page_dirty(struct 
address_space *mapping) {}
 static inline void mem_cgroup_inc_page_writeback(struct address_space 
*mapping) {}
 static inline void mem_cgroup_dec_page_writeback(struct address_space 
*mapping) {}
 static inline void 

[PATCH 2/6] memcg: dirty-set limiting and filtered writeback

2015-01-15 Thread Konstantin Khlebnikov
From: Konstantin Khlebnikov <khlebni...@yandex-team.ru>

mem_cgroup_dirty_limits() checks thresholds and schedules per-bdi
writeback work (where ->for_memcg is set) which writes only inodes
where dirty limit is exceeded for owner memcg or for whole bdi.

Interface: memory.dirty_ratio percent of memory limit used as threshold
(0 = unlimited, default 50). Background threshold is a half of that.
And fs_dirty_threshold line in memory.stat shows current threshold.

Signed-off-by: Konstantin Khlebnikov <khlebni...@yandex-team.ru>
---
 fs/fs-writeback.c|   18 -
 include/linux/backing-dev.h  |1 
 include/linux/memcontrol.h   |6 ++
 include/linux/writeback.h|1 
 include/trace/events/writeback.h |1 
 mm/memcontrol.c  |  145 ++
 mm/page-writeback.c  |   25 ++-
 7 files changed, 190 insertions(+), 7 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 2d609a5..9034768 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -20,6 +20,7 @@
 #include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/memcontrol.h>
 #include <linux/pagemap.h>
 #include <linux/kthread.h>
 #include <linux/writeback.h>
@@ -47,6 +48,7 @@ struct wb_writeback_work {
unsigned int range_cyclic:1;
unsigned int for_background:1;
unsigned int for_sync:1;/* sync(2) WB_SYNC_ALL writeback */
+   unsigned int for_memcg:1;
enum wb_reason reason;  /* why was writeback initiated? */
 
struct list_head list;  /* pending work list */
@@ -137,6 +139,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long 
nr_pages,
	work->nr_pages	= nr_pages;
	work->range_cyclic = range_cyclic;
	work->reason	= reason;
+	work->for_memcg = reason == WB_REASON_FOR_MEMCG;
 
bdi_queue_work(bdi, work);
 }
@@ -258,15 +261,16 @@ static int move_expired_inodes(struct list_head 
*delaying_queue,
LIST_HEAD(tmp);
struct list_head *pos, *node;
struct super_block *sb = NULL;
-   struct inode *inode;
+   struct inode *inode, *next;
int do_sb_sort = 0;
int moved = 0;
 
-   while (!list_empty(delaying_queue)) {
-		inode = wb_inode(delaying_queue->prev);
+   list_for_each_entry_safe(inode, next, delaying_queue, i_wb_list) {
		if (work->older_than_this &&
		    inode_dirtied_after(inode, *work->older_than_this))
			break;
+		if (work->for_memcg && !mem_cgroup_dirty_exceeded(inode))
+			continue;
		list_move(&inode->i_wb_list, &tmp);
		moved++;
		if (sb_is_blkdev_sb(inode->i_sb))
@@ -650,6 +654,11 @@ static long writeback_sb_inodes(struct super_block *sb,
break;
}
 
+		if (work->for_memcg && !mem_cgroup_dirty_exceeded(inode)) {
+   redirty_tail(inode, wb);
+   continue;
+   }
+
/*
 * Don't bother with new inodes or inodes being freed, first
 * kind does not need periodic writeout yet, and for the latter
@@ -1014,6 +1023,9 @@ static long wb_do_writeback(struct bdi_writeback *wb)
 
wrote += wb_writeback(wb, work);
 
+		if (work->for_memcg)
+			clear_bit(BDI_memcg_writeback_running, &bdi->state);
+
/*
 * Notify the caller of completion if this is a synchronous
 * work item, otherwise just free it.
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 5da6012..91b55d8 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -32,6 +32,7 @@ enum bdi_state {
BDI_sync_congested, /* The sync queue is getting full */
BDI_registered, /* bdi_register() was done */
BDI_writeback_running,  /* Writeback is in progress */
+   BDI_memcg_writeback_running,
 };
 
 typedef int (congested_fn)(void *, int);
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index b281333..ae05563 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -178,6 +178,9 @@ void mem_cgroup_dec_page_dirty(struct address_space 
*mapping);
 void mem_cgroup_inc_page_writeback(struct address_space *mapping);
 void mem_cgroup_dec_page_writeback(struct address_space *mapping);
 void mem_cgroup_forget_mapping(struct address_space *mapping);
+bool mem_cgroup_dirty_limits(struct address_space *mapping, unsigned long 
*dirty,
+unsigned long *thresh, unsigned long *bg_thresh);
+bool mem_cgroup_dirty_exceeded(struct inode *inode);
 
 #else /* CONFIG_MEMCG */
 struct mem_cgroup;
@@ -352,6 +355,9 @@ static inline void mem_cgroup_dec_page_dirty(struct 
address_space *mapping) {}
 static inline void mem_cgroup_inc_page_writeback(struct