[PATCH 2/6] memcg: dirty-set limiting and filtered writeback
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> mem_cgroup_dirty_limits() checks thresholds and schedules per-bdi writeback work (where ->for_memcg is set) which writes only inodes where dirty limit is exceeded for owner memcg or for whole bdi. Interface: memory.dirty_ratio percent of memory limit used as threshold (0 = unlimited, default 50). Background threshold is a half of that. And fs_dirty_threshold line in memory.stat shows current threshold. Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> --- fs/fs-writeback.c| 18 - include/linux/backing-dev.h |1 include/linux/memcontrol.h |6 ++ include/linux/writeback.h|1 include/trace/events/writeback.h |1 mm/memcontrol.c | 145 ++ mm/page-writeback.c | 25 ++- 7 files changed, 190 insertions(+), 7 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2d609a5..9034768 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -20,6 +20,7 @@ #include <linux/sched.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/memcontrol.h> #include <linux/pagemap.h> #include <linux/kthread.h> #include <linux/writeback.h> @@ -47,6 +48,7 @@ struct wb_writeback_work { unsigned int range_cyclic:1; unsigned int for_background:1; unsigned int for_sync:1;/* sync(2) WB_SYNC_ALL writeback */ + unsigned int for_memcg:1; enum wb_reason reason; /* why was writeback initiated? 
*/ struct list_head list; /* pending work list */ @@ -137,6 +139,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; work->reason= reason; + work->for_memcg = reason == WB_REASON_FOR_MEMCG; bdi_queue_work(bdi, work); } @@ -258,15 +261,16 @@ static int move_expired_inodes(struct list_head *delaying_queue, LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; - struct inode *inode; + struct inode *inode, *next; int do_sb_sort = 0; int moved = 0; - while (!list_empty(delaying_queue)) { - inode = wb_inode(delaying_queue->prev); + list_for_each_entry_safe(inode, next, delaying_queue, i_wb_list) { if (work->older_than_this && inode_dirtied_after(inode, *work->older_than_this)) break; + if (work->for_memcg && !mem_cgroup_dirty_exceeded(inode)) + continue; list_move(&inode->i_wb_list, &tmp); moved++; if (sb_is_blkdev_sb(inode->i_sb)) @@ -650,6 +654,11 @@ static long writeback_sb_inodes(struct super_block *sb, break; } + if (work->for_memcg && !mem_cgroup_dirty_exceeded(inode)) { + redirty_tail(inode, wb); + continue; + } + /* * Don't bother with new inodes or inodes being freed, first * kind does not need periodic writeout yet, and for the latter @@ -1014,6 +1023,9 @@ static long wb_do_writeback(struct bdi_writeback *wb) wrote += wb_writeback(wb, work); + if (work->for_memcg) + clear_bit(BDI_memcg_writeback_running, &bdi->state); + /* * Notify the caller of completion if this is a synchronous * work item, otherwise just free it. 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5da6012..91b55d8 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -32,6 +32,7 @@ enum bdi_state { BDI_sync_congested, /* The sync queue is getting full */ BDI_registered, /* bdi_register() was done */ BDI_writeback_running, /* Writeback is in progress */ + BDI_memcg_writeback_running, }; typedef int (congested_fn)(void *, int); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b281333..ae05563 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -178,6 +178,9 @@ void mem_cgroup_dec_page_dirty(struct address_space *mapping); void mem_cgroup_inc_page_writeback(struct address_space *mapping); void mem_cgroup_dec_page_writeback(struct address_space *mapping); void mem_cgroup_forget_mapping(struct address_space *mapping); +bool mem_cgroup_dirty_limits(struct address_space *mapping, unsigned long *dirty, +unsigned long *thresh, unsigned long *bg_thresh); +bool mem_cgroup_dirty_exceeded(struct inode *inode); #else /* CONFIG_MEMCG */ struct mem_cgroup; @@ -352,6 +355,9 @@ static inline void mem_cgroup_dec_page_dirty(struct address_space *mapping) {} static inline void mem_cgroup_inc_page_writeback(struct address_space *mapping) {} static inline void mem_cgroup_dec_page_writeback(struct address_space *mapping) {} static inline void
[PATCH 2/6] memcg: dirty-set limiting and filtered writeback
From: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> mem_cgroup_dirty_limits() checks thresholds and schedules per-bdi writeback work (where ->for_memcg is set) which writes only inodes where dirty limit is exceeded for owner memcg or for whole bdi. Interface: memory.dirty_ratio percent of memory limit used as threshold (0 = unlimited, default 50). Background threshold is a half of that. And fs_dirty_threshold line in memory.stat shows current threshold. Signed-off-by: Konstantin Khlebnikov <khlebnikov@yandex-team.ru> --- fs/fs-writeback.c| 18 - include/linux/backing-dev.h |1 include/linux/memcontrol.h |6 ++ include/linux/writeback.h|1 include/trace/events/writeback.h |1 mm/memcontrol.c | 145 ++ mm/page-writeback.c | 25 ++- 7 files changed, 190 insertions(+), 7 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 2d609a5..9034768 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -20,6 +20,7 @@ #include <linux/sched.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/memcontrol.h> #include <linux/pagemap.h> #include <linux/kthread.h> #include <linux/writeback.h> @@ -47,6 +48,7 @@ struct wb_writeback_work { unsigned int range_cyclic:1; unsigned int for_background:1; unsigned int for_sync:1;/* sync(2) WB_SYNC_ALL writeback */ + unsigned int for_memcg:1; enum wb_reason reason; /* why was writeback initiated? 
*/ struct list_head list; /* pending work list */ @@ -137,6 +139,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, work->nr_pages = nr_pages; work->range_cyclic = range_cyclic; work->reason= reason; + work->for_memcg = reason == WB_REASON_FOR_MEMCG; bdi_queue_work(bdi, work); } @@ -258,15 +261,16 @@ static int move_expired_inodes(struct list_head *delaying_queue, LIST_HEAD(tmp); struct list_head *pos, *node; struct super_block *sb = NULL; - struct inode *inode; + struct inode *inode, *next; int do_sb_sort = 0; int moved = 0; - while (!list_empty(delaying_queue)) { - inode = wb_inode(delaying_queue->prev); + list_for_each_entry_safe(inode, next, delaying_queue, i_wb_list) { if (work->older_than_this && inode_dirtied_after(inode, *work->older_than_this)) break; + if (work->for_memcg && !mem_cgroup_dirty_exceeded(inode)) + continue; list_move(&inode->i_wb_list, &tmp); moved++; if (sb_is_blkdev_sb(inode->i_sb)) @@ -650,6 +654,11 @@ static long writeback_sb_inodes(struct super_block *sb, break; } + if (work->for_memcg && !mem_cgroup_dirty_exceeded(inode)) { + redirty_tail(inode, wb); + continue; + } + /* * Don't bother with new inodes or inodes being freed, first * kind does not need periodic writeout yet, and for the latter @@ -1014,6 +1023,9 @@ static long wb_do_writeback(struct bdi_writeback *wb) wrote += wb_writeback(wb, work); + if (work->for_memcg) + clear_bit(BDI_memcg_writeback_running, &bdi->state); + /* * Notify the caller of completion if this is a synchronous * work item, otherwise just free it. 
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 5da6012..91b55d8 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -32,6 +32,7 @@ enum bdi_state { BDI_sync_congested, /* The sync queue is getting full */ BDI_registered, /* bdi_register() was done */ BDI_writeback_running, /* Writeback is in progress */ + BDI_memcg_writeback_running, }; typedef int (congested_fn)(void *, int); diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b281333..ae05563 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -178,6 +178,9 @@ void mem_cgroup_dec_page_dirty(struct address_space *mapping); void mem_cgroup_inc_page_writeback(struct address_space *mapping); void mem_cgroup_dec_page_writeback(struct address_space *mapping); void mem_cgroup_forget_mapping(struct address_space *mapping); +bool mem_cgroup_dirty_limits(struct address_space *mapping, unsigned long *dirty, +unsigned long *thresh, unsigned long *bg_thresh); +bool mem_cgroup_dirty_exceeded(struct inode *inode); #else /* CONFIG_MEMCG */ struct mem_cgroup; @@ -352,6 +355,9 @@ static inline void mem_cgroup_dec_page_dirty(struct address_space *mapping) {} static inline void mem_cgroup_inc_page_writeback(struct