The main idea is the following: * for background writeback we check all UBs for exceeding the dirty limit. * background writeback goes on if any UB has exceeded its dirty limit. * In that case, writeback will skip inodes that belong to a "within dirty-limits" UB
writeback_inodes_wb() gain an new 'struct user_beancounter *ub' argument which is needed for targeted per-CT writeback. This will be used in the next patch. https://jira.sw.ru/browse/PSBM-33841 Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com> --- fs/fs-writeback.c | 38 ++++++++++++++++++++++++++++++++------ include/bc/io_acct.h | 7 ++++++- include/linux/backing-dev.h | 2 ++ kernel/bc/io_acct.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 4 files changed, 80 insertions(+), 9 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ac8066b..ef4f963 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -42,6 +42,7 @@ struct wb_writeback_work { struct super_block *sb; unsigned long *older_than_this; enum writeback_sync_modes sync_mode; + unsigned int filter_ub:1; unsigned int tagged_writepages:1; unsigned int for_kupdate:1; unsigned int range_cyclic:1; @@ -51,6 +52,7 @@ struct wb_writeback_work { struct list_head list; /* pending work list */ struct completion *done; /* set if the caller waits */ + struct user_beancounter *ub; }; /* @@ -724,6 +726,13 @@ static long writeback_sb_inodes(struct super_block *sb, trace_writeback_sb_inodes_requeue(inode); continue; } + if ((work->ub || work->filter_ub) && + ub_should_skip_writeback(work->ub, inode)) { + spin_unlock(&inode->i_lock); + requeue_io(inode, wb); + continue; + } + spin_unlock(&wb->list_lock); /* @@ -809,14 +818,16 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, return wrote; } -static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, - enum wb_reason reason) +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, + enum wb_reason reason, struct user_beancounter *ub) { struct wb_writeback_work work = { .nr_pages = nr_pages, .sync_mode = WB_SYNC_NONE, .range_cyclic = 1, .reason = reason, + .ub = ub, + .filter_ub = 0, }; spin_lock(&wb->list_lock); @@ -904,8 +915,14 @@ static long wb_writeback(struct bdi_writeback *wb, * For background writeout, stop 
when we are below the * background dirty threshold */ - if (work->for_background && !over_bground_thresh(wb->bdi)) - break; + if (work->for_background) { + if (over_bground_thresh(wb->bdi)) + work->filter_ub = 0; + else if (ub_over_bground_thresh()) + work->filter_ub = 1; + else + break; + } /* * Kupdate and background works are special and we want to @@ -996,7 +1013,8 @@ static unsigned long get_nr_dirty_pages(void) static long wb_check_background_flush(struct bdi_writeback *wb) { - if (over_bground_thresh(wb->bdi)) { + if (over_bground_thresh(wb->bdi) || + ub_over_bground_thresh()) { struct wb_writeback_work work = { .nr_pages = LONG_MAX, @@ -1004,6 +1022,8 @@ static long wb_check_background_flush(struct bdi_writeback *wb) .for_background = 1, .range_cyclic = 1, .reason = WB_REASON_BACKGROUND, + .filter_ub = 0, + .ub = NULL, }; return wb_writeback(wb, &work); @@ -1038,6 +1058,8 @@ static long wb_check_old_data_flush(struct bdi_writeback *wb) .for_kupdate = 1, .range_cyclic = 1, .reason = WB_REASON_PERIODIC, + .filter_ub = 0, + .ub = NULL, }; return wb_writeback(wb, &work); @@ -1115,7 +1137,7 @@ void bdi_writeback_workfn(struct work_struct *work) * enough for efficient IO. */ pages_written = writeback_inodes_wb(&bdi->wb, 1024, - WB_REASON_FORKER_THREAD); + WB_REASON_FORKER_THREAD, NULL); trace_writeback_pages_written(pages_written); } @@ -1432,6 +1454,8 @@ void writeback_inodes_sb_nr(struct super_block *sb, .done = &done, .nr_pages = nr, .reason = reason, + .filter_ub = 0, + .ub = NULL, }; if (sb->s_bdi == &noop_backing_dev_info) @@ -1514,6 +1538,8 @@ void sync_inodes_sb(struct super_block *sb) .done = &done, .reason = WB_REASON_SYNC, .for_sync = 1, + .filter_ub = 0, + .ub = NULL, }; /* Nothing to do? 
*/ diff --git a/include/bc/io_acct.h b/include/bc/io_acct.h index fa7afb1..e0af0bf 100644 --- a/include/bc/io_acct.h +++ b/include/bc/io_acct.h @@ -58,7 +58,7 @@ extern void ub_io_writeback_dec(struct address_space *mapping); extern int ub_dirty_limits(unsigned long *pbackground, long *pdirty, struct user_beancounter *ub); - +extern bool ub_over_bground_thresh(void); extern bool ub_should_skip_writeback(struct user_beancounter *ub, struct inode *inode); @@ -116,6 +116,11 @@ static inline struct user_beancounter *get_io_ub(void) return NULL; } +static inline bool ub_over_bground_thresh(void) +{ + return false; +} + #endif /* UBC_IO_ACCT */ #endif diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 859504b..b7668cf 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -130,6 +130,8 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int); void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages, enum wb_reason reason); void bdi_start_background_writeback(struct backing_dev_info *bdi); +long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages, + enum wb_reason reason, struct user_beancounter *ub); void bdi_writeback_workfn(struct work_struct *work); int bdi_has_dirty_io(struct backing_dev_info *bdi); void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi); diff --git a/kernel/bc/io_acct.c b/kernel/bc/io_acct.c index f9778f8..e863ce7 100644 --- a/kernel/bc/io_acct.c +++ b/kernel/bc/io_acct.c @@ -126,12 +126,48 @@ void ub_io_writeback_dec(struct address_space *mapping) } } +static bool __ub_over_bground_thresh(struct user_beancounter *ub) +{ + unsigned long background_thresh, dirty_thresh; + unsigned long ub_dirty, ub_writeback; + + ub_dirty_limits(&background_thresh, &dirty_thresh, ub); + + ub_dirty = ub_stat_get(ub, dirty_pages); + ub_writeback = ub_stat_get(ub, writeback_pages); + + if (ub_dirty + ub_writeback >= background_thresh) + return true; + + return false; +} + 
+bool ub_over_bground_thresh(void) +{ + struct user_beancounter *ub; + bool ret = false; + + rcu_read_lock(); + for_each_beancounter(ub) { + if (ub == get_ub0()) + continue; + if (__ub_over_bground_thresh(ub)) { + ret = true; + break; + } + } + rcu_read_unlock(); + return ret; +} + int ub_dirty_limits(unsigned long *pbackground, long *pdirty, struct user_beancounter *ub) { int dirty_ratio; unsigned long available_memory; + *pdirty = *pbackground = LONG_MAX; + dirty_ratio = ub_dirty_ratio; if (!dirty_ratio) return 0; @@ -157,8 +193,10 @@ bool ub_should_skip_writeback(struct user_beancounter *ub, struct inode *inode) rcu_read_lock(); dirtied_ub = rcu_dereference(inode->i_mapping->dirtied_ub); - ret = !dirtied_ub || (dirtied_ub != ub && - !test_bit(UB_DIRTY_EXCEEDED, &dirtied_ub->ub_flags)); + if (ub) + ret = (ub != dirtied_ub); + else + ret = (dirtied_ub && !ub_over_bground_thresh()); rcu_read_unlock(); return ret; -- 2.4.10 _______________________________________________ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel