For the planned cgroup writeback support, on each bdi
(backing_dev_info), each cgroup will be served by a separate wb
(bdi_writeback).  This patch updates bdi so that a bdi can host
multiple wbs (bdi_writebacks).

bdi->wb remains unchanged and will keep serving the root cgroup.
cgwb's (cgroup wb's) for non-root cgroups are looked up or created
on demand from init_cgwb_dirty_page_context() according to the dirty
blkcg of the page being dirtied.  Each cgwb is indexed in
bdi->cgwb_tree by its blkcg id.
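
For example, resolving the wb for a page being dirtied boils down to
the following (a sketch of the logic added below; error handling
abbreviated):

	struct bdi_writeback *wb;

	/*
	 * bdi->cgwb_tree is indexed by blkcg id; the cgwb is created
	 * on demand on first encounter of this blkcg on this bdi.
	 */
	wb = cgwb_lookup_create(bdi, blkcg_css);
	if (!wb)
		wb = &bdi->wb;		/* fall back to the root wb */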

Once dirty_context is initialized for a page, the page's wb can be
looked up using page_cgwb_dirty() or page_cgwb_wb() while the page is
dirty or under writeback, respectively.  Once created, a cgwb is
destroyed only when its associated bdi or blkcg is destroyed, meaning
that as long as a page is dirty or under writeback, its associated
cgwb is accessible without further locking.
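
For example, a hypothetical stats update against a dirty page can
dereference its cgwb without taking any lock:

	/* safe: @page is dirty, so its cgwb can't be destroyed yet */
	struct bdi_writeback *wb = page_cgwb_dirty(page);

	__inc_wb_stat(wb, WB_RECLAIMABLE);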

dirty_context grew a new field ->wb which caches the selected wb, and
account_page_dirtied() is updated to use it instead of unconditionally
using bdi->wb.
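
In other words, the dirtying path conceptually becomes the following
(a sketch; init_dirty_page_context() here stands in for the
dirty_context initialization added earlier in the series):

	struct dirty_context dctx;

	init_dirty_page_context(&dctx, page, mapping);	/* picks dctx->wb */
	...
	account_page_dirtied(&dctx);	/* charges dctx->wb, not bdi->wb */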

Currently, no filesystem sets FS_CGROUP_WRITEBACK, so all pages keep
being associated with bdi->wb.
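
A filesystem would opt in by setting the flag in its file_system_type;
e.g. for a hypothetical filesystem "foo":

	static struct file_system_type foo_fs_type = {
		.owner		= THIS_MODULE,
		.name		= "foo",
		.mount		= foo_mount,
		.kill_sb	= kill_block_super,
		.fs_flags	= FS_REQUIRES_DEV | FS_CGROUP_WRITEBACK,
	};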

Signed-off-by: Tejun Heo <t...@kernel.org>
Cc: Jens Axboe <ax...@kernel.dk>
Cc: Jan Kara <j...@suse.cz>
---
 block/blk-cgroup.c               |  11 ++-
 fs/fs-writeback.c                |  19 +++-
 include/linux/backing-dev-defs.h |  17 +++-
 include/linux/backing-dev.h      | 123 +++++++++++++++++++++++++
 include/linux/blk-cgroup.h       |   4 +
 mm/backing-dev.c                 | 189 +++++++++++++++++++++++++++++++++++++++
 mm/page-writeback.c              |   4 +-
 7 files changed, 361 insertions(+), 6 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 9e0fe38..8bebaa9 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -15,6 +15,7 @@
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/blkdev.h>
+#include <linux/backing-dev.h>
 #include <linux/slab.h>
 #include <linux/genhd.h>
 #include <linux/delay.h>
@@ -813,6 +814,11 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
        spin_unlock_irq(&blkcg->lock);
 }
 
+static void blkcg_css_released(struct cgroup_subsys_state *css)
+{
+       cgwb_blkcg_released(css);
+}
+
 static void blkcg_css_free(struct cgroup_subsys_state *css)
 {
        struct blkcg *blkcg = css_to_blkcg(css);
@@ -841,7 +847,9 @@ done:
        spin_lock_init(&blkcg->lock);
        INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_ATOMIC);
        INIT_HLIST_HEAD(&blkcg->blkg_list);
-
+#ifdef CONFIG_CGROUP_WRITEBACK
+       INIT_LIST_HEAD(&blkcg->cgwb_list);
+#endif
        return &blkcg->css;
 }
 
@@ -926,6 +934,7 @@ static int blkcg_can_attach(struct cgroup_subsys_state *css,
 struct cgroup_subsys blkio_cgrp_subsys = {
        .css_alloc = blkcg_css_alloc,
        .css_offline = blkcg_css_offline,
+       .css_released = blkcg_css_released,
        .css_free = blkcg_css_free,
        .can_attach = blkcg_can_attach,
        .legacy_cftypes = blkcg_files,
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 138a5ea..3b54835 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -117,21 +117,37 @@ out_unlock:
  */
 static void init_cgwb_dirty_page_context(struct dirty_context *dctx)
 {
+       struct backing_dev_info *bdi = dctx->mapping->backing_dev_info;
+       struct cgroup_subsys_state *blkcg_css;
+
        /* cgroup writeback requires support from both the bdi and filesystem */
        if (!mapping_cgwb_enabled(dctx->mapping))
                goto force_root;
 
-       page_blkcg_attach_dirty(dctx->page);
+       /*
+        * @dctx->page is a candidate for cgroup writeback and about to be
+        * dirtied.  Attach the dirty blkcg to the page and pre-allocate
+        * all resources necessary for cgroup writeback.  On failure, fall
+        * back to the root blkcg.
+        */
+       blkcg_css = page_blkcg_attach_dirty(dctx->page);
+       dctx->wb = cgwb_lookup_create(bdi, blkcg_css);
+       if (!dctx->wb) {
+               page_blkcg_detach_dirty(dctx->page);
+               goto force_root;
+       }
        return;
 
 force_root:
        page_blkcg_force_root_dirty(dctx->page);
+       dctx->wb = &bdi->wb;
 }
 
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
 static void init_cgwb_dirty_page_context(struct dirty_context *dctx)
 {
+       dctx->wb = &dctx->mapping->backing_dev_info->wb;
 }
 
 #endif /* CONFIG_CGROUP_WRITEBACK */
@@ -176,6 +192,7 @@ void init_dirty_inode_context(struct dirty_context *dctx, struct inode *inode)
 {
        memset(dctx, 0, sizeof(*dctx));
        dctx->inode = inode;
+       dctx->wb = &inode_to_bdi(inode)->wb;
 }
 
 static void __wb_start_writeback(struct bdi_writeback *wb, long nr_pages,
diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h
index bf20ef1..511066f 100644
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -2,6 +2,7 @@
 #define __LINUX_BACKING_DEV_DEFS_H
 
 #include <linux/list.h>
+#include <linux/radix-tree.h>
 #include <linux/spinlock.h>
 #include <linux/percpu_counter.h>
 #include <linux/flex_proportions.h>
@@ -68,6 +69,15 @@ struct bdi_writeback {
        spinlock_t work_lock;           /* protects work_list & dwork scheduling */
        struct list_head work_list;
        struct delayed_work dwork;      /* work item used for writeback */
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+       struct cgroup_subsys_state *blkcg_css; /* the blkcg we belong to */
+       struct list_head blkcg_node;    /* anchored at blkcg->cgwb_list */
+       union {
+               struct list_head shutdown_node;
+               struct rcu_head rcu;
+       };
+#endif
 };
 
 struct backing_dev_info {
@@ -82,8 +92,10 @@ struct backing_dev_info {
        unsigned int min_ratio;
        unsigned int max_ratio, max_prop_frac;
 
-       struct bdi_writeback wb;  /* default writeback info for this bdi */
-
+       struct bdi_writeback wb; /* the root writeback info for this bdi */
+#ifdef CONFIG_CGROUP_WRITEBACK
+       struct radix_tree_root cgwb_tree; /* radix tree of !root cgroup wbs */
+#endif
        struct device *dev;
 
        struct timer_list laptop_mode_wb_timer;
@@ -102,6 +114,7 @@ struct dirty_context {
        struct page             *page;
        struct inode            *inode;
        struct address_space    *mapping;
+       struct bdi_writeback    *wb;
 };
 
 enum {
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 7a20cff..3722796 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/writeback.h>
 #include <linux/memcontrol.h>
+#include <linux/blk-cgroup.h>
 
 #include <linux/backing-dev-defs.h>
 
@@ -273,6 +274,10 @@ void init_dirty_inode_context(struct dirty_context *dctx, struct inode *inode);
 
 #ifdef CONFIG_CGROUP_WRITEBACK
 
+void cgwb_blkcg_released(struct cgroup_subsys_state *blkcg_css);
+int __cgwb_create(struct backing_dev_info *bdi,
+                 struct cgroup_subsys_state *blkcg_css);
+
 /**
  * mapping_cgwb_enabled - test whether cgroup writeback is enabled on a mapping
  * @mapping: address_space of interest
@@ -290,6 +295,97 @@ static inline bool mapping_cgwb_enabled(struct address_space *mapping)
                inode && (inode->i_sb->s_type->fs_flags & FS_CGROUP_WRITEBACK);
 }
 
+/**
+ * cgwb_lookup - lookup cgwb for a given blkcg on a bdi
+ * @bdi: target bdi
+ * @blkcg_css: target blkcg
+ *
+ * Look up the cgwb (cgroup bdi_writeback) for @blkcg_css on @bdi.  The
+ * returned cgwb is accessible as long as @bdi and @blkcg_css stay alive.
+ *
+ * Returns the pointer to the found cgwb on success, NULL on failure.
+ */
+static inline struct bdi_writeback *
+cgwb_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *blkcg_css)
+{
+       struct bdi_writeback *cgwb;
+
+       if (blkcg_css == blkcg_root_css)
+               return &bdi->wb;
+
+       /*
+        * RCU locking protects the radix tree itself.  The looked up cgwb
+        * is protected by the caller ensuring that @bdi and @blkcg_css
+        * stay alive.
+        */
+       rcu_read_lock();
+       cgwb = radix_tree_lookup(&bdi->cgwb_tree, blkcg_css->id);
+       rcu_read_unlock();
+       return cgwb;
+}
+
+/**
+ * cgwb_lookup_create - try to lookup cgwb and create one if not found
+ * @bdi: target bdi
+ * @blkcg_css: cgroup_subsys_state of the target blkcg
+ *
+ * Try to look up the cgwb (cgroup bdi_writeback) for the blkcg with
+ * @blkcg_css on @bdi.  If it doesn't exist, try to create one.  This
+ * function can be called under any context without locking as long as @bdi
+ * and @blkcg_css are kept alive.  See cgwb_lookup() for details.
+ *
+ * Returns the pointer to the found cgwb on success, NULL if such cgwb
+ * doesn't exist and creation failed due to memory pressure.
+ */
+static inline struct bdi_writeback *
+cgwb_lookup_create(struct backing_dev_info *bdi,
+                  struct cgroup_subsys_state *blkcg_css)
+{
+       struct bdi_writeback *wb;
+
+       do {
+               wb = cgwb_lookup(bdi, blkcg_css);
+               if (wb)
+                       return wb;
+       } while (!__cgwb_create(bdi, blkcg_css));
+
+       return NULL;
+}
+
+/**
+ * page_cgwb_dirty - lookup the dirty cgwb of a page
+ * @page: target page
+ *
+ * Returns the dirty cgwb (cgroup bdi_writeback) of @page.  The returned
+ * wb is accessible as long as @page is dirty.
+ */
+static inline struct bdi_writeback *page_cgwb_dirty(struct page *page)
+{
+       struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+       struct bdi_writeback *wb = cgwb_lookup(bdi, page_blkcg_dirty(page));
+
+       if (WARN_ON_ONCE(!wb))
+               return &bdi->wb;
+       return wb;
+}
+
+/**
+ * page_cgwb_wb - lookup the writeback cgwb of a page
+ * @page: target page
+ *
+ * Returns the writeback cgwb (cgroup bdi_writeback) of @page.  The
+ * returned wb is accessible as long as @page is under writeback.
+ */
+static inline struct bdi_writeback *page_cgwb_wb(struct page *page)
+{
+       struct backing_dev_info *bdi = page->mapping->backing_dev_info;
+       struct bdi_writeback *wb = cgwb_lookup(bdi, page_blkcg_wb(page));
+
+       if (WARN_ON_ONCE(!wb))
+               return &bdi->wb;
+       return wb;
+}
+
 #else  /* CONFIG_CGROUP_WRITEBACK */
 
 static inline bool mapping_cgwb_enabled(struct address_space *mapping)
@@ -297,6 +393,33 @@ static inline bool mapping_cgwb_enabled(struct address_space *mapping)
        return false;
 }
 
+static inline void cgwb_blkcg_released(struct cgroup_subsys_state *blkcg_css)
+{
+}
+
+static inline struct bdi_writeback *
cgwb_lookup(struct backing_dev_info *bdi, struct cgroup_subsys_state *blkcg_css)
+{
+       return &bdi->wb;
+}
+
+static inline struct bdi_writeback *
+cgwb_lookup_create(struct backing_dev_info *bdi,
+                  struct cgroup_subsys_state *blkcg_css)
+{
+       return &bdi->wb;
+}
+
+static inline struct bdi_writeback *page_cgwb_dirty(struct page *page)
+{
+       return &page->mapping->backing_dev_info->wb;
+}
+
+static inline struct bdi_writeback *page_cgwb_wb(struct page *page)
+{
+       return &page->mapping->backing_dev_info->wb;
+}
+
 #endif /* CONFIG_CGROUP_WRITEBACK */
 
 #endif         /* _LINUX_BACKING_DEV_H */
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 4dc643f..3033eb1 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -53,6 +53,10 @@ struct blkcg {
        /* TODO: per-policy storage in blkcg */
        unsigned int                    cfq_weight;     /* belongs to cfq */
        unsigned int                    cfq_leaf_weight;
+
+#ifdef CONFIG_CGROUP_WRITEBACK
+       struct list_head                cgwb_list;
+#endif
 };
 
 struct blkg_stat {
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 1c9b70e..c6dda82 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -440,6 +440,192 @@ static void wb_exit(struct bdi_writeback *wb)
        fprop_local_destroy_percpu(&wb->completions);
 }
 
+#ifdef CONFIG_CGROUP_WRITEBACK
+
+/*
+ * cgwb_lock protects bdi->cgwb_tree and blkcg->cgwb_list where the former
+ * is also RCU protected.  cgwb_shutdown_mutex synchronizes shutdown
+ * attempts from bdi and blkcg destructions.  For details, see
+ * cgwb_shutdown_prepare/commit().
+ */
+static DEFINE_SPINLOCK(cgwb_lock);
+static DEFINE_MUTEX(cgwb_shutdown_mutex);
+
+int __cgwb_create(struct backing_dev_info *bdi,
+                 struct cgroup_subsys_state *blkcg_css)
+{
+       struct blkcg *blkcg = css_to_blkcg(blkcg_css);
+       struct bdi_writeback *wb;
+       unsigned long flags;
+       int ret;
+
+       wb = kzalloc(sizeof(*wb), GFP_ATOMIC);
+       if (!wb)
+               return -ENOMEM;
+
+       ret = wb_init(wb, bdi, GFP_ATOMIC);
+       if (ret) {
+               kfree(wb);
+               return ret;
+       }
+
+       wb->blkcg_css = blkcg_css;
+       set_bit(WB_registered, &wb->state); /* cgwbs are always registered */
+
+       ret = -ENODEV;
+       spin_lock_irqsave(&cgwb_lock, flags);
+       /* the root wb determines the registered state of the whole bdi */
+       if (test_bit(WB_registered, &bdi->wb.state)) {
+               /* we might have raced w/ another instance of this function */
+               ret = radix_tree_insert(&bdi->cgwb_tree, blkcg_css->id, wb);
+               if (!ret)
+                       list_add_tail(&wb->blkcg_node, &blkcg->cgwb_list);
+       }
+       spin_unlock_irqrestore(&cgwb_lock, flags);
+       if (ret) {
+               wb_exit(wb);
+               kfree(wb);
+               if (ret != -EEXIST)
+                       return ret;
+       }
+       return 0;
+}
+
+/**
+ * cgwb_shutdown_prepare - prepare to shutdown a cgwb
+ * @wb: cgwb to be shutdown
+ * @to_shutdown: list to queue @wb on
+ *
+ * This function queues @wb for shutdown on @to_shutdown.  The cgwb
+ * indexes are protected by the cgwb_lock spinlock but wb_shutdown()
+ * needs process context, so this function is called while holding
+ * cgwb_lock and cgwb_shutdown_mutex to queue cgwbs for shutdown.  Once
+ * all target
+ * cgwbs are queued, the caller should release cgwb_lock and invoke
+ * cgwb_shutdown_commit().
+ */
+static void cgwb_shutdown_prepare(struct bdi_writeback *wb,
+                                 struct list_head *to_shutdown)
+{
+       lockdep_assert_held(&cgwb_lock);
+       lockdep_assert_held(&cgwb_shutdown_mutex);
+
+       WARN_ON(!test_bit(WB_registered, &wb->state));
+       clear_bit(WB_registered, &wb->state);
+       list_add_tail(&wb->shutdown_node, to_shutdown);
+}
+
+/**
+ * cgwb_shutdown_commit - commit cgwb shutdowns
+ * @to_shutdown: list of cgwbs to shutdown
+ *
+ * This function is called after @to_shutdown is built by calls to
+ * cgwb_shutdown_prepare() and cgwb_lock is released.  It invokes
+ * wb_shutdown() on all cgwbs on the list.  bdi and blkcg may try to
+ * shutdown the same cgwbs and should wait till completion if shutdown is
+ * initiated by the other.  This synchronization is achieved through
+ * cgwb_shutdown_mutex which should have been acquired before the
+ * cgwb_shutdown_prepare() invocations.
+ */
+static void cgwb_shutdown_commit(struct list_head *to_shutdown)
+{
+       struct bdi_writeback *wb;
+
+       lockdep_assert_held(&cgwb_shutdown_mutex);
+
+       list_for_each_entry(wb, to_shutdown, shutdown_node)
+               wb_shutdown(wb);
+}
+
+static void cgwb_exit(struct bdi_writeback *wb)
+{
+       WARN_ON(!radix_tree_delete(&wb->bdi->cgwb_tree, wb->blkcg_css->id));
+       list_del(&wb->blkcg_node);
+       wb_exit(wb);
+       kfree_rcu(wb, rcu);
+}
+
+static void cgwb_bdi_init(struct backing_dev_info *bdi)
+{
+       bdi->wb.blkcg_css = blkcg_root_css;
+       INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
+}
+
+/**
+ * cgwb_bdi_shutdown - @bdi is being shut down, shut down all cgwbs
+ * @bdi: bdi being shut down
+ */
+static void cgwb_bdi_shutdown(struct backing_dev_info *bdi)
+{
+       LIST_HEAD(to_shutdown);
+       struct radix_tree_iter iter;
+       void **slot;
+
+       WARN_ON(test_bit(WB_registered, &bdi->wb.state));
+
+       mutex_lock(&cgwb_shutdown_mutex);
+       spin_lock_irq(&cgwb_lock);
+
+       radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0)
+               cgwb_shutdown_prepare(*slot, &to_shutdown);
+
+       spin_unlock_irq(&cgwb_lock);
+       cgwb_shutdown_commit(&to_shutdown);
+       mutex_unlock(&cgwb_shutdown_mutex);
+}
+
+/**
+ * cgwb_bdi_exit - @bdi is being destroyed, exit all its cgwbs
+ * @bdi: bdi being destroyed
+ */
+static void cgwb_bdi_exit(struct backing_dev_info *bdi)
+{
+       LIST_HEAD(to_free);
+       struct radix_tree_iter iter;
+       void **slot;
+
+       spin_lock_irq(&cgwb_lock);
+       radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) {
+               struct bdi_writeback *wb = *slot;
+
+               WARN_ON(test_bit(WB_registered, &wb->state));
+               cgwb_exit(wb);
+       }
+       spin_unlock_irq(&cgwb_lock);
+}
+
+/**
+ * cgwb_blkcg_released - a blkcg is being destroyed, release all matching cgwbs
+ * @blkcg_css: blkcg being destroyed
+ */
+void cgwb_blkcg_released(struct cgroup_subsys_state *blkcg_css)
+{
+       LIST_HEAD(to_shutdown);
+       struct blkcg *blkcg = css_to_blkcg(blkcg_css);
+       struct bdi_writeback *wb, *next;
+
+       mutex_lock(&cgwb_shutdown_mutex);
+       spin_lock_irq(&cgwb_lock);
+
+       list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+               cgwb_shutdown_prepare(wb, &to_shutdown);
+
+       spin_unlock_irq(&cgwb_lock);
+       cgwb_shutdown_commit(&to_shutdown);
+       mutex_unlock(&cgwb_shutdown_mutex);
+
+       spin_lock_irq(&cgwb_lock);
+       list_for_each_entry_safe(wb, next, &blkcg->cgwb_list, blkcg_node)
+               cgwb_exit(wb);
+       spin_unlock_irq(&cgwb_lock);
+}
+
+#else  /* CONFIG_CGROUP_WRITEBACK */
+
+static void cgwb_bdi_init(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_shutdown(struct backing_dev_info *bdi) { }
+static void cgwb_bdi_exit(struct backing_dev_info *bdi) { }
+
+#endif /* CONFIG_CGROUP_WRITEBACK */
+
 int bdi_init(struct backing_dev_info *bdi)
 {
        int err;
@@ -455,6 +641,7 @@ int bdi_init(struct backing_dev_info *bdi)
        if (err)
                return err;
 
+       cgwb_bdi_init(bdi);
        return 0;
 }
 EXPORT_SYMBOL(bdi_init);
@@ -532,6 +719,7 @@ void bdi_unregister(struct backing_dev_info *bdi)
                        /* make sure nobody finds us on the bdi_list anymore */
                        bdi_remove_from_list(bdi);
                        wb_shutdown(&bdi->wb);
+                       cgwb_bdi_shutdown(bdi);
                }
 
                bdi_debug_unregister(bdi);
@@ -544,6 +732,7 @@ EXPORT_SYMBOL(bdi_unregister);
 void bdi_destroy(struct backing_dev_info *bdi)
 {
        bdi_unregister(bdi);
+       cgwb_bdi_exit(bdi);
        wb_exit(&bdi->wb);
 }
 EXPORT_SYMBOL(bdi_destroy);
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 72a0edf..6475504 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2102,8 +2102,8 @@ void account_page_dirtied(struct dirty_context *dctx)
 
        __inc_zone_page_state(page, NR_FILE_DIRTY);
        __inc_zone_page_state(page, NR_DIRTIED);
-       __inc_wb_stat(&mapping->backing_dev_info->wb, WB_RECLAIMABLE);
-       __inc_wb_stat(&mapping->backing_dev_info->wb, WB_DIRTIED);
+       __inc_wb_stat(dctx->wb, WB_RECLAIMABLE);
+       __inc_wb_stat(dctx->wb, WB_DIRTIED);
        task_io_account_write(PAGE_CACHE_SIZE);
        current->nr_dirtied++;
        this_cpu_inc(bdp_ratelimits);
-- 
2.1.0
