From: Robin Dong <san...@taobao.com>

We want to use blkio.cgroup on high-speed devices (like fusionio cards) for our
MySQL clusters. After testing different I/O schedulers, we found that cfq is too
slow and deadline does not support cgroups. So we developed a new I/O scheduler:
tpps (Tiny Parallel Proportion Scheduler). It dispatches requests using only each
group's individual weight and the total weight of all busy groups (the
proportion), so it is simple and efficient.
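For illustration, the idea is: on each dispatch round, every busy group may send
a share of the free slots in the device queue proportional to its cgroup weight.
Below is a minimal user-space sketch of that split, assuming the same grp_quota
formula that tpps_dispatch_requests() uses in this patch; the group names and
numbers are just the benchmark setup from this mail, not kernel code:

#include <stdio.h>

/* Per-cgroup state used by the sketch; mirrors the fields tpps reads. */
struct grp {
	const char *name;
	unsigned int weight;	/* blkio cgroup weight */
	int rq_in_driver;	/* requests this group already has in flight */
};

int main(void)
{
	/* the four test cgroups from the benchmark below */
	struct grp g[] = {
		{ "test1", 1000, 0 }, { "test2", 800, 0 },
		{ "test3",  600, 0 }, { "test4", 400, 0 },
	};
	int nr_requests = 512;	/* device queue depth (iodepth in the test) */
	int rq_in_driver = 0;	/* total requests in flight on the device */
	int quota = nr_requests - rq_in_driver;
	unsigned int total_weight = 0;
	int i;

	for (i = 0; i < 4; i++)
		total_weight += g[i].weight;
	for (i = 0; i < 4; i++) {
		/* same proportion as the scheduler: quota * weight / total */
		int grp_quota = quota * g[i].weight / total_weight
				- g[i].rq_in_driver;
		printf("%s may dispatch %d requests\n", g[i].name, grp_quota);
	}
	return 0;	/* prints 182, 146, 109, 73 for weights 1000:800:600:400 */
}

Because the split is recomputed from the live weights on every dispatch round,
each group sees IOPS roughly proportional to its weight, which matches the tpps
numbers in the table below.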
Test case: fusionio card, 4 cgroups, iodepth=512

  groupname    weight
  test1        1000
  test2        800
  test3        600
  test4        400

Use tpps, the result is:

  groupname    iops     avg-rt(ms)   max-rt(ms)
  test1        30220    16           54
  test2        28261    18           56
  test3        26333    19           69
  test4        20152    25           87

Use cfq, the result is:

  groupname    iops     avg-rt(ms)   max-rt(ms)
  test1        16478    30           242
  test2        13015    39           347
  test3        9300     54           371
  test4        5806     87           393

Signed-off-by: Robin Dong <san...@taobao.com>
Signed-off-by: Zhu Yanhai <gaoyang....@taobao.com>
Cc: Tejun Heo <t...@kernel.org>
Cc: Vivek Goyal <vgo...@redhat.com>
Cc: Jens Axboe <ax...@kernel.dk>
Cc: Tao Ma <taoma...@gmail.com>
---
 block/Kconfig.iosched  |   13 +
 block/Makefile         |    1 +
 block/tpps-iosched.c   | 1272 ++++++++++++++++++++++++++++++++++++++++++++++++
 include/linux/blkdev.h |    2 +-
 4 files changed, 1287 insertions(+), 1 deletions(-)
 create mode 100644 block/tpps-iosched.c

diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 421bef9..e5e28c2 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -21,6 +21,16 @@ config IOSCHED_DEADLINE
 	  a new point in the service tree and doing a batch of IO from there
 	  in case of expiry.
 
+config IOSCHED_TPPS
+	tristate "TPPS I/O scheduler"
+	# If BLK_CGROUP is a module, TPPS has to be built as module.
+	default y
+	---help---
+	  The TPPS I/O scheduler tries to distribute IOPS proportionally
+	  among all cgroups in the system. It should also provide a low
+	  latency working environment, suitable for flash-based devices.
+	  Note: If BLK_CGROUP=m, then TPPS can only be built as a module.
+
 config IOSCHED_CFQ
 	tristate "CFQ I/O scheduler"
 	default y
@@ -49,6 +59,9 @@ choice
 	config DEFAULT_DEADLINE
 		bool "Deadline" if IOSCHED_DEADLINE=y
 
+	config DEFAULT_TPPS
+		bool "Tiny Parallel Proportion" if IOSCHED_TPPS=y
+
 	config DEFAULT_CFQ
 		bool "CFQ" if IOSCHED_CFQ=y
 
diff --git a/block/Makefile b/block/Makefile
index 39b76ba..6e30ef4 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING)	+= blk-throttle.o
 obj-$(CONFIG_IOSCHED_NOOP)	+= noop-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
+obj-$(CONFIG_IOSCHED_TPPS)	+= tpps-iosched.o
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o
 
diff --git a/block/tpps-iosched.c b/block/tpps-iosched.c
new file mode 100644
index 0000000..981fde2
--- /dev/null
+++ b/block/tpps-iosched.c
@@ -0,0 +1,1272 @@
+/*
+ * TPPS, or Tiny Parallel Proportion disk Scheduler.
+ * + * Based on ideas from Zhu Yanhai <gaoyang....@taobao.com> + * + * Copyright (C) 2013 Robin Dong <san...@taobao.com> + */ +#include <linux/module.h> +#include <linux/blkdev.h> +#include <linux/elevator.h> +#include <linux/jiffies.h> +#include <linux/rbtree.h> +#include <linux/ioprio.h> +#include <linux/blktrace_api.h> +#include "blk-cgroup.h" +#include "blk.h" + +static struct kmem_cache *tpps_pool; + +struct tpps_queue { + /* reference count */ + int ref; + /* parent tpps_data */ + struct tpps_data *tppd; + /* tpps_group member */ + struct list_head tppg_node; + /* sorted list of pending requests */ + struct list_head sort_list; + struct tpps_group *tppg; + pid_t pid; + int online; + int rq_queued; +}; + +struct tppg_stats { + /* total bytes transferred */ + struct blkg_rwstat service_bytes; + /* total IOs serviced, post merge */ + struct blkg_rwstat serviced; + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ + struct blkg_rwstat service_time; + /* total time spent waiting in scheduler queue in ns */ + struct blkg_rwstat wait_time; + /* number of IOs queued up */ + struct blkg_rwstat queued; + /* total sectors transferred */ + struct blkg_stat sectors; + /* total disk time and nr sectors dispatched by this group */ + struct blkg_stat time; +}; + +struct tpps_group { + struct blkg_policy_data pd; + /* tpps_data member */ + struct list_head tppd_node; + struct list_head *cur_dispatcher; + + unsigned int weight; + unsigned int new_weight; + unsigned int dev_weight; + unsigned int leaf_weight; + unsigned int new_leaf_weight; + unsigned int dev_leaf_weight; + + bool needs_update; + + /* + * lists of queues with requests. + */ + struct list_head queue_list; + int nr_tppq; + int rq_queued; + int rq_in_driver; + + struct tppg_stats stats; /* stats for this tppg */ + struct tppg_stats dead_stats; /* stats pushed from dead children */ +}; + +struct tpps_io_cq { + struct io_cq icq; /* must be the first member */ + struct tpps_queue *tppq; + uint64_t blkcg_id; /* the current blkcg ID */ +}; + +struct tpps_data { + struct request_queue *queue; + struct tpps_group *root_group; + + /* List of tpps groups being managed on this device*/ + struct list_head group_list; + + unsigned int busy_queues; + int dispatched; + int rq_in_driver; + + struct work_struct unplug_work; + + /* Number of groups which are on blkcg->blkg_list */ + unsigned int nr_blkcg_linked_grps; + + unsigned total_weight; +}; + +static inline struct blkcg_gq *tppg_to_blkg(struct tpps_group *tppg) +{ + return pd_to_blkg(&tppg->pd); +} + +#define tpps_log_tppq(tppd, tppq, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(tppg_to_blkg((tppq)->tppg), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((tppd)->queue, "tpps%d %s " fmt, (tppq)->pid, \ + __pbuf, ##args); \ +} while (0) + +#define tpps_log_tppg(tppd, tppg, fmt, args...) do { \ + char __pbuf[128]; \ + \ + blkg_path(tppg_to_blkg(tppg), __pbuf, sizeof(__pbuf)); \ + blk_add_trace_msg((tppd)->queue, "%s " fmt, __pbuf, ##args); \ +} while (0) +#define tpps_log(tppd, fmt, args...) 
\ + blk_add_trace_msg((tppd)->queue, "tpps " fmt, ##args) + +static inline struct tpps_io_cq *icq_to_tic(struct io_cq *icq) +{ + /* tic->icq is the first member, %NULL will convert to %NULL */ + return container_of(icq, struct tpps_io_cq, icq); +} + +#define RQ_TIC(rq) icq_to_tic((rq)->elv.icq) +#define RQ_TPPQ(rq) (struct tpps_queue *) ((rq)->elv.priv[0]) +#define RQ_TPPG(rq) (struct tpps_group *) ((rq)->elv.priv[1]) + +#define TPPS_WEIGHT_DEFAULT (500) +#define MIN_DISPATCH_RQ (8) + +static struct blkcg_policy blkcg_policy_tpps; + +static inline struct tpps_group *pd_to_tppg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct tpps_group, pd) : NULL; +} + +static inline struct tpps_group *blkg_to_tppg(struct blkcg_gq *blkg) +{ + return pd_to_tppg(blkg_to_pd(blkg, &blkcg_policy_tpps)); +} + +static inline struct tpps_io_cq * +tpps_tic_lookup(struct tpps_data *tppd, struct io_context *ioc) +{ + if (ioc) + return icq_to_tic(ioc_lookup_icq(ioc, tppd->queue)); + return NULL; +} + +static inline struct tpps_queue *tic_to_tppq(struct tpps_io_cq *tic) +{ + return tic->tppq; +} + +static inline void tic_set_tppq(struct tpps_io_cq *tic, struct tpps_queue *tppq) +{ + tic->tppq = tppq; +} + +static inline struct tpps_data *tic_to_tppd(struct tpps_io_cq *tic) +{ + return tic->icq.q->elevator->elevator_data; +} + +static inline void tppg_get(struct tpps_group *tppg) +{ + return blkg_get(tppg_to_blkg(tppg)); +} + +static inline void tppg_put(struct tpps_group *tppg) +{ + return blkg_put(tppg_to_blkg(tppg)); +} + +static inline void tppg_stats_update_io_add(struct tpps_group *tppg, + struct tpps_group *curr_tppg, int rw) +{ + blkg_rwstat_add(&tppg->stats.queued, rw, 1); +} + +static inline void tppg_stats_update_io_remove(struct tpps_group *tppg, int rw) +{ + blkg_rwstat_add(&tppg->stats.queued, rw, -1); +} + +static inline void tppg_stats_update_io_merged(struct tpps_group *tppg, int rw) +{ + blkg_rwstat_add(&tppg->stats.merged, rw, 1); +} + +static inline void tppg_stats_update_dispatch(struct tpps_group *tppg, + uint64_t bytes, int rw) +{ + blkg_stat_add(&tppg->stats.sectors, bytes >> 9); + blkg_rwstat_add(&tppg->stats.serviced, rw, 1); + blkg_rwstat_add(&tppg->stats.service_bytes, rw, bytes); +} + +static inline void tppg_stats_update_completion(struct tpps_group *tppg, + uint64_t start_time, uint64_t io_start_time, int rw) +{ + struct tppg_stats *stats = &tppg->stats; + unsigned long long now = sched_clock(); + + if (time_after64(now, io_start_time)) + blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); + if (time_after64(io_start_time, start_time)) + blkg_rwstat_add(&stats->wait_time, rw, + io_start_time - start_time); +} + +static void tpps_del_queue(struct tpps_queue *tppq) +{ + struct tpps_data *tppd = tppq->tppd; + struct tpps_group *tppg = tppq->tppg; + + if (!list_empty(&tppq->tppg_node)) { + list_del_init(&tppq->tppg_node); + tpps_log_tppq(tppd, tppq, "del queue\n"); + tppg->cur_dispatcher = NULL; + tppq->tppg = NULL; + } + + printk("%p nr_tppq:%d\n", tppg, tppg->nr_tppq); + BUG_ON(tppg->nr_tppq < 1); + tppg->nr_tppq--; + if (!tppg->nr_tppq) + tppd->total_weight -= tppg->pd.blkg->blkcg->cfq_weight; + + BUG_ON(!tppd->busy_queues); + tppd->busy_queues--; +} + +/* + * task holds one reference to the queue, dropped when task exits. each rq + * in-flight on this queue also holds a reference, dropped when rq is freed. + * + * Each tpps queue took a reference on the parent group. Drop it now. + * queue lock must be held here. 
+ */ +static void tpps_put_queue(struct tpps_queue *tppq) +{ + struct tpps_data *tppd = tppq->tppd; + struct tpps_group *tppg; + + BUG_ON(tppq->ref <= 0); + + tppq->ref--; + if (tppq->ref) + return; + + tpps_log_tppq(tppd, tppq, "put_queue"); + BUG_ON(!list_empty(&tppq->sort_list)); + tppg = tppq->tppg; + + tpps_del_queue(tppq); + kmem_cache_free(tpps_pool, tppq); + tppg_put(tppg); +} + +static void tpps_init_tppq(struct tpps_data *tppd, struct tpps_queue *tppq, + pid_t pid) +{ + INIT_LIST_HEAD(&tppq->tppg_node); + INIT_LIST_HEAD(&tppq->sort_list); + + tppq->ref = 0; + tppq->tppd = tppd; + tppq->pid = pid; + +} + +static void tpps_link_tppq_tppg(struct tpps_queue *tppq, + struct tpps_group *tppg) +{ + tppq->tppg = tppg; + /* tppq reference on tppg */ + tppg_get(tppg); +} + +static struct tpps_group *tpps_lookup_create_tppg(struct tpps_data *tppd, + struct blkcg *blkcg) +{ + struct request_queue *q = tppd->queue; + struct tpps_group *tppg = NULL; + + /* avoid lookup for the common case where there's no blkcg */ + if (blkcg == &blkcg_root) { + tppg = tppd->root_group; + } else { + struct blkcg_gq *blkg; + + blkg = blkg_lookup_create(blkcg, q); + if (!IS_ERR(blkg)) + tppg = blkg_to_tppg(blkg); + } + + return tppg; +} + +static struct tpps_queue * +tpps_find_alloc_queue(struct tpps_data *tppd, struct tpps_io_cq* tic, struct bio *bio, + gfp_t gfp_mask) +{ + struct tpps_queue *tppq, *new_tppq = NULL; + struct tpps_group *tppg; + struct blkcg *blkcg; + +retry: + rcu_read_lock(); + + blkcg = bio_blkcg(bio); + tppg = tpps_lookup_create_tppg(tppd, blkcg); + tppq = tic_to_tppq(tic); + + if (!tppq) { + if (new_tppq) { + tppq = new_tppq; + new_tppq = NULL; + } else if (gfp_mask & __GFP_WAIT) { + rcu_read_unlock(); + spin_unlock_irq(tppd->queue->queue_lock); + new_tppq = kmem_cache_alloc_node(tpps_pool, + gfp_mask | __GFP_ZERO, + tppd->queue->node); + spin_lock_irq(tppd->queue->queue_lock); + if (new_tppq) + goto retry; + } else + tppq = kmem_cache_alloc_node(tpps_pool, + gfp_mask | __GFP_ZERO, + tppd->queue->node); + + if (tppq) { + tpps_init_tppq(tppd, tppq, current->pid); + tpps_link_tppq_tppg(tppq, tppg); + tpps_log_tppq(tppd, tppq, "alloced"); + } + } + + if (new_tppq) + kmem_cache_free(tpps_pool, new_tppq); + + rcu_read_unlock(); + return tppq; +} + +static struct tpps_queue * +tpps_get_queue(struct tpps_data *tppd, struct tpps_io_cq *tic, struct bio *bio, + gfp_t gfp_mask) +{ + struct tpps_queue *tppq; + + tppq = tpps_find_alloc_queue(tppd, tic, bio, gfp_mask); + tppq->ref++; + return tppq; +} + +/* + * scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing + */ +static inline void tpps_schedule_dispatch(struct tpps_data *tppd) +{ + if (tppd->busy_queues) { + tpps_log(tppd, "schedule dispatch"); + kblockd_schedule_work(tppd->queue, &tppd->unplug_work); + } +} + +static void check_blkcg_changed(struct tpps_io_cq *tic, struct bio *bio) +{ + struct tpps_data *tppd = tic_to_tppd(tic); + struct tpps_queue *tppq; + uint64_t id; + + rcu_read_lock(); + id = bio_blkcg(bio)->id; + rcu_read_unlock(); + + /* + * Check whether blkcg has changed. The condition may trigger + * spuriously on a newly created tic but there's no harm. + */ + if (unlikely(!tppd) || likely(tic->blkcg_id == id)) + return; + + tppq = tic_to_tppq(tic); + if (tppq) { + /* + * Drop reference to sync queue. A new sync queue will be + * assigned in new group upon arrival of a fresh request. 
+ */ + tpps_log_tppq(tppd, tppq, "changed cgroup"); + tic_set_tppq(tic, NULL); + tpps_put_queue(tppq); + } + + tic->blkcg_id = id; +} + +static int +tpps_set_request(struct request_queue *q, struct request *rq, struct bio *bio, + gfp_t gfp_mask) +{ + struct tpps_data *tppd = q->elevator->elevator_data; + struct tpps_io_cq *tic = icq_to_tic(rq->elv.icq); + struct tpps_queue *tppq; + + might_sleep_if(gfp_mask & __GFP_WAIT); + + spin_lock_irq(q->queue_lock); + + check_blkcg_changed(tic, bio); + + tppq = tic_to_tppq(tic); + if (!tppq) { + tppq = tpps_get_queue(tppd, tic, bio, gfp_mask); + tic_set_tppq(tic, tppq); + } + + tppq->ref++; + tppg_get(tppq->tppg); + rq->elv.priv[0] = tppq; + rq->elv.priv[1] = tppq->tppg; + spin_unlock_irq(q->queue_lock); + return 0; +} + +/* + * queue lock held here + */ +static void tpps_put_request(struct request *rq) +{ + struct tpps_queue *tppq = RQ_TPPQ(rq); + + if (tppq) { + WARN_ON(tppq->tppg != RQ_TPPG(rq)); + + /* Put down rq reference on cfqg */ + tppg_put(RQ_TPPG(rq)); + rq->elv.priv[0] = NULL; + rq->elv.priv[1] = NULL; + + tpps_put_queue(tppq); + } +} + +static void +tpps_update_group_weight(struct tpps_group *tppg) +{ + if (tppg->needs_update) { + tppg->weight = tppg->new_weight; + tppg->needs_update = false; + } +} + +static void tpps_add_queue(struct tpps_data *tppd, struct tpps_queue *tppq) +{ + struct tpps_group *tppg; + + if (!tppq->online) { + tppq->online = 1; + tppg = tppq->tppg; + tpps_log_tppq(tppd, tppq, "add queue"); + tppg->nr_tppq++; + tppd->busy_queues++; + list_add(&tppq->tppg_node, &tppg->queue_list); + printk("add tppq %p to %p\n", tppq, tppg); + tpps_update_group_weight(tppg); + if (tppg->nr_tppq <= 1) { + tppd->total_weight += tppg->pd.blkg->blkcg->cfq_weight; + list_add(&tppg->tppd_node, &tppd->group_list); + printk("twt:%u, wt:%u %u %d %p\n", tppd->total_weight, tppg->weight, + tppg->pd.blkg->blkcg->cfq_weight, + tppg->nr_tppq, + tppg); + } + } +} + +static void tpps_insert_request(struct request_queue *q, struct request *rq) +{ + struct tpps_data *tppd = q->elevator->elevator_data; + struct tpps_queue *tppq = RQ_TPPQ(rq); + + tpps_log_tppq(tppd, tppq, "insert_request"); + + list_add_tail(&rq->queuelist, &tppq->sort_list); + tppq->rq_queued++; + tppq->tppg->rq_queued++; + tppd->dispatched++; + tpps_add_queue(tppd, tppq); + tppg_stats_update_io_add(RQ_TPPG(rq), tppq->tppg, rq->cmd_flags); +} + +static void tpps_remove_request(struct request *rq) +{ + struct tpps_queue *tppq = RQ_TPPQ(rq); + + list_del_init(&rq->queuelist); + tppq->rq_queued--; + tppq->tppg->rq_queued--; + tppg_stats_update_io_remove(RQ_TPPG(rq), rq->cmd_flags); +} + +/* + * Move request from internal lists to the request queue dispatch list. 
+ */ +static int tpps_dispatch_insert(struct request_queue *q, + struct tpps_queue *tppq) +{ + struct list_head *rbnext = tppq->sort_list.next; + struct request *rq; + + if (rbnext == &tppq->sort_list) + return 0; + + rq = rq_entry_fifo(rbnext); + tpps_remove_request(rq); + elv_dispatch_sort(q, rq); + tppg_stats_update_dispatch(tppq->tppg, blk_rq_bytes(rq), rq->cmd_flags); + return 1; +} + +static int tpps_dispatch_requests_nr(struct tpps_data *tppd, + struct tpps_queue *tppq, int count) +{ + int cnt = 0, ret; + + if (!tppq->rq_queued) + return cnt; + + do { + ret = tpps_dispatch_insert(tppd->queue, tppq); + if (ret) { + cnt++; + tppd->dispatched--; + } + } while (ret && cnt < count); + + return cnt; +} + +static int tpps_dispatch_requests(struct request_queue *q, int force) +{ + struct tpps_data *tppd = q->elevator->elevator_data; + struct tpps_group *tppg, *group_n; + struct tpps_queue *tppq; + struct list_head *next; + int count = 0, total = 0, ret; + int quota, grp_quota; + + if (!tppd->total_weight) + return 0; + + quota = q->nr_requests - tppd->rq_in_driver; + if (quota < MIN_DISPATCH_RQ && !force) + return 0; + + list_for_each_entry_safe(tppg, group_n, &tppd->group_list, tppd_node) { + if (!tppg->nr_tppq) + continue; + grp_quota = (quota * tppg->pd.blkg->blkcg->cfq_weight + / tppd->total_weight) - tppg->rq_in_driver; + tpps_log_tppg(tppd, tppg, + "nr:%d, wt:%u total_wt:%u in_driver:%d %d quota:%d grp_quota:%d", + tppg->nr_tppq, tppg->pd.blkg->blkcg->cfq_weight, + tppd->total_weight, tppg->rq_in_driver, tppg->rq_queued, + quota, grp_quota); + if (grp_quota <= 0 && !force) + continue; + BUG_ON(tppg->queue_list.next == &tppg->queue_list); + if (!tppg->cur_dispatcher) + tppg->cur_dispatcher = tppg->queue_list.next; + next = tppg->cur_dispatcher; + count = 0; + do { + tppq = list_entry(next, struct tpps_queue, tppg_node); + tpps_log_tppq(tppd, tppq, "tppq: %d\n", tppq->rq_queued); + if (force) + ret = tpps_dispatch_requests_nr(tppd, tppq, -1); + else + ret = tpps_dispatch_requests_nr(tppd, tppq, 1); + count += ret; + total += ret; + next = next->next; + if (next == &tppg->queue_list) + next = tppg->queue_list.next; + if (count >= grp_quota && !force) { + tppg->cur_dispatcher = next; + break; + } + BUG_ON(tppg->cur_dispatcher == &tppg->queue_list); + } while (next != tppg->cur_dispatcher); + } + return total > 0; +} + +static void tpps_kick_queue(struct work_struct *work) +{ + struct tpps_data *tppd = + container_of(work, struct tpps_data, unplug_work); + struct request_queue *q = tppd->queue; + + spin_lock_irq(q->queue_lock); + __blk_run_queue(q); + spin_unlock_irq(q->queue_lock); +} + +static void tpps_init_tppg_base(struct tpps_group *tppg) +{ + INIT_LIST_HEAD(&tppg->tppd_node); + INIT_LIST_HEAD(&tppg->queue_list); + tppg->cur_dispatcher = NULL; + +} + +static int tpps_init_queue(struct request_queue *q) +{ + struct tpps_data *tppd; + struct tpps_group *tppg; + int ret; + + tppd = kmalloc_node(sizeof(*tppd), GFP_KERNEL | __GFP_ZERO, q->node); + if (!tppd) + return -ENOMEM; + + tppd->queue = q; + q->elevator->elevator_data = tppd; + + INIT_LIST_HEAD(&tppd->group_list); + + ret = blkcg_activate_policy(q, &blkcg_policy_tpps); + if (ret) + goto out_free; + + /* Init root group */ + tppd->root_group = blkg_to_tppg(q->root_blkg); + tppg = tppd->root_group; + tpps_init_tppg_base(tppg); + + /* Give preference to root group over other groups */ + tppg->weight = 2 * TPPS_WEIGHT_DEFAULT; + tppg->leaf_weight = 2 * TPPS_WEIGHT_DEFAULT; + + INIT_WORK(&tppd->unplug_work, tpps_kick_queue); + + return 
0; + +out_free: + kfree(tppd); + return ret; +} + +static void tpps_exit_queue(struct elevator_queue *e) +{ + struct tpps_data *tppd = e->elevator_data; + struct request_queue *q = tppd->queue; + + cancel_work_sync(&tppd->unplug_work); + + blkcg_deactivate_policy(q, &blkcg_policy_tpps); + kfree(tppd->root_group); + kfree(tppd); +} + +static void tpps_activate_request(struct request_queue *q, struct request *rq) +{ + struct tpps_queue *tppq = RQ_TPPQ(rq); + struct tpps_data *tppd = q->elevator->elevator_data; + tppd->rq_in_driver++; + tppq->tppg->rq_in_driver++; + tpps_log_tppq(tppd, RQ_TPPQ(rq), "activate rq, drv=%d", + tppd->rq_in_driver); +} + +static void tpps_deactivate_request(struct request_queue *q, struct request *rq) +{ + struct tpps_queue *tppq = RQ_TPPQ(rq); + struct tpps_data *tppd = q->elevator->elevator_data; + + WARN_ON(!tppd->rq_in_driver); + tppd->rq_in_driver--; + tppq->tppg->rq_in_driver--; + tpps_log_tppq(tppd, RQ_TPPQ(rq), "deactivate rq, drv=%d", + tppd->rq_in_driver); +} + +static void tpps_completed_request(struct request_queue *q, struct request *rq) +{ + struct tpps_queue *tppq = RQ_TPPQ(rq); + struct tpps_data *tppd = tppq->tppd; + + WARN_ON(!tppq); + WARN_ON(tppq->tppg != RQ_TPPG(rq)); + + tpps_log_tppq(tppd, tppq, "complete rqnoidle %d", + !!(rq->cmd_flags & REQ_NOIDLE)); + WARN_ON(!tppd->rq_in_driver); + tppd->rq_in_driver--; + tppq->tppg->rq_in_driver--; + tppg_stats_update_completion(tppq->tppg, + rq_start_time_ns(rq), rq_io_start_time_ns(rq), rq->cmd_flags); + + if (!tppd->rq_in_driver) + tpps_schedule_dispatch(tppd); +} + +static void +tpps_merged_request(struct request_queue *q, struct request *rq, int type) +{ + if (type == ELEVATOR_FRONT_MERGE) { + struct tpps_queue *tppq = RQ_TPPQ(rq); + list_del_init(&rq->queuelist); + tppq->rq_queued--; + tppg_stats_update_io_remove(RQ_TPPG(rq), rq->cmd_flags); + list_add_tail(&rq->queuelist, &tppq->sort_list); + tppq->rq_queued++; + tppg_stats_update_io_add(RQ_TPPG(rq), tppq->tppg, rq->cmd_flags); + } +} + +static void +tpps_merged_requests(struct request_queue *q, struct request *rq, + struct request *next) +{ + tpps_remove_request(next); + tppg_stats_update_io_merged(RQ_TPPG(rq), rq->cmd_flags); +} + +static void tpps_init_icq(struct io_cq *icq) +{ } + +static void tpps_exit_icq(struct io_cq *icq) +{ + struct tpps_io_cq *tic = icq_to_tic(icq); + + if (tic->tppq) { + tpps_put_queue(tic->tppq); + tic->tppq = NULL; + } +} + +static struct elevator_type iosched_tpps = { + .ops = { + .elevator_merged_fn = tpps_merged_request, + .elevator_merge_req_fn = tpps_merged_requests, + .elevator_dispatch_fn = tpps_dispatch_requests, + .elevator_add_req_fn = tpps_insert_request, + .elevator_activate_req_fn = tpps_activate_request, + .elevator_deactivate_req_fn = tpps_deactivate_request, + .elevator_completed_req_fn = tpps_completed_request, + .elevator_init_icq_fn = tpps_init_icq, + .elevator_exit_icq_fn = tpps_exit_icq, + .elevator_set_req_fn = tpps_set_request, + .elevator_put_req_fn = tpps_put_request, + .elevator_init_fn = tpps_init_queue, + .elevator_exit_fn = tpps_exit_queue, + }, + .icq_size = sizeof(struct tpps_io_cq), + .icq_align = __alignof__(struct tpps_io_cq), + .elevator_name = "tpps", + .elevator_owner = THIS_MODULE, +}; + +static u64 tppg_prfill_weight_device(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct tpps_group *tppg = pd_to_tppg(pd); + + if (!tppg->dev_weight) + return 0; + return __blkg_prfill_u64(sf, pd, tppg->dev_weight); +} + +static int tppg_print_weight_device(struct cgroup 
*cgrp, struct cftype *cft, + struct seq_file *sf) +{ + blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), + tppg_prfill_weight_device, &blkcg_policy_tpps, 0, + false); + return 0; +} + +static u64 tppg_prfill_leaf_weight_device(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct tpps_group *tppg = pd_to_tppg(pd); + + if (!tppg->dev_leaf_weight) + return 0; + return __blkg_prfill_u64(sf, pd, tppg->dev_leaf_weight); +} + +static int tppg_print_leaf_weight_device(struct cgroup *cgrp, + struct cftype *cft, + struct seq_file *sf) +{ + blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), + tppg_prfill_leaf_weight_device, &blkcg_policy_tpps, 0, + false); + return 0; +} + +static int tppg_print_weight(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) +{ + seq_printf(sf, "%u\n", cgroup_to_blkcg(cgrp)->cfq_weight); + return 0; +} + +static int tppg_print_leaf_weight(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) +{ + seq_printf(sf, "%u\n", + cgroup_to_blkcg(cgrp)->cfq_leaf_weight); + return 0; +} + +static int __tppg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, + const char *buf, bool is_leaf_weight) +{ + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkg_conf_ctx ctx; + struct tpps_group *tppg; + int ret; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_tpps, buf, &ctx); + if (ret) + return ret; + + ret = -EINVAL; + tppg = blkg_to_tppg(ctx.blkg); + if (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN && ctx.v <= CFQ_WEIGHT_MAX)) { + if (!is_leaf_weight) { + tppg->dev_weight = ctx.v; + tppg->new_weight = ctx.v ?: blkcg->cfq_weight; + } else { + tppg->dev_leaf_weight = ctx.v; + tppg->new_leaf_weight = ctx.v ?: blkcg->cfq_leaf_weight; + } + ret = 0; + } + + blkg_conf_finish(&ctx); + return ret; +} + +static int tppg_set_weight_device(struct cgroup *cgrp, struct cftype *cft, + const char *buf) +{ + return __tppg_set_weight_device(cgrp, cft, buf, false); +} + +static int tppg_set_leaf_weight_device(struct cgroup *cgrp, struct cftype *cft, + const char *buf) +{ + return __tppg_set_weight_device(cgrp, cft, buf, true); +} + +static int __tpps_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val, + bool is_leaf_weight) +{ + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + struct blkcg_gq *blkg; + + if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX) + return -EINVAL; + + spin_lock_irq(&blkcg->lock); + + if (!is_leaf_weight) + blkcg->cfq_weight = val; + else + blkcg->cfq_leaf_weight = val; + + hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + struct tpps_group *tppg = blkg_to_tppg(blkg); + + if (!tppg) + continue; + + if (!is_leaf_weight) { + if (!tppg->dev_weight) + tppg->new_weight = blkcg->cfq_weight; + } else { + if (!tppg->dev_leaf_weight) + tppg->new_leaf_weight = blkcg->cfq_leaf_weight; + } + } + + spin_unlock_irq(&blkcg->lock); + return 0; +} + +static int tpps_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + return __tpps_set_weight(cgrp, cft, val, false); +} + +static int tpps_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val) +{ + return __tpps_set_weight(cgrp, cft, val, true); +} + +/* offset delta from tppg->stats to tppg->dead_stats */ +static const int dead_stats_off_delta = offsetof(struct tpps_group, dead_stats) - + offsetof(struct tpps_group, stats); + +/* to be used by recursive prfill, sums live and dead rwstats recursively */ +static struct blkg_rwstat tppg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat a, b; + + a = blkg_rwstat_recursive_sum(pd, off); + b = 
blkg_rwstat_recursive_sum(pd, off + dead_stats_off_delta); + blkg_rwstat_merge(&a, &b); + return a; +} + +/* to be used by recursive prfill, sums live and dead stats recursively */ +static u64 tppg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) +{ + u64 sum = 0; + + sum += blkg_stat_recursive_sum(pd, off); + sum += blkg_stat_recursive_sum(pd, off + dead_stats_off_delta); + return sum; +} + +static int tppg_print_stat(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) +{ + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + + blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkcg_policy_tpps, + cft->private, false); + return 0; +} + +static int tppg_print_rwstat(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) +{ + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + + blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkcg_policy_tpps, + cft->private, true); + return 0; +} + +static u64 tppg_prfill_stat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + u64 sum = tppg_stat_pd_recursive_sum(pd, off); + + return __blkg_prfill_u64(sf, pd, sum); +} + +static u64 tppg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat sum = tppg_rwstat_pd_recursive_sum(pd, off); + + return __blkg_prfill_rwstat(sf, pd, &sum); +} + +static int tppg_print_stat_recursive(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) +{ + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + + blkcg_print_blkgs(sf, blkcg, tppg_prfill_stat_recursive, + &blkcg_policy_tpps, cft->private, false); + return 0; +} + +static int tppg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft, + struct seq_file *sf) +{ + struct blkcg *blkcg = cgroup_to_blkcg(cgrp); + + blkcg_print_blkgs(sf, blkcg, tppg_prfill_rwstat_recursive, + &blkcg_policy_tpps, cft->private, true); + return 0; +} + +static struct cftype tpps_blkcg_files[] = { + /* on root, weight is mapped to leaf_weight */ + { + .name = "tpps.weight_device", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_seq_string = tppg_print_leaf_weight_device, + .write_string = tppg_set_leaf_weight_device, + .max_write_len = 256, + }, + { + .name = "tpps.weight", + .flags = CFTYPE_ONLY_ON_ROOT, + .read_seq_string = tppg_print_leaf_weight, + .write_u64 = tpps_set_leaf_weight, + }, + + /* no such mapping necessary for !roots */ + { + .name = "tpps.weight_device", + .flags = CFTYPE_NOT_ON_ROOT, + .read_seq_string = tppg_print_weight_device, + .write_string = tppg_set_weight_device, + .max_write_len = 256, + }, + { + .name = "tpps.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .read_seq_string = tppg_print_weight, + .write_u64 = tpps_set_weight, + }, + + { + .name = "tpps.leaf_weight_device", + .read_seq_string = tppg_print_leaf_weight_device, + .write_string = tppg_set_leaf_weight_device, + .max_write_len = 256, + }, + { + .name = "tpps.leaf_weight", + .read_seq_string = tppg_print_leaf_weight, + .write_u64 = tpps_set_leaf_weight, + }, + + /* statistics, covers only the tasks in the tppg */ + { + .name = "tpps.time", + .private = offsetof(struct tpps_group, stats.time), + .read_seq_string = tppg_print_stat, + }, + { + .name = "tpps.sectors", + .private = offsetof(struct tpps_group, stats.sectors), + .read_seq_string = tppg_print_stat, + }, + { + .name = "tpps.io_service_bytes", + .private = offsetof(struct tpps_group, stats.service_bytes), + .read_seq_string = tppg_print_rwstat, + }, + { + .name = "tpps.io_serviced", + .private = offsetof(struct tpps_group, stats.serviced), + 
.read_seq_string = tppg_print_rwstat, + }, + { + .name = "tpps.io_service_time", + .private = offsetof(struct tpps_group, stats.service_time), + .read_seq_string = tppg_print_rwstat, + }, + { + .name = "tpps.io_wait_time", + .private = offsetof(struct tpps_group, stats.wait_time), + .read_seq_string = tppg_print_rwstat, + }, + { + .name = "tpps.io_merged", + .private = offsetof(struct tpps_group, stats.merged), + .read_seq_string = tppg_print_rwstat, + }, + { + .name = "tpps.io_queued", + .private = offsetof(struct tpps_group, stats.queued), + .read_seq_string = tppg_print_rwstat, + }, + + /* the same statictics which cover the tppg and its descendants */ + { + .name = "tpps.time_recursive", + .private = offsetof(struct tpps_group, stats.time), + .read_seq_string = tppg_print_stat_recursive, + }, + { + .name = "tpps.sectors_recursive", + .private = offsetof(struct tpps_group, stats.sectors), + .read_seq_string = tppg_print_stat_recursive, + }, + { + .name = "tpps.io_service_bytes_recursive", + .private = offsetof(struct tpps_group, stats.service_bytes), + .read_seq_string = tppg_print_rwstat_recursive, + }, + { + .name = "tpps.io_serviced_recursive", + .private = offsetof(struct tpps_group, stats.serviced), + .read_seq_string = tppg_print_rwstat_recursive, + }, + { + .name = "tpps.io_service_time_recursive", + .private = offsetof(struct tpps_group, stats.service_time), + .read_seq_string = tppg_print_rwstat_recursive, + }, + { + .name = "tpps.io_wait_time_recursive", + .private = offsetof(struct tpps_group, stats.wait_time), + .read_seq_string = tppg_print_rwstat_recursive, + }, + { + .name = "tpps.io_merged_recursive", + .private = offsetof(struct tpps_group, stats.merged), + .read_seq_string = tppg_print_rwstat_recursive, + }, + { + .name = "tpps.io_queued_recursive", + .private = offsetof(struct tpps_group, stats.queued), + .read_seq_string = tppg_print_rwstat_recursive, + }, + { } /* terminate */ +}; + +static void tpps_pd_init(struct blkcg_gq *blkg) +{ + struct tpps_group *tppg = blkg_to_tppg(blkg); + + tpps_init_tppg_base(tppg); + tppg->weight = blkg->blkcg->cfq_weight; + tppg->leaf_weight = blkg->blkcg->cfq_leaf_weight; +} + +static inline struct tpps_group *tppg_parent(struct tpps_group *tppg) +{ + struct blkcg_gq *pblkg = tppg_to_blkg(tppg)->parent; + + return pblkg ? 
blkg_to_tppg(pblkg) : NULL; +} + +static void tppg_stats_reset(struct tppg_stats *stats) +{ + /* queued stats shouldn't be cleared */ + blkg_rwstat_reset(&stats->service_bytes); + blkg_rwstat_reset(&stats->serviced); + blkg_rwstat_reset(&stats->merged); + blkg_rwstat_reset(&stats->service_time); + blkg_rwstat_reset(&stats->wait_time); + blkg_stat_reset(&stats->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_reset(&stats->unaccounted_time); + blkg_stat_reset(&stats->avg_queue_size_sum); + blkg_stat_reset(&stats->avg_queue_size_samples); + blkg_stat_reset(&stats->dequeue); + blkg_stat_reset(&stats->group_wait_time); + blkg_stat_reset(&stats->idle_time); + blkg_stat_reset(&stats->empty_time); +#endif +} + +/* @to += @from */ +static void tppg_stats_merge(struct tppg_stats *to, struct tppg_stats *from) +{ + /* queued stats shouldn't be cleared */ + blkg_rwstat_merge(&to->service_bytes, &from->service_bytes); + blkg_rwstat_merge(&to->serviced, &from->serviced); + blkg_rwstat_merge(&to->merged, &from->merged); + blkg_rwstat_merge(&to->service_time, &from->service_time); + blkg_rwstat_merge(&to->wait_time, &from->wait_time); + blkg_stat_merge(&from->time, &from->time); +#ifdef CONFIG_DEBUG_BLK_CGROUP + blkg_stat_merge(&to->unaccounted_time, &from->unaccounted_time); + blkg_stat_merge(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + blkg_stat_merge(&to->avg_queue_size_samples, &from->avg_queue_size_samples); + blkg_stat_merge(&to->dequeue, &from->dequeue); + blkg_stat_merge(&to->group_wait_time, &from->group_wait_time); + blkg_stat_merge(&to->idle_time, &from->idle_time); + blkg_stat_merge(&to->empty_time, &from->empty_time); +#endif +} + +static void tppg_stats_xfer_dead(struct tpps_group *tppg) +{ + struct tpps_group *parent = tppg_parent(tppg); + + lockdep_assert_held(tppg_to_blkg(tppg)->q->queue_lock); + + if (unlikely(!parent)) + return; + + tppg_stats_merge(&parent->dead_stats, &tppg->stats); + tppg_stats_merge(&parent->dead_stats, &tppg->dead_stats); + tppg_stats_reset(&tppg->stats); + tppg_stats_reset(&tppg->dead_stats); +} + +static void tpps_pd_offline(struct blkcg_gq *blkg) +{ + struct tpps_group *tppg = blkg_to_tppg(blkg); + /* + * @blkg is going offline and will be ignored by + * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so + * that they don't get lost. If IOs complete after this point, the + * stats for them will be lost. Oh well... 
+	 */
+	tppg_stats_xfer_dead(tppg);
+
+	if (!list_empty(&tppg->tppd_node))
+		list_del_init(&tppg->tppd_node);
+
+	/* BUG_ON(!list_empty(&(tppg->queue_list))); */
+}
+
+static void tpps_pd_reset_stats(struct blkcg_gq *blkg)
+{
+	struct tpps_group *tppg = blkg_to_tppg(blkg);
+
+	tppg_stats_reset(&tppg->stats);
+	tppg_stats_reset(&tppg->dead_stats);
+}
+
+static struct blkcg_policy blkcg_policy_tpps = {
+	.pd_size		= sizeof(struct tpps_group),
+	.cftypes		= tpps_blkcg_files,
+	.pd_init_fn		= tpps_pd_init,
+	.pd_offline_fn		= tpps_pd_offline,
+	.pd_reset_stats_fn	= tpps_pd_reset_stats,
+};
+
+static int __init tpps_init(void)
+{
+	int ret;
+
+	ret = blkcg_policy_register(&blkcg_policy_tpps);
+	if (ret)
+		return ret;
+
+	ret = -ENOMEM;
+	tpps_pool = KMEM_CACHE(tpps_queue, 0);
+	if (!tpps_pool)
+		goto err_pol_unreg;
+
+	ret = elv_register(&iosched_tpps);
+	if (ret)
+		goto err_free_pool;
+
+	return 0;
+
+err_free_pool:
+	kmem_cache_destroy(tpps_pool);
+err_pol_unreg:
+	blkcg_policy_unregister(&blkcg_policy_tpps);
+	return ret;
+}
+
+static void __exit tpps_exit(void)
+{
+	blkcg_policy_unregister(&blkcg_policy_tpps);
+	elv_unregister(&iosched_tpps);
+	kmem_cache_destroy(tpps_pool);
+}
+
+module_init(tpps_init);
+module_exit(tpps_exit);
+
+MODULE_AUTHOR("Robin Dong");
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Tiny Parallel Proportion io Scheduler");
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 2fdb4a4..489257a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -42,7 +42,7 @@ struct blkcg_gq;
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
  */
-#define BLKCG_MAX_POLS		2
+#define BLKCG_MAX_POLS		3
 
 struct request;
 typedef void (rq_end_io_fn)(struct request *, int);
-- 
1.7.1