From: Jiufei Xue <jiufei....@alibaba-inc.com>

Cgroup writeback is supported since v4.2. But there exists a problem
in the following case.

A cgroup may issue both buffered and direct/sync IOs. The foreground
thread will be stalled when periodic writeback IOs are flushed, because
the service queue in the block throttle layer already holds plenty of
writeback IOs and foreground IOs must then be enqueued behind them by
its FIFO policy. The current policy dispatches 6 reads and 2 writes
during each round, so sync writes will be significantly delayed.

This patch adds another queue in block throttle. Now there are 3 queues
in a service queue: read, sync write, async write, and we can dispatch
more sync writes than async writes.

Test:
1) setup a memcg and a blkcg with uplimit bandwidth 20m/s;
2) start a script that writes 5G of data to the page cache, then start sync
3) start sync writes:
    dd if=/dev/zero of=test bs=4k count=12800 oflag=direct conv=notrunc

Unpatched:
52428800 bytes (52 MB) copied, 307.065 s, 171 kB/s

Patched:
52428800 bytes (52 MB) copied, 13.8192 s, 3.8 MB/s

Signed-off-by: Jiufei Xue <jiufei....@alibaba-inc.com>
---
 block/blk-throttle.c | 195 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 129 insertions(+), 66 deletions(-)

diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index d19f416..842257e 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -14,10 +14,10 @@
 #include "blk.h"
 
 /* Max dispatch from a group in 1 round */
-static int throtl_grp_quantum = 8;
+static int throtl_grp_quantum = 32;
 
 /* Total max dispatch from all groups in one round */
-static int throtl_quantum = 32;
+static int throtl_quantum = 128;
 
 /* Throttling is performed over a slice and after that slice is renewed */
 #define DFL_THROTL_SLICE_HD (HZ / 10)
@@ -43,6 +43,12 @@
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
 
+enum wl_type {
+       READ_WORKLOAD = 0,
+       SYNC_WRITE_WORKLOAD = 1,
+       ASYNC_WRITE_WORKLOAD = 2
+};
+
 /*
  * To implement hierarchical throttling, throtl_grps form a tree and bios
  * are dispatched upwards level by level until they reach the top and get
@@ -79,8 +85,11 @@ struct throtl_service_queue {
         * Bios queued directly to this service_queue or dispatched from
         * children throtl_grp's.
         */
-       struct list_head        queued[2];      /* throtl_qnode [READ/WRITE] */
-       unsigned int            nr_queued[2];   /* number of queued bios */
+       /* throtl_qnode [READ/SYNC_WRITE/ASYNC_WRITE] */
+       struct list_head        queued[3];
+
+       unsigned int            nr_queued[3];   /* number of queued bios */
+
 
        /*
         * RB tree of active children throtl_grp's, which are sorted by
@@ -127,8 +136,8 @@ struct throtl_grp {
         * with the sibling qnode_on_parents and the parent's
         * qnode_on_self.
         */
-       struct throtl_qnode qnode_on_self[2];
-       struct throtl_qnode qnode_on_parent[2];
+       struct throtl_qnode qnode_on_self[3];
+       struct throtl_qnode qnode_on_parent[3];
 
        /*
         * Dispatch time in jiffies. This is the estimated time when group
@@ -202,7 +211,7 @@ struct throtl_data
        struct request_queue *queue;
 
        /* Total Number of queued bios on READ and WRITE lists */
-       unsigned int nr_queued[2];
+       unsigned int nr_queued[3];
 
        unsigned int throtl_slice;
 
@@ -274,6 +283,18 @@ static struct throtl_data *sq_to_td(struct 
throtl_service_queue *sq)
                return container_of(sq, struct throtl_data, service_queue);
 }
 
+static inline enum wl_type bio_workload_type(struct bio *bio)
+{
+       return bio_data_dir(bio) ?
+              ((bio->bi_opf & REQ_SYNC) ? SYNC_WRITE_WORKLOAD :
+              ASYNC_WRITE_WORKLOAD) : READ_WORKLOAD;
+}
+
+static inline bool wl_to_rw(enum wl_type type)
+{
+       return type >= SYNC_WRITE_WORKLOAD;
+}
+
 /*
  * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
  * make the IO dispatch more smooth.
@@ -475,8 +496,9 @@ static struct bio *throtl_pop_queued(struct list_head 
*queued,
 /* init a service_queue, assumes the caller zeroed it */
 static void throtl_service_queue_init(struct throtl_service_queue *sq)
 {
-       INIT_LIST_HEAD(&sq->queued[0]);
-       INIT_LIST_HEAD(&sq->queued[1]);
+       INIT_LIST_HEAD(&sq->queued[READ_WORKLOAD]);
+       INIT_LIST_HEAD(&sq->queued[SYNC_WRITE_WORKLOAD]);
+       INIT_LIST_HEAD(&sq->queued[ASYNC_WRITE_WORKLOAD]);
        sq->pending_tree = RB_ROOT;
        timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
 }
@@ -484,7 +506,7 @@ static void throtl_service_queue_init(struct 
throtl_service_queue *sq)
 static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
 {
        struct throtl_grp *tg;
-       int rw;
+       enum wl_type type;
 
        tg = kzalloc_node(sizeof(*tg), gfp, node);
        if (!tg)
@@ -492,9 +514,9 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, 
int node)
 
        throtl_service_queue_init(&tg->service_queue);
 
-       for (rw = READ; rw <= WRITE; rw++) {
-               throtl_qnode_init(&tg->qnode_on_self[rw], tg);
-               throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
+       for (type = READ_WORKLOAD; type <= ASYNC_WRITE_WORKLOAD; type++) {
+               throtl_qnode_init(&tg->qnode_on_self[type], tg);
+               throtl_qnode_init(&tg->qnode_on_parent[type], tg);
        }
 
        RB_CLEAR_NODE(&tg->rb_node);
@@ -985,6 +1007,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct 
bio *bio,
                            unsigned long *wait)
 {
        bool rw = bio_data_dir(bio);
+       enum wl_type type = bio_workload_type(bio);
        unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
 
        /*
@@ -993,8 +1016,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct 
bio *bio,
         * this function with a different bio if there are other bios
         * queued.
         */
-       BUG_ON(tg->service_queue.nr_queued[rw] &&
-              bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
+       BUG_ON(tg->service_queue.nr_queued[type] &&
+              bio != throtl_peek_queued(&tg->service_queue.queued[type]));
 
        /* If tg->bps = -1, then BW is unlimited */
        if (tg_bps_limit(tg, rw) == U64_MAX &&
@@ -1011,7 +1034,12 @@ static bool tg_may_dispatch(struct throtl_grp *tg, 
struct bio *bio,
         * If there is queued bio, that means there should be an active
         * slice and it should be extended instead.
         */
-       if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
+       if (throtl_slice_used(tg, rw) &&
+          ((rw == READ &&
+           !tg->service_queue.nr_queued[READ_WORKLOAD]) ||
+           (rw == WRITE &&
+           !tg->service_queue.nr_queued[SYNC_WRITE_WORKLOAD] &&
+           !tg->service_queue.nr_queued[ASYNC_WRITE_WORKLOAD])))
                throtl_start_new_slice(tg, rw);
        else {
                if (time_before(tg->slice_end[rw],
@@ -1072,10 +1100,10 @@ static void throtl_add_bio_tg(struct bio *bio, struct 
throtl_qnode *qn,
                              struct throtl_grp *tg)
 {
        struct throtl_service_queue *sq = &tg->service_queue;
-       bool rw = bio_data_dir(bio);
+       int type = bio_workload_type(bio);
 
        if (!qn)
-               qn = &tg->qnode_on_self[rw];
+               qn = &tg->qnode_on_self[type];
 
        /*
         * If @tg doesn't currently have any bios queued in the same
@@ -1083,12 +1111,12 @@ static void throtl_add_bio_tg(struct bio *bio, struct 
throtl_qnode *qn,
         * dispatched.  Mark that @tg was empty.  This is automatically
         * cleaered on the next tg_update_disptime().
         */
-       if (!sq->nr_queued[rw])
+       if (!sq->nr_queued[type])
                tg->flags |= THROTL_TG_WAS_EMPTY;
 
-       throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);
+       throtl_qnode_add_bio(bio, qn, &sq->queued[type]);
 
-       sq->nr_queued[rw]++;
+       sq->nr_queued[type]++;
        throtl_enqueue_tg(tg);
 }
 
@@ -1096,16 +1124,22 @@ static void tg_update_disptime(struct throtl_grp *tg)
 {
        struct throtl_service_queue *sq = &tg->service_queue;
        unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
+       unsigned long sync_write_wait = -1, async_write_wait = -1;
        struct bio *bio;
 
-       bio = throtl_peek_queued(&sq->queued[READ]);
+       bio = throtl_peek_queued(&sq->queued[READ_WORKLOAD]);
        if (bio)
                tg_may_dispatch(tg, bio, &read_wait);
 
-       bio = throtl_peek_queued(&sq->queued[WRITE]);
+       bio = throtl_peek_queued(&sq->queued[SYNC_WRITE_WORKLOAD]);
+       if (bio)
+               tg_may_dispatch(tg, bio, &sync_write_wait);
+
+       bio = throtl_peek_queued(&sq->queued[ASYNC_WRITE_WORKLOAD]);
        if (bio)
-               tg_may_dispatch(tg, bio, &write_wait);
+               tg_may_dispatch(tg, bio, &async_write_wait);
 
+       write_wait = min(sync_write_wait, async_write_wait);
        min_wait = min(read_wait, write_wait);
        disptime = jiffies + min_wait;
 
@@ -1125,16 +1159,16 @@ static void start_parent_slice_with_credit(struct 
throtl_grp *child_tg,
                throtl_start_new_slice_with_credit(parent_tg, rw,
                                child_tg->slice_start[rw]);
        }
-
 }
 
-static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
+static void tg_dispatch_one_bio(struct throtl_grp *tg, enum wl_type type)
 {
        struct throtl_service_queue *sq = &tg->service_queue;
        struct throtl_service_queue *parent_sq = sq->parent_sq;
        struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
        struct throtl_grp *tg_to_put = NULL;
        struct bio *bio;
+       bool rw = wl_to_rw(type);
 
        /*
         * @bio is being transferred from @tg to @parent_sq.  Popping a bio
@@ -1142,8 +1176,8 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, 
bool rw)
         * getting released prematurely.  Remember the tg to put and put it
         * after @bio is transferred to @parent_sq.
         */
-       bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
-       sq->nr_queued[rw]--;
+       bio = throtl_pop_queued(&sq->queued[type], &tg_to_put);
+       sq->nr_queued[type]--;
 
        throtl_charge_bio(tg, bio);
 
@@ -1155,13 +1189,13 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, 
bool rw)
         * responsible for issuing these bios.
         */
        if (parent_tg) {
-               throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
+               throtl_add_bio_tg(bio, &tg->qnode_on_parent[type], parent_tg);
                start_parent_slice_with_credit(tg, parent_tg, rw);
        } else {
-               throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
-                                    &parent_sq->queued[rw]);
-               BUG_ON(tg->td->nr_queued[rw] <= 0);
-               tg->td->nr_queued[rw]--;
+               throtl_qnode_add_bio(bio, &tg->qnode_on_parent[type],
+                                    &parent_sq->queued[type]);
+               BUG_ON(tg->td->nr_queued[type] <= 0);
+               tg->td->nr_queued[type]--;
        }
 
        throtl_trim_slice(tg, rw);
@@ -1173,34 +1207,45 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, 
bool rw)
 static int throtl_dispatch_tg(struct throtl_grp *tg)
 {
        struct throtl_service_queue *sq = &tg->service_queue;
-       unsigned int nr_reads = 0, nr_writes = 0;
+       unsigned int nr_reads = 0, nr_async_writes = 0, nr_sync_writes = 0;
        unsigned int max_nr_reads = throtl_grp_quantum*3/4;
-       unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
+       unsigned int max_sync_writes = (throtl_grp_quantum - max_nr_reads)*7/8;
+       unsigned int max_async_writes = throtl_grp_quantum - max_nr_reads
+                                       - max_sync_writes;
        struct bio *bio;
 
        /* Try to dispatch 75% READS and 25% WRITES */
-
-       while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
+       while ((bio = throtl_peek_queued(&sq->queued[READ_WORKLOAD])) &&
               tg_may_dispatch(tg, bio, NULL)) {
 
-               tg_dispatch_one_bio(tg, bio_data_dir(bio));
+               tg_dispatch_one_bio(tg, READ_WORKLOAD);
                nr_reads++;
 
                if (nr_reads >= max_nr_reads)
                        break;
        }
 
-       while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
+       while ((bio = throtl_peek_queued(&sq->queued[SYNC_WRITE_WORKLOAD])) &&
               tg_may_dispatch(tg, bio, NULL)) {
 
-               tg_dispatch_one_bio(tg, bio_data_dir(bio));
-               nr_writes++;
+               tg_dispatch_one_bio(tg, SYNC_WRITE_WORKLOAD);
+               nr_sync_writes++;
 
-               if (nr_writes >= max_nr_writes)
+               if (nr_sync_writes >= max_sync_writes)
                        break;
        }
 
-       return nr_reads + nr_writes;
+       while ((bio = throtl_peek_queued(&sq->queued[ASYNC_WRITE_WORKLOAD])) &&
+              tg_may_dispatch(tg, bio, NULL)) {
+
+               tg_dispatch_one_bio(tg, ASYNC_WRITE_WORKLOAD);
+               nr_async_writes++;
+
+               if (nr_async_writes >= max_async_writes)
+                       break;
+       }
+
+       return nr_reads + nr_sync_writes + nr_async_writes;
 }
 
 static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
@@ -1221,7 +1266,9 @@ static int throtl_select_dispatch(struct 
throtl_service_queue *parent_sq)
 
                nr_disp += throtl_dispatch_tg(tg);
 
-               if (sq->nr_queued[0] || sq->nr_queued[1])
+               if (sq->nr_queued[READ_WORKLOAD] ||
+                   sq->nr_queued[SYNC_WRITE_WORKLOAD] ||
+                   sq->nr_queued[ASYNC_WRITE_WORKLOAD])
                        tg_update_disptime(tg);
 
                if (nr_disp >= throtl_quantum)
@@ -1267,9 +1314,14 @@ static void throtl_pending_timer_fn(struct timer_list *t)
        dispatched = false;
 
        while (true) {
-               throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
-                          sq->nr_queued[READ] + sq->nr_queued[WRITE],
-                          sq->nr_queued[READ], sq->nr_queued[WRITE]);
+               throtl_log(sq,
+                          "dispatch nr_queued=%u read=%u sync_write=%u async_write=%u",
+                          sq->nr_queued[READ_WORKLOAD] +
+                          sq->nr_queued[SYNC_WRITE_WORKLOAD] +
+                          sq->nr_queued[ASYNC_WRITE_WORKLOAD],
+                          sq->nr_queued[READ_WORKLOAD],
+                          sq->nr_queued[SYNC_WRITE_WORKLOAD],
+                          sq->nr_queued[ASYNC_WRITE_WORKLOAD]);
 
                ret = throtl_select_dispatch(sq);
                if (ret) {
@@ -1325,13 +1377,13 @@ static void blk_throtl_dispatch_work_fn(struct 
work_struct *work)
        struct bio_list bio_list_on_stack;
        struct bio *bio;
        struct blk_plug plug;
-       int rw;
+       enum wl_type type;
 
        bio_list_init(&bio_list_on_stack);
 
        spin_lock_irq(q->queue_lock);
-       for (rw = READ; rw <= WRITE; rw++)
-               while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
+       for (type = READ_WORKLOAD; type <= ASYNC_WRITE_WORKLOAD; type++)
+               while ((bio = throtl_pop_queued(&td_sq->queued[type], NULL)))
                        bio_list_add(&bio_list_on_stack, bio);
        spin_unlock_irq(q->queue_lock);
 
@@ -1820,11 +1872,13 @@ static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
        write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
        if (!read_limit && !write_limit)
                return true;
-       if (read_limit && sq->nr_queued[READ] &&
-           (!write_limit || sq->nr_queued[WRITE]))
+       if (read_limit && sq->nr_queued[READ_WORKLOAD] &&
+           (!write_limit || sq->nr_queued[SYNC_WRITE_WORKLOAD] ||
+                            sq->nr_queued[ASYNC_WRITE_WORKLOAD]))
                return true;
-       if (write_limit && sq->nr_queued[WRITE] &&
-           (!read_limit || sq->nr_queued[READ]))
+       if (write_limit && (sq->nr_queued[SYNC_WRITE_WORKLOAD] ||
+                           sq->nr_queued[ASYNC_WRITE_WORKLOAD]) &&
+           (!read_limit || sq->nr_queued[READ_WORKLOAD]))
                return true;
 
        if (time_after_eq(jiffies,
@@ -2129,6 +2183,7 @@ bool blk_throtl_bio(struct request_queue *q, struct 
blkcg_gq *blkg,
        struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
        struct throtl_service_queue *sq;
        bool rw = bio_data_dir(bio);
+       enum wl_type type = bio_workload_type(bio);
        bool throttled = false;
        struct throtl_data *td = tg->td;
 
@@ -2157,7 +2212,7 @@ bool blk_throtl_bio(struct request_queue *q, struct 
blkcg_gq *blkg,
                throtl_downgrade_check(tg);
                throtl_upgrade_check(tg);
                /* throtl is FIFO - if bios are already queued, should queue */
-               if (sq->nr_queued[rw])
+               if (sq->nr_queued[type])
                        break;
 
                /* if above limits, break to queue */
@@ -2191,7 +2246,7 @@ bool blk_throtl_bio(struct request_queue *q, struct 
blkcg_gq *blkg,
                 * Climb up the ladder.  If we're already at the top, it
                 * can be executed directly.
                 */
-               qn = &tg->qnode_on_parent[rw];
+               qn = &tg->qnode_on_parent[type];
                sq = sq->parent_sq;
                tg = sq_to_tg(sq);
                if (!tg)
@@ -2199,16 +2254,19 @@ bool blk_throtl_bio(struct request_queue *q, struct 
blkcg_gq *blkg,
        }
 
        /* out-of-limit, queue to @tg */
-       throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u 
queued=%d/%d",
+       throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u "
+                  "iops=%u queued=%d/%d/%d",
                   rw == READ ? 'R' : 'W',
                   tg->bytes_disp[rw], bio->bi_iter.bi_size,
                   tg_bps_limit(tg, rw),
                   tg->io_disp[rw], tg_iops_limit(tg, rw),
-                  sq->nr_queued[READ], sq->nr_queued[WRITE]);
+                  sq->nr_queued[READ_WORKLOAD],
+                  sq->nr_queued[SYNC_WRITE_WORKLOAD],
+                  sq->nr_queued[ASYNC_WRITE_WORKLOAD]);
 
        tg->last_low_overflow_time[rw] = jiffies;
 
-       td->nr_queued[rw]++;
+       td->nr_queued[type]++;
        throtl_add_bio_tg(bio, qn, tg);
        throttled = true;
 
@@ -2334,10 +2392,15 @@ static void tg_drain_bios(struct throtl_service_queue 
*parent_sq)
 
                throtl_dequeue_tg(tg);
 
-               while ((bio = throtl_peek_queued(&sq->queued[READ])))
-                       tg_dispatch_one_bio(tg, bio_data_dir(bio));
-               while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
-                       tg_dispatch_one_bio(tg, bio_data_dir(bio));
+               while ((bio =
+                     throtl_peek_queued(&sq->queued[READ_WORKLOAD])))
+                       tg_dispatch_one_bio(tg, READ_WORKLOAD);
+               while ((bio =
+                     throtl_peek_queued(&sq->queued[SYNC_WRITE_WORKLOAD])))
+                       tg_dispatch_one_bio(tg, SYNC_WRITE_WORKLOAD);
+               while ((bio =
+                     throtl_peek_queued(&sq->queued[ASYNC_WRITE_WORKLOAD])))
+                       tg_dispatch_one_bio(tg, ASYNC_WRITE_WORKLOAD);
        }
 }
 
@@ -2354,7 +2417,7 @@ void blk_throtl_drain(struct request_queue *q)
        struct blkcg_gq *blkg;
        struct cgroup_subsys_state *pos_css;
        struct bio *bio;
-       int rw;
+       enum wl_type type;
 
        queue_lockdep_assert_held(q);
        rcu_read_lock();
@@ -2375,8 +2438,8 @@ void blk_throtl_drain(struct request_queue *q)
        spin_unlock_irq(q->queue_lock);
 
        /* all bios now should be in td->service_queue, issue them */
-       for (rw = READ; rw <= WRITE; rw++)
-               while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
+       for (type = READ_WORKLOAD; type <= ASYNC_WRITE_WORKLOAD; type++)
+               while ((bio = throtl_pop_queued(&td->service_queue.queued[type],
                                                NULL)))
                        generic_make_request(bio);
 
-- 
1.8.3.1

Reply via email to