Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue
On Wed, Sep 20, 2017 at 08:20:58PM +0800, Ming Lei wrote: > On Tue, Sep 19, 2017 at 02:37:50PM -0600, Jens Axboe wrote: > > On 09/02/2017 09:17 AM, Ming Lei wrote: > > > @@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct > > > blk_mq_hw_ctx *hctx) > > > if (!list_empty(_list)) { > > > blk_mq_sched_mark_restart_hctx(hctx); > > > do_sched_dispatch = blk_mq_dispatch_rq_list(q, _list); > > > - } else if (!has_sched_dispatch) { > > > + } else if (!has_sched_dispatch && !q->queue_depth) { > > > + /* > > > + * If there is no per-request_queue depth, we > > > + * flush all requests in this hw queue, otherwise > > > + * pick up request one by one from sw queue for > > > + * avoiding to mess up I/O merge when dispatch > > > + * is busy, which can be triggered easily by > > > + * per-request_queue queue depth > > > + */ > > > blk_mq_flush_busy_ctxs(hctx, _list); > > > blk_mq_dispatch_rq_list(q, _list); > > > } > > > > I don't like this part at all. It's a heuristic, and on top of that, > > it's a heuristic based on a weak assumption that if q->queue_depth == 0, > > then we never run into BUSY constraints. I think that would be stronger > > if it was based on "is this using shared tags", but even that is not > > great at all. It's perfectly possible to have a shared tags setup and > > never run into resource constraints. The opposite condition is also true > > - you can run without shared tags, and yet hit resource constraints > > constantly. Hence this is NOT a good test for deciding whether to flush > > everything at once or not. In short, I think a much better test case > > would be "has this device ever returned BLK_STS_RESOURCE. As it so > > happens, that's easy enough to keep track of and base this test on. > > Hi Jens, > > The attached patch follows your suggestion, and uses EWMA to > compute the average length of hctx->dispatch, then only flush > all requests from ctxs if the average length is zero, what do > you think about this approach? 
Or other suggestions? Hi Jens, I am going to prepare for V5, as suggested by Omar. Could you suggest which way you prefer? Keeping to check q->queue_depth, or the approach of using average length of hctx->dispatch, or others? -- Ming
Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue
On Tue, Sep 19, 2017 at 02:37:50PM -0600, Jens Axboe wrote: > On 09/02/2017 09:17 AM, Ming Lei wrote: > > @@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct > > blk_mq_hw_ctx *hctx) > > if (!list_empty(_list)) { > > blk_mq_sched_mark_restart_hctx(hctx); > > do_sched_dispatch = blk_mq_dispatch_rq_list(q, _list); > > - } else if (!has_sched_dispatch) { > > + } else if (!has_sched_dispatch && !q->queue_depth) { > > + /* > > +* If there is no per-request_queue depth, we > > +* flush all requests in this hw queue, otherwise > > +* pick up request one by one from sw queue for > > +* avoiding to mess up I/O merge when dispatch > > +* is busy, which can be triggered easily by > > +* per-request_queue queue depth > > +*/ > > blk_mq_flush_busy_ctxs(hctx, _list); > > blk_mq_dispatch_rq_list(q, _list); > > } > > I don't like this part at all. It's a heuristic, and on top of that, > it's a heuristic based on a weak assumption that if q->queue_depth == 0, > then we never run into BUSY constraints. I think that would be stronger > if it was based on "is this using shared tags", but even that is not > great at all. It's perfectly possible to have a shared tags setup and > never run into resource constraints. The opposite condition is also true > - you can run without shared tags, and yet hit resource constraints > constantly. Hence this is NOT a good test for deciding whether to flush > everything at once or not. In short, I think a much better test case > would be "has this device ever returned BLK_STS_RESOURCE. As it so > happens, that's easy enough to keep track of and base this test on. Hi Jens, The attached patch follows your suggestion, and uses EWMA to compute the average length of hctx->dispatch, then only flush all requests from ctxs if the average length is zero, what do you think about this approach? Or other suggestions? 
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 7a27f262c96a..c420c775b9c0 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -232,6 +232,14 @@ static int hctx_flags_show(void *data, struct seq_file *m) return 0; } +static int hctx_dispatch_len_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + + seq_printf(m, "%u\n", hctx->dispatch_len); + return 0; +} + #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const op_name[] = { REQ_OP_NAME(READ), @@ -763,6 +771,7 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"state", 0400, hctx_state_show}, {"flags", 0400, hctx_flags_show}, {"dispatch", 0400, .seq_ops = _dispatch_seq_ops}, + {"dispatch_length", 0400, hctx_dispatch_len_show}, {"busy", 0400, hctx_busy_show}, {"ctx_map", 0400, hctx_ctx_map_show}, {"tags", 0400, hctx_tags_show}, diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index a5712dd67783..91a6eeaadaf1 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -102,7 +102,7 @@ static void blk_mq_do_dispatch_sched(struct request_queue *q, if (!rq) break; list_add(>queuelist, _list); - } while (blk_mq_dispatch_rq_list(q, _list)); + } while (blk_mq_dispatch_rq_list(q, _list, 1)); } static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, @@ -134,19 +134,51 @@ static void blk_mq_do_dispatch_ctx(struct request_queue *q, /* round robin for fair dispatch */ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); - dispatched = blk_mq_dispatch_rq_list(q, _list); + dispatched = blk_mq_dispatch_rq_list(q, _list, 1); } while (dispatched); if (!dispatched) WRITE_ONCE(hctx->dispatch_from, ctx); } +/* borrowed from bcache */ +void ewma_add(unsigned *ewma, unsigned val, unsigned weight) +{ + unsigned result = READ_ONCE(*ewma); + + if (val == 1) { + result += 1; + } else { + result *= weight - 1; + result += val; + result /= weight; + } + WRITE_ONCE(*ewma, result); +} + +/* + * We use EWMA to compute the 
average length of dispatch list. + * It is totally lockless, but we can survive that since it + * is just a hint. + * + * We only flush requests from all ctxs if the average length + * of dispatch list is zero, otherwise the hw queue can be + * thought as busy and we dequeue request from ctxs one by + * one + */ +static void blk_mq_update_dispatch_length(struct blk_mq_hw_ctx *hctx, + unsigned cnt) +{ + ewma_add(>dispatch_len, cnt, 8); +} + void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct
Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue
Hi Jens, Thanks for your comment! On Tue, Sep 19, 2017 at 02:37:50PM -0600, Jens Axboe wrote: > On 09/02/2017 09:17 AM, Ming Lei wrote: > > @@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct > > blk_mq_hw_ctx *hctx) > > if (!list_empty(_list)) { > > blk_mq_sched_mark_restart_hctx(hctx); > > do_sched_dispatch = blk_mq_dispatch_rq_list(q, _list); > > - } else if (!has_sched_dispatch) { > > + } else if (!has_sched_dispatch && !q->queue_depth) { > > + /* > > +* If there is no per-request_queue depth, we > > +* flush all requests in this hw queue, otherwise > > +* pick up request one by one from sw queue for > > +* avoiding to mess up I/O merge when dispatch > > +* is busy, which can be triggered easily by > > +* per-request_queue queue depth > > +*/ > > blk_mq_flush_busy_ctxs(hctx, _list); > > blk_mq_dispatch_rq_list(q, _list); > > } > > I don't like this part at all. It's a heuristic, and on top of that, Yeah, it is a heuristic for improving SCSI-MQ's performance. > it's a heuristic based on a weak assumption that if q->queue_depth == 0, Only SCSI sets q->queue_depth, which means all hw queues share the global queue depth of q->queue_depth, once the total number of in-flight requests from all hctx is bigger than q->queue_depth, the queue will becomes BUSY. > then we never run into BUSY constraints. I think that would be stronger > if it was based on "is this using shared tags", but even that is not IMO, it isn't related with shared tags, for example, SATA's performance can be affected by SCSI_MQ, which has single tags. The root cause is that limits of q->queue_depth and SCSI's .cmd_per_lun inside SCSI, which will cause device to return BLK_STS_RESOURCE because IO scheduler queue depth is usually bigger than q->queue_depth, and it can be quite worse especially in case of shared tags. > great at all. It's perfectly possible to have a shared tags setup and > never run into resource constraints. 
The opposite condition is also true As I explained, it isn't related with shared tags, either shared tags or single tags may run into resource constraints, SCSI has both shared/single tags cases, and all have this issue if q->queue_depth set. > - you can run without shared tags, and yet hit resource constraints > constantly. Hence this is NOT a good test for deciding whether to flush > everything at once or not. In short, I think a much better test case > would be "has this device ever returned BLK_STS_RESOURCE. As it so > happens, that's easy enough to keep track of and base this test on. OK, looks we can try to flush all first, and switch to pick one by one if device ever returned BLK_STS_RESOURCE. Are you fine with this way? Or better suggestion? > > Additionally, this further differentiates dispatch cases. I'd much > rather just have one sane dispatch case. I realize we are balancing > between performance/scalability and practical cases like merging here, > but it should not be impossible to do. Yeah, I agree, but both flush_all and pick one by one aren't invented by this patch, :-) -- Ming
Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue
On 09/02/2017 09:17 AM, Ming Lei wrote: > @@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct > blk_mq_hw_ctx *hctx) > if (!list_empty(_list)) { > blk_mq_sched_mark_restart_hctx(hctx); > do_sched_dispatch = blk_mq_dispatch_rq_list(q, _list); > - } else if (!has_sched_dispatch) { > + } else if (!has_sched_dispatch && !q->queue_depth) { > + /* > + * If there is no per-request_queue depth, we > + * flush all requests in this hw queue, otherwise > + * pick up request one by one from sw queue for > + * avoiding to mess up I/O merge when dispatch > + * is busy, which can be triggered easily by > + * per-request_queue queue depth > + */ > blk_mq_flush_busy_ctxs(hctx, _list); > blk_mq_dispatch_rq_list(q, _list); > } I don't like this part at all. It's a heuristic, and on top of that, it's a heuristic based on a weak assumption that if q->queue_depth == 0, then we never run into BUSY constraints. I think that would be stronger if it was based on "is this using shared tags", but even that is not great at all. It's perfectly possible to have a shared tags setup and never run into resource constraints. The opposite condition is also true - you can run without shared tags, and yet hit resource constraints constantly. Hence this is NOT a good test for deciding whether to flush everything at once or not. In short, I think a much better test case would be "has this device ever returned BLK_STS_RESOURCE. As it so happens, that's easy enough to keep track of and base this test on. Additionally, this further differentiates dispatch cases. I'd much rather just have one sane dispatch case. I realize we are balancing between performance/scalability and practical cases like merging here, but it should not be impossible to do. -- Jens Axboe
Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue
On Mon, Sep 11, 2017 at 12:13:09PM +0800, Ming Lei wrote: > On Sun, Sep 10, 2017 at 10:38:33AM -0700, Omar Sandoval wrote: > > On Sun, Sep 10, 2017 at 12:45:15PM +0800, Ming Lei wrote: > > > On Fri, Sep 08, 2017 at 04:54:39PM -0700, Omar Sandoval wrote: > > > > On Sat, Sep 02, 2017 at 11:17:20PM +0800, Ming Lei wrote: > > > > > SCSI devices use host-wide tagset, and the shared > > > > > driver tag space is often quite big. Meantime > > > > > there is also queue depth for each lun(.cmd_per_lun), > > > > > which is often small. > > > > > > > > > > So lots of requests may stay in sw queue, and we > > > > > always flush all belonging to same hw queue and > > > > > dispatch them all to driver, unfortunately it is > > > > > easy to cause queue busy because of the small > > > > > per-lun queue depth. Once these requests are flushed > > > > > out, they have to stay in hctx->dispatch, and no bio > > > > > merge can participate into these requests, and > > > > > sequential IO performance is hurted. > > > > > > > > > > This patch improves dispatching from sw queue when > > > > > there is per-request-queue queue depth by taking > > > > > request one by one from sw queue, just like the way > > > > > of IO scheduler. 
> > > > > > > > > > Reviewed-by: Bart Van Assche> > > > > Signed-off-by: Ming Lei > > > > > --- > > > > > block/blk-mq-sched.c | 61 > > > > > +- > > > > > include/linux/blk-mq.h | 2 ++ > > > > > 2 files changed, 57 insertions(+), 6 deletions(-) > > > > > > > > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c > > > > > index f69752961a34..735e432294ab 100644 > > > > > --- a/block/blk-mq-sched.c > > > > > +++ b/block/blk-mq-sched.c > > > > > @@ -89,9 +89,9 @@ static bool blk_mq_sched_restart_hctx(struct > > > > > blk_mq_hw_ctx *hctx) > > > > > return false; > > > > > } > > > > > > > > > > -static void blk_mq_do_dispatch(struct request_queue *q, > > > > > -struct elevator_queue *e, > > > > > -struct blk_mq_hw_ctx *hctx) > > > > > +static void blk_mq_do_dispatch_sched(struct request_queue *q, > > > > > + struct elevator_queue *e, > > > > > + struct blk_mq_hw_ctx *hctx) > > > > > { > > > > > LIST_HEAD(rq_list); > > > > > > > > > > @@ -105,6 +105,42 @@ static void blk_mq_do_dispatch(struct > > > > > request_queue *q, > > > > > } while (blk_mq_dispatch_rq_list(q, _list)); > > > > > } > > > > > > > > > > +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, > > > > > + struct blk_mq_ctx *ctx) > > > > > +{ > > > > > + unsigned idx = ctx->index_hw; > > > > > + > > > > > + if (++idx == hctx->nr_ctx) > > > > > + idx = 0; > > > > > + > > > > > + return hctx->ctxs[idx]; > > > > > +} > > > > > + > > > > > +static void blk_mq_do_dispatch_ctx(struct request_queue *q, > > > > > +struct blk_mq_hw_ctx *hctx) > > > > > +{ > > > > > + LIST_HEAD(rq_list); > > > > > + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); > > > > > + bool dispatched; > > > > > + > > > > > + do { > > > > > + struct request *rq; > > > > > + > > > > > + rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx); > > > > > + if (!rq) > > > > > + break; > > > > > + list_add(>queuelist, _list); > > > > > + > > > > > + /* round robin for fair dispatch */ > > > > > + ctx = blk_mq_next_ctx(hctx, 
rq->mq_ctx); > > > > > > > > Hm... this next ctx will get skipped if the dispatch on the previous ctx > > > > fails, since we call blk_mq_next_ctx() again. Seems unfair. Maybe move > > > > the blk_mq_next_ctx() from the if (!dispatched) below into the if (!rq) > > > > above? > > > > > > In case of if (!rq), that means there isn't any request in all ctxs > > > belonging to this hctx, so it is reasonable to start the dispatch from > > > any one of these ctxs next time, include the next one. > > > > Yep, that case is okay. > > > > > If dispatch fails on previous ctx, the rq from that ctx will be > > > put into ->dispatch, so it is fair to start dispatch from next ctx > > > next time too. > > > > I'm talking about this case > > > > LIST_HEAD(rq_list); > > struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); > > bool dispatched; > > > > /* > > * Let's say that ctxs 0, 1, and 2 all have requests pending and > > * hctx->dispatch_from was ctx0, so ctx is ctx0 when we start. > > */ > > do { > > struct request *rq; > > > > rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx); > > if (!rq) > > break; > > list_add(>queuelist, _list); > > > > /* Now rq is a request from ctx0 */ > > > >
Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue
On Sun, Sep 10, 2017 at 12:45:15PM +0800, Ming Lei wrote: > On Fri, Sep 08, 2017 at 04:54:39PM -0700, Omar Sandoval wrote: > > On Sat, Sep 02, 2017 at 11:17:20PM +0800, Ming Lei wrote: > > > SCSI devices use host-wide tagset, and the shared > > > driver tag space is often quite big. Meantime > > > there is also queue depth for each lun(.cmd_per_lun), > > > which is often small. > > > > > > So lots of requests may stay in sw queue, and we > > > always flush all belonging to same hw queue and > > > dispatch them all to driver, unfortunately it is > > > easy to cause queue busy because of the small > > > per-lun queue depth. Once these requests are flushed > > > out, they have to stay in hctx->dispatch, and no bio > > > merge can participate into these requests, and > > > sequential IO performance is hurted. > > > > > > This patch improves dispatching from sw queue when > > > there is per-request-queue queue depth by taking > > > request one by one from sw queue, just like the way > > > of IO scheduler. 
> > > > > > Reviewed-by: Bart Van Assche> > > Signed-off-by: Ming Lei > > > --- > > > block/blk-mq-sched.c | 61 > > > +- > > > include/linux/blk-mq.h | 2 ++ > > > 2 files changed, 57 insertions(+), 6 deletions(-) > > > > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c > > > index f69752961a34..735e432294ab 100644 > > > --- a/block/blk-mq-sched.c > > > +++ b/block/blk-mq-sched.c > > > @@ -89,9 +89,9 @@ static bool blk_mq_sched_restart_hctx(struct > > > blk_mq_hw_ctx *hctx) > > > return false; > > > } > > > > > > -static void blk_mq_do_dispatch(struct request_queue *q, > > > -struct elevator_queue *e, > > > -struct blk_mq_hw_ctx *hctx) > > > +static void blk_mq_do_dispatch_sched(struct request_queue *q, > > > + struct elevator_queue *e, > > > + struct blk_mq_hw_ctx *hctx) > > > { > > > LIST_HEAD(rq_list); > > > > > > @@ -105,6 +105,42 @@ static void blk_mq_do_dispatch(struct request_queue > > > *q, > > > } while (blk_mq_dispatch_rq_list(q, _list)); > > > } > > > > > > +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, > > > + struct blk_mq_ctx *ctx) > > > +{ > > > + unsigned idx = ctx->index_hw; > > > + > > > + if (++idx == hctx->nr_ctx) > > > + idx = 0; > > > + > > > + return hctx->ctxs[idx]; > > > +} > > > + > > > +static void blk_mq_do_dispatch_ctx(struct request_queue *q, > > > +struct blk_mq_hw_ctx *hctx) > > > +{ > > > + LIST_HEAD(rq_list); > > > + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); > > > + bool dispatched; > > > + > > > + do { > > > + struct request *rq; > > > + > > > + rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx); > > > + if (!rq) > > > + break; > > > + list_add(>queuelist, _list); > > > + > > > + /* round robin for fair dispatch */ > > > + ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); > > > > Hm... this next ctx will get skipped if the dispatch on the previous ctx > > fails, since we call blk_mq_next_ctx() again. Seems unfair. 
Maybe move > > the blk_mq_next_ctx() from the if (!dispatched) below into the if (!rq) > > above? > > In case of if (!rq), that means there isn't any request in all ctxs > belonging to this hctx, so it is reasonable to start the dispatch from > any one of these ctxs next time, include the next one. Yep, that case is okay. > If dispatch fails on previous ctx, the rq from that ctx will be > put into ->dispatch, so it is fair to start dispatch from next ctx > next time too. I'm talking about this case LIST_HEAD(rq_list); struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); bool dispatched; /* * Let's say that ctxs 0, 1, and 2 all have requests pending and * hctx->dispatch_from was ctx0, so ctx is ctx0 when we start. */ do { struct request *rq; rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx); if (!rq) break; list_add(>queuelist, _list); /* Now rq is a request from ctx0 */ /* round robin for fair dispatch */ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); /* Now ctx is ctx1. */ dispatched = blk_mq_dispatch_rq_list(q, _list); /* If we couldn't dispatch, we break here. */ } while (dispatched); if (!dispatched) /* * Now we set hctx->dispatch_from to ctx2, so we've * skipped over ctx1. */ WRITE_ONCE(hctx->dispatch_from, blk_mq_next_ctx(hctx, ctx));
Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue
On Fri, Sep 08, 2017 at 04:54:39PM -0700, Omar Sandoval wrote: > On Sat, Sep 02, 2017 at 11:17:20PM +0800, Ming Lei wrote: > > SCSI devices use host-wide tagset, and the shared > > driver tag space is often quite big. Meantime > > there is also queue depth for each lun(.cmd_per_lun), > > which is often small. > > > > So lots of requests may stay in sw queue, and we > > always flush all belonging to same hw queue and > > dispatch them all to driver, unfortunately it is > > easy to cause queue busy because of the small > > per-lun queue depth. Once these requests are flushed > > out, they have to stay in hctx->dispatch, and no bio > > merge can participate into these requests, and > > sequential IO performance is hurted. > > > > This patch improves dispatching from sw queue when > > there is per-request-queue queue depth by taking > > request one by one from sw queue, just like the way > > of IO scheduler. > > > > Reviewed-by: Bart Van Assche> > Signed-off-by: Ming Lei > > --- > > block/blk-mq-sched.c | 61 > > +- > > include/linux/blk-mq.h | 2 ++ > > 2 files changed, 57 insertions(+), 6 deletions(-) > > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c > > index f69752961a34..735e432294ab 100644 > > --- a/block/blk-mq-sched.c > > +++ b/block/blk-mq-sched.c > > @@ -89,9 +89,9 @@ static bool blk_mq_sched_restart_hctx(struct > > blk_mq_hw_ctx *hctx) > > return false; > > } > > > > -static void blk_mq_do_dispatch(struct request_queue *q, > > - struct elevator_queue *e, > > - struct blk_mq_hw_ctx *hctx) > > +static void blk_mq_do_dispatch_sched(struct request_queue *q, > > +struct elevator_queue *e, > > +struct blk_mq_hw_ctx *hctx) > > { > > LIST_HEAD(rq_list); > > > > @@ -105,6 +105,42 @@ static void blk_mq_do_dispatch(struct request_queue *q, > > } while (blk_mq_dispatch_rq_list(q, _list)); > > } > > > > +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, > > + struct blk_mq_ctx *ctx) > > +{ > > + unsigned idx = ctx->index_hw; > > + > > + 
if (++idx == hctx->nr_ctx) > > + idx = 0; > > + > > + return hctx->ctxs[idx]; > > +} > > + > > +static void blk_mq_do_dispatch_ctx(struct request_queue *q, > > + struct blk_mq_hw_ctx *hctx) > > +{ > > + LIST_HEAD(rq_list); > > + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); > > + bool dispatched; > > + > > + do { > > + struct request *rq; > > + > > + rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx); > > + if (!rq) > > + break; > > + list_add(>queuelist, _list); > > + > > + /* round robin for fair dispatch */ > > + ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); > > Hm... this next ctx will get skipped if the dispatch on the previous ctx > fails, since we call blk_mq_next_ctx() again. Seems unfair. Maybe move > the blk_mq_next_ctx() from the if (!dispatched) below into the if (!rq) > above? In case of if (!rq), that means there isn't any request in all ctxs belonging to this hctx, so it is reasonable to start the dispatch from any one of these ctxs next time, include the next one. If dispatch fails on previous ctx, the rq from that ctx will be put into ->dispatch, so it is fair to start dispatch from next ctx next time too. -- Ming
Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue
On Sat, Sep 02, 2017 at 11:17:20PM +0800, Ming Lei wrote: > SCSI devices use host-wide tagset, and the shared > driver tag space is often quite big. Meantime > there is also queue depth for each lun(.cmd_per_lun), > which is often small. > > So lots of requests may stay in sw queue, and we > always flush all belonging to same hw queue and > dispatch them all to driver, unfortunately it is > easy to cause queue busy because of the small > per-lun queue depth. Once these requests are flushed > out, they have to stay in hctx->dispatch, and no bio > merge can participate into these requests, and > sequential IO performance is hurted. > > This patch improves dispatching from sw queue when > there is per-request-queue queue depth by taking > request one by one from sw queue, just like the way > of IO scheduler. > > Reviewed-by: Bart Van Assche> Signed-off-by: Ming Lei > --- > block/blk-mq-sched.c | 61 > +- > include/linux/blk-mq.h | 2 ++ > 2 files changed, 57 insertions(+), 6 deletions(-) > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c > index f69752961a34..735e432294ab 100644 > --- a/block/blk-mq-sched.c > +++ b/block/blk-mq-sched.c > @@ -89,9 +89,9 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx > *hctx) > return false; > } > > -static void blk_mq_do_dispatch(struct request_queue *q, > -struct elevator_queue *e, > -struct blk_mq_hw_ctx *hctx) > +static void blk_mq_do_dispatch_sched(struct request_queue *q, > + struct elevator_queue *e, > + struct blk_mq_hw_ctx *hctx) > { > LIST_HEAD(rq_list); > > @@ -105,6 +105,42 @@ static void blk_mq_do_dispatch(struct request_queue *q, > } while (blk_mq_dispatch_rq_list(q, _list)); > } > > +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, > + struct blk_mq_ctx *ctx) > +{ > + unsigned idx = ctx->index_hw; > + > + if (++idx == hctx->nr_ctx) > + idx = 0; > + > + return hctx->ctxs[idx]; > +} > + > +static void blk_mq_do_dispatch_ctx(struct request_queue *q, > +struct blk_mq_hw_ctx *hctx) > 
+{ > + LIST_HEAD(rq_list); > + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); > + bool dispatched; > + > + do { > + struct request *rq; > + > + rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx); > + if (!rq) > + break; > + list_add(>queuelist, _list); > + > + /* round robin for fair dispatch */ > + ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); Hm... this next ctx will get skipped if the dispatch on the previous ctx fails, since we call blk_mq_next_ctx() again. Seems unfair. Maybe move the blk_mq_next_ctx() from the if (!dispatched) below into the if (!rq) above? > + dispatched = blk_mq_dispatch_rq_list(q, _list); > + } while (dispatched); > + > + if (!dispatched) > + WRITE_ONCE(hctx->dispatch_from, blk_mq_next_ctx(hctx, ctx)); > +} > + > void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) > { > struct request_queue *q = hctx->queue; > @@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct > blk_mq_hw_ctx *hctx) > if (!list_empty(_list)) { > blk_mq_sched_mark_restart_hctx(hctx); > do_sched_dispatch = blk_mq_dispatch_rq_list(q, _list); > - } else if (!has_sched_dispatch) { > + } else if (!has_sched_dispatch && !q->queue_depth) { > + /* > + * If there is no per-request_queue depth, we > + * flush all requests in this hw queue, otherwise > + * pick up request one by one from sw queue for > + * avoiding to mess up I/O merge when dispatch > + * is busy, which can be triggered easily by > + * per-request_queue queue depth > + */ > blk_mq_flush_busy_ctxs(hctx, _list); > blk_mq_dispatch_rq_list(q, _list); > } > > + if (!do_sched_dispatch) > + return; > + > /* >* We want to dispatch from the scheduler if we had no work left >* on the dispatch list, OR if we did have work but weren't able >* to make progress. 
>*/ > - if (do_sched_dispatch && has_sched_dispatch) > - blk_mq_do_dispatch(q, e, hctx); > + if (has_sched_dispatch) > + blk_mq_do_dispatch_sched(q, e, hctx); > + else > + blk_mq_do_dispatch_ctx(q, hctx); > } > > bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, > diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h > index 50c6485cb04f..7b7a366a97f3 100644 > --- a/include/linux/blk-mq.h > +++ b/include/linux/blk-mq.h > @@ -30,6
[PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue
SCSI devices use host-wide tagset, and the shared driver tag space is often quite big. Meantime there is also queue depth for each lun(.cmd_per_lun), which is often small. So lots of requests may stay in sw queue, and we always flush all belonging to same hw queue and dispatch them all to driver, unfortunately it is easy to cause queue busy because of the small per-lun queue depth. Once these requests are flushed out, they have to stay in hctx->dispatch, and no bio merge can participate into these requests, and sequential IO performance is hurted. This patch improves dispatching from sw queue when there is per-request-queue queue depth by taking request one by one from sw queue, just like the way of IO scheduler. Reviewed-by: Bart Van AsscheSigned-off-by: Ming Lei --- block/blk-mq-sched.c | 61 +- include/linux/blk-mq.h | 2 ++ 2 files changed, 57 insertions(+), 6 deletions(-) diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index f69752961a34..735e432294ab 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -89,9 +89,9 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx) return false; } -static void blk_mq_do_dispatch(struct request_queue *q, - struct elevator_queue *e, - struct blk_mq_hw_ctx *hctx) +static void blk_mq_do_dispatch_sched(struct request_queue *q, +struct elevator_queue *e, +struct blk_mq_hw_ctx *hctx) { LIST_HEAD(rq_list); @@ -105,6 +105,42 @@ static void blk_mq_do_dispatch(struct request_queue *q, } while (blk_mq_dispatch_rq_list(q, _list)); } +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + unsigned idx = ctx->index_hw; + + if (++idx == hctx->nr_ctx) + idx = 0; + + return hctx->ctxs[idx]; +} + +static void blk_mq_do_dispatch_ctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) +{ + LIST_HEAD(rq_list); + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); + bool dispatched; + + do { + struct request *rq; + + rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx); 
+ if (!rq) + break; + list_add(>queuelist, _list); + + /* round robin for fair dispatch */ + ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); + + dispatched = blk_mq_dispatch_rq_list(q, _list); + } while (dispatched); + + if (!dispatched) + WRITE_ONCE(hctx->dispatch_from, blk_mq_next_ctx(hctx, ctx)); +} + void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; @@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) if (!list_empty(_list)) { blk_mq_sched_mark_restart_hctx(hctx); do_sched_dispatch = blk_mq_dispatch_rq_list(q, _list); - } else if (!has_sched_dispatch) { + } else if (!has_sched_dispatch && !q->queue_depth) { + /* +* If there is no per-request_queue depth, we +* flush all requests in this hw queue, otherwise +* pick up request one by one from sw queue for +* avoiding to mess up I/O merge when dispatch +* is busy, which can be triggered easily by +* per-request_queue queue depth +*/ blk_mq_flush_busy_ctxs(hctx, _list); blk_mq_dispatch_rq_list(q, _list); } + if (!do_sched_dispatch) + return; + /* * We want to dispatch from the scheduler if we had no work left * on the dispatch list, OR if we did have work but weren't able * to make progress. */ - if (do_sched_dispatch && has_sched_dispatch) - blk_mq_do_dispatch(q, e, hctx); + if (has_sched_dispatch) + blk_mq_do_dispatch_sched(q, e, hctx); + else + blk_mq_do_dispatch_ctx(q, hctx); } bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 50c6485cb04f..7b7a366a97f3 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -30,6 +30,8 @@ struct blk_mq_hw_ctx { struct sbitmap ctx_map; + struct blk_mq_ctx *dispatch_from; + struct blk_mq_ctx **ctxs; unsigned intnr_ctx; -- 2.9.5