Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue

2017-09-21 Thread Ming Lei
On Wed, Sep 20, 2017 at 08:20:58PM +0800, Ming Lei wrote:
> On Tue, Sep 19, 2017 at 02:37:50PM -0600, Jens Axboe wrote:
> > On 09/02/2017 09:17 AM, Ming Lei wrote:
> > > @@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct 
> > > blk_mq_hw_ctx *hctx)
> > >   if (!list_empty(&rq_list)) {
> > >   blk_mq_sched_mark_restart_hctx(hctx);
> > >   do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
> > > - } else if (!has_sched_dispatch) {
> > > + } else if (!has_sched_dispatch && !q->queue_depth) {
> > > + /*
> > > +  * If there is no per-request_queue depth, we
> > > +  * flush all requests in this hw queue; otherwise
> > > +  * we pick up requests one by one from the sw
> > > +  * queue to avoid messing up I/O merging when
> > > +  * dispatch is busy, which can easily be triggered
> > > +  * by the per-request_queue queue depth.
> > > +  */
> > >   blk_mq_flush_busy_ctxs(hctx, &rq_list);
> > >   blk_mq_dispatch_rq_list(q, &rq_list);
> > >   }
> > 
> > I don't like this part at all. It's a heuristic, and on top of that,
> > it's a heuristic based on a weak assumption that if q->queue_depth == 0,
> > then we never run into BUSY constraints. I think that would be stronger
> > if it was based on "is this using shared tags", but even that is not
> > great at all. It's perfectly possible to have a shared tags setup and
> > never run into resource constraints. The opposite condition is also true
> > - you can run without shared tags, and yet hit resource constraints
> > constantly. Hence this is NOT a good test for deciding whether to flush
> > everything at once or not. In short, I think a much better test case
> > would be "has this device ever returned BLK_STS_RESOURCE". As it so
> > happens, that's easy enough to keep track of and base this test on.
> 
> Hi Jens,
> 
> The attached patch follows your suggestion and uses an EWMA to
> compute the average length of hctx->dispatch, then only flushes
> all requests from the ctxs if the average length is zero. What
> do you think about this approach? Or do you have other suggestions?

Hi Jens,

I am going to prepare for V5, as suggested by Omar.

Could you suggest which way you prefer?  Keeping the check on
q->queue_depth, the approach of using the average length of
hctx->dispatch, or something else?
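
For reference, here is a tiny user-space sketch of how the weight-8
EWMA behaves: ewma_add() below mirrors the helper from the attached
patch, while main() and its values are made up purely for illustration.

/*
 * User-space sketch (not kernel code): shows how the average
 * hctx->dispatch length decays back to zero once dispatch stops
 * being busy, which is when flushing all ctxs would be allowed again.
 */
#include <stdio.h>

static void ewma_add(unsigned *ewma, unsigned val, unsigned weight)
{
	unsigned result = *ewma;

	if (val == 1) {
		result += 1;
	} else {
		result *= weight - 1;
		result += val;
		result /= weight;
	}
	*ewma = result;
}

int main(void)
{
	unsigned dispatch_len = 0;
	int i;

	/* busy dispatches leave e.g. 16 requests on hctx->dispatch */
	for (i = 0; i < 4; i++)
		ewma_add(&dispatch_len, 16, 8);
	printf("after busy dispatches: %u\n", dispatch_len);

	/* idle dispatches (val == 0) decay the average back to zero */
	for (i = 0; i < 32; i++)
		ewma_add(&dispatch_len, 0, 8);
	printf("after idle dispatches: %u\n", dispatch_len);

	return 0;
}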


-- 
Ming


Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue

2017-09-20 Thread Ming Lei
On Tue, Sep 19, 2017 at 02:37:50PM -0600, Jens Axboe wrote:
> On 09/02/2017 09:17 AM, Ming Lei wrote:
> > @@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct 
> > blk_mq_hw_ctx *hctx)
> > if (!list_empty(&rq_list)) {
> > blk_mq_sched_mark_restart_hctx(hctx);
> > do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
> > -   } else if (!has_sched_dispatch) {
> > +   } else if (!has_sched_dispatch && !q->queue_depth) {
> > + /*
> > +* If there is no per-request_queue depth, we
> > +* flush all requests in this hw queue; otherwise
> > +* we pick up requests one by one from the sw
> > +* queue to avoid messing up I/O merging when
> > +* dispatch is busy, which can easily be triggered
> > +* by the per-request_queue queue depth.
> > +*/
> > blk_mq_flush_busy_ctxs(hctx, &rq_list);
> > blk_mq_dispatch_rq_list(q, &rq_list);
> > }
> 
> I don't like this part at all. It's a heuristic, and on top of that,
> it's a heuristic based on a weak assumption that if q->queue_depth == 0,
> then we never run into BUSY constraints. I think that would be stronger
> if it was based on "is this using shared tags", but even that is not
> great at all. It's perfectly possible to have a shared tags setup and
> never run into resource constraints. The opposite condition is also true
> - you can run without shared tags, and yet hit resource constraints
> constantly. Hence this is NOT a good test for deciding whether to flush
> everything at once or not. In short, I think a much better test case
> would be "has this device ever returned BLK_STS_RESOURCE". As it so
> happens, that's easy enough to keep track of and base this test on.

Hi Jens,

The attached patch follows your suggestion and uses an EWMA to
compute the average length of hctx->dispatch, then only flushes
all requests from the ctxs if the average length is zero. What
do you think about this approach? Or do you have other suggestions?


diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 7a27f262c96a..c420c775b9c0 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -232,6 +232,14 @@ static int hctx_flags_show(void *data, struct seq_file *m)
return 0;
 }
 
+static int hctx_dispatch_len_show(void *data, struct seq_file *m)
+{
+   struct blk_mq_hw_ctx *hctx = data;
+
+   seq_printf(m, "%u\n", hctx->dispatch_len);
+   return 0;
+}
+
 #define REQ_OP_NAME(name) [REQ_OP_##name] = #name
 static const char *const op_name[] = {
REQ_OP_NAME(READ),
@@ -763,6 +771,7 @@ static const struct blk_mq_debugfs_attr 
blk_mq_debugfs_hctx_attrs[] = {
{"state", 0400, hctx_state_show},
{"flags", 0400, hctx_flags_show},
{"dispatch", 0400, .seq_ops = &hctx_dispatch_seq_ops},
+   {"dispatch_length", 0400, hctx_dispatch_len_show},
{"busy", 0400, hctx_busy_show},
{"ctx_map", 0400, hctx_ctx_map_show},
{"tags", 0400, hctx_tags_show},
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index a5712dd67783..91a6eeaadaf1 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -102,7 +102,7 @@ static void blk_mq_do_dispatch_sched(struct request_queue 
*q,
if (!rq)
break;
list_add(&rq->queuelist, &rq_list);
-   } while (blk_mq_dispatch_rq_list(q, &rq_list));
+   } while (blk_mq_dispatch_rq_list(q, &rq_list, 1));
 }
 
 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -134,19 +134,51 @@ static void blk_mq_do_dispatch_ctx(struct request_queue 
*q,
/* round robin for fair dispatch */
ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
 
-   dispatched = blk_mq_dispatch_rq_list(q, &rq_list);
+   dispatched = blk_mq_dispatch_rq_list(q, &rq_list, 1);
} while (dispatched);
 
if (!dispatched)
WRITE_ONCE(hctx->dispatch_from, ctx);
 }
 
+/* borrowed from bcache */
+void ewma_add(unsigned *ewma, unsigned val, unsigned weight)
+{
+   unsigned  result = READ_ONCE(*ewma);
+
+   if (val == 1) {
+   result += 1;
+   } else {
+   result *= weight - 1;
+   result += val;
+   result /= weight;
+   }
+   WRITE_ONCE(*ewma, result);
+}
+
+/*
+ * We use an EWMA to compute the average length of the dispatch list.
+ * It is totally lockless, but we can survive that since it
+ * is just a hint.
+ *
+ * We only flush requests from all ctxs if the average length
+ * of the dispatch list is zero; otherwise the hw queue can be
+ * considered busy and we dequeue requests from the ctxs one by
+ * one.
+ */
+static void blk_mq_update_dispatch_length(struct blk_mq_hw_ctx *hctx,
+ unsigned cnt)
+{
+   ewma_add(&hctx->dispatch_len, cnt, 8);
+}
+
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
struct request_queue *q = hctx->queue;
struct 

Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue

2017-09-19 Thread Ming Lei
Hi Jens,

Thanks for your comment!

On Tue, Sep 19, 2017 at 02:37:50PM -0600, Jens Axboe wrote:
> On 09/02/2017 09:17 AM, Ming Lei wrote:
> > @@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct 
> > blk_mq_hw_ctx *hctx)
> > if (!list_empty(_list)) {
> > blk_mq_sched_mark_restart_hctx(hctx);
> > do_sched_dispatch = blk_mq_dispatch_rq_list(q, _list);
> > -   } else if (!has_sched_dispatch) {
> > +   } else if (!has_sched_dispatch && !q->queue_depth) {
> > +   /*
> > +* If there is no per-request_queue depth, we
> > +* flush all requests in this hw queue, otherwise
> > +* pick up request one by one from sw queue for
> > +* avoiding to mess up I/O merge when dispatch
> > +* is busy, which can be triggered easily by
> > +* per-request_queue queue depth
> > +*/
> > blk_mq_flush_busy_ctxs(hctx, _list);
> > blk_mq_dispatch_rq_list(q, _list);
> > }
> 
> I don't like this part at all. It's a heuristic, and on top of that,

Yeah, it is a heuristic for improving SCSI-MQ's performance.

> it's a heuristic based on a weak assumption that if q->queue_depth == 0,

Only SCSI sets q->queue_depth, which means all hw queues share
the global queue depth of q->queue_depth; once the total number
of in-flight requests from all hctxs is bigger than q->queue_depth,
the queue becomes BUSY.
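
Roughly speaking, the effect looks like the simplified user-space
model below (this is just an illustration of the shared depth limit,
not the real SCSI accounting code):

/*
 * Simplified model: all hw queues of a request_queue share one
 * in-flight counter, and any dispatch beyond the per-LUN depth
 * gets a BUSY result, which in the kernel is BLK_STS_RESOURCE.
 */
#include <stdio.h>
#include <stdbool.h>

#define QUEUE_DEPTH	3	/* e.g. a small .cmd_per_lun */

static unsigned int inflight;	/* shared by all hw queues */

static bool queue_rq(int hctx, int rq)
{
	if (inflight >= QUEUE_DEPTH) {
		printf("hctx%d: rq%d -> BUSY (BLK_STS_RESOURCE)\n", hctx, rq);
		return false;
	}
	inflight++;
	printf("hctx%d: rq%d -> queued (inflight=%u)\n", hctx, rq, inflight);
	return true;
}

int main(void)
{
	int rq;

	/* flushing a big batch from the sw queues quickly hits the limit */
	for (rq = 0; rq < 6; rq++)
		queue_rq(rq % 2, rq);
	return 0;
}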

> then we never run into BUSY constraints. I think that would be stronger
> if it was based on "is this using shared tags", but even that is not

IMO, it isn't related to shared tags; for example, SATA's
performance can be affected by SCSI_MQ, which uses single (non-shared) tags.

The root cause is the limits of q->queue_depth and SCSI's
.cmd_per_lun inside SCSI, which cause the device to return
BLK_STS_RESOURCE because the IO scheduler queue depth is usually
bigger than q->queue_depth, and it can get quite a bit worse
in the case of shared tags.

> great at all. It's perfectly possible to have a shared tags setup and
> never run into resource constraints. The opposite condition is also true

As I explained, it isn't related to shared tags: either shared tags
or single tags may run into resource constraints. SCSI has both
shared and single tags cases, and both have this issue if
q->queue_depth is set.

> - you can run without shared tags, and yet hit resource constraints
> constantly. Hence this is NOT a good test for deciding whether to flush
> everything at once or not. In short, I think a much better test case
> would be "has this device ever returned BLK_STS_RESOURCE". As it so
> happens, that's easy enough to keep track of and base this test on.

OK, it looks like we can try flushing all first, and switch to picking
requests one by one if the device has ever returned BLK_STS_RESOURCE.
Are you fine with this way? Or do you have a better suggestion?
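
Something along the lines of the sketch below (user-space model with
hypothetical names, not a tested kernel patch): remember whether the
driver has ever returned BLK_STS_RESOURCE for the hw queue, and once
it has, stop flushing all sw queues and dispatch one request at a time.

#include <stdio.h>
#include <stdbool.h>

struct hctx_model {
	bool ever_busy;		/* set once a dispatch saw BLK_STS_RESOURCE */
};

static void note_dispatch_result(struct hctx_model *hctx, bool got_resource)
{
	if (got_resource)
		hctx->ever_busy = true;
}

static const char *pick_dispatch_mode(const struct hctx_model *hctx)
{
	return hctx->ever_busy ? "one by one from sw queues"
			       : "flush all sw queues";
}

int main(void)
{
	struct hctx_model hctx = { .ever_busy = false };

	printf("initial mode: %s\n", pick_dispatch_mode(&hctx));
	note_dispatch_result(&hctx, true);	/* driver returned BLK_STS_RESOURCE */
	printf("after BUSY:   %s\n", pick_dispatch_mode(&hctx));
	return 0;
}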

> 
> Additionally, this further differentiates dispatch cases. I'd much
> rather just have one sane dispatch case. I realize we are balancing
> between performance/scalability and practical cases like merging here,
> but it should not be impossible to do.

Yeah, I agree, but neither flush-all nor pick-one-by-one is invented
by this patch. :-)

-- 
Ming


Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue

2017-09-19 Thread Jens Axboe
On 09/02/2017 09:17 AM, Ming Lei wrote:
> @@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct 
> blk_mq_hw_ctx *hctx)
>   if (!list_empty(&rq_list)) {
>   blk_mq_sched_mark_restart_hctx(hctx);
>   do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
> - } else if (!has_sched_dispatch) {
> + } else if (!has_sched_dispatch && !q->queue_depth) {
> + /*
> +  * If there is no per-request_queue depth, we
> +  * flush all requests in this hw queue; otherwise
> +  * we pick up requests one by one from the sw
> +  * queue to avoid messing up I/O merging when
> +  * dispatch is busy, which can easily be triggered
> +  * by the per-request_queue queue depth.
> +  */
>   blk_mq_flush_busy_ctxs(hctx, &rq_list);
>   blk_mq_dispatch_rq_list(q, &rq_list);
>   }

I don't like this part at all. It's a heuristic, and on top of that,
it's a heuristic based on a weak assumption that if q->queue_depth == 0,
then we never run into BUSY constraints. I think that would be stronger
if it was based on "is this using shared tags", but even that is not
great at all. It's perfectly possible to have a shared tags setup and
never run into resource constraints. The opposite condition is also true
- you can run without shared tags, and yet hit resource constraints
constantly. Hence this is NOT a good test for deciding whether to flush
everything at once or not. In short, I think a much better test case
would be "has this device ever returned BLK_STS_RESOURCE". As it so
happens, that's easy enough to keep track of and base this test on.

Additionally, this further differentiates dispatch cases. I'd much
rather just have one sane dispatch case. I realize we are balancing
between performance/scalability and practical cases like merging here,
but it should not be impossible to do.

-- 
Jens Axboe


Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue

2017-09-13 Thread Omar Sandoval
On Mon, Sep 11, 2017 at 12:13:09PM +0800, Ming Lei wrote:
> On Sun, Sep 10, 2017 at 10:38:33AM -0700, Omar Sandoval wrote:
> > On Sun, Sep 10, 2017 at 12:45:15PM +0800, Ming Lei wrote:
> > > On Fri, Sep 08, 2017 at 04:54:39PM -0700, Omar Sandoval wrote:
> > > > On Sat, Sep 02, 2017 at 11:17:20PM +0800, Ming Lei wrote:
> > > > > SCSI devices use a host-wide tagset, and the shared
> > > > > driver tag space is often quite big. Meanwhile there
> > > > > is also a queue depth for each LUN (.cmd_per_lun),
> > > > > which is often small.
> > > > > 
> > > > > So lots of requests may stay in the sw queue, and we
> > > > > always flush all of those belonging to the same hw
> > > > > queue and dispatch them all to the driver; unfortunately
> > > > > it is easy to make the queue busy because of the small
> > > > > per-LUN queue depth. Once these requests are flushed
> > > > > out, they have to stay in hctx->dispatch, no bio merge
> > > > > can happen against these requests any more, and
> > > > > sequential IO performance is hurt.
> > > > > 
> > > > > This patch improves dispatching from the sw queue when
> > > > > there is a per-request-queue queue depth, by taking
> > > > > requests one by one from the sw queue, just like the
> > > > > way of an IO scheduler.
> > > > > 
> > > > > Reviewed-by: Bart Van Assche 
> > > > > Signed-off-by: Ming Lei 
> > > > > ---
> > > > >  block/blk-mq-sched.c   | 61 
> > > > > +-
> > > > >  include/linux/blk-mq.h |  2 ++
> > > > >  2 files changed, 57 insertions(+), 6 deletions(-)
> > > > > 
> > > > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > > > index f69752961a34..735e432294ab 100644
> > > > > --- a/block/blk-mq-sched.c
> > > > > +++ b/block/blk-mq-sched.c
> > > > > @@ -89,9 +89,9 @@ static bool blk_mq_sched_restart_hctx(struct 
> > > > > blk_mq_hw_ctx *hctx)
> > > > >   return false;
> > > > >  }
> > > > >  
> > > > > -static void blk_mq_do_dispatch(struct request_queue *q,
> > > > > -struct elevator_queue *e,
> > > > > -struct blk_mq_hw_ctx *hctx)
> > > > > +static void blk_mq_do_dispatch_sched(struct request_queue *q,
> > > > > +  struct elevator_queue *e,
> > > > > +  struct blk_mq_hw_ctx *hctx)
> > > > >  {
> > > > >   LIST_HEAD(rq_list);
> > > > >  
> > > > > @@ -105,6 +105,42 @@ static void blk_mq_do_dispatch(struct 
> > > > > request_queue *q,
> > > > >   } while (blk_mq_dispatch_rq_list(q, &rq_list));
> > > > >  }
> > > > >  
> > > > > +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > > > +   struct blk_mq_ctx *ctx)
> > > > > +{
> > > > > + unsigned idx = ctx->index_hw;
> > > > > +
> > > > > + if (++idx == hctx->nr_ctx)
> > > > > + idx = 0;
> > > > > +
> > > > > + return hctx->ctxs[idx];
> > > > > +}
> > > > > +
> > > > > +static void blk_mq_do_dispatch_ctx(struct request_queue *q,
> > > > > +struct blk_mq_hw_ctx *hctx)
> > > > > +{
> > > > > + LIST_HEAD(rq_list);
> > > > > + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > > > > + bool dispatched;
> > > > > +
> > > > > + do {
> > > > > + struct request *rq;
> > > > > +
> > > > > + rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx);
> > > > > + if (!rq)
> > > > > + break;
> > > > > + list_add(&rq->queuelist, &rq_list);
> > > > > +
> > > > > + /* round robin for fair dispatch */
> > > > > + ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> > > > 
> > > > Hm... this next ctx will get skipped if the dispatch on the previous ctx
> > > > fails, since we call blk_mq_next_ctx() again. Seems unfair. Maybe move
> > > > the blk_mq_next_ctx() from the if (!dispatched) below into the if (!rq)
> > > > above?
> > > 
> > > In the (!rq) case, that means there isn't any request in any ctx
> > > belonging to this hctx, so it is reasonable to start the dispatch from
> > > any one of these ctxs next time, including the next one.
> > 
> > Yep, that case is okay.
> > 
> > > If dispatch fails on the previous ctx, the rq from that ctx will be
> > > put into ->dispatch, so it is fair to start dispatch from the next
> > > ctx next time too.
> > 
> > I'm talking about this case
> > 
> > LIST_HEAD(rq_list);
> > struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > bool dispatched;
> >  
> > /*
> >  * Let's say that ctxs 0, 1, and 2 all have requests pending and
> >  * hctx->dispatch_from was ctx0, so ctx is ctx0 when we start.
> >  */
> > do {
> > struct request *rq;
> >  
> > rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx);
> > if (!rq)
> > break;
> > list_add(&rq->queuelist, &rq_list);
> > 
> > /* Now rq is a request from ctx0 */
> > 
> >  

Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue

2017-09-10 Thread Omar Sandoval
On Sun, Sep 10, 2017 at 12:45:15PM +0800, Ming Lei wrote:
> On Fri, Sep 08, 2017 at 04:54:39PM -0700, Omar Sandoval wrote:
> > On Sat, Sep 02, 2017 at 11:17:20PM +0800, Ming Lei wrote:
> > > SCSI devices use a host-wide tagset, and the shared
> > > driver tag space is often quite big. Meanwhile there
> > > is also a queue depth for each LUN (.cmd_per_lun),
> > > which is often small.
> > > 
> > > So lots of requests may stay in the sw queue, and we
> > > always flush all of those belonging to the same hw
> > > queue and dispatch them all to the driver; unfortunately
> > > it is easy to make the queue busy because of the small
> > > per-LUN queue depth. Once these requests are flushed
> > > out, they have to stay in hctx->dispatch, no bio merge
> > > can happen against these requests any more, and
> > > sequential IO performance is hurt.
> > > 
> > > This patch improves dispatching from the sw queue when
> > > there is a per-request-queue queue depth, by taking
> > > requests one by one from the sw queue, just like the
> > > way of an IO scheduler.
> > > 
> > > Reviewed-by: Bart Van Assche 
> > > Signed-off-by: Ming Lei 
> > > ---
> > >  block/blk-mq-sched.c   | 61 
> > > +-
> > >  include/linux/blk-mq.h |  2 ++
> > >  2 files changed, 57 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > > index f69752961a34..735e432294ab 100644
> > > --- a/block/blk-mq-sched.c
> > > +++ b/block/blk-mq-sched.c
> > > @@ -89,9 +89,9 @@ static bool blk_mq_sched_restart_hctx(struct 
> > > blk_mq_hw_ctx *hctx)
> > >   return false;
> > >  }
> > >  
> > > -static void blk_mq_do_dispatch(struct request_queue *q,
> > > -struct elevator_queue *e,
> > > -struct blk_mq_hw_ctx *hctx)
> > > +static void blk_mq_do_dispatch_sched(struct request_queue *q,
> > > +  struct elevator_queue *e,
> > > +  struct blk_mq_hw_ctx *hctx)
> > >  {
> > >   LIST_HEAD(rq_list);
> > >  
> > > @@ -105,6 +105,42 @@ static void blk_mq_do_dispatch(struct request_queue 
> > > *q,
> > >   } while (blk_mq_dispatch_rq_list(q, &rq_list));
> > >  }
> > >  
> > > +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > > +   struct blk_mq_ctx *ctx)
> > > +{
> > > + unsigned idx = ctx->index_hw;
> > > +
> > > + if (++idx == hctx->nr_ctx)
> > > + idx = 0;
> > > +
> > > + return hctx->ctxs[idx];
> > > +}
> > > +
> > > +static void blk_mq_do_dispatch_ctx(struct request_queue *q,
> > > +struct blk_mq_hw_ctx *hctx)
> > > +{
> > > + LIST_HEAD(rq_list);
> > > + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > > + bool dispatched;
> > > +
> > > + do {
> > > + struct request *rq;
> > > +
> > > + rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx);
> > > + if (!rq)
> > > + break;
> > > + list_add(&rq->queuelist, &rq_list);
> > > +
> > > + /* round robin for fair dispatch */
> > > + ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> > 
> > Hm... this next ctx will get skipped if the dispatch on the previous ctx
> > fails, since we call blk_mq_next_ctx() again. Seems unfair. Maybe move
> > the blk_mq_next_ctx() from the if (!dispatched) below into the if (!rq)
> > above?
> 
> In the (!rq) case, that means there isn't any request in any ctx
> belonging to this hctx, so it is reasonable to start the dispatch from
> any one of these ctxs next time, including the next one.

Yep, that case is okay.

> If dispatch fails on the previous ctx, the rq from that ctx will be
> put into ->dispatch, so it is fair to start dispatch from the next
> ctx next time too.

I'm talking about this case

LIST_HEAD(rq_list);
struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
bool dispatched;
 
/*
 * Let's say that ctxs 0, 1, and 2 all have requests pending and
 * hctx->dispatch_from was ctx0, so ctx is ctx0 when we start.
 */
do {
struct request *rq;
 
rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx);
if (!rq)
break;
list_add(&rq->queuelist, &rq_list);

/* Now rq is a request from ctx0 */

/* round robin for fair dispatch */
ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
/* Now ctx is ctx1. */
 
dispatched = blk_mq_dispatch_rq_list(q, &rq_list);

/* If we couldn't dispatch, we break here. */
} while (dispatched);
 
if (!dispatched)
/*
 * Now we set hctx->dispatch_from to ctx2, so we've
 * skipped over ctx1.
 */
WRITE_ONCE(hctx->dispatch_from, blk_mq_next_ctx(hctx, ctx));
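
A small stand-alone illustration of that skip (simplified model of the
round robin only, not the blk-mq code):

#include <stdio.h>

#define NR_CTX	3

static unsigned next_ctx(unsigned idx)
{
	return (idx + 1) % NR_CTX;	/* same wrap-around as blk_mq_next_ctx() */
}

int main(void)
{
	unsigned dispatch_from = 0;	/* hctx->dispatch_from points at ctx0 */
	unsigned ctx = dispatch_from;

	/* take a request from ctx0, then advance for round robin */
	ctx = next_ctx(ctx);			/* now ctx1 */

	/* ctx0's request fails to dispatch, so the loop stops here, */
	/* and the current code advances once more before saving */
	dispatch_from = next_ctx(ctx);		/* saved as ctx2, skipping ctx1 */

	printf("next dispatch starts from ctx%u\n", dispatch_from);
	return 0;
}

So after a failed dispatch, the saved dispatch_from has moved two ctxs
forward even though only ctx0's request was actually taken.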


Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue

2017-09-09 Thread Ming Lei
On Fri, Sep 08, 2017 at 04:54:39PM -0700, Omar Sandoval wrote:
> On Sat, Sep 02, 2017 at 11:17:20PM +0800, Ming Lei wrote:
> > SCSI devices use a host-wide tagset, and the shared
> > driver tag space is often quite big. Meanwhile there
> > is also a queue depth for each LUN (.cmd_per_lun),
> > which is often small.
> > 
> > So lots of requests may stay in the sw queue, and we
> > always flush all of those belonging to the same hw
> > queue and dispatch them all to the driver; unfortunately
> > it is easy to make the queue busy because of the small
> > per-LUN queue depth. Once these requests are flushed
> > out, they have to stay in hctx->dispatch, no bio merge
> > can happen against these requests any more, and
> > sequential IO performance is hurt.
> > 
> > This patch improves dispatching from the sw queue when
> > there is a per-request-queue queue depth, by taking
> > requests one by one from the sw queue, just like the
> > way of an IO scheduler.
> > 
> > Reviewed-by: Bart Van Assche 
> > Signed-off-by: Ming Lei 
> > ---
> >  block/blk-mq-sched.c   | 61 
> > +-
> >  include/linux/blk-mq.h |  2 ++
> >  2 files changed, 57 insertions(+), 6 deletions(-)
> > 
> > diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> > index f69752961a34..735e432294ab 100644
> > --- a/block/blk-mq-sched.c
> > +++ b/block/blk-mq-sched.c
> > @@ -89,9 +89,9 @@ static bool blk_mq_sched_restart_hctx(struct 
> > blk_mq_hw_ctx *hctx)
> > return false;
> >  }
> >  
> > -static void blk_mq_do_dispatch(struct request_queue *q,
> > -  struct elevator_queue *e,
> > -  struct blk_mq_hw_ctx *hctx)
> > +static void blk_mq_do_dispatch_sched(struct request_queue *q,
> > +struct elevator_queue *e,
> > +struct blk_mq_hw_ctx *hctx)
> >  {
> > LIST_HEAD(rq_list);
> >  
> > @@ -105,6 +105,42 @@ static void blk_mq_do_dispatch(struct request_queue *q,
> > } while (blk_mq_dispatch_rq_list(q, &rq_list));
> >  }
> >  
> > +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> > + struct blk_mq_ctx *ctx)
> > +{
> > +   unsigned idx = ctx->index_hw;
> > +
> > +   if (++idx == hctx->nr_ctx)
> > +   idx = 0;
> > +
> > +   return hctx->ctxs[idx];
> > +}
> > +
> > +static void blk_mq_do_dispatch_ctx(struct request_queue *q,
> > +  struct blk_mq_hw_ctx *hctx)
> > +{
> > +   LIST_HEAD(rq_list);
> > +   struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> > +   bool dispatched;
> > +
> > +   do {
> > +   struct request *rq;
> > +
> > +   rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx);
> > +   if (!rq)
> > +   break;
> > +   list_add(&rq->queuelist, &rq_list);
> > +
> > +   /* round robin for fair dispatch */
> > +   ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
> 
> Hm... this next ctx will get skipped if the dispatch on the previous ctx
> fails, since we call blk_mq_next_ctx() again. Seems unfair. Maybe move
> the blk_mq_next_ctx() from the if (!dispatched) below into the if (!rq)
> above?

In the (!rq) case, that means there isn't any request in any ctx
belonging to this hctx, so it is reasonable to start the dispatch from
any one of these ctxs next time, including the next one.

If dispatch fails on the previous ctx, the rq from that ctx will be
put into ->dispatch, so it is fair to start dispatch from the next
ctx next time too.


-- 
Ming


Re: [PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue

2017-09-08 Thread Omar Sandoval
On Sat, Sep 02, 2017 at 11:17:20PM +0800, Ming Lei wrote:
> SCSI devices use a host-wide tagset, and the shared
> driver tag space is often quite big. Meanwhile there
> is also a queue depth for each LUN (.cmd_per_lun),
> which is often small.
> 
> So lots of requests may stay in the sw queue, and we
> always flush all of those belonging to the same hw
> queue and dispatch them all to the driver; unfortunately
> it is easy to make the queue busy because of the small
> per-LUN queue depth. Once these requests are flushed
> out, they have to stay in hctx->dispatch, no bio merge
> can happen against these requests any more, and
> sequential IO performance is hurt.
> 
> This patch improves dispatching from the sw queue when
> there is a per-request-queue queue depth, by taking
> requests one by one from the sw queue, just like the
> way of an IO scheduler.
> 
> Reviewed-by: Bart Van Assche 
> Signed-off-by: Ming Lei 
> ---
>  block/blk-mq-sched.c   | 61 
> +-
>  include/linux/blk-mq.h |  2 ++
>  2 files changed, 57 insertions(+), 6 deletions(-)
> 
> diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
> index f69752961a34..735e432294ab 100644
> --- a/block/blk-mq-sched.c
> +++ b/block/blk-mq-sched.c
> @@ -89,9 +89,9 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx 
> *hctx)
>   return false;
>  }
>  
> -static void blk_mq_do_dispatch(struct request_queue *q,
> -struct elevator_queue *e,
> -struct blk_mq_hw_ctx *hctx)
> +static void blk_mq_do_dispatch_sched(struct request_queue *q,
> +  struct elevator_queue *e,
> +  struct blk_mq_hw_ctx *hctx)
>  {
>   LIST_HEAD(rq_list);
>  
> @@ -105,6 +105,42 @@ static void blk_mq_do_dispatch(struct request_queue *q,
>   } while (blk_mq_dispatch_rq_list(q, _list));
>  }
>  
> +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
> +   struct blk_mq_ctx *ctx)
> +{
> + unsigned idx = ctx->index_hw;
> +
> + if (++idx == hctx->nr_ctx)
> + idx = 0;
> +
> + return hctx->ctxs[idx];
> +}
> +
> +static void blk_mq_do_dispatch_ctx(struct request_queue *q,
> +struct blk_mq_hw_ctx *hctx)
> +{
> + LIST_HEAD(rq_list);
> + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
> + bool dispatched;
> +
> + do {
> + struct request *rq;
> +
> + rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx);
> + if (!rq)
> + break;
> + list_add(&rq->queuelist, &rq_list);
> +
> + /* round robin for fair dispatch */
> + ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

Hm... this next ctx will get skipped if the dispatch on the previous ctx
fails, since we call blk_mq_next_ctx() again. Seems unfair. Maybe move
the blk_mq_next_ctx() from the if (!dispatched) below into the if (!rq)
above?

> + dispatched = blk_mq_dispatch_rq_list(q, &rq_list);
> + } while (dispatched);
> +
> + if (!dispatched)
> + WRITE_ONCE(hctx->dispatch_from, blk_mq_next_ctx(hctx, ctx));
> +}
> +
>  void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
>  {
>   struct request_queue *q = hctx->queue;
> @@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct 
> blk_mq_hw_ctx *hctx)
> if (!list_empty(&rq_list)) {
> blk_mq_sched_mark_restart_hctx(hctx);
> do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
> - } else if (!has_sched_dispatch) {
> + } else if (!has_sched_dispatch && !q->queue_depth) {
> + /*
> +  * If there is no per-request_queue depth, we
> +  * flush all requests in this hw queue; otherwise
> +  * we pick up requests one by one from the sw
> +  * queue to avoid messing up I/O merging when
> +  * dispatch is busy, which can easily be triggered
> +  * by the per-request_queue queue depth.
> +  */
>   blk_mq_flush_busy_ctxs(hctx, &rq_list);
>   blk_mq_dispatch_rq_list(q, &rq_list);
>   }
>  
> + if (!do_sched_dispatch)
> + return;
> +
>   /*
>* We want to dispatch from the scheduler if we had no work left
>* on the dispatch list, OR if we did have work but weren't able
>* to make progress.
>*/
> - if (do_sched_dispatch && has_sched_dispatch)
> - blk_mq_do_dispatch(q, e, hctx);
> + if (has_sched_dispatch)
> + blk_mq_do_dispatch_sched(q, e, hctx);
> + else
> + blk_mq_do_dispatch_ctx(q, hctx);
>  }
>  
>  bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 50c6485cb04f..7b7a366a97f3 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -30,6 

[PATCH V4 05/14] blk-mq-sched: improve dispatching from sw queue

2017-09-02 Thread Ming Lei
SCSI devices use a host-wide tagset, and the shared
driver tag space is often quite big. Meanwhile there
is also a queue depth for each LUN (.cmd_per_lun),
which is often small.

So lots of requests may stay in the sw queue, and we
always flush all of those belonging to the same hw
queue and dispatch them all to the driver; unfortunately
it is easy to make the queue busy because of the small
per-LUN queue depth. Once these requests are flushed
out, they have to stay in hctx->dispatch, no bio merge
can happen against these requests any more, and
sequential IO performance is hurt.

This patch improves dispatching from the sw queue when
there is a per-request-queue queue depth, by taking
requests one by one from the sw queue, just like the
way of an IO scheduler.

Reviewed-by: Bart Van Assche 
Signed-off-by: Ming Lei 
---
 block/blk-mq-sched.c   | 61 +-
 include/linux/blk-mq.h |  2 ++
 2 files changed, 57 insertions(+), 6 deletions(-)

diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index f69752961a34..735e432294ab 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -89,9 +89,9 @@ static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx 
*hctx)
return false;
 }
 
-static void blk_mq_do_dispatch(struct request_queue *q,
-  struct elevator_queue *e,
-  struct blk_mq_hw_ctx *hctx)
+static void blk_mq_do_dispatch_sched(struct request_queue *q,
+struct elevator_queue *e,
+struct blk_mq_hw_ctx *hctx)
 {
LIST_HEAD(rq_list);
 
@@ -105,6 +105,42 @@ static void blk_mq_do_dispatch(struct request_queue *q,
} while (blk_mq_dispatch_rq_list(q, &rq_list));
 }
 
+static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *ctx)
+{
+   unsigned idx = ctx->index_hw;
+
+   if (++idx == hctx->nr_ctx)
+   idx = 0;
+
+   return hctx->ctxs[idx];
+}
+
+static void blk_mq_do_dispatch_ctx(struct request_queue *q,
+  struct blk_mq_hw_ctx *hctx)
+{
+   LIST_HEAD(rq_list);
+   struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+   bool dispatched;
+
+   do {
+   struct request *rq;
+
+   rq = blk_mq_dispatch_rq_from_ctx(hctx, ctx);
+   if (!rq)
+   break;
+   list_add(&rq->queuelist, &rq_list);
+
+   /* round robin for fair dispatch */
+   ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
+
+   dispatched = blk_mq_dispatch_rq_list(q, &rq_list);
+   } while (dispatched);
+
+   if (!dispatched)
+   WRITE_ONCE(hctx->dispatch_from, blk_mq_next_ctx(hctx, ctx));
+}
+
 void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
struct request_queue *q = hctx->queue;
@@ -142,18 +178,31 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx 
*hctx)
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
do_sched_dispatch = blk_mq_dispatch_rq_list(q, &rq_list);
-   } else if (!has_sched_dispatch) {
+   } else if (!has_sched_dispatch && !q->queue_depth) {
+   /*
+* If there is no per-request_queue depth, we
+* flush all requests in this hw queue; otherwise
+* we pick up requests one by one from the sw
+* queue to avoid messing up I/O merging when
+* dispatch is busy, which can easily be triggered
+* by the per-request_queue queue depth.
+*/
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(q, &rq_list);
}
 
+   if (!do_sched_dispatch)
+   return;
+
/*
 * We want to dispatch from the scheduler if we had no work left
 * on the dispatch list, OR if we did have work but weren't able
 * to make progress.
 */
-   if (do_sched_dispatch && has_sched_dispatch)
-   blk_mq_do_dispatch(q, e, hctx);
+   if (has_sched_dispatch)
+   blk_mq_do_dispatch_sched(q, e, hctx);
+   else
+   blk_mq_do_dispatch_ctx(q, hctx);
 }
 
 bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 50c6485cb04f..7b7a366a97f3 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -30,6 +30,8 @@ struct blk_mq_hw_ctx {
 
struct sbitmap  ctx_map;
 
+   struct blk_mq_ctx   *dispatch_from;
+
struct blk_mq_ctx   **ctxs;
unsigned intnr_ctx;
 
-- 
2.9.5