Recently the blk-mq timeout handling code was reworked so that
completions that occur while a timeout handler is in progress are no
longer ignored. However, that rework also removed the protection
against completions that occur while a timeout handler is in progress.
Fix this by introducing a new request state, MQ_RQ_TIMED_OUT, which
indicates that a timeout handler is in progress.
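
The resulting per-request state machine is small. A self-contained C
sketch of the legal transitions (illustration only; the helper name is
made up here, and the transitions mirror the kernel-doc added to
<linux/blkdev.h> below):

    #include <stdbool.h>

    enum mq_rq_state { MQ_RQ_IDLE, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE,
                       MQ_RQ_TIMED_OUT };

    /* Returns true if @old_state -> @new_state is a legal transition. */
    static bool mq_rq_transition_is_legal(enum mq_rq_state old_state,
                                          enum mq_rq_state new_state)
    {
            switch (old_state) {
            case MQ_RQ_IDLE:        /* blk_mq_start_request() */
                    return new_state == MQ_RQ_IN_FLIGHT;
            case MQ_RQ_IN_FLIGHT:   /* completion, timeout, requeue, free */
                    return new_state == MQ_RQ_COMPLETE ||
                           new_state == MQ_RQ_TIMED_OUT ||
                           new_state == MQ_RQ_IDLE;
            case MQ_RQ_TIMED_OUT:   /* completion, timer reset, requeue, free */
                    return new_state == MQ_RQ_COMPLETE ||
                           new_state == MQ_RQ_IN_FLIGHT ||
                           new_state == MQ_RQ_IDLE;
            case MQ_RQ_COMPLETE:    /* requeue or free */
                    return new_state == MQ_RQ_IDLE;
            }
            return false;
    }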

Other changes in this patch are:
- Reintroduce the request generation number so that request timeout
  handling no longer happens for a request that has already completed.
- Reintroduce deadline_seq so that the request deadline can be read and
  updated without atomic instructions.
- Remove the request state information that this patch makes
  superfluous, namely the RQF_TIMED_OUT flag and the ref member.
- Use atomic instructions for request state updates only if a
  concurrent request state change could be in progress (see the sketch
  below).
- Move the code for managing the request state back from the SCSI
  core into the block layer core.
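
The race between a completion and a timeout is resolved by a
compare-and-swap on a single 32-bit word that combines the generation
number and the request state. A minimal, self-contained userspace sketch
of that idea (the union mirrors blk_generation_and_state from this
patch; the helper and its name are illustrative only):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    union gen_state {
            struct {
                    uint32_t generation:30; /* bumped when a request is started */
                    uint32_t state:2;       /* one of the MQ_RQ_* values */
            };
            uint32_t val;
    };

    /*
     * Change the state only if the combined generation/state word still
     * holds the expected value, i.e. only if no concurrent transition
     * (e.g. a completion racing with a timeout) won first.
     */
    static bool try_change_state(_Atomic uint32_t *gstate,
                                 union gen_state old_val,
                                 union gen_state new_val)
    {
            return atomic_compare_exchange_strong(gstate, &old_val.val,
                                                  new_val.val);
    }

In the patch itself the cmpxchg() variant is only used for transitions
out of the in-flight and timed-out states; the other transitions cannot
race and use WRITE_ONCE().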

This patch reverts the following two commits:
* 0fc09f920983 ("blk-mq: export setting request completion state")
* 065990bd198e ("scsi: set timed out out mq requests to complete")

Signed-off-by: Bart Van Assche <bart.vanass...@wdc.com>
Cc: Martin K. Petersen <martin.peter...@oracle.com>
Cc: Christoph Hellwig <h...@lst.de>
Cc: Keith Busch <keith.bu...@intel.com>
Cc: Jianchao Wang <jianchao.w.w...@oracle.com>
Cc: Ming Lei <ming....@redhat.com>
---
 block/blk-core.c          |   3 +
 block/blk-mq-debugfs.c    |   3 +-
 block/blk-mq.c            | 280 ++++++++++++++++++++++++++++----------
 block/blk-mq.h            |  10 +-
 block/blk-timeout.c       |  28 ++--
 drivers/scsi/scsi_error.c |  14 --
 include/linux/blk-mq.h    |  14 --
 include/linux/blkdev.h    |  49 +++++--
 8 files changed, 281 insertions(+), 120 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 03a4ea93a5f3..3c63e410ede4 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -198,6 +198,9 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
        rq->internal_tag = -1;
        rq->start_time_ns = ktime_get_ns();
        rq->part = NULL;
+#ifndef CONFIG_64BIT
+       seqcount_init(&rq->deadline_seq);
+#endif
 }
 EXPORT_SYMBOL(blk_rq_init);
 
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index cb1e6cf7ac48..72abc9a87c53 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -335,8 +335,9 @@ static const char *const rqf_name[] = {
 
 static const char *const blk_mq_rq_state_name_array[] = {
        [MQ_RQ_IDLE]            = "idle",
-       [MQ_RQ_IN_FLIGHT]       = "in_flight",
+       [MQ_RQ_IN_FLIGHT]       = "in flight",
        [MQ_RQ_COMPLETE]        = "complete",
+       [MQ_RQ_TIMED_OUT]       = "timed out",
 };
 
 static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 1b49973629f6..a97ab5ba9d18 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -6,6 +6,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/atomic.h>      /* cmpxchg() */
 #include <linux/backing-dev.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
@@ -322,6 +323,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 
        INIT_LIST_HEAD(&rq->timeout_list);
        rq->timeout = 0;
+       WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
 
        rq->end_io = NULL;
        rq->end_io_data = NULL;
@@ -332,7 +334,6 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 #endif
 
        data->ctx->rq_dispatched[op_is_sync(op)]++;
-       refcount_set(&rq->ref, 1);
        return rq;
 }
 
@@ -466,19 +467,42 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
 
-static void __blk_mq_free_request(struct request *rq)
+/**
+ * blk_mq_change_rq_state - atomically test and set request state
+ * @rq: Request pointer.
+ * @old_state: Old request state.
+ * @new_state: New request state.
+ *
+ * Returns %true if and only if the old state was @old_state and if the state
+ * has been changed into @new_state.
+ */
+static bool blk_mq_change_rq_state(struct request *rq,
+                                  enum mq_rq_state old_state,
+                                  enum mq_rq_state new_state)
 {
-       struct request_queue *q = rq->q;
-       struct blk_mq_ctx *ctx = rq->mq_ctx;
-       struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
-       const int sched_tag = rq->internal_tag;
+       union blk_generation_and_state old_val = READ_ONCE(rq->gstate);
+       union blk_generation_and_state new_val = old_val;
 
-       if (rq->tag != -1)
-               blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
-       if (sched_tag != -1)
-               blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
-       blk_mq_sched_restart(hctx);
-       blk_queue_exit(q);
+       old_val.state = old_state;
+       new_val.state = new_state;
+       if (new_state == MQ_RQ_IN_FLIGHT)
+               new_val.generation++;
+       /*
+        * For transitions from state in-flight or timed out to another state
+        * cmpxchg() must be used. For other state transitions it is safe to
+        * use WRITE_ONCE().
+        */
+       switch (old_state) {
+       case MQ_RQ_IN_FLIGHT:
+       case MQ_RQ_TIMED_OUT:
+               return blk_mq_set_rq_state(rq, old_val, new_val);
+       case MQ_RQ_IDLE:
+       case MQ_RQ_COMPLETE:
+               WRITE_ONCE(rq->gstate.val, new_val.val);
+               return true;
+       }
+       WARN(true, "Invalid request state %d\n", old_state);
+       return false;
 }
 
 void blk_mq_free_request(struct request *rq)
@@ -487,6 +511,7 @@ void blk_mq_free_request(struct request *rq)
        struct elevator_queue *e = q->elevator;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
+       const int sched_tag = rq->internal_tag;
 
        if (rq->rq_flags & RQF_ELVPRIV) {
                if (e && e->type->ops.mq.finish_request)
@@ -509,9 +534,14 @@ void blk_mq_free_request(struct request *rq)
        if (blk_rq_rl(rq))
                blk_put_rl(blk_rq_rl(rq));
 
-       WRITE_ONCE(rq->state, MQ_RQ_IDLE);
-       if (refcount_dec_and_test(&rq->ref))
-               __blk_mq_free_request(rq);
+       if (!blk_mq_change_rq_state(rq, blk_mq_rq_state(rq), MQ_RQ_IDLE))
+               WARN_ON_ONCE(true);
+       if (rq->tag != -1)
+               blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
+       if (sched_tag != -1)
+               blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
+       blk_mq_sched_restart(hctx);
+       blk_queue_exit(q);
 }
 EXPORT_SYMBOL_GPL(blk_mq_free_request);
 
@@ -558,8 +588,8 @@ static void __blk_mq_complete_request(struct request *rq)
        bool shared = false;
        int cpu;
 
-       if (!blk_mq_mark_complete(rq))
-               return;
+       WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_COMPLETE);
+
        if (rq->internal_tag != -1)
                blk_mq_sched_completed_request(rq);
 
@@ -615,7 +645,18 @@ void blk_mq_complete_request(struct request *rq)
 {
        if (unlikely(blk_should_fake_timeout(rq->q)))
                return;
-       __blk_mq_complete_request(rq);
+again:
+       if (blk_mq_change_rq_state(rq, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE)) {
+               __blk_mq_complete_request(rq);
+               return;
+       }
+       if (blk_mq_change_rq_state(rq, MQ_RQ_TIMED_OUT, MQ_RQ_COMPLETE))
+               return;
+       if (WARN_ON_ONCE(blk_mq_rq_state(rq) == MQ_RQ_IDLE ||
+                        blk_mq_rq_state(rq) == MQ_RQ_COMPLETE))
+               return;
+       /* In the unlikely case of a race with the timeout code, retry. */
+       goto again;
 }
 EXPORT_SYMBOL(blk_mq_complete_request);
 
@@ -645,7 +686,7 @@ void blk_mq_start_request(struct request *rq)
        WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE);
 
        blk_mq_add_timer(rq);
-       WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
+       WARN_ON_ONCE(!blk_mq_change_rq_state(rq, MQ_RQ_IDLE, MQ_RQ_IN_FLIGHT));
 
        if (q->dma_drain_size && blk_rq_bytes(rq)) {
                /*
@@ -658,23 +699,46 @@ void blk_mq_start_request(struct request *rq)
 }
 EXPORT_SYMBOL(blk_mq_start_request);
 
+/**
+ * __blk_mq_requeue_request - requeue a request
+ * @rq: request to be requeued
+ *
+ * This function is either called by blk_mq_requeue_request() or by the block
+ * layer core if .queue_rq() returned BLK_STS_RESOURCE or BLK_STS_DEV_RESOURCE.
+ * If the request state is MQ_RQ_IN_FLIGHT and if this function is called from
+ * inside .queue_rq() then it is guaranteed that the timeout code won't try to
+ * modify the request state while this function is in progress because an RCU
+ * read lock is held around .queue_rq() and because the timeout code calls
+ * synchronize_rcu() after having marked requests and before processing
+ * time-outs.
+ */
 static void __blk_mq_requeue_request(struct request *rq)
 {
        struct request_queue *q = rq->q;
+       enum mq_rq_state old_state = blk_mq_rq_state(rq);
 
        blk_mq_put_driver_tag(rq);
 
        trace_block_rq_requeue(q, rq);
        rq_qos_requeue(q, rq);
 
-       if (blk_mq_request_started(rq)) {
-               WRITE_ONCE(rq->state, MQ_RQ_IDLE);
-               rq->rq_flags &= ~RQF_TIMED_OUT;
+       if (old_state != MQ_RQ_IDLE) {
+               if (!blk_mq_change_rq_state(rq, old_state, MQ_RQ_IDLE))
+                       WARN_ON_ONCE(true);
                if (q->dma_drain_size && blk_rq_bytes(rq))
                        rq->nr_phys_segments--;
        }
 }
 
+/**
+ * blk_mq_requeue_request - requeue a request
+ * @rq: request to be requeued
+ * @kick_requeue_list: whether or not to kick the requeue_list
+ *
+ * This function is called after a request has completed, after a request has
+ * timed out or from inside .queue_rq(). In the latter case the request may
+ * already have been started.
+ */
 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
 {
        __blk_mq_requeue_request(rq);
@@ -767,82 +831,117 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
 }
 EXPORT_SYMBOL(blk_mq_tag_to_rq);
 
+struct blk_mq_timeout_data {
+       unsigned long next;
+       unsigned int next_set;
+       unsigned int nr_expired;
+};
+
 static void blk_mq_rq_timed_out(struct request *req, bool reserved)
 {
-       req->rq_flags |= RQF_TIMED_OUT;
-       if (req->q->mq_ops->timeout) {
-               enum blk_eh_timer_return ret;
+       enum blk_eh_timer_return ret;
 
-               ret = req->q->mq_ops->timeout(req, reserved);
-               if (ret == BLK_EH_DONT_RESET_TIMER)
-                       return;
-               WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
+       if (!req->q->mq_ops->timeout) {
+               blk_mq_add_timer(req);
+               return;
        }
 
+       ret = req->q->mq_ops->timeout(req, reserved);
+       /*
+        * BLK_EH_DONT_RESET_TIMER means that the block driver either
+        * completed the request or still owns the request and will
+        * continue processing the timeout asynchronously. In the
+        * latter case, if blk_mq_complete_request() was called while
+        * the timeout handler was in progress, ignore that call.
+        */
+       if (ret == BLK_EH_DONT_RESET_TIMER)
+               return;
+       WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
        blk_mq_add_timer(req);
+again:
+       if (blk_mq_change_rq_state(req, MQ_RQ_TIMED_OUT, MQ_RQ_IN_FLIGHT))
+               return;
+       if (blk_mq_rq_state(req) == MQ_RQ_COMPLETE) {
+               __blk_mq_complete_request(req);
+               return;
+       }
+       if (WARN_ON_ONCE(blk_mq_rq_state(req) == MQ_RQ_IDLE ||
+                        blk_mq_rq_state(req) == MQ_RQ_IN_FLIGHT))
+               return;
+       /*
+        * In the unlikely case of a race with the request completion code,
+        * retry.
+        */
+       goto again;
 }
 
-static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
+static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq,
+                                void *priv, bool reserved)
 {
+       struct blk_mq_timeout_data *data = priv;
+       union blk_generation_and_state gstate = READ_ONCE(rq->gstate);
        unsigned long deadline;
 
-       if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT)
-               return false;
-       if (rq->rq_flags & RQF_TIMED_OUT)
-               return false;
+       might_sleep();
 
-       deadline = blk_rq_deadline(rq);
-       if (time_after_eq(jiffies, deadline))
-               return true;
+#ifdef CONFIG_64BIT
+       deadline = rq->__deadline;
+#else
+       while (true) {
+               int start;
 
-       if (*next == 0)
-               *next = deadline;
-       else if (time_after(*next, deadline))
-               *next = deadline;
-       return false;
+               start = read_seqcount_begin(&rq->deadline_seq);
+               deadline = rq->__deadline;
+               if (!read_seqcount_retry(&rq->deadline_seq, start))
+                       break;
+               cond_resched();
+       }
+#endif
+
+       /*
+        * Make sure that rq->aborted_gstate != rq->gstate if a request gets
+        * recycled before blk_mq_terminate_expired() is called.
+        */
+       rq->aborted_gstate = gstate;
+       rq->aborted_gstate.generation ^= (1UL << 29);
+       if (gstate.state == MQ_RQ_IN_FLIGHT &&
+           time_after_eq(jiffies, deadline)) {
+               /* request timed out */
+               rq->aborted_gstate = gstate;
+               data->nr_expired++;
+               hctx->nr_expired++;
+       } else if (!data->next_set || time_after(data->next, deadline)) {
+               data->next = deadline;
+               data->next_set = 1;
+       }
 }
 
-static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
+static void blk_mq_terminate_expired(struct blk_mq_hw_ctx *hctx,
                struct request *rq, void *priv, bool reserved)
 {
-       unsigned long *next = priv;
+       union blk_generation_and_state old_val = rq->aborted_gstate;
+       union blk_generation_and_state new_val = old_val;
 
-       /*
-        * Just do a quick check if it is expired before locking the request in
-        * so we're not unnecessarilly synchronizing across CPUs.
-        */
-       if (!blk_mq_req_expired(rq, next))
-               return;
-
-       /*
-        * We have reason to believe the request may be expired. Take a
-        * reference on the request to lock this request lifetime into its
-        * currently allocated context to prevent it from being reallocated in
-        * the event the completion by-passes this timeout handler.
-        *
-        * If the reference was already released, then the driver beat the
-        * timeout handler to posting a natural completion.
-        */
-       if (!refcount_inc_not_zero(&rq->ref))
-               return;
+       new_val.state = MQ_RQ_TIMED_OUT;
 
        /*
-        * The request is now locked and cannot be reallocated underneath the
-        * timeout handler's processing. Re-verify this exact request is truly
-        * expired; if it is not expired, then the request was completed and
-        * reallocated as a new request.
+        * We marked @rq->aborted_gstate and waited for ongoing .queue_rq()
+        * calls. If rq->gstate has not changed since then, it is now safe to
+        * change the request state and to handle the timeout.
         */
-       if (blk_mq_req_expired(rq, next))
+       if (blk_mq_set_rq_state(rq, old_val, new_val))
                blk_mq_rq_timed_out(rq, reserved);
-       if (refcount_dec_and_test(&rq->ref))
-               __blk_mq_free_request(rq);
 }
 
 static void blk_mq_timeout_work(struct work_struct *work)
 {
        struct request_queue *q =
                container_of(work, struct request_queue, timeout_work);
-       unsigned long next = 0;
+       struct blk_mq_timeout_data data = {
+               .next           = 0,
+               .next_set       = 0,
+               .nr_expired     = 0,
+       };
        struct blk_mq_hw_ctx *hctx;
        int i;
 
@@ -862,10 +961,41 @@ static void blk_mq_timeout_work(struct work_struct *work)
        if (!percpu_ref_tryget(&q->q_usage_counter))
                return;
 
-       blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
+       /* scan for the expired ones and set their ->aborted_gstate */
+       blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data);
+
+       if (data.nr_expired) {
+               bool has_rcu = false;
+
+               /*
+                * For very short timeouts it can happen that
+                * blk_mq_check_expired() modifies the state of a request
+                * while .queue_rq() is still in progress. Hence wait until
+                * these .queue_rq() calls have finished. This is also
+                * necessary to avoid races with blk_mq_requeue_request() for
+                * requests that have already been started.
+                */
+               queue_for_each_hw_ctx(q, hctx, i) {
+                       if (!hctx->nr_expired)
+                               continue;
+
+                       if (!(hctx->flags & BLK_MQ_F_BLOCKING))
+                               has_rcu = true;
+                       else
+                               synchronize_srcu(hctx->srcu);
 
-       if (next != 0) {
-               mod_timer(&q->timeout, next);
+                       hctx->nr_expired = 0;
+               }
+               if (has_rcu)
+                       synchronize_rcu();
+
+               /* terminate the ones we won */
+               blk_mq_queue_tag_busy_iter(q, blk_mq_terminate_expired, NULL);
+       }
+
+       if (data.next_set) {
+               data.next = blk_rq_timeout(round_jiffies_up(data.next));
+               mod_timer(&q->timeout, data.next);
        } else {
                /*
                 * Request timeouts are handled as a forward rolling timer. If
@@ -2009,7 +2139,9 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
                        return ret;
        }
 
-       WRITE_ONCE(rq->state, MQ_RQ_IDLE);
+#ifndef CONFIG_64BIT
+       seqcount_init(&rq->deadline_seq);
+#endif
        return 0;
 }
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 9497b47e2526..de92cddc147f 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -90,13 +90,21 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_release(struct request_queue *q);
 
+static inline bool blk_mq_set_rq_state(struct request *rq,
+                                      union blk_generation_and_state old_val,
+                                      union blk_generation_and_state new_val)
+{
+       return cmpxchg(&rq->gstate.val, old_val.val, new_val.val) ==
+               old_val.val;
+}
+
 /**
  * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
  * @rq: target request.
  */
 static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
 {
-       return READ_ONCE(rq->state);
+       return READ_ONCE(rq->gstate).state;
 }
 
 static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 02d7e3427369..b37b8090f3ae 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -142,6 +142,22 @@ void blk_timeout_work(struct work_struct *work)
        spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
+/* Update the deadline to @new_time. May be called from interrupt context. */
+static void blk_mq_rq_set_deadline(struct request *rq, unsigned long new_time)
+{
+#ifdef CONFIG_64BIT
+       rq->__deadline = new_time;
+#else
+       unsigned long flags;
+
+       local_irq_save(flags);
+       write_seqcount_begin(&rq->deadline_seq);
+       rq->__deadline = new_time;
+       write_seqcount_end(&rq->deadline_seq);
+       local_irq_restore(flags);
+#endif
+}
+
 /**
  * blk_abort_request -- Request request recovery for the specified command
  * @req:       pointer to the request of interest
@@ -155,11 +171,10 @@ void blk_abort_request(struct request *req)
 {
        if (req->q->mq_ops) {
                /*
-                * All we need to ensure is that timeout scan takes place
-                * immediately and that scan sees the new timeout value.
-                * No need for fancy synchronizations.
+                * Ensure that a timeout scan takes place immediately and that
+                * it sees the new timeout value.
                 */
-               blk_rq_set_deadline(req, jiffies);
+               blk_mq_rq_set_deadline(req, jiffies);
                kblockd_schedule_work(&req->q->timeout_work);
        } else {
                if (blk_mark_rq_complete(req))
@@ -228,7 +243,6 @@ void blk_add_timer(struct request *req)
        if (!req->timeout)
                req->timeout = q->rq_timeout;
 
-       req->rq_flags &= ~RQF_TIMED_OUT;
        deadline = jiffies + req->timeout;
        blk_rq_set_deadline(req, deadline);
        list_add_tail(&req->timeout_list, &req->q->timeout_list);
@@ -250,9 +264,7 @@ void blk_mq_add_timer(struct request *req)
 
        if (!req->timeout)
                req->timeout = q->rq_timeout;
-
-       req->rq_flags &= ~RQF_TIMED_OUT;
        deadline = jiffies + req->timeout;
-       blk_rq_set_deadline(req, deadline);
+       blk_mq_rq_set_deadline(req, deadline);
        __blk_add_timer(req, deadline);
 }
diff --git a/drivers/scsi/scsi_error.c b/drivers/scsi/scsi_error.c
index b38b8e62e618..fa18f9d78170 100644
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -296,20 +296,6 @@ enum blk_eh_timer_return scsi_times_out(struct request *req)
                rtn = host->hostt->eh_timed_out(scmd);
 
        if (rtn == BLK_EH_DONT_RESET_TIMER) {
-               /*
-                * For blk-mq, we must set the request state to complete now
-                * before sending the request to the scsi error handler. This
-                * will prevent a use-after-free in the event the LLD manages
-                * to complete the request before the error handler finishes
-                * processing this timed out request.
-                *
-                * If the request was already completed, then the LLD beat the
-                * time out handler from transferring the request to the scsi
-                * error handler. In that case we can return immediately as no
-                * further action is required.
-                */
-               if (req->q->mq_ops && !blk_mq_mark_complete(req))
-                       return rtn;
                if (scsi_abort_command(scmd) != SUCCESS) {
                        set_host_byte(scmd, DID_TIME_OUT);
                        scsi_eh_scmd_add(scmd);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 1da59c16f637..d710e92874cc 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -289,20 +289,6 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);
 
 void blk_mq_quiesce_queue_nowait(struct request_queue *q);
 
-/**
- * blk_mq_mark_complete() - Set request state to complete
- * @rq: request to set to complete state
- *
- * Returns true if request state was successfully set to complete. If
- * successful, the caller is responsibile for seeing this request is ended, as
- * blk_mq_complete_request will not work again.
- */
-static inline bool blk_mq_mark_complete(struct request *rq)
-{
-       return cmpxchg(&rq->state, MQ_RQ_IN_FLIGHT, MQ_RQ_COMPLETE) ==
-                       MQ_RQ_IN_FLIGHT;
-}
-
 /*
  * Driver command data is immediately after the request. So subtract request
  * size to get back to the original request, add request size to get the PDU.
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 8244a5a1aa5b..58e5b22123a2 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -126,20 +126,39 @@ typedef __u32 __bitwise req_flags_t;
 #define RQF_ZONE_WRITE_LOCKED  ((__force req_flags_t)(1 << 19))
 /* already slept for hybrid poll */
 #define RQF_MQ_POLL_SLEPT      ((__force req_flags_t)(1 << 20))
-/* ->timeout has been called, don't expire again */
-#define RQF_TIMED_OUT          ((__force req_flags_t)(1 << 21))
 
 /* flags that prevent us from merging requests: */
 #define RQF_NOMERGE_FLAGS \
        (RQF_STARTED | RQF_SOFTBARRIER | RQF_FLUSH_SEQ | RQF_SPECIAL_PAYLOAD)
 
-/*
- * Request state for blk-mq.
+union blk_generation_and_state {
+       struct {
+               uint32_t generation:30;
+               uint32_t state:2;
+       };
+       uint32_t val;
+};
+
+/**
+ * enum mq_rq_state - blk-mq request state
+ *
+ * The legal state transitions are:
+ * - idle      -> in flight: blk_mq_start_request()
+ * - in flight -> complete:  blk_mq_complete_request()
+ * - timed out -> complete:  blk_mq_complete_request()
+ * - in flight -> timed out: request times out
+ * - complete  -> idle:      blk_mq_requeue_request() or blk_mq_free_request()
+ * - in flight -> idle:      blk_mq_requeue_request() or blk_mq_free_request()
+ * - timed out -> idle:      blk_mq_requeue_request() or blk_mq_free_request()
+ * - timed out -> in flight: request restart due to BLK_EH_RESET_TIMER
+ *
+ * See also blk_generation_and_state.state.
  */
 enum mq_rq_state {
        MQ_RQ_IDLE              = 0,
        MQ_RQ_IN_FLIGHT         = 1,
        MQ_RQ_COMPLETE          = 2,
+       MQ_RQ_TIMED_OUT         = 3,
 };
 
 /*
@@ -242,12 +261,26 @@ struct request {
 
        unsigned int extra_len; /* length of alignment and padding */
 
-       enum mq_rq_state state;
-       refcount_t ref;
-
        unsigned int timeout;
 
-       /* access through blk_rq_set_deadline, blk_rq_deadline */
+       /*
+        * ->aborted_gstate is used by the timeout to claim a specific
+        * recycle instance of this request.  See blk_mq_timeout_work().
+        */
+       union blk_generation_and_state aborted_gstate;
+
+       /*
+        * Access through blk_rq_deadline() and blk_rq_set_deadline(),
+        * blk_mark_rq_complete(), blk_clear_rq_complete() and
+        * blk_rq_is_complete() for legacy queues or blk_mq_rq_state(),
+        * blk_mq_change_rq_state() and blk_mq_rq_set_deadline() for
+        * blk-mq queues. deadline_seq protects __deadline in blk-mq mode
+        * only.
+        */
+       union blk_generation_and_state gstate;
+#ifndef CONFIG_64BIT
+       seqcount_t deadline_seq;
+#endif
        unsigned long __deadline;
 
        struct list_head timeout_list;
-- 
2.18.0
