If pending_queues is increased once, it will only be decreased when
nr_active is zero, and that will lead to the under-utilization of
host tags because pending_queues is non-zero and the available
tags for the queue will be max(host tags / active_queues, 4)
instead of the needed tags of the queue.

Fix it by adding an expiration time for the increase of pending_queues,
and decreasing pending_queues when the time expires, so pending_queues
will drop back to zero if there is no tag allocation failure, and the
available tags for the queue will be the whole host tags.

Signed-off-by: Hou Tao <hout...@huawei.com>
Signed-off-by: Yu Kuai <yuku...@huawei.com>
---
 block/blk-mq-tag.c     | 29 ++++++++++++++++++++++++++---
 block/blk-mq-tag.h     |  6 +++---
 block/blk-mq.c         |  5 +++--
 block/blk-mq.h         |  2 +-
 include/linux/blk-mq.h |  6 +++++-
 5 files changed, 38 insertions(+), 10 deletions(-)

diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 6dcd6dd9123a..e3ffe0fde052 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -46,12 +46,16 @@ void __blk_mq_dtag_busy(struct blk_mq_hw_ctx *hctx)
                struct blk_mq_tag_set *set = q->tag_set;
 
                if (!test_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags) &&
-                   !test_and_set_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags))
+                   !test_and_set_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags)) {
+                       hctx->dtag_wait_time = jiffies;
                        atomic_inc(&set->pending_queues_shared_sbitmap);
+               }
        } else {
                if (!test_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state) &&
-                   !test_and_set_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state))
+                   !test_and_set_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state)) {
+                       hctx->dtag_wait_time = jiffies;
                        atomic_inc(&hctx->tags->pending_queues);
+               }
        }
 }
 
@@ -89,12 +93,28 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
        blk_mq_tag_wakeup_all(tags, false);
 }
 
-void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx)
+#define BLK_MQ_DTAG_WAIT_EXPIRE (5 * HZ)
+
+void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx, bool force)
 {
        struct blk_mq_tags *tags = hctx->tags;
        struct request_queue *q = hctx->queue;
        struct blk_mq_tag_set *set = q->tag_set;
 
+       if (!force) {
+               if (blk_mq_is_sbitmap_shared(hctx->flags)) {
+                       if (!(test_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags) &&
+                             time_after(jiffies, hctx->dtag_wait_time +
+                                       BLK_MQ_DTAG_WAIT_EXPIRE)))
+                               return;
+               } else {
+                       if (!(test_bit(BLK_MQ_S_DTAG_WAIT, &hctx->state) &&
+                             time_after(jiffies, hctx->dtag_wait_time +
+                                       BLK_MQ_DTAG_WAIT_EXPIRE)))
+                               return;
+               }
+       }
+
        if (blk_mq_is_sbitmap_shared(hctx->flags) &&
            test_and_clear_bit(QUEUE_FLAG_HCTX_WAIT, &q->queue_flags))
                atomic_dec(&set->pending_queues_shared_sbitmap);
@@ -202,6 +222,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
        sbitmap_finish_wait(bt, ws, &wait);
 
 found_tag:
+       if (!data->q->elevator)
+               blk_mq_dtag_idle(data->hctx, false);
+
        /*
         * Give up this allocation if the hctx is inactive.  The caller will
         * retry on an active hctx.
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 52d08a92f683..888692498ef2 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -68,7 +68,7 @@ enum {
 extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
 extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
 extern void __blk_mq_dtag_busy(struct blk_mq_hw_ctx *);
-extern void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *);
+extern void __blk_mq_dtag_idle(struct blk_mq_hw_ctx *, bool);
 
 static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
 {
@@ -94,12 +94,12 @@ static inline void blk_mq_dtag_busy(struct blk_mq_hw_ctx 
*hctx)
        __blk_mq_dtag_busy(hctx);
 }
 
-static inline void blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx)
+static inline void blk_mq_dtag_idle(struct blk_mq_hw_ctx *hctx, bool force)
 {
        if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
                return;
 
-       __blk_mq_dtag_idle(hctx);
+       __blk_mq_dtag_idle(hctx, force);
 }
 
 static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 2b8fa49bccb4..9ac976107154 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -1015,7 +1015,7 @@ static void blk_mq_timeout_work(struct work_struct *work)
                        /* the hctx may be unmapped, so check it here */
                        if (blk_mq_hw_queue_mapped(hctx)) {
                                blk_mq_tag_idle(hctx);
-                               blk_mq_dtag_idle(hctx);
+                               blk_mq_dtag_idle(hctx, true);
                        }
                }
        }
@@ -2568,7 +2568,7 @@ static void blk_mq_exit_hctx(struct request_queue *q,
 {
        if (blk_mq_hw_queue_mapped(hctx)) {
                blk_mq_tag_idle(hctx);
-               blk_mq_dtag_idle(hctx);
+               blk_mq_dtag_idle(hctx, true);
        }
 
        if (set->ops->exit_request)
@@ -2667,6 +2667,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct 
blk_mq_tag_set *set,
        INIT_LIST_HEAD(&hctx->dispatch);
        hctx->queue = q;
        hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
+       hctx->dtag_wait_time = jiffies;
 
        INIT_LIST_HEAD(&hctx->hctx_list);
 
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 228c5c442be4..93ede498f5e6 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -214,7 +214,7 @@ static inline void __blk_mq_dec_active_requests(struct 
blk_mq_hw_ctx *hctx)
                atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap);
        else if (!atomic_dec_return(&hctx->nr_active)) {
                blk_mq_tag_idle(hctx);
-               blk_mq_dtag_idle(hctx);
+               blk_mq_dtag_idle(hctx, true);
        }
 }
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2a473865ee7f..82591c2f76cc 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -158,7 +158,11 @@ struct blk_mq_hw_ctx {
        unsigned long           poll_invoked;
        /** @poll_success: Count how many polled requests were completed. */
        unsigned long           poll_success;
-
+       /**
+        * record when hardware queue is pending, specifically when
+        * BLK_MQ_S_DTAG_WAIT is set in state.
+        */
+       unsigned long dtag_wait_time;
 #ifdef CONFIG_BLK_DEBUG_FS
        /**
         * @debugfs_dir: debugfs directory for this hardware queue. Named
-- 
2.25.4

Reply via email to