In high-performance storage environments, particularly when utilising
RAID controllers with shared tag sets (BLK_MQ_F_TAG_HCTX_SHARED), severe
latency spikes can occur when fast devices are starved of available
tags.

This patch introduces two new debugfs attributes for each block
hardware queue:
  - /sys/kernel/debug/block/[device]/hctxN/wait_on_hw_tag
  - /sys/kernel/debug/block/[device]/hctxN/wait_on_sched_tag

These files expose atomic counters that increment each time a submitting
context is forced into an uninterruptible sleep via io_schedule() due to
the complete exhaustion of physical driver tags or software scheduler
tags, respectively.

To ensure negligible performance overhead, even in production
environments where CONFIG_BLK_DEBUG_FS is enabled, this tracking
logic uses dynamically allocated per-CPU counters. When this
configuration option is disabled, the tracking logic compiles down
to a safe no-op.

Signed-off-by: Aaron Tomlin <[email protected]>
---
 block/blk-mq-debugfs.c | 84 ++++++++++++++++++++++++++++++++++++++++++
 block/blk-mq-debugfs.h |  7 ++++
 block/blk-mq-tag.c     |  4 ++
 include/linux/blk-mq.h | 12 ++++++
 4 files changed, 107 insertions(+)

diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 047ec887456b..a3effed55d90 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -7,6 +7,7 @@
 #include <linux/blkdev.h>
 #include <linux/build_bug.h>
 #include <linux/debugfs.h>
+#include <linux/percpu.h>
 
 #include "blk.h"
 #include "blk-mq.h"
@@ -484,6 +485,54 @@ static int hctx_dispatch_busy_show(void *data, struct seq_file *m)
        return 0;
 }
 
+/**
+ * hctx_wait_on_hw_tag_show - display hardware tag starvation count
+ * @data: generic pointer to the associated hardware context (hctx)
+ * @m: seq_file pointer for debugfs output formatting
+ *
+ * Prints the cumulative number of times a submitting context was forced
+ * to block due to the exhaustion of physical hardware driver tags.
+ *
+ * Return: 0 on success.
+ */
+static int hctx_wait_on_hw_tag_show(void *data, struct seq_file *m)
+{
+       struct blk_mq_hw_ctx *hctx = data;
+       unsigned long count = 0;
+       int cpu;
+
+       if (hctx->wait_on_hw_tag) {
+               for_each_possible_cpu(cpu)
+                       count += *per_cpu_ptr(hctx->wait_on_hw_tag, cpu);
+       }
+       seq_printf(m, "%lu\n", count);
+       return 0;
+}
+
+/**
+ * hctx_wait_on_sched_tag_show - display scheduler tag starvation count
+ * @data: generic pointer to the associated hardware context (hctx)
+ * @m: seq_file pointer for debugfs output formatting
+ *
+ * Prints the cumulative number of times a submitting context was forced
+ * to block due to the exhaustion of software scheduler tags.
+ *
+ * Return: 0 on success.
+ */
+static int hctx_wait_on_sched_tag_show(void *data, struct seq_file *m)
+{
+       struct blk_mq_hw_ctx *hctx = data;
+       unsigned long count = 0;
+       int cpu;
+
+       if (hctx->wait_on_sched_tag) {
+               for_each_possible_cpu(cpu)
+                       count += *per_cpu_ptr(hctx->wait_on_sched_tag, cpu);
+       }
+       seq_printf(m, "%lu\n", count);
+       return 0;
+}
+
 #define CTX_RQ_SEQ_OPS(name, type)                                     \
 static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \
        __acquires(&ctx->lock)                                          \
@@ -599,6 +648,8 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
        {"active", 0400, hctx_active_show},
        {"dispatch_busy", 0400, hctx_dispatch_busy_show},
        {"type", 0400, hctx_type_show},
+       {"wait_on_hw_tag", 0400, hctx_wait_on_hw_tag_show},
+       {"wait_on_sched_tag", 0400, hctx_wait_on_sched_tag_show},
        {},
 };
 
@@ -670,6 +721,11 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
        snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
        hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir);
 
+       if (!hctx->wait_on_hw_tag)
+               hctx->wait_on_hw_tag = alloc_percpu(unsigned long);
+       if (!hctx->wait_on_sched_tag)
+               hctx->wait_on_sched_tag = alloc_percpu(unsigned long);
+
        debugfs_create_files(q, hctx->debugfs_dir, hctx,
                             blk_mq_debugfs_hctx_attrs);
 
@@ -684,6 +740,11 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx)
        debugfs_remove_recursive(hctx->debugfs_dir);
        hctx->sched_debugfs_dir = NULL;
        hctx->debugfs_dir = NULL;
+
+       free_percpu(hctx->wait_on_hw_tag);
+       hctx->wait_on_hw_tag = NULL;
+       free_percpu(hctx->wait_on_sched_tag);
+       hctx->wait_on_sched_tag = NULL;
 }
 
 void blk_mq_debugfs_register_hctxs(struct request_queue *q)
@@ -815,3 +876,26 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
        debugfs_remove_recursive(hctx->sched_debugfs_dir);
        hctx->sched_debugfs_dir = NULL;
 }
+
+/**
+ * blk_mq_debugfs_inc_wait_tags - increment the tag starvation counters
+ * @hctx: hardware context associated with the tag allocation
+ * @is_sched: true if the starved pool is the software scheduler
+ *
+ * Evaluates the exhausted tag pool and safely increments the appropriate
+ * per-cpu debugfs starvation counter.
+ *
+ * Note: A race window exists during rapid device probe or CPU hotplug
+ * where I/O might be submitted before blk_mq_debugfs_register_hctx() has
+ * completed allocating the per-CPU counters. Therefore, the pointer is
+ * explicitly checked to prevent a NULL pointer dereference.
+ */
+void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
+                                 bool is_sched)
+{
+       unsigned long __percpu *tags = is_sched ? hctx->wait_on_sched_tag :
+                                                 hctx->wait_on_hw_tag;
+
+       if (likely(tags))
+               this_cpu_inc(*tags);
+}
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index 49bb1aaa83dc..a0094d004d08 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -17,6 +17,8 @@ struct blk_mq_debugfs_attr {
        const struct seq_operations *seq_ops;
 };
 
+void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
+                                 bool is_sched);
 int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq);
 int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
 
@@ -35,6 +37,11 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
 
 void blk_mq_debugfs_register_rq_qos(struct request_queue *q);
 #else
+static inline void blk_mq_debugfs_inc_wait_tags(struct blk_mq_hw_ctx *hctx,
+                                               bool is_sched)
+{
+}
+
 static inline void blk_mq_debugfs_register(struct request_queue *q)
 {
 }
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 66138dd043d4..3cc6a97a87a0 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -17,6 +17,7 @@
 #include "blk.h"
 #include "blk-mq.h"
 #include "blk-mq-sched.h"
+#include "blk-mq-debugfs.h"
 
 /*
  * Recalculate wakeup batch when tag is shared by hctx.
@@ -191,6 +192,9 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
                trace_block_rq_tag_wait(data->q, data->hctx,
                                        data->rq_flags & RQF_SCHED_TAGS);
 
+               blk_mq_debugfs_inc_wait_tags(data->hctx,
+                                            data->rq_flags & RQF_SCHED_TAGS);
+
                bt_prev = bt;
                io_schedule();
 
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ebc45557aee8..17cd6221bb93 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -453,6 +453,18 @@ struct blk_mq_hw_ctx {
        struct dentry           *debugfs_dir;
        /** @sched_debugfs_dir: debugfs directory for the scheduler. */
        struct dentry           *sched_debugfs_dir;
+       /**
+        * @wait_on_hw_tag: Cumulative per-cpu counter incremented each
+        * time a submitting context is forced to block due to physical
+        * hardware tag exhaustion.
+        */
+       unsigned long __percpu  *wait_on_hw_tag;
+       /**
+        * @wait_on_sched_tag: Cumulative per-cpu counter incremented each
+        * time a submitting context is forced to block due to software
+        * scheduler tag exhaustion.
+        */
+       unsigned long __percpu  *wait_on_sched_tag;
 #endif
 
        /**
-- 
2.51.0


Reply via email to