For legacy block, we simply track them in the request queue. For blk-mq, we track them on a per-sw queue basis, which we can then sum up through the hardware queues and finally to a per device state.
The stats are tracked in, roughly, 0.1s interval windows. Add sysfs files to display the stats. Signed-off-by: Jens Axboe <ax...@fb.com> --- block/Makefile | 2 +- block/blk-core.c | 4 + block/blk-mq-sysfs.c | 47 ++++++++++++ block/blk-mq.c | 14 ++++ block/blk-mq.h | 3 + block/blk-stat.c | 184 ++++++++++++++++++++++++++++++++++++++++++++++ block/blk-stat.h | 17 +++++ block/blk-sysfs.c | 26 +++++++ include/linux/blk_types.h | 8 ++ include/linux/blkdev.h | 4 + 10 files changed, 308 insertions(+), 1 deletion(-) create mode 100644 block/blk-stat.c create mode 100644 block/blk-stat.h diff --git a/block/Makefile b/block/Makefile index 9eda2322b2d4..3446e0472df0 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ badblocks.o partitions/ diff --git a/block/blk-core.c b/block/blk-core.c index 74c16fd8995d..40b57bf4852c 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2514,6 +2514,8 @@ void blk_start_request(struct request *req) { blk_dequeue_request(req); + req->issue_time = ktime_to_ns(ktime_get()); + /* * We are now handing the request to the hardware, initialize * resid_len to full count and add the timeout handler. @@ -2581,6 +2583,8 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) trace_block_rq_complete(req->q, req, nr_bytes); + blk_stat_add(&req->q->rq_stats[rq_data_dir(req)], req); + if (!req->bio) return false; diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 4ea4dd8a1eed..2f68015f8616 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -247,6 +247,47 @@ static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) return ret; } +static void blk_mq_stat_clear(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + unsigned int i; + + hctx_for_each_ctx(hctx, ctx, i) { + blk_stat_init(&ctx->stat[0]); + blk_stat_init(&ctx->stat[1]); + } +} + +static ssize_t blk_mq_hw_sysfs_stat_store(struct blk_mq_hw_ctx *hctx, + const char *page, size_t count) +{ + blk_mq_stat_clear(hctx); + return count; +} + +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t blk_mq_hw_sysfs_stat_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_stat_init(&stat[0]); + blk_stat_init(&stat[1]); + + blk_hctx_stat_get(hctx, stat); + + ret = print_stat(page, &stat[0], "read :"); + ret += print_stat(page + ret, &stat[1], "write:"); + return ret; +} + static struct blk_mq_ctx_sysfs_entry blk_mq_sysfs_dispatched = { .attr = {.name = "dispatched", .mode = S_IRUGO }, .show = blk_mq_sysfs_dispatched_show, @@ -304,6 +345,11 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { .attr = {.name = "io_poll", .mode = S_IRUGO }, .show = blk_mq_hw_sysfs_poll_show, }; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_stat = { + .attr = {.name = "stats", .mode = S_IRUGO | S_IWUSR }, + .show = blk_mq_hw_sysfs_stat_show, + .store = blk_mq_hw_sysfs_stat_store, +}; static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_queued.attr, @@ -314,6 +360,7 @@ static struct attribute *default_hw_ctx_attrs[] = { &blk_mq_hw_sysfs_cpus.attr, &blk_mq_hw_sysfs_active.attr, &blk_mq_hw_sysfs_poll.attr, + &blk_mq_hw_sysfs_stat.attr, NULL, }; diff --git a/block/blk-mq.c b/block/blk-mq.c index 1699baf39b78..71b4a13fbf94 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -29,6 +29,7 @@ #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" +#include "blk-stat.h" static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); @@ -356,10 +357,19 @@ static void blk_mq_ipi_complete_request(struct request *rq) put_cpu(); } +static void blk_mq_stat_add(struct request *rq) +{ + struct blk_rq_stat *stat = &rq->mq_ctx->stat[rq_data_dir(rq)]; + + blk_stat_add(stat, rq); +} + static void __blk_mq_complete_request(struct request *rq) { struct request_queue *q = rq->q; + blk_mq_stat_add(rq); + if (!q->softirq_done_fn) blk_mq_end_request(rq, rq->errors); else @@ -403,6 +413,8 @@ void blk_mq_start_request(struct request *rq) if (unlikely(blk_bidi_rq(rq))) rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); + rq->issue_time = ktime_to_ns(ktime_get()); + blk_add_timer(rq); /* @@ -1761,6 +1773,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, spin_lock_init(&__ctx->lock); INIT_LIST_HEAD(&__ctx->rq_list); __ctx->queue = q; + blk_stat_init(&__ctx->stat[0]); + blk_stat_init(&__ctx->stat[1]); /* If the cpu isn't online, the cpu is mapped to first hctx */ if (!cpu_online(i)) diff --git a/block/blk-mq.h b/block/blk-mq.h index 9087b11037b7..e107f700ff17 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -1,6 +1,8 @@ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H +#include "blk-stat.h" + struct blk_mq_tag_set; struct blk_mq_ctx { @@ -20,6 +22,7 @@ struct blk_mq_ctx { /* incremented at completion time */ unsigned long ____cacheline_aligned_in_smp rq_completed[2]; + struct blk_rq_stat stat[2]; struct request_queue *queue; struct kobject kobj; diff --git a/block/blk-stat.c b/block/blk-stat.c new file mode 100644 index 000000000000..b38776a83173 --- /dev/null +++ b/block/blk-stat.c @@ -0,0 +1,184 @@ +/* + * Block stat tracking code + * + * Copyright (C) 2016 Jens Axboe + */ +#include <linux/kernel.h> +#include <linux/blk-mq.h> + +#include "blk-stat.h" +#include "blk-mq.h" + +void blk_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) +{ + if (!src->nr_samples) + return; + + dst->min = min(dst->min, src->min); + dst->max = max(dst->max, src->max); + + if (!dst->nr_samples) + dst->mean = src->mean; + else { + dst->mean = div64_s64((src->mean * src->nr_samples) + + (dst->mean * dst->nr_samples), + dst->nr_samples + src->nr_samples); + } + dst->nr_samples += src->nr_samples; +} + +static void blk_mq_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j, nr; + + blk_stat_init(&dst[0]); + blk_stat_init(&dst[1]); + + nr = 0; + do { + uint64_t newest = 0; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (!ctx->stat[0].nr_samples && + !ctx->stat[1].nr_samples) + continue; + if (ctx->stat[0].time > newest) + newest = ctx->stat[0].time; + if (ctx->stat[1].time > newest) + newest = ctx->stat[1].time; + } + } + + /* + * No samples + */ + if (!newest) + break; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + if (ctx->stat[0].time == newest) { + blk_stat_sum(&dst[0], &ctx->stat[0]); + nr++; + } + if (ctx->stat[1].time == newest) { + blk_stat_sum(&dst[1], &ctx->stat[1]); + nr++; + } + } + } + /* + * If we race on finding an entry, just loop back again. + * Should be very rare. + */ + } while (!nr); +} + +void blk_queue_stat_get(struct request_queue *q, struct blk_rq_stat *dst) +{ + if (q->mq_ops) + blk_mq_stat_get(q, dst); + else { + memcpy(&dst[0], &q->rq_stats[0], sizeof(struct blk_rq_stat)); + memcpy(&dst[1], &q->rq_stats[1], sizeof(struct blk_rq_stat)); + } +} + +void blk_hctx_stat_get(struct blk_mq_hw_ctx *hctx, struct blk_rq_stat *dst) +{ + struct blk_mq_ctx *ctx; + unsigned int i, nr; + + nr = 0; + do { + uint64_t newest = 0; + + hctx_for_each_ctx(hctx, ctx, i) { + if (!ctx->stat[0].nr_samples && + !ctx->stat[1].nr_samples) + continue; + + if (ctx->stat[0].time > newest) + newest = ctx->stat[0].time; + if (ctx->stat[1].time > newest) + newest = ctx->stat[1].time; + } + + if (!newest) + break; + + hctx_for_each_ctx(hctx, ctx, i) { + if (ctx->stat[0].time == newest) { + blk_stat_sum(&dst[0], &ctx->stat[0]); + nr++; + } + if (ctx->stat[1].time == newest) { + blk_stat_sum(&dst[1], &ctx->stat[1]); + nr++; + } + } + /* + * If we race on finding an entry, just loop back again. + * Should be very rare, as the window is only updated + * occasionally + */ + } while (!nr); +} + +static void __blk_stat_init(struct blk_rq_stat *stat, s64 time_now) +{ + stat->min = -1ULL; + stat->max = stat->nr_samples = stat->mean = 0; + stat->time = time_now & BLK_STAT_MASK; +} + +void blk_stat_init(struct blk_rq_stat *stat) +{ + __blk_stat_init(stat, ktime_to_ns(ktime_get())); +} + +void blk_stat_add(struct blk_rq_stat *stat, struct request *rq) +{ + s64 delta, now, value; + + now = ktime_to_ns(ktime_get()); + if (now < rq->issue_time) + return; + + if ((now & BLK_STAT_MASK) != (stat->time & BLK_STAT_MASK)) + __blk_stat_init(stat, now); + + value = now - rq->issue_time; + if (value > stat->max) + stat->max = value; + if (value < stat->min) + stat->min = value; + + delta = value - stat->mean; + if (delta) + stat->mean += div64_s64(delta, stat->nr_samples + 1); + + stat->nr_samples++; +} + +void blk_stat_clear(struct request_queue *q) +{ + if (q->mq_ops) { + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + int i, j; + + queue_for_each_hw_ctx(q, hctx, i) { + hctx_for_each_ctx(hctx, ctx, j) { + blk_stat_init(&ctx->stat[0]); + blk_stat_init(&ctx->stat[1]); + } + } + } else { + blk_stat_init(&q->rq_stats[0]); + blk_stat_init(&q->rq_stats[1]); + } +} diff --git a/block/blk-stat.h b/block/blk-stat.h new file mode 100644 index 000000000000..d77548dbf196 --- /dev/null +++ b/block/blk-stat.h @@ -0,0 +1,17 @@ +#ifndef BLK_STAT_H +#define BLK_STAT_H + +/* + * ~0.13s window as a power-of-2 (2^27 nsecs) + */ +#define BLK_STAT_NSEC 134217728ULL +#define BLK_STAT_MASK ~(BLK_STAT_NSEC - 1) + +void blk_stat_add(struct blk_rq_stat *, struct request *); +void blk_hctx_stat_get(struct blk_mq_hw_ctx *, struct blk_rq_stat *); +void blk_queue_stat_get(struct request_queue *, struct blk_rq_stat *); +void blk_stat_clear(struct request_queue *q); +void blk_stat_init(struct blk_rq_stat *); +void blk_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *); + +#endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 99205965f559..6e516cc0d3d0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -379,6 +379,26 @@ static ssize_t queue_wc_store(struct request_queue *q, const char *page, return count; } +static ssize_t print_stat(char *page, struct blk_rq_stat *stat, const char *pre) +{ + return sprintf(page, "%s samples=%llu, mean=%lld, min=%lld, max=%lld\n", + pre, (long long) stat->nr_samples, + (long long) stat->mean, (long long) stat->min, + (long long) stat->max); +} + +static ssize_t queue_stats_show(struct request_queue *q, char *page) +{ + struct blk_rq_stat stat[2]; + ssize_t ret; + + blk_queue_stat_get(q, stat); + + ret = print_stat(page, &stat[0], "read :"); + ret += print_stat(page + ret, &stat[1], "write:"); + return ret; +} + static struct queue_sysfs_entry queue_requests_entry = { .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, .show = queue_requests_show, @@ -516,6 +536,11 @@ static struct queue_sysfs_entry queue_wc_entry = { .store = queue_wc_store, }; +static struct queue_sysfs_entry queue_stats_entry = { + .attr = {.name = "stats", .mode = S_IRUGO }, + .show = queue_stats_show, +}; + static struct attribute *default_attrs[] = { &queue_requests_entry.attr, &queue_ra_entry.attr, @@ -542,6 +567,7 @@ static struct attribute *default_attrs[] = { &queue_random_entry.attr, &queue_poll_entry.attr, &queue_wc_entry.attr, + &queue_stats_entry.attr, NULL, }; diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 223012451c7a..2b4414fb4d8e 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -268,4 +268,12 @@ static inline unsigned int blk_qc_t_to_tag(blk_qc_t cookie) return cookie & ((1u << BLK_QC_T_SHIFT) - 1); } +struct blk_rq_stat { + s64 mean; + u64 min; + u64 max; + s64 nr_samples; + s64 time; +}; + #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index eee94bd6de52..87f6703ced71 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -153,6 +153,7 @@ struct request { struct gendisk *rq_disk; struct hd_struct *part; unsigned long start_time; + s64 issue_time; #ifdef CONFIG_BLK_CGROUP struct request_list *rl; /* rl this rq is alloced from */ unsigned long long start_time_ns; @@ -402,6 +403,9 @@ struct request_queue { unsigned int nr_sorted; unsigned int in_flight[2]; + + struct blk_rq_stat rq_stats[2]; + /* * Number of active block driver functions for which blk_drain_queue() * must wait. Must be incremented around functions that unlock the -- 2.8.0.rc4.6.g7e4ba36