Add support for an optional stats struct embedded in the refill queue
region, allowing userspace to monitor copy-fallback events (count and
bytes copied) in real-time.

Userspace queries the stats struct size and required alignment via
IO_URING_QUERY_ZCRX_NOTIF (notif_stats_size / notif_stats_off_alignment),
then provides a stats_offset in zcrx_notification_desc pointing to a
location within the refill queue region.

The kernel updates the stats counters in-place using READ_ONCE/WRITE_ONCE
on every copy-fallback event.

Signed-off-by: Clément Léger <[email protected]>
---
 include/uapi/linux/io_uring/query.h | 12 +++++++
 include/uapi/linux/io_uring/zcrx.h  | 15 +++++++--
 io_uring/query.c                    | 14 ++++++++
 io_uring/zcrx.c                     | 50 +++++++++++++++++++++++++++--
 io_uring/zcrx.h                     |  1 +
 5 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/include/uapi/linux/io_uring/query.h 
b/include/uapi/linux/io_uring/query.h
index 95500759cc13..738c35c7d05c 100644
--- a/include/uapi/linux/io_uring/query.h
+++ b/include/uapi/linux/io_uring/query.h
@@ -23,6 +23,7 @@ enum {
        IO_URING_QUERY_OPCODES                  = 0,
        IO_URING_QUERY_ZCRX                     = 1,
        IO_URING_QUERY_SCQ                      = 2,
+       IO_URING_QUERY_ZCRX_NOTIF               = 3,
 
        __IO_URING_QUERY_MAX,
 };
@@ -62,6 +63,17 @@ struct io_uring_query_zcrx {
        __u64 __resv2;
 };
 
+struct io_uring_query_zcrx_notif {
+       /* Bitmask of supported ZCRX_NOTIF_* notification types */
+       __u32 notif_flags;
+       /* Size of io_uring_zcrx_notif_stats */
+       __u32 notif_stats_size;
+       /* Required alignment for the stats struct within the region (i.e.
stats_offset) */
+       __u32 notif_stats_off_alignment;
+       __u32 resv1;
+       __u64 __resv2[10];
+};
+
 struct io_uring_query_scq {
        /* The SQ/CQ rings header size */
        __u64 hdr_size;
diff --git a/include/uapi/linux/io_uring/zcrx.h 
b/include/uapi/linux/io_uring/zcrx.h
index e0c0079626c8..ae9bbca3004c 100644
--- a/include/uapi/linux/io_uring/zcrx.h
+++ b/include/uapi/linux/io_uring/zcrx.h
@@ -73,11 +73,22 @@ enum zcrx_notification_type {
        ZCRX_NOTIF_COPY = 1 << 1
 };
 
+enum zcrx_notification_desc_flags {
+       /* If set, stats_offset holds a valid offset to a notif_stats struct */
+       ZCRX_NOTIF_DESC_FLAG_STATS = 1 << 0,
+};
+
+struct io_uring_zcrx_notif_stats {
+       __u64   copy_count;     /* cumulative copy-fallback CQEs */
+       __u64   copy_bytes;     /* cumulative bytes copied */
+};
+
 struct zcrx_notification_desc {
        __u64   user_data;
        __u32   type_mask;
-       __u32   __resv1;
-       __u64   __resv2[10];
+       __u32   flags; /* see enum zcrx_notification_desc_flags */
+       __u64   stats_offset; /* offset from the beginning of refill ring 
region for stats */
+       __u64   __resv2[9];
 };
 
 /*
diff --git a/io_uring/query.c b/io_uring/query.c
index c1704d088374..3591106e139d 100644
--- a/io_uring/query.c
+++ b/io_uring/query.c
@@ -9,6 +9,7 @@
 union io_query_data {
        struct io_uring_query_opcode opcodes;
        struct io_uring_query_zcrx zcrx;
+       struct io_uring_query_zcrx_notif zcrx_notif;
        struct io_uring_query_scq scq;
 };
 
@@ -44,6 +45,16 @@ static ssize_t io_query_zcrx(union io_query_data *data)
        return sizeof(*e);
 }
 
+static ssize_t io_query_zcrx_notif(union io_query_data *data)
+{
+       struct io_uring_query_zcrx_notif *e = &data->zcrx_notif;
+
+       e->notif_flags = ZCRX_NOTIF_TYPE_MASK;
+       e->notif_stats_size = sizeof(struct io_uring_zcrx_notif_stats);
+       e->notif_stats_off_alignment = __alignof__(struct 
io_uring_zcrx_notif_stats);
+       return sizeof(*e);
+}
+
 static ssize_t io_query_scq(union io_query_data *data)
 {
        struct io_uring_query_scq *e = &data->scq;
@@ -83,6 +94,9 @@ static int io_handle_query_entry(union io_query_data *data, 
void __user *uhdr,
        case IO_URING_QUERY_ZCRX:
                ret = io_query_zcrx(data);
                break;
+       case IO_URING_QUERY_ZCRX_NOTIF:
+               ret = io_query_zcrx_notif(data);
+               break;
        case IO_URING_QUERY_SCQ:
                ret = io_query_scq(data);
                break;
diff --git a/io_uring/zcrx.c b/io_uring/zcrx.c
index 732e585aa13a..c61f94fb14c3 100644
--- a/io_uring/zcrx.c
+++ b/io_uring/zcrx.c
@@ -414,6 +414,7 @@ static void io_free_rbuf_ring(struct io_zcrx_ifq *ifq)
        io_free_region(ifq->user, &ifq->rq_region);
        ifq->rq.ring = NULL;
        ifq->rq.rqes = NULL;
+       ifq->notif_stats = NULL;
 }
 
 static void io_zcrx_free_area(struct io_zcrx_ifq *ifq,
@@ -841,6 +842,33 @@ static int zcrx_register_netdev(struct io_zcrx_ifq *ifq,
        return ret;
 }
 
+static int zcrx_validate_notif_stats(struct io_zcrx_ifq *ifq,
+                                    const struct io_uring_zcrx_ifq_reg *reg,
+                                    const struct zcrx_notification_desc *notif)
+{
+       size_t stats_off = notif->stats_offset;
+       size_t used, end;
+
+       used = reg->offsets.rqes +
+              sizeof(struct io_uring_zcrx_rqe) * reg->rq_entries;
+
+       if (!IS_ALIGNED(stats_off, __alignof__(struct 
io_uring_zcrx_notif_stats)))
+               return -EINVAL;
+       if (stats_off < used)
+               return -ERANGE;
+       if (check_add_overflow(stats_off,
+                              sizeof(struct io_uring_zcrx_notif_stats),
+                              &end))
+               return -ERANGE;
+       if (end > io_region_size(&ifq->rq_region))
+               return -ERANGE;
+
+       ifq->notif_stats = io_region_get_ptr(&ifq->rq_region) + stats_off;
+       memset(ifq->notif_stats, 0, sizeof(*ifq->notif_stats));
+
+       return 0;
+}
+
 int io_register_zcrx(struct io_ring_ctx *ctx,
                     struct io_uring_zcrx_ifq_reg __user *arg)
 {
@@ -894,7 +922,9 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
                return -EFAULT;
        if (notif.type_mask & ~ZCRX_NOTIF_TYPE_MASK)
                return -EINVAL;
-       if (notif.__resv1 || !mem_is_zero(&notif.__resv2, 
sizeof(notif.__resv2)))
+       if (notif.flags & ~ZCRX_NOTIF_DESC_FLAG_STATS)
+               return -EINVAL;
+       if (!mem_is_zero(&notif.__resv2, sizeof(notif.__resv2)))
                return -EINVAL;
 
        ifq = io_zcrx_ifq_alloc(ctx);
@@ -925,6 +955,12 @@ int io_register_zcrx(struct io_ring_ctx *ctx,
        if (ret)
                goto err;
 
+       if (notif.flags & ZCRX_NOTIF_DESC_FLAG_STATS) {
+               ret = zcrx_validate_notif_stats(ifq, &reg, &notif);
+               if (ret)
+                       goto err;
+       }
+
        ifq->kern_readable = !(area.flags & IORING_ZCRX_AREA_DMABUF);
 
        if (!(reg.flags & ZCRX_REG_NODEV)) {
@@ -1133,6 +1169,11 @@ static void zcrx_notif_tw(struct io_tw_req tw_req, 
io_tw_token_t tw)
        kfree_rcu(req, rcu_head);
 }
 
+static void zcrx_stat_add(__u64 *p, s64 v)
+{
+       WRITE_ONCE(*p, READ_ONCE(*p) + v);
+}
+
 static void zcrx_send_notif(struct io_zcrx_ifq *ifq, u32 type_mask)
 {
        gfp_t gfp = GFP_ATOMIC | __GFP_NOWARN | __GFP_ZERO;
@@ -1513,8 +1554,13 @@ static int io_zcrx_copy_frag(struct io_kiocb *req, 
struct io_zcrx_ifq *ifq,
        int ret;
 
        ret = io_zcrx_copy_chunk(req, ifq, page, off + skb_frag_off(frag), len);
-       if (ret > 0)
+       if (ret > 0) {
+               if (ifq->notif_stats) {
+                       zcrx_stat_add(&ifq->notif_stats->copy_count, 1);
+                       zcrx_stat_add(&ifq->notif_stats->copy_bytes, ret);
+               }
                zcrx_send_notif(ifq, ZCRX_NOTIF_COPY);
+       }
 
        return ret;
 }
diff --git a/io_uring/zcrx.h b/io_uring/zcrx.h
index 1bd63adaa711..0dcf486ff530 100644
--- a/io_uring/zcrx.h
+++ b/io_uring/zcrx.h
@@ -82,6 +82,7 @@ struct io_zcrx_ifq {
        u32                             allowed_notif_mask;
        u32                             fired_notifs;
        u64                             notif_data;
+       struct io_uring_zcrx_notif_stats *notif_stats;
 };
 
 #if defined(CONFIG_IO_URING_ZCRX)
-- 
2.52.0


Reply via email to