Refine adaptive polling in aio_poll by updating iothread polling
duration based on weighted AioHandler event intervals.

Each AioHandler's poll.ns is updated with a weighted moving average
whenever an event occurs on that handler. Handlers that stay idle
instead accumulate block_ns into poll.ns; once it reaches poll_max_ns,
poll.ns resets to 0, preventing sporadically active handlers from
unnecessarily prolonging iothread polling.
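
For illustration, the per-handler update amounts to an exponentially
weighted moving average with weight 2^-POLL_WEIGHT_SHIFT. A minimal
standalone sketch of the update implemented in adjust_block_ns()
(weighted_poll_ns is a name local to this example, not part of the
patch):

    #include <stdint.h>

    #define POLL_WEIGHT_SHIFT 2   /* weight = 2^-2 = 25% per new sample */

    static int64_t weighted_poll_ns(int64_t poll_ns, int64_t block_ns)
    {
        if (poll_ns == 0) {
            return block_ns;      /* first event seeds the estimate */
        }
        /* new = old - old/2^W + block/2^W: a 75%/25% blend for W=2 */
        return (poll_ns - (poll_ns >> POLL_WEIGHT_SHIFT))
               + (block_ns >> POLL_WEIGHT_SHIFT);
    }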

The iothread polling duration (ctx->poll_ns) is derived from the
largest poll.ns among the handlers that saw an event. The shrink
divider now defaults to 2, matching the default grow factor, so that
poll_ns is halved rather than reset to 0; this reduces frequent
poll_ns resets for slow devices.
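
For example, assuming the default grow/shrink factor of 2, an
illustrative poll_max_ns of 32000, and a current ctx->poll_ns of 16000:

    adj_block_ns =  5000:  5000 <  16000/2 -> shrink, poll_ns = 8000
    adj_block_ns =  9000:  9000 >= 16000/2 -> poll_ns stays 16000
    adj_block_ns = 20000: 20000 <= 16000*2 -> grow, poll_ns = 32000
    adj_block_ns = 40000: 40000 >  16000*2 -> poll_ns = block_ns,
                                              capped to poll_max_ns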

The default weight factor (POLL_WEIGHT_SHIFT=2) was selected based on
various fio tests to balance the mean poll_ns, the reset frequency,
and the time spent at high poll_ns values. The correlation between the
current block_ns and the weighted value (adj_block_ns) decreases
slightly as the weight increases: lower weights make poll_ns fluctuate
more, while higher weights keep poll_ns elevated once it rises.
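
Put differently, each event blends the newest block_ns sample into
poll.ns with weight 2^-W:

    new poll.ns = (1 - 2^-W) * old poll.ns + 2^-W * block_ns

    W=1: 50.0% of the latest sample (reacts quickly)
    W=2: 25.0%
    W=3: 12.5%
    W=4:  6.25% (poll.ns barely moves once established)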

The table below shows results for a representative randread case
(bs=4k, iodepth=8, 2 IOThreads), illustrating the mean poll_ns, the
rate of poll_ns resets to 0, and the share of time spent near the
maximum poll_ns for each weight value.

Weight | Mean poll_ns | poll_ns reset rate | Time near max (%)
------ | ------------ | ------------------ | -----------------
  1    |     4523     |       89.9%        |      7.41%
  2    |     8442     |       78.6%        |     15.84%
  3    |    11147     |       70.4%        |     21.38%
  4    |    11624     |       70.1%        |     23.35%

Weight=1 reacts quickly, Weights 3-4 hold poll_ns high once it rises,
and Weight=2 provides a good balance between responsiveness and CPU
usage.

Signed-off-by: Jaehoon Kim <[email protected]>
---
 include/qemu/aio.h |   4 +-
 util/aio-posix.c   | 135 +++++++++++++++++++++++++++++++--------------
 util/async.c       |   1 +
 3 files changed, 99 insertions(+), 41 deletions(-)

diff --git a/include/qemu/aio.h b/include/qemu/aio.h
index 8cca2360d1..6c77a190e9 100644
--- a/include/qemu/aio.h
+++ b/include/qemu/aio.h
@@ -195,7 +195,8 @@ struct BHListSlice {
 typedef QSLIST_HEAD(, AioHandler) AioHandlerSList;
 
 typedef struct AioPolledEvent {
-    int64_t ns;        /* current polling time in nanoseconds */
+    bool has_event; /* Flag to indicate if an event has occurred */
+    int64_t ns;     /* estimated block time in nanoseconds */
 } AioPolledEvent;
 
 struct AioContext {
@@ -306,6 +307,7 @@ struct AioContext {
     int poll_disable_cnt;
 
     /* Polling mode parameters */
+    int64_t poll_ns;        /* current polling time in nanoseconds */
     int64_t poll_max_ns;    /* maximum polling time in nanoseconds */
     int64_t poll_grow;      /* polling time growth factor */
     int64_t poll_shrink;    /* polling time shrink factor */
diff --git a/util/aio-posix.c b/util/aio-posix.c
index 7ddf92a25f..dd6008898b 100644
--- a/util/aio-posix.c
+++ b/util/aio-posix.c
@@ -28,9 +28,11 @@
 
 /* Stop userspace polling on a handler if it isn't active for some time */
 #define POLL_IDLE_INTERVAL_NS (7 * NANOSECONDS_PER_SECOND)
+#define POLL_WEIGHT_SHIFT   (2)
 
-static void adjust_polling_time(AioContext *ctx, AioPolledEvent *poll,
-                                int64_t block_ns);
+static void adjust_block_ns(AioContext *ctx, int64_t block_ns);
+static void grow_polling_time(AioContext *ctx, int64_t block_ns);
+static void shrink_polling_time(AioContext *ctx, int64_t block_ns);
 
 bool aio_poll_disabled(AioContext *ctx)
 {
@@ -372,7 +374,7 @@ static bool aio_dispatch_ready_handlers(AioContext *ctx,
          * add the handler to ctx->poll_aio_handlers.
          */
         if (ctx->poll_max_ns && QLIST_IS_INSERTED(node, node_poll)) {
-            adjust_polling_time(ctx, &node->poll, block_ns);
+            node->poll.has_event = true;
         }
     }
 
@@ -559,18 +561,13 @@ static bool run_poll_handlers(AioContext *ctx, AioHandlerList *ready_list,
 static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
                           int64_t *timeout)
 {
-    AioHandler *node;
     int64_t max_ns;
 
     if (QLIST_EMPTY_RCU(&ctx->poll_aio_handlers)) {
         return false;
     }
 
-    max_ns = 0;
-    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
-        max_ns = MAX(max_ns, node->poll.ns);
-    }
-    max_ns = qemu_soonest_timeout(*timeout, max_ns);
+    max_ns = qemu_soonest_timeout(*timeout, ctx->poll_ns);
 
     if (max_ns && !ctx->fdmon_ops->need_wait(ctx)) {
         /*
@@ -586,46 +583,98 @@ static bool try_poll_mode(AioContext *ctx, AioHandlerList *ready_list,
     return false;
 }
 
-static void adjust_polling_time(AioContext *ctx, AioPolledEvent *poll,
-                                int64_t block_ns)
+static void shrink_polling_time(AioContext *ctx, int64_t block_ns)
 {
-    if (block_ns <= poll->ns) {
-        /* This is the sweet spot, no adjustment needed */
-    } else if (block_ns > ctx->poll_max_ns) {
-        /* We'd have to poll for too long, poll less */
-        int64_t old = poll->ns;
-
-        if (ctx->poll_shrink) {
-            poll->ns /= ctx->poll_shrink;
-        } else {
-            poll->ns = 0;
-        }
+    /*
+     * Halve the polling time, but only when block_ns has fallen below
+     * poll_ns / shrink; otherwise leave poll_ns unchanged.
+     */
+    int64_t old = ctx->poll_ns;
+    int64_t shrink = ctx->poll_shrink;
 
-        trace_poll_shrink(ctx, old, poll->ns);
-    } else if (poll->ns < ctx->poll_max_ns &&
-               block_ns < ctx->poll_max_ns) {
-        /* There is room to grow, poll longer */
-        int64_t old = poll->ns;
-        int64_t grow = ctx->poll_grow;
+    if (shrink == 0) {
+        shrink = 2;
+    }
 
-        if (grow == 0) {
-            grow = 2;
-        }
+    if (block_ns < (ctx->poll_ns / shrink)) {
+        ctx->poll_ns /= shrink;
+    }
 
-        if (poll->ns) {
-            poll->ns *= grow;
-        } else {
-            poll->ns = 4000; /* start polling at 4 microseconds */
-        }
+    trace_poll_shrink(ctx, old, ctx->poll_ns);
+}
 
-        if (poll->ns > ctx->poll_max_ns) {
-            poll->ns = ctx->poll_max_ns;
-        }
+static void grow_polling_time(AioContext *ctx, int64_t block_ns)
+{
+    /* There is room to grow, poll longer */
+    int64_t old = ctx->poll_ns;
+    int64_t grow = ctx->poll_grow;
 
-        trace_poll_grow(ctx, old, poll->ns);
+    if (grow == 0) {
+        grow = 2;
     }
+
+    if (block_ns > ctx->poll_ns * grow) {
+        ctx->poll_ns = block_ns;
+    } else {
+        ctx->poll_ns *= grow;
+    }
+
+    if (ctx->poll_ns > ctx->poll_max_ns) {
+        ctx->poll_ns = ctx->poll_max_ns;
+    }
+
+    trace_poll_grow(ctx, old, ctx->poll_ns);
 }
 
+static void adjust_block_ns(AioContext *ctx, int64_t block_ns)
+{
+    AioHandler *node;
+    int64_t adj_block_ns = -1;
+
+    QLIST_FOREACH(node, &ctx->poll_aio_handlers, node_poll) {
+        if (node->poll.has_event) {
+            /*
+             * Update poll.ns for the node with an event.
+             * Uses a weighted average of the current block_ns and the previous
+             * poll.ns to smooth out polling time adjustments.
+             */
+            node->poll.ns = node->poll.ns
+                ? (node->poll.ns - (node->poll.ns >> POLL_WEIGHT_SHIFT))
+                + (block_ns >> POLL_WEIGHT_SHIFT) : block_ns;
+
+            if (node->poll.ns >= ctx->poll_max_ns) {
+                node->poll.ns = 0;
+            }
+            /*
+             * To avoid excessive polling time increase, update adj_block_ns
+             * for nodes with the event flag set to true
+             */
+            adj_block_ns = MAX(adj_block_ns, node->poll.ns);
+            node->poll.has_event = false;
+        } else {
+            /*
+             * No event now, but was active before.
+             * If it waits longer than poll_max_ns, poll.ns will stay 0
+             * until the next event arrives.
+             */
+            if (node->poll.ns != 0) {
+                node->poll.ns += block_ns;
+                if (node->poll.ns >= ctx->poll_max_ns) {
+                    node->poll.ns = 0;
+                }
+            }
+        }
+    }
+
+    if (adj_block_ns >= 0) {
+        if (adj_block_ns > ctx->poll_ns) {
+            grow_polling_time(ctx, adj_block_ns);
+        } else {
+            shrink_polling_time(ctx, adj_block_ns);
+        }
+    }
+}
+
 bool aio_poll(AioContext *ctx, bool blocking)
 {
     AioHandlerList ready_list = QLIST_HEAD_INITIALIZER(ready_list);
@@ -722,6 +771,10 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
     aio_free_deleted_handlers(ctx);
 
+    if (ctx->poll_max_ns) {
+        adjust_block_ns(ctx, block_ns);
+    }
+
     qemu_lockcnt_dec(&ctx->list_lock);
 
     progress |= timerlistgroup_run_timers(&ctx->tlg);
@@ -783,6 +836,7 @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
 
     qemu_lockcnt_inc(&ctx->list_lock);
     QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        node->poll.has_event = false;
         node->poll.ns = 0;
     }
     qemu_lockcnt_dec(&ctx->list_lock);
@@ -793,6 +847,7 @@ void aio_context_set_poll_params(AioContext *ctx, int64_t max_ns,
     ctx->poll_max_ns = max_ns;
     ctx->poll_grow = grow;
     ctx->poll_shrink = shrink;
+    ctx->poll_ns = 0;
 
     aio_notify(ctx);
 }
diff --git a/util/async.c b/util/async.c
index 80d6b01a8a..9d3627566f 100644
--- a/util/async.c
+++ b/util/async.c
@@ -606,6 +606,7 @@ AioContext *aio_context_new(Error **errp)
     timerlistgroup_init(&ctx->tlg, aio_timerlist_notify, ctx);
 
     ctx->poll_max_ns = 0;
+    ctx->poll_ns = 0;
     ctx->poll_grow = 0;
     ctx->poll_shrink = 0;
 
-- 
2.50.1

