The patch looks good to me overall, but I encountered a compilation error (see details below). I suspect it might have been tested on a vendor-specific version; could you please re-verify it against the latest mainline version? I’ll be happy to merge it once the build issue is resolved.
On Wed, Jun 3, 2026 at 11:16 AM Bin Guo <[email protected]> wrote: > The dirty ring reaper thread polls with a fixed sleep(1), which the > code itself flags with a TODO for a smarter timeout. Events that > would benefit from a prompt reap must wait up to one full sleep tick > before the reaper notices them. > > Replace the polling loop with an EventNotifier-based wait via > qemu_poll_ns(), kicked by paths that have first-hand evidence that > a reap is desirable. A 1s fallback timeout is retained as a > defensive backstop. > > Two kick sites are wired up: > > * dirtylimit_change(false) -- when dirty-limit is cancelled, kick > the reaper so it resumes real reaping immediately instead of > waiting for the next sleep tick. > > Measured with strace on the reaper TID (20 set/cancel cycles > against a stress-ng dirty-page workload), latency from the > cancel-vcpu-dirty-limit QMP ack to the reaper actually waking: > > before: median 530ms, max 937ms > after: median <1ms, max <1ms > > * kvm_cpu_exec() KVM_EXIT_DIRTY_RING_FULL handler -- the handler > already reaps the exiting vCPU synchronously; the kick hints the > reaper to also check other vCPUs whose rings are likely filling > concurrently. > > kvm_dirty_ring_reaper_kick() is exposed as a public API; multiple > kicks collapse into a single wake via the eventfd counter. > kvm_dirty_ring_reaper_init() now returns int again (it was made void > in commit 43a5e377f4) to propagate event_notifier_init() failure. > A stub is added for non-KVM builds. > > Signed-off-by: Bin Guo <[email protected]> > --- > accel/kvm/kvm-all.c | 62 +++++++++++++++++++++++++++++++++++++--- > accel/stubs/kvm-stub.c | 4 +++ > include/system/kvm.h | 9 ++++++ > include/system/kvm_int.h | 3 ++ > system/dirtylimit.c | 6 ++++ > 5 files changed, 80 insertions(+), 4 deletions(-) > > diff --git a/accel/kvm/kvm-all.c b/accel/kvm/kvm-all.c > index 96f90ebb24..f964102f09 100644 > --- a/accel/kvm/kvm-all.c > +++ b/accel/kvm/kvm-all.c > @@ -1754,6 +1754,9 @@ static void kvm_set_phys_mem(KVMMemoryListener *kml, > } while (size); > } > > +/* Fallback liveness timeout for the dirty ring reaper, in nanoseconds. */ > +#define KVM_DIRTY_RING_REAPER_FALLBACK_NS (1 * NANOSECONDS_PER_SECOND) > + > static void *kvm_dirty_ring_reaper_thread(void *data) > { > KVMState *s = data; > @@ -1764,12 +1767,30 @@ static void *kvm_dirty_ring_reaper_thread(void > *data) > trace_kvm_dirty_ring_reaper("init"); > > while (true) { > + GPollFD pfd = { > + .fd = event_notifier_get_fd(&r->reaper_notifier), > + .events = G_IO_IN, > + }; > + > r->reaper_state = KVM_DIRTY_RING_REAPER_WAIT; > trace_kvm_dirty_ring_reaper("wait"); > + > /* > - * TODO: provide a smarter timeout rather than a constant? > + * Event-driven wait: sleep until something kicks the reaper > + * (vCPU ring-full exit, dirtylimit disabled, ...) or until the > + * fallback timeout fires. The fallback preserves the original > + * sleep(1) worst-case behaviour as a liveness backstop in case > + * a kick site is ever missed. > */ > - sleep(1); > + qemu_poll_ns(&pfd, 1, KVM_DIRTY_RING_REAPER_FALLBACK_NS); > + > + /* > + * Drain the notifier whether or not the wakeup came from it -- > + * any pending kick is satisfied by the reap we are about to > + * perform, so we must not leave a stale event behind that would > + * cause the next iteration to spin without sleeping. > + */ > + event_notifier_test_and_clear(&r->reaper_notifier); > > /* keep sleeping so that dirtylimit not be interfered by reaper */ > if (dirtylimit_in_service()) { > @@ -1789,13 +1810,39 @@ static void *kvm_dirty_ring_reaper_thread(void > *data) > g_assert_not_reached(); > } > > -static void kvm_dirty_ring_reaper_init(KVMState *s) > +static int kvm_dirty_ring_reaper_init(KVMState *s, Error **errp) > { > struct KVMDirtyRingReaper *r = &s->reaper; > + int ret; > + > + ret = event_notifier_init(&r->reaper_notifier, 0); > + if (ret < 0) { > + error_setg_errno(errp, -ret, > + "Failed to initialize dirty ring reaper > notifier"); > + return ret; > + } > > qemu_thread_create(&r->reaper_thr, "kvm-reaper", > kvm_dirty_ring_reaper_thread, > s, QEMU_THREAD_JOINABLE); > + return 0; > +} > + > +/* > + * Wake the dirty ring reaper thread so it performs a reap as soon as > + * possible. Safe to call from any thread; safe to call even when the > + * dirty ring is not enabled (no-op in that case). Coalescing is > + * provided by the eventfd counter -- multiple kicks before the reaper > + * runs collapse into a single wakeup. > + */ > +void kvm_dirty_ring_reaper_kick(void) > +{ > + KVMState *s = kvm_state; > + > + if (!s || !s->kvm_dirty_ring_size) { > + return; > + } > + event_notifier_set(&s->reaper.reaper_notifier); > } > > static int kvm_dirty_ring_init(KVMState *s) > @@ -3097,7 +3144,9 @@ static int kvm_init(AccelState *as, MachineState *ms) > } > > if (s->kvm_dirty_ring_size) { > - kvm_dirty_ring_reaper_init(s); > Compilation error: 'errp' is undefined in this context. + if (kvm_dirty_ring_reaper_init(s, errp) < 0) { > + goto err; > + } > } > > if (kvm_check_extension(kvm_state, KVM_CAP_BINARY_STATS_FD)) { > @@ -3571,6 +3620,11 @@ int kvm_cpu_exec(CPUState *cpu) > kvm_dirty_ring_reap(kvm_state, NULL); > } > bql_unlock(); > + /* > + * Ring-full pressure is the strongest signal that background > + * reaping should stay hot; wake the reaper unconditionally. > + */ > + kvm_dirty_ring_reaper_kick(); > dirtylimit_vcpu_execute(cpu); > ret = 0; > break; > diff --git a/accel/stubs/kvm-stub.c b/accel/stubs/kvm-stub.c > index c4617caac6..b878598552 100644 > --- a/accel/stubs/kvm-stub.c > +++ b/accel/stubs/kvm-stub.c > @@ -134,6 +134,10 @@ uint32_t kvm_dirty_ring_size(void) > return 0; > } > > +void kvm_dirty_ring_reaper_kick(void) > +{ > +} > + > bool kvm_hwpoisoned_mem(void) > { > return false; > diff --git a/include/system/kvm.h b/include/system/kvm.h > index 5fa33eddda..c42c8e0b74 100644 > --- a/include/system/kvm.h > +++ b/include/system/kvm.h > @@ -553,6 +553,15 @@ bool kvm_dirty_ring_enabled(void); > > uint32_t kvm_dirty_ring_size(void); > > +/** > + * kvm_dirty_ring_reaper_kick - wake the background dirty ring reaper. > + * > + * Hint that an immediate reap is desirable (e.g. ring-full pressure > + * detected, dirtylimit disabled). Coalesced via eventfd. No-op when > + * the dirty ring feature is not in use. > + */ > +void kvm_dirty_ring_reaper_kick(void); > + > void kvm_mark_guest_state_protected(void); > > /** > diff --git a/include/system/kvm_int.h b/include/system/kvm_int.h > index 0876aac938..08e28d075a 100644 > --- a/include/system/kvm_int.h > +++ b/include/system/kvm_int.h > @@ -12,6 +12,7 @@ > #include "system/memory.h" > #include "qapi/qapi-types-common.h" > #include "qemu/accel.h" > +#include "qemu/event_notifier.h" > #include "qemu/queue.h" > #include "system/kvm.h" > #include "accel/accel-ops.h" > @@ -100,6 +101,8 @@ struct KVMDirtyRingReaper { > QemuThread reaper_thr; > volatile uint64_t reaper_iteration; /* iteration number of reaper thr > */ > volatile enum KVMDirtyRingReaperState reaper_state; /* reap thr state > */ > + /* Wakeup channel: kicked by ring-full vCPU exits, dirtylimit toggle, > ... */ > + EventNotifier reaper_notifier; > }; > struct KVMState > { > diff --git a/system/dirtylimit.c b/system/dirtylimit.c > index c934ceb0de..3ee9c58479 100644 > --- a/system/dirtylimit.c > +++ b/system/dirtylimit.c > @@ -393,6 +393,12 @@ void dirtylimit_change(bool start) > qatomic_set(&dirtylimit_quit, 0); > } else { > qatomic_set(&dirtylimit_quit, 1); > + /* > + * The reaper has been short-circuiting via the > + * dirtylimit_in_service() branch. Kick it so it picks up the > + * policy change immediately instead of after the next 1s tick. > + */ > + kvm_dirty_ring_reaper_kick(); > } > } > > -- > 2.50.1 (Apple Git-155) > >
