On 4/28/26 12:58, Denis V. Lunev wrote:
> qemu_laio_process_completions() wraps its body in defer_call_begin /
> defer_call_end. Inside the section, completion callbacks wake coroutines
> that queue new aiocbs; laio_do_submit() defers laio_deferred_fn. At the
> bottom of qemu_laio_process_completions() the defer_call_end() fires
> laio_deferred_fn, which calls ioq_submit(), closing the cycle:
>
>   ioq_submit
>     -> io_submit(2)                                  // some sync completions
>     -> qemu_laio_process_completions                 // defer_call_begin
>          -> aio_co_wake                              // resumes coroutine
>               -> laio_do_submit
>                    -> defer_call(laio_deferred_fn, s)   // enqueued
>          -> defer_call_end                           // nesting drops to 0
>               -> laio_deferred_fn
>                    -> ioq_submit                     // +1 stack frame, loop
>
> When io_submit(2) returns asynchronously (O_DIRECT) the cycle
> terminates in one extra frame: the fresh aiocb is still in flight, no
> completion is drained, no coroutine wakes, no new submission queues.
> When submissions complete synchronously (non-O_DIRECT, or per-descriptor
> drivers such as vmdk) each level enqueues more work for the next
> defer_call_end() to drain, so recursion grows without bound and QEMU
> crashes with SIGSEGV on the thread guard page.
>
> The cycle was closed by two performance commits, each correct in
> isolation:
>
> 076682885d ("block/linux-aio: convert to blk_io_plug_call() API")
> -- introduced laio_deferred_fn and wired
> laio_do_submit -> defer_call(laio_deferred_fn, s).
>
> 84d61e5f36 ("virtio: use defer_call() in virtio_irqfd_notify()")
> -- added defer_call_begin/end around qemu_laio_process_completions
> so virtio-irqfd notifications batch across a completion pass.
>
> The supported aio=native + cache=none pairing keeps submissions
> asynchronous, so the cycle stays bounded; nothing in the code enforces
> that contract. Observed in production as a SIGSEGV during a backup job
> configured with --cached + aio=native; reproducible on upstream with
> qemu-io against vmdk.
>
> Cap ioq_submit() recursion with a per-thread counter. On overflow,
> return without submitting. The pending work is drained by
> s->completion_bh, which qemu_laio_process_completions() has already
> scheduled on entry -- no work is lost; one event-loop round-trip of
> latency is paid only when the bound is hit, which cannot happen on a
> supported configuration.
>
> Signed-off-by: Denis V. Lunev <[email protected]>
> CC: Kevin Wolf <[email protected]>
> CC: Hanna Reitz <[email protected]>
> CC: Stefan Hajnoczi <[email protected]>
> CC: Paolo Bonzini <[email protected]>
> ---
> block/linux-aio.c | 23 +++++++++++++++++++++++
> 1 file changed, 23 insertions(+)
>
> diff --git a/block/linux-aio.c b/block/linux-aio.c
> index 0a7424fbb3..f98bb6e766 100644
> --- a/block/linux-aio.c
> +++ b/block/linux-aio.c
> @@ -36,6 +36,19 @@
> /* Maximum number of requests in a batch. (default value) */
> #define DEFAULT_MAX_BATCH 32
>
> +/*
> + * Bound on how deep ioq_submit() may recurse on a single thread via the
> + * ioq_submit -> qemu_laio_process_completions -> defer_call_end ->
> + * laio_deferred_fn -> ioq_submit cycle. The cycle terminates naturally
> + * when io_submit(2) returns asynchronously (O_DIRECT), but can grow
> + * without bound when submissions complete synchronously. On overflow
> + * ioq_submit() returns without submitting; the outermost
> + * qemu_laio_process_completions() has already scheduled s->completion_bh
> + * (via qemu_bh_schedule() at the top of that function), which resumes
> + * submission from the next event-loop dispatch.
> + */
> +#define IOQ_SUBMIT_MAX_DEPTH 8
> +
> struct qemu_laiocb {
> Coroutine *co;
> LinuxAioState *ctx;
> @@ -80,6 +93,9 @@ struct LinuxAioState {
> static void ioq_submit(LinuxAioState *s);
> static int laio_do_submit(struct qemu_laiocb *laiocb);
>
> +/* Per-thread recursion counter for ioq_submit(). See IOQ_SUBMIT_MAX_DEPTH. */
> +static __thread unsigned ioq_submit_depth;
> +
> static inline ssize_t io_event_ret(struct io_event *ev)
> {
> return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
> @@ -340,6 +356,11 @@ static void ioq_submit(LinuxAioState *s)
> QEMU_UNINITIALIZED struct iocb *iocbs[MAX_EVENTS];
> QSIMPLEQ_HEAD(, qemu_laiocb) completed;
>
> + if (ioq_submit_depth >= IOQ_SUBMIT_MAX_DEPTH) {
> + return;
> + }
> + ioq_submit_depth++;
> +
> do {
> if (s->io_q.in_flight >= MAX_EVENTS) {
> break;
> @@ -385,6 +406,8 @@ static void ioq_submit(LinuxAioState *s)
> * pended requests will be submitted from there.
> */
> }
> +
> + ioq_submit_depth--;
> }
>
> static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
ping