On 4/28/26 12:58, Denis V. Lunev wrote:
> qemu_laio_process_completions() wraps its body in defer_call_begin /
> defer_call_end. Inside the section, completion callbacks wake coroutines
> that queue new aiocbs; laio_do_submit() defers laio_deferred_fn. At the
> bottom of qemu_laio_process_completions() the defer_call_end() fires
> laio_deferred_fn, which calls ioq_submit(), closing the cycle:
>
>   ioq_submit
>     -> io_submit(2)                           // some sync completions
>     -> qemu_laio_process_completions          // defer_call_begin
>          -> aio_co_wake                       // resumes coroutine
>               -> laio_do_submit
>                    -> defer_call(laio_deferred_fn, s)   // enqueued
>          -> defer_call_end                    // nesting drops to 0
>               -> laio_deferred_fn
>                    -> ioq_submit              // +1 stack frame, loop
>
> When io_submit(2) returns asynchronously (O_DIRECT) the cycle
> terminates in one extra frame: the fresh aiocb is still in flight, no
> completion is drained, no coroutine wakes, no new submission queues.
> When submissions complete synchronously (non-O_DIRECT, or per-descriptor
> drivers such as vmdk) each level enqueues more work for the next
> defer_call_end() to drain, so recursion grows without bound and QEMU
> crashes with SIGSEGV on the thread guard page.
>
> The cycle was closed by two performance commits, each correct in
> isolation:
>
>   076682885d ("block/linux-aio: convert to blk_io_plug_call() API")
>     -- introduced laio_deferred_fn and wired
>        laio_do_submit -> defer_call(laio_deferred_fn, s).
>
>   84d61e5f36 ("virtio: use defer_call() in virtio_irqfd_notify()")
>     -- added defer_call_begin/end around qemu_laio_process_completions
>        so virtio-irqfd notifications batch across a completion pass.
>
> The supported aio=native + cache=none pairing keeps submissions
> asynchronous, so the cycle stays bounded; nothing in the code enforces
> that contract. Observed in production as a SIGSEGV during a backup job
> configured with --cached + aio=native; reproducible on upstream with
> qemu-io against vmdk.
>
> Cap ioq_submit() recursion with a per-thread counter. On overflow,
> return without submitting. The pending work is drained by
> s->completion_bh, which qemu_laio_process_completions() has already
> scheduled on entry -- no work is lost; one event-loop round-trip of
> latency is paid only when the bound is hit, which cannot happen on a
> supported configuration.
>
> Signed-off-by: Denis V. Lunev <[email protected]>
> CC: Kevin Wolf <[email protected]>
> CC: Hanna Reitz <[email protected]>
> CC: Stefan Hajnoczi <[email protected]>
> CC: Paolo Bonzini <[email protected]>
> ---
>  block/linux-aio.c | 23 +++++++++++++++++++++++
>  1 file changed, 23 insertions(+)
>
> diff --git a/block/linux-aio.c b/block/linux-aio.c
> index 0a7424fbb3..f98bb6e766 100644
> --- a/block/linux-aio.c
> +++ b/block/linux-aio.c
> @@ -36,6 +36,19 @@
>  /* Maximum number of requests in a batch. (default value) */
>  #define DEFAULT_MAX_BATCH 32
>  
> +/*
> + * Bound on how deep ioq_submit() may recurse on a single thread via the
> + * ioq_submit -> qemu_laio_process_completions -> defer_call_end ->
> + * laio_deferred_fn -> ioq_submit cycle. The cycle terminates naturally
> + * when io_submit(2) returns asynchronously (O_DIRECT), but can grow
> + * without bound when submissions complete synchronously. On overflow
> + * the caller returns without submitting; the outermost
> + * qemu_laio_process_completions() has already scheduled s->completion_bh
> + * (via qemu_bh_schedule() at the top of that function), which resumes
> + * submission from the next event-loop dispatch.
> + */
> +#define IOQ_SUBMIT_MAX_DEPTH 8
> +
>  struct qemu_laiocb {
>      Coroutine *co;
>      LinuxAioState *ctx;
> @@ -80,6 +93,9 @@ struct LinuxAioState {
>  static void ioq_submit(LinuxAioState *s);
>  static int laio_do_submit(struct qemu_laiocb *laiocb);
>  
> +/* Per-thread recursion counter for ioq_submit(). See IOQ_SUBMIT_MAX_DEPTH. 
> */
> +static __thread unsigned ioq_submit_depth;
> +
>  static inline ssize_t io_event_ret(struct io_event *ev)
>  {
>      return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res);
> @@ -340,6 +356,11 @@ static void ioq_submit(LinuxAioState *s)
>      QEMU_UNINITIALIZED struct iocb *iocbs[MAX_EVENTS];
>      QSIMPLEQ_HEAD(, qemu_laiocb) completed;
>  
> +    if (ioq_submit_depth >= IOQ_SUBMIT_MAX_DEPTH) {
> +        return;
> +    }
> +    ioq_submit_depth++;
> +
>      do {
>          if (s->io_q.in_flight >= MAX_EVENTS) {
>              break;
> @@ -385,6 +406,8 @@ static void ioq_submit(LinuxAioState *s)
>           * pended requests will be submitted from there.
>           */
>      }
> +
> +    ioq_submit_depth--;
>  }
>  
>  static uint64_t laio_max_batch(LinuxAioState *s, uint64_t dev_max_batch)
ping

Reply via email to