Please disregard this, I'll send it soon as part of the correct series. Paolo
On 05/10/2016 18:47, Paolo Bonzini wrote: > bdrv_requests_pending is checking children to also wait until internal > requests (such as metadata writes) have completed. However, checking > children is in general overkill. Children requests can be of two kinds: > > - requests caused by an operation on bs, e.g. a bdrv_aio_write to bs > causing a write to bs->file->bs. In this case, the parent's in_flight > count will always be incremented by at least one for every request in > the child. > > - asynchronous metadata writes or flushes. Such writes can be started > even if bs's in_flight count is zero, but not after the .bdrv_drain > callback has been invoked. > > This patch therefore changes bdrv_drain to finish I/O in the parent > (after which the parent's in_flight will be locked to zero), call > bdrv_drain (after which the parent will not generate I/O on the child > anymore), and then wait for internal I/O in the children to complete. > > Signed-off-by: Paolo Bonzini <pbonz...@redhat.com> > --- > block/io.c | 54 ++++++++++++++++++++++++++++++++++++------------------ > block/qed.c | 16 +++++++++++++++- > 2 files changed, 51 insertions(+), 19 deletions(-) > > diff --git a/block/io.c b/block/io.c > index f81c884..b94edec 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -156,16 +156,33 @@ bool bdrv_requests_pending(BlockDriverState *bs) > return false; > } > > -static void bdrv_drain_recurse(BlockDriverState *bs) > +static bool bdrv_drain_poll(BlockDriverState *bs) > +{ > + bool waited = false; > + > + while (atomic_read(&bs->in_flight) > 0) { > + aio_poll(bdrv_get_aio_context(bs), true); > + waited = true; > + } > + return waited; > +} > + > +static bool bdrv_drain_io_recurse(BlockDriverState *bs) > { > BdrvChild *child; > + bool waited; > + > + waited = bdrv_drain_poll(bs); > > if (bs->drv && bs->drv->bdrv_drain) { > bs->drv->bdrv_drain(bs); > } > + > QLIST_FOREACH(child, &bs->children, next) { > - bdrv_drain_recurse(child->bs); > + waited |= bdrv_drain_io_recurse(child->bs); > } > + > + return waited; > } > > typedef struct { > @@ -174,14 +191,6 @@ typedef struct { > bool done; > } BdrvCoDrainData; > > -static void bdrv_drain_poll(BlockDriverState *bs) > -{ > - while (bdrv_requests_pending(bs)) { > - /* Keep iterating */ > - aio_poll(bdrv_get_aio_context(bs), true); > - } > -} > - > static void bdrv_co_drain_bh_cb(void *opaque) > { > BdrvCoDrainData *data = opaque; > @@ -215,6 +224,20 @@ static void coroutine_fn > bdrv_co_yield_to_drain(BlockDriverState *bs) > assert(data.done); > } > > +static void bdrv_co_drain_io_recurse(BlockDriverState *bs) > +{ > + BdrvChild *child; > + > + bdrv_co_yield_to_drain(bs); > + if (bs->drv && bs->drv->bdrv_drain) { > + bs->drv->bdrv_drain(bs); > + } > + > + QLIST_FOREACH(child, &bs->children, next) { > + bdrv_co_drain_io_recurse(child->bs); > + } > +} > + > void bdrv_drained_begin(BlockDriverState *bs) > { > if (!bs->quiesce_counter++) { > @@ -223,11 +246,10 @@ void bdrv_drained_begin(BlockDriverState *bs) > } > > bdrv_io_unplugged_begin(bs); > - bdrv_drain_recurse(bs); > if (qemu_in_coroutine()) { > - bdrv_co_yield_to_drain(bs); > + bdrv_co_drain_io_recurse(bs); > } else { > - bdrv_drain_poll(bs); > + bdrv_drain_io_recurse(bs); > } > bdrv_io_unplugged_end(bs); > } > @@ -296,7 +318,6 @@ void bdrv_drain_all(void) > aio_context_acquire(aio_context); > bdrv_parent_drained_begin(bs); > bdrv_io_unplugged_begin(bs); > - bdrv_drain_recurse(bs); > aio_context_release(aio_context); > > if (!g_slist_find(aio_ctxs, aio_context)) { > @@ -319,10 +340,7 @@ void bdrv_drain_all(void) > aio_context_acquire(aio_context); > for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) { > if (aio_context == bdrv_get_aio_context(bs)) { > - if (bdrv_requests_pending(bs)) { > - aio_poll(aio_context, true); > - waited = true; > - } > + waited |= bdrv_drain_io_recurse(bs); > } > } > aio_context_release(aio_context); > diff --git a/block/qed.c b/block/qed.c > index 3ee879b..1a7ef0a 100644 > --- a/block/qed.c > +++ b/block/qed.c > @@ -336,7 +336,7 @@ static void qed_need_check_timer_cb(void *opaque) > qed_plug_allocating_write_reqs(s); > > /* Ensure writes are on disk before clearing flag */ > - bdrv_aio_flush(s->bs, qed_clear_need_check, s); > + bdrv_aio_flush(s->bs->file->bs, qed_clear_need_check, s); > } > > static void qed_start_need_check_timer(BDRVQEDState *s) > @@ -378,6 +378,19 @@ static void bdrv_qed_attach_aio_context(BlockDriverState > *bs, > } > } > > +static void bdrv_qed_drain(BlockDriverState *bs) > +{ > + BDRVQEDState *s = bs->opaque; > + > + /* Fire the timer immediately in order to start doing I/O as soon as the > + * header is flushed. > + */ > + if (s->need_check_timer && timer_pending(s->need_check_timer)) { > + qed_cancel_need_check_timer(s); > + qed_need_check_timer_cb(s); > + } > +} > + > static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, > Error **errp) > { > @@ -1668,6 +1681,7 @@ static BlockDriver bdrv_qed = { > .bdrv_check = bdrv_qed_check, > .bdrv_detach_aio_context = bdrv_qed_detach_aio_context, > .bdrv_attach_aio_context = bdrv_qed_attach_aio_context, > + .bdrv_drain = bdrv_qed_drain, > }; > > static void bdrv_qed_init(void) >