On Thu, Jun 17, 2021 at 05:52:43PM +0200, Max Reitz wrote: > > To address this, we want to cache data regions. Most of the time, when > bad performance is reported, it is in places where the image is iterated > over from start to end (qemu-img convert or the mirror job), so a simple > yet effective solution is to cache only the current data region.
Here's hoping third time's the charm! > > (Note that only caching data regions but not zero regions means that > returning false information from the cache is not catastrophic: Treating > zeroes as data is fine. While we try to invalidate the cache on zero > writes and discards, such incongruences may still occur when there are > other processes writing to the image.) > > We only use the cache for nodes without children (i.e. protocol nodes), > because that is where the problem is: Drivers that rely on block-status > implementations outside of qemu (e.g. SEEK_DATA/HOLE). > > Resolves: https://gitlab.com/qemu-project/qemu/-/issues/307 > Signed-off-by: Max Reitz <mre...@redhat.com> > --- > include/block/block_int.h | 19 ++++++++++ > block.c | 2 + > block/io.c | 80 +++++++++++++++++++++++++++++++++++++-- > 3 files changed, 98 insertions(+), 3 deletions(-) > > diff --git a/include/block/block_int.h b/include/block/block_int.h > index a8f9598102..c09512556a 100644 > --- a/include/block/block_int.h > +++ b/include/block/block_int.h > @@ -832,6 +832,23 @@ struct BdrvChild { > QLIST_ENTRY(BdrvChild) next_parent; > }; > > +/* > + * Allows bdrv_co_block_status() to cache one data region for a > + * protocol node. > + * > + * @lock: Lock for accessing this object's fields > + * @valid: Whether the cache is valid > + * @data_start: Offset where we know (or strongly assume) is data > + * @data_end: Offset where the data region ends (which is not necessarily > + * the start of a zeroed region) > + */ > +typedef struct BdrvBlockStatusCache { > + CoMutex lock; > + bool valid; > + int64_t data_start; > + int64_t data_end; > +} BdrvBlockStatusCache; Looks like the right bits of information, and I'm glad you documented the need to be prepared for protocols that report split data sections rather than consolidated. > +++ b/block/io.c > @@ -35,6 +35,7 @@ > #include "qapi/error.h" > #include "qemu/error-report.h" > #include "qemu/main-loop.h" > +#include "qemu/range.h" > #include "sysemu/replay.h" > > /* Maximum bounce buffer for copy-on-read and write zeroes, in bytes */ > @@ -1862,6 +1863,7 @@ static int coroutine_fn > bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, > bool need_flush = false; > int head = 0; > int tail = 0; > + BdrvBlockStatusCache *bsc = &bs->block_status_cache; > > int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_pwrite_zeroes, INT_MAX); > int alignment = MAX(bs->bl.pwrite_zeroes_alignment, > @@ -1878,6 +1880,16 @@ static int coroutine_fn > bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, > return -ENOTSUP; > } > > + /* Invalidate the cached block-status data range if this write overlaps > */ > + qemu_co_mutex_lock(&bsc->lock); Are we going to be suffering from a lot of lock contention performance degradation? Is there a way to take advantage of RCU access patterns for any more performance without sacrificing correctness? > + if (bsc->valid && > + ranges_overlap(offset, bytes, bsc->data_start, > + bsc->data_end - bsc->data_start)) > + { > + bsc->valid = false; > + } Do we want to invalidate the entire bsc, or can we be smart and leave the prefix intact (if offset > bsc->data_start, then set bsc->data_end to offset)? > + qemu_co_mutex_unlock(&bsc->lock); Worth using WITH_QEMU_LOCK_GUARD? > + > assert(alignment % bs->bl.request_alignment == 0); > head = offset % alignment; > tail = (offset + bytes) % alignment; > @@ -2394,6 +2406,7 @@ static int coroutine_fn > bdrv_co_block_status(BlockDriverState *bs, > int64_t aligned_offset, aligned_bytes; > uint32_t align; > bool has_filtered_child; > + BdrvBlockStatusCache *bsc = &bs->block_status_cache; > > assert(pnum); > *pnum = 0; > @@ -2442,9 +2455,59 @@ static int coroutine_fn > bdrv_co_block_status(BlockDriverState *bs, > aligned_bytes = ROUND_UP(offset + bytes, align) - aligned_offset; > > if (bs->drv->bdrv_co_block_status) { > - ret = bs->drv->bdrv_co_block_status(bs, want_zero, aligned_offset, > - aligned_bytes, pnum, &local_map, > - &local_file); > + bool from_cache = false; > + > + /* > + * Use the block-status cache only for protocol nodes: Format > + * drivers are generally quick to inquire the status, but protocol > + * drivers often need to get information from outside of qemu, so > + * we do not have control over the actual implementation. There > + * have been cases where inquiring the status took an unreasonably > + * long time, and we can do nothing in qemu to fix it. > + * This is especially problematic for images with large data areas, > + * because finding the few holes in them and giving them special > + * treatment does not gain much performance. Therefore, we try to > + * cache the last-identified data region. > + * > + * Second, limiting ourselves to protocol nodes allows us to assume > + * the block status for data regions to be DATA | OFFSET_VALID, and > + * that the host offset is the same as the guest offset. > + * > + * Note that it is possible that external writers zero parts of > + * the cached regions without the cache being invalidated, and so > + * we may report zeroes as data. This is not catastrophic, > + * however, because reporting zeroes as data is fine. > + */ Useful comment. > + if (QLIST_EMPTY(&bs->children)) { > + qemu_co_mutex_lock(&bsc->lock); > + if (bsc->valid && > + aligned_offset >= bsc->data_start && > + aligned_offset < bsc->data_end) > + { > + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; > + local_file = bs; > + local_map = aligned_offset; > + > + *pnum = bsc->data_end - aligned_offset; > + > + from_cache = true; > + } > + qemu_co_mutex_unlock(&bsc->lock); > + } > + > + if (!from_cache) { > + ret = bs->drv->bdrv_co_block_status(bs, want_zero, > aligned_offset, > + aligned_bytes, pnum, > &local_map, > + &local_file); > + > + if (ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID)) { > + qemu_co_mutex_lock(&bsc->lock); > + bsc->data_start = aligned_offset; > + bsc->data_end = aligned_offset + *pnum; > + bsc->valid = true; > + qemu_co_mutex_unlock(&bsc->lock); > + } > + } Looks correct. > } else { > /* Default code for filters */ > > @@ -2974,6 +3037,7 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, > int64_t offset, > int max_pdiscard, ret; > int head, tail, align; > BlockDriverState *bs = child->bs; > + BdrvBlockStatusCache *bsc = &bs->block_status_cache; > > if (!bs || !bs->drv || !bdrv_is_inserted(bs)) { > return -ENOMEDIUM; > @@ -2997,6 +3061,16 @@ int coroutine_fn bdrv_co_pdiscard(BdrvChild *child, > int64_t offset, > return 0; > } > > + /* Invalidate the cached block-status data range if this discard > overlaps */ > + qemu_co_mutex_lock(&bsc->lock); > + if (bsc->valid && > + ranges_overlap(offset, bytes, bsc->data_start, > + bsc->data_end - bsc->data_start)) > + { > + bsc->valid = false; > + } Same question as above about possibly shortening the cached range in-place rather than discarding it altogether. > + qemu_co_mutex_unlock(&bsc->lock); > + > /* Discard is advisory, but some devices track and coalesce > * unaligned requests, so we must pass everything down rather than > * round here. Still, most devices will just silently ignore > -- > 2.31.1 > > -- Eric Blake, Principal Software Engineer Red Hat, Inc. +1-919-301-3266 Virtualization: qemu.org | libvirt.org