01.11.2019 13:00, Max Reitz wrote: > This reverts commit c8bb23cbdbe32f5c326365e0a82e1b0e68cdcd8a. > > This commit causes fundamental performance problems on XFS (because > fallocate() stalls the AIO pipeline), and as such it is not clear that > we should unconditionally enable this behavior. > > We expect subclusters to alleviate the performance penalty of small > writes to newly allocated clusters, so when we get them, the originally > intended performance gain may actually no longer be significant. > > If we want to reintroduce something similar to c8bb23cbdbe, it will > require extensive benchmarking on various systems with subclusters > enabled. > > Cc: qemu-sta...@nongnu.org > Signed-off-by: Max Reitz <mre...@redhat.com>
It's sad, but OK: Reviewed-by: Vladimir Sementsov-Ogievskiy <vsement...@virtuozzo.com> > --- > qapi/block-core.json | 4 +- > block/qcow2.h | 6 --- > block/qcow2-cluster.c | 2 +- > block/qcow2.c | 86 -------------------------------------- > block/trace-events | 1 - > tests/qemu-iotests/060 | 7 +--- > tests/qemu-iotests/060.out | 5 +-- > 7 files changed, 4 insertions(+), 107 deletions(-) > > diff --git a/qapi/block-core.json b/qapi/block-core.json > index aa97ee2641..f053f15431 100644 > --- a/qapi/block-core.json > +++ b/qapi/block-core.json > @@ -3304,8 +3304,6 @@ > # > # @cor_write: a write due to copy-on-read (since 2.11) > # > -# @cluster_alloc_space: an allocation of file space for a cluster (since 4.1) > -# > # @none: triggers once at creation of the blkdebug node (since 4.1) > # > # Since: 2.9 > @@ -3326,7 +3324,7 @@ > 'pwritev_rmw_tail', 'pwritev_rmw_after_tail', 'pwritev', > 'pwritev_zero', 'pwritev_done', 'empty_image_prepare', > 'l1_shrink_write_table', 'l1_shrink_free_l2_clusters', > - 'cor_write', 'cluster_alloc_space', 'none'] } > + 'cor_write', 'none'] } > > ## > # @BlkdebugIOType: > diff --git a/block/qcow2.h b/block/qcow2.h > index 601c2e4c82..8166f6e311 100644 > --- a/block/qcow2.h > +++ b/block/qcow2.h > @@ -418,12 +418,6 @@ typedef struct QCowL2Meta > */ > Qcow2COWRegion cow_end; > > - /* > - * Indicates that COW regions are already handled and do not require > - * any more processing. > - */ > - bool skip_cow; > - > /** > * The I/O vector with the data from the actual guest write request. > * If non-NULL, this is meant to be merged together with the data > diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c > index 8982b7b762..fbfea8c817 100644 > --- a/block/qcow2-cluster.c > +++ b/block/qcow2-cluster.c > @@ -809,7 +809,7 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta > *m) > assert(start->nb_bytes + end->nb_bytes <= UINT_MAX - data_bytes); > assert(start->offset + start->nb_bytes <= end->offset); > > - if ((start->nb_bytes == 0 && end->nb_bytes == 0) || m->skip_cow) { > + if (start->nb_bytes == 0 && end->nb_bytes == 0) { > return 0; > } > > diff --git a/block/qcow2.c b/block/qcow2.c > index 7c18721741..17555cb0a1 100644 > --- a/block/qcow2.c > +++ b/block/qcow2.c > @@ -2274,11 +2274,6 @@ static bool merge_cow(uint64_t offset, unsigned bytes, > continue; > } > > - /* If COW regions are handled already, skip this too */ > - if (m->skip_cow) { > - continue; > - } > - > /* The data (middle) region must be immediately after the > * start region */ > if (l2meta_cow_start(m) + m->cow_start.nb_bytes != offset) { > @@ -2305,81 +2300,6 @@ static bool merge_cow(uint64_t offset, unsigned bytes, > return false; > } > > -static bool is_unallocated(BlockDriverState *bs, int64_t offset, int64_t > bytes) > -{ > - int64_t nr; > - return !bytes || > - (!bdrv_is_allocated_above(bs, NULL, false, offset, bytes, &nr) && > - nr == bytes); > -} > - > -static bool is_zero_cow(BlockDriverState *bs, QCowL2Meta *m) > -{ > - /* > - * This check is designed for optimization shortcut so it must be > - * efficient. > - * Instead of is_zero(), use is_unallocated() as it is faster (but not > - * as accurate and can result in false negatives). > - */ > - return is_unallocated(bs, m->offset + m->cow_start.offset, > - m->cow_start.nb_bytes) && > - is_unallocated(bs, m->offset + m->cow_end.offset, > - m->cow_end.nb_bytes); > -} > - > -static int handle_alloc_space(BlockDriverState *bs, QCowL2Meta *l2meta) > -{ > - BDRVQcow2State *s = bs->opaque; > - QCowL2Meta *m; > - > - if (!(s->data_file->bs->supported_zero_flags & BDRV_REQ_NO_FALLBACK)) { > - return 0; > - } > - > - if (bs->encrypted) { > - return 0; > - } > - > - for (m = l2meta; m != NULL; m = m->next) { > - int ret; > - > - if (!m->cow_start.nb_bytes && !m->cow_end.nb_bytes) { > - continue; > - } > - > - if (!is_zero_cow(bs, m)) { > - continue; > - } > - > - /* > - * instead of writing zero COW buffers, > - * efficiently zero out the whole clusters > - */ > - > - ret = qcow2_pre_write_overlap_check(bs, 0, m->alloc_offset, > - m->nb_clusters * s->cluster_size, > - true); > - if (ret < 0) { > - return ret; > - } > - > - BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_SPACE); > - ret = bdrv_co_pwrite_zeroes(s->data_file, m->alloc_offset, > - m->nb_clusters * s->cluster_size, > - BDRV_REQ_NO_FALLBACK); > - if (ret < 0) { > - if (ret != -ENOTSUP && ret != -EAGAIN) { > - return ret; > - } > - continue; > - } > - > - trace_qcow2_skip_cow(qemu_coroutine_self(), m->offset, > m->nb_clusters); > - m->skip_cow = true; > - } > - return 0; > -} > - > /* > * qcow2_co_pwritev_task > * Called with s->lock unlocked > @@ -2421,12 +2341,6 @@ static coroutine_fn int > qcow2_co_pwritev_task(BlockDriverState *bs, > qiov_offset = 0; > } > > - /* Try to efficiently initialize the physical space with zeroes */ > - ret = handle_alloc_space(bs, l2meta); > - if (ret < 0) { > - goto out_unlocked; > - } > - > /* > * If we need to do COW, check if it's possible to merge the > * writing of the guest data together with that of the COW regions. > diff --git a/block/trace-events b/block/trace-events > index 6ba86decca..c615b26d71 100644 > --- a/block/trace-events > +++ b/block/trace-events > @@ -72,7 +72,6 @@ qcow2_writev_done_part(void *co, int cur_bytes) "co %p > cur_bytes %d" > qcow2_writev_data(void *co, uint64_t offset) "co %p offset 0x%" PRIx64 > qcow2_pwrite_zeroes_start_req(void *co, int64_t offset, int count) "co %p > offset 0x%" PRIx64 " count %d" > qcow2_pwrite_zeroes(void *co, int64_t offset, int count) "co %p offset 0x%" > PRIx64 " count %d" > -qcow2_skip_cow(void *co, uint64_t offset, int nb_clusters) "co %p offset > 0x%" PRIx64 " nb_clusters %d" > > # qcow2-cluster.c > qcow2_alloc_clusters_offset(void *co, uint64_t offset, int bytes) "co %p > offset 0x%" PRIx64 " bytes %d" > diff --git a/tests/qemu-iotests/060 b/tests/qemu-iotests/060 > index b91d8321bb..89e911400c 100755 > --- a/tests/qemu-iotests/060 > +++ b/tests/qemu-iotests/060 > @@ -150,15 +150,10 @@ $QEMU_IO -c "$OPEN_RO" -c "read -P 1 0 512" | > _filter_qemu_io > echo > echo "=== Testing overlap while COW is in flight ===" > echo > -BACKING_IMG=$TEST_IMG.base > -TEST_IMG=$BACKING_IMG _make_test_img 1G > - > -$QEMU_IO -c 'write 0k 64k' "$BACKING_IMG" | _filter_qemu_io > - > # compat=0.10 is required in order to make the following discard actually > # unallocate the sector rather than make it a zero sector - we want COW, > after > # all. > -IMGOPTS='compat=0.10' _make_test_img -b "$BACKING_IMG" 1G > +IMGOPTS='compat=0.10' _make_test_img 1G > # Write two clusters, the second one enforces creation of an L2 table after > # the first data cluster. > $QEMU_IO -c 'write 0k 64k' -c 'write 512M 64k' "$TEST_IMG" | _filter_qemu_io > diff --git a/tests/qemu-iotests/060.out b/tests/qemu-iotests/060.out > index 0f6b0658a1..e42bf8c5a9 100644 > --- a/tests/qemu-iotests/060.out > +++ b/tests/qemu-iotests/060.out > @@ -97,10 +97,7 @@ read 512/512 bytes at offset 0 > > === Testing overlap while COW is in flight === > > -Formatting 'TEST_DIR/t.IMGFMT.base', fmt=IMGFMT size=1073741824 > -wrote 65536/65536 bytes at offset 0 > -64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > -Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 > backing_file=TEST_DIR/t.IMGFMT.base > +Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1073741824 > wrote 65536/65536 bytes at offset 0 > 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > wrote 65536/65536 bytes at offset 536870912 > -- Best regards, Vladimir