On Tuesday, 2020-08-25 at 17:11:34 +02, Max Reitz wrote: > On 21.08.20 16:11, Vladimir Sementsov-Ogievskiy wrote: >> It's intended to be inserted between format and protocol nodes to >> preallocate additional space (expanding protocol file) on writes >> crossing EOF. It improves performance for file-systems with slow >> allocation. >> >> Signed-off-by: Vladimir Sementsov-Ogievskiy <vsement...@virtuozzo.com> >> --- >> docs/system/qemu-block-drivers.rst.inc | 26 +++ >> qapi/block-core.json | 20 +- >> block/preallocate.c | 291 +++++++++++++++++++++++++ >> block/Makefile.objs | 1 + >> 4 files changed, 337 insertions(+), 1 deletion(-) >> create mode 100644 block/preallocate.c > > Looks good to me in essence. Besides minor details, I wonder most about > whether truncating the file on close can be safe, but more about that below. > >> diff --git a/docs/system/qemu-block-drivers.rst.inc >> b/docs/system/qemu-block-drivers.rst.inc >> index b052a6d14e..5e8a35c571 100644 >> --- a/docs/system/qemu-block-drivers.rst.inc >> +++ b/docs/system/qemu-block-drivers.rst.inc >> @@ -952,3 +952,29 @@ on host and see if there are locks held by the QEMU >> process on the image file. >> More than one byte could be locked by the QEMU instance, each byte of which >> reflects a particular permission that is acquired or protected by the >> running >> block driver. >> + >> +Filter drivers >> +~~~~~~~~~~~~~~ >> + >> +QEMU supports several filter drivers, which don't store any data, but do >> some > > s/do/perform/ > >> +additional tasks, hooking io requests. >> + >> +.. program:: filter-drivers >> +.. option:: preallocate >> + >> + Preallocate filter driver is intended to be inserted between format > > *The preallocate filter driver > >> + and protocol nodes and does preallocation of some additional space > > I’d simplify this as s/does preallocation of/preallocates/ > >> + (expanding the protocol file) on write. It may be used for > > I’d complicate this as s/on write/when writing past the file’s end/, or > “when data is written to the file after its end”, or at least “on > post-EOF writes”. > > Maybe also s/It may be used for/This can be useful for/? > >> + file-systems with slow allocation. >> + >> + Supported options: >> + >> + .. program:: preallocate >> + .. option:: prealloc-align >> + >> + On preallocation, align file length to this number, default 1M. > > *the file length > > As for “number”... Well, it is a number. But “value” might fit better. > Or “length (in bytes)”?
Isn't it really: "On preallocation, ensure that the file length is aligned to a multiple of this value, default 1M." ? >> + >> + .. program:: preallocate >> + .. option:: prealloc-size >> + >> + How much to preallocate, default 128M. >> diff --git a/qapi/block-core.json b/qapi/block-core.json >> index 197bdc1c36..b40448063b 100644 >> --- a/qapi/block-core.json >> +++ b/qapi/block-core.json >> @@ -2805,7 +2805,7 @@ >> 'cloop', 'compress', 'copy-on-read', 'dmg', 'file', 'ftp', >> 'ftps', >> 'gluster', 'host_cdrom', 'host_device', 'http', 'https', >> 'iscsi', >> 'luks', 'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', >> 'parallels', >> - 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', >> + 'preallocate', 'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'rbd', >> { 'name': 'replication', 'if': 'defined(CONFIG_REPLICATION)' }, >> 'sheepdog', >> 'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] } >> @@ -3074,6 +3074,23 @@ >> 'data': { 'aes': 'QCryptoBlockOptionsQCow', >> 'luks': 'QCryptoBlockOptionsLUKS'} } >> >> +## >> +# @BlockdevOptionsPreallocate: >> +# >> +# Filter driver intended to be inserted between format and protocol node >> +# and do preallocation in protocol node on write. >> +# >> +# @prealloc-align: on preallocation, align file length to this number, >> +# default 1048576 (1M) > > Speaking of alignment, this second line isn’t properly aligned. > >> +# >> +# @prealloc-size: how much to preallocate, default 134217728 (128M) >> +# >> +# Since: 5.2 >> +## >> +{ 'struct': 'BlockdevOptionsPreallocate', >> + 'base': 'BlockdevOptionsGenericFormat', >> + 'data': { '*prealloc-align': 'int', '*prealloc-size': 'int' } } >> + >> ## >> # @BlockdevOptionsQcow2: >> # >> @@ -3979,6 +3996,7 @@ >> 'null-co': 'BlockdevOptionsNull', >> 'nvme': 'BlockdevOptionsNVMe', >> 'parallels': 'BlockdevOptionsGenericFormat', >> + 'preallocate':'BlockdevOptionsPreallocate', >> 'qcow2': 'BlockdevOptionsQcow2', >> 'qcow': 'BlockdevOptionsQcow', >> 'qed': 'BlockdevOptionsGenericCOWFormat', >> diff --git a/block/preallocate.c b/block/preallocate.c >> new file mode 100644 >> index 0000000000..bdf54dbd2f >> --- /dev/null >> +++ b/block/preallocate.c >> @@ -0,0 +1,291 @@ >> +/* >> + * preallocate filter driver >> + * >> + * The driver performs preallocate operation: it is injected above >> + * some node, and before each write over EOF it does additional >> preallocating >> + * write-zeroes request. >> + * >> + * Copyright (c) 2020 Virtuozzo International GmbH. >> + * >> + * Author: >> + * Sementsov-Ogievskiy Vladimir <vsement...@virtuozzo.com> >> + * >> + * This program is free software; you can redistribute it and/or modify >> + * it under the terms of the GNU General Public License as published by >> + * the Free Software Foundation; either version 2 of the License, or >> + * (at your option) any later version. >> + * >> + * This program is distributed in the hope that it will be useful, >> + * but WITHOUT ANY WARRANTY; without even the implied warranty of >> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the >> + * GNU General Public License for more details. >> + * >> + * You should have received a copy of the GNU General Public License >> + * along with this program. If not, see <http://www.gnu.org/licenses/>. >> + */ >> + >> +#include "qemu/osdep.h" >> + >> +#include "qapi/error.h" >> +#include "qemu/module.h" >> +#include "qemu/option.h" >> +#include "qemu/units.h" >> +#include "block/block_int.h" >> + >> + >> +typedef struct BDRVPreallocateState { >> + int64_t prealloc_size; >> + int64_t prealloc_align; >> + >> + /* >> + * Filter is started as not-active, so it doesn't do any preallocations >> nor >> + * requires BLK_PERM_RESIZE on its child. This is needed to create >> filter >> + * above another node-child and then do bdrv_replace_node to insert the >> + * filter. > > A bit weird, but seems fair. It’s basically just a cache for whether > this node has a writer on it or not. > > Apart from the weirdness, I don’t understand the “another node-child”. > Say you have “format” -> “proto”, and then you want to insert > “prealloc”. Wouldn’t you blockdev-add prealloc,file=proto and then > blockdev-replace format.file=prealloc? So what is that “other node-child”? > >> + * >> + * Filter becomes active the first time it detects that its parents have >> + * BLK_PERM_RESIZE on it. >> + * Filter becomes active forever: it doesn't lose active status if >> parents >> + * lose BLK_PERM_RESIZE, otherwise we'll not be able to shrink the file >> on >> + * filter close. > > Oh, the file is shrunk? That wasn’t clear from the documentation. > > Hm. Seems like useful behavior. I just wonder if keeping the > permission around indefinitely makes sense. Why not just truncate it > when the permission is revoked? > >> + */ >> + bool active; >> + >> + /* >> + * Track real data end, to crop preallocation on close data_end may be > > s/on close /when closed./ > >> + * negative, which means that actual status is unknown (nothing cropped >> in >> + * this case) >> + */ >> + int64_t data_end; >> +} BDRVPreallocateState; >> + >> +#define PREALLOCATE_OPT_PREALLOC_ALIGN "prealloc-align" >> +#define PREALLOCATE_OPT_PREALLOC_SIZE "prealloc-size" >> +static QemuOptsList runtime_opts = { >> + .name = "preallocate", >> + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), >> + .desc = { >> + { >> + .name = PREALLOCATE_OPT_PREALLOC_ALIGN, >> + .type = QEMU_OPT_SIZE, >> + .help = "on preallocation, align file length to this number, " >> + "default 1M", >> + }, >> + { >> + .name = PREALLOCATE_OPT_PREALLOC_SIZE, >> + .type = QEMU_OPT_SIZE, >> + .help = "how much to preallocate, default 128M", >> + }, >> + { /* end of list */ } >> + }, >> +}; >> + >> +static int preallocate_open(BlockDriverState *bs, QDict *options, int flags, >> + Error **errp) >> +{ >> + QemuOpts *opts; >> + BDRVPreallocateState *s = bs->opaque; >> + >> + /* >> + * Parameters are hardcoded now. May need to add corresponding options >> in >> + * future. >> + */ >> + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); >> + qemu_opts_absorb_qdict(opts, options, &error_abort); >> + s->prealloc_align = >> + qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_ALIGN, 1 * MiB); >> + s->prealloc_size = >> + qemu_opt_get_size(opts, PREALLOCATE_OPT_PREALLOC_SIZE, 128 * MiB); >> + qemu_opts_del(opts); > > Should we have some validation on these parameters? Like, > prealloc_align being at least 512, or maybe even file.request_alignment? > > (I suppose anything for prealloc_size is fine, though 0 doesn’t make > much sense...) > >> + >> + bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds, >> + BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY, >> + false, errp); >> + if (!bs->file) { >> + return -EINVAL; >> + } >> + >> + s->data_end = bdrv_getlength(bs->file->bs); >> + if (s->data_end < 0) { >> + return s->data_end; >> + } >> + >> + bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED | >> + (BDRV_REQ_FUA & bs->file->bs->supported_write_flags); >> + >> + bs->supported_zero_flags = BDRV_REQ_WRITE_UNCHANGED | >> + ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) & >> + bs->file->bs->supported_zero_flags); >> + >> + return 0; >> +} >> + >> +static void preallocate_close(BlockDriverState *bs) >> +{ >> + BDRVPreallocateState *s = bs->opaque; >> + >> + if (s->active && s->data_end >= 0 && >> + bdrv_getlength(bs->file->bs) > s->data_end) >> + { >> + bdrv_truncate(bs->file, s->data_end, true, PREALLOC_MODE_OFF, 0, >> NULL); > > Now that I think more about it... What if there are other writers on > bs->file? This may throw data away. Maybe BDS.wr_highest_offset can > help to make a better call? > >> + } >> +} >> + >> +static void preallocate_child_perm(BlockDriverState *bs, BdrvChild *c, >> + BdrvChildRole role, >> + BlockReopenQueue *reopen_queue, >> + uint64_t perm, uint64_t shared, >> + uint64_t *nperm, uint64_t *nshared) >> +{ >> + BDRVPreallocateState *s = bs->opaque; >> + >> + bdrv_default_perms(bs, c, role, reopen_queue, perm, shared, nperm, >> nshared); >> + >> + s->active = s->active || (perm & BLK_PERM_RESIZE); >> + >> + if (s->active) { >> + /* Force RESIZE permission, to be able to crop file on close() */ >> + *nperm |= BLK_PERM_RESIZE; >> + } >> +} >> + >> +static coroutine_fn int preallocate_co_preadv_part( >> + BlockDriverState *bs, uint64_t offset, uint64_t bytes, >> + QEMUIOVector *qiov, size_t qiov_offset, int flags) >> +{ >> + return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset, >> + flags); >> +} >> + >> +static int coroutine_fn preallocate_co_pdiscard(BlockDriverState *bs, >> + int64_t offset, int bytes) >> +{ >> + return bdrv_co_pdiscard(bs->file, offset, bytes); >> +} >> + >> +static bool coroutine_fn do_preallocate(BlockDriverState *bs, int64_t >> offset, >> + int64_t bytes, bool write_zero) >> +{ >> + BDRVPreallocateState *s = bs->opaque; >> + int64_t len, start, end; >> + >> + if (!s->active) { >> + return false; >> + } >> + >> + if (s->data_end >= 0) { >> + s->data_end = MAX(s->data_end, >> + QEMU_ALIGN_UP(offset + bytes, BDRV_SECTOR_SIZE)); >> + } >> + >> + len = bdrv_getlength(bs->file->bs); > > I’d rename @len so it’s clear that it has nothing to do with the request > length. Like, file_len. > > (Because... > >> + if (len < 0) { >> + return false; >> + } >> + >> + if (s->data_end < 0) { >> + s->data_end = MAX(len, >> + QEMU_ALIGN_UP(offset + bytes, BDRV_SECTOR_SIZE)); >> + } >> + >> + if (offset + bytes <= len) { >> + return false; >> + } >> + >> + start = write_zero ? MIN(offset, len) : len; > > ...I got confused here for a bit) > >> + end = QEMU_ALIGN_UP(offset + bytes + s->prealloc_size, >> s->prealloc_align); > > Ah, I expected offset + s->prealloc_size. But OK. > >> + return !bdrv_co_pwrite_zeroes(bs->file, start, end - start, >> + BDRV_REQ_NO_FALLBACK | BDRV_REQ_SERIALISING | BDRV_REQ_NO_WAIT); >> +} >> + >> +static int coroutine_fn preallocate_co_pwrite_zeroes(BlockDriverState *bs, >> + int64_t offset, int bytes, BdrvRequestFlags flags) >> +{ >> + if (do_preallocate(bs, offset, bytes, true)) { >> + return 0; >> + } >> + >> + return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags); >> +} >> + >> +static coroutine_fn int preallocate_co_pwritev_part(BlockDriverState *bs, >> + uint64_t offset, >> + uint64_t bytes, >> + QEMUIOVector *qiov, >> + size_t qiov_offset, >> + int flags) >> +{ >> + do_preallocate(bs, offset, bytes, false); >> + >> + return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset, >> + flags); >> +} >> + >> +static int coroutine_fn >> +preallocate_co_truncate(BlockDriverState *bs, int64_t offset, >> + bool exact, PreallocMode prealloc, >> + BdrvRequestFlags flags, Error **errp) >> +{ >> + BDRVPreallocateState *s = bs->opaque; >> + int ret = bdrv_co_truncate(bs->file, offset, exact, prealloc, flags, >> errp); >> + >> + /* s->data_end may become negative here, which means unknown data end */ >> + s->data_end = bdrv_getlength(bs->file->bs); > > What would be the problem with just setting data_end = offset? We only > use data_end to truncate down on close, so basically repeat the > bdrv_co_truncate() call here, which seems correct. > >> + >> + return ret; >> +} >> + >> +static int coroutine_fn preallocate_co_flush(BlockDriverState *bs) >> +{ >> + return bdrv_co_flush(bs->file->bs); >> +} >> + >> +static int64_t preallocate_getlength(BlockDriverState *bs) >> +{ >> + /* >> + * We probably can return s->data_end here, but seems safer to return >> real >> + * file length, not trying to hide the preallocation. > > I don’t know. The auto-truncation on close makes that a bit weird, in > my opinion. But since this filter is probably never directly below a > BlockBackend (i.e., the length is never exposed to anything outside of > the block layer), but always below a format driver, it should be fine. > (In fact, those drivers do probably rather care about the true file > length rather than whatever they may have truncated it to, so you’re > right, it should be safer.) > > Max > >> + * >> + * Still, don't miss the chance to restore s->data_end if it is broken. >> + */ >> + BDRVPreallocateState *s = bs->opaque; >> + int64_t ret = bdrv_getlength(bs->file->bs); >> + >> + if (s->data_end < 0) { >> + s->data_end = ret; >> + } >> + >> + return ret; >> +} >> + >> +BlockDriver bdrv_preallocate_filter = { >> + .format_name = "preallocate", >> + .instance_size = sizeof(BDRVPreallocateState), >> + >> + .bdrv_getlength = preallocate_getlength, >> + .bdrv_open = preallocate_open, >> + .bdrv_close = preallocate_close, >> + >> + .bdrv_co_preadv_part = preallocate_co_preadv_part, >> + .bdrv_co_pwritev_part = preallocate_co_pwritev_part, >> + .bdrv_co_pwrite_zeroes = preallocate_co_pwrite_zeroes, >> + .bdrv_co_pdiscard = preallocate_co_pdiscard, >> + .bdrv_co_flush = preallocate_co_flush, >> + .bdrv_co_truncate = preallocate_co_truncate, >> + >> + .bdrv_co_block_status = bdrv_co_block_status_from_file, >> + >> + .bdrv_child_perm = preallocate_child_perm, >> + >> + .has_variable_length = true, >> + .is_filter = true, >> +}; >> + >> +static void bdrv_preallocate_init(void) >> +{ >> + bdrv_register(&bdrv_preallocate_filter); >> +} >> + >> +block_init(bdrv_preallocate_init); >> diff --git a/block/Makefile.objs b/block/Makefile.objs >> index 19c6f371c9..f8e6f16522 100644 >> --- a/block/Makefile.objs >> +++ b/block/Makefile.objs >> @@ -44,6 +44,7 @@ block-obj-y += crypto.o >> block-obj-y += aio_task.o >> block-obj-y += backup-top.o >> block-obj-y += filter-compress.o >> +block-obj-y += preallocate.o >> common-obj-y += monitor/ >> block-obj-y += monitor/ >> >> dme. -- I think I waited too long, I'm moving into the dollhouse.