Re: [PATCH v2] scripts/qcow2-to-stdout.py: Add script to write qcow2 images to stdout
On Mon, Jul 1, 2024 at 6:13 PM Alberto Garcia wrote: > > This tool converts a disk image to qcow2, writing the result directly > to stdout. This can be used for example to send the generated file > over the network. > > This is equivalent to using qemu-img to convert a file to qcow2 and > then writing the result to stdout, with the difference that this tool > does not need to create this temporary qcow2 file and therefore does > not need any additional disk space. > > Implementing this directly in qemu-img is not really an option because > it expects the output file to be seekable and it is also meant to be a > generic tool that supports all combinations of file formats and image > options. Instead, this tool can only produce qcow2 files with the > basic options, without compression, encryption or other features. > > The input file is read twice. The first pass is used to determine > which clusters contain non-zero data and that information is used to > create the qcow2 header, refcount table and blocks, and L1 and L2 > tables. After all that metadata is created then the second pass is > used to write the guest data. > By default qcow2-to-stdout.py expects the input to be a raw file, but > if qemu-storage-daemon is available then it can also be used to read > images in other formats. Alternatively the user can also run qemu-nbd > or qemu-storage-daemon manually instead. 
> > Signed-off-by: Alberto Garcia > Signed-off-by: Madeeha Javed > --- > scripts/qcow2-to-stdout.py | 377 + > 1 file changed, 377 insertions(+) > create mode 100755 scripts/qcow2-to-stdout.py > > v2: > - Define the QCOW2_V3_HDR_LENGTH and QCOW2_FEATURE_NAME_TABLE constants > [Manos] > - Define the QEMU_STORAGE_DAEMON constant > - Use isfile() instead of exists() for the input file > - Refuse to write to stdout if it's a tty [Manos] > - Move the bulk of the code to a function called from __main__ [Manos] > - Remove the qcow2_ prefix from qcow2_cluster_size and qcow2_refcount_bits > - Formatting fixes suggested by the Python black formatter [Manos] > - On error pass the string directly to sys.exit() > - Capture the output of qemu-storage-daemon [Manos] > - Use a contextmanager to run qemu-storage-daemon [Manos] > - Update patch description to mention why this cannot be implemeted directly > in qemu-img [Manos] > > v1: https://lists.gnu.org/archive/html/qemu-block/2024-06/msg00073.html > > diff --git a/scripts/qcow2-to-stdout.py b/scripts/qcow2-to-stdout.py > new file mode 100755 > index 00..d486a80e86 > --- /dev/null > +++ b/scripts/qcow2-to-stdout.py > @@ -0,0 +1,377 @@ > +#!/usr/bin/env python3 > + > +# This tool reads a disk image in any format and converts it to qcow2, > +# writing the result directly to stdout. > +# > +# Copyright (C) 2024 Igalia, S.L. > +# > +# Authors: Alberto Garcia > +# Madeeha Javed > +# > +# SPDX-License-Identifier: GPL-2.0-or-later > +# > +# qcow2 files produced by this script are always arranged like this: > +# > +# - qcow2 header > +# - refcount table > +# - refcount blocks > +# - L1 table > +# - L2 tables > +# - Data clusters > +# > +# A note about variable names: in qcow2 there is one refcount table > +# and one (active) L1 table, although each can occupy several > +# clusters. For the sake of simplicity the code sometimes talks about > +# refcount tables and L1 tables when referring to those clusters. 
> + > +import argparse > +import atexit This is unused now > +import math > +import os > +import signal > +import struct > +import subprocess > +import sys > +import tempfile > +import time > +from contextlib import contextmanager > + > +QCOW2_DEFAULT_CLUSTER_SIZE = 65536 > +QCOW2_DEFAULT_REFCOUNT_BITS = 16 > +QCOW2_DEFAULT_VERSION = 3 > +QCOW2_FEATURE_NAME_TABLE = 0x6803F857 > +QCOW2_V3_HEADER_LENGTH = 112 # Header length in QEMU 9.0. Must be a > multiple of 8 > +QCOW_OFLAG_COPIED = 1 << 63 > +QEMU_STORAGE_DAEMON = "qemu-storage-daemon" > + > + > +def bitmap_set(bitmap, idx): > +bitmap[int(idx / 8)] |= 1 << (idx % 8) Should use floor division operator (//): bitmap[idx // 8] |= 1 << (idx % 8) Same for bitmap_test(). > + > + > +def bitmap_test(bitmap, idx): bitmap_is_set() can be more clear. For example it is obvious that it returns True if the bit is set > +return (bitmap[int(idx / 8)] & (1 << (idx % 8))) != 0 > + > + > +# create_qcow2_file() expects a raw input file. If we have a different > +# format we can use qemu-storage-daemon to make it appear as raw. > +@contextmanager > +def get_input_as_raw_file(input_file, input_format): > +if input_format == "raw": > +yield input_file > +return > +try: > +temp_dir = tempfile.mkdtemp() > +pid_file = temp_dir + "/pid" > +raw_file = temp_dir + "/raw" This is fragile, better to use os.path.join() > +open(raw_file, "wb").close() > +ret = subprocess.run( > +[ > +QEMU_STORAGE_DAEMON, > +
Re: [PATCH 3/4] iotests: Change imports for Python 3.13
> On 2 Jul 2024, at 17:44, John Snow wrote: > > > > On Tue, Jul 2, 2024 at 7:52 AM Nir Soffer <mailto:nsof...@redhat.com>> wrote: >> On Thu, Jun 27, 2024 at 2:23 AM John Snow > <mailto:js...@redhat.com>> wrote: >> > >> > Python 3.13 isn't out yet, but it's in beta and Fedora is ramping up to >> > make it the default system interpreter for Fedora 41. >> > >> > They moved our cheese for where ContextManager lives; add a conditional >> > to locate it while we support both pre-3.9 and 3.13+. >> > >> > Signed-off-by: John Snow mailto:js...@redhat.com>> >> > --- >> > tests/qemu-iotests/testenv.py| 7 ++- >> > tests/qemu-iotests/testrunner.py | 9 ++--- >> > 2 files changed, 12 insertions(+), 4 deletions(-) >> > >> > diff --git a/tests/qemu-iotests/testenv.py b/tests/qemu-iotests/testenv.py >> > index 588f30a4f14..96d69e56963 100644 >> > --- a/tests/qemu-iotests/testenv.py >> > +++ b/tests/qemu-iotests/testenv.py >> > @@ -25,7 +25,12 @@ >> > import random >> > import subprocess >> > import glob >> > -from typing import List, Dict, Any, Optional, ContextManager >> > +from typing import List, Dict, Any, Optional >> > + >> > +if sys.version_info >= (3, 9): >> > +from contextlib import AbstractContextManager as ContextManager >> > +else: >> > +from typing import ContextManager >> >> It can be cleaner to add a compat module hiding the details so the >> entire project >> can have a single instance of this. Other code will just use: >> >> from compat import ContextManager > > If there were more than two uses, I'd consider it. As it stands, a compat.py > module with just one import conditional in it doesn't seem worth the hassle. > Are there more cases of compatibility goop inside iotests that need to be > factored out to make it worth it? I don’t know about others. For me even one instance is ugly enough :-)
Re: [PATCH 3/4] iotests: Change imports for Python 3.13
On Thu, Jun 27, 2024 at 2:23 AM John Snow wrote: > > Python 3.13 isn't out yet, but it's in beta and Fedora is ramping up to > make it the default system interpreter for Fedora 41. > > They moved our cheese for where ContextManager lives; add a conditional > to locate it while we support both pre-3.9 and 3.13+. > > Signed-off-by: John Snow > --- > tests/qemu-iotests/testenv.py| 7 ++- > tests/qemu-iotests/testrunner.py | 9 ++--- > 2 files changed, 12 insertions(+), 4 deletions(-) > > diff --git a/tests/qemu-iotests/testenv.py b/tests/qemu-iotests/testenv.py > index 588f30a4f14..96d69e56963 100644 > --- a/tests/qemu-iotests/testenv.py > +++ b/tests/qemu-iotests/testenv.py > @@ -25,7 +25,12 @@ > import random > import subprocess > import glob > -from typing import List, Dict, Any, Optional, ContextManager > +from typing import List, Dict, Any, Optional > + > +if sys.version_info >= (3, 9): > +from contextlib import AbstractContextManager as ContextManager > +else: > +from typing import ContextManager It can be cleaner to add a compat module hiding the details so the entire project can have a single instance of this. Other code will just use: from compat import ContextManager > > DEF_GDB_OPTIONS = 'localhost:12345' > > diff --git a/tests/qemu-iotests/testrunner.py > b/tests/qemu-iotests/testrunner.py > index 7b322272e92..2e236c8fa39 100644 > --- a/tests/qemu-iotests/testrunner.py > +++ b/tests/qemu-iotests/testrunner.py > @@ -27,11 +27,14 @@ > import shutil > import sys > from multiprocessing import Pool > -from typing import List, Optional, Any, Sequence, Dict, \ > -ContextManager > - > +from typing import List, Optional, Any, Sequence, Dict > from testenv import TestEnv > > +if sys.version_info >= (3, 9): > +from contextlib import AbstractContextManager as ContextManager > +else: > +from typing import ContextManager > + > > def silent_unlink(path: Path) -> None: > try: > -- > 2.45.0 > >
[PATCH v3 0/2] Consider discard option when writing zeros
Punch holes only when the image is opened with discard=on or discard=unmap. Tested by: - new write-zeroes-unmap iotest on xfs, ext4, and tmpfs - tests/qemu-iotests/check -raw - tests/qemu-iotests/check -qcow2 Changes since v2 - Add write-zeroes-unmap iotest - Fix iotest missing discard=unmap v2 was here: https://lists.nongnu.org/archive/html/qemu-block/2024-06/msg00231.html Nir Soffer (2): qemu-iotest/245: Add missing discard=unmap Consider discard option when writing zeros block/io.c| 9 +- tests/qemu-iotests/245| 2 +- tests/qemu-iotests/tests/write-zeroes-unmap | 127 ++ .../qemu-iotests/tests/write-zeroes-unmap.out | 81 +++ 4 files changed, 214 insertions(+), 5 deletions(-) create mode 100755 tests/qemu-iotests/tests/write-zeroes-unmap create mode 100644 tests/qemu-iotests/tests/write-zeroes-unmap.out -- 2.45.2
[PATCH v3 1/2] qemu-iotests/245: Add missing discard=unmap
The test works since we punch holes by default even when opening the image without discard=on or discard=unmap. Fix the test to enable discard. --- tests/qemu-iotests/245 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/qemu-iotests/245 b/tests/qemu-iotests/245 index a934c9d1e6..f96610f510 100755 --- a/tests/qemu-iotests/245 +++ b/tests/qemu-iotests/245 @@ -590,11 +590,11 @@ class TestBlockdevReopen(iotests.QMPTestCase): # Insert (and remove) a compress filter @iotests.skip_if_unsupported(['compress']) def test_insert_compress_filter(self): # Add an image to the VM: hd (raw) -> hd0 (qcow2) -> hd0-file (file) -opts = {'driver': 'raw', 'node-name': 'hd', 'file': hd_opts(0)} +opts = {'driver': 'raw', 'node-name': 'hd', 'file': hd_opts(0), 'discard': 'unmap'} self.vm.cmd('blockdev-add', conv_keys = False, **opts) # Add a 'compress' filter filter_opts = {'driver': 'compress', 'node-name': 'compress0', -- 2.45.2
[PATCH v3 2/2] Consider discard option when writing zeros
When opening an image with discard=off, we punch hole in the image when writing zeroes, making the image sparse. This breaks users that want to ensure that writes cannot fail with ENOSPACE by using fully allocated images[1]. bdrv_co_pwrite_zeroes() correctly disables BDRV_REQ_MAY_UNMAP if we opened the child without discard=unmap or discard=on. But we don't go through this function when accessing the top node. Move the check down to bdrv_co_do_pwrite_zeroes() which seems to be used in all code paths. This change implements the documented behavior, punching holes only when opening the image with discard=on or discard=unmap. This may not be the best default but can improve it later. The test depends on a file system supporting discard, deallocating the entire file when punching hole with the length of the entire file. Tested with xfs, ext4, and tmpfs. [1] https://lists.nongnu.org/archive/html/qemu-discuss/2024-06/msg3.html Signed-off-by: Nir Soffer --- block/io.c| 9 +- tests/qemu-iotests/tests/write-zeroes-unmap | 127 ++ .../qemu-iotests/tests/write-zeroes-unmap.out | 81 +++ 3 files changed, 213 insertions(+), 4 deletions(-) create mode 100755 tests/qemu-iotests/tests/write-zeroes-unmap create mode 100644 tests/qemu-iotests/tests/write-zeroes-unmap.out diff --git a/block/io.c b/block/io.c index 7217cf811b..301514c880 100644 --- a/block/io.c +++ b/block/io.c @@ -1860,10 +1860,15 @@ bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes, /* By definition there is no user buffer so this flag doesn't make sense */ if (flags & BDRV_REQ_REGISTERED_BUF) { return -EINVAL; } +/* If opened with discard=off we should never unmap. 
*/ +if (!(bs->open_flags & BDRV_O_UNMAP)) { +flags &= ~BDRV_REQ_MAY_UNMAP; +} + /* Invalidate the cached block-status data range if this write overlaps */ bdrv_bsc_invalidate_range(bs, offset, bytes); assert(alignment % bs->bl.request_alignment == 0); head = offset % alignment; @@ -2313,14 +2318,10 @@ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, { IO_CODE(); trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); assert_bdrv_graph_readable(); -if (!(child->bs->open_flags & BDRV_O_UNMAP)) { -flags &= ~BDRV_REQ_MAY_UNMAP; -} - return bdrv_co_pwritev(child, offset, bytes, NULL, BDRV_REQ_ZERO_WRITE | flags); } /* diff --git a/tests/qemu-iotests/tests/write-zeroes-unmap b/tests/qemu-iotests/tests/write-zeroes-unmap new file mode 100755 index 00..7cfeeaf839 --- /dev/null +++ b/tests/qemu-iotests/tests/write-zeroes-unmap @@ -0,0 +1,127 @@ +#!/usr/bin/env bash +# group: quick +# +# Test write zeros unmap. +# +# Copyright (C) Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# + +seq="$(basename $0)" +echo "QA output created by $seq" + +trap _cleanup_test_img exit + +# get standard environment, filters and checks +cd .. +. ./common.rc +. 
./common.filter + +_supported_fmt raw +_supported_proto file +_supported_os Linux + +create_test_image() { +_make_test_img -f $IMGFMT 1m +} + +filter_command() { +_filter_testdir | _filter_qemu_io | _filter_qemu | _filter_hmp +} + +print_disk_usage() { +du -sh $TEST_IMG | _filter_testdir +} + +echo +echo "=== defaults - write zeros ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -z 0 1m"\nquit' \ +| $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT \ +| filter_command +print_disk_usage + +echo +echo "=== defaults - write zeros unmap ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' \ +| $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT \ +| filter_command +print_disk_usage + + +echo +echo "=== defaults - write actual zeros ===" +echo + +create_test_image +echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' \ +| $QEMU -monitor stdio -drive if=none,file=$TEST_IMG,format=$IMGFMT \ +| filter_command +print_disk_usage + +echo +echo "=== discard=off - write zeroes unmap ===" +echo + +create_test_image +echo
Re: [PATCH v2] Consider discard option when writing zeros
On Thu, Jun 27, 2024 at 2:42 PM Kevin Wolf wrote: > Am 26.06.2024 um 18:27 hat Nir Soffer geschrieben: > > On Wed, Jun 26, 2024 at 12:17 PM Daniel P. Berrangé > > > wrote: > > > > > On Mon, Jun 24, 2024 at 06:08:26PM +0200, Kevin Wolf wrote: > > > > Am 24.06.2024 um 17:23 hat Stefan Hajnoczi geschrieben: > > > > > On Wed, Jun 19, 2024 at 08:43:25PM +0300, Nir Soffer wrote: > > > > > > Tested using: > > > > > > > > > > Hi Nir, > > > > > This looks like a good candidate for the qemu-iotests test suite. > > > Adding > > > > > it to the automated tests will protect against future regressions. > > > > > > > > > > Please add the script and the expected output to > > > > > tests/qemu-iotests/test/write-zeroes-unmap and run it using > > > > > `(cd build && tests/qemu-iotests/check write-zeroes-unmap)`. > > > > > > > > > > See the existing test cases in tests/qemu-iotests/ and > > > > > tests/qemu-iotests/tests/ for examples. Some are shell scripts and > > > > > others are Python. I think shell makes sense for this test case. > You > > > > > can copy the test framework boilerplate from an existing test case. > > > > > > > > 'du' can't be used like this in qemu-iotests because it makes > > > > assumptions that depend on the filesystem. A test case replicating > what > > > > Nir did manually would likely fail on XFS with its preallocation. > > > > > > > > Maybe we could operate on a file exposed by the FUSE export that is > > > > backed by qcow2, and then you can use 'qemu-img map' on that qcow2 > image > > > > to verify the allocation status. Somewhat complicated, but I think it > > > > could work. > > > > > > A simpler option would be to use 'du' but with a fuzzy range test, > > > rather than an exact equality test. > > > > > > For the tests which write 1 MB, check the 'du' usage is "at least 1MB", > > > for the tests which expect to unmap blocks, check that the 'du' usage > > > is "less than 256kb". This should be within bounds of xfs speculative > > > allocation. 
> > > > This should work, I'll start with this approach. > > If we're okay with accepting tests that depend on filesystem behaviour, > then 'qemu-img map -f raw --output=json' should be the less risky > approach than checking 'du'. > Unfortunately it does not work since qemu-img map and qemu-nbd reports the allocated area as zero area with no data. I tried this: $ cat test-print-allocation.sh #!/bin/sh qemu=${1:?Usage: $0 qemu-executable} img=/tmp/qemu-test-unmap.img echo echo "discard=unmap - write zeroes" fallocate -l 1m $img echo -e 'qemu-io none0 "write -z 0 1m"\nquit' | $qemu -monitor stdio \ -drive if=none,file=$img,format=raw,discard=unmap >/dev/null echo "du:" du -sh $img echo "qemu-img map:" qemu-img map -f raw --output json $img echo "nbdinfo --map:" nbdinfo --map -- [ qemu-nbd -r -f raw $img ] echo echo "discard=unmap - write zeroes unmap" fallocate -l 1m $img echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' | $qemu -monitor stdio \ -drive if=none,file=$img,format=raw,discard=unmap >/dev/null echo "du:" du -sh $img echo "qemu-img map:" qemu-img map -f raw --output json $img echo "nbdinfo --map:" nbdinfo --map -- [ qemu-nbd -r -f raw $img ] rm $img $ ./test-print-allocation.sh ./qemu-system-x86_64 discard=unmap - write zeroes du: 1.0M /tmp/qemu-test-unmap.img qemu-img map: [{ "start": 0, "length": 1048576, "depth": 0, "present": true, "zero": true, "data": false, "offset": 0}] nbdinfo --map: 0 10485763 hole,zero discard=unmap - write zeroes unmap du: 0 /tmp/qemu-test-unmap.img qemu-img map: [{ "start": 0, "length": 1048576, "depth": 0, "present": true, "zero": true, "data": false, "offset": 0}] nbdinfo --map: 0 10485763 hole,zero
Re: [PATCH v2] Consider discard option when writing zeros
On Wed, Jun 26, 2024 at 12:17 PM Daniel P. Berrangé wrote: > On Mon, Jun 24, 2024 at 06:08:26PM +0200, Kevin Wolf wrote: > > Am 24.06.2024 um 17:23 hat Stefan Hajnoczi geschrieben: > > > On Wed, Jun 19, 2024 at 08:43:25PM +0300, Nir Soffer wrote: > > > > Tested using: > > > > > > Hi Nir, > > > This looks like a good candidate for the qemu-iotests test suite. > Adding > > > it to the automated tests will protect against future regressions. > > > > > > Please add the script and the expected output to > > > tests/qemu-iotests/test/write-zeroes-unmap and run it using > > > `(cd build && tests/qemu-iotests/check write-zeroes-unmap)`. > > > > > > See the existing test cases in tests/qemu-iotests/ and > > > tests/qemu-iotests/tests/ for examples. Some are shell scripts and > > > others are Python. I think shell makes sense for this test case. You > > > can copy the test framework boilerplate from an existing test case. > > > > 'du' can't be used like this in qemu-iotests because it makes > > assumptions that depend on the filesystem. A test case replicating what > > Nir did manually would likely fail on XFS with its preallocation. > > > > Maybe we could operate on a file exposed by the FUSE export that is > > backed by qcow2, and then you can use 'qemu-img map' on that qcow2 image > > to verify the allocation status. Somewhat complicated, but I think it > > could work. > > A simpler option would be to use 'du' but with a fuzzy range test, > rather than an exact equality test. > > For the tests which write 1 MB, check the 'du' usage is "at least 1MB", > for the tests which expect to unmap blocks, check that the 'du' usage > is "less than 256kb". This should be within bounds of xfs speculative > allocation. > This should work, I'll start with this approach.
Re: [PATCH v2] Consider discard option when writing zeros
On Wed, Jun 26, 2024 at 11:42 AM Kevin Wolf wrote: > Am 24.06.2024 um 23:12 hat Nir Soffer geschrieben: > > On Mon, Jun 24, 2024 at 7:08 PM Kevin Wolf wrote: > > > > > Am 24.06.2024 um 17:23 hat Stefan Hajnoczi geschrieben: > > > > On Wed, Jun 19, 2024 at 08:43:25PM +0300, Nir Soffer wrote: > > > > > Tested using: > > > > > > > > Hi Nir, > > > > This looks like a good candidate for the qemu-iotests test suite. > Adding > > > > it to the automated tests will protect against future regressions. > > > > > > > > Please add the script and the expected output to > > > > tests/qemu-iotests/test/write-zeroes-unmap and run it using > > > > `(cd build && tests/qemu-iotests/check write-zeroes-unmap)`. > > > > > > > > See the existing test cases in tests/qemu-iotests/ and > > > > tests/qemu-iotests/tests/ for examples. Some are shell scripts and > > > > others are Python. I think shell makes sense for this test case. You > > > > can copy the test framework boilerplate from an existing test case. > > > > > > 'du' can't be used like this in qemu-iotests because it makes > > > assumptions that depend on the filesystem. A test case replicating what > > > Nir did manually would likely fail on XFS with its preallocation. > > > > This is why I did not try to add a new qemu-iotest yet. > > > > > Maybe we could operate on a file exposed by the FUSE export that is > > > backed by qcow2, and then you can use 'qemu-img map' on that qcow2 > image > > > to verify the allocation status. Somewhat complicated, but I think it > > > could work. > > > > Do we have examples of using the FUSE export? It sounds complicated but > > being able to test on any file system is awesome. The complexity can be > > hidden behind simple test helpers. > > We seem to have a few tests that use it, and then the fuse protocol > implementation, too. 308 and file-io-error look relevant. > > > Another option is to use a specific file system created for the tests, > > for example on a loop device. 
We used userstorage[1] in ovirt to test > > on specific file systems with known sector size. > > Creating loop devices requires root privileges. If I understand > correctly, userstorage solved that by having a setup phase as root > before running the tests as a normal user? We don't really have that in > qemu-iotests. > > Some tests require passwordless sudo and are skipped otherwise, but this > means that in practice they are almost always skipped. > Yes, this is the assumption the storage is being created before running the tests, for example when setting up a development or CI environment, and the tests can run with unprivileged user. > But more important, are you ok with the change? > > > > I'm not sure about not creating sparse images by default - this is not > > consistent with qemu-img convert and qemu-nbd, which do sparsify by > > default. The old behavior seems better. > > Well, your patches make it do what we always claimed it would do, so > that consistency is certainly a good thing. Unmapping on write_zeroes > and ignoring truncate is a weird combination anyway that doesn't really > make any sense to me, so I don't think it's worth preserving. The other > way around could have been more defensible, but that's not how our bug > works. > > Now, if ignoring all discard requests is a good default these days is a > separate question and I'm not sure really. Maybe discard=unmap should > be the default (and apply to both discard are write_zeroes, of course). > OK, lets limit the scope to fix the code to match the current docs. We can tweak the defaults later.
Re: [PATCH v2] Consider discard option when writing zeros
On Mon, Jun 24, 2024 at 7:08 PM Kevin Wolf wrote: > Am 24.06.2024 um 17:23 hat Stefan Hajnoczi geschrieben: > > On Wed, Jun 19, 2024 at 08:43:25PM +0300, Nir Soffer wrote: > > > Tested using: > > > > Hi Nir, > > This looks like a good candidate for the qemu-iotests test suite. Adding > > it to the automated tests will protect against future regressions. > > > > Please add the script and the expected output to > > tests/qemu-iotests/test/write-zeroes-unmap and run it using > > `(cd build && tests/qemu-iotests/check write-zeroes-unmap)`. > > > > See the existing test cases in tests/qemu-iotests/ and > > tests/qemu-iotests/tests/ for examples. Some are shell scripts and > > others are Python. I think shell makes sense for this test case. You > > can copy the test framework boilerplate from an existing test case. > > 'du' can't be used like this in qemu-iotests because it makes > assumptions that depend on the filesystem. A test case replicating what > Nir did manually would likely fail on XFS with its preallocation. > This is why I did not try to add a new qemu-iotest yet. > Maybe we could operate on a file exposed by the FUSE export that is > backed by qcow2, and then you can use 'qemu-img map' on that qcow2 image > to verify the allocation status. Somewhat complicated, but I think it > could work. > Do we have examples of using the FUSE export? It sounds complicated but being able to test on any file system is awesome. The complexity can be hidden behind simple test helpers. Another option is to use a specific file system created for the tests, for example on a loop device. We used userstorage[1] in ovirt to test on specific file systems with known sector size. But more important, are you ok with the change? I'm not sure about not creating sparse images by default - this is not consistent with qemu-img convert and qemu-nbd, which do sparsify by default. The old behavior seems better. [1] https://github.com/nirs/userstorage Nir
Re: [PATCH v2] Consider discard option when writing zeros
On Wed, Jun 19, 2024 at 8:40 PM Nir Soffer wrote: > - Need to run all block tests > Stale note, make check passes
Re: [PATCH v2] Consider discard option when writing zeros
Tested using: $ cat test-unmap.sh #!/bin/sh qemu=${1:?Usage: $0 qemu-executable} img=/tmp/test.raw echo echo "defaults - write zeroes" fallocate -l 1m $img echo -e 'qemu-io none0 "write -z 0 1m"\nquit' | $qemu -monitor stdio \ -drive if=none,file=$img,format=raw >/dev/null du -sh $img echo echo "defaults - write zeroes unmap" fallocate -l 1m $img echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' | $qemu -monitor stdio \ -drive if=none,file=$img,format=raw >/dev/null du -sh $img echo echo "defaults - write actual zeros" fallocate -l 1m $img echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' | $qemu -monitor stdio \ -drive if=none,file=$img,format=raw >/dev/null du -sh $img echo echo "discard=off - write zeroes unmap" fallocate -l 1m $img echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' | $qemu -monitor stdio \ -drive if=none,file=$img,format=raw,discard=off >/dev/null du -sh $img echo echo "detect-zeros=on - write actual zeros" fallocate -l 1m $img echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' | $qemu -monitor stdio \ -drive if=none,file=$img,format=raw,detect-zeroes=on >/dev/null du -sh $img echo echo "detect-zeros=unmap,discard=unmap - write actual zeros" fallocate -l 1m $img echo -e 'qemu-io none0 "write -P 0 0 1m"\nquit' | $qemu -monitor stdio \ -drive if=none,file=$img,format=raw,detect-zeroes=unmap,discard=unmap >/dev/null du -sh $img echo echo "discard=unmap - write zeroes" fallocate -l 1m $img echo -e 'qemu-io none0 "write -z 0 1m"\nquit' | $qemu -monitor stdio \ -drive if=none,file=$img,format=raw,discard=unmap >/dev/null du -sh $img echo echo "discard=unmap - write zeroes unmap" fallocate -l 1m $img echo -e 'qemu-io none0 "write -zu 0 1m"\nquit' | $qemu -monitor stdio \ -drive if=none,file=$img,format=raw,discard=unmap >/dev/null du -sh $img rm $img Before this change: $ cat before.out defaults - write zeroes 1.0M /tmp/test.raw defaults - write zeroes unmap 0 /tmp/test.raw defaults - write actual zeros 1.0M /tmp/test.raw discard=off - write zeroes unmap 0 
/tmp/test.raw detect-zeros=on - write actual zeros 1.0M /tmp/test.raw detect-zeros=unmap,discard=unmap - write actual zeros 0 /tmp/test.raw discard=unmap - write zeroes 1.0M /tmp/test.raw discard=unmap - write zeroes unmap 0 /tmp/test.raw [nsoffer build (consider-discard-option)]$ After this change: $ cat after.out defaults - write zeroes 1.0M /tmp/test.raw defaults - write zeroes unmap 1.0M /tmp/test.raw defaults - write actual zeros 1.0M /tmp/test.raw discard=off - write zeroes unmap 1.0M /tmp/test.raw detect-zeros=on - write actual zeros 1.0M /tmp/test.raw detect-zeros=unmap,discard=unmap - write actual zeros 0 /tmp/test.raw discard=unmap - write zeroes 1.0M /tmp/test.raw discard=unmap - write zeroes unmap 0 /tmp/test.raw Differences: $ diff -u before.out after.out --- before.out 2024-06-19 20:24:09.234083713 +0300 +++ after.out 2024-06-19 20:24:20.526165573 +0300 @@ -3,13 +3,13 @@ 1.0M /tmp/test.raw defaults - write zeroes unmap -0 /tmp/test.raw +1.0M /tmp/test.raw defaults - write actual zeros 1.0M /tmp/test.raw discard=off - write zeroes unmap -0 /tmp/test.raw +1.0M /tmp/test.raw On Wed, Jun 19, 2024 at 8:40 PM Nir Soffer wrote: > When opening an image with discard=off, we punch hole in the image when > writing zeroes, making the image sparse. This breaks users that want to > ensure that writes cannot fail with ENOSPACE by using fully allocated > images. > > bdrv_co_pwrite_zeroes() correctly disable BDRV_REQ_MAY_UNMAP if we > opened the child without discard=unmap or discard=on. But we don't go > through this function when accessing the top node. Move the check down > to bdrv_co_do_pwrite_zeroes() which seems to be used in all code paths. > > Issues: > - We don't punch hole by default, so images are kept allocated. Before > this change we punched holes by default. I'm not sure this is a good > change in behavior. 
> - Need to run all block tests > - Not sure that we have tests covering unmapping, we may need new tests > - We may need new tests to cover this change > > Signed-off-by: Nir Soffer > --- > > Changes since v1: > - Replace the incorrect has_discard change with the right fix > > v1 was here: > https://lists.nongnu.org/archive/html/qemu-block/2024-06/msg00198.html > > block/io.c | 9 + > 1 file changed, 5 insertions(+), 4 deletions(-) > > diff --git a/block/io.c b/block/io.c > index 7217cf811b..301514c880 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -1860,10 +1860,15 @@ bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, > int64_t offset, int64_t bytes, >
[PATCH v2] Consider discard option when writing zeros
When opening an image with discard=off, we punch hole in the image when writing zeroes, making the image sparse. This breaks users that want to ensure that writes cannot fail with ENOSPACE by using fully allocated images. bdrv_co_pwrite_zeroes() correctly disable BDRV_REQ_MAY_UNMAP if we opened the child without discard=unmap or discard=on. But we don't go through this function when accessing the top node. Move the check down to bdrv_co_do_pwrite_zeroes() which seems to be used in all code paths. Issues: - We don't punch hole by default, so images are kept allocated. Before this change we punched holes by default. I'm not sure this is a good change in behavior. - Need to run all block tests - Not sure that we have tests covering unmapping, we may need new tests - We may need new tests to cover this change Signed-off-by: Nir Soffer --- Changes since v1: - Replace the incorrect has_discard change with the right fix v1 was here: https://lists.nongnu.org/archive/html/qemu-block/2024-06/msg00198.html block/io.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/io.c b/block/io.c index 7217cf811b..301514c880 100644 --- a/block/io.c +++ b/block/io.c @@ -1860,10 +1860,15 @@ bdrv_co_do_pwrite_zeroes(BlockDriverState *bs, int64_t offset, int64_t bytes, /* By definition there is no user buffer so this flag doesn't make sense */ if (flags & BDRV_REQ_REGISTERED_BUF) { return -EINVAL; } +/* If opened with discard=off we should never unmap. 
*/ +if (!(bs->open_flags & BDRV_O_UNMAP)) { +flags &= ~BDRV_REQ_MAY_UNMAP; +} + /* Invalidate the cached block-status data range if this write overlaps */ bdrv_bsc_invalidate_range(bs, offset, bytes); assert(alignment % bs->bl.request_alignment == 0); head = offset % alignment; @@ -2313,14 +2318,10 @@ int coroutine_fn bdrv_co_pwrite_zeroes(BdrvChild *child, int64_t offset, { IO_CODE(); trace_bdrv_co_pwrite_zeroes(child->bs, offset, bytes, flags); assert_bdrv_graph_readable(); -if (!(child->bs->open_flags & BDRV_O_UNMAP)) { -flags &= ~BDRV_REQ_MAY_UNMAP; -} - return bdrv_co_pwritev(child, offset, bytes, NULL, BDRV_REQ_ZERO_WRITE | flags); } /* -- 2.45.1
Re: [PATCH] block/file-posix: Consider discard flag when opening
> On 19 Jun 2024, at 11:16, Kevin Wolf wrote: > > Am 18.06.2024 um 23:24 hat Nir Soffer geschrieben: >> Set has_discard only when BDRV_O_UNMAP is not set. With this users that >> want to keep their images fully allocated can disable hole punching >> when writing zeros or discarding using: >> >> -drive file=thick.img,discard=off >> >> This change is not entirely correct since it changes the default discard >> behavior. Previously we always allowed punching holes, but now you have >> must use discard=unmap|on to enable it. We probably need to add the >> BDDR_O_UNMAP flag by default. >> >> make check still works, so maybe we don't have tests for sparsifying >> images, or maybe you need to run special tests that do not run by >> default. We needs tests for keeping images non-sparse. >> >> Signed-off-by: Nir Soffer > > So first of all, I agree with you that this patch is wrong. ;-) > > At first, I failed to understand the problem this is trying to solve. I > put a debug message in handle_aiocb_discard() and tried with which > options it triggers. [1] To me, this looked exactly like it should be. > We only try to discard blocks when discard=unmap is given as an option. > > That leaves the case of write_zeroes. And while at the first sight, the > code looked good, we do seem to have a problem there and it tried to > unmap even with discard=off. 
> >> block/file-posix.c | 2 +- >> 1 file changed, 1 insertion(+), 1 deletion(-) >> >> diff --git a/block/file-posix.c b/block/file-posix.c >> index be25e35ff6..acac2abadc 100644 >> --- a/block/file-posix.c >> +++ b/block/file-posix.c >> @@ -738,11 +738,11 @@ static int raw_open_common(BlockDriverState *bs, QDict >> *options, >> ret = -EINVAL; >> goto fail; >> } >> #endif /* !defined(CONFIG_LINUX_IO_URING) */ >> >> -s->has_discard = true; >> +s->has_discard = !!(bdrv_flags & BDRV_O_UNMAP); >> s->has_write_zeroes = true; >> >> if (fstat(s->fd, &st) < 0) { >> ret = -errno; >> error_setg_errno(errp, errno, "Could not stat file"); > > s->has_discard is about what the host supports, not about the semantics > of the QEMU block node. So this doesn't feel right to me. > > So for the buggy case, write_zeroes, bdrv_co_pwrite_zeroes() has code > that considers the case and clears the ~BDRV_REQ_MAY_UNMAP flags: > >if (!(child->bs->open_flags & BDRV_O_UNMAP)) { >flags &= ~BDRV_REQ_MAY_UNMAP; >} > > But it turns out that we don't necessarily even go through this function > for the top node which has discard=off, so it can't take effect: > > (gdb) bt > #0 0x74f2f144 in __pthread_kill_implementation () at /lib64/libc.so > <http://libc.so/>.6 > #1 0x74ed765e in raise () at /lib64/libc.so <http://libc.so/>.6 > #2 0x74ebf902 in abort () at /lib64/libc.so <http://libc.so/>.6 > #3 0x5615aff0 in raw_do_pwrite_zeroes (bs=0x57f4bcf0, offset=0, > bytes=1048576, flags=BDRV_REQ_MAY_UNMAP, blkdev=false) at > ../block/file-posix.c:3643 > #4 0x5615557e in raw_co_pwrite_zeroes (bs=0x57f4bcf0, offset=0, > bytes=1048576, flags=BDRV_REQ_MAY_UNMAP) at ../block/file-posix.c:3655 > #5 0x560cde2a in bdrv_co_do_pwrite_zeroes (bs=0x57f4bcf0, > offset=0, bytes=1048576, flags=6) at ../block/io.c:1901 > #6 0x560c72f9 in bdrv_aligned_pwritev (child=0x57f51460, > req=0x7fffed5ff800, offset=0, bytes=1048576, align=1, qiov=0x0, > qiov_offset=0, flags=6) at ../block/io.c:2100 > #7 0x560c6b41 in 
bdrv_co_do_zero_pwritev (child=0x57f51460, > offset=0, bytes=1048576, flags=6, req=0x7fffed5ff800) at ../block/io.c:2183 > #8 0x560c6647 in bdrv_co_pwritev_part (child=0x57f51460, > offset=0, bytes=1048576, qiov=0x0, qiov_offset=0, flags=6) at > ../block/io.c:2283 > #9 0x560c634f in bdrv_co_pwritev (child=0x57f51460, offset=0, > bytes=1048576, qiov=0x0, flags=6) at ../block/io.c:2216 > #10 0x560c75b5 in bdrv_co_pwrite_zeroes (child=0x57f51460, > offset=0, bytes=1048576, flags=BDRV_REQ_MAY_UNMAP) at ../block/io.c:2322 > #11 0x56117d24 in raw_co_pwrite_zeroes (bs=0x57f44980, offset=0, > bytes=1048576, flags=BDRV_REQ_MAY_UNMAP) at ../block/raw-format.c:307 > #12 0x560cde2a in bdrv_co_do_pwrite_zeroes (bs=0x57f44980, > offset=0, bytes=1048576, flags=6) at ../block/io.c:1901 > #13 0x560c72f9 in bdrv_aligned_pwritev (child=0x57f513f0, > req=0x7fffed5ffd90, offset=0, bytes=1048576, al
[PATCH] block/file-posix: Consider discard flag when opening
Set has_discard only when BDRV_O_UNMAP is set. With this users that want to keep their images fully allocated can disable hole punching when writing zeros or discarding using: -drive file=thick.img,discard=off This change is not entirely correct since it changes the default discard behavior. Previously we always allowed punching holes, but now you must use discard=unmap|on to enable it. We probably need to add the BDRV_O_UNMAP flag by default. make check still works, so maybe we don't have tests for sparsifying images, or maybe you need to run special tests that do not run by default. We need tests for keeping images non-sparse. Signed-off-by: Nir Soffer --- block/file-posix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/file-posix.c b/block/file-posix.c index be25e35ff6..acac2abadc 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -738,11 +738,11 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, ret = -EINVAL; goto fail; } #endif /* !defined(CONFIG_LINUX_IO_URING) */ -s->has_discard = true; +s->has_discard = !!(bdrv_flags & BDRV_O_UNMAP); s->has_write_zeroes = true; if (fstat(s->fd, &st) < 0) { ret = -errno; error_setg_errno(errp, errno, "Could not stat file"); -- 2.45.1
Re: [PATCH 1/1] block: improve alignment detection and fix 271 test
On Fri, Sep 8, 2023 at 12:54 AM Denis V. Lunev wrote: > Unfortunately 271 IO test is broken if started in non-cached mode. > Is this a real world issue? For example in oVirt you cannot create a disk with size < 4k so there is no way that 4k is not a good alignment. Should we fix the test to reflect real world usage? _reset_img 2083k I guess it works with: _reset_img 2084k Commits > commit a6b257a08e3d72219f03e461a52152672fec0612 > Author: Nir Soffer > Date: Tue Aug 13 21:21:03 2019 +0300 > file-posix: Handle undetectable alignment > and > commit 9c60a5d1978e6dcf85c0e01b50e6f7f54ca09104 > Author: Kevin Wolf > Date: Thu Jul 16 16:26:00 2020 +0200 > block: Require aligned image size to avoid assertion failure > have interesting side effect if used togather. > > If the image size is not multiple of 4k and that image falls under > original constraints of Nil's patch, the image can not be opened > due to the check in the bdrv_check_perm(). > > The patch tries to satisfy the requirements of bdrv_check_perm() > inside raw_probe_alignment(). This is at my opinion better that just > disallowing to run that test in non-cached mode. The operation is legal > by itself. > > Signed-off-by: Denis V. Lunev > CC: Nir Soffer > CC: Kevin Wolf > CC: Hanna Reitz > CC: Alberto Garcia > --- > block/file-posix.c | 17 +++-- > 1 file changed, 15 insertions(+), 2 deletions(-) > > diff --git a/block/file-posix.c b/block/file-posix.c > index b16e9c21a1..988cfdc76c 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -447,8 +447,21 @@ static void raw_probe_alignment(BlockDriverState *bs, > int fd, Error **errp) > for (i = 0; i < ARRAY_SIZE(alignments); i++) { > align = alignments[i]; > if (raw_is_io_aligned(fd, buf, align)) { > -/* Fallback to safe value. */ > -bs->bl.request_alignment = (align != 1) ? align : > max_align; > +if (align != 1) { > +bs->bl.request_alignment = align; > +break; > +} > +/* > + * Fallback to safe value. 
max_align is perfect, but the > size of the device must be multiple of > + * the virtual length of the device. In the other case we > will get a error in > + * bdrv_node_refresh_perm(). > + */ > +for (align = max_align; align > 1; align /= 2) { > +if ((bs->total_sectors * BDRV_SECTOR_SIZE) % align == > 0) { > Moving image size calculation out of the loop would make the intent of the code more clear: if (image_size % align == 0) { Since qemu does not enforce image size alignment, I can see how you create a 512 bytes aligned image and in the case when qemu cannot detect the alignment, we end with align = 4k. In this case this loop would select align = 512, but with the image aligned to some strange value, this loop may select align = 2 or some other value that does not make sense. So I can see using 4k or 512 bytes as a good fallback value, but anything else should not be possible, so maybe we should fix this in bdrv_check_perm()? Nir
[PATCH] libvhost-user: Fix update of signalled_used
When we check if a driver needs a signal, we compare: - used_event: written by the driver each time it consumes an item - new: current idx written to the used ring, updated by us - old: last idx we signaled about We call vring_need_event() which does: return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old); Previously we updated signalled_used on every check, so old was always new - 1. Because used_event cannot be bigger than new_idx, this check becomes (ignoring wrapping): return new_idx == event_idx + 1; Since the driver consumes items at the same time the device produces items, it is very likely (and seen in logs) that the driver used_event is too far behind new_idx and we don't signal the driver. With libblkio virtio-blk-vhost-user driver, if the driver does not get a signal, the libblkio client can hang polling the completion fd. This is very easy to reproduce on some machines and impossible to reproduce on others. Fixed by updating signalled_used only when we signal the driver. Tested using blkio-bench and libblkio client application that used to hang randomly without this change. Buglink: https://gitlab.com/libblkio/libblkio/-/issues/68 Signed-off-by: Nir Soffer --- subprojects/libvhost-user/libvhost-user.c | 23 +-- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/subprojects/libvhost-user/libvhost-user.c b/subprojects/libvhost-user/libvhost-user.c index 8fb61e2df2..5f26d2d378 100644 --- a/subprojects/libvhost-user/libvhost-user.c +++ b/subprojects/libvhost-user/libvhost-user.c @@ -2382,12 +2382,11 @@ vu_queue_empty(VuDev *dev, VuVirtq *vq) } static bool vring_notify(VuDev *dev, VuVirtq *vq) { -uint16_t old, new; -bool v; +uint16_t old, new, used; /* We need to expose used array entries before checking used event. 
*/ smp_mb(); /* Always notify when queue is empty (when feature acknowledge) */ @@ -2398,15 +2397,27 @@ vring_notify(VuDev *dev, VuVirtq *vq) if (!vu_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { return !(vring_avail_flags(vq) & VRING_AVAIL_F_NO_INTERRUPT); } -v = vq->signalled_used_valid; -vq->signalled_used_valid = true; +if (!vq->signalled_used_valid) { +vq->signalled_used_valid = true; +vq->signalled_used = vq->used_idx; +return true; +} + +used = vring_get_used_event(vq); +new = vq->used_idx; old = vq->signalled_used; -new = vq->signalled_used = vq->used_idx; -return !v || vring_need_event(vring_get_used_event(vq), new, old); + +if (vring_need_event(used, new, old)) { +vq->signalled_used_valid = true; +vq->signalled_used = vq->used_idx; +return true; +} + +return false; } static void _vu_queue_notify(VuDev *dev, VuVirtq *vq, bool sync) { if (unlikely(dev->broken) || -- 2.40.1
Re: [Libguestfs] [PATCH v2 1/6] spec: Recommend cap on NBD_REPLY_TYPE_BLOCK_STATUS length
On Sun, Mar 5, 2023 at 10:42 AM Wouter Verhelst wrote: > > On Fri, Mar 03, 2023 at 04:17:40PM -0600, Eric Blake wrote: > > On Fri, Dec 16, 2022 at 10:32:01PM +0300, Vladimir Sementsov-Ogievskiy > > wrote: > > > s-o-b line missed. > > > > I'm not sure if the NBD project has a strict policy on including one, > > but I don't mind adding it. > > I've never required it, mostly because it's something that I myself > always forget, too, so, *shrug*. > > (if there were a way in git to make it add that automatically, that > would help; I've looked but haven't found it) What I'm using in all projects that require signed-off-by is: $ cat .git/hooks/commit-msg #!/bin/sh # Add Signed-off-by trailer. sob=$(git var GIT_AUTHOR_IDENT | sed -n 's/^\(.*>\).*$/Signed-off-by: \1/p') git interpret-trailers --in-place --trailer "$sob" "$1" You can also use a pre-commit hook but the commit-msg hook is more convenient. And in github you can add the DCO application to the project: https://github.com/apps/dco Once installed it will check that all commits are signed off, and provide helpful error messages to contributors. Nir
Re: [PATCH v2 2/5] Support format or cache specific out file
On Tue, Dec 13, 2022 at 8:09 PM Hanna Reitz wrote: > > On 13.12.22 16:56, Nir Soffer wrote: > > On Mon, Dec 12, 2022 at 12:38 PM Hanna Reitz wrote: > >> On 28.11.22 15:15, Nir Soffer wrote: > >>> Extend the test finder to find tests with format (*.out.qcow2) or cache > >>> specific (*.out.nocache) out file. This worked before only for the > >>> numbered tests. > >>> --- > >>>tests/qemu-iotests/findtests.py | 10 -- > >>>1 file changed, 8 insertions(+), 2 deletions(-) > >> This patch lacks an S-o-b, too. > >> > >>> diff --git a/tests/qemu-iotests/findtests.py > >>> b/tests/qemu-iotests/findtests.py > >>> index dd77b453b8..f4344ce78c 100644 > >>> --- a/tests/qemu-iotests/findtests.py > >>> +++ b/tests/qemu-iotests/findtests.py > >>> @@ -38,31 +38,37 @@ def chdir(path: Optional[str] = None) -> > >>> Iterator[None]: > >>>os.chdir(saved_dir) > >>> > >>> > >>>class TestFinder: > >>>def __init__(self, test_dir: Optional[str] = None) -> None: > >>>self.groups = defaultdict(set) > >>> > >>>with chdir(test_dir): > >>>self.all_tests = glob.glob('[0-9][0-9][0-9]') > >>>self.all_tests += [f for f in glob.iglob('tests/*') > >>> - if not f.endswith('.out') and > >>> - os.path.isfile(f + '.out')] > >>> + if self.is_test(f)] > >> So previously a file was only considered a test file if there was a > >> corresponding reference output file (`f + '.out'`), so files without > >> such a reference output aren’t considered test files... > >> > >>>for t in self.all_tests: > >>>with open(t, encoding="utf-8") as f: > >>>for line in f: > >>>if line.startswith('# group: '): > >>>for g in line.split()[2:]: > >>>self.groups[g].add(t) > >>>break > >>> > >>> +def is_test(self, fname: str) -> bool: > >>> +""" > >>> +The tests directory contains tests (no extension) and out files > >>> +(*.out, *.out.{format}, *.out.{option}). > >>> +""" > >>> +return re.search(r'.+\.out(\.\w+)?$', fname) is None > >> ...but this new function doesn’t check that. 
I think we should check it > >> (just whether there’s any variant of `/{fname}\.out(\.\w+)?/` to go with > >> `fname`) so that behavior isn’t changed. > > This means that you cannot add a test without a *.out* file, which may > > be useful when you don't use the out file for validation, but we can > > add this later if needed. > > I don’t think tests work without a reference output, do they? At least > a couple of years ago, the ./check script would refuse to run tests > without a corresponding .out file. This may be true, but most tests do not really need an out file and better be verified by asserting. There are some python tests that have pointless out file with the output of python unittest: $ cat tests/qemu-iotests/tests/nbd-multiconn.out ... -- Ran 3 tests OK This is not only unhelpful (update the output when adding a 4th test) but fragile. if unitests changes the output, maybe adding info about skipped tests, or changing "---" to "", the test will break. But for now I agree the test framework should keep the current behavior. Nir
Re: [PATCH v2 2/5] Support format or cache specific out file
On Mon, Dec 12, 2022 at 12:38 PM Hanna Reitz wrote: > > On 28.11.22 15:15, Nir Soffer wrote: > > Extend the test finder to find tests with format (*.out.qcow2) or cache > > specific (*.out.nocache) out file. This worked before only for the > > numbered tests. > > --- > > tests/qemu-iotests/findtests.py | 10 -- > > 1 file changed, 8 insertions(+), 2 deletions(-) > > This patch lacks an S-o-b, too. > > > diff --git a/tests/qemu-iotests/findtests.py > > b/tests/qemu-iotests/findtests.py > > index dd77b453b8..f4344ce78c 100644 > > --- a/tests/qemu-iotests/findtests.py > > +++ b/tests/qemu-iotests/findtests.py > > @@ -38,31 +38,37 @@ def chdir(path: Optional[str] = None) -> Iterator[None]: > > os.chdir(saved_dir) > > > > > > class TestFinder: > > def __init__(self, test_dir: Optional[str] = None) -> None: > > self.groups = defaultdict(set) > > > > with chdir(test_dir): > > self.all_tests = glob.glob('[0-9][0-9][0-9]') > > self.all_tests += [f for f in glob.iglob('tests/*') > > - if not f.endswith('.out') and > > - os.path.isfile(f + '.out')] > > + if self.is_test(f)] > > So previously a file was only considered a test file if there was a > corresponding reference output file (`f + '.out'`), so files without > such a reference output aren’t considered test files... > > > for t in self.all_tests: > > with open(t, encoding="utf-8") as f: > > for line in f: > > if line.startswith('# group: '): > > for g in line.split()[2:]: > > self.groups[g].add(t) > > break > > > > +def is_test(self, fname: str) -> bool: > > +""" > > +The tests directory contains tests (no extension) and out files > > +(*.out, *.out.{format}, *.out.{option}). > > +""" > > +return re.search(r'.+\.out(\.\w+)?$', fname) is None > > ...but this new function doesn’t check that. I think we should check it > (just whether there’s any variant of `/{fname}\.out(\.\w+)?/` to go with > `fname`) so that behavior isn’t changed. 
This means that you cannot add a test without a *.out* file, which may be useful when you don't use the out file for validation, but we can add this later if needed. I'll change the code to check both conditions.
[PATCH v2 3/5] qemu-img: Add checksum command
The checksum command computes a checksum for disk image content using the blkhash library[1]. The blkhash library is not packaged yet, but it is available via copr[2]. Example run: $ ./qemu-img checksum -p fedora-35.qcow2 6e5c00c995056319d52395f8d91c7f84725ae3da69ffcba4de4c7d22cff713a5 fedora-35.qcow2 The block checksum is constructed by splitting the image into fixed-sized blocks and computing a digest of every block. The image checksum is the digest of all the block digests. The checksum uses internally the "sha256" algorithm but it cannot be compared with checksums created by other tools such as `sha256sum`. The blkhash library supports sparse images, zero detection, and optimizes zero block hashing (they are practically free). The library uses multiple threads to speed up the computation. Compared to `sha256sum`, `qemu-img checksum` is 3.5-4800[3] times faster, depending on the amount of data in the image: $ ./qemu-img info /scratch/50p.raw file format: raw virtual size: 6 GiB (6442450944 bytes) disk size: 2.91 GiB $ hyperfine -w2 -r5 -p "sleep 1" "./qemu-img checksum /scratch/50p.raw" \ "sha256sum /scratch/50p.raw" Benchmark 1: ./qemu-img checksum /scratch/50p.raw Time (mean ± σ): 1.849 s ± 0.037 s[User: 7.764 s, System: 0.962 s] Range (min … max):1.813 s … 1.908 s5 runs Benchmark 2: sha256sum /scratch/50p.raw Time (mean ± σ): 14.585 s ± 0.072 s[User: 13.537 s, System: 1.003 s] Range (min … max): 14.501 s … 14.697 s5 runs Summary './qemu-img checksum /scratch/50p.raw' ran 7.89 ± 0.16 times faster than 'sha256sum /scratch/50p.raw' The new command is available only when `blkhash` is available during build. 
To test the new command please install the `blkhash-devel` package: $ dnf copr enable nsoffer/blkhash $ sudo dnf install blkhash-devel [1] https://gitlab.com/nirs/blkhash [2] https://copr.fedorainfracloud.org/coprs/nsoffer/blkhash/ [3] Computing checksum for 8T empty image: qemu-img checksum: 3.7s, sha256sum (estimate): 17,749s Signed-off-by: Nir Soffer --- docs/tools/qemu-img.rst | 24 ++ meson.build | 10 ++- meson_options.txt | 2 + qemu-img-cmds.hx| 8 ++ qemu-img.c | 183 5 files changed, 226 insertions(+), 1 deletion(-) diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst index 15aeddc6d8..d856785ecc 100644 --- a/docs/tools/qemu-img.rst +++ b/docs/tools/qemu-img.rst @@ -347,20 +347,44 @@ Command description: Check completed, image is corrupted 3 Check completed, image has leaked clusters, but is not corrupted 63 Checks are not supported by the image format If ``-r`` is specified, exit codes representing the image state refer to the state after (the attempt at) repairing it. That is, a successful ``-r all`` will yield the exit code 0, independently of the image state before. +.. option:: checksum [--object OBJECTDEF] [--image-opts] [-f FMT] [-T SRC_CACHE] [-p] FILENAME + + Print a checksum for image *FILENAME* guest visible content. Images with + different format or settings will have the same checksum. + + The format is probed unless you specify it by ``-f``. + + The checksum is computed for guest visible content. Allocated areas full of + zeroes, zero clusters, and unallocated areas are read as zeros so they will + have the same checksum. Images with single or multiple files or backing files + will have the same checksums if the guest will see the same content when + reading the image. + + Image metadata that is not visible to the guest such as dirty bitmaps does + not affect the checksum. + + Computing a checksum requires a read-only image. 
You cannot compute a + checksum of an active image used by a guest, but you can compute a checksum + of a guest during pull mode incremental backup using NBD URL. + + The checksum is not compatible with other tools such as *sha256sum* for + optimization purposes; using multithreading and optimized handling of zero + areas. For more info please see https://gitlab.com/nirs/blkhash. + .. option:: commit [--object OBJECTDEF] [--image-opts] [-q] [-f FMT] [-t CACHE] [-b BASE] [-r RATE_LIMIT] [-d] [-p] FILENAME Commit the changes recorded in *FILENAME* in its base image or backing file. If the backing file is smaller than the snapshot, then the backing file will be resized to be the same size as the snapshot. If the snapshot is smaller than the backing file, the backing file will not be truncated. If you want the backing file to match the size of the smaller snapshot, you can safely truncate it yourself once the commit operation successfully completes. The image *FILENAME* is emptied after the operation has succeeded. If you do diff --git a/meson.b
[PATCH v2 0/5] Add qemu-img checksum command using blkhash
Since blkhash is available only via copr now, the new command is added as optional feature, built only if blkhash-devel package is installed. Changes since v1 (Hanna): - Move IO_BUF_SIZE to top of the file - Extend TestFinder to support format or cache specific out files - Improve online help (note about optimization and link to the blkhash project) - Guard blkhash.h include with CONFIG_BLKHASH - Using user_creatable_process_cmdline() instead of user_creatable_add_from_str() - Rename ret to exit_code - Add static assert to ensure that read buffer is aligned to block size - Drop unneeded pnum variable - Change test to work like other tests; use iotest.imgfmt and iotest.cachemode - Simplify test to test only raw and qcow2 format using file protocol - Fix code style issues (multi-line comments, missing braces) - Make error checking more clear (checksum_block_status(s) < 0) v1: https://lists.nongnu.org/archive/html/qemu-block/2022-09/msg00021.html v1 discussion: - https://lists.nongnu.org/archive/html/qemu-block/2022-10/msg00602.html - https://lists.nongnu.org/archive/html/qemu-block/2022-10/msg00603.html - https://lists.nongnu.org/archive/html/qemu-block/2022-10/msg00604.html - https://lists.nongnu.org/archive/html/qemu-block/2022-11/msg00171.html - https://lists.nongnu.org/archive/html/qemu-block/2022-11/msg00173.html Nir Soffer (5): qemu-img.c: Move IO_BUF_SIZE to the top of the file Support format or cache specific out file qemu-img: Add checksum command iotests: Test qemu-img checksum qemu-img: Speed up checksum docs/tools/qemu-img.rst | 24 ++ meson.build | 10 +- meson_options.txt | 2 + qemu-img-cmds.hx | 8 + qemu-img.c| 390 +- tests/qemu-iotests/findtests.py | 10 +- tests/qemu-iotests/tests/qemu-img-checksum| 63 +++ .../tests/qemu-img-checksum.out.qcow2 | 11 + .../tests/qemu-img-checksum.out.raw | 10 + 9 files changed, 523 insertions(+), 5 deletions(-) create mode 100755 tests/qemu-iotests/tests/qemu-img-checksum create mode 100644 
tests/qemu-iotests/tests/qemu-img-checksum.out.qcow2 create mode 100644 tests/qemu-iotests/tests/qemu-img-checksum.out.raw -- 2.38.1
[PATCH v2 5/5] qemu-img: Speed up checksum
Add coroutine based loop inspired by `qemu-img convert` design. Changes compared to `qemu-img convert`: - State for the entire image is kept in ImgChecksumState - State for single worker coroutine is kept in ImgChecksumworker. - "Writes" are always in-order, ensured using a queue. - Calling block status once per image extent, when the current extent is consumed by the workers. - Using 1m buffer size - testings shows that this gives best read performance both with buffered and direct I/O. - Number of coroutines is not configurable. Testing does not show improvement when using more than 8 coroutines. - Progress include entire image, not only the allocated state. Comparing to the simple read loop shows that this version is up to 4.67 times faster when computing a checksum for an image full of zeroes. For real images it is 1.59 times faster with direct I/O, and with buffered I/O there is no difference. Test results on Dell PowerEdge R640 in a CentOS Stream 9 container: | image| size | i/o | before | after | change | |--|--|---|||| | zero [1] | 6g | buffered | 1.600s ±0.014s | 0.342s ±0.016s | x4.67 | | zero | 6g | direct| 4.684s ±0.093s | 2.211s ±0.009s | x2.12 | | real [2] | 6g | buffered | 1.841s ±0.075s | 1.806s ±0.036s | x1.02 | | real | 6g | direct| 3.094s ±0.079s | 1.947s ±0.017s | x1.59 | | nbd [3] | 6g | buffered | 2.455s ±0.183s | 1.808s ±0.016s | x1.36 | | nbd | 6g | direct| 3.540s ±0.020s | 1.749s ±0.018s | x2.02 | [1] raw image full of zeroes [2] raw fedora 35 image with additional random data, 50% full [3] image [2] exported by qemu-nbd via unix socket Signed-off-by: Nir Soffer --- qemu-img.c | 350 ++--- 1 file changed, 277 insertions(+), 73 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index 4b4ca7add3..5f63a769a9 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1618,50 +1618,296 @@ out: qemu_vfree(buf2); blk_unref(blk2); out2: blk_unref(blk1); out3: qemu_progress_end(); return ret; } #ifdef CONFIG_BLKHASH + +#define CHECKSUM_COROUTINES 8 +#define 
CHECKSUM_BUF_SIZE (1 * MiB) +#define CHECKSUM_ZERO_SIZE MIN(16 * GiB, SIZE_MAX) + +typedef struct ImgChecksumState ImgChecksumState; + +typedef struct ImgChecksumWorker { +QTAILQ_ENTRY(ImgChecksumWorker) entry; +ImgChecksumState *state; +Coroutine *co; +uint8_t *buf; + +/* The current chunk. */ +int64_t offset; +int64_t length; +bool zero; + +/* + * Always true for zero extent, false for data extent. Set to true + * when reading the chunk completes. + */ +bool ready; +} ImgChecksumWorker; + +struct ImgChecksumState { +const char *filename; +BlockBackend *blk; +BlockDriverState *bs; +int64_t total_size; + +/* Current extent, modified in checksum_co_next. */ +int64_t offset; +int64_t length; +bool zero; + +int running_coroutines; +CoMutex lock; +ImgChecksumWorker workers[CHECKSUM_COROUTINES]; + +/* + * Ensure in-order updates. Update are scheduled at the tail of the + * queue and processed from the head of the queue when a worker is + * ready. + */ +QTAILQ_HEAD(, ImgChecksumWorker) update_queue; + +struct blkhash *hash; +int ret; +}; + +static int checksum_block_status(ImgChecksumState *s) +{ +int64_t length; +int status; + +/* Must be called when current extent is consumed. */ +assert(s->length == 0); + +status = bdrv_block_status_above(s->bs, NULL, s->offset, + s->total_size - s->offset, &length, NULL, + NULL); +if (status < 0) { +error_report("Error checking status at offset %" PRId64 " for %s", + s->offset, s->filename); +s->ret = status; +return -1; +} + +assert(length > 0); + +s->length = length; +s->zero = !!(status & BDRV_BLOCK_ZERO); + +return 0; +} + +/** + * Grab the next chunk from the current extent, getting the next extent if + * needed, and schecule the next update at the end fo the update queue. + * + * Retrun true if the worker has work to do, false if the worker has + * finished or there was an error getting the next extent. 
+ */ +static coroutine_fn bool checksum_co_next(ImgChecksumWorker *w) +{ +ImgChecksumState *s = w->state; + +qemu_co_mutex_lock(&s->lock); + +if (s->offset == s->total_size || s->ret != -EINPROGRESS) { +qemu_co_mutex_unlock(&s->lock); +return false; +} + +if (s->length == 0 && checksum_block_status(s) < 0) { +qemu_co_mutex_unlock(&s->lock); +return false; +} + +/* Grab one chunk from current extent. */ +w->offset
[PATCH v2 4/5] iotests: Test qemu-img checksum
Add simple tests computing a checksum for an image with all kinds of extents in raw and qcow2 formats. The test can be extended later for other formats, format options (e.g. compressed qcow2), protocols (e.g. nbd), and an image with a backing chain, but I'm not sure this is really needed. To help debugging in case of failures, the output includes a json map of the test image. Signed-off-by: Nir Soffer --- tests/qemu-iotests/tests/qemu-img-checksum| 63 +++ .../tests/qemu-img-checksum.out.qcow2 | 11 .../tests/qemu-img-checksum.out.raw | 10 +++ 3 files changed, 84 insertions(+) create mode 100755 tests/qemu-iotests/tests/qemu-img-checksum create mode 100644 tests/qemu-iotests/tests/qemu-img-checksum.out.qcow2 create mode 100644 tests/qemu-iotests/tests/qemu-img-checksum.out.raw diff --git a/tests/qemu-iotests/tests/qemu-img-checksum b/tests/qemu-iotests/tests/qemu-img-checksum new file mode 100755 index 00..3577a0bc41 --- /dev/null +++ b/tests/qemu-iotests/tests/qemu-img-checksum @@ -0,0 +1,63 @@ +#!/usr/bin/env python3 +# group: rw auto quick +# +# Test cases for qemu-img checksum. +# +# Copyright (C) 2022 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +import re + +import iotests + +from iotests import ( +filter_testfiles, +qemu_img, +qemu_img_log, +qemu_io, +) + + +def checksum_available(): +out = qemu_img("--help").stdout +return re.search(r"\bchecksum .+ filename\b", out) is not None + + +if not checksum_available(): +iotests.notrun("checksum command not available") + +iotests.script_initialize( +supported_fmts=["raw", "qcow2"], +supported_cache_modes=["none", "writeback"], +supported_protocols=["file"], +) + +print("=== Create test image ===\n") + +disk = iotests.file_path('disk') +qemu_img("create", "-f", iotests.imgfmt, disk, "10m") +qemu_io("-f", iotests.imgfmt, +"-c", "write -P 0x1 0 2m", # data +"-c", "write -P 0x0 2m 2m", # data with zeroes +"-c", "write -z 4m 2m", # zero allocated +"-c", "write -z -u 6m 2m", # zero hole +# unallocated +disk) +print(filter_testfiles(disk)) +qemu_img_log("map", "--output", "json", disk) + +print("=== Compute checksum ===\n") + +qemu_img_log("checksum", "-T", iotests.cachemode, disk) diff --git a/tests/qemu-iotests/tests/qemu-img-checksum.out.qcow2 b/tests/qemu-iotests/tests/qemu-img-checksum.out.qcow2 new file mode 100644 index 00..02b9616e5b --- /dev/null +++ b/tests/qemu-iotests/tests/qemu-img-checksum.out.qcow2 @@ -0,0 +1,11 @@ +=== Create test image === + +TEST_DIR/PID-disk +[{ "start": 0, "length": 4194304, "depth": 0, "present": true, "zero": false, "data": true, "offset": 327680}, +{ "start": 4194304, "length": 4194304, "depth": 0, "present": true, "zero": true, "data": false}, +{ "start": 8388608, "length": 2097152, "depth": 0, "present": false, "zero": true, "data": false}] + +=== Compute checksum === + +57cd8ef0cfad106d737f8fb0de3a0306a8a1a41db7bf7c0c36e2dfe75ee9bd26 TEST_DIR/PID-disk + diff --git a/tests/qemu-iotests/tests/qemu-img-checksum.out.raw b/tests/qemu-iotests/tests/qemu-img-checksum.out.raw new file mode 100644 index 00..6294e4dace --- /dev/null +++ b/tests/qemu-iotests/tests/qemu-img-checksum.out.raw @@ -0,0 +1,10 @@ +=== Create test image === + 
+TEST_DIR/PID-disk +[{ "start": 0, "length": 4194304, "depth": 0, "present": true, "zero": false, "data": true, "offset": 0}, +{ "start": 4194304, "length": 6291456, "depth": 0, "present": true, "zero": true, "data": false, "offset": 4194304}] + +=== Compute checksum === + +57cd8ef0cfad106d737f8fb0de3a0306a8a1a41db7bf7c0c36e2dfe75ee9bd26 TEST_DIR/PID-disk + -- 2.38.1
[PATCH v2 2/5] Support format or cache specific out file
Extend the test finder to find tests with format (*.out.qcow2) or cache specific (*.out.nocache) out file. This worked before only for the numbered tests. --- tests/qemu-iotests/findtests.py | 10 -- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/tests/qemu-iotests/findtests.py b/tests/qemu-iotests/findtests.py index dd77b453b8..f4344ce78c 100644 --- a/tests/qemu-iotests/findtests.py +++ b/tests/qemu-iotests/findtests.py @@ -38,31 +38,37 @@ def chdir(path: Optional[str] = None) -> Iterator[None]: os.chdir(saved_dir) class TestFinder: def __init__(self, test_dir: Optional[str] = None) -> None: self.groups = defaultdict(set) with chdir(test_dir): self.all_tests = glob.glob('[0-9][0-9][0-9]') self.all_tests += [f for f in glob.iglob('tests/*') - if not f.endswith('.out') and - os.path.isfile(f + '.out')] + if self.is_test(f)] for t in self.all_tests: with open(t, encoding="utf-8") as f: for line in f: if line.startswith('# group: '): for g in line.split()[2:]: self.groups[g].add(t) break +def is_test(self, fname: str) -> bool: +""" +The tests directory contains tests (no extension) and out files +(*.out, *.out.{format}, *.out.{option}). +""" +return re.search(r'.+\.out(\.\w+)?$', fname) is None + def add_group_file(self, fname: str) -> None: with open(fname, encoding="utf-8") as f: for line in f: line = line.strip() if (not line) or line[0] == '#': continue words = line.split() test_file = self.parse_test_name(words[0]) -- 2.38.1
[PATCH v2 1/5] qemu-img.c: Move IO_BUF_SIZE to the top of the file
This macro is used by various commands (compare, convert, rebase) but it is defined somewhere in the middle of the file. I'm going to use it in the new checksum command so let's clean up a bit before that. --- qemu-img.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index a9b3a8103c..c03d6b4b31 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -49,20 +49,21 @@ #include "block/block_int.h" #include "block/blockjob.h" #include "block/qapi.h" #include "crypto/init.h" #include "trace/control.h" #include "qemu/throttle.h" #include "block/throttle-groups.h" #define QEMU_IMG_VERSION "qemu-img version " QEMU_FULL_VERSION \ "\n" QEMU_COPYRIGHT "\n" +#define IO_BUF_SIZE (2 * MiB) typedef struct img_cmd_t { const char *name; int (*handler)(int argc, char **argv); } img_cmd_t; enum { OPTION_OUTPUT = 256, OPTION_BACKING_CHAIN = 257, OPTION_OBJECT = 258, @@ -1281,22 +1282,20 @@ static int compare_buffers(const uint8_t *buf1, const uint8_t *buf2, if (!!memcmp(buf1 + i, buf2 + i, len) != res) { break; } i += len; } *pnum = i; return res; } -#define IO_BUF_SIZE (2 * MiB) - /* * Check if passed sectors are empty (not allocated or contain only 0 bytes) * * Intended for use by 'qemu-img compare': Returns 0 in case sectors are * filled with 0, 1 if sectors contain non-zero data (this is a comparison * failure), and 4 on error (the exit status for read errors), after emitting * an error message. * * @param blk: BlockBackend for the image * @param offset: Starting offset to check -- 2.38.1
Re: [PATCH 2/3] iotests: Test qemu-img checksum
On Mon, Nov 7, 2022 at 1:41 PM Hanna Reitz wrote: > On 30.10.22 18:38, Nir Soffer wrote: > > On Wed, Oct 26, 2022 at 4:31 PM Hanna Reitz wrote: > > > > On 01.09.22 16:32, Nir Soffer wrote: > > > Add simple tests creating an image with all kinds of extents, > > different > > > formats, different backing chain, different protocol, and different > > > image options. Since all images have the same guest visible > > content they > > > must have the same checksum. > > > > > > To help debugging in case of failures, the output includes a > > json map of > > > every test image. > > > > > > Signed-off-by: Nir Soffer > > > --- > > > tests/qemu-iotests/tests/qemu-img-checksum| 149 > > ++ > > > .../qemu-iotests/tests/qemu-img-checksum.out | 74 + > > > 2 files changed, 223 insertions(+) > > > create mode 100755 tests/qemu-iotests/tests/qemu-img-checksum > > > create mode 100644 tests/qemu-iotests/tests/qemu-img-checksum.out > > > > > > diff --git a/tests/qemu-iotests/tests/qemu-img-checksum > > b/tests/qemu-iotests/tests/qemu-img-checksum > > > new file mode 100755 > > > index 00..3a85ba33f2 > > > --- /dev/null > > > +++ b/tests/qemu-iotests/tests/qemu-img-checksum > > > @@ -0,0 +1,149 @@ > > > +#!/usr/bin/env python3 > > > +# group: rw auto quick > > > +# > > > +# Test cases for qemu-img checksum. > > > +# > > > +# Copyright (C) 2022 Red Hat, Inc. > > > +# > > > +# This program is free software; you can redistribute it and/or > > modify > > > +# it under the terms of the GNU General Public License as > > published by > > > +# the Free Software Foundation; either version 2 of the License, > or > > > +# (at your option) any later version. > > > +# > > > +# This program is distributed in the hope that it will be useful, > > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of > > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > > +# GNU General Public License for more details. 
> > > +# > > > +# You should have received a copy of the GNU General Public > License > > > +# along with this program. If not, see > > <http://www.gnu.org/licenses/>. > > > + > > > +import re > > > + > > > +import iotests > > > + > > > +from iotests import ( > > > +filter_testfiles, > > > +qemu_img, > > > +qemu_img_log, > > > +qemu_io, > > > +qemu_nbd_popen, > > > +) > > > + > > > + > > > +def checksum_available(): > > > +out = qemu_img("--help").stdout > > > +return re.search(r"\bchecksum .+ filename\b", out) is not None > > > + > > > + > > > +if not checksum_available(): > > > +iotests.notrun("checksum command not available") > > > + > > > +iotests.script_initialize( > > > +supported_fmts=["raw", "qcow2"], > > > +supported_cache_modes=["none", "writeback"], > > > > It doesn’t work with writeback, though, because it uses -T none > below. > > > > > > Good point > > > > > > Which by the way is a heavy cost, because I usually run tests in > > tmpfs, > > where this won’t work. Is there any way of not doing the -T none > > below? > > > > > > Testing using tempfs is problematic since you cannot test -T none.In > > oVirt > > we alway use /var/tmp which usually uses something that supports > > direct I/O. > > > > Do we have a way to specify cache mode in the tests, so we can use -T > none > > only when the option is set? > > `./check` has a `-c` option (e.g. `./check -c none`), which lands in > `iotests.cachemode`. That isn’t automatically passed to qemu-img calls, > but you can do it manually (i.e. `qemu_img_log("checksum", "-T", > iotests.cachemode, disk_top)` instead of `"-T", "none"`). > Ok, I will change to use the current cache setting. > > > > > +supported_p
Re: [PATCH 1/3] qemu-img: Add checksum command
On Mon, Nov 7, 2022 at 12:20 PM Hanna Reitz wrote: > On 30.10.22 18:37, Nir Soffer wrote: > > On Wed, Oct 26, 2022 at 4:00 PM Hanna Reitz wrote: > > > > On 01.09.22 16:32, Nir Soffer wrote: > [...] > > > --- > > > docs/tools/qemu-img.rst | 22 + > > > meson.build | 10 ++- > > > meson_options.txt | 2 + > > > qemu-img-cmds.hx| 8 ++ > > > qemu-img.c | 191 > > > > > 5 files changed, 232 insertions(+), 1 deletion(-) > > > > > > diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst > > > index 85a6e05b35..8be9c45cbf 100644 > > > --- a/docs/tools/qemu-img.rst > > > +++ b/docs/tools/qemu-img.rst > > > @@ -347,20 +347,42 @@ Command description: > > > Check completed, image is corrupted > > > 3 > > > Check completed, image has leaked clusters, but is not > > corrupted > > > 63 > > > Checks are not supported by the image format > > > > > > If ``-r`` is specified, exit codes representing the image > > state refer to the > > > state after (the attempt at) repairing it. That is, a > > successful ``-r all`` > > > will yield the exit code 0, independently of the image state > > before. > > > > > > +.. option:: checksum [--object OBJECTDEF] [--image-opts] [-f > > FMT] [-T SRC_CACHE] [-p] FILENAME > > > + > > > + Print a checksum for image *FILENAME* guest visible content. > > > > Why not say which kind of checksum it is? > > > > > > Do you mean the algorithm used? This may be confusing, for example we > > write > > > >Print a sha256 checksum ... > > > > User will expect to get the same result from "sha256sum disk.img". How > > about > > > >Print a blkhash checksum ... > > > > And add a link to the blkhash project? > > I did mean sha256, but if it isn’t pure sha256, then a link to any > description how it is computed would be good, I think. > Ok, will link to https://gitlab.com/nirs/blkhash [...] > > > > + The checksum is not compatible with other tools such as > > *sha256sum*. > > > > Why not? I can see it differs even for raw images, but why? 
I would > > have very much assumed that this gives me exactly what sha256sum > > in the > > guest on the guest device would yield. > > > > > > The blkhash is a construction based on other cryptographic hash > > functions (e.g. sha256). > > The way the hash is constructed is explained here: > > https://gitlab.com/nirs/blkhash/-/blob/master/blkhash.py#L52 > > > > We can provide a very slow version using a single thread and no zero > > optimization > > that will create the same hash as sha256sum for raw image. > > Ah, right. Yes, especially zero optimization is likely to make a huge > difference. Thanks for the explanation! > > Maybe that could be mentioned here as a side note, though? E.g. “The > checksum is not compatible with other tools such as *sha256sum* for > optimization purposes (to allow multithreading and optimized handling of > zero areas).”? > Ok, I will improve the text in the next version. [...] > > In blksum I do not allow changing the block size. > > > > I'll add an assert in the next version to keeps this default optimal. > > Thanks! (Static assert should work, right?) > I think it should Nir
Re: [PATCH 3/3] qemu-img: Speed up checksum
On Sun, Oct 30, 2022 at 7:38 PM Nir Soffer wrote: > On Wed, Oct 26, 2022 at 4:54 PM Hanna Reitz wrote: > >> On 01.09.22 16:32, Nir Soffer wrote: >> > [...] > > +/* The current chunk. */ >> > +int64_t offset; >> > +int64_t length; >> > +bool zero; >> > + >> > +/* Always true for zero extent, false for data extent. Set to true >> > + * when reading the chunk completes. */ >> >> Qemu codestyle requires /* and */ to be on separate lines for multi-line >> comments (see checkpatch.pl). >> > > I'll change that. Do we have a good way to run checkpatch.pl when using > git-publish? > > Maybe a way to run checkpatch.pl on all patches generated by git publish > automatically? > I found https://blog.vmsplice.net/2011/03/how-to-automatically-run-checkpatchpl.html and it seems to work well.
Re: [PATCH 1/3] qemu-img: Add checksum command
On Wed, Oct 26, 2022 at 4:00 PM Hanna Reitz wrote: > On 01.09.22 16:32, Nir Soffer wrote: > > The checksum command compute a checksum for disk image content using the > > blkhash library[1]. The blkhash library is not packaged yet, but it is > > available via copr[2]. > > > > Example run: > > > > $ ./qemu-img checksum -p fedora-35.qcow2 > > 6e5c00c995056319d52395f8d91c7f84725ae3da69ffcba4de4c7d22cff713a5 > fedora-35.qcow2 > > > > The block checksum is constructed by splitting the image to fixed sized > > blocks and computing a digest of every block. The image checksum is the > > digest of the all block digests. > > > > The checksum uses internally the "sha256" algorithm but it cannot be > > compared with checksums created by other tools such as `sha256sum`. > > > > The blkhash library supports sparse images, zero detection, and > > optimizes zero block hashing (they are practically free). The library > > uses multiple threads to speed up the computation. > > > > Comparing to `sha256sum`, `qemu-img checksum` is 3.5-4800[3] times > > faster, depending on the amount of data in the image: > > > > $ ./qemu-img info /scratch/50p.raw > > file format: raw > > virtual size: 6 GiB (6442450944 bytes) > > disk size: 2.91 GiB > > > > $ hyperfine -w2 -r5 -p "sleep 1" "./qemu-img checksum > /scratch/50p.raw" \ > > "sha256sum /scratch/50p.raw" > > Benchmark 1: ./qemu-img checksum /scratch/50p.raw > >Time (mean ± σ): 1.849 s ± 0.037 s[User: 7.764 s, > System: 0.962 s] > >Range (min … max):1.813 s … 1.908 s5 runs > > > > Benchmark 2: sha256sum /scratch/50p.raw > >Time (mean ± σ): 14.585 s ± 0.072 s[User: 13.537 s, > System: 1.003 s] > >Range (min … max): 14.501 s … 14.697 s5 runs > > > > Summary > >'./qemu-img checksum /scratch/50p.raw' ran > > 7.89 ± 0.16 times faster than 'sha256sum /scratch/50p.raw' > > > > The new command is available only when `blkhash` is available during > > build. 
To test the new command please install the `blkhash-devel` > > package: > > > > $ dnf copr enable nsoffer/blkhash > > $ sudo dnf install blkhash-devel > > > > [1] https://gitlab.com/nirs/blkhash > > [2] https://copr.fedorainfracloud.org/coprs/nsoffer/blkhash/ > > [3] Computing checksum for 8T empty image: qemu-img checksum: 3.7s, > > sha256sum (estimate): 17,749s > > > > Signed-off-by: Nir Soffer > > --- > > docs/tools/qemu-img.rst | 22 + > > meson.build | 10 ++- > > meson_options.txt | 2 + > > qemu-img-cmds.hx| 8 ++ > > qemu-img.c | 191 > > 5 files changed, 232 insertions(+), 1 deletion(-) > > > > diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst > > index 85a6e05b35..8be9c45cbf 100644 > > --- a/docs/tools/qemu-img.rst > > +++ b/docs/tools/qemu-img.rst > > @@ -347,20 +347,42 @@ Command description: > > Check completed, image is corrupted > > 3 > > Check completed, image has leaked clusters, but is not corrupted > > 63 > > Checks are not supported by the image format > > > > If ``-r`` is specified, exit codes representing the image state > refer to the > > state after (the attempt at) repairing it. That is, a successful > ``-r all`` > > will yield the exit code 0, independently of the image state before. > > > > +.. option:: checksum [--object OBJECTDEF] [--image-opts] [-f FMT] [-T > SRC_CACHE] [-p] FILENAME > > + > > + Print a checksum for image *FILENAME* guest visible content. > > Why not say which kind of checksum it is? > Do you mean the algorithm used? This may be confusing, for example we write Print a sha256 checksum ... User will expect to get the same result from "sha256sum disk.img". How about Print a blkhash checksum ... And add a link to the blkhash project? > > > Images > with > > + different format or settings wil have the same checksum. > > s/wil/will/ > Fixing > > > + > > + The format is probed unless you specify it by ``-f``. > > + > > + The checksum is computed for guest visible content. 
Allocated areas > full of > > + zeroes, zero clusters, and unallocated
Re: [PATCH 3/3] qemu-img: Speed up checksum
On Wed, Oct 26, 2022 at 4:54 PM Hanna Reitz wrote: > On 01.09.22 16:32, Nir Soffer wrote: > > Add coroutine based loop inspired by `qemu-img convert` design. > > > > Changes compared to `qemu-img convert`: > > > > - State for the entire image is kept in ImgChecksumState > > > > - State for single worker coroutine is kept in ImgChecksumworker. > > > > - "Writes" are always in-order, ensured using a queue. > > > > - Calling block status once per image extent, when the current extent is > >consumed by the workers. > > > > - Using 1m buffer size - testings shows that this gives best read > >performance both with buffered and direct I/O. > > Why does patch 1 then choose to use 2 MB? > The first patch uses sync I/O, and in this case 2 MB is a little faster. > > - Number of coroutines is not configurable. Testing does not show > >improvement when using more than 8 coroutines. > > > > - Progress include entire image, not only the allocated state. > > > > Comparing to the simple read loop shows that this version is up to 4.67 > > times faster when computing a checksum for an image full of zeroes. For > > real images it is 1.59 times faster with direct I/O, and with buffered > > I/O there is no difference. 
> > > > Test results on Dell PowerEdge R640 in a CentOS Stream 9 container: > > > > | image| size | i/o | before | after | change > | > > > |--|--|---|||| > > | zero [1] | 6g | buffered | 1.600s ±0.014s | 0.342s ±0.016s | x4.67 > | > > | zero | 6g | direct| 4.684s ±0.093s | 2.211s ±0.009s | x2.12 > | > > | real [2] | 6g | buffered | 1.841s ±0.075s | 1.806s ±0.036s | x1.02 > | > > | real | 6g | direct| 3.094s ±0.079s | 1.947s ±0.017s | x1.59 > | > > | nbd [3] | 6g | buffered | 2.455s ±0.183s | 1.808s ±0.016s | x1.36 > | > > | nbd | 6g | direct| 3.540s ±0.020s | 1.749s ±0.018s | x2.02 > | > > > > [1] raw image full of zeroes > > [2] raw fedora 35 image with additional random data, 50% full > > [3] image [2] exported by qemu-nbd via unix socket > > > > Signed-off-by: Nir Soffer > > --- > > qemu-img.c | 343 + > > 1 file changed, 270 insertions(+), 73 deletions(-) > > Looks good! > > Just a couple of style comments below. > > > diff --git a/qemu-img.c b/qemu-img.c > > index 7edcfe4bc8..bfa8e2862f 100644 > > --- a/qemu-img.c > > +++ b/qemu-img.c > > @@ -1613,48 +1613,288 @@ out: > > qemu_vfree(buf2); > > blk_unref(blk2); > > out2: > > blk_unref(blk1); > > out3: > > qemu_progress_end(); > > return ret; > > } > > > > #ifdef CONFIG_BLKHASH > > + > > +#define CHECKSUM_COROUTINES 8 > > +#define CHECKSUM_BUF_SIZE (1 * MiB) > > +#define CHECKSUM_ZERO_SIZE MIN(16 * GiB, SIZE_MAX) > > + > > +typedef struct ImgChecksumState ImgChecksumState; > > + > > +typedef struct ImgChecksumWorker { > > +QTAILQ_ENTRY(ImgChecksumWorker) entry; > > +ImgChecksumState *state; > > +Coroutine *co; > > +uint8_t *buf; > > + > > +/* The current chunk. */ > > +int64_t offset; > > +int64_t length; > > +bool zero; > > + > > +/* Always true for zero extent, false for data extent. Set to true > > + * when reading the chunk completes. */ > > Qemu codestyle requires /* and */ to be on separate lines for multi-line > comments (see checkpatch.pl). > I'll change that. 
Do we have a good way to run checkpatch.pl when using git-publish? Maybe a way to run checkpatch.pl on all patches generated by git publish automatically? > > +bool ready; > > +} ImgChecksumWorker; > > + > > +struct ImgChecksumState { > > +const char *filename; > > +BlockBackend *blk; > > +BlockDriverState *bs; > > +int64_t total_size; > > + > > +/* Current extent, modified in checksum_co_next. */ > > +int64_t offset; > > +int64_t length; > > +bool zero; > > + > > +int running_coroutines; > > +CoMutex lock; > > +ImgChecksumWorker workers[CHECKSUM_COROUTINES]; > > + > > +/* Ensure in-order updates. Update are scheduled at the tail of the > > + * queue and processed from the head of the queue when a worker is > > + * ready. */ > > Qemu codestyle requires /* and */ to be on separat
Re: [PATCH 2/3] iotests: Test qemu-img checksum
On Wed, Oct 26, 2022 at 4:31 PM Hanna Reitz wrote: > On 01.09.22 16:32, Nir Soffer wrote: > > Add simple tests creating an image with all kinds of extents, different > > formats, different backing chain, different protocol, and different > > image options. Since all images have the same guest visible content they > > must have the same checksum. > > > > To help debugging in case of failures, the output includes a json map of > > every test image. > > > > Signed-off-by: Nir Soffer > > --- > > tests/qemu-iotests/tests/qemu-img-checksum| 149 ++ > > .../qemu-iotests/tests/qemu-img-checksum.out | 74 + > > 2 files changed, 223 insertions(+) > > create mode 100755 tests/qemu-iotests/tests/qemu-img-checksum > > create mode 100644 tests/qemu-iotests/tests/qemu-img-checksum.out > > > > diff --git a/tests/qemu-iotests/tests/qemu-img-checksum > b/tests/qemu-iotests/tests/qemu-img-checksum > > new file mode 100755 > > index 00..3a85ba33f2 > > --- /dev/null > > +++ b/tests/qemu-iotests/tests/qemu-img-checksum > > @@ -0,0 +1,149 @@ > > +#!/usr/bin/env python3 > > +# group: rw auto quick > > +# > > +# Test cases for qemu-img checksum. > > +# > > +# Copyright (C) 2022 Red Hat, Inc. > > +# > > +# This program is free software; you can redistribute it and/or modify > > +# it under the terms of the GNU General Public License as published by > > +# the Free Software Foundation; either version 2 of the License, or > > +# (at your option) any later version. > > +# > > +# This program is distributed in the hope that it will be useful, > > +# but WITHOUT ANY WARRANTY; without even the implied warranty of > > +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the > > +# GNU General Public License for more details. > > +# > > +# You should have received a copy of the GNU General Public License > > +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
> > + > > +import re > > + > > +import iotests > > + > > +from iotests import ( > > +filter_testfiles, > > +qemu_img, > > +qemu_img_log, > > +qemu_io, > > +qemu_nbd_popen, > > +) > > + > > + > > +def checksum_available(): > > +out = qemu_img("--help").stdout > > +return re.search(r"\bchecksum .+ filename\b", out) is not None > > + > > + > > +if not checksum_available(): > > +iotests.notrun("checksum command not available") > > + > > +iotests.script_initialize( > > +supported_fmts=["raw", "qcow2"], > > +supported_cache_modes=["none", "writeback"], > > It doesn’t work with writeback, though, because it uses -T none below. > Good point > > Which by the way is a heavy cost, because I usually run tests in tmpfs, > where this won’t work. Is there any way of not doing the -T none below? > Testing using tempfs is problematic since you cannot test -T none. In oVirt we alway use /var/tmp which usually uses something that supports direct I/O. Do we have a way to specify cache mode in the tests, so we can use -T none only when the option is set? > > > +supported_protocols=["file", "nbd"], > > +required_fmts=["raw", "qcow2"], > > +) > > + > > +print() > > +print("=== Test images ===") > > +print() > > + > > +disk_raw = iotests.file_path('raw') > > +qemu_img("create", "-f", "raw", disk_raw, "10m") > > +qemu_io("-f", "raw", > > +"-c", "write -P 0x1 0 2m", # data > > +"-c", "write -P 0x0 2m 2m", # data with zeroes > > +"-c", "write -z 4m 2m", # zero allocated > > +"-c", "write -z -u 6m 2m", # zero hole > > +# unallocated > > +disk_raw) > > +print(filter_testfiles(disk_raw)) > > +qemu_img_log("map", "--output", "json", disk_raw) > > + > > +disk_qcow2 = iotests.file_path('qcow2') > > +qemu_img("create", "-f", "qcow2", disk_qcow2, "10m") > > +qemu_io("-f", "qcow2", > > +"-c", "write -P 0x1 0 2m", # data > > +"-c", "write -P 0x0 2m 2m", # data with zeroes > > +
Re: [PATCH 0/3] Add qemu-img checksum command using blkhash
On Sun, Sep 18, 2022 at 12:35 PM Nir Soffer wrote: > ping > > Kevin, Hanna, I hope you have time to take a look. > > https://lists.nongnu.org/archive/html/qemu-block/2022-09/msg00021.html Ping again, hopefully someone has time to look at this :-) > > > > On Thu, Sep 1, 2022 at 5:32 PM Nir Soffer wrote: > > > > Since blkhash is available only via copr now, the new command is added as > > optional feature, built only if blkhash-devel package is installed. > > > > Nir Soffer (3): > > qemu-img: Add checksum command > > iotests: Test qemu-img checksum > > qemu-img: Speed up checksum > > > > docs/tools/qemu-img.rst | 22 + > > meson.build | 10 +- > > meson_options.txt | 2 + > > qemu-img-cmds.hx | 8 + > > qemu-img.c| 388 ++ > > tests/qemu-iotests/tests/qemu-img-checksum| 149 +++ > > .../qemu-iotests/tests/qemu-img-checksum.out | 74 > > 7 files changed, 652 insertions(+), 1 deletion(-) > > create mode 100755 tests/qemu-iotests/tests/qemu-img-checksum > > create mode 100644 tests/qemu-iotests/tests/qemu-img-checksum.out > > > > -- > > 2.37.2 > > >
Re: [PATCH 0/3] Add qemu-img checksum command using blkhash
ping Kevin, Hanna, I hope you have time to take a look. https://lists.nongnu.org/archive/html/qemu-block/2022-09/msg00021.html On Thu, Sep 1, 2022 at 5:32 PM Nir Soffer wrote: > > Since blkhash is available only via copr now, the new command is added as > optional feature, built only if blkhash-devel package is installed. > > Nir Soffer (3): > qemu-img: Add checksum command > iotests: Test qemu-img checksum > qemu-img: Speed up checksum > > docs/tools/qemu-img.rst | 22 + > meson.build | 10 +- > meson_options.txt | 2 + > qemu-img-cmds.hx | 8 + > qemu-img.c| 388 ++ > tests/qemu-iotests/tests/qemu-img-checksum| 149 +++ > .../qemu-iotests/tests/qemu-img-checksum.out | 74 > 7 files changed, 652 insertions(+), 1 deletion(-) > create mode 100755 tests/qemu-iotests/tests/qemu-img-checksum > create mode 100644 tests/qemu-iotests/tests/qemu-img-checksum.out > > -- > 2.37.2 >
[PATCH 1/3] qemu-img: Add checksum command
The checksum command computes a checksum for disk image content using the blkhash library[1]. The blkhash library is not packaged yet, but it is available via copr[2]. Example run: $ ./qemu-img checksum -p fedora-35.qcow2 6e5c00c995056319d52395f8d91c7f84725ae3da69ffcba4de4c7d22cff713a5 fedora-35.qcow2 The block checksum is constructed by splitting the image to fixed sized blocks and computing a digest of every block. The image checksum is the digest of all the block digests. The checksum uses internally the "sha256" algorithm but it cannot be compared with checksums created by other tools such as `sha256sum`. The blkhash library supports sparse images, zero detection, and optimizes zero block hashing (they are practically free). The library uses multiple threads to speed up the computation. Compared to `sha256sum`, `qemu-img checksum` is 3.5-4800[3] times faster, depending on the amount of data in the image: $ ./qemu-img info /scratch/50p.raw file format: raw virtual size: 6 GiB (6442450944 bytes) disk size: 2.91 GiB $ hyperfine -w2 -r5 -p "sleep 1" "./qemu-img checksum /scratch/50p.raw" \ "sha256sum /scratch/50p.raw" Benchmark 1: ./qemu-img checksum /scratch/50p.raw Time (mean ± σ): 1.849 s ± 0.037 s[User: 7.764 s, System: 0.962 s] Range (min … max):1.813 s … 1.908 s5 runs Benchmark 2: sha256sum /scratch/50p.raw Time (mean ± σ): 14.585 s ± 0.072 s[User: 13.537 s, System: 1.003 s] Range (min … max): 14.501 s … 14.697 s5 runs Summary './qemu-img checksum /scratch/50p.raw' ran 7.89 ± 0.16 times faster than 'sha256sum /scratch/50p.raw' The new command is available only when `blkhash` is available during build. 
To test the new command please install the `blkhash-devel` package: $ dnf copr enable nsoffer/blkhash $ sudo dnf install blkhash-devel [1] https://gitlab.com/nirs/blkhash [2] https://copr.fedorainfracloud.org/coprs/nsoffer/blkhash/ [3] Computing checksum for 8T empty image: qemu-img checksum: 3.7s, sha256sum (estimate): 17,749s Signed-off-by: Nir Soffer --- docs/tools/qemu-img.rst | 22 + meson.build | 10 ++- meson_options.txt | 2 + qemu-img-cmds.hx| 8 ++ qemu-img.c | 191 5 files changed, 232 insertions(+), 1 deletion(-) diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst index 85a6e05b35..8be9c45cbf 100644 --- a/docs/tools/qemu-img.rst +++ b/docs/tools/qemu-img.rst @@ -347,20 +347,42 @@ Command description: Check completed, image is corrupted 3 Check completed, image has leaked clusters, but is not corrupted 63 Checks are not supported by the image format If ``-r`` is specified, exit codes representing the image state refer to the state after (the attempt at) repairing it. That is, a successful ``-r all`` will yield the exit code 0, independently of the image state before. +.. option:: checksum [--object OBJECTDEF] [--image-opts] [-f FMT] [-T SRC_CACHE] [-p] FILENAME + + Print a checksum for image *FILENAME* guest visible content. Images with + different format or settings wil have the same checksum. + + The format is probed unless you specify it by ``-f``. + + The checksum is computed for guest visible content. Allocated areas full of + zeroes, zero clusters, and unallocated areas are read as zeros so they will + have the same checksum. Images with single or multiple files or backing files + will have the same checksums if the guest will see the same content when + reading the image. + + Image metadata that is not visible to the guest such as dirty bitmaps does + not affect the checksum. + + Computing a checksum requires a read-only image. 
You cannot compute a + checksum of an active image used by a guest, but you can compute a checksum + of a guest during pull mode incremental backup using NBD URL. + + The checksum is not compatible with other tools such as *sha256sum*. + .. option:: commit [--object OBJECTDEF] [--image-opts] [-q] [-f FMT] [-t CACHE] [-b BASE] [-r RATE_LIMIT] [-d] [-p] FILENAME Commit the changes recorded in *FILENAME* in its base image or backing file. If the backing file is smaller than the snapshot, then the backing file will be resized to be the same size as the snapshot. If the snapshot is smaller than the backing file, the backing file will not be truncated. If you want the backing file to match the size of the smaller snapshot, you can safely truncate it yourself once the commit operation successfully completes. The image *FILENAME* is emptied after the operation has succeeded. If you do diff --git a/meson.build b/meson.build index 20fddbd707..56b648d8a7 100644 --- a/meson.build +++ b/meson.build @@ -727,20 +727,24 @@ if not get_option('curl').a
[PATCH 0/3] Add qemu-img checksum command using blkhash
Since blkhash is available only via copr now, the new command is added as an optional feature, built only if the blkhash-devel package is installed. Nir Soffer (3): qemu-img: Add checksum command iotests: Test qemu-img checksum qemu-img: Speed up checksum docs/tools/qemu-img.rst | 22 + meson.build | 10 +- meson_options.txt | 2 + qemu-img-cmds.hx | 8 + qemu-img.c| 388 ++ tests/qemu-iotests/tests/qemu-img-checksum| 149 +++ .../qemu-iotests/tests/qemu-img-checksum.out | 74 7 files changed, 652 insertions(+), 1 deletion(-) create mode 100755 tests/qemu-iotests/tests/qemu-img-checksum create mode 100644 tests/qemu-iotests/tests/qemu-img-checksum.out -- 2.37.2
[PATCH 3/3] qemu-img: Speed up checksum
Add coroutine based loop inspired by `qemu-img convert` design. Changes compared to `qemu-img convert`: - State for the entire image is kept in ImgChecksumState - State for single worker coroutine is kept in ImgChecksumWorker. - "Writes" are always in-order, ensured using a queue. - Calling block status once per image extent, when the current extent is consumed by the workers. - Using 1m buffer size - testing shows that this gives best read performance both with buffered and direct I/O. - Number of coroutines is not configurable. Testing does not show improvement when using more than 8 coroutines. - Progress includes the entire image, not only the allocated state. Comparing to the simple read loop shows that this version is up to 4.67 times faster when computing a checksum for an image full of zeroes. For real images it is 1.59 times faster with direct I/O, and with buffered I/O there is no difference. Test results on Dell PowerEdge R640 in a CentOS Stream 9 container: | image| size | i/o | before | after | change | |--|--|---|||| | zero [1] | 6g | buffered | 1.600s ±0.014s | 0.342s ±0.016s | x4.67 | | zero | 6g | direct| 4.684s ±0.093s | 2.211s ±0.009s | x2.12 | | real [2] | 6g | buffered | 1.841s ±0.075s | 1.806s ±0.036s | x1.02 | | real | 6g | direct| 3.094s ±0.079s | 1.947s ±0.017s | x1.59 | | nbd [3] | 6g | buffered | 2.455s ±0.183s | 1.808s ±0.016s | x1.36 | | nbd | 6g | direct| 3.540s ±0.020s | 1.749s ±0.018s | x2.02 | [1] raw image full of zeroes [2] raw fedora 35 image with additional random data, 50% full [3] image [2] exported by qemu-nbd via unix socket Signed-off-by: Nir Soffer --- qemu-img.c | 343 + 1 file changed, 270 insertions(+), 73 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index 7edcfe4bc8..bfa8e2862f 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -1613,48 +1613,288 @@ out: qemu_vfree(buf2); blk_unref(blk2); out2: blk_unref(blk1); out3: qemu_progress_end(); return ret; } #ifdef CONFIG_BLKHASH + +#define CHECKSUM_COROUTINES 8 +#define 
CHECKSUM_BUF_SIZE (1 * MiB) +#define CHECKSUM_ZERO_SIZE MIN(16 * GiB, SIZE_MAX) + +typedef struct ImgChecksumState ImgChecksumState; + +typedef struct ImgChecksumWorker { +QTAILQ_ENTRY(ImgChecksumWorker) entry; +ImgChecksumState *state; +Coroutine *co; +uint8_t *buf; + +/* The current chunk. */ +int64_t offset; +int64_t length; +bool zero; + +/* Always true for zero extent, false for data extent. Set to true + * when reading the chunk completes. */ +bool ready; +} ImgChecksumWorker; + +struct ImgChecksumState { +const char *filename; +BlockBackend *blk; +BlockDriverState *bs; +int64_t total_size; + +/* Current extent, modified in checksum_co_next. */ +int64_t offset; +int64_t length; +bool zero; + +int running_coroutines; +CoMutex lock; +ImgChecksumWorker workers[CHECKSUM_COROUTINES]; + +/* Ensure in-order updates. Updates are scheduled at the tail of the + * queue and processed from the head of the queue when a worker is + * ready. */ +QTAILQ_HEAD(, ImgChecksumWorker) update_queue; + +struct blkhash *hash; +int ret; +}; + +static int checksum_block_status(ImgChecksumState *s) +{ +int64_t length; +int status; + +/* Must be called when current extent is consumed. */ +assert(s->length == 0); + +status = bdrv_block_status_above(s->bs, NULL, s->offset, + s->total_size - s->offset, &length, NULL, + NULL); +if (status < 0) { +error_report("Error checking status at offset %" PRId64 " for %s", + s->offset, s->filename); +s->ret = status; +return -1; +} + +assert(length > 0); + +s->length = length; +s->zero = !!(status & BDRV_BLOCK_ZERO); + +return 0; +} + +/** + * Grab the next chunk from the current extent, getting the next extent if + * needed, and schedule the next update at the end of the update queue. + * + * Return true if the worker has work to do, false if the worker has + * finished or there was an error getting the next extent. 
+ */ +static coroutine_fn bool checksum_co_next(ImgChecksumWorker *w) +{ +ImgChecksumState *s = w->state; + +qemu_co_mutex_lock(&s->lock); + +if (s->offset == s->total_size || s->ret != -EINPROGRESS) { +qemu_co_mutex_unlock(&s->lock); +return false; +} + +if (s->length == 0 && checksum_block_status(s)) { +qemu_co_mutex_unlock(&s->lock); +return false; +} + +/* Grab one chunk from current extent. */ +w->offset = s->offset; +w->length = MI
[PATCH 2/3] iotests: Test qemu-img checksum
Add simple tests creating an image with all kinds of extents, different formats, different backing chain, different protocol, and different image options. Since all images have the same guest visible content they must have the same checksum. To help debugging in case of failures, the output includes a json map of every test image. Signed-off-by: Nir Soffer --- tests/qemu-iotests/tests/qemu-img-checksum| 149 ++ .../qemu-iotests/tests/qemu-img-checksum.out | 74 + 2 files changed, 223 insertions(+) create mode 100755 tests/qemu-iotests/tests/qemu-img-checksum create mode 100644 tests/qemu-iotests/tests/qemu-img-checksum.out diff --git a/tests/qemu-iotests/tests/qemu-img-checksum b/tests/qemu-iotests/tests/qemu-img-checksum new file mode 100755 index 00..3a85ba33f2 --- /dev/null +++ b/tests/qemu-iotests/tests/qemu-img-checksum @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +# group: rw auto quick +# +# Test cases for qemu-img checksum. +# +# Copyright (C) 2022 Red Hat, Inc. +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. 
+ +import re + +import iotests + +from iotests import ( +filter_testfiles, +qemu_img, +qemu_img_log, +qemu_io, +qemu_nbd_popen, +) + + +def checksum_available(): +out = qemu_img("--help").stdout +return re.search(r"\bchecksum .+ filename\b", out) is not None + + +if not checksum_available(): +iotests.notrun("checksum command not available") + +iotests.script_initialize( +supported_fmts=["raw", "qcow2"], +supported_cache_modes=["none", "writeback"], +supported_protocols=["file", "nbd"], +required_fmts=["raw", "qcow2"], +) + +print() +print("=== Test images ===") +print() + +disk_raw = iotests.file_path('raw') +qemu_img("create", "-f", "raw", disk_raw, "10m") +qemu_io("-f", "raw", +"-c", "write -P 0x1 0 2m", # data +"-c", "write -P 0x0 2m 2m", # data with zeroes +"-c", "write -z 4m 2m", # zero allocated +"-c", "write -z -u 6m 2m", # zero hole +# unallocated +disk_raw) +print(filter_testfiles(disk_raw)) +qemu_img_log("map", "--output", "json", disk_raw) + +disk_qcow2 = iotests.file_path('qcow2') +qemu_img("create", "-f", "qcow2", disk_qcow2, "10m") +qemu_io("-f", "qcow2", +"-c", "write -P 0x1 0 2m", # data +"-c", "write -P 0x0 2m 2m", # data with zeroes +"-c", "write -z 4m 2m", # zero allocated +"-c", "write -z -u 6m 2m", # zero hole +# unallocated +disk_qcow2) +print(filter_testfiles(disk_qcow2)) +qemu_img_log("map", "--output", "json", disk_qcow2) + +disk_compressed = iotests.file_path('compressed') +qemu_img("convert", "-f", "qcow2", "-O", "qcow2", "-c", + disk_qcow2, disk_compressed) +print(filter_testfiles(disk_compressed)) +qemu_img_log("map", "--output", "json", disk_compressed) + +disk_base = iotests.file_path('base') +qemu_img("create", "-f", "raw", disk_base, "10m") +qemu_io("-f", "raw", +"-c", "write -P 0x1 0 2m", +"-c", "write -P 0x0 2m 2m", +disk_base) +print(filter_testfiles(disk_base)) +qemu_img_log("map", "--output", "json", disk_base) + +disk_top = iotests.file_path('top') +qemu_img("create", "-f", "qcow2", "-b", disk_base, "-F", "raw", + disk_top) 
+qemu_io("-f", "qcow2", +"-c", "write -z 4m 2m", +"-c", "write -z -u 6m 2m", +disk_top) +print(filter_testfiles(disk_top)) +qemu_img_log("
Re: [PATCH 2/6] virtio-scsi: don't waste CPU polling the event virtqueue
On Wed, Apr 27, 2022 at 5:35 PM Stefan Hajnoczi wrote: > > The virtio-scsi event virtqueue is not emptied by its handler function. > This is typical for rx virtqueues where the device uses buffers when > some event occurs (e.g. a packet is received, an error condition > happens, etc). > > Polling non-empty virtqueues wastes CPU cycles. We are not waiting for > new buffers to become available, we are waiting for an event to occur, > so it's a misuse of CPU resources to poll for buffers. > > Introduce the new virtio_queue_aio_attach_host_notifier_no_poll() API, > which is identical to virtio_queue_aio_attach_host_notifier() except > that it does not poll the virtqueue. > > Before this patch the following command-line consumed 100% CPU in the > IOThread polling and calling virtio_scsi_handle_event(): > > $ qemu-system-x86_64 -M accel=kvm -m 1G -cpu host \ > --object iothread,id=iothread0 \ > --device virtio-scsi-pci,iothread=iothread0 \ > --blockdev > file,filename=test.img,aio=native,cache.direct=on,node-name=drive0 \ > --device scsi-hd,drive=drive0 > > After this patch CPU is no longer wasted. 
> > Reported-by: Nir Soffer > Signed-off-by: Stefan Hajnoczi > --- > include/hw/virtio/virtio.h | 1 + > hw/scsi/virtio-scsi-dataplane.c | 2 +- > hw/virtio/virtio.c | 13 + > 3 files changed, 15 insertions(+), 1 deletion(-) > > diff --git a/include/hw/virtio/virtio.h b/include/hw/virtio/virtio.h > index b31c4507f5..b62a35fdca 100644 > --- a/include/hw/virtio/virtio.h > +++ b/include/hw/virtio/virtio.h > @@ -317,6 +317,7 @@ EventNotifier *virtio_queue_get_host_notifier(VirtQueue > *vq); > void virtio_queue_set_host_notifier_enabled(VirtQueue *vq, bool enabled); > void virtio_queue_host_notifier_read(EventNotifier *n); > void virtio_queue_aio_attach_host_notifier(VirtQueue *vq, AioContext *ctx); > +void virtio_queue_aio_attach_host_notifier_no_poll(VirtQueue *vq, AioContext > *ctx); > void virtio_queue_aio_detach_host_notifier(VirtQueue *vq, AioContext *ctx); > VirtQueue *virtio_vector_first_queue(VirtIODevice *vdev, uint16_t vector); > VirtQueue *virtio_vector_next_queue(VirtQueue *vq); > diff --git a/hw/scsi/virtio-scsi-dataplane.c b/hw/scsi/virtio-scsi-dataplane.c > index 29575cbaf6..8bb6e6acfc 100644 > --- a/hw/scsi/virtio-scsi-dataplane.c > +++ b/hw/scsi/virtio-scsi-dataplane.c > @@ -138,7 +138,7 @@ int virtio_scsi_dataplane_start(VirtIODevice *vdev) > > aio_context_acquire(s->ctx); > virtio_queue_aio_attach_host_notifier(vs->ctrl_vq, s->ctx); > -virtio_queue_aio_attach_host_notifier(vs->event_vq, s->ctx); > +virtio_queue_aio_attach_host_notifier_no_poll(vs->event_vq, s->ctx); > > for (i = 0; i < vs->conf.num_queues; i++) { > virtio_queue_aio_attach_host_notifier(vs->cmd_vqs[i], s->ctx); > diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c > index 9d637e043e..67a873f54a 100644 > --- a/hw/virtio/virtio.c > +++ b/hw/virtio/virtio.c > @@ -3534,6 +3534,19 @@ void virtio_queue_aio_attach_host_notifier(VirtQueue > *vq, AioContext *ctx) > virtio_queue_host_notifier_aio_poll_end); > } > > +/* > + * Same as virtio_queue_aio_attach_host_notifier() but without polling. 
Use > + * this for rx virtqueues and similar cases where the virtqueue handler > + * function does not pop all elements. When the virtqueue is left non-empty > + * polling consumes CPU cycles and should not be used. > + */ > +void virtio_queue_aio_attach_host_notifier_no_poll(VirtQueue *vq, AioContext > *ctx) > +{ > +aio_set_event_notifier(ctx, &vq->host_notifier, true, > + virtio_queue_host_notifier_read, > + NULL, NULL); > +} > + > void virtio_queue_aio_detach_host_notifier(VirtQueue *vq, AioContext *ctx) > { > aio_set_event_notifier(ctx, &vq->host_notifier, true, NULL, NULL, NULL); > -- > 2.35.1 > I tested patches 1 and 2 on top of 34723f59371f3fd02ea59b94674314b875504426 and it solved the issue. Tested-by: Nir Soffer Nir
Re: [PATCH v2] nbd/server: Allow MULTI_CONN for shared writable exports
On Wed, Feb 16, 2022 at 10:08 AM Vladimir Sementsov-Ogievskiy wrote: > > 16.02.2022 02:24, Eric Blake wrote: > > On Tue, Feb 15, 2022 at 09:23:36PM +0200, Nir Soffer wrote: > >> On Tue, Feb 15, 2022 at 7:22 PM Eric Blake wrote: > >> > >>> According to the NBD spec, a server advertising > >>> NBD_FLAG_CAN_MULTI_CONN promises that multiple client connections will > >>> not see any cache inconsistencies: when properly separated by a single > >>> flush, actions performed by one client will be visible to another > >>> client, regardless of which client did the flush. We satisfy these > >>> conditions in qemu when our block layer is backed by the local > >>> filesystem (by virtue of the semantics of fdatasync(), and the fact > >>> that qemu itself is not buffering writes beyond flushes). It is > >>> harder to state whether we satisfy these conditions for network-based > >>> protocols, so the safest course of action is to allow users to opt-in > >>> to advertising multi-conn. We may later tweak defaults to advertise > >>> by default when the block layer can confirm that the underlying > >>> protocol driver is cache consistent between multiple writers, but for > >>> now, this at least allows savvy users (such as virt-v2v or nbdcopy) to > >>> explicitly start qemu-nbd or qemu-storage-daemon with multi-conn > >>> advertisement in a known-safe setup where the client end can then > >>> benefit from parallel clients. > >>> > >> > >> It makes sense, and will be used by oVirt. Actually we are already using > >> multiple connections for writing about 2 years, based on your promise > >> that if every client writes to district areas this is safe. > > > > I presume s/district/distinct/, but yes, I'm glad we're finally trying > > to make the code match existing practice ;) > > > >>> +++ b/docs/tools/qemu-nbd.rst > >>> @@ -139,8 +139,7 @@ driver options if ``--image-opts`` is specified. > >>> .. 
option:: -e, --shared=NUM > >>> > >>> Allow up to *NUM* clients to share the device (default > >>> - ``1``), 0 for unlimited. Safe for readers, but for now, > >>> - consistency is not guaranteed between multiple writers. > >>> + ``1``), 0 for unlimited. > >>> > >> > >> Removing the note means that now consistency is guaranteed between > >> multiple writers, no? > >> > >> Or maybe we want to mention here that consistency depends on the protocol > >> and users can opt in, or refer to the section where this is discussed? > > > > Yeah, a link to the QAPI docs where multi-conn is documented might be > > nice, except I'm not sure the best way to do that in our sphinx > > documentation setup. > > > >>> +## > >>> +# @NbdExportMultiConn: > >>> +# > >>> +# Possible settings for advertising NBD multiple client support. > >>> +# > >>> +# @off: Do not advertise multiple clients. > >>> +# > >>> +# @on: Allow multiple clients (for writable clients, this is only safe > >>> +# if the underlying BDS is cache-consistent, such as when backed > >>> +# by the raw file driver); ignored if the NBD server was set up > >>> +# with max-connections of 1. > >>> +# > >>> +# @auto: Behaves like @off if the export is writable, and @on if the > >>> +#export is read-only. > >>> +# > >>> +# Since: 7.0 > >>> +## > >>> +{ 'enum': 'NbdExportMultiConn', > >>> + 'data': ['off', 'on', 'auto'] } > >>> > >> > >> Are we going to have --multi-con=(on|off|auto)? > > > > Oh. The QMP command (which is immediately visible through > > nbd-server-add/block-storage-add to qemu and qemu-storage-daemon) > > gains "multi-conn":"on", but you may be right that qemu-nbd would want > > a command line option (either that, or we accellerate our plans that > > qsd should replace qemu-nbd). > > > >>> +++ b/blockdev-nbd.c > >>> @@ -44,6 +44,11 @@ bool nbd_server_is_running(void) > >>> return nbd_server || is_qemu_nbd; > >>> } > >>> > >>> +int nbd_server_max_connections(void) > >>> +{ > >>> +return nbd_server ? 
nbd_server->max_connections : -1; > >>> +} > &
Re: [PATCH v2] nbd/server: Allow MULTI_CONN for shared writable exports
On Wed, Feb 16, 2022 at 12:13 PM Richard W.M. Jones wrote: > On Tue, Feb 15, 2022 at 05:24:14PM -0600, Eric Blake wrote: > > Oh. The QMP command (which is immediately visible through > > nbd-server-add/block-storage-add to qemu and qemu-storage-daemon) > > gains "multi-conn":"on", but you may be right that qemu-nbd would want > > a command line option (either that, or we accelerate our plans that > > qsd should replace qemu-nbd). > > I really hope there will always be something called "qemu-nbd" > that acts like qemu-nbd. > I share this hope. Most projects I work on are based on qemu-nbd. However in oVirt use case, we want to provide an NBD socket for clients to allow direct access to disks. One of the issues we need to solve for this is having a way to tell if the qemu-nbd is active, so we can terminate idle transfers. The way we do this with the ovirt-imageio server is to query the status of the transfer, and use the idle time (time since last request) and active status (has inflight requests) to detect a stale transfer that should be terminated. An example use case is a process on a remote host that started an image transfer, and was killed or crashed in the middle of the transfer without cleaning up properly. To be more specific, every request to the imageio server (read, write, flush, zero, options) updates a timestamp in the transfer state. When we get the status we report the time since that timestamp was updated. Additionally we keep and report the number of inflight requests, so we can tell the case when requests are blocked on inaccessible storage (e.g. non responsive NFS). We don't have a way to do this with qemu-nbd, but I guess that using qemu-storage-daemon when we have qmp access will make such monitoring possible. Nir
Re: [PATCH v2] nbd/server: Allow MULTI_CONN for shared writable exports
On Tue, Feb 15, 2022 at 7:22 PM Eric Blake wrote: > According to the NBD spec, a server advertising > NBD_FLAG_CAN_MULTI_CONN promises that multiple client connections will > not see any cache inconsistencies: when properly separated by a single > flush, actions performed by one client will be visible to another > client, regardless of which client did the flush. We satisfy these > conditions in qemu when our block layer is backed by the local > filesystem (by virtue of the semantics of fdatasync(), and the fact > that qemu itself is not buffering writes beyond flushes). It is > harder to state whether we satisfy these conditions for network-based > protocols, so the safest course of action is to allow users to opt-in > to advertising multi-conn. We may later tweak defaults to advertise > by default when the block layer can confirm that the underlying > protocol driver is cache consistent between multiple writers, but for > now, this at least allows savvy users (such as virt-v2v or nbdcopy) to > explicitly start qemu-nbd or qemu-storage-daemon with multi-conn > advertisement in a known-safe setup where the client end can then > benefit from parallel clients. > It makes sense, and will be used by oVirt. Actually we are already using multiple connections for writing about 2 years, based on your promise that if every client writes to district areas this is safe. Note, however, that we don't want to advertise MULTI_CONN when we know > that a second client cannot connect (for historical reasons, qemu-nbd > defaults to a single connection while nbd-server-add and QMP commands > default to unlimited connections; but we already have existing means > to let either style of NBD server creation alter those defaults). The > harder part of this patch is setting up an iotest to demonstrate > behavior of multiple NBD clients to a single server. It might be > possible with parallel qemu-io processes, but concisely managing that > in shell is painful. 
I found it easier to do by relying on the libnbd > project's nbdsh, which means this test will be skipped on platforms > where that is not available. > > Signed-off-by: Eric Blake > Fixes: https://bugzilla.redhat.com/1708300 > --- > > v1 was in Aug 2021 [1], with further replies in Sep [2] and Oct [3]. > > [1] https://lists.gnu.org/archive/html/qemu-devel/2021-08/msg04900.html > [2] https://lists.gnu.org/archive/html/qemu-devel/2021-09/msg00038.html > [3] https://lists.gnu.org/archive/html/qemu-devel/2021-10/msg06744.html > > Since then, I've tweaked the QAPI to mention 7.0 (instead of 6.2), and > reworked the logic so that default behavior is unchanged for now > (advertising multi-conn on a writable export requires opt-in during > the command line or QMP, but remains default for a readonly export). > I've also expanded the amount of testing done in the new iotest. > > docs/interop/nbd.txt | 1 + > docs/tools/qemu-nbd.rst| 3 +- > qapi/block-export.json | 34 +++- > include/block/nbd.h| 3 +- > blockdev-nbd.c | 5 + > nbd/server.c | 27 ++- > MAINTAINERS| 1 + > tests/qemu-iotests/tests/nbd-multiconn | 188 + > tests/qemu-iotests/tests/nbd-multiconn.out | 112 > 9 files changed, 363 insertions(+), 11 deletions(-) > create mode 100755 tests/qemu-iotests/tests/nbd-multiconn > create mode 100644 tests/qemu-iotests/tests/nbd-multiconn.out > > diff --git a/docs/interop/nbd.txt b/docs/interop/nbd.txt > index bdb0f2a41aca..6c99070b99c8 100644 > --- a/docs/interop/nbd.txt > +++ b/docs/interop/nbd.txt > @@ -68,3 +68,4 @@ NBD_CMD_BLOCK_STATUS for "qemu:dirty-bitmap:", > NBD_CMD_CACHE > * 4.2: NBD_FLAG_CAN_MULTI_CONN for shareable read-only exports, > NBD_CMD_FLAG_FAST_ZERO > * 5.2: NBD_CMD_BLOCK_STATUS for "qemu:allocation-depth" > +* 7.0: NBD_FLAG_CAN_MULTI_CONN for shareable writable exports > diff --git a/docs/tools/qemu-nbd.rst b/docs/tools/qemu-nbd.rst > index 6031f9689312..1de785524c36 100644 > --- a/docs/tools/qemu-nbd.rst > +++ b/docs/tools/qemu-nbd.rst > @@ -139,8 +139,7 @@ 
driver options if ``--image-opts`` is specified. > .. option:: -e, --shared=NUM > >Allow up to *NUM* clients to share the device (default > - ``1``), 0 for unlimited. Safe for readers, but for now, > - consistency is not guaranteed between multiple writers. > + ``1``), 0 for unlimited. > Removing the note means that now consistency is guaranteed between multiple writers, no? Or maybe we want to mention here that consistency depends on the protocol and users can opt in, or refer to the section where this is discussed? .. option:: -t, --persistent > > diff --git a/qapi/block-export.json b/qapi/block-export.json > index f183522d0d2c..0a27e8ee84f9 100644 > --- a/qapi/block-export.json > +++ b/qapi/block-export.json > @@ -21,7 +21,9 @@ > # recreated
Re: [PATCH v2 2/2] iotests/block-status-cache: New test
#x27;map', '--output=json', '--image-opts', > +nbd_img_opts) > + > +# qemu:allocation-depth maps for want_zero=false. > +# want_zero=false should (with the file driver, which the server is > +# using) report everything as data. While this is sufficient for > +# want_zero=false, this is nothing that should end up in the > +# block-status cache. > +# Due to a bug, this information did end up in the cache, though, and > +# this would lead to wrong information being returned on subsequent > +# want_zero=true calls. > +# > +# We need to run this map twice: On the first call, we probably still > +# have the first sector in the cache, and so this will be served from > +# the cache; and only the subsequent range will be queried from the > +# block driver. This subsequent range will then be entered into the > +# cache. > +# If we did a want_zero=true call at this point, we would thus get > +# correct information: The first sector is not covered by the cache, > so > +# we would get fresh block-status information from the driver, which > +# would return a data range, and this would then go into the cache, > +# evicting the wrong range from the want_zero=false call before. > +# > +# Therefore, we need a second want_zero=false map to reproduce: > +# Since the first sector is not in the cache, the query for its > status > +# will go to the driver, which will return a result that reports the > +# whole image to be a single data area. This result will then go > into > +# the cache, and so the cache will then report the whole image to > +# contain data. > +# > +# Note that once the cache reports the whole image to contain data, > any > +# subsequent map operation will be served from the cache, and so we > can > +# never loop too many times here. 
> +for _ in range(2): > +# (Ignore the result, this is just to contaminate the cache) > +qemu_img_pipe('map', '--output=json', '--image-opts', > + nbd_img_opts_alloc_depth) > + > +# Now let's see whether the cache reports everything as data, or > +# whether we get correct information (i.e. the same as we got on our > +# first attempt). > +map_post = qemu_img_pipe('map', '--output=json', '--image-opts', > + nbd_img_opts) > + > +if map_pre != map_post: > +print('ERROR: Map information differs before and after querying > ' + > + 'qemu:allocation-depth') > +print('Before:') > +print(map_pre) > +print('After:') > +print(map_post) > + > +self.fail("Map information differs") > + > + > +if __name__ == '__main__': > +# The block-status cache only works on the protocol layer, so to test it, > +# we can only use the raw format > +iotests.main(supported_fmts=['raw'], > + supported_protocols=['file']) > diff --git a/tests/qemu-iotests/tests/block-status-cache.out > b/tests/qemu-iotests/tests/block-status-cache.out > new file mode 100644 > index 00..ae1213e6f8 > --- /dev/null > +++ b/tests/qemu-iotests/tests/block-status-cache.out > @@ -0,0 +1,5 @@ > +. > +-- > +Ran 1 tests > + > +OK > -- > 2.33.1 > The out file is not very useful, and even fragile - if the test framework will change the output format, the test will fail. Ideally we depend only on the relevant output of our tools, and using a different version of the test framework on replacing it (e.g pytest) will not require modifying the out files. Regardless I would like to see this fix merged and this issue already exists in other tests. Some tests in tests/ do have useful output that can make debugging failures easier. Reviewed-by: Nir Soffer
Re: [PATCH 1/2] block/io: Update BSC only if want_zero is true
On Mon, Jan 17, 2022 at 6:26 PM Hanna Reitz wrote: > > We update the block-status cache whenever we get new information from a > bdrv_co_block_status() call to the block driver. However, if we have > passed want_zero=false to that call, it may flag areas containing zeroes > as data, and so we would update the block-status cache with wrong > information. > > Therefore, we should not update the cache with want_zero=false. > > Reported-by: Nir Soffer > Fixes: 0bc329fbb009f8601cec23bf2bc48ead0c5a5fa2 >("block: block-status cache for data regions") > Signed-off-by: Hanna Reitz > --- > block/io.c | 6 +- > 1 file changed, 5 insertions(+), 1 deletion(-) > > diff --git a/block/io.c b/block/io.c > index bb0a254def..4e4cb556c5 100644 > --- a/block/io.c > +++ b/block/io.c > @@ -2497,8 +2497,12 @@ static int coroutine_fn > bdrv_co_block_status(BlockDriverState *bs, > * non-protocol nodes, and then it is never used. However, > filling > * the cache requires an RCU update, so double check here to > avoid > * such an update if possible. > + * > + * Check want_zero, because we only want to update the cache > when we > + * have accurate information about what is zero and what is data. > */ > -if (ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) && > +if (want_zero && > +ret == (BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID) && > QLIST_EMPTY(&bs->children)) > { > /* > -- > 2.33.1 > ovirt-imageio tests pass with this change. Thanks for the quick fix! Reviewed-by: Nir Soffer
Re: [PATCH 2/2] iotests/block-status-cache: New test
# This will probably detect an allocated data sector first (qemu > likes > +# to allocate the first sector to facilitate alignment probing), and > +# then the rest to be zero. The BSC will thus contain (if anything) > +# one range covering the first sector. > +map_pre = qemu_img_pipe('map', '--output=json', '--image-opts', > +nbd_img_opts) > + > +# qemu:allocation-depth maps for want_zero=false. > +# want_zero=false should (with the file driver, which the server is > +# using) report everything as data. While this is sufficient for > +# want_zero=false, this is nothing that should end up in the > +# block-status cache. > +# Due to a bug, this information did end up in the cache, though, and > +# this would lead to wrong information being returned on subsequent > +# want_zero=true calls. > +# > +# We need to run this map twice: On the first call, we probably still > +# have the first sector in the cache, and so this will be served from > +# the cache; and only the subsequent range will be queried from the > +# block driver. This subsequent range will then be entered into the > +# cache. > +# If we did a want_zero=true call at this point, we would thus get > +# correct information: The first sector is not covered by the cache, > so > +# we would get fresh block-status information from the driver, which > +# would return a data range, and this would then go into the cache, > +# evicting the wrong range from the want_zero=false call before. > +# > +# Therefore, we need a second want_zero=false map to reproduce: > +# Since the first sector is not in the cache, the query for its > status > +# will go to the driver, which will return a result that reports the > +# whole image to be a single data area. This result will then go > into > +# the cache, and so the cache will then report the whole image to > +# contain data. Interesting, but once we fix the bug this complex flow is gone so we can eliminate this text, no? 
> +# > +# Note that once the cache reports the whole image to contain data, > any > +# subsequent map operation will be served from the cache, and so we > can > +# never loop too many times here. > +for _ in range(2): > +# (Ignore the result, this is just to contaminate the cache) > +qemu_img_pipe('map', '--output=json', '--image-opts', > + nbd_img_opts_alloc_depth) > + > +# Now let's see whether the cache reports everything as data, or > +# whether we get correct information (i.e. the same as we got on our > +# first attempt). > +map_post = qemu_img_pipe('map', '--output=json', '--image-opts', > + nbd_img_opts) > + > +if map_pre != map_post: > +print('ERROR: Map information differs before and after querying > ' + > + 'qemu:allocation-depth') > +print('Before:') > + print(map_pre) > +print('After:') > +print(map_post) > + > +self.fail("Map information differs") > + > + > +if __name__ == '__main__': > +# The block-status cache only works on the protocol layer, so to test it, > +# we can only use the raw format > +iotests.main(supported_fmts=['raw'], > + supported_protocols=['file']) > diff --git a/tests/qemu-iotests/tests/block-status-cache.out > b/tests/qemu-iotests/tests/block-status-cache.out > new file mode 100644 > index 00..ae1213e6f8 > --- /dev/null > +++ b/tests/qemu-iotests/tests/block-status-cache.out > @@ -0,0 +1,5 @@ > +. > +-- > +Ran 1 tests > + > +OK > -- > 2.33.1 > Reviewed-by: Nir Soffer
[PATCH] nbd/server.c: Remove unused field
NBDRequestData struct has unused QSIMPLEQ_ENTRY field. It seems that this field exists since the first git commit and was never used. Signed-off-by: Nir Soffer --- nbd/server.c | 1 - 1 file changed, 1 deletion(-) diff --git a/nbd/server.c b/nbd/server.c index 3927f7789d..ce5b2a1d02 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -70,21 +70,20 @@ static int system_errno_to_nbd_errno(int err) default: return NBD_EINVAL; } } /* Definitions for opaque data types */ typedef struct NBDRequestData NBDRequestData; struct NBDRequestData { -QSIMPLEQ_ENTRY(NBDRequestData) entry; NBDClient *client; uint8_t *data; bool complete; }; struct NBDExport { BlockExport common; char *name; char *description; -- 2.34.1
Re: [PATCH 3/4] qemu-img: add --shallow option for qemu-img compare --stat
On Wed, Sep 29, 2021 at 7:28 PM Vladimir Sementsov-Ogievskiy wrote: > > 29.09.2021 19:00, Nir Soffer wrote: > > On Wed, Sep 29, 2021 at 4:37 PM Vladimir Sementsov-Ogievskiy > > wrote: > >> > >> Allow compare only top images of backing chains. That's useful for > >> comparing two increments from the same chain of incremental backups. > >> > >> Signed-off-by: Vladimir Sementsov-Ogievskiy > >> --- > >> docs/tools/qemu-img.rst | 8 +++- > >> qemu-img.c | 14 -- > >> qemu-img-cmds.hx| 4 ++-- > >> 3 files changed, 21 insertions(+), 5 deletions(-) > >> > >> diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst > >> index 4b382ca2b0..c8ae96be6a 100644 > >> --- a/docs/tools/qemu-img.rst > >> +++ b/docs/tools/qemu-img.rst > >> @@ -176,6 +176,12 @@ Parameters to compare subcommand: > >> - If both files don't specify cluster-size, use default of 64K > >> - If only one file specify cluster-size, just use it. > >> > >> +.. option:: --shallow > > > > We use the same term in oVirt when we upload/download one layer from a > > chain. > > > >> + Only allowed with ``--stat``. This option prevents opening and comparing > >> + any backing files. This is useful to compare incremental images from > >> + the chain of incremental backups. > > > > This is useful also without --stat. Our current workaround in oVirt is > > to use unsafe > > rebase to disconnect the top image from the base image so we can compare > > source and destination image after backup. > > > > Here is an example of test code that could use --shallow (regardless of > > --stat): > > https://github.com/oVirt/ovirt-imageio/blob/master/daemon/test/backup_test.py#L114 > > > > Do you have any reason to limit --shallow to --stats? > > > Hmm. I wrongly thought that without --stat qemu-img compare will fail on > first mismatch, which will occur soon, as we don't have backing images and > it's just superfluous. > > But actually, qemu-img will not compare "unallocated" areas. 
> > Ok, I agree, in v2 I'll allow --shallow without --stat. > > > Another question to discuss: we already have "-u" option in qemu-img create > and qemu-img rebase to not open backing files. And 'u' means 'unsafe'. > I don't think that "unsafe" term is good for qemu-img compare --stat, that's > why I decided to call it differently: "shallow". > Still for qemu-img compare (without --stat) "unsafe" term make sense. > > > So, it probably better to follow common notation, and call the option "-u". --shallow is better, comparing a single image from a chain is a safe operation. Replacing a backing file or creating an image on top of one without checking the backing file is not. > > > > >> + > >> Parameters to convert subcommand: > >> > >> .. program:: qemu-img-convert > >> @@ -395,7 +401,7 @@ Command description: > >> > >> The rate limit for the commit process is specified by ``-r``. > >> > >> -.. option:: compare [--object OBJECTDEF] [--image-opts] [-f FMT] [-F FMT] > >> [-T SRC_CACHE] [-p] [-q] [-s] [-U] [--stat [--block-size BLOCK_SIZE]] > >> FILENAME1 FILENAME2 > >> +.. option:: compare [--object OBJECTDEF] [--image-opts] [-f FMT] [-F FMT] > >> [-T SRC_CACHE] [-p] [-q] [-s] [-U] [--stat [--block-size BLOCK_SIZE] > >> [--shallow]] FILENAME1 FILENAME2 > >> > >> Check if two images have the same content. You can compare images with > >> different format or settings. 
> >> diff --git a/qemu-img.c b/qemu-img.c > >> index 61e7f470bb..e8ae412c38 100644 > >> --- a/qemu-img.c > >> +++ b/qemu-img.c > >> @@ -85,6 +85,7 @@ enum { > >> OPTION_SKIP_BROKEN = 277, > >> OPTION_STAT = 277, > >> OPTION_BLOCK_SIZE = 278, > >> +OPTION_SHALLOW = 279, > >> }; > >> > >> typedef enum OutputFormat { > >> @@ -1482,7 +1483,7 @@ static int img_compare(int argc, char **argv) > >> int64_t block_end; > >> int ret = 0; /* return value - 0 Ident, 1 Different, >1 Error */ > >> bool progress = false, quiet = false, strict = false; > >> -int flags; > >> +int flags = 0; > >> bool writethrough; > >> int64_t total_size; > >> int64_t offset =
Re: [PATCH 3/4] qemu-img: add --shallow option for qemu-img compare --stat
On Wed, Sep 29, 2021 at 4:37 PM Vladimir Sementsov-Ogievskiy wrote: > > Allow compare only top images of backing chains. That's useful for > comparing two increments from the same chain of incremental backups. > > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > docs/tools/qemu-img.rst | 8 +++- > qemu-img.c | 14 -- > qemu-img-cmds.hx| 4 ++-- > 3 files changed, 21 insertions(+), 5 deletions(-) > > diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst > index 4b382ca2b0..c8ae96be6a 100644 > --- a/docs/tools/qemu-img.rst > +++ b/docs/tools/qemu-img.rst > @@ -176,6 +176,12 @@ Parameters to compare subcommand: > - If both files don't specify cluster-size, use default of 64K > - If only one file specify cluster-size, just use it. > > +.. option:: --shallow We use the same term in oVirt when we upload/download one layer from a chain. > + Only allowed with ``--stat``. This option prevents opening and comparing > + any backing files. This is useful to compare incremental images from > + the chain of incremental backups. This is useful also without --stat. Our current workaround in oVirt is to use unsafe rebase to disconnect the top image from the base image so we can compare source and destination image after backup. Here is an example of test code that could use --shallow (regardless of --stat): https://github.com/oVirt/ovirt-imageio/blob/master/daemon/test/backup_test.py#L114 Do you have any reason to limit --shallow to --stats? > + > Parameters to convert subcommand: > > .. program:: qemu-img-convert > @@ -395,7 +401,7 @@ Command description: > >The rate limit for the commit process is specified by ``-r``. > > -.. option:: compare [--object OBJECTDEF] [--image-opts] [-f FMT] [-F FMT] > [-T SRC_CACHE] [-p] [-q] [-s] [-U] [--stat [--block-size BLOCK_SIZE]] > FILENAME1 FILENAME2 > +.. 
option:: compare [--object OBJECTDEF] [--image-opts] [-f FMT] [-F FMT] > [-T SRC_CACHE] [-p] [-q] [-s] [-U] [--stat [--block-size BLOCK_SIZE] > [--shallow]] FILENAME1 FILENAME2 > >Check if two images have the same content. You can compare images with >different format or settings. > diff --git a/qemu-img.c b/qemu-img.c > index 61e7f470bb..e8ae412c38 100644 > --- a/qemu-img.c > +++ b/qemu-img.c > @@ -85,6 +85,7 @@ enum { > OPTION_SKIP_BROKEN = 277, > OPTION_STAT = 277, > OPTION_BLOCK_SIZE = 278, > +OPTION_SHALLOW = 279, > }; > > typedef enum OutputFormat { > @@ -1482,7 +1483,7 @@ static int img_compare(int argc, char **argv) > int64_t block_end; > int ret = 0; /* return value - 0 Ident, 1 Different, >1 Error */ > bool progress = false, quiet = false, strict = false; > -int flags; > +int flags = 0; > bool writethrough; > int64_t total_size; > int64_t offset = 0; > @@ -1504,6 +1505,7 @@ static int img_compare(int argc, char **argv) > {"force-share", no_argument, 0, 'U'}, > {"stat", no_argument, 0, OPTION_STAT}, > {"block-size", required_argument, 0, OPTION_BLOCK_SIZE}, > +{"shallow", no_argument, 0, OPTION_SHALLOW}, > {0, 0, 0, 0} > }; > c = getopt_long(argc, argv, ":hf:F:T:pqsU", > @@ -1569,6 +1571,9 @@ static int img_compare(int argc, char **argv) > exit(EXIT_SUCCESS); > } > break; > +case OPTION_SHALLOW: > +flags |= BDRV_O_NO_BACKING; > +break; > } > } > > @@ -1590,10 +1595,15 @@ static int img_compare(int argc, char **argv) > goto out; > } > > +if (!do_stat && (flags & BDRV_O_NO_BACKING)) { > +error_report("--shallow can be used only together with --stat"); > +ret = 1; > +goto out; > +} > + > /* Initialize before goto out */ > qemu_progress_init(progress, 2.0); > > -flags = 0; > ret = bdrv_parse_cache_mode(cache, &flags, &writethrough); > if (ret < 0) { > error_report("Invalid source cache option: %s", cache); > diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx > index 96a193eea8..a295bc6860 100644 > --- a/qemu-img-cmds.hx > +++ b/qemu-img-cmds.hx > @@ -40,9 +40,9 
@@ SRST > ERST > > DEF("compare", img_compare, > -"compare [--object objectdef] [--image-opts] [-f fmt] [-F fmt] [-T > src_cache] [-p] [-q] [-s] [-U] [--stat [--block-size BLOCK_SIZE]] filename1 > filename2") > +"compare [--object objectdef] [--image-opts] [-f fmt] [-F fmt] [-T > src_cache] [-p] [-q] [-s] [-U] [--stat [--block-size BLOCK_SIZE] [--shallow]] > filename1 filename2") > SRST > -.. option:: compare [--object OBJECTDEF] [--image-opts] [-f FMT] [-F FMT] > [-T SRC_CACHE] [-p] [-q] [-s] [-U] [--stat [--block-size BLOCK_SIZE]] > FILENAME1 FILENAME2 > +.. option:: compare [--object OBJECTDEF] [--image-opts] [-f FMT] [-F FMT] > [-T SRC_CACHE] [-p] [-q] [-s] [-U] [--stat [--block-size BLOCK_SIZE] > [--shallow]] FILENAME1 FILENAME2 > ERST > > DEF("convert",
Re: [PATCH] qemu-nbd: Change default cache mode to writeback
On Mon, Aug 16, 2021 at 6:50 PM Eric Blake wrote: > > On Fri, Aug 13, 2021 at 11:55:19PM +0300, Nir Soffer wrote: > > Both qemu and qemu-img use writeback cache mode by default, which is > > already documented in qemu(1). qemu-nbd uses writethrough cache mode by > > default, and the default cache mode is not documented. > > > > According to the qemu-nbd(8): > > > >--cache=CACHE > > The cache mode to be used with the file. See the > > documentation of the emulator's -drive cache=... option for > > allowed values. > > > > qemu(1) says: > > > > The default mode is cache=writeback. > > > > So users have no reason to assume that qemu-nbd is using writethrough > > cache mode. The only hint is the painfully slow writing when using the > > defaults. > > Oh, good catch. Unfortunately too late for 6.1 proper, but I'll add > qemu-stable in cc and queue this through my NBD tree for 6.2. I don't see this in master, lost in your NBD tree? > > Users can avoid the issue by using --cache=writeback[1] but the defaults > > should give good performance for the common use case. > > > > [1] https://bugzilla.redhat.com/1990656 > > > > Signed-off-by: Nir Soffer > > --- > > Reviewed-by: Eric Blake > > -- > Eric Blake, Principal Software Engineer > Red Hat, Inc. +1-919-301-3266 > Virtualization: qemu.org | libvirt.org
[PATCH] qemu-nbd: Change default cache mode to writeback
Both qemu and qemu-img use writeback cache mode by default, which is already documented in qemu(1). qemu-nbd uses writethrough cache mode by default, and the default cache mode is not documented. According to the qemu-nbd(8): --cache=CACHE The cache mode to be used with the file. See the documentation of the emulator's -drive cache=... option for allowed values. qemu(1) says: The default mode is cache=writeback. So users have no reason to assume that qemu-nbd is using writethrough cache mode. The only hint is the painfully slow writing when using the defaults. Looking in git history, it seems that qemu used writethrough in the past to support broken guests that did not flush data properly, or could not flush due to limitations in qemu. But qemu-nbd clients can use NBD_CMD_FLUSH to flush data, so using writethrough does not help anyone. Change the default cache mode to writeback, and document the default and available values properly in the online help and manual. With this change converting an image via qemu-nbd is 3.5 times faster. $ qemu-img create dst.img 50g $ qemu-nbd -t -f raw -k /tmp/nbd.sock dst.img Before this change: $ hyperfine -r3 "./qemu-img convert -p -f raw -O raw -T none -W fedora34.img nbd+unix:///?socket=/tmp/nbd.sock" Benchmark #1: ./qemu-img convert -p -f raw -O raw -T none -W fedora34.img nbd+unix:///?socket=/tmp/nbd.sock Time (mean ± σ): 83.639 s ± 5.970 s[User: 2.733 s, System: 6.112 s] Range (min … max): 76.749 s … 87.245 s3 runs After this change: $ hyperfine -r3 "./qemu-img convert -p -f raw -O raw -T none -W fedora34.img nbd+unix:///?socket=/tmp/nbd.sock" Benchmark #1: ./qemu-img convert -p -f raw -O raw -T none -W fedora34.img nbd+unix:///?socket=/tmp/nbd.sock Time (mean ± σ): 23.522 s ± 0.433 s[User: 2.083 s, System: 5.475 s] Range (min … max): 23.234 s … 24.019 s3 runs Users can avoid the issue by using --cache=writeback[1] but the defaults should give good performance for the common use case. 
[1] https://bugzilla.redhat.com/1990656 Signed-off-by: Nir Soffer --- docs/tools/qemu-nbd.rst | 6 -- qemu-nbd.c | 6 -- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/docs/tools/qemu-nbd.rst b/docs/tools/qemu-nbd.rst index ee862fa0bc..5643da26e9 100644 --- a/docs/tools/qemu-nbd.rst +++ b/docs/tools/qemu-nbd.rst @@ -98,8 +98,10 @@ driver options if ``--image-opts`` is specified. .. option:: --cache=CACHE - The cache mode to be used with the file. See the documentation of - the emulator's ``-drive cache=...`` option for allowed values. + The cache mode to be used with the file. Valid values are: + ``none``, ``writeback`` (the default), ``writethrough``, + ``directsync`` and ``unsafe``. See the documentation of + the emulator's ``-drive cache=...`` option for more info. .. option:: -n, --nocache diff --git a/qemu-nbd.c b/qemu-nbd.c index 26ffbf15af..6c18fcd19a 100644 --- a/qemu-nbd.c +++ b/qemu-nbd.c @@ -135,7 +135,9 @@ static void usage(const char *name) "'snapshot.id=[ID],snapshot.name=[NAME]', or\n" "'[ID_OR_NAME]'\n" " -n, --nocache disable host cache\n" -" --cache=MODE set cache mode (none, writeback, ...)\n" +" --cache=MODE set cache mode used to access the disk image, the\n" +"valid options are: 'none', 'writeback' (default),\n" +"'writethrough', 'directsync' and 'unsafe'\n" " --aio=MODEset AIO mode (native, io_uring or threads)\n" " --discard=MODEset discard mode (ignore, unmap)\n" " --detect-zeroes=MODE set detect-zeroes mode (off, on, unmap)\n" @@ -552,7 +554,7 @@ int main(int argc, char **argv) bool alloc_depth = false; const char *tlscredsid = NULL; bool imageOpts = false; -bool writethrough = true; +bool writethrough = false; /* Client will flush as needed. */ bool fork_process = false; bool list = false; int old_stderr = -1; -- 2.31.1
Re: [PATCH for-6.1?] iotest: Further enhance qemu-img-bitmaps
On 7/21/21 11:46 PM, Eric Blake wrote: Add a regression test to make sure we detect attempts to use 'qemu-img bitmap' to modify an in-use local file. Suggested-by: Nir Soffer Signed-off-by: Eric Blake --- Sadly, this missed my bitmaps pull request today. If there's any reason to respin that pull request, I'm inclined to add this in, as it just touches the iotests; otherwise, if it slips to 6.2 it's not too bad. tests/qemu-iotests/tests/qemu-img-bitmaps | 6 ++ tests/qemu-iotests/tests/qemu-img-bitmaps.out | 5 + 2 files changed, 11 insertions(+) diff --git a/tests/qemu-iotests/tests/qemu-img-bitmaps b/tests/qemu-iotests/tests/qemu-img-bitmaps index 7a3fe8c3d37a..3b6fade11735 100755 --- a/tests/qemu-iotests/tests/qemu-img-bitmaps +++ b/tests/qemu-iotests/tests/qemu-img-bitmaps @@ -129,6 +129,12 @@ $QEMU_IMG map --output=json --image-opts \ $QEMU_IMG map --output=json --image-opts \ "$IMG,x-dirty-bitmap=qemu:dirty-bitmap:b3" | _filter_qemu_img_map +echo +echo "=== bitmap command fails to modify image already in use ===" +echo + +$QEMU_IMG bitmap --add "$TEST_IMG" b4 2>&1 | _filter_testdir + nbd_server_stop echo diff --git a/tests/qemu-iotests/tests/qemu-img-bitmaps.out b/tests/qemu-iotests/tests/qemu-img-bitmaps.out index e851f0320ecb..c6e12dd700aa 100644 --- a/tests/qemu-iotests/tests/qemu-img-bitmaps.out +++ b/tests/qemu-iotests/tests/qemu-img-bitmaps.out @@ -116,6 +116,11 @@ Format specific information: { "start": 2097152, "length": 1048576, "depth": 0, "present": false, "zero": false, "data": false}, { "start": 3145728, "length": 7340032, "depth": 0, "present": true, "zero": false, "data": true, "offset": OFFSET}] +=== bitmap command fails to modify image already in use === + +qemu-img: Could not open 'TEST_DIR/t.qcow2': Failed to get "write" lock +Is another process using the image [TEST_DIR/t.qcow2]? + === Check handling of inconsistent bitmap === image: TEST_DIR/t.IMGFMT It would be nice to test more than --add. 
I guess the implementation is shared, but if someone changes it the test will protect us. Reviewed-by: Nir Soffer
Re: [PATCH v2 3/3] qemu-img: Add --skip-broken-bitmaps for 'convert --bitmaps'
On Tue, Jul 13, 2021 at 8:53 PM Eric Blake wrote: > > On Sat, Jul 10, 2021 at 09:37:35PM +0300, Nir Soffer wrote: > > > We don't want to delete inconsistent bitmaps by default: although a > > > corrupt bitmap is only a loss of optimization rather than a corruption > > > of user-visible data, it is still nice to require the user to opt in > > > to the fact that they are aware of the loss of the bitmap. Still, > > > requiring the user to check 'qemu-img info' to see whether bitmaps are > > > consistent, then use 'qemu-img bitmap --remove' to remove offenders, > > > all before using 'qemu-img convert', is a lot more work than just > > > adding a knob 'qemu-img convert --bitmaps --skip-broken-bitmaps' which > > > opts in to skipping the broken bitmaps. > > > > I think this is more than convenience. During live storage migration in > > oVirt, we mirror the top layer to the destination using libvirt blockCopy, > > and copy the rest of the chain using qemu-img convert with the --bitmaps > > option. > > Still, this feels like enough of a feature that I'd really like R-b in > time to prepare a pull request for inclusion in soft freeze; the > justification for it being a bug fix is a tough sell. This is not a bug in the current code, more like missing handling of important use case. Without this we cannot copy images in some cases, or we must require downtime to check and repair images before copying disks. > > > +.. option:: convert [--object OBJECTDEF] [--image-opts] > > > [--target-image-opts] [--target-is-zero] [--bitmaps > > > [--skip-broken-bitmaps]] [-U] [-C] [-c] [-p] [-q] [-n] [-f FMT] [-t > > > CACHE] [-T SRC_CACHE] [-O OUTPUT_FMT] [-B BACKING_FILE] [-o OPTIONS] [-l > > > SNAPSHOT_PARAM] [-S SPARSE_SIZE] [-r RATE_LIMIT] [-m NUM_COROUTINES] [-W] > > > FILENAME [FILENAME2 [...]] OUTPUT_FILENAME > > > > I liked --skip-broken more, but Vladimir is right that this is not really a > > sub-option. 
> > getopt_long() lets you abbreviate; '--sk' and '--skip-broken' are both > unambiguous prefixes of '--skip-broken-bitmaps'. Nice to learn that > > > @@ -2117,7 +2118,7 @@ static int convert_check_bitmaps(BlockDriverState > > > *src) > > > continue; > > > } > > > name = bdrv_dirty_bitmap_name(bm); > > > -if (bdrv_dirty_bitmap_inconsistent(bm)) { > > > +if (!skip_broken && bdrv_dirty_bitmap_inconsistent(bm)) { > > > error_report("Cannot copy inconsistent bitmap '%s'", name); > > > > We can add another hint: > > > > Try --skip-brocken-bitmaps to skip this bitmap or "qemu-img bitmap > > --remove" to delete it from disk. > > Sure, I can see about adding that. > > > > > > > return -1; > > > } > > > @@ -2125,7 +2126,8 @@ static int convert_check_bitmaps(BlockDriverState > > > *src) > > > return 0; > > > } > > > > > > -static int convert_copy_bitmaps(BlockDriverState *src, BlockDriverState > > > *dst) > > > +static int convert_copy_bitmaps(BlockDriverState *src, BlockDriverState > > > *dst, > > > +bool skip_broken) > > > { > > > BdrvDirtyBitmap *bm; > > > Error *err = NULL; > > > @@ -2137,6 +2139,10 @@ static int convert_copy_bitmaps(BlockDriverState > > > *src, BlockDriverState *dst) > > > continue; > > > } > > > name = bdrv_dirty_bitmap_name(bm); > > > +if (skip_broken && bdrv_dirty_bitmap_inconsistent(bm)) { > > > +warn_report("Skipping inconsistent bitmap %s", name); > > > > In other logs we quote the bitmap name:'%s' > > Yes, will fix. > > > > +++ b/tests/qemu-iotests/tests/qemu-img-bitmaps > > > @@ -143,6 +143,16 @@ $QEMU_IMG convert --bitmaps -O qcow2 "$TEST_IMG" > > > "$TEST_IMG.copy" && > > > echo "unexpected success" > > > TEST_IMG=$TEST_IMG.copy _img_info --format-specific \ > > > | _filter_irrelevant_img_info > > > > A new title here will make the test output much more clear. > > Or even just a bare 'echo' to separate things with blank lines. Will > improve. 
> > > > +++ b/tests/qemu-iotests/tests/qemu-img-bitmaps.out > > > @@ -145,4 +145,35 @@ Format specific information: > > > corrupt: false > > > qemu-img: Cannot copy inconsistent bitmap 'b0' > > > qemu-img: Could not open 'TEST_DIR/t.IMGFMT.copy': Could not open > > > 'TEST_DIR/t.IMGFMT.copy': No such file or directory > > > > Why to we get this error? I guess it is part of the first copy that should > > fail? > > Yes - proof that we no longer leave a broken file around, but instead > failed fast (in fact, that's part of the previous patch). > > > > > > +qemu-img: warning: Skipping inconsistent bitmap b0 > > > +qemu-img: warning: Skipping inconsistent bitmap b2 > > > > Looks useful, I need to check that we log such warnings. > > > > Anything else I should improve before sending a v2? I think we covered everything, but Vladimir may want to comment.
Re: [PATCH v2 2/3] qemu-img: Fail fast on convert --bitmaps with inconsistent bitmap
On Tue, Jul 13, 2021 at 8:48 PM Eric Blake wrote: > > On Sat, Jul 10, 2021 at 09:06:24PM +0300, Nir Soffer wrote: > > On 7/9/21 6:39 PM, Eric Blake wrote: > > > Waiting until the end of the convert operation (a potentially > > > time-consuming task) to finally detect that we can't copy a bitmap is > > > bad, comparing to failing fast up front. Furthermore, this prevents > > > us from leaving a file behind with a bitmap that is not marked as > > > inconsistent even though it does not have sane contents. > > > > I don't think this is an issue since qemu-img terminate with non-zero > > exit code, and we cannot ensure that image is complete if we fail in > > the middle of the operation for all image formats and protocols. > > > > For files we could use a temporary file and rename after successful > > conversion for for raw format on block device we don't have any way > > to mark the contents as temporary. > > Atomic rename into place for files is nice, but as you point out, it > doesn't help when targetting block devices. So whatever we do to keep > block devices robust even across temporary state changes is also > sufficient for files, even if we can indeed improve the situation for > files in a later patch. I think management tools should handle this. In oVirt we keep metadata and cluster locks for any kind of volume and we use them to mark volumes being copied as temporary, so from our point of view proper cleanup in failure flows is non-issue. > > But failing fast is very important. > > > > > This fixes the problems exposed in the previous patch to the iotest. 
> > > > > > Signed-off-by: Eric Blake > > > --- > > > qemu-img.c| 30 +-- > > > tests/qemu-iotests/tests/qemu-img-bitmaps | 2 -- > > > tests/qemu-iotests/tests/qemu-img-bitmaps.out | 20 ++--- > > > 3 files changed, 29 insertions(+), 23 deletions(-) > > > > > > diff --git a/qemu-img.c b/qemu-img.c > > > index 7956a8996512..e84b3c530155 100644 > > > --- a/qemu-img.c > > > +++ b/qemu-img.c > > > @@ -2101,6 +2101,30 @@ static int convert_do_copy(ImgConvertState *s) > > > return s->ret; > > > } > > > > > > +/* Check that bitmaps can be copied, or output an error */ > > > +static int convert_check_bitmaps(BlockDriverState *src) > > > +{ > > > +BdrvDirtyBitmap *bm; > > > + > > > +if (!bdrv_supports_persistent_dirty_bitmap(src)) { > > > +error_report("Source lacks bitmap support"); > > > +return -1; > > > +} > > > +FOR_EACH_DIRTY_BITMAP(src, bm) { > > > +const char *name; > > > + > > > +if (!bdrv_dirty_bitmap_get_persistence(bm)) { > > > +continue; > > > +} > > > +name = bdrv_dirty_bitmap_name(bm); > > > +if (bdrv_dirty_bitmap_inconsistent(bm)) { > > > +error_report("Cannot copy inconsistent bitmap '%s'", name); > > > > We can add a useful hint: > > > > Try "qemu-img bitmap --remove" to delete this bitmap from disk. > > Yeah, that might be worthwhile. > > > > > > +return -1; > > > +} > > > +} > > > +return 0; > > > +} > > > + > > > static int convert_copy_bitmaps(BlockDriverState *src, BlockDriverState > > > *dst) > > > { > > > BdrvDirtyBitmap *bm; > > > @@ -2127,6 +2151,7 @@ static int convert_copy_bitmaps(BlockDriverState > > > *src, BlockDriverState *dst) > > > &err); > > > if (err) { > > > error_reportf_err(err, "Failed to populate bitmap %s: ", > > > name); > > > +qmp_block_dirty_bitmap_remove(dst->node_name, name, NULL); > > > > This may fail for the same reason populate failed (e.g. storage became > > inaccessibel in the middle of the copy). Since we fail the convert, I don't > > think it worth to try to do this kind of cleanup. 
> > If we have a way to disable the bitmap before merge, and enable it after > successful merge it makes more sense, since if the operation fails we are > left with a disabled bitmap. > If we got this far, the guest-visible data WAS copied successfully. > 'qemu-img compare' will report success. The only thing broken at this > point is a bogus bitmap, and leaving a just-created (but empty) bitmap > in place rather than erasing it (since we just created it a few lines > above) is not nice. I see no problem with keeping this cleanup path > intact, even if it is seldom reached, and even though we still exit > the overall qemu-img convert with an error. Sure, no reason to delay this fix. With or without hint on errors, Reviewed-by: Nir Soffer
Re: [PATCH v2 3/3] qemu-img: Add --skip-broken-bitmaps for 'convert --bitmaps'
On 7/9/21 6:39 PM, Eric Blake wrote: The point of 'qemu-img convert --bitmaps' is to be a convenience for actions that are already possible through a string of smaller 'qemu-img bitmap' sub-commands. One situation not accounted for already is that if a source image contains an inconsistent bitmap (for example, because a qemu process died abruptly before flushing bitmap state), the user MUST delete those inconsistent bitmaps before anything else useful can be done with the image. The only thing affected by inconsistent bitmap is creating incremental backup, and taking some space on storage. Anything else should not be affected by having such bitmap so the user does not need to remove it. In oVirt we don't check or repair images after unclean guest shutdown. Maybe this is a good idea for future version. Inconsistent bitmaps are removed only when the user ask to remove the related checkpoint. We don't want to delete inconsistent bitmaps by default: although a corrupt bitmap is only a loss of optimization rather than a corruption of user-visible data, it is still nice to require the user to opt in to the fact that they are aware of the loss of the bitmap. Still, requiring the user to check 'qemu-img info' to see whether bitmaps are consistent, then use 'qemu-img bitmap --remove' to remove offenders, all before using 'qemu-img convert', is a lot more work than just adding a knob 'qemu-img convert --bitmaps --skip-broken-bitmaps' which opts in to skipping the broken bitmaps. I think this is more than convenience. During live storage migration in oVirt, we mirror the top layer to the destination using libvirt blockCopy, and copy the rest of the chain using qemu-img convert with the --bitmaps option. If we have to remove inconsistent bitmaps at this point we need to modify images opened for reading by qemu, which is likely not possible and even if it is possible, sounds like a bad idea. 
After testing the new option, also demonstrate the way to manually fix things (either deleting bad bitmaps, or re-creating them as empty) so that it is possible to convert without the option. Fixes: https://bugzilla.redhat.com/show_bug.cgi?id=1946084 Signed-off-by: Eric Blake --- docs/tools/qemu-img.rst | 8 - qemu-img.c| 26 +--- tests/qemu-iotests/tests/qemu-img-bitmaps | 10 ++ tests/qemu-iotests/tests/qemu-img-bitmaps.out | 31 +++ 4 files changed, 69 insertions(+), 6 deletions(-) diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst index cfe11478791f..4d407b180450 100644 --- a/docs/tools/qemu-img.rst +++ b/docs/tools/qemu-img.rst @@ -414,7 +414,7 @@ Command description: 4 Error on reading data -.. option:: convert [--object OBJECTDEF] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps] [-U] [-C] [-c] [-p] [-q] [-n] [-f FMT] [-t CACHE] [-T SRC_CACHE] [-O OUTPUT_FMT] [-B BACKING_FILE] [-o OPTIONS] [-l SNAPSHOT_PARAM] [-S SPARSE_SIZE] [-r RATE_LIMIT] [-m NUM_COROUTINES] [-W] FILENAME [FILENAME2 [...]] OUTPUT_FILENAME +.. option:: convert [--object OBJECTDEF] [--image-opts] [--target-image-opts] [--target-is-zero] [--bitmaps [--skip-broken-bitmaps]] [-U] [-C] [-c] [-p] [-q] [-n] [-f FMT] [-t CACHE] [-T SRC_CACHE] [-O OUTPUT_FMT] [-B BACKING_FILE] [-o OPTIONS] [-l SNAPSHOT_PARAM] [-S SPARSE_SIZE] [-r RATE_LIMIT] [-m NUM_COROUTINES] [-W] FILENAME [FILENAME2 [...]] OUTPUT_FILENAME I liked --skip-broken more, but Vladimir is right that this is not really a sub-option. Convert the disk image *FILENAME* or a snapshot *SNAPSHOT_PARAM* to disk image *OUTPUT_FILENAME* using format *OUTPUT_FMT*. It can @@ -456,6 +456,12 @@ Command description: *NUM_COROUTINES* specifies how many coroutines work in parallel during the convert process (defaults to 8). + Use of ``--bitmaps`` requests that any persistent bitmaps present in + the original are also copied to the destination. 
If any bitmap is + inconsistent in the source, the conversion will fail unless + ``--skip-broken-bitmaps`` is also specified to copy only the + consistent bitmaps. + .. option:: create [--object OBJECTDEF] [-q] [-f FMT] [-b BACKING_FILE] [-F BACKING_FMT] [-u] [-o OPTIONS] FILENAME [SIZE] Create the new disk image *FILENAME* of size *SIZE* and format diff --git a/qemu-img.c b/qemu-img.c index e84b3c530155..661538edd785 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -82,6 +82,7 @@ enum { OPTION_MERGE = 274, OPTION_BITMAPS = 275, OPTION_FORCE = 276, +OPTION_SKIP_BROKEN = 277, }; typedef enum OutputFormat { @@ -2102,7 +2103,7 @@ static int convert_do_copy(ImgConvertState *s) } /* Check that bitmaps can be copied, or output an error */ -static int convert_check_bitmaps(BlockDriverState *src) +static int convert_check_bitmaps(BlockDriverState *src, bool skip_broken) { BdrvDirtyBitmap *bm; @@ -2117,7 +2118,7 @@ static int convert_che
Re: [PATCH v2 2/3] qemu-img: Fail fast on convert --bitmaps with inconsistent bitmap
On 7/9/21 6:39 PM, Eric Blake wrote: Waiting until the end of the convert operation (a potentially time-consuming task) to finally detect that we can't copy a bitmap is bad, comparing to failing fast up front. Furthermore, this prevents us from leaving a file behind with a bitmap that is not marked as inconsistent even though it does not have sane contents. I don't think this is an issue since qemu-img terminate with non-zero exit code, and we cannot ensure that image is complete if we fail in the middle of the operation for all image formats and protocols. For files we could use a temporary file and rename after successful conversion for for raw format on block device we don't have any way to mark the contents as temporary. But failing fast is very important. This fixes the problems exposed in the previous patch to the iotest. Signed-off-by: Eric Blake --- qemu-img.c| 30 +-- tests/qemu-iotests/tests/qemu-img-bitmaps | 2 -- tests/qemu-iotests/tests/qemu-img-bitmaps.out | 20 ++--- 3 files changed, 29 insertions(+), 23 deletions(-) diff --git a/qemu-img.c b/qemu-img.c index 7956a8996512..e84b3c530155 100644 --- a/qemu-img.c +++ b/qemu-img.c @@ -2101,6 +2101,30 @@ static int convert_do_copy(ImgConvertState *s) return s->ret; } +/* Check that bitmaps can be copied, or output an error */ +static int convert_check_bitmaps(BlockDriverState *src) +{ +BdrvDirtyBitmap *bm; + +if (!bdrv_supports_persistent_dirty_bitmap(src)) { +error_report("Source lacks bitmap support"); +return -1; +} +FOR_EACH_DIRTY_BITMAP(src, bm) { +const char *name; + +if (!bdrv_dirty_bitmap_get_persistence(bm)) { +continue; +} +name = bdrv_dirty_bitmap_name(bm); +if (bdrv_dirty_bitmap_inconsistent(bm)) { +error_report("Cannot copy inconsistent bitmap '%s'", name); We can add a useful hint: Try "qemu-img bitmap --remove" to delete this bitmap from disk. 
+return -1; +} +} +return 0; +} + static int convert_copy_bitmaps(BlockDriverState *src, BlockDriverState *dst) { BdrvDirtyBitmap *bm; @@ -2127,6 +2151,7 @@ static int convert_copy_bitmaps(BlockDriverState *src, BlockDriverState *dst) &err); if (err) { error_reportf_err(err, "Failed to populate bitmap %s: ", name); +qmp_block_dirty_bitmap_remove(dst->node_name, name, NULL); This may fail for the same reason populate failed (e.g. storage became inaccessibel in the middle of the copy). Since we fail the convert, I don't think it worth to try to do this kind of cleanup. If we have a way to disable the bitmap before merge, and enable it after successful merge it make more sense, since if the operation fails we are left with disabled bitmap. return -1; } } @@ -2552,9 +2577,8 @@ static int img_convert(int argc, char **argv) ret = -1; goto out; } -if (!bdrv_supports_persistent_dirty_bitmap(blk_bs(s.src[0]))) { -error_report("Source lacks bitmap support"); -ret = -1; +ret = convert_check_bitmaps(blk_bs(s.src[0])); +if (ret < 0) { goto out; } } diff --git a/tests/qemu-iotests/tests/qemu-img-bitmaps b/tests/qemu-iotests/tests/qemu-img-bitmaps index 2f51651d0ce5..3fde95907515 100755 --- a/tests/qemu-iotests/tests/qemu-img-bitmaps +++ b/tests/qemu-iotests/tests/qemu-img-bitmaps @@ -141,8 +141,6 @@ $QEMU_IMG bitmap --remove "$TEST_IMG" b1 _img_info --format-specific | _filter_irrelevant_img_info $QEMU_IMG convert --bitmaps -O qcow2 "$TEST_IMG" "$TEST_IMG.copy" && echo "unexpected success" -# Bug - even though we failed at conversion, we left a file around with -# a bitmap marked as not corrupt TEST_IMG=$TEST_IMG.copy _img_info --format-specific \ | _filter_irrelevant_img_info diff --git a/tests/qemu-iotests/tests/qemu-img-bitmaps.out b/tests/qemu-iotests/tests/qemu-img-bitmaps.out index b762362075d1..546aaa404bba 100644 --- a/tests/qemu-iotests/tests/qemu-img-bitmaps.out +++ b/tests/qemu-iotests/tests/qemu-img-bitmaps.out @@ -143,22 +143,6 @@ Format specific information: name: 
b4 granularity: 65536 corrupt: false -qemu-img: Failed to populate bitmap b0: Bitmap 'b0' is inconsistent and cannot be used -Try block-dirty-bitmap-remove to delete this bitmap from disk -image: TEST_DIR/t.IMGFMT.copy -file format: IMGFMT -virtual size: 10 MiB (10485760 bytes) -cluster_size: 65536 -Format specific information: -bitmaps: -[0]: -flags: -name: b0 -granularity: 65536 -[1]: -flags: -[0]: auto -name: b4 -granularity: 65536 -corrupt: false +qemu-img: Cannot copy inconsistent bitmap 'b0' +qemu-img: Could not o
Re: [PATCH v2 1/3] iotests: Improve and rename test 291 to qemu-img-bitmap
to populate bitmap b0: Bitmap 'b0' is inconsistent and cannot be used +Try block-dirty-bitmap-remove to delete this bitmap from disk In this context a more useful error message would be: Try "qemu-img bitmap --remove" ... but this is not a new issue. +image: TEST_DIR/t.IMGFMT.copy +file format: IMGFMT +virtual size: 10 MiB (10485760 bytes) +cluster_size: 65536 +Format specific information: +bitmaps: +[0]: +flags: +name: b0 +granularity: 65536 +[1]: +flags: +[0]: auto +name: b4 +granularity: 65536 +corrupt: false *** done Reviewed-by: Nir Soffer
Re: [PATCH v3 3/2] qemu-img: Reword 'qemu-img map --output=json' docs
On Wed, Jul 7, 2021 at 9:41 PM Eric Blake wrote: > > Reword the paragraphs to list the JSON key first, rather than in the > middle of prose. > > Suggested-by: Vladimir Sementsov-Ogievskiy > Signed-off-by: Eric Blake > --- > docs/tools/qemu-img.rst | 20 ++-- > 1 file changed, 10 insertions(+), 10 deletions(-) > > diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst > index d6300f7ee03d..1d8470eada0e 100644 > --- a/docs/tools/qemu-img.rst > +++ b/docs/tools/qemu-img.rst > @@ -593,16 +593,16 @@ Command description: >the ``start``, ``length``, ``offset`` fields; >it will also include other more specific information: > > - - whether the sectors contain actual data or not (boolean field ``data``; > -if false, the sectors are either unallocated or stored as optimized > -all-zero clusters); > - - whether the data is known to read as zero (boolean field ``zero``); > - - whether the data is actually present (boolean field ``present``); > -if false, rebasing the backing chain onto a deeper file would pick > -up data from the deeper file; > - - in order to make the output shorter, the target file is expressed as > -a ``depth``; for example, a depth of 2 refers to the backing file > -of the backing file of *FILENAME*. > + - boolean field ``data``: true if the sectors contain actual data, > +false if the sectors are either unallocated or stored as optimized > +all-zero clusters > + - boolean field ``zero``: true if the data is known to read as zero > + - boolean field ``present``: true if the data belongs to the backing > +chain, false if rebasing the backing chain onto a deeper file > +would pick up data from the deeper file; > + - integer field ``depth``: the depth within the backing chain at > +which the data was resolved; for example, a depth of 2 refers to > +the backing file of the backing file of *FILENAME*. > >In JSON format, the ``offset`` field is optional; it is absent in >cases where ``human`` format would omit the entry or exit with an error. 
> -- > 2.31.1 Would be nice if this could be generated from the json schema instead of repeating the type and description of the fields, but this is a nice improvement. Reviewed-by: Nir Soffer
Re: [PATCH v2] docs: document file-posix locking protocol
On Sat, Jul 3, 2021 at 4:51 PM Vladimir Sementsov-Ogievskiy wrote: > > Let's document how we use file locks in file-posix driver, to allow > external programs to "communicate" in this way with Qemu. This makes the locking implementation public, so qemu can never change it without breaking external programs. I'm not sure this is an issue since even now qemu cannot change without breaking compatibility with older qemu versions. Maybe a better way to integrate with external programs is to provide a library/tool to perform locking? For example we can have tool like: qemu-img lock [how] image command This example will take the lock specified by "how" on image while "command" is running. > Signed-off-by: Vladimir Sementsov-Ogievskiy > --- > > v2: improve some descriptions > add examples > add notice about old bad POSIX file locks > > docs/system/qemu-block-drivers.rst.inc | 186 + > 1 file changed, 186 insertions(+) > > diff --git a/docs/system/qemu-block-drivers.rst.inc > b/docs/system/qemu-block-drivers.rst.inc > index 16225710eb..74fb71600d 100644 > --- a/docs/system/qemu-block-drivers.rst.inc > +++ b/docs/system/qemu-block-drivers.rst.inc > @@ -909,3 +909,189 @@ some additional tasks, hooking io requests. >.. option:: prealloc-size > > How much to preallocate (in bytes), default 128M. > + > +Image locking protocol > +~~ > + > +QEMU holds rd locks and never rw locks. Instead, GETLK fcntl is used with > F_WRLCK > +to handle permissions as described below. > +QEMU process may rd-lock the following bytes of the image with corresponding > +meaning: > + > +Permission bytes. If permission byte is rd-locked, it means that some process > +uses corresponding permission on that file. > + > +ByteOperation > +100 read > + Lock holder can read > +101 write > + Lock holder can write > +102 write-unchanged > + Lock holder can write same data if it sure, that this write doesn't > + break concurrent readers. 
This is mostly used internally in Qemu > + and it wouldn't be a good idea to exploit it somehow. > +103 resize > + Lock holder can resize the file. "write" permission is also > required > + for resizing, so lock byte 103 only if you also lock byte 101. > +104 graph-mod > + Undefined. QEMU may sometimes lock this byte, but external > programs > + should not. QEMU will stop locking this byte in the future > + > +Unshare bytes. If permission byte is rd-locked, it means that some process > +does not allow other processes to use corresponding options on that file. > + > +ByteOperation > +200 read > + Lock holder doesn't allow read operation to other processes. > +201 write > + Lock holder doesn't allow write operation to other processes. This > + still allows others to do write-unchanged operations. Better not > + exploit outside of Qemu. > +202 write-unchanged > + Lock holder doesn't allow write-unchanged operation to other > processes. > +203 resize > + Lock holder doesn't allow resizing the file by other processes. > +204 graph-mod > + Undefined. QEMU may sometimes lock this byte, but external > programs > + should not. QEMU will stop locking this byte in the future > + > +Handling the permissions works as follows: assume we want to open the file > to do > +some operations and at the same time want to disallow some operation to other > +processes. So, we want to lock some of the bytes described above. We operate > as > +follows: > + > +1. rd-lock all needed bytes, both "permission" bytes and "unshare" bytes. > + > +2. For each "unshare" byte we rd-locked, do GETLK that "tries" to wr-lock > +corresponding "permission" byte. So, we check whether there is any other process that > +uses the permission we want to unshare. If it exists we fail. > + > +3. For each "permission" byte we rd-locked, do GETLK that "tries" to wr-lock > +corresponding "unshare" byte. So, we check whether there is any other process that > +unshares the permission we want to have. If it exists we fail. 
> + > +Important notice: Qemu may fall back to POSIX file locks only if OFD locks > +are unavailable. Other programs should behave similarly: use POSIX file locks > +only if OFD locks are unavailable and if you are OK with drawbacks of POSIX > +file locks (for example, they are lost on close() of any file descriptor > +for that file). Worth an example. > + > +Image locking examples > +~~ > + > +Read-only, allow others to write > + > + > +So, we want to read and don't care what other users do with the image. We > only > +need to lock byte 100. Operation is as follows: > + > +1. rd-lock byte 100 > + > +.. highlight:: c > + > +struct flock fl = { > +.l_whence = SEEK_SET, > +.l_start = 100, > +.l_len= 1, > +.l_type = F_RDLCK, > +}; >
Re: [PATCH v2 2/1] qemu-img: Add "backing":true to unallocated map segments
On Tue, Jun 29, 2021 at 5:40 PM Kevin Wolf wrote: > > Am 29.06.2021 um 09:23 hat Vladimir Sementsov-Ogievskiy geschrieben: > > 28.06.2021 20:42, Eric Blake wrote: > > > On Wed, Jun 23, 2021 at 06:04:19PM +0200, Kevin Wolf wrote: > > > > > This is fine, but it means that this flag will present in all ranges, > > > > > instead of only in unallocated ranges (what this patch is doing). > > > > > > > > An argument for always having the flag would be that it's probably > > > > useful for a tool to know whether a given block is actually absent or > > > > whether it's just running an old qemu-img. > > > > > > > > If we didn't care about this, I would still define the actual value, but > > > > also document a default. > > > > > > So to summarize, it looks like my v3 will have the best chance of > > > approval if I go with always outputting the new field (instead of only > > > on one of its two boolean values), and put it at the end of the JSON > > > output. Since the "present" key is always present, it does not need to be at the end. > > > It also looks like we have consensus on spelling the new > > > field "present":true for data found in the backing chain, and > > > "present":false for places where we would defer to another file if a > > > backing file is later added. > > > > > > > I didn't follow the discussion carefully, but that sounds good to me. > > To me, too. > > > What's the decision about patch 1? > > I think we won't need patch 1 (and the potential backwards compatibility > problems it would introduce) when we have this one. Yes, looks good and patch 1 is not needed. Nir
Re: [PATCH v2 2/1] qemu-img: Add "backing":true to unallocated map segments
On Wed, Jun 23, 2021 at 7:04 PM Kevin Wolf wrote: > > Am 23.06.2021 um 15:58 hat Nir Soffer geschrieben: > > On Wed, Jun 23, 2021 at 11:58 AM Kevin Wolf wrote: > > > > > > Am 22.06.2021 um 18:56 hat Nir Soffer geschrieben: > > > > On Tue, Jun 22, 2021 at 6:38 PM Kevin Wolf wrote: > > > > > > > > > > Am 11.06.2021 um 21:03 hat Eric Blake geschrieben: > > > > > > To save the user from having to check 'qemu-img info > > > > > > --backing-chain' > > > > > > or other followup command to determine which "depth":n goes beyond > > > > > > the > > > > > > chain, add a boolean field "backing" that is set only for > > > > > > unallocated > > > > > > portions of the disk. > > > > > > > > > > > > Signed-off-by: Eric Blake > > > > > > --- > > > > > > > > > > > > Touches the same iotest output as 1/1. If we decide that switching > > > > > > to > > > > > > "depth":n+1 is too risky, and that the mere addition of > > > > > > "backing":true > > > > > > while keeping "depth":n is good enough, then we'd have just one > > > > > > patch, > > > > > > instead of this double churn. Preferences? > > > > > > > > > > I think the additional flag is better because it's guaranteed to be > > > > > backwards compatible, and because you don't need to know the number of > > > > > layers to infer whether a cluster was allocated in the whole backing > > > > > chain. And by exposing ALLOCATED we definitely give access to the > > > > > whole > > > > > information that exists in QEMU. > > > > > > > > > > However, to continue with the bike shedding: I won't insist on > > > > > "allocated" even if that is what the flag is called internally and > > > > > consistency is usually helpful, but "backing" is misleading, too, > > > > > because intuitively it doesn't cover the top layer or standalone > > > > > images > > > > > without a backing file. How about something like "present"? 
> > > > > > > > Looks hard to document: > > > > > > > > # @present: if present and false, the range is not allocated within the > > > > # backing chain (since 6.1) > > > > > > I'm not sure why you would document it with a double negative. > > > > > > > And is not consistent with "offset". It would work better as: > > > > > > > > # @present: if present, the range is allocated within the backing > > > > # chain (since 6.1) > > > > > > Completely ignoring the value? I would have documented it like this, but > > > with "if true..." instead of "if present...". > > > > This is fine, but it means that this flag will present in all ranges, > > instead of only in unallocated ranges (what this patch is doing). > > An argument for always having the flag would be that it's probably > useful for a tool to know whether a given block is actually absent or > whether it's just running an old qemu-img. Good point, this is the best option. The disadvantage is a bigger output but if you use json you don't care about the size of the output. > If we didn't care about this, I would still define the actual value, but > also document a default. > > Kevin >
Re: [PATCH v2 2/1] qemu-img: Add "backing":true to unallocated map segments
On Wed, Jun 23, 2021 at 11:58 AM Kevin Wolf wrote: > > Am 22.06.2021 um 18:56 hat Nir Soffer geschrieben: > > On Tue, Jun 22, 2021 at 6:38 PM Kevin Wolf wrote: > > > > > > Am 11.06.2021 um 21:03 hat Eric Blake geschrieben: > > > > To save the user from having to check 'qemu-img info --backing-chain' > > > > or other followup command to determine which "depth":n goes beyond the > > > > chain, add a boolean field "backing" that is set only for unallocated > > > > portions of the disk. > > > > > > > > Signed-off-by: Eric Blake > > > > --- > > > > > > > > Touches the same iotest output as 1/1. If we decide that switching to > > > > "depth":n+1 is too risky, and that the mere addition of "backing":true > > > > while keeping "depth":n is good enough, then we'd have just one patch, > > > > instead of this double churn. Preferences? > > > > > > I think the additional flag is better because it's guaranteed to be > > > backwards compatible, and because you don't need to know the number of > > > layers to infer whether a cluster was allocated in the whole backing > > > chain. And by exposing ALLOCATED we definitely give access to the whole > > > information that exists in QEMU. > > > > > > However, to continue with the bike shedding: I won't insist on > > > "allocated" even if that is what the flag is called internally and > > > consistency is usually helpful, but "backing" is misleading, too, > > > because intuitively it doesn't cover the top layer or standalone images > > > without a backing file. How about something like "present"? > > > > Looks hard to document: > > > > # @present: if present and false, the range is not allocated within the > > # backing chain (since 6.1) > > I'm not sure why you would document it with a double negative. > > > And is not consistent with "offset". It would work better as: > > > > # @present: if present, the range is allocated within the backing > > # chain (since 6.1) > > Completely ignoring the value? 
I would have documented it like this, but > with "if true..." instead of "if present...". This is fine, but it means that this flag will present in all ranges, instead of only in unallocated ranges (what this patch is doing). > > > Or: > > > > # @absent: if present, the range is not allocated within the backing > > # chain (since 6.1) > > This is possible, too, but generally positive flags are preferable to > negative ones, and the internal one is already positive. > > > This is used by libnbd now: > > https://github.com/libguestfs/libnbd/commit/1d01d2ac4f6443b160b7d81119d555e1aaedb56d > > > > But I'm fine with "backing", It is consistent with BLK_BACKING_FILE, > > meaning this area exposes data from a backing file (if one exists). > > > > We use "backing" internally to be consistent with future qemu-img. > > I just realised that I actually misunderstood "backing" to mean the > opposite of what it is in this patch! > > It really means "the data comes from some imaginary additional backing > file that doesn't exist in the backing chain", while I understood it as > "something in the (real) backing chain contains the data". > > "present" or "absent" should be much less prone to such > misunderstandings. > > Kevin >
Re: [PATCH v2 2/1] qemu-img: Add "backing":true to unallocated map segments
On Fri, Jun 11, 2021 at 10:03 PM Eric Blake wrote: > > To save the user from having to check 'qemu-img info --backing-chain' > or other followup command to determine which "depth":n goes beyond the > chain, add a boolean field "backing" that is set only for unallocated > portions of the disk. > > Signed-off-by: Eric Blake > --- > > Touches the same iotest output as 1/1. If we decide that switching to > "depth":n+1 is too risky, and that the mere addition of "backing":true > while keeping "depth":n is good enough, then we'd have just one patch, > instead of this double churn. Preferences? > > docs/tools/qemu-img.rst| 3 ++ > qapi/block-core.json | 7 ++- > qemu-img.c | 15 +- > tests/qemu-iotests/122.out | 34 +++--- > tests/qemu-iotests/154.out | 96 +++--- > tests/qemu-iotests/179.out | 66 +- > tests/qemu-iotests/223.out | 24 +- > tests/qemu-iotests/244.out | 6 +-- > tests/qemu-iotests/252.out | 4 +- > tests/qemu-iotests/274.out | 16 +++ > tests/qemu-iotests/291.out | 8 ++-- > tests/qemu-iotests/309.out | 4 +- > 12 files changed, 150 insertions(+), 133 deletions(-) > > diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst > index c155b1bf3cc8..fbc623b645c3 100644 > --- a/docs/tools/qemu-img.rst > +++ b/docs/tools/qemu-img.rst > @@ -601,6 +601,9 @@ Command description: > a ``depth``; for example, a depth of 2 refers to the backing file > of the backing file of *FILENAME*. Depth will be one larger than > the chain length if no file in the chain provides the data. > + - an optional ``backing`` field is present with value true if no > +file in the backing chain provides the data (making it easier to > +identify when ``depth`` exceeds the chain length). > >In JSON format, the ``offset`` field is optional; it is absent in >cases where ``human`` format would omit the entry or exit with an error. 
> diff --git a/qapi/block-core.json b/qapi/block-core.json > index 2ea294129e08..cebe12ba16a0 100644 > --- a/qapi/block-core.json > +++ b/qapi/block-core.json > @@ -264,6 +264,9 @@ > # @offset: if present, the image file stores the data for this range > # in raw format at the given (host) offset > # > +# @backing: if present, the range is not allocated within the backing > +# chain (since 6.1) > +# > # @filename: filename that is referred to by @offset > # > # Since: 2.6 > @@ -271,8 +274,8 @@ > ## > { 'struct': 'MapEntry', >'data': {'start': 'int', 'length': 'int', 'data': 'bool', > - 'zero': 'bool', 'depth': 'int', '*offset': 'int', > - '*filename': 'str' } } > + 'zero': 'bool', 'depth': 'int', '*backing': 'bool', > + '*offset': 'int', '*filename': 'str' } } > > ## > # @BlockdevCacheInfo: > diff --git a/qemu-img.c b/qemu-img.c > index 33a5cd012b8b..4d357f534803 100644 > --- a/qemu-img.c > +++ b/qemu-img.c > @@ -2977,8 +2977,13 @@ static int dump_map_entry(OutputFormat output_format, > MapEntry *e, > break; > case OFORMAT_JSON: > printf("{ \"start\": %"PRId64", \"length\": %"PRId64"," > - " \"depth\": %"PRId64", \"zero\": %s, \"data\": %s", > - e->start, e->length, e->depth, > + " \"depth\": %"PRId64, e->start, e->length, e->depth); > +if (e->has_backing) { > +/* Backing should only be set at the end of the chain */ > +assert(e->backing && e->depth > 0); > +printf(", \"backing\": true"); > +} It will be easier to inspect the output if common fields come before optional fields. > +printf(", \"zero\": %s, \"data\": %s", > e->zero ? "true" : "false", > e->data ? "true" : "false"); > if (e->has_offset) { ... 
> diff --git a/tests/qemu-iotests/122.out b/tests/qemu-iotests/122.out > index 779dab4847f0..c5aa2c9866f1 100644 > --- a/tests/qemu-iotests/122.out > +++ b/tests/qemu-iotests/122.out > @@ -68,11 +68,11 @@ read 65536/65536 bytes at offset 4194304 > read 65536/65536 bytes at offset 8388608 > 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) > [{ "start": 0, "length": 65536, "depth": 0, "zero": false, "data": true}, > -{ "start": 65536, "length": 4128768, "depth": 1, "zero": true, "data": > false}, > +{ "start": 65536, "length": 4128768, "depth": 1, "backing": true, "zero": > true, "data": false}, So this output would be: [{ "start": 0, "length": 65536, "depth": 0, "zero": false, "data": true}, { "start": 65536, "length": 4128768, "depth": 1, "zero": true, "data": false, "backing": true},
Re: [PATCH v2 2/1] qemu-img: Add "backing":true to unallocated map segments
On Tue, Jun 22, 2021 at 6:38 PM Kevin Wolf wrote: > > Am 11.06.2021 um 21:03 hat Eric Blake geschrieben: > > To save the user from having to check 'qemu-img info --backing-chain' > > or other followup command to determine which "depth":n goes beyond the > > chain, add a boolean field "backing" that is set only for unallocated > > portions of the disk. > > > > Signed-off-by: Eric Blake > > --- > > > > Touches the same iotest output as 1/1. If we decide that switching to > > "depth":n+1 is too risky, and that the mere addition of "backing":true > > while keeping "depth":n is good enough, then we'd have just one patch, > > instead of this double churn. Preferences? > > I think the additional flag is better because it's guaranteed to be > backwards compatible, and because you don't need to know the number of > layers to infer whether a cluster was allocated in the whole backing > chain. And by exposing ALLOCATED we definitely give access to the whole > information that exists in QEMU. > > However, to continue with the bike shedding: I won't insist on > "allocated" even if that is what the flag is called internally and > consistency is usually helpful, but "backing" is misleading, too, > because intuitively it doesn't cover the top layer or standalone images > without a backing file. How about something like "present"? Looks hard to document: # @present: if present and false, the range is not allocated within the # backing chain (since 6.1) And is not consistent with "offset". It would work better as: # @present: if present, the range is allocated within the backing # chain (since 6.1) Or: # @absent: if present, the range is not allocated within the backing # chain (since 6.1) This is used by libnbd now: https://github.com/libguestfs/libnbd/commit/1d01d2ac4f6443b160b7d81119d555e1aaedb56d But I'm fine with "backing", It is consistent with BLK_BACKING_FILE, meaning this area exposes data from a backing file (if one exists). 
We use "backing" internally to be consistent with future qemu-img.
Re: [PATCH v2 2/1] qemu-img: Add "backing":true to unallocated map segments
On Tue, Jun 15, 2021 at 11:54 AM Vladimir Sementsov-Ogievskiy wrote: > > 11.06.2021 22:03, Eric Blake wrote: > > To save the user from having to check 'qemu-img info --backing-chain' > > or other followup command to determine which "depth":n goes beyond the > > chain, add a boolean field "backing" that is set only for unallocated > > portions of the disk. > > > > Signed-off-by: Eric Blake > > --- > > > > Touches the same iotest output as 1/1. If we decide that switching to > > "depth":n+1 is too risky, and that the mere addition of "backing":true > > while keeping "depth":n is good enough, then we'd have just one patch, > > instead of this double churn. Preferences? > > If change something, this one patch seems safer. Still, Nir said he don't use > qemu-img map, so probably we don't need to modify qemu-img at all? Even our > iotests change shows that this change may be incompatible with at least > tests.. > > I'm not against the patch and don't have strict opinion. > > And what I really think, is that qemu-img is outdated thing and we'd better > develop QMP interface, which can be used with qemu binary or with > qemu-storage-daemon. I don't think qemu-storage-daemon can replace qemu-img. Having an easy to use command line tool is important. Using qmp with qemu-storage-daemon sounds like a better option for programs that want ultimate control. Adding only "backing: true" seems a safe change that should not break existing users and make qemu-img map better. The tests are broken because they compare strings instead of parsing the json. A program parsing qemu-img json output will not be broken by adding a new key. Nir
Re: [PATCH 2/2] nbd: Add new qemu:joint-allocation metadata context
On Mon, Jun 14, 2021 at 4:56 PM Eric Blake wrote: > > On Sat, Jun 12, 2021 at 02:39:44AM +0300, Nir Soffer wrote: > > Since this change is not simple, and the chance that we also get the dirty > > bitmap included in the result seems to be very low, I decided to check the > > direction of merging multiple extents. > > > > I started with merging "base:allocation" and "qemu:dirty-bitmap:xxx" since > > we already have both. It was not hard to do, although it is not completely > > tested yet. > > > > Here is the merging code: > > https://gerrit.ovirt.org/c/ovirt-imageio/+/115216/1/daemon/ovirt_imageio/_internal/nbdutil.py > > > > To make merging easy and safe, we map the NBD_STATE_DIRTY bit to a private > > bit > > so it cannot clash with the NBD_STATE_HOLE bit: > > https://gerrit.ovirt.org/c/ovirt-imageio/+/115215/1/daemon/ovirt_imageio/_internal/nbd.py > > > > Here is a functional test using qemu-nbd showing that it works: > > https://gerrit.ovirt.org/c/ovirt-imageio/+/115216/1/daemon/test/client_test.py > > > > I'll try to use "qemu:allocation-depth" in a similar way next week, probably > > mapping depth > 0 to EXTENT_EXISTS, to use when reporting holes in > > single qcow2 images. > > > > If this is successful, we can start using this in the next ovirt release, > > and we > > don't need "qemu:joint-allocation". > > That's nice to know. So at this point, we'll drop the patch on > qemu:joint-allocation, and instead focus on teh patch that improves > qemu-img map output to make it easier to use in the same way that > qemu:allocation-depth is. I can update that everything looks good on our side so far, thanks!
Re: [PATCH 2/2] nbd: Add new qemu:joint-allocation metadata context
On Wed, Jun 9, 2021 at 9:01 PM Eric Blake wrote: > > When trying to reconstruct a qcow2 chain using information provided > over NBD, ovirt had been relying on an unsafe assumption that any > portion of the qcow2 file advertised as sparse would defer to the > backing image; this worked with what qemu 5.2 reports for a qcow2 BSD > loaded with "backing":null. However, in 6.0, commit 0da9856851 (nbd: > server: Report holes for raw images) also had a side-effect of > reporting unallocated zero clusters in qcow2 files as sparse. This > change is correct from the NBD spec perspective (advertising bits has > always been optional based on how much information the server has > available, and should only be used to optimize behavior when a bit is > set, while not assuming semantics merely because a bit is clear), but > means that a qcow2 file that uses an unallocated zero cluster to > override a backing file now shows up as sparse over NBD, and causes > ovirt to fail to reproduce that cluster (ie. ovirt was assuming it > only had to write clusters where the bit was clear, and the 6.0 > behavior change shows the flaw in that assumption). > > The correct fix is for ovirt to additionally use the > qemu:allocation-depth metadata context added in 5.2: after all, the > actual determination for what is needed to recreate a qcow2 file is > not whether a cluster is sparse, but whether the allocation-depth > shows the cluster to be local. But reproducing an image is more > efficient when handling known-zero clusters, which means that ovirt > has to track both base:allocation and qemu:allocation-depth metadata > contexts simultaneously. While NBD_CMD_BLOCK_STATUS is just fine > sending back information for two contexts in parallel, it comes with > some bookkeeping overhead at the client side: the two contexts need > not report the same length of replies, and it involves more network > traffic. 
Since this change is not simple, and the chance that we also get the dirty bitmap included in the result seems to be very low, I decided to check the direction of merging multiple extents. I started with merging "base:allocation" and "qemu:dirty-bitmap:xxx" since we already have both. It was not hard to do, although it is not completely tested yet. Here is the merging code: https://gerrit.ovirt.org/c/ovirt-imageio/+/115216/1/daemon/ovirt_imageio/_internal/nbdutil.py To make merging easy and safe, we map the NBD_STATE_DIRTY bit to a private bit so it cannot clash with the NBD_STATE_HOLE bit: https://gerrit.ovirt.org/c/ovirt-imageio/+/115215/1/daemon/ovirt_imageio/_internal/nbd.py Here is a functional test using qemu-nbd showing that it works: https://gerrit.ovirt.org/c/ovirt-imageio/+/115216/1/daemon/test/client_test.py I'll try to use "qemu:allocation-depth" in a similar way next week, probably mapping depth > 0 to EXTENT_EXISTS, to use when reporting holes in single qcow2 images. If this is successful, we can start using this in the next ovirt release, and we don't need "qemu:joint-allocation". Nir
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
On Fri, Jun 11, 2021 at 9:34 PM Eric Blake wrote: > > On Fri, Jun 11, 2021 at 08:35:01PM +0300, Nir Soffer wrote: > > On Fri, Jun 11, 2021 at 4:28 PM Eric Blake wrote: > > > > > > On Fri, Jun 11, 2021 at 10:09:09AM +0200, Kevin Wolf wrote: > > > > > Yes, that might work as well. But we didn't previously document > > > > > depth to be optional. Removing something from output risks breaking > > > > > more downstream tools that expect it to be non-optional, compared to > > > > > providing a new value. > > > > > > > > A negative value isn't any less unexpected than a missing key. I don't > > > > think any existing tool would be able to handle it. Encoding different > > > > meanings in a single value isn't very QAPI-like either. Usually strings > > > > that are parsed are the problem, but negative integers really isn't that > > > > much different. I don't really like this solution. > > > > > > > > Leaving out the depth feels like a better suggestion to me. > > > > > > > > But anyway, this seems to only happen at the end of the backing chain. > > > > So if the backing chain consistents of n images, why not report 'depth': > > > > n + 1? So, in the above example, you would get 1. I think this has the > > > > best chances of tools actually working correctly with the new output, > > > > even though it's still not unlikely to break something. > > > > > > Ooh, I like that. It is closer to reality - the file data really > > > comes from the next depth, even if we have no filename at that depth. > > > v2 of my patch coming up. > > > > How do you know the number of the layer? this info is not presented in > > qemu-img map output. ... > Otherwise, you do have a point: "depth":1 in isolation is ambiguous > between "not allocated anywhere in this 1-element chain" and > "allocated at the first backing file in this chain of length 2 or > more". At which point you can indeed use "qemu-img info" to determine > the backing chain depth. How painful is that extra step? 
Does it > justify the addition of a new optional "backing":true to any portion > of the file that was beyond the end of the chain (and omit that line > for all other regions, rather than printing "backing":false)? Dealing with depth: N + 1 is not that painful, but also not great. I think it is worth a little more effort, and it will save time in the long term for users and for developers. Better APIs need simpler and shorter documentation and require less support. I'm not sure about backing: false, maybe absent: true to match libnbd? Nir
Re: [PATCH v2] qemu-img: Make unallocated part of backing chain obvious in map
On Fri, Jun 11, 2021 at 5:59 PM Eric Blake wrote: > > On Fri, Jun 11, 2021 at 05:35:12PM +0300, Vladimir Sementsov-Ogievskiy wrote: > > > An obvious solution is to make 'qemu-img map --output=json' > > > distinguish between clusters that have a local allocation from those > > > that are found nowhere in the chain. We already have a one-off > > > mismatch between qemu-img map and NBD qemu:allocation-depth (the > > > former chose 0, and the latter 1 for the local layer), so exposing the > > > latter's choice of 0 for unallocated in the entire chain would mean > > > using "depth":-1 in the former, but a negative depth may confuse > > > existing tools. But there is an easy out: for any chain of length N, > > > we can simply represent an unallocated cluster as "depth":N+1. This > > > does have a slight risk of confusing any tool that might try to > > > dereference NULL when finding the backing image for the last file in > > > the backing chain, but that risk seems worth the more precise output. > > > The iotests have several examples where this distinction demonstrates > > > the additional accuracy. > > > > > > Signed-off-by: Eric Blake > > > --- > > > > > > Replaces v1: 20210610213906.1313440-1-ebl...@redhat.com > > > (qemu-img: Use "depth":-1 to make backing probes obvious) > > > > > > Use N+1 instead of -1 for unallocated [Kevin] > > > > > > > Bit in contrast with -1, or with separate boolean flag, you lose the > > possibility to distinguish case when we have 3 layers and the cluster is > > absent in all of them, and the case when we have 4 layers and the cluster > > is absent in top 3 but in 4 it is qcow2 UNALLOCATED_ZERO cluster. > > Using just 'qemu-img map --output-json', you only see depth numbers. 
> You also have to use 'qemu-img info --backing-chain' to see what file > those depth numbers correspond to, at which point it becomes obvious > whether "depth":4 meant unallocated (because the chain was length 3) > or allocated at depth 4 (because the chain was length 4 or longer). > But that's no worse than pre-patch, where you had to use qemu-img info > --backing-chain to learn which file a particular "depth" maps to. > > > > > So, if someone use this API to reconstruct the chain, then for original 3 > > empty layers he will create 3 empty layers and 4rd additional ZERO layer. > > And such reconstructed chain would not be equal to original chain (as if we > > take these two chains and add additional backing file as a new bottom > > layer, effect would be different).. I'm not sure is it a problem in the > > task you are solving :\ > > It should be fairly easy to optimize the case of a backing chain where > EVERY listed cluster at the final depth was "data":false,"zero":true > to omit that file after all. > > And in oVirt's case, Nir pointed out that we have one more tool at our > disposal in recreating a backing chain: if you use > json:{"driver":"qcow2", "backing":null, ...} as your image file, you > don't have to worry about arbitrary files in the backing chain, only > about recreating the top-most layer of a chain. And in that case, it > becomes very obvious that "depth":0 is something you must recreate, > and "depth":1 would be a non-existent backing file because you just > passed "backing":null. Note that oVirt does not use qemu-img map, we use qemu-nbd to get image extents, since it is used only in context we already connect to qemu-nbd server or run qemu-nbd. Management tools already know the image format (they should avoid doing format probing anyway), and using a json uri allows single command to get the needed info when you inspect a single layer. 
But this change introduces a risk that some program using qemu-img map will interpret the result in the wrong way, assuming that there are N+1 layers. I think adding a new flag for absent extents is better. It cannot break any user and it is easier to understand and use. Nir
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
On Fri, Jun 11, 2021 at 4:28 PM Eric Blake wrote: > > On Fri, Jun 11, 2021 at 10:09:09AM +0200, Kevin Wolf wrote: > > > Yes, that might work as well. But we didn't previously document > > > depth to be optional. Removing something from output risks breaking > > > more downstream tools that expect it to be non-optional, compared to > > > providing a new value. > > > > A negative value isn't any less unexpected than a missing key. I don't > > think any existing tool would be able to handle it. Encoding different > > meanings in a single value isn't very QAPI-like either. Usually strings > > that are parsed are the problem, but negative integers really isn't that > > much different. I don't really like this solution. > > > > Leaving out the depth feels like a better suggestion to me. > > > > But anyway, this seems to only happen at the end of the backing chain. > > So if the backing chain consistents of n images, why not report 'depth': > > n + 1? So, in the above example, you would get 1. I think this has the > > best chances of tools actually working correctly with the new output, > > even though it's still not unlikely to break something. > > Ooh, I like that. It is closer to reality - the file data really > comes from the next depth, even if we have no filename at that depth. > v2 of my patch coming up. How do you know the number of the layer? this info is not presented in qemu-img map output. Users will have to run "qemu-img info --backing-chain" to understand the output of qemu-img map.
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
> ב-11 ביוני 2021, בשעה 11:14, Vladimir Sementsov-Ogievskiy > כתב/ה: > > 11.06.2021 11:09, Kevin Wolf wrote: >> Am 10.06.2021 um 22:46 hat Eric Blake geschrieben: >>>> On Thu, Jun 10, 2021 at 11:09:05PM +0300, Nir Soffer wrote: >>>>>> But: >>>>>> >>>>>> $ qemu-img map --output=json -f qcow2 >>>>>> json:'{"driver":"qcow2","backing":null, \ >>>>>> "file":{"driver":"file","filename":"top.qcow2"}}' >>>>>> [{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false}, >>>>>> { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": >>>>>> true, "offset": 327680}, >>>>>> { "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": >>>>>> false}] >>>>>> >>>>>> also reports the entire file at "depth":0, which is misleading, since >>>>>> we have just been arguing from the qemu:allocation-depth perspective >>>>>> (and also from bdrv_block_status) that the qcow2 image is NOT 100% >>>>>> allocated (in the sense where allocation == data comes locally). >>>>>> Perhaps it might be better if we tweaked the above qemu-img map to >>>>>> produce: >>>>>> >>>>>> [{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": >>>>>> false}, >>>>>> { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": >>>>>> true, "offset": 327680}, >>>>>> { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": >>>>>> false}, >>>>>> { "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": >>>>>> false}] >>>>> >>>>> It will be more consistent with "offset" to drop "depth" from output >>>>> if we don't have it: >>>>> >>>>> [{ "start": 0, "length": 65536, "zero": true, "data": false}, >>>>> { "start": 65536, "length": 65536, "depth": 0, "zero": false, >>>>> "data": true, "offset": 327680}, >>>>> { "start": 131072, "length": 65536, "depth": 0, "zero": true, >>>>> "data": false}, >>>>> { "start": 196608, "length": 65536, "zero": true, "data": false}] >>> >>> Yes, that might work as well. 
But we didn't previously document >>> depth to be optional. Removing something from output risks breaking >>> more downstream tools that expect it to be non-optional, compared to >>> providing a new value. >> A negative value isn't any less unexpected than a missing key. I don't >> think any existing tool would be able to handle it. Encoding different >> meanings in a single value isn't very QAPI-like either. Usually strings >> that are parsed are the problem, but negative integers really isn't that >> much different. I don't really like this solution. >> Leaving out the depth feels like a better suggestion to me. >> But anyway, this seems to only happen at the end of the backing chain. >> So if the backing chain consistents of n images, why not report 'depth': >> n + 1? So, in the above example, you would get 1. I think this has the >> best chances of tools actually working correctly with the new output, >> even though it's still not unlikely to break something. > > Did you consider just add a new field? > > So, "depth" keeps its meaning "which level provides data". > > And we add additional optional field like > > absolutely-completely-absent: bool hole: bool? > > Which is true if data is nowhere in the backing chain. > > > -- > Best regards, > Vladimir
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
On Thu, Jun 10, 2021 at 9:35 PM Eric Blake wrote: > > On Tue, Jun 08, 2021 at 07:38:10PM +0300, Nir Soffer wrote: > > The example I provided was not detailed enough, what we actually do is: > > > > qemu-nbd .. 'json:{"driver": "qcow2", "backing": null, "file": > > {"driver": "file", "filename": "top.qcow2"}}' > > > > So there is no backing chain and allocation depth is not relevant. > > - Allocated areas should be reported with flags 0 > > - Zero areas which are not holes should be reported as NBD_STATE_ZERO > > - Zero areas which are holes (not allocated in this image) should be > > reported as NBD_STATE_HOLE > > Thinking about this a bit more, here's something I noticed: > > $ qemu-img map --output=json -f raw base.raw > [{ "start": 0, "length": 196608, "depth": 0, "zero": false, "data": true, > "offset": 0}, > { "start": 196608, "length": 65536, "depth": 0, "zero": true, "data": false, > "offset": 196608}] > > which matches what I've said elsewhere in this thread: the entire > image is reported as "depth":0 because the raw file is responsible for > 100% of the content. > > But: > > $ qemu-img map --output=json -f qcow2 json:'{"driver":"qcow2","backing":null, > \ > "file":{"driver":"file","filename":"top.qcow2"}}' > [{ "start": 0, "length": 65536, "depth": 0, "zero": true, "data": false}, > { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, > "offset": 327680}, > { "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": false}] > > also reports the entire file at "depth":0, which is misleading, since > we have just been arguing from the qemu:allocation-depth perspective > (and also from bdrv_block_status) that the qcow2 image is NOT 100% > allocated (in the sense where allocation == data comes locally). 
> Perhaps it might be better if we tweaked the above qemu-img map to > produce: > > [{ "start": 0, "length": 65536, "depth": -1, "zero": true, "data": false}, > { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, > "offset": 327680}, > { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false}, > { "start": 196608, "length": 65536, "depth": -1, "zero": true, "data": false}] It will be more consistent with "offset" to drop "depth" from output if we don't have it: [{ "start": 0, "length": 65536, "zero": true, "data": false}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 196608, "length": 65536, "zero": true, "data": false}]
Re: [RFC libnbd PATCH] info: Add support for new qemu:joint-allocation
On Thu, Jun 10, 2021 at 4:06 PM Eric Blake wrote: > > On Thu, Jun 10, 2021 at 01:20:13AM +0300, Nir Soffer wrote: > > > + else if (strcmp (metacontext, "qemu:joint-allocation") == 0) { > > > +/* Combo of base:allocation and stripped-down qemu:allocation-depth > > > */ > > > +const char *base, *depth; > > > +switch (type & 3) { > > > +case 0: base = "allocated"; break; > > > +case 1: base = "hole"; break; > > > +case 2: base = "zero"; break; > > > +case 3: base = "hole,zero"; break; > > > +} > > > +switch (type & 0xc) { > > > +case 0: depth = "unallocated"; break; > > > > Is this possible? qemu reports BDRV_BLOCK_DATA but not BDRV_BLOCK_ALLOCATED? > > No, qemu should never report a status of 0 (which in this code would > produce the string "allocated,unallocated", although a v2 may change > to print ""). > > Remember, BDRV_BLOCK_ALLOCATED is a bit of a misnomer - it has nothing > to do with whether a cluster occupies allocated space, but rather > whether the local image in the backing chain provides the contents of > the cluster (rather than deferring to the backing chain). The code in > block/io.c guarantees that if a block device reports BDRV_BLOCK_DATA, > then the block layer also reports BDRV_BLOCK_ALLOCATED (that is, any > cluster that provides guest-visible data by necessity implies that the > current layer of the backing chain is important). > > However, it DOES point out that "allocated" might not be the best name > in libnbd; perhaps "data" or "normal" would be better for the NBD > base:allocation status of 0. Yes! it also aligns better with zero, and the output is similar to qemu-img map. Hopefully the semantics of "data" in qemu-img map and libnbd is the same.
Re: [PATCH 2/2] nbd: Add new qemu:joint-allocation metadata context
On Thu, Jun 10, 2021 at 2:52 AM Nir Soffer wrote: > > On Wed, Jun 9, 2021 at 9:01 PM Eric Blake wrote: I posted a work in progress patch implementing support for qemu:joint-allocation in oVirt: https://gerrit.ovirt.org/c/ovirt-imageio/+/115197 The most important part is the nbd client: https://gerrit.ovirt.org/c/ovirt-imageio/+/115197/1/daemon/ovirt_imageio/_internal/nbd.py With this our tests pass with qemu-nbd built with Eric's patch: https://gerrit.ovirt.org/c/ovirt-imageio/+/115197/1/daemon/test/client_test.py We may need to use qemu:joint-allocation only for qcow2 images, and base:allocation for raw images, because allocation depth reporting is not correct for raw images. Since we control the qemu-nbd in both cases this should not be an issue. But it would be better if allocation depth would work for any kind of image, and we always use qemu:joint-allocation. Nir
Re: [PATCH 2/2] nbd: Add new qemu:joint-allocation metadata context
qemu-io -f raw -c "w -P 65 0 64k" -c "w -P 66 64k 64k" \ > -c "w -P 67 128k 64k" base.raw > > # Write to second and third clusters of top, hiding base: > $ qemu-io -f qcow2 -c "w -P 69 64k 64k" -c "w -z 128k 64k" top.qcow2 Looks familiar but nicer :-) > # Expose top.qcow2 without backing file over NBD > $ ./qemu-nbd -r -t -f qcow2 -A 'json:{"driver":"qcow2", "backing":null, \ > "file":{"driver":"file", "filename":"top.qcow2"}}' > $ nbdinfo --map=qemu:joint-allocation nbd://localhost > 0 655363 > 65536 655364 > 131072 655367 > 196608 655363 Using the libnbd patch this shows: $ ./nbdinfo --map="qemu:joint-allocation" nbd://localhost 0 655363 hole,zero,unallocated 65536 655364 allocated,local 131072 655367 hole,zero,local 196608 655363 hole,zero,unallocated Looks good. We need to convert this output to: {"start": 0, "length": 65536, "zero": true, "hole": true}, {"start": 65536, "length": 65536, "zero": false, "hole": false}, {"start": 131072, "length": 65536, "zero": true, "hole": false}, {"start": 196608, "length": 65536, "zero": true, "hole": true}, So it seems that we need to use this logic for holes when we inspect a single qcow2 image: hole = not (flags & NBD_STATE_LOCAL) And ignore the NBD_STATE_HOLE, which is about qcow2 internals. This patch fixes the critical issue for oVirt, but in a way it returns the previous state when you could not report holes in raw images. With this patch holes in raw image looks like: $ truncate -s 1g empty.raw $ ./qemu-nbd -r -t -f raw empty.raw --allocation-depth $ ./nbdinfo --map="qemu:joint-allocation" nbd://localhost 0 10737418247 hole,zero,local This is not a practical issue for oVirt, but it would be better to report: $ ./nbdinfo --map="qemu:joint-allocation" nbd://localhost 0 10737418243 hole,zero,unallocated This is the output for the empty qcow2 image. 
And this also affects --allocation-depth, for raw empty image we get: $ ./nbdinfo --map="qemu:allocation-depth" nbd://localhost 0 10737418241 local But for empty qcow2 we get: $ ./nbdinfo --map="qemu:allocation-depth" nbd://localhost 0 10737418240 unallocated I think we have a bug reporting BDRV_BLOCK_ALLOCATED when the BDRV_BLOCK_DATA bit is not set. > [This was output from nbdinfo 1.8.0; a later version will also add a > column to decode the bits into human-readable strings] > > Additionally, later qemu patches may try to improve qemu-img to > automatically take advantage of additional NBD context information, > without having to use x-dirty-bitmap. > > Reported-by: Nir Soffer > Resolves: https://bugzilla.redhat.com/1968693 > Signed-off-by: Eric Blake > --- > docs/interop/nbd.txt | 31 ++- > docs/tools/qemu-nbd.rst | 4 +- > qapi/block-export.json| 4 +- > include/block/nbd.h | 10 ++- > nbd/server.c | 87 +-- > .../tests/nbd-qemu-allocation.out | 3 +- > 6 files changed, 125 insertions(+), 14 deletions(-) > > diff --git a/docs/interop/nbd.txt b/docs/interop/nbd.txt > index 10ce098a29bf..cc8ce2d5389f 100644 > --- a/docs/interop/nbd.txt > +++ b/docs/interop/nbd.txt > @@ -17,7 +17,7 @@ namespace "qemu". > > == "qemu" namespace == > > -The "qemu" namespace currently contains two available metadata context > +The "qemu" namespace currently contains three available metadata context > types. The first is related to exposing the contents of a dirty > bitmap alongside the associated disk contents. That metadata context > is named with the following form: > @@ -39,8 +39,32 @@ depth of which layer in a thin-provisioned backing chain > provided the > data (0 for unallocated, 1 for the active layer, 2 for the first > backing layer, and so forth). 
> > -For NBD_OPT_LIST_META_CONTEXT the following queries are supported > -in addition to the specific "qemu:allocation-depth" and > +The third is for convenience in querying the results of > +base:allocation and qemu:allocation-depth in one go, under the > +metadata context named > + > +qemu:joint-allocation > + > +In this context, bits 0 and 1 refle
Re: [RFC libnbd PATCH] info: Add support for new qemu:joint-allocation
On Thu, Jun 10, 2021 at 12:32 AM Eric Blake wrote: > > Qemu is adding qemu:joint-allocation as a single context combining the > two bits of base:allocation and a compression of qemu:allocation-depth > into two bits [1]. Decoding the bits makes it easier for humans to > see the result of that context. > > [1] https://lists.gnu.org/archive/html/qemu-devel/2021-06/msg02446.html > --- > > Obviously, this libnbd patch should only go in if the qemu RFC is > accepted favorably. With this patch applied, the example listed in my > qemu patch 2/2 commit message [2] becomes > > $ ~/libnbd/run nbdinfo --map=qemu:joint-allocation nbd://localhost > 0 655363 hole,zero,unallocated > 65536 655364 allocated,local > 131072 655367 hole,zero,local > 196608 655363 hole,zero,unallocated > > [2] https://lists.gnu.org/archive/html/qemu-devel/2021-06/msg02448.html > > For what it's worth, you can also play with the qemu+libnbd patches at: > https://repo.or.cz/qemu/ericb.git/ master > https://repo.or.cz/libnbd/ericb.git/ master > > (I sometimes rewind those branches, but they'll be stable for at least > a few days after this email) > > info/map.c | 21 + > 1 file changed, 21 insertions(+) > > diff --git a/info/map.c b/info/map.c > index ae6d4fe..21e8657 100644 > --- a/info/map.c > +++ b/info/map.c > @@ -226,6 +226,27 @@ extent_description (const char *metacontext, uint32_t > type) >return ret; > } >} > + else if (strcmp (metacontext, "qemu:joint-allocation") == 0) { > +/* Combo of base:allocation and stripped-down qemu:allocation-depth */ > +const char *base, *depth; > +switch (type & 3) { > +case 0: base = "allocated"; break; > +case 1: base = "hole"; break; > +case 2: base = "zero"; break; > +case 3: base = "hole,zero"; break; > +} > +switch (type & 0xc) { > +case 0: depth = "unallocated"; break; Is this possible? qemu reports BDRV_BLOCK_DATA but not BDRV_BLOCK_ALLOCATED? Anyway this seems like a valid way to present qemu response. 
> +case 4: depth = "local"; break; > +case 8: depth = "backing"; break; > +case 12: depth = ""; break; This should not be possible based on the qemu patch, but printing this seems like a good solution, and can help to debug such an issue. Thinking about client code trying to copy extents based on the flags, the client should abort the operation since qemu response is invalid. > +} > +if (asprintf (&ret, "%s,%s", base, depth) == -1) { > + perror ("asprintf"); > + exit (EXIT_FAILURE); > +} > +return ret; > + } > >return NULL; /* Don't know - description field will be omitted. */ > } > -- > 2.31.1 >
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
On Tue, Jun 8, 2021 at 9:46 PM Eric Blake wrote: > > On Tue, Jun 08, 2021 at 07:38:10PM +0300, Nir Soffer wrote: > > On Tue, Jun 8, 2021 at 12:22 AM Eric Blake wrote: > > > > > > On Mon, Jun 07, 2021 at 11:22:04PM +0300, Nir Soffer wrote: > > > > When zeroing a cluster in an image with backing file, qemu-img and > > > > qemu-nbd reported the area as a hole. This does not affect the guest > > > > since the area is read as zero, but breaks code trying to reconstruct > > > > the image chain based on qemu-img map or qemu-nbd block status response. > > > > > > Trying to reconstruct the image chain based on qemu-nbd block status > > > should not be attempted on just base:allocation data, but should also > > > take into account qemu:allocation-depth. > > > > This is correct when looking at the entire chain, but when we reconstruct > > image data, we copy each image in the layer *without* the backing chain. > > > > The example I provided was not detailed enough, what we actually do is: > > > > qemu-nbd .. 'json:{"driver": "qcow2", "backing": null, "file": > > {"driver": "file", "filename": "top.qcow2"}}' > > > > So there is no backing chain and allocation depth is not relevant. > > - Allocated areas should be reported with flags 0 > > - Zero areas which are not holes should be reported as NBD_STATE_ZERO > > - Zero areas which are holes (not allocated in this image) should be > > reported as NBD_STATE_HOLE > > Again, what you WANT is qemu:allocation-depth. > > $ ./qemu-nbd -r -t -f qcow2 -A 'json:{"driver":"qcow2", "backing":null, \ > "file":{"driver":"file", "filename":"top.qcow2"}}' > $ nbdinfo --map=qemu:allocation-depth nbd://localhost > 0 655360 unallocated > 65536 1310721 local > 196608 655360 unallocated > > $ nbdinfo --map nbd://localhost > 0 655363 hole,zero > 65536 655360 allocated > 131072 1310723 hole,zero > > You don't care whether the information reads as zero or not, but > whether top.qcow2 is responsible for the data at that cluster. 
> base:allocation does not answer that question. But > qemu:allocation-depth answers it perfectly. > > > > > > From the perspective of the > > > core NBD protocol, there is no backing file, so trying to guess what > > > the backing file contains without using qemu extensions is unlikely to > > > be correct, as shown in your example. The fact that you could abuse > > > it with qemu 5.2 but it broke in 6.0 > > > > I'm not abusing anything, I'm only using public APIs. qemu-nbd behavior > > should not change without good reason, and we did not have any good > > reason to change the behavior for qcow2 images. > > Ah, but we did. Exposing BDRV_BLOCK_ALLOCATED as server, but > consuming it as BDRV_BLOCK_DATA as client, was inconsistent. It was a > bug that we ever used BLOCK_ALLOCATED in the first place, when it has > _always_ been that the NBD semantics were supposed to be modeled on > our definition of BLOCK_DATA. That it took us a couple of years to > notice our bug is unfortunate, but we DO have a good reason for the > change - we were fixing an actual bug where we were reporting > incorrect information compared to what the NBD spec was documenting. > > > > > > is not necessarily the sign of a > > > regression in 6.0, but rather could be evidence that you have been > > > trying to use an undocumented implementation quirk rather than a > > > stable interface. > > > > I'm pretty convinced that this is a regression in qemu-nbd 6.0 since I > > created > > this regression :-) > > I understand that you were surprised by the ramifications of your > patch causing more changes than what you expected, but I still argue > that your patch was correct and that the decision to incorporate it > was intentional because it was the right thing to do. Papering over > the fallout for the sake of clients that should be using > qemu:allocation-depth instead does not seem like it is worth the > maintenance nightmare to me. 
> > > > > Since we started using qemu-nbd in 2018, qemu-nbd has always reported > > holes in qcow2 images, but not in raw files. We discussed this several > > times, > > and you explained that we have allocation information from qcow2, but not > > from raw format. > > &
Re: [PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
On Tue, Jun 8, 2021 at 12:22 AM Eric Blake wrote: > > On Mon, Jun 07, 2021 at 11:22:04PM +0300, Nir Soffer wrote: > > When zeroing a cluster in an image with backing file, qemu-img and > > qemu-nbd reported the area as a hole. This does not affect the guest > > since the area is read as zero, but breaks code trying to reconstruct > > the image chain based on qemu-img map or qemu-nbd block status response. > > Trying to reconstruct the image chain based on qemu-nbd block status > should not be attempted on just base:allocation data, but should also > take into account qemu:allocation-depth. This is correct when looking at the entire chain, but when we reconstruct image data, we copy each image in the layer *without* the backing chain. The example I provided was not detailed enough, what we actually do is: qemu-nbd .. 'json:{"driver": "qcow2", "backing": null, "file": {"driver": "file", "filename": "top.qcow2"}}' So there is no backing chain and allocation depth is not relevant. - Allocated areas should be reported with flags 0 - Zero areas which are not holes should be reported as NBD_STATE_ZERO - Zero areas which are holes (not allocated in this image) should be reported as NBD_STATE_HOLE > From the perspective of the > core NBD protocol, there is no backing file, so trying to guess what > the backing file contains without using qemu extensions is unlikely to > be correct, as shown in your example. The fact that you could abuse > it with qemu 5.2 but it broke in 6.0 I'm not abusing anything, I'm only using public APIs. qemu-nbd behavior should not change without good reason, and we did not have any good reason to change the behavior for qcow2 images. > is not necessarily the sign of a > regression in 6.0, but rather could be evidence that you have been > trying to use an undocumented implementation quirk rather than a > stable interface. 
I'm pretty convinced that this is a regression in qemu-nbd 6.0 since I created this regression :-) Since we started using qemu-nbd in 2018, qemu-nbd has always reported holes in qcow2 images, but not in raw files. We discussed this several times, and you explained that we have allocation information from qcow2, but not from raw format. My attempt to fix hole reporting in raw images has failed; reporting holes in raw images is nice to have, but it broke the behavior of qemu-nbd with qcow2 images, which is a critical issue for ovirt. The code using this was tested and released 3-4 month ago. This was added to support backup vendors using snapshot based backup, so they can move to use the NBD based pipeline, which is safer than the old way, uploading qcow2 images directly to storage. If I revert: commit 0da9856851dcca09222a1467e16ddd05dc66e460 Author: Nir Soffer Date: Fri Feb 19 18:07:52 2021 +0200 nbd: server: Report holes for raw images qemu-nbd reports zeroed areas in a useful way like it always did: $ ./qemu-nbd -r -t 'json:{"driver": "qcow2", "backing": null, "file": {"driver": "file", "filename": "top.qcow2"}}' & $ nbdinfo --map nbd://localhost 0 655363 hole,zero 65536 655360 allocated 131072 655362 zero 196608 655363 hole,zero There is no need to use allocation depth info, the base:allocation works fine for this use case, and the output makes sense. 
> > Here is simpler reproducer: > > > > # Create a qcow2 image with a raw backing file: > > $ qemu-img create base.raw $((4*64*1024)) > > $ qemu-img create -f qcow2 -b base.raw -F raw top.qcow2 > > > > # Write to first 3 clusters of base: > > $ qemu-io -f raw -c "write -P 65 0 64k" base.raw > > $ qemu-io -f raw -c "write -P 66 64k 64k" base.raw > > $ qemu-io -f raw -c "write -P 67 128k 64k" base.raw > > > > # Write to second cluster of top, hiding second cluster of base: > > $ qemu-io -f qcow2 -c "write -P 69 64k 64k" top.qcow2 > > > > # Write zeroes to third cluster of top, hiding third cluster of base: > > $ qemu-io -f qcow2 -c "write -z 128k 64k" top.qcow2 > > > > This creates: > > > > top: -D0- > > base: ABC- > > > > How current qemu-img and qemu-nbd report the state: > > > > $ qemu-img map --output json top.qcow2 > > [{ "start": 0, "length": 65536, "depth": 1, "zero": false, "data": > > true, "offset": 0}, > > { "start": 65536, "length": 65536, "depth": 0, "zero": false, "
[PATCH] qemu-{img,nbd}: Don't report zeroed cluster as a hole
When zeroing a cluster in an image with backing file, qemu-img and qemu-nbd reported the area as a hole. This does not affect the guest since the area is read as zero, but breaks code trying to reconstruct the image chain based on qemu-img map or qemu-nbd block status response. Here is a simpler reproducer: # Create a qcow2 image with a raw backing file: $ qemu-img create base.raw $((4*64*1024)) $ qemu-img create -f qcow2 -b base.raw -F raw top.qcow2 # Write to first 3 clusters of base: $ qemu-io -f raw -c "write -P 65 0 64k" base.raw $ qemu-io -f raw -c "write -P 66 64k 64k" base.raw $ qemu-io -f raw -c "write -P 67 128k 64k" base.raw # Write to second cluster of top, hiding second cluster of base: $ qemu-io -f qcow2 -c "write -P 69 64k 64k" top.qcow2 # Write zeroes to third cluster of top, hiding third cluster of base: $ qemu-io -f qcow2 -c "write -z 128k 64k" top.qcow2 This creates: top: -D0- base: ABC- How current qemu-img and qemu-nbd report the state: $ qemu-img map --output json top.qcow2 [{ "start": 0, "length": 65536, "depth": 1, "zero": false, "data": true, "offset": 0}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": false}, { "start": 196608, "length": 65536, "depth": 1, "zero": true, "data": false, "offset": 196608}] $ qemu-nbd -r -t -f qcow2 top.qcow2 & $ qemu-img map --output json nbd://localhost [{ "start": 0, "length": 131072, "depth": 0, "zero": false, "data": true, "offset": 0}, { "start": 131072, "length": 131072, "depth": 0, "zero": true, "data": false, "offset": 131072}] $ nbdinfo --map nbd://localhost 0 1310720 allocated 131072 1310723 hole,zero The third extent is reported as a hole in both cases. In qemu-nbd the cluster is merged with the fourth cluster which is actually a hole. This is incorrect since if it was a hole, the third cluster would be exposed to the guest. 
Programs using qemu-nbd output to reconstruct the image chain on other storage would be confused and copy only the first 2 cluster. The results of this copy will be an image exposing the third cluster from the base image, corrupting the guest data. I found that it can be fixed using BDRV_BLOCK_OFFSET_VALID when reporting the status of the extent. When we have a valid offset, we report based on BDRV_BLOCK_DATA. Otherwise we report based on BDRV_BLOCK_ALLOCATED. With this fix we get: $ build/qemu-img map --output json top.qcow2 [{ "start": 0, "length": 65536, "depth": 1, "zero": false, "data": true, "offset": 0}, { "start": 65536, "length": 65536, "depth": 0, "zero": false, "data": true, "offset": 327680}, { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": true}, { "start": 196608, "length": 65536, "depth": 1, "zero": true, "data": false, "offset": 196608}] $ build/qemu-nbd -r -t -f qcow2 top.qcow2 & $ qemu-img map --output json nbd://localhost [{ "start": 0, "length": 131072, "depth": 0, "zero": false, "data": true, "offset": 0}, { "start": 131072, "length": 65536, "depth": 0, "zero": true, "data": true, "offset": 131072}, { "start": 196608, "length": 65536, "depth": 0, "zero": true, "data": false, "offset": 196608}] $ nbdinfo --map nbd://localhost 0 1310720 allocated 131072 655362 zero 196608 655363 hole,zero The issue was found by ovirt-imageio functional tests: https://github.com/oVirt/ovirt-imageio/blob/master/daemon/test/client_test.py I did not update any of the existing tests, and I'm sure many tests are missing, and the documentation should change to describe the new behavior. Posting as is for early review. Signed-off-by: Nir Soffer Resolves: https://bugzilla.redhat.com/1968693 --- nbd/server.c | 8 ++-- qemu-img.c | 4 +++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/nbd/server.c b/nbd/server.c index b60ebc3ab6..adf37905d5 100644 --- a/nbd/server.c +++ b/nbd/server.c
Re: [PATCH] block/file-posix: Fix problem with fallocate(PUNCH_HOLE) on GPFS
On Fri, Apr 16, 2021 at 8:23 AM Thomas Huth wrote: > > A customer reported that running > > qemu-img convert -t none -O qcow2 -f qcow2 input.qcow2 output.qcow2 > > fails for them with the following error message when the images are > stored on a GPFS file system: > > qemu-img: error while writing sector 0: Invalid argument > > After analyzing the strace output, it seems like the problem is in > handle_aiocb_write_zeroes(): The call to fallocate(FALLOC_FL_PUNCH_HOLE) > returns EINVAL, which can apparently happen if the file system has > a different idea of the granularity of the operation. It's arguably > a bug in GPFS, since the PUNCH_HOLE mode should not result in EINVAL > according to the man-page of fallocate(), but the file system is out > there in production and so we have to deal with it. In commit 294682cc3a > ("block: workaround for unaligned byte range in fallocate()") we also > already applied the a work-around for the same problem to the earlier > fallocate(FALLOC_FL_ZERO_RANGE) call, so do it now similar with the > PUNCH_HOLE call. > > Signed-off-by: Thomas Huth > --- > block/file-posix.c | 7 +++ > 1 file changed, 7 insertions(+) > > diff --git a/block/file-posix.c b/block/file-posix.c > index 20e14f8e96..7a40428d52 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -1675,6 +1675,13 @@ static int handle_aiocb_write_zeroes(void *opaque) > } > s->has_fallocate = false; > } else if (ret != -ENOTSUP) { > +if (ret == -EINVAL) { > +/* > + * File systems like GPFS do not like unaligned byte ranges, > + * treat it like unsupported (so caller falls back to pwrite) > + */ > +return -ENOTSUP; This skips the next fallback, using plain fallocate(0) if we write after the end of the file. Is this intended? We can treat the buggy EINVAL return value as "filesystem is buggy, let's not try other options", or "let's try the next option". Since falling back to actually writing zeroes is so much slower, I think it is better to try the next option. 
This issue also affects libnbd (nbdcopy file backend). Do we have a bug for GPFS? Nir > +} > return ret; > } else { > s->has_discard = false; > -- > 2.27.0 > >
[PATCH] qemu-iotest: Test NBD hole reporting for qcow2
In commit commit 0da9856851dcca09222a1467e16ddd05dc66e460 nbd: server: Report holes for raw images we changed the way holes are reported for raw images, but also how known-zero portions of qcow2 files are reported. This was not covered by iotests, and revealed recently by libnbd tests[1]. Add the missing tests for single qcow2 image and qcow2 image with a backing file. [1] https://listman.redhat.com/archives/libguestfs/2021-April/msg00050.html Signed-off-by: Nir Soffer --- tests/qemu-iotests/314 | 96 tests/qemu-iotests/314.out | 34 + tests/qemu-iotests/common.rc | 1 + 3 files changed, 131 insertions(+) create mode 100755 tests/qemu-iotests/314 create mode 100644 tests/qemu-iotests/314.out diff --git a/tests/qemu-iotests/314 b/tests/qemu-iotests/314 new file mode 100755 index 00..81c0169eac --- /dev/null +++ b/tests/qemu-iotests/314 @@ -0,0 +1,96 @@ +#!/usr/bin/env bash +# group: rw quick +# +# Test qemu-nbd base:allocation metacontext +# +# Copyright (C) 2021 Nir Soffer +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. +# +# owner=nir...@gmail.com + +seq="$(basename $0)" +echo "QA output created by $seq" + +status=1 # failure is the default! + +_cleanup() +{ +_cleanup_test_img +rm -f "$TEST_DIR/server.log" +nbd_server_stop +} +trap "_cleanup; exit \$status" 0 1 2 3 15 + +# get standard environment, filters and checks +. ./common.rc +. ./common.filter +. 
./common.nbd + +_supported_fmt qcow2 +_supported_proto nbd +_supported_os Linux +_require_command QEMU_NBD + +TEST_IMG="nbd+unix:///?socket=$nbd_unix_socket" + +echo +echo "=== Single image ===" +echo + +$QEMU_IMG create -f "$IMGFMT" -o cluster_size=64k \ +"$TEST_IMG_FILE" 384k | _filter_img_create_filenames + +$QEMU_IO -f $IMGFMT -c "write -P 1 0k 64k" "$TEST_IMG_FILE" | _filter_qemu_io +$QEMU_IO -f $IMGFMT -c "write -P 2 64k 512" "$TEST_IMG_FILE" | _filter_qemu_io +$QEMU_IO -f $IMGFMT -c "write -z 192k 64k" "$TEST_IMG_FILE" | _filter_qemu_io +$QEMU_IO -f $IMGFMT -c "write -z 256k 512" "$TEST_IMG_FILE" | _filter_qemu_io + +nbd_server_start_unix_socket -f $IMGFMT "$TEST_IMG_FILE" + +echo +$QEMU_NBD_PROG --list -k $nbd_unix_socket >/dev/null +$QEMU_IMG map -f raw --output=json "$TEST_IMG" | _filter_qemu_img_map + +nbd_server_stop + +echo +echo "=== Image with backing file ===" +echo + +$QEMU_IMG create -f "$IMGFMT" -o cluster_size=64k \ +"$TEST_IMG_FILE.base" 384k | _filter_img_create_filenames + +$QEMU_IO -f $IMGFMT -c "write -P 1 0k 64k" "$TEST_IMG_FILE.base" | _filter_qemu_io +$QEMU_IO -f $IMGFMT -c "write -P 2 64k 512" "$TEST_IMG_FILE.base" | _filter_qemu_io + +$QEMU_IMG create -f "$IMGFMT" -o cluster_size=64k \ +-b "$TEST_IMG_FILE.base" -F $IMGFMT "$TEST_IMG_FILE" | _filter_img_create_filenames + +$QEMU_IO -f $IMGFMT -c "write -z 192k 64k" "$TEST_IMG_FILE" | _filter_qemu_io +$QEMU_IO -f $IMGFMT -c "write -z 256k 512" "$TEST_IMG_FILE" | _filter_qemu_io + +nbd_server_start_unix_socket -f $IMGFMT "$TEST_IMG_FILE" + +echo +$QEMU_NBD_PROG --list -k $nbd_unix_socket >/dev/null +$QEMU_IMG map -f raw --output=json "$TEST_IMG" | _filter_qemu_img_map + +nbd_server_stop + +# success, all done +echo +echo '*** done' +rm -f $seq.full +status=0 diff --git a/tests/qemu-iotests/314.out b/tests/qemu-iotests/314.out new file mode 100644 index 00..df7eef023f --- /dev/null +++ b/tests/qemu-iotests/314.out @@ -0,0 +1,34 @@ +QA output created by 314 + +=== Single image === + 
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT cluster_size=65536 extended_l2=off compression_type=zlib size=393216 lazy_refcounts=off refcount_bits=16 +wrote 65536/65536 bytes at offset 0 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 512/512 bytes at offset 65536 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 65536/65536 bytes at offset 196608 +64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec) +wrote 512/512 bytes at offset 262144 +512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec
Re: [PATCH V4] file-posix: allow -EBUSY error during ioctl(fd, BLKZEROOUT, range) on block device
On Fri, Mar 26, 2021 at 3:21 AM ChangLimin wrote: > >On Thu, Mar 25, 2021 at 8:07 AM ChangLimin > wrote: > >>On Wed, Mar 24, 2021 at 4:52 PM Max Reitz wrote: > >>On 22.03.21 10:25, ChangLimin wrote: > >>> For Linux 5.10/5.11, qemu write zeros to a multipath device using > >>> ioctl(fd, BLKZEROOUT, range) with cache none or directsync return > -EBUSY > >>> permanently. > >> > >>So as far as I can track back the discussion, Kevin asked on v1 why we’d > >>set has_write_zeroes to false, i.e. whether the EBUSY might not go away > >>at some point, and if it did, whether we shouldn’t retry BLKZEROOUT then. > >>You haven’t explicitly replied to that question (as far as I can see), > >>so it kind of still stands. > >> > >>Implicitly, there are two conflicting answers in this patch: On one > >>hand, the commit message says “permanently”, and this is what you told > >>Nir as a realistic case where this can occur. > > > >For Linux 5.10/5.11, the EBUSY is permanently, the reproduce step is > below. > >For other Linux version, the EBUSY may be temporary. > >Because Linux 5.10/5.11 is not used widely, so do not set > has_write_zeroes to false. > > > >>I'm afraid ChangLimin did not answer my question. I'm looking for real > >>world used case when qemu cannot write zeros to multipath device, when > >>nobody else is using the device. > >> > >>I tried to reproduce this on Fedora (kernel 5.10) with qemu-img convert, > >>once with a multipath device, and once with logical volume on a vg > created > >>on the multipath device, and I could not reproduce this issue. > > > >The following is steps to reproduct the issue on Fedora 34. > > > ># uname -a > >Linux fedora-34 5.11.3-300.fc34.x86_64 #1 SMP Thu Mar 4 19:03:18 UTC 2021 > x86_64 x86_64 x86_64 GNU/Linux > > > >Is this the most recent kernel? I have 5.11.7 in fedora 32. > > > > > ># qemu-img -V > >qemu-img version 5.2.0 (qemu-5.2.0-5.fc34.1) > > > >1. 
Login in an ISCSI LUN created using targetcli on ubuntu 20.04 > ># iscsiadm -m discovery -t st -p 192.169.1.109 > >192.169.1.109:3260,1 iqn.2003-01.org.linux-iscsi:lio-lv100 > > > ># iscsiadm -m node -l -T iqn.2003-01.org.linux-iscsi:lio-lv100 > ># iscsiadm -m session > >tcp: [1] 192.169.1.109:3260,1 iqn.2003-01.org.linux-iscsi:lio-lv100 > (non-flash) > > > >2. start multipathd service > ># mpathconf --enable > ># systemctl start multipathd > > > >3. add multipath path > ># multipath -a `/lib/udev/scsi_id -g /dev/sdb` # sdb means the ISCSI LUN > >wwid '36001405b76856e4816b48b99c6a77de3' added > > > ># multipathd add path /dev/sdb > >ok > > > ># multipath -ll # /dev/dm-1 is the multipath device based on /dev/sdb > >mpatha (36001405bebfc3a0522541cda30220db9) dm-1 LIO-ORG,lv102 > >size=1.0G features='0' hwhandler='1 alua' wp=rw > >`-+- policy='service-time 0' prio=50 status=active > > `- 5:0:0:0 sdd 8:48 active ready running > > > >You are using user_friendly_names which is (sadly) the default. > >But I don't think it should matter. > > > >4. qemu-img return EBUSY both to dm-1 and sdb > ># wget > http://download.cirros-cloud.net/0.4.0/cirros-0.4.0-x86_64-disk.img > ># qemu-img convert -O raw -t none cirros-0.4.0-x86_64-disk.img /dev/dm-1 > >qemu-img: error while writing at byte 0: Device or resource busy > > > ># qemu-img convert -O raw -t none cirros-0.4.0-x86_64-disk.img /dev/sdb > >qemu-img: error while writing at byte 0: Device or resource busy > > > >5. blkdiscard also return EBUSY both to dm-1 and sdb > ># blkdiscard -o 0 -l 4096 /dev/dm-1 > >blkdiscard: cannot open /dev/dm-1: Device or resource busy > > > ># blkdiscard -o 0 -l 4096 /dev/sdb > >blkdiscard: cannot open /dev/sdb: No such file or directory > > > >6. dd write zero is good, because it does not use blkdiscard > ># dd if=/dev/zero of=/dev/dm-1 bs=1M count=100 oflag=direct > >100+0 records in > >100+0 records out > >104857600 bytes (105 MB, 100 MiB) copied, 2.33623 s, 44.9 MB/s > > > >7. 
The LUN should support blkdiscard feature, otherwise it will not write > zero > >with ioctl(fd, BLKZEROOUT, range) > > > >Thanks! > > > >I could not reproduce this with kernel 5.10, but now I'm no 5.11: > ># uname -r > >5.11.7-100.fc32.x86_64 > > > ># qemu-img --version > >qemu-img version 5.2.0 (qemu-5.2.0-6.fc32.1) > >Copyright (c) 2003-2020 Fabrice Bellard and the QEMU Project developers > > > ># cat /etc/multipath.conf > >defaults { > >user_friendly_names no > >find_multipaths no > >} > > > >blacklist_exceptions { > >property "(SCSI_IDENT_|ID_WWN)" > >} > > > >blacklist { > >} > > > ># multipath -ll 36001405e884ab8ff4b44fdba6901099c > >36001405e884ab8ff4b44fdba6901099c dm-8 LIO-ORG,3-09 > >size=6.0G features='0' hwhandler='1 alua' wp=rw > >`-+- policy='service-time 0' prio=50 status=active > > `- 1:0:0:9 sdk 8:160 active ready running > > > >$ lsblk /dev/sdk > >NAMEMAJ:MIN RM SIZE RO TYPE MOUNTPOINT > >sdk 8:160 0 6G 0 disk > >└─36001405e884ab8ff4b44fdba6901099c 253:13 0 6G
Re: [PATCH V4] file-posix: allow -EBUSY error during ioctl(fd, BLKZEROOUT, range) on block device
On Thu, Mar 25, 2021 at 8:07 AM ChangLimin wrote: > >On Wed, Mar 24, 2021 at 4:52 PM Max Reitz wrote: > >On 22.03.21 10:25, ChangLimin wrote: > >> For Linux 5.10/5.11, qemu write zeros to a multipath device using > >> ioctl(fd, BLKZEROOUT, range) with cache none or directsync return -EBUSY > >> permanently. > > > >So as far as I can track back the discussion, Kevin asked on v1 why we’d > >set has_write_zeroes to false, i.e. whether the EBUSY might not go away > >at some point, and if it did, whether we shouldn’t retry BLKZEROOUT then. > >You haven’t explicitly replied to that question (as far as I can see), > >so it kind of still stands. > > > >Implicitly, there are two conflicting answers in this patch: On one > >hand, the commit message says “permanently”, and this is what you told > >Nir as a realistic case where this can occur. > > For Linux 5.10/5.11, the EBUSY is permanently, the reproduce step is > below. > For other Linux version, the EBUSY may be temporary. > Because Linux 5.10/5.11 is not used widely, so do not set has_write_zeroes > to false. > > >I'm afraid ChangLimin did not answer my question. I'm looking for real > >world used case when qemu cannot write zeros to multipath device, when > >nobody else is using the device. > > > >I tried to reproduce this on Fedora (kernel 5.10) with qemu-img convert, > >once with a multipath device, and once with logical volume on a vg created > >on the multipath device, and I could not reproduce this issue. > > The following is steps to reproduct the issue on Fedora 34. > > # uname -a > Linux fedora-34 5.11.3-300.fc34.x86_64 #1 SMP Thu Mar 4 19:03:18 UTC 2021 > x86_64 x86_64 x86_64 GNU/Linux > Is this the most recent kernel? I have 5.11.7 in fedora 32. > > # qemu-img -V > qemu-img version 5.2.0 (qemu-5.2.0-5.fc34.1) > > 1. 
Login in an ISCSI LUN created using targetcli on ubuntu 20.04 > # iscsiadm -m discovery -t st -p 192.169.1.109 > 192.169.1.109:3260,1 iqn.2003-01.org.linux-iscsi:lio-lv100 > > # iscsiadm -m node -l -T iqn.2003-01.org.linux-iscsi:lio-lv100 > # iscsiadm -m session > tcp: [1] 192.169.1.109:3260,1 iqn.2003-01.org.linux-iscsi:lio-lv100 > (non-flash) > > 2. start multipathd service > # mpathconf --enable > # systemctl start multipathd > > 3. add multipath path > # multipath -a `/lib/udev/scsi_id -g /dev/sdb` # sdb means the ISCSI LUN > wwid '36001405b76856e4816b48b99c6a77de3' added > > # multipathd add path /dev/sdb > ok > > # multipath -ll # /dev/dm-1 is the multipath device based on /dev/sdb > mpatha (36001405bebfc3a0522541cda30220db9) dm-1 LIO-ORG,lv102 > size=1.0G features='0' hwhandler='1 alua' wp=rw > `-+- policy='service-time 0' prio=50 status=active > `- 5:0:0:0 sdd 8:48 active ready running > You are using user_friendly_names which is (sadly) the default. But I don't think it should matter. 4. qemu-img return EBUSY both to dm-1 and sdb > # wget http://download.cirros-cloud.net/0.4.0/cirros-0.4.0-x86_64-disk.img > # qemu-img convert -O raw -t none cirros-0.4.0-x86_64-disk.img /dev/dm-1 > qemu-img: error while writing at byte 0: Device or resource busy > > # qemu-img convert -O raw -t none cirros-0.4.0-x86_64-disk.img /dev/sdb > qemu-img: error while writing at byte 0: Device or resource busy > > 5. blkdiscard also return EBUSY both to dm-1 and sdb > # blkdiscard -o 0 -l 4096 /dev/dm-1 > blkdiscard: cannot open /dev/dm-1: Device or resource busy > > # blkdiscard -o 0 -l 4096 /dev/sdb > blkdiscard: cannot open /dev/sdb: No such file or directory > > 6. dd write zero is good, because it does not use blkdiscard > # dd if=/dev/zero of=/dev/dm-1 bs=1M count=100 oflag=direct > 100+0 records in > 100+0 records out > 104857600 bytes (105 MB, 100 MiB) copied, 2.33623 s, 44.9 MB/s > > 7. 
The LUN should support blkdiscard feature, otherwise it will not write > zero > with ioctl(fd, BLKZEROOUT, range) > Thanks! I could not reproduce this with kernel 5.10, but now I'm no 5.11: # uname -r 5.11.7-100.fc32.x86_64 # qemu-img --version qemu-img version 5.2.0 (qemu-5.2.0-6.fc32.1) Copyright (c) 2003-2020 Fabrice Bellard and the QEMU Project developers # cat /etc/multipath.conf defaults { user_friendly_names no find_multipaths no } blacklist_exceptions { property "(SCSI_IDENT_|ID_WWN)" } blacklist { } # multipath -ll 36001405e884ab8ff4b44fdba6901099c 36001405e884ab8ff4b44fdba6901099c dm-8 LIO-ORG,3-09 size=6.0G features='0' hwhandler='1 alua' wp=rw `-+- policy='service-time 0' prio=50 status=active `- 1:0:0:9 sdk 8:160 active ready running $ lsblk /dev/sdk NAMEMAJ:MIN RM SIZE RO TYPE MOUNTPOINT sdk 8:160 0 6G 0 disk └─36001405e884ab8ff4b44fdba6901099c 253:13 0 6G 0 mpath $ virt-builder fedora-32 -o disk.img [ 2.9] Downloading: http://builder.libguestfs.org/fedora-32.xz [ 3.8] Planning how to build this image [ 3.8] Uncompressing [ 11.1] Opening the new disk [ 16.1] Setting a random seed [ 16.1] Setting passwords virt-builder: S
Re: [PATCH V4] file-posix: allow -EBUSY error during ioctl(fd, BLKZEROOUT, range) on block device
On Wed, Mar 24, 2021 at 4:52 PM Max Reitz wrote: > On 22.03.21 10:25, ChangLimin wrote: > > For Linux 5.10/5.11, qemu write zeros to a multipath device using > > ioctl(fd, BLKZEROOUT, range) with cache none or directsync return -EBUSY > > permanently. > > So as far as I can track back the discussion, Kevin asked on v1 why we’d > set has_write_zeroes to false, i.e. whether the EBUSY might not go away > at some point, and if it did, whether we shouldn’t retry BLKZEROOUT then. > You haven’t explicitly replied to that question (as far as I can see), > so it kind of still stands. > > Implicitly, there are two conflicting answers in this patch: On one > hand, the commit message says “permanently”, and this is what you told > Nir as a realistic case where this can occur. I'm afraid ChangLimin did not answer my question. I'm looking for real world used case when qemu cannot write zeros to multipath device, when nobody else is using the device. I tried to reproduce this on Fedora (kernel 5.10) with qemu-img convert, once with a multipath device, and once with logical volume on a vg created on the multipath device, and I could not reproduce this issue. If I understand the kernel change correctly, this can happen when there is a mounted file system on top of the multipath device. I don't think we have a use case when qemu accesses a multipath device when the device is used by a file system, but maybe I missed something. > So that to me implies > that we actually should not retry BLKZEROOUT, because the EBUSY will > remain, and that condition won’t change while the block device is in use > by qemu. > > On the other hand, in the code, you have decided not to reset > has_write_zeroes to false, so the implementation will retry. > EBUSY is usually a temporary error, so retrying makes sense. The question is if we really can write zeroes manually in this case? > So I don’t quite understand. 
Should we keep trying BLKZEROOUT or is > there no chance of it working after it has at one point failed with > EBUSY? (Are there other cases besides what’s described in this commit > message where EBUSY might be returned and it is only temporary?) > > > Fallback to pwritev instead of exit for -EBUSY error. > > > > The issue was introduced in Linux 5.10: > > > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=384d87ef2c954fc58e6c5fd8253e4a1984f5fe02 > > > > Fixed in Linux 5.12: > > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=56887cffe946bb0a90c74429fa94d6110a73119d > > > > Signed-off-by: ChangLimin > > --- > > block/file-posix.c | 8 ++-- > > 1 file changed, 6 insertions(+), 2 deletions(-) > > > > diff --git a/block/file-posix.c b/block/file-posix.c > > index 20e14f8e96..d4054ac9cb 100644 > > --- a/block/file-posix.c > > +++ b/block/file-posix.c > > @@ -1624,8 +1624,12 @@ static ssize_t > > handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb) > > } while (errno == EINTR); > > > > ret = translate_err(-errno); > > -if (ret == -ENOTSUP) { > > -s->has_write_zeroes = false; > > +switch (ret) { > > +case -ENOTSUP: > > +s->has_write_zeroes = false; /* fall through */ > > +case -EBUSY: /* Linux 5.10/5.11 may return -EBUSY for multipath > > devices */ > > +return -ENOTSUP; > > +break; > > (Not sure why this break is here.) > > Max > > > } > > } > > #endif > > -- > > 2.27.0 > > > > >
Re: [PATCH] nbd: server: Report holes for raw images
On Thu, Feb 25, 2021 at 8:51 PM Vladimir Sementsov-Ogievskiy < vsement...@virtuozzo.com> wrote: > 19.02.2021 19:58, Eric Blake wrote: > > On 2/19/21 10:42 AM, Eric Blake wrote: > > > >>> To me, data=false looks compatible with NBD_STATE_HOLE. From user point > >>> of view, getting same results from qemu-nbd and qemu-img is more > >>> important than being more correct about allocation status. > >> > >> More to the point, here is our inconsistency: > >> > >> In nbd/server.c, we turn !BDRV_BLOCK_ALLOCATED into NBD_STATE_HOLE > >> > >> In block/nbd.c, we turn !NBD_STATE_HOLE into BDRV_BLOCK_DATA > >> > >> The fact that we are not doing a round-trip conversion means that one of > >> the two places is wrong. And your argument that the server side is > >> wrong makes sense to me. > > > > In fact, when I went back and researched when this was introduced (see > > commit e7b1948d51 in 2018), we may have been aware of the inconsistency > > between client and server, but didn't make up our minds at the time: > > https://lists.gnu.org/archive/html/qemu-devel/2018-03/msg03465.html > > "? Hm, don't remember, what we decided about DATA/HOLE flags mapping.." > > > >> > >> I'll wait a few days for any other reviewer commentary before taking > >> this through my NBD tree. > >> > > > > > I can add the following. > > First, link to my research of block_status in Qemu: > https://lists.gnu.org/archive/html/qemu-devel/2020-04/msg05136.html > > And about HOLE and ZERO.. > > As I've noted in the research above, SCSI may return HOLE & !ZERO: > > from SCSI: > Logical Block Provisioning Read Zeros (LBPRZ) bit > 1 If the logical block provisioning read zeros (LBPRZ) bit is set to > one, then, for an unmapped LBA specified by a read operation, the > deviceserver shall send user data with all bits set to zero to the data-in > buffer. 
> 0 If the TPRZ bit is set to zero, then, for an unmapped LBA specified > by a read operation, the device server may send user data with all bitsset > to any value to the data-in buffer. > > So we can have an unmapped area that can be read as any random data. Same > thing can be said about null-co driver with read-zeroes=false > > Also, qcow2 support ALLOCATED ZERO clusters which reads as zero but data > is allocated - they are reasonable to report as ZERO & !HOLE > > And of-course UNALLOCATED ZERO clusters in qcow2 and lseek-holes are > reasonable to report as ZERO & HOLE, because they reads as zero and > "future writes to that area may cause fragmentation or encounter an > NBD_ENOSPC".. > > So, all combination are reasonable, we just need to fix Qemu NBD server to > report correct statuses in all these cases. > > It seems that ZERO/HOLE specification is a lot more reasonable than what > we have with ZERO/DATA/ALLOCATED in Qemu, and may be true way is move > internal block_status to use NBD terms. > > > And thanks for CCing me. Hmm, maybe, I'll suggest myself as co-maintainer > for NBD? Kevin, Max, are you ok with this change?
Re: [PATCH] file-posix: allow -EBUSY errors during write zeros on block device
On Tue, Mar 2, 2021 at 4:08 AM ChangLimin wrote: > > After Linux 5.10, write zeros to a multipath device using > ioctl(fd, BLKZEROOUT, range) with cache none or directsync will return EBUSY. > > Similar to handle_aiocb_write_zeroes_unmap, handle_aiocb_write_zeroes_block > allow -EBUSY errors during ioctl(fd, BLKZEROOUT, range). > > Reference commit in Linux 5.10: > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=384d87ef2c954fc58e6c5fd8253e4a1984f5fe02 But this can happen only when the block device is used by a file system or maybe someone else. In qemu we assume that we are the only user of the block device, so EBUSY is a fatal error that should never happen, no? Can you explain a real world use case when we get EBUSY? Nir > Signed-off-by: ChangLimin > --- > block/file-posix.c | 7 ++- > 1 file changed, 6 insertions(+), 1 deletion(-) > > diff --git a/block/file-posix.c b/block/file-posix.c > index 05079b40ca..3e60c96214 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -1629,8 +1629,13 @@ static ssize_t > handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb) > } while (errno == EINTR); > > ret = translate_err(-errno); > -if (ret == -ENOTSUP) { > +switch (ret) { > +case -ENOTSUP: > +case -EINVAL: > +case -EBUSY: > s->has_write_zeroes = false; > +return -ENOTSUP; > +break; > } > } > #endif > -- > 2.27.0 >
[PATCH] nbd: server: Report holes for raw images
When querying image extents for raw image, qemu-nbd reports holes as zero: $ qemu-nbd -t -r -f raw empty-6g.raw $ qemu-img map --output json nbd://localhost [{ "start": 0, "length": 6442450944, "depth": 0, "zero": true, "data": true, "offset": 0}] $ qemu-img map --output json empty-6g.raw [{ "start": 0, "length": 6442450944, "depth": 0, "zero": true, "data": false, "offset": 0}] Turns out that qemu-img map reports a hole based on BDRV_BLOCK_DATA, but nbd server reports a hole based on BDRV_BLOCK_ALLOCATED. The NBD protocol says: NBD_STATE_HOLE (bit 0): if set, the block represents a hole (and future writes to that area may cause fragmentation or encounter an NBD_ENOSPC error); if clear, the block is allocated or the server could not otherwise determine its status. qemu-img manual says: whether the sectors contain actual data or not (boolean field data; if false, the sectors are either unallocated or stored as optimized all-zero clusters); To me, data=false looks compatible with NBD_STATE_HOLE. From user point of view, getting same results from qemu-nbd and qemu-img is more important than being more correct about allocation status. Changing nbd server to report holes using BDRV_BLOCK_DATA makes qemu-nbd results compatible with qemu-img map: $ qemu-img map --output json nbd://localhost [{ "start": 0, "length": 6442450944, "depth": 0, "zero": true, "data": false, "offset": 0}] Signed-off-by: Nir Soffer --- nbd/server.c | 4 ++-- tests/qemu-iotests/241.out | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nbd/server.c b/nbd/server.c index 7229f487d2..86a44a9b41 100644 --- a/nbd/server.c +++ b/nbd/server.c @@ -2087,8 +2087,8 @@ static int blockstatus_to_extents(BlockDriverState *bs, uint64_t offset, return ret; } -flags = (ret & BDRV_BLOCK_ALLOCATED ? 0 : NBD_STATE_HOLE) | -(ret & BDRV_BLOCK_ZERO ? NBD_STATE_ZERO : 0); +flags = (ret & BDRV_BLOCK_DATA ? 0 : NBD_STATE_HOLE) | +(ret & BDRV_BLOCK_ZERO ? 
NBD_STATE_ZERO : 0); if (nbd_extent_array_add(ea, num, flags) < 0) { return 0; diff --git a/tests/qemu-iotests/241.out b/tests/qemu-iotests/241.out index 75f9f465e5..3f8c173cc8 100644 --- a/tests/qemu-iotests/241.out +++ b/tests/qemu-iotests/241.out @@ -5,7 +5,7 @@ QA output created by 241 size: 1024 min block: 1 [{ "start": 0, "length": 1000, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, -{ "start": 1000, "length": 24, "depth": 0, "zero": true, "data": true, "offset": OFFSET}] +{ "start": 1000, "length": 24, "depth": 0, "zero": true, "data": false, "offset": OFFSET}] 1 KiB (0x400) bytes allocated at offset 0 bytes (0x0) === Exporting unaligned raw image, forced server sector alignment === @@ -23,6 +23,6 @@ WARNING: Image format was not specified for 'TEST_DIR/t.raw' and probing guessed size: 1024 min block: 1 [{ "start": 0, "length": 1000, "depth": 0, "zero": false, "data": true, "offset": OFFSET}, -{ "start": 1000, "length": 24, "depth": 0, "zero": true, "data": true, "offset": OFFSET}] +{ "start": 1000, "length": 24, "depth": 0, "zero": true, "data": false, "offset": OFFSET}] 1 KiB (0x400) bytes allocated at offset 0 bytes (0x0) *** done -- 2.26.2
Re: [PATCH v3 2/2] qemu-nbd: Permit --shared=0 for unlimited clients
On Tue, Feb 9, 2021 at 5:28 PM Eric Blake wrote: > > This gives us better feature parity with QMP nbd-server-start, where > max-connections defaults to 0 for unlimited. Sound useful > Signed-off-by: Eric Blake > --- > docs/tools/qemu-nbd.rst | 4 ++-- > qemu-nbd.c | 7 +++ > 2 files changed, 5 insertions(+), 6 deletions(-) > > diff --git a/docs/tools/qemu-nbd.rst b/docs/tools/qemu-nbd.rst > index fe41336dc550..ee862fa0bc02 100644 > --- a/docs/tools/qemu-nbd.rst > +++ b/docs/tools/qemu-nbd.rst > @@ -136,8 +136,8 @@ driver options if ``--image-opts`` is specified. > .. option:: -e, --shared=NUM > >Allow up to *NUM* clients to share the device (default > - ``1``). Safe for readers, but for now, consistency is not > - guaranteed between multiple writers. > + ``1``), 0 for unlimited. Safe for readers, but for now, > + consistency is not guaranteed between multiple writers. > > .. option:: -t, --persistent > > diff --git a/qemu-nbd.c b/qemu-nbd.c > index 1a340ea4858d..5416509ece18 100644 > --- a/qemu-nbd.c > +++ b/qemu-nbd.c > @@ -328,7 +328,7 @@ static void *nbd_client_thread(void *arg) > > static int nbd_can_accept(void) > { > -return state == RUNNING && nb_fds < shared; > +return state == RUNNING && (shared == 0 || nb_fds < shared); > } > > static void nbd_update_server_watch(void); > @@ -706,8 +706,8 @@ int main(int argc, char **argv) > device = optarg; > break; > case 'e': > if (qemu_strtoi(optarg, NULL, 0, &shared) < 0 || > -shared < 1) { > +shared < 0) { > error_report("Invalid shared device number '%s'", optarg); > exit(EXIT_FAILURE); > } > @@ -966,7 +965,7 @@ int main(int argc, char **argv) > if (socket_activation == 0) { > int backlog; > > -if (persistent) { > +if (persistent || shared == 0) { > backlog = SOMAXCONN; > } else { > backlog = MIN(shared, SOMAXCONN); > -- > 2.30.0 > Reviewed-by: Nir Soffer
Re: [PATCH v3 1/2] qemu-nbd: Use SOMAXCONN for socket listen() backlog
On Tue, Feb 9, 2021 at 5:28 PM Eric Blake wrote: > > Our default of a backlog of 1 connection is rather puny; it gets in > the way when we are explicitly allowing multiple clients (such as > qemu-nbd -e N [--shared], or nbd-server-start with its default > "max-connections":0 for unlimited), but is even a problem when we > stick to qemu-nbd's default of only 1 active client but use -t > [--persistent] where a second client can start using the server once > the first finishes. While the effects are less noticeable on TCP > sockets (since the client can poll() to learn when the server is ready > again), it is definitely observable on Unix sockets, where on Unix, a > client will fail with EAGAIN and no recourse but to sleep an arbitrary > amount of time before retrying if the server backlog is already full. > > Since QMP nbd-server-start is always persistent, it now always > requests a backlog of SOMAXCONN; This makes sense since we don't limit the number of connections. > meanwhile, qemu-nbd will request > SOMAXCONN if persistent, otherwise its backlog should be based on the > expected number of clients. If --persistent is used without --shared, we allow only one concurrent connection, so not clear why we need maximum backlog. I think that separating --persistent and --shared would be easier to understand and use. The backlog will always be based on shared value. > See https://bugzilla.redhat.com/1925045 for a demonstration of where > our low backlog prevents libnbd from connecting as many parallel > clients as it wants. > > Reported-by: Richard W.M. 
Jones > Signed-off-by: Eric Blake > CC: qemu-sta...@nongnu.org > --- > blockdev-nbd.c | 7 ++- > qemu-nbd.c | 10 +- > 2 files changed, 15 insertions(+), 2 deletions(-) > > diff --git a/blockdev-nbd.c b/blockdev-nbd.c > index d8443d235b73..b264620b98d8 100644 > --- a/blockdev-nbd.c > +++ b/blockdev-nbd.c > @@ -134,7 +134,12 @@ void nbd_server_start(SocketAddress *addr, const char > *tls_creds, > qio_net_listener_set_name(nbd_server->listener, >"nbd-listener"); > > -if (qio_net_listener_open_sync(nbd_server->listener, addr, 1, errp) < 0) > { > +/* > + * Because this server is persistent, a backlog of SOMAXCONN is > + * better than trying to size it to max_connections. The comment is not clear. Previously we used hard code value (1) but we do support more than one connection. Maybe it is better to explain that we don't know how many connections are needed? > + */ > +if (qio_net_listener_open_sync(nbd_server->listener, addr, SOMAXCONN, > + errp) < 0) { > goto error; > } > > diff --git a/qemu-nbd.c b/qemu-nbd.c > index 608c63e82a25..1a340ea4858d 100644 > --- a/qemu-nbd.c > +++ b/qemu-nbd.c > @@ -964,8 +964,16 @@ int main(int argc, char **argv) > > server = qio_net_listener_new(); > if (socket_activation == 0) { > +int backlog; > + > +if (persistent) { > +backlog = SOMAXCONN; This increases the backlog, but since default shared is still 1, we will not accept more than 1 connection, so not clear why SOMAXCONN is better. > +} else { > +backlog = MIN(shared, SOMAXCONN); > +} > saddr = nbd_build_socket_address(sockpath, bindto, port); > -if (qio_net_listener_open_sync(server, saddr, 1, &local_err) < 0) { > +if (qio_net_listener_open_sync(server, saddr, backlog, > + &local_err) < 0) { > object_unref(OBJECT(server)); > error_report_err(local_err); > exit(EXIT_FAILURE); > -- > 2.30.0 >
Re: [PATCH v2] qemu-nbd: Use SOMAXCONN for socket listen() backlog
On Fri, Feb 5, 2021 at 8:57 PM Eric Blake wrote: > > Our default of a backlog of 1 connection is rather puny, particularly > for scenarios where we expect multiple listeners to connect (such as > qemu-nbd -e X). This is especially important for Unix sockets, as a > definite benefit to clients: at least on Linux, a client trying to > connect to a Unix socket with a backlog gets an EAGAIN failure with no > way to poll() for when the backlog is no longer present short of > sleeping an arbitrary amount of time before retrying. > > See https://bugzilla.redhat.com/1925045 for a demonstration of where > our low backlog prevents libnbd from connecting as many parallel > clients as it wants. > > Reported-by: Richard W.M. Jones > Signed-off-by: Eric Blake > --- > > v2: target the correct API used by qemu-nbd, rather than an unrelated > legacy wrapper [Dan] > > qemu-nbd.c | 3 ++- > 1 file changed, 2 insertions(+), 1 deletion(-) > > diff --git a/qemu-nbd.c b/qemu-nbd.c > index 608c63e82a25..cd20ee73be19 100644 > --- a/qemu-nbd.c > +++ b/qemu-nbd.c > @@ -965,7 +965,8 @@ int main(int argc, char **argv) > server = qio_net_listener_new(); > if (socket_activation == 0) { > saddr = nbd_build_socket_address(sockpath, bindto, port); > -if (qio_net_listener_open_sync(server, saddr, 1, &local_err) < 0) { > +if (qio_net_listener_open_sync(server, saddr, SOMAXCONN, Shouldn't we use value based on --shared=N? Using maximum value makes sense for generic server expecting to handle many connections from different clients. qemu-nbd is typically used by one client, and we need to make it possible to connect a known number of connections quickly. > + &local_err) < 0) { > object_unref(OBJECT(server)); > error_report_err(local_err); > exit(EXIT_FAILURE); > -- > 2.30.0 > >
Re: Potential regression in 'qemu-img convert' to LVM
On Tue, Sep 15, 2020 at 2:51 PM Stefan Reiter wrote: > > On 9/15/20 11:08 AM, Nir Soffer wrote: > > On Mon, Sep 14, 2020 at 3:25 PM Stefan Reiter wrote: > >> > >> Hi list, > >> > >> following command fails since 5.1 (tested on kernel 5.4.60): > >> > >> # qemu-img convert -p -f raw -O raw /dev/zvol/pool/disk-1 /dev/vg/disk-1 > >> qemu-img: error while writing at byte 2157968896: Device or resource busy > >> > >> (source is ZFS here, but doesn't matter in practice, it always fails the > >> same; offset changes slightly but consistently hovers around 2^31) > >> > >> strace shows the following: > >> fallocate(13, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2157968896, > >> 4608) = -1 EBUSY (Device or resource busy) > > > > What is the size of the LV? > > > > Same as the source, 5GB in my test case. Created with: > > # lvcreate -ay --size 5242880k --name disk-1 vg > > > Does it happen if you change sparse minimum size (-S)? > > > > For example: -S 64k > > > > qemu-img convert -p -f raw -O raw -S 64k /dev/zvol/pool/disk-1 > > /dev/vg/disk-1 > > > > Tried a few different values, always the same result: EBUSY at byte > 2157968896. > > >> Other fallocate calls leading up to this work fine. > >> > >> This happens since commit edafc70c0c "qemu-img convert: Don't pre-zero > >> images", before that all fallocates happened at the start. Reverting the > >> commit and calling qemu-img exactly the same way on the same data works > >> fine. > > > > But slowly, doing up to 100% more work for fully allocated images. > > > > Of course, I'm not saying the patch is wrong, reverting it just avoids > triggering the bug. > > >> Simply retrying the syscall on EBUSY (like EINTR) does *not* work, > >> once it fails it keeps failing with the same error. > >> > >> I couldn't find anything related to EBUSY on fallocate, and it only > >> happens on LVM targets... Any idea or pointers where to look? > > > > Is this thin LV? > > > > No, regular LV. See command above. 
> > > This works for us using regular LVs. > > > > Which kernel? which distro? > > > > Reproducible on: > * PVE w/ kernel 5.4.60 (Ubuntu based) > * Manjaro w/ kernel 5.8.6 > > I found that it does not happen with all images, I suppose there must be > a certain number of smaller holes for it to happen. I am using a VM > image with a bare-bones Alpine Linux installation, but it's not an > isolated case, we've had two people report the issue on our bug tracker: > https://bugzilla.proxmox.com/show_bug.cgi?id=3002 I think that this issue may be fixed by https://lists.nongnu.org/archive/html/qemu-block/2020-11/msg00358.html Nir
Re: [PULL 20/21] python/qemu/qmp.py: re-raise OSError when encountered
On Tue, Oct 20, 2020 at 8:52 PM John Snow wrote: > > Nested if conditions don't change when the exception block fires; we > need to explicitly re-raise the error if we didn't intend to capture and > suppress it. > > Signed-off-by: John Snow > Reviewed-by: Philippe Mathieu-Daudé > Message-id: 20201009175123.249009-3-js...@redhat.com > Signed-off-by: John Snow > --- > python/qemu/qmp.py | 11 ++- > 1 file changed, 6 insertions(+), 5 deletions(-) > > diff --git a/python/qemu/qmp.py b/python/qemu/qmp.py > index d911999da1..4969e5741c 100644 > --- a/python/qemu/qmp.py > +++ b/python/qemu/qmp.py > @@ -165,14 +165,15 @@ def __get_events(self, wait: Union[bool, float] = > False) -> None: > """ > > # Check for new events regardless and pull them into the cache: > -self.__sock.setblocking(False) > try: > +self.__sock.setblocking(False) This change is not required. The idiom is: do stuff try: something finally: undo stuff If do stuff failed, there is no need to undo it. socket.setblocking() should not fail with EAGAIN, so it does not need to be inside the try block. > self.__json_read() > except OSError as err: > -if err.errno == errno.EAGAIN: > -# No data available > -pass > -self.__sock.setblocking(True) > +# EAGAIN: No data available; not critical > +if err.errno != errno.EAGAIN: > +raise In python 3 this can be simplified to: try: self.__json_read() except BlockingIOError: pass https://docs.python.org/3.6/library/exceptions.html#BlockingIOError > +finally: > +self.__sock.setblocking(True) > > # Wait for new events, if needed. > # if wait is 0.0, this means "no wait" and is also implicitly false. > -- > 2.26.2 Nir
Re: [RFC PATCH 13/21] contrib/gitdm: Add more entries to the Red Hat domain
On Sun, Oct 4, 2020 at 9:05 PM Philippe Mathieu-Daudé wrote: > > Cc: Frediano Ziglio > Cc: Frediano Ziglio > Cc: Nir Soffer > Cc: Nir Soffer > Signed-off-by: Philippe Mathieu-Daudé > --- > To the developers Cc'ed: If you agree with your entry, please > reply with a Reviewed-by/Acked-by tag. If you disagree or doesn't > care, please either reply with Nack-by or ignore this patch. > I'll repost in 2 weeks as formal patch (not RFC) with only the > entries acked by their author. > --- > contrib/gitdm/group-map-redhat | 2 ++ > 1 file changed, 2 insertions(+) > > diff --git a/contrib/gitdm/group-map-redhat b/contrib/gitdm/group-map-redhat > index d15db2d35e..0419e82795 100644 > --- a/contrib/gitdm/group-map-redhat > +++ b/contrib/gitdm/group-map-redhat > @@ -6,3 +6,5 @@ da...@gibson.dropbear.id.au > laur...@vivier.eu > p...@fedoraproject.org > arm...@pond.sub.org > +fredd...@gmail.com > +nir...@gmail.com Acked-by Nir Soffer > -- > 2.26.2 >
Re: [PATCH] docs: Better mention of qemu-img amend limitations
On Wed, Sep 23, 2020 at 11:38 PM Eric Blake wrote: > > Missed during merge resolution of commit bc5ee6da71. > > Signed-off-by: Eric Blake > --- > docs/tools/qemu-img.rst | 4 > 1 file changed, 4 insertions(+) > > diff --git a/docs/tools/qemu-img.rst b/docs/tools/qemu-img.rst > index c35bd6482203..2b5891b54db7 100644 > --- a/docs/tools/qemu-img.rst > +++ b/docs/tools/qemu-img.rst > @@ -265,6 +265,10 @@ Command description: >--force allows some unsafe operations. Currently for -f luks, it allows to >erase the last encryption key, and to overwrite an active encryption key. > > + The set of options that can be amended are dependent on the image > + format, but note that amending the backing chain relationship should > + instead be performed with ``qemu-img rebase``. Because of the backing format? > + > .. option:: bench [-c COUNT] [-d DEPTH] [-f FMT] > [--flush-interval=FLUSH_INTERVAL] [-i AIO] [-n] [--no-drain] [-o OFFSET] > [--pattern=PATTERN] [-q] [-s BUFFER_SIZE] [-S STEP_SIZE] [-t CACHE] [-w] [-U] > FILENAME > >Run a simple sequential I/O benchmark on the specified image. If ``-w`` is > -- > 2.28.0 > >
Re: Potential regression in 'qemu-img convert' to LVM
On Mon, Sep 14, 2020 at 3:25 PM Stefan Reiter wrote: > > Hi list, > > following command fails since 5.1 (tested on kernel 5.4.60): > > # qemu-img convert -p -f raw -O raw /dev/zvol/pool/disk-1 /dev/vg/disk-1 > qemu-img: error while writing at byte 2157968896: Device or resource busy > > (source is ZFS here, but doesn't matter in practice, it always fails the > same; offset changes slightly but consistently hovers around 2^31) > > strace shows the following: > fallocate(13, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 2157968896, > 4608) = -1 EBUSY (Device or resource busy) What is the size of the LV? Does it happen if you change sparse minimum size (-S)? For example: -S 64k qemu-img convert -p -f raw -O raw -S 64k /dev/zvol/pool/disk-1 /dev/vg/disk-1 > Other fallocate calls leading up to this work fine. > > This happens since commit edafc70c0c "qemu-img convert: Don't pre-zero > images", before that all fallocates happened at the start. Reverting the > commit and calling qemu-img exactly the same way on the same data works > fine. But slowly, doing up to 100% more work for fully allocated images. > Simply retrying the syscall on EBUSY (like EINTR) does *not* work, > once it fails it keeps failing with the same error. > > I couldn't find anything related to EBUSY on fallocate, and it only > happens on LVM targets... Any idea or pointers where to look? Is this thin LV? This works for us using regular LVs. Which kernel? which distro? Nir
Re: [PATCH 2/2] block: file-posix: Replace posix_fallocate with fallocate
On Mon, Sep 14, 2020 at 8:32 PM Daniel P. Berrangé wrote: > > On Mon, Aug 31, 2020 at 05:01:27PM +0300, Nir Soffer wrote: > > If fallocate() is not supported, posix_fallocate() falls back to > > inefficient allocation, writing one byte for every 4k bytes[1]. This is > > very slow compared with writing zeros. In oVirt we measured ~400% > > improvement in allocation time when replacing posix_fallocate() with > > manually writing zeroes[2]. > > > > We also know that posix_fallocated() does not work well when using OFD > > locks[3]. We don't know the reason yet for this issue yet. > > > > Change preallocate_falloc() to use fallocate() instead of > > posix_falloate(), and fall back to full preallocation if not supported. > > > > Here are quick test results with this change. > > > > Before (qemu-img-5.1.0-2.fc32.x86_64): > > > > $ time qemu-img create -f raw -o preallocation=falloc /tmp/nfs3/test.raw 6g > > Formatting '/tmp/nfs3/test.raw', fmt=raw size=6442450944 > > preallocation=falloc > > > > real 0m42.100s > > user 0m0.602s > > sys 0m4.137s > > > > NFS stats: > > calls retransauthrefrshwrite > > 15715830 1572205 1571321 > > > > After: > > > > $ time ./qemu-img create -f raw -o preallocation=falloc /tmp/nfs3/test.raw > > 6g > > Formatting '/tmp/nfs3/test.raw', fmt=raw size=6442450944 > > preallocation=falloc > > > > real 0m15.551s > > user 0m0.070s > > sys 0m2.623s > > > > NFS stats: > > calls retransauthrefrshwrite > > 24620 0 24624 24567 > > > > [1] > > https://code.woboq.org/userspace/glibc/sysdeps/posix/posix_fallocate.c.html#96 > > [2] https://bugzilla.redhat.com/1850267#c25 > > [3] https://bugzilla.redhat.com/1851097 > > This bug appears to be private to RH employees only, so rather than link > to it, please summarize any important facts in it for benefit of nonn-RH > QEMU contributors. Thanks, I missed that detail when linking to the bug. The bug is public now. 
> > Signed-off-by: Nir Soffer > > --- > > block/file-posix.c | 32 +- > > docs/system/qemu-block-drivers.rst.inc | 11 + > > docs/tools/qemu-img.rst| 11 + > > qapi/block-core.json | 4 ++-- > > 4 files changed, 25 insertions(+), 33 deletions(-) > > Regards, > Daniel > -- > |: https://berrange.com -o-https://www.flickr.com/photos/dberrange :| > |: https://libvirt.org -o-https://fstop138.berrange.com :| > |: https://entangle-photo.org-o-https://www.instagram.com/dberrange :| >
Re: [PATCH 0/2] Replace posix_fallocate() with falloate()
On Mon, Aug 31, 2020 at 5:01 PM Nir Soffer wrote: > > Change preallocation=falloc to use fallocate() instead of > posix_fallocate(), improving performance when using legacy filesystems that do > not support fallocate, and avoiding issues seen with OFD locks. > > More work is needed to respect cache mode when using full preallocation > and maybe optimize buffer size. > > Continuing the discussion at: > https://lists.nongnu.org/archive/html/qemu-block/2020-08/msg00947.html > > Nir Soffer (2): > block: file-posix: Extract preallocate helpers > block: file-posix: Replace posix_fallocate with fallocate > > block/file-posix.c | 202 ++--- > docs/system/qemu-block-drivers.rst.inc | 11 +- > docs/tools/qemu-img.rst| 11 +- > qapi/block-core.json | 4 +- > 4 files changed, 127 insertions(+), 101 deletions(-) Ping
Re: [PATCH 1/2] block: file-posix: Extract preallocate helpers
On Tue, Sep 1, 2020 at 1:27 PM Alberto Garcia wrote: > > On Mon 31 Aug 2020 04:01:26 PM CEST, Nir Soffer wrote: > > +static int preallocate_falloc(int fd, int64_t current_length, int64_t > > offset, > > + Error **errp) > > +{ > > +#ifdef CONFIG_POSIX_FALLOCATE > > +int result; > > + > > +if (offset == current_length) > > +return 0; > > You can also take the chance to add the missing braces here (there's a > similar warning for the other patch). Sure, I'll change it in the next version. I forgot to run checkpatch.pl, and it also seems like extra work when using git publish.
[PATCH 2/2] block: file-posix: Replace posix_fallocate with fallocate
If fallocate() is not supported, posix_fallocate() falls back to inefficient allocation, writing one byte for every 4k bytes[1]. This is very slow compared with writing zeros. In oVirt we measured ~400% improvement in allocation time when replacing posix_fallocate() with manually writing zeroes[2]. We also know that posix_fallocated() does not work well when using OFD locks[3]. We don't know the reason yet for this issue yet. Change preallocate_falloc() to use fallocate() instead of posix_falloate(), and fall back to full preallocation if not supported. Here are quick test results with this change. Before (qemu-img-5.1.0-2.fc32.x86_64): $ time qemu-img create -f raw -o preallocation=falloc /tmp/nfs3/test.raw 6g Formatting '/tmp/nfs3/test.raw', fmt=raw size=6442450944 preallocation=falloc real 0m42.100s user 0m0.602s sys 0m4.137s NFS stats: calls retrans authrefrshwrite 1571583 0 1572205 1571321 After: $ time ./qemu-img create -f raw -o preallocation=falloc /tmp/nfs3/test.raw 6g Formatting '/tmp/nfs3/test.raw', fmt=raw size=6442450944 preallocation=falloc real 0m15.551s user 0m0.070s sys 0m2.623s NFS stats: calls retrans authrefrshwrite 24620 0 24624 24567 [1] https://code.woboq.org/userspace/glibc/sysdeps/posix/posix_fallocate.c.html#96 [2] https://bugzilla.redhat.com/1850267#c25 [3] https://bugzilla.redhat.com/1851097 Signed-off-by: Nir Soffer --- block/file-posix.c | 32 +- docs/system/qemu-block-drivers.rst.inc | 11 + docs/tools/qemu-img.rst| 11 + qapi/block-core.json | 4 ++-- 4 files changed, 25 insertions(+), 33 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 341ffb1cb4..eac3c0b412 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1835,36 +1835,24 @@ static int allocate_first_block(int fd, size_t max_size) static int preallocate_falloc(int fd, int64_t current_length, int64_t offset, Error **errp) { -#ifdef CONFIG_POSIX_FALLOCATE +#ifdef CONFIG_FALLOCATE int result; if (offset == current_length) return 0; -/* - * Truncating 
before posix_fallocate() makes it about twice slower on - * file systems that do not support fallocate(), trying to check if a - * block is allocated before allocating it, so don't do that here. - */ - -result = -posix_fallocate(fd, current_length, - offset - current_length); +result = do_fallocate(fd, 0, current_length, offset - current_length); if (result != 0) { -/* posix_fallocate() doesn't set errno. */ -error_setg_errno(errp, -result, - "Could not preallocate new data"); +error_setg_errno(errp, -result, "Could not preallocate new data"); return result; } if (current_length == 0) { /* - * posix_fallocate() uses fallocate() if the filesystem supports - * it, or fallback to manually writing zeroes. If fallocate() - * was used, unaligned reads from the fallocated area in - * raw_probe_alignment() will succeed, hence we need to allocate - * the first block. + * Unaligned reads from the fallocated area in raw_probe_alignment() + * will succeed, hence we need to allocate the first block. * - * Optimize future alignment probing; ignore failures. + * Optimizes future alignment probing; ignore failures. */ allocate_first_block(fd, offset); } @@ -1973,10 +1961,12 @@ static int handle_aiocb_truncate(void *opaque) } switch (prealloc) { -#ifdef CONFIG_POSIX_FALLOCATE +#ifdef CONFIG_FALLOCATE case PREALLOC_MODE_FALLOC: result = preallocate_falloc(fd, current_length, offset, errp); -goto out; +if (result != -ENOTSUP) +goto out; +/* If fallocate() is not supported, fallback to full preallocation. 
*/ #endif case PREALLOC_MODE_FULL: result = preallocate_full(fd, current_length, offset, errp); @@ -3080,7 +3070,7 @@ static QemuOptsList raw_create_opts = { .name = BLOCK_OPT_PREALLOC, .type = QEMU_OPT_STRING, .help = "Preallocation mode (allowed values: off" -#ifdef CONFIG_POSIX_FALLOCATE +#ifdef CONFIG_FALLOCATE ", falloc" #endif ", full)" diff --git a/docs/system/qemu-block-drivers.rst.inc b/docs/system/qemu-block-drivers.rst.inc index b052a6d14e..8e4acf397e 100644 --- a/docs/system/qemu-block-drivers.rst.inc +++ b/docs/system/qemu-block-drivers.rst.inc @@ -25,11 +25,12 @@ This section describes each format and the options that are supported for it. .. program:: raw .. option:: preallocation -Preallocation mode (allowed values:
[PATCH 1/2] block: file-posix: Extract preallocate helpers
handle_aiocb_truncate() was too big and complex, implementing 3 different preallocation modes. In a future patch I want to introduce a fallback from "falloc" to "full"; it will be too messy and error prone with the current code. Extract a helper for each of the preallocation modes (falloc, full, off) and leave only the common preparation and cleanup code in handle_aiocb_truncate(). Signed-off-by: Nir Soffer --- block/file-posix.c | 206 ++--- 1 file changed, 120 insertions(+), 86 deletions(-) diff --git a/block/file-posix.c b/block/file-posix.c index 9a00d4190a..341ffb1cb4 100644 --- a/block/file-posix.c +++ b/block/file-posix.c @@ -1832,12 +1832,128 @@ static int allocate_first_block(int fd, size_t max_size) return ret; } +static int preallocate_falloc(int fd, int64_t current_length, int64_t offset, + Error **errp) +{ +#ifdef CONFIG_POSIX_FALLOCATE +int result; + +if (offset == current_length) +return 0; + +/* + * Truncating before posix_fallocate() makes it about twice slower on + * file systems that do not support fallocate(), trying to check if a + * block is allocated before allocating it, so don't do that here. + */ + +result = -posix_fallocate(fd, current_length, + offset - current_length); +if (result != 0) { +/* posix_fallocate() doesn't set errno. */ +error_setg_errno(errp, -result, + "Could not preallocate new data"); +return result; +} + +if (current_length == 0) { +/* + * posix_fallocate() uses fallocate() if the filesystem supports + * it, or fallback to manually writing zeroes. If fallocate() + * was used, unaligned reads from the fallocated area in + * raw_probe_alignment() will succeed, hence we need to allocate + * the first block. + * + * Optimize future alignment probing; ignore failures. 
+ */ +allocate_first_block(fd, offset); +} + +return 0; +#else +return -ENOTSUP; +#endif +} + +static int preallocate_full(int fd, int64_t current_length, int64_t offset, +Error **errp) +{ +int64_t num = 0, left = offset - current_length; +off_t seek_result; +int result; +char *buf = NULL; + +/* + * Knowing the final size from the beginning could allow the file + * system driver to do less allocations and possibly avoid + * fragmentation of the file. + */ +if (ftruncate(fd, offset) != 0) { +result = -errno; +error_setg_errno(errp, -result, "Could not resize file"); +goto out; +} + +buf = g_malloc0(65536); + +seek_result = lseek(fd, current_length, SEEK_SET); +if (seek_result < 0) { +result = -errno; +error_setg_errno(errp, -result, + "Failed to seek to the old end of file"); +goto out; +} + +while (left > 0) { +num = MIN(left, 65536); +result = write(fd, buf, num); +if (result < 0) { +if (errno == EINTR) { +continue; +} +result = -errno; +error_setg_errno(errp, -result, + "Could not write zeros for preallocation"); +goto out; +} +left -= result; +} + +result = fsync(fd); +if (result < 0) { +result = -errno; +error_setg_errno(errp, -result, "Could not flush file to disk"); +goto out; +} + +out: +g_free(buf); + +return result; +} + +static int preallocate_off(int fd, int64_t current_length, int64_t offset, + Error **errp) +{ +if (ftruncate(fd, offset) != 0) { +int result = -errno; +error_setg_errno(errp, -result, "Could not resize file"); +return result; +} + +if (current_length == 0 && offset > current_length) { +/* Optimize future alignment probing; ignore failures. 
*/ +allocate_first_block(fd, offset); +} + +return 0; +} + static int handle_aiocb_truncate(void *opaque) { RawPosixAIOData *aiocb = opaque; int result = 0; int64_t current_length = 0; -char *buf = NULL; struct stat st; int fd = aiocb->aio_fildes; int64_t offset = aiocb->aio_offset; @@ -1859,95 +1975,14 @@ static int handle_aiocb_truncate(void *opaque) switch (prealloc) { #ifdef CONFIG_POSIX_FALLOCATE case PREALLOC_MODE_FALLOC: -/* - * Truncating before posix_fallocate() makes it about twice slower on - * file systems that do not support fallocate(), trying to check if a - * block is allocated before allocating it, so don't do that here. - */ -if (offset != current_length) { -re
[PATCH 0/2] Replace posix_fallocate() with falloate()
Change preallocation=falloc to use fallocate() instead of posix_fallocate(), improving performance when using legacy filesystems that do not support fallocate, and avoiding issues seen with OFD locks. More work is needed to respect cache mode when using full preallocation and maybe optimize buffer size. Continuing the discussion at: https://lists.nongnu.org/archive/html/qemu-block/2020-08/msg00947.html Nir Soffer (2): block: file-posix: Extract preallocate helpers block: file-posix: Replace posix_fallocate with fallocate block/file-posix.c | 202 ++--- docs/system/qemu-block-drivers.rst.inc | 11 +- docs/tools/qemu-img.rst| 11 +- qapi/block-core.json | 4 +- 4 files changed, 127 insertions(+), 101 deletions(-) -- 2.26.2