Re: [Qemu-block] [PATCH] qemu-img convert: Deprecate using -n and -o together

2019-08-09 Thread Nir Soffer
On Fri, Aug 9, 2019 at 12:11 PM Kevin Wolf  wrote:

> bdrv_create options specified with -o have no effect when skipping image
> creation with -n, so this doesn't make sense. Warn against the misuse
> and deprecate the combination so we can make it a hard error later.
>
> Signed-off-by: Kevin Wolf 
> ---
>  qemu-img.c   | 5 +
>  qemu-deprecated.texi | 7 +++
>  2 files changed, 12 insertions(+)
>
> diff --git a/qemu-img.c b/qemu-img.c
> index 79983772de..d9321f6418 100644
> --- a/qemu-img.c
> +++ b/qemu-img.c
> @@ -2231,6 +2231,11 @@ static int img_convert(int argc, char **argv)
>  goto fail_getopt;
>  }
>
> +if (skip_create && options) {
> +warn_report("-o has no effect when skipping image creation");
> +warn_report("This will become an error in future QEMU versions.");
> +}
> +
>  s.src_num = argc - optind - 1;
>  out_filename = s.src_num >= 1 ? argv[argc - 1] : NULL;
>
> diff --git a/qemu-deprecated.texi b/qemu-deprecated.texi
> index fff07bb2a3..7673d079c5 100644
> --- a/qemu-deprecated.texi
> +++ b/qemu-deprecated.texi
> @@ -305,6 +305,13 @@ to just export the entire image and then mount only
> /dev/nbd0p1 than
>  it is to reinvoke @command{qemu-nbd -c /dev/nbd0} limited to just a
>  subset of the image.
>
> +@subsection qemu-img convert -n -o (since 4.2.0)
> +
> +All options specified in @option{-o} are image creation options, so they
> +have no effect when used with @option{-n} to skip image creation. This
> +combination never made sense and shows that the user misunderstood the
> +effect of the options, so this will be made an error in future versions.
>

The user misunderstood by not reading qemu code?

Both the online help and the manual page do not mention anything about
that, so I think
they should be fixed to explain the behavior, and this text should mention
that the behavior
was never documented.

Nir


> +
>  @section Build system
>
>  @subsection Python 2 support (since 4.1.0)
> --
> 2.20.1
>
>
>


[Qemu-block] [PATCH] block/backup: install notifier during creation

2019-08-09 Thread John Snow
Backup jobs may yield prior to installing their handler, because of the
job_co_entry shim which guarantees that a job won't begin work until
we are ready to start an entire transaction.

Unfortunately, this makes proving correctness about transactional
points-in-time for backup hard to reason about. Make it explicitly clear
by moving the handler registration to creation time, and changing the
write notifier to a no-op until the job is started.

Reported-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: John Snow 
---
 block/backup.c | 32 +++-
 include/qemu/job.h |  5 +
 job.c  |  2 +-
 3 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 07d751aea4..4df5b95415 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -344,6 +344,13 @@ static int coroutine_fn backup_before_write_notify(
 assert(QEMU_IS_ALIGNED(req->offset, BDRV_SECTOR_SIZE));
 assert(QEMU_IS_ALIGNED(req->bytes, BDRV_SECTOR_SIZE));
 
+/* The handler is installed at creation time; the actual point-in-time
+ * starts at job_start(). Transactions guarantee those two points are
+ * the same point in time. */
+if (!job_started(&job->common.job)) {
+return 0;
+}
+
 return backup_do_cow(job, req->offset, req->bytes, NULL, true);
 }
 
@@ -398,6 +405,12 @@ static void backup_clean(Job *job)
 BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
 BlockDriverState *bs = blk_bs(s->common.blk);
 
+/* cancelled before job_start: remove write_notifier */
+if (s->before_write.notify) {
+notifier_with_return_remove(&s->before_write);
+s->before_write.notify = NULL;
+}
+
 if (s->copy_bitmap) {
 bdrv_release_dirty_bitmap(bs, s->copy_bitmap);
 s->copy_bitmap = NULL;
@@ -527,17 +540,8 @@ static void backup_init_copy_bitmap(BackupBlockJob *job)
 static int coroutine_fn backup_run(Job *job, Error **errp)
 {
 BackupBlockJob *s = container_of(job, BackupBlockJob, common.job);
-BlockDriverState *bs = blk_bs(s->common.blk);
 int ret = 0;
 
-QLIST_INIT(&s->inflight_reqs);
-qemu_co_rwlock_init(&s->flush_rwlock);
-
-backup_init_copy_bitmap(s);
-
-s->before_write.notify = backup_before_write_notify;
-bdrv_add_before_write_notifier(bs, &s->before_write);
-
 if (s->sync_mode == MIRROR_SYNC_MODE_TOP) {
 int64_t offset = 0;
 int64_t count;
@@ -572,6 +576,7 @@ static int coroutine_fn backup_run(Job *job, Error **errp)
 
  out:
 notifier_with_return_remove(&s->before_write);
+s->before_write.notify = NULL;
 
 /* wait until pending backup_do_cow() calls have completed */
 qemu_co_rwlock_wrlock(&s->flush_rwlock);
@@ -767,6 +772,15 @@ BlockJob *backup_job_create(const char *job_id, 
BlockDriverState *bs,
&error_abort);
 job->len = len;
 
+/* Finally, install a write notifier that takes effect after job_start() */
+backup_init_copy_bitmap(job);
+
+QLIST_INIT(&job->inflight_reqs);
+qemu_co_rwlock_init(&job->flush_rwlock);
+
+job->before_write.notify = backup_before_write_notify;
+bdrv_add_before_write_notifier(bs, &job->before_write);
+
 return &job->common;
 
  error:
diff --git a/include/qemu/job.h b/include/qemu/job.h
index 9e7cd1e4a0..733afb696b 100644
--- a/include/qemu/job.h
+++ b/include/qemu/job.h
@@ -394,6 +394,11 @@ void job_enter_cond(Job *job, bool(*fn)(Job *job));
  */
 void job_start(Job *job);
 
+/**
+ * job_started returns true if the @job has started.
+ */
+bool job_started(Job *job);
+
 /**
  * @job: The job to enter.
  *
diff --git a/job.c b/job.c
index 28dd48f8a5..745af659ff 100644
--- a/job.c
+++ b/job.c
@@ -243,7 +243,7 @@ bool job_is_completed(Job *job)
 return false;
 }
 
-static bool job_started(Job *job)
+bool job_started(Job *job)
 {
 return job->co;
 }
-- 
2.21.0




Re: [Qemu-block] [Qemu-devel] backup bug or question

2019-08-09 Thread John Snow



On 8/9/19 9:18 AM, Vladimir Sementsov-Ogievskiy wrote:
> Hi!
> 
> Hmm, hacking around backup I have a question:
> 
> What prevents guest write request after job_start but before setting
> write notifier?
> 
> code path:
> 
> qmp_drive_backup or transaction with backup
> 
> job_start
>aio_co_enter(job_co_entry) /* may only schedule execution, isn't it ? 
> */
> 
> 
> 
> job_co_entry
> job_pause_point() /* it definitely yields, isn't it bad? */
> job->driver->run() /* backup_run */
> 
> 
> 
> backup_run()
> bdrv_add_before_write_notifier()
> 
> ...
> 

I think you're right... :(


We create jobs like this:

job->paused= true;
job->pause_count   = 1;


And then job_start does this:

job->co = qemu_coroutine_create(job_co_entry, job);
job->pause_count--;
job->busy = true;
job->paused = false;


Which means that job_co_entry is being called before we lift the pause:

assert(job && job->driver && job->driver->run);
job_pause_point(job);
job->ret = job->driver->run(job, &job->err);

...Which means that we are definitely yielding in job_pause_point.

Yeah, that's a race condition waiting to happen.

> And what guarantees we give to the user? Is it guaranteed that write notifier 
> is
> set when qmp command returns?
> 
> And I guess, if we start several backups in a transaction it should be 
> guaranteed
> that the set of backups is consistent and correspond to one point in time...
> 

I would have hoped that maybe the drain_all coupled with the individual
jobs taking drain_start and drain_end would save us, but I guess we
simply don't have a guarantee that all backup jobs WILL have installed
their handler by the time the transaction ends.

Or, if there is that guarantee, I don't know what provides it, so I
think we shouldn't count on it accidentally working anymore.



I think we should do two things:

1. Move the handler installation to creation time.
2. Modify backup_before_write_notify to return without invoking
backup_do_cow if the job isn't started yet.

I'll send a patch in just a moment ...

--js



Re: [Qemu-block] [PATCH v2 00/11] block: Fix some things about bdrv_has_zero_init()

2019-08-09 Thread Max Reitz
On 24.07.19 19:12, Max Reitz wrote:
> Hi,
> 
> See the previous cover letter for the reason for patches 6 through 9:
> https://lists.nongnu.org/archive/html/qemu-block/2019-07/msg00563.html
> 
> But no only some bdrv_has_zero_init() implementations are wrong, some
> callers also use it the wrong way.
> 
> First, qemu-img and mirror use it for pre-existing images, where it
> doesn’t have any meaning.  Both should consider pre-existing images to
> always be non-zero and not look at bdrv_has-zero_init() (patches 1, 2,
> and the tests in 10 and 11).
> 
> Second, vhdx and parallels call bdrv_has_zero_init() when they do not
> really care about an image’s post-create state but only about what
> happens when you grow an image.  That is a bit ugly, and also overly
> safe when growing preallocated images without preallocating the new
> areas.  So this series adds a new function bdrv_has_zero_init_truncate()
> that is more suited to vhdx's and parallel's needs (patches 3 through
> 5).

Thanks for the reviews, I took a part of this last paragraph, added it
as patch 5’s commit message, and applied the series to my block-next branch.

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [Qemu-devel] [PATCH v2 09/11] iotests: Convert to preallocated encrypted qcow2

2019-08-09 Thread Max Reitz
On 25.07.19 18:27, Max Reitz wrote:
> On 25.07.19 17:30, Maxim Levitsky wrote:
>> On Wed, 2019-07-24 at 19:12 +0200, Max Reitz wrote:
>>> Add a test case for converting an empty image (which only returns zeroes
>>> when read) to a preallocated encrypted qcow2 image.
>>> qcow2_has_zero_init() should return 0 then, thus forcing qemu-img
>>> convert to create zero clusters.
>>>
>>> Signed-off-by: Max Reitz 
>>> Acked-by: Stefano Garzarella 
>>> Tested-by: Stefano Garzarella 
>>> ---
>>>  tests/qemu-iotests/188 | 20 +++-
>>>  tests/qemu-iotests/188.out |  4 
>>>  2 files changed, 23 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/tests/qemu-iotests/188 b/tests/qemu-iotests/188
>>> index be7278aa65..afca44df54 100755
>>> --- a/tests/qemu-iotests/188
>>> +++ b/tests/qemu-iotests/188
>>> @@ -48,7 +48,7 @@ SECRETALT="secret,id=sec0,data=platypus"
>>>  
>>>  _make_test_img --object $SECRET -o 
>>> "encrypt.format=luks,encrypt.key-secret=sec0,encrypt.iter-time=10" $size
>>>  
>>> -IMGSPEC="driver=$IMGFMT,file.filename=$TEST_IMG,encrypt.key-secret=sec0"
>>> +IMGSPEC="driver=$IMGFMT,encrypt.key-secret=sec0,file.filename=$TEST_IMG"
>> This change I think doesn't change anything

Just noticed now: Yes, it does; it puts the TEST_IMG at end so we can
append to it...

[...]

>>> +
>>> +$QEMU_IMG convert -O "$IMGFMT" --object $SECRET \
>>> +-o 
>>> "encrypt.format=luks,encrypt.key-secret=sec0,encrypt.iter-time=10,preallocation=metadata"
>>>  \
>>> +"${TEST_IMG}.orig" "$TEST_IMG"
>>> +
>>> +$QEMU_IMG compare --object $SECRET --image-opts "${IMGSPEC}.orig" 
>>> "$IMGSPEC"

...right here.

Max



signature.asc
Description: OpenPGP digital signature


[Qemu-block] [PATCH] iotests: Fix 141 when run with qed

2019-08-09 Thread Max Reitz
69f47505ee has changed qcow2 in such a way that the commit job run in
test 141 (and 144[1]) returns before it emits the READY event.  However,
141 also runs with qed, where the order is still the other way around.
Just filter out the {"return": {}} so the test passes for qed again.

[1] 144 only runs with qcow2, so it is fine as it is.

Suggested-by: Vladimir Sementsov-Ogievskiy 
Fixes: 69f47505ee66afaa513305de0c1895a224e52c45
Signed-off-by: Max Reitz 
---
 tests/qemu-iotests/141   | 9 +++--
 tests/qemu-iotests/141.out   | 5 -
 tests/qemu-iotests/common.filter | 5 +
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/tests/qemu-iotests/141 b/tests/qemu-iotests/141
index 2197a82d45..8c2ae79f2b 100755
--- a/tests/qemu-iotests/141
+++ b/tests/qemu-iotests/141
@@ -58,16 +58,21 @@ test_blockjob()
   }}}" \
 'return'
 
+# If "$2" is an event, we may or may not see it before the
+# {"return": {}}.  Therefore, filter the {"return": {}} out both
+# here and in the next command.  (Naturally, if we do not see it
+# here, we will see it before the next command can be executed,
+# so it will appear in the next _send_qemu_cmd's output.)
 _send_qemu_cmd $QEMU_HANDLE \
 "$1" \
 "$2" \
-| _filter_img_create
+| _filter_img_create | _filter_qmp_empty_return
 
 # We want this to return an error because the block job is still running
 _send_qemu_cmd $QEMU_HANDLE \
 "{'execute': 'blockdev-del',
   'arguments': {'node-name': 'drv0'}}" \
-'error' | _filter_generated_node_ids
+'error' | _filter_generated_node_ids | _filter_qmp_empty_return
 
 _send_qemu_cmd $QEMU_HANDLE \
 "{'execute': 'block-job-cancel',
diff --git a/tests/qemu-iotests/141.out b/tests/qemu-iotests/141.out
index 4d71d9dcae..dbd3bdef6c 100644
--- a/tests/qemu-iotests/141.out
+++ b/tests/qemu-iotests/141.out
@@ -10,7 +10,6 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 
backing_file=TEST_DIR/m.
 Formatting 'TEST_DIR/o.IMGFMT', fmt=IMGFMT size=1048576 
backing_file=TEST_DIR/t.IMGFMT backing_fmt=IMGFMT
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job0"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job0"}}
-{"return": {}}
 {"error": {"class": "GenericError", "desc": "Node drv0 is in use"}}
 {"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "aborting", "id": "job0"}}
@@ -27,7 +26,6 @@ Formatting 'TEST_DIR/o.IMGFMT', fmt=IMGFMT size=1048576 
backing_file=TEST_DIR/t.
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job0"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "job0"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"BLOCK_JOB_READY", "data": {"device": "job0", "len": 0, "offset": 0, "speed": 
0, "type": "mirror"}}
-{"return": {}}
 {"error": {"class": "GenericError", "desc": "Node 'drv0' is busy: block device 
is in use by block job: mirror"}}
 {"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "waiting", "id": "job0"}}
@@ -42,7 +40,6 @@ Formatting 'TEST_DIR/o.IMGFMT', fmt=IMGFMT size=1048576 
backing_file=TEST_DIR/t.
 {"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job0"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job0"}}
-{"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "ready", "id": "job0"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"BLOCK_JOB_READY", "data": {"device": "job0", "len": 0, "offset": 0, "speed": 
0, "type": "commit"}}
 {"error": {"class": "GenericError", "desc": "Node 'drv0' is busy: block device 
is in use by block job: commit"}}
@@ -61,7 +58,6 @@ wrote 1048576/1048576 bytes at offset 0
 {"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "created", "id": "job0"}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "running", "id": "job0"}}
-{"return": {}}
 {"error": {"class": "GenericError", "desc": "Node drv0 is in use"}}
 {"return": {}}
 {"timestamp": {"seconds":  TIMESTAMP, "microseconds":  TIMESTAMP}, "event": 
"JOB_STATUS_CHANGE", "data": {"status": "aborting", "id": "j

Re: [Qemu-block] [Qemu-devel] [PATCH v6 06/42] qcow2: Implement .bdrv_storage_child()

2019-08-09 Thread Eric Blake
On 8/9/19 11:13 AM, Max Reitz wrote:
> Signed-off-by: Max Reitz 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/qcow2.c | 9 +
>  1 file changed, 9 insertions(+)
> 

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [Qemu-devel] [PATCH v6 05/42] block: Add chain helper functions

2019-08-09 Thread Eric Blake
On 8/9/19 11:13 AM, Max Reitz wrote:
> Add some helper functions for skipping filters in a chain of block
> nodes.
> 
> Signed-off-by: Max Reitz 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 
> ---
>  include/block/block_int.h |  3 +++
>  block.c   | 55 +++
>  2 files changed, 58 insertions(+)

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [Qemu-devel] [PATCH v6 04/42] block: Add child access functions

2019-08-09 Thread Eric Blake
On 8/9/19 11:13 AM, Max Reitz wrote:
> There are BDS children that the general block layer code can access,
> namely bs->file and bs->backing.  Since the introduction of filters and
> external data files, their meaning is not quite clear.  bs->backing can
> be a COW source, or it can be an R/W-filtered child; bs->file can be an
> R/W-filtered child, it can be data and metadata storage, or it can be
> just metadata storage.
> 
> This overloading really is not helpful.  This patch adds function that
> retrieve the correct child for each exact purpose.  Later patches in
> this series will make use of them.  Doing so will allow us to handle
> filter nodes and external data files in a meaningful way.
> 
> Signed-off-by: Max Reitz 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 
> ---
>  include/block/block_int.h | 57 --
>  block.c   | 99 +++
>  2 files changed, 153 insertions(+), 3 deletions(-)
> 

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 4/7] block/backup: drop handling of max_transfer for copy_range

2019-08-09 Thread Max Reitz
On 09.08.19 17:32, Vladimir Sementsov-Ogievskiy wrote:
> Since previous commit, copy_range supports max_transfer, so we don't
> need to handle it by hand.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/backup.c | 11 ++-
>  1 file changed, 2 insertions(+), 9 deletions(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 3/7] block/io: handle alignment and max_transfer for copy_range

2019-08-09 Thread Max Reitz
On 09.08.19 17:32, Vladimir Sementsov-Ogievskiy wrote:
> copy_range ignores these limitations, let's improve it.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/io.c | 48 
>  1 file changed, 40 insertions(+), 8 deletions(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


[Qemu-block] [PATCH v6 42/42] iotests: Test committing to overridden backing

2019-08-09 Thread Max Reitz
Signed-off-by: Max Reitz 
---
 tests/qemu-iotests/040 | 61 ++
 tests/qemu-iotests/040.out |  4 +--
 2 files changed, 63 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/040 b/tests/qemu-iotests/040
index a0a0db8889..558fdb9a09 100755
--- a/tests/qemu-iotests/040
+++ b/tests/qemu-iotests/040
@@ -605,5 +605,66 @@ class TestCommitWithFilters(iotests.QMPTestCase):
 # 3 has been comitted into 2
 self.pattern_files[3] = self.img2
 
+class TestCommitWithOverriddenBacking(iotests.QMPTestCase):
+img_base_a = os.path.join(iotests.test_dir, 'base_a.img')
+img_base_b = os.path.join(iotests.test_dir, 'base_b.img')
+img_top = os.path.join(iotests.test_dir, 'top.img')
+
+def setUp(self):
+qemu_img('create', '-f', iotests.imgfmt, self.img_base_a, '1M')
+qemu_img('create', '-f', iotests.imgfmt, self.img_base_b, '1M')
+qemu_img('create', '-f', iotests.imgfmt, '-b', self.img_base_a, \
+ self.img_top)
+
+self.vm = iotests.VM()
+self.vm.launch()
+
+# Use base_b instead of base_a as the backing of top
+result = self.vm.qmp('blockdev-add', **{
+'node-name': 'top',
+'driver': iotests.imgfmt,
+'file': {
+'driver': 'file',
+'filename': self.img_top
+},
+'backing': {
+'node-name': 'base',
+'driver': iotests.imgfmt,
+'file': {
+'driver': 'file',
+'filename': self.img_base_b
+}
+}
+})
+self.assert_qmp(result, 'return', {})
+
+def tearDown(self):
+self.vm.shutdown()
+os.remove(self.img_top)
+os.remove(self.img_base_a)
+os.remove(self.img_base_b)
+
+def test_commit_to_a(self):
+# Try committing to base_a (which should fail, as top's
+# backing image is base_b instead)
+result = self.vm.qmp('block-commit',
+ job_id='commit',
+ device='top',
+ base=self.img_base_a)
+self.assert_qmp(result, 'error/class', 'GenericError')
+
+def test_commit_to_b(self):
+# Try committing to base_b (which should work, since that is
+# actually top's backing image)
+result = self.vm.qmp('block-commit',
+ job_id='commit',
+ device='top',
+ base=self.img_base_b)
+self.assert_qmp(result, 'return', {})
+
+self.vm.event_wait('BLOCK_JOB_READY')
+self.vm.qmp('block-job-complete', device='commit')
+self.vm.event_wait('BLOCK_JOB_COMPLETED')
+
 if __name__ == '__main__':
 iotests.main(supported_fmts=['qcow2', 'qed'])
diff --git a/tests/qemu-iotests/040.out b/tests/qemu-iotests/040.out
index fe58934d7a..499af0e2ff 100644
--- a/tests/qemu-iotests/040.out
+++ b/tests/qemu-iotests/040.out
@@ -1,5 +1,5 @@
-...
+.
 --
-Ran 51 tests
+Ran 53 tests
 
 OK
-- 
2.21.0




[Qemu-block] [PATCH v6 41/42] iotests: Add test for commit in sub directory

2019-08-09 Thread Max Reitz
Add a test for committing an overlay in a sub directory to one of the
images in its backing chain, using both relative and absolute filenames.

Signed-off-by: Max Reitz 
---
 tests/qemu-iotests/020 | 36 
 tests/qemu-iotests/020.out | 10 ++
 2 files changed, 46 insertions(+)

diff --git a/tests/qemu-iotests/020 b/tests/qemu-iotests/020
index 6b0ebb37d2..94633c3a32 100755
--- a/tests/qemu-iotests/020
+++ b/tests/qemu-iotests/020
@@ -31,6 +31,11 @@ _cleanup()
_cleanup_test_img
 rm -f "$TEST_IMG.base"
 rm -f "$TEST_IMG.orig"
+
+rm -f "$TEST_DIR/subdir/t.$IMGFMT.base"
+rm -f "$TEST_DIR/subdir/t.$IMGFMT.mid"
+rm -f "$TEST_DIR/subdir/t.$IMGFMT"
+rmdir "$TEST_DIR/subdir" &> /dev/null
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
@@ -133,6 +138,37 @@ $QEMU_IO -c 'writev 0 64k' "$TEST_IMG" | _filter_qemu_io
 $QEMU_IMG commit "$TEST_IMG"
 _cleanup
 
+
+echo
+echo 'Testing commit in sub-directory with relative filenames'
+echo
+
+pushd "$TEST_DIR" > /dev/null
+
+mkdir subdir
+
+TEST_IMG="subdir/t.$IMGFMT.base" _make_test_img 1M
+TEST_IMG="subdir/t.$IMGFMT.mid" _make_test_img -b "t.$IMGFMT.base"
+TEST_IMG="subdir/t.$IMGFMT" _make_test_img -b "t.$IMGFMT.mid"
+
+# Should work
+$QEMU_IMG commit -b "t.$IMGFMT.mid" "subdir/t.$IMGFMT"
+
+# Might theoretically work, but does not in practice (we have to
+# decide between this and the above; and since we always represent
+# backing file names as relative to the overlay, we go for the above)
+$QEMU_IMG commit -b "subdir/t.$IMGFMT.mid" "subdir/t.$IMGFMT" 2>&1 | \
+_filter_imgfmt
+
+# This should work as well
+$QEMU_IMG commit -b "$TEST_DIR/subdir/t.$IMGFMT.mid" "subdir/t.$IMGFMT"
+
+popd > /dev/null
+
+# Now let's try with just absolute filenames
+$QEMU_IMG commit -b "$TEST_DIR/subdir/t.$IMGFMT.mid" \
+"$TEST_DIR/subdir/t.$IMGFMT"
+
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/020.out b/tests/qemu-iotests/020.out
index 4b722b2dd0..228c37dded 100644
--- a/tests/qemu-iotests/020.out
+++ b/tests/qemu-iotests/020.out
@@ -1094,4 +1094,14 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=1048576 
backing_file=json:{'driv
 wrote 65536/65536 bytes at offset 0
 64 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 qemu-img: Block job failed: No space left on device
+
+Testing commit in sub-directory with relative filenames
+
+Formatting 'subdir/t.IMGFMT.base', fmt=IMGFMT size=1048576
+Formatting 'subdir/t.IMGFMT.mid', fmt=IMGFMT size=1048576 
backing_file=t.IMGFMT.base
+Formatting 'subdir/t.IMGFMT', fmt=IMGFMT size=1048576 backing_file=t.IMGFMT.mid
+Image committed.
+qemu-img: Did not find 'subdir/t.IMGFMT.mid' in the backing chain of 
'subdir/t.IMGFMT'
+Image committed.
+Image committed.
 *** done
-- 
2.21.0




[Qemu-block] [PATCH v6 40/42] iotests: Add filter mirror test cases

2019-08-09 Thread Max Reitz
This patch adds some test cases how mirroring relates to filters.  One
of them tests what happens when you mirror off a filtered COW node, two
others use the mirror filter node as basically our only example of an
implicitly created filter node so far (besides the commit filter).

Signed-off-by: Max Reitz 
---
 tests/qemu-iotests/041 | 146 -
 tests/qemu-iotests/041.out |   4 +-
 2 files changed, 147 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041
index 0c1432f189..c2b5299f62 100755
--- a/tests/qemu-iotests/041
+++ b/tests/qemu-iotests/041
@@ -20,8 +20,9 @@
 
 import time
 import os
+import json
 import iotests
-from iotests import qemu_img, qemu_io
+from iotests import qemu_img, qemu_img_pipe, qemu_io
 
 backing_img = os.path.join(iotests.test_dir, 'backing.img')
 target_backing_img = os.path.join(iotests.test_dir, 'target-backing.img')
@@ -1191,5 +1192,148 @@ class TestReplaces(iotests.QMPTestCase):
 os.remove(test_img)
 os.remove(target_img)
 
+# Tests for mirror with filters (and how the mirror filter behaves, as
+# an example for an implicit filter)
+class TestFilters(iotests.QMPTestCase):
+def setUp(self):
+qemu_img('create', '-f', iotests.imgfmt, backing_img, '1M')
+qemu_img('create', '-f', iotests.imgfmt, '-b', backing_img, test_img)
+qemu_img('create', '-f', iotests.imgfmt, '-b', backing_img, target_img)
+
+qemu_io('-c', 'write -P 1 0 512k', backing_img)
+qemu_io('-c', 'write -P 2 512k 512k', test_img)
+
+self.vm = iotests.VM()
+self.vm.launch()
+
+result = self.vm.qmp('blockdev-add', **{
+'node-name': 'target',
+'driver': iotests.imgfmt,
+'file': {
+'driver': 'file',
+'filename': target_img
+},
+'backing': None
+})
+self.assert_qmp(result, 'return', {})
+
+self.filterless_chain = {
+'node-name': 'source',
+'driver': iotests.imgfmt,
+'file': {
+'driver': 'file',
+'filename': test_img
+},
+'backing': {
+'node-name': 'backing',
+'driver': iotests.imgfmt,
+'file': {
+'driver': 'file',
+'filename': backing_img
+}
+}
+}
+
+def tearDown(self):
+self.vm.shutdown()
+
+os.remove(test_img)
+os.remove(target_img)
+os.remove(backing_img)
+
+def test_cor(self):
+result = self.vm.qmp('blockdev-add', **{
+'node-name': 'filter',
+'driver': 'copy-on-read',
+'file': self.filterless_chain
+})
+self.assert_qmp(result, 'return', {})
+
+result = self.vm.qmp('blockdev-mirror',
+ job_id='mirror',
+ device='filter',
+ target='target',
+ sync='top')
+self.assert_qmp(result, 'return', {})
+
+self.complete_and_wait('mirror')
+
+self.vm.qmp('blockdev-del', node_name='target')
+
+target_map = qemu_img_pipe('map', '--output=json', target_img)
+target_map = json.loads(target_map)
+
+assert target_map[0]['start'] == 0
+assert target_map[0]['length'] == 512 * 1024
+assert target_map[0]['depth'] == 1
+
+assert target_map[1]['start'] == 512 * 1024
+assert target_map[1]['length'] == 512 * 1024
+assert target_map[1]['depth'] == 0
+
+def test_implicit_mirror_filter(self):
+result = self.vm.qmp('blockdev-add', **self.filterless_chain)
+self.assert_qmp(result, 'return', {})
+
+# We need this so we can query from above the mirror node
+result = self.vm.qmp('device_add',
+ driver='virtio-blk',
+ id='virtio',
+ bus='pci.0',
+ drive='source')
+self.assert_qmp(result, 'return', {})
+
+result = self.vm.qmp('blockdev-mirror',
+ job_id='mirror',
+ device='source',
+ target='target',
+ sync='top')
+self.assert_qmp(result, 'return', {})
+
+# The mirror filter is now an implicit node, so it should be
+# invisible when querying the backing chain
+device_info = self.vm.qmp('query-block')['return'][0]
+assert device_info['qdev'] == 
'/machine/peripheral/vir

[Qemu-block] [PATCH v6 39/42] iotests: Add filter commit test cases

2019-08-09 Thread Max Reitz
This patch adds some tests on how commit copes with filter nodes.

Signed-off-by: Max Reitz 
---
 tests/qemu-iotests/040 | 177 +
 tests/qemu-iotests/040.out |   4 +-
 2 files changed, 179 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/040 b/tests/qemu-iotests/040
index 6db9abf8e6..a0a0db8889 100755
--- a/tests/qemu-iotests/040
+++ b/tests/qemu-iotests/040
@@ -428,5 +428,182 @@ class TestReopenOverlay(ImageCommitTestCase):
 def test_reopen_overlay(self):
 self.run_commit_test(self.img1, self.img0)
 
+class TestCommitWithFilters(iotests.QMPTestCase):
+img0 = os.path.join(iotests.test_dir, '0.img')
+img1 = os.path.join(iotests.test_dir, '1.img')
+img2 = os.path.join(iotests.test_dir, '2.img')
+img3 = os.path.join(iotests.test_dir, '3.img')
+
+def setUp(self):
+qemu_img('create', '-f', iotests.imgfmt, self.img0, '64M')
+qemu_img('create', '-f', iotests.imgfmt, self.img1, '64M')
+qemu_img('create', '-f', iotests.imgfmt, self.img2, '64M')
+qemu_img('create', '-f', iotests.imgfmt, self.img3, '64M')
+
+qemu_io('-f', iotests.imgfmt, '-c', 'write -P 1 0M 1M', self.img0)
+qemu_io('-f', iotests.imgfmt, '-c', 'write -P 2 1M 1M', self.img1)
+qemu_io('-f', iotests.imgfmt, '-c', 'write -P 3 2M 1M', self.img2)
+qemu_io('-f', iotests.imgfmt, '-c', 'write -P 4 3M 1M', self.img3)
+
+# Distributions of the patterns in the files; this is checked
+# by tearDown() and should be changed by the test cases as is
+# necessary
+self.pattern_files = [self.img0, self.img1, self.img2, self.img3]
+
+self.vm = iotests.VM()
+self.vm.launch()
+self.has_quit = False
+
+result = self.vm.qmp('object-add', qom_type='throttle-group', id='tg')
+self.assert_qmp(result, 'return', {})
+
+result = self.vm.qmp('blockdev-add', **{
+'node-name': 'top-filter',
+'driver': 'throttle',
+'throttle-group': 'tg',
+'file': {
+'node-name': 'cow-3',
+'driver': iotests.imgfmt,
+'file': {
+'driver': 'file',
+'filename': self.img3
+},
+'backing': {
+'node-name': 'cow-2',
+'driver': iotests.imgfmt,
+'file': {
+'driver': 'file',
+'filename': self.img2
+},
+'backing': {
+'node-name': 'cow-1',
+'driver': iotests.imgfmt,
+'file': {
+'driver': 'file',
+'filename': self.img1
+},
+'backing': {
+'node-name': 'bottom-filter',
+'driver': 'throttle',
+'throttle-group': 'tg',
+'file': {
+'node-name': 'cow-0',
+'driver': iotests.imgfmt,
+'file': {
+'driver': 'file',
+'filename': self.img0
+}
+}
+}
+}
+}
+}
+})
+self.assert_qmp(result, 'return', {})
+
+def tearDown(self):
+self.vm.shutdown(has_quit=self.has_quit)
+
+for index in range(len(self.pattern_files)):
+result = qemu_io('-f', iotests.imgfmt,
+ '-c', 'read -P %i %iM 1M' % (index + 1, index),
+ self.pattern_files[index])
+self.assertFalse('Pattern verification failed' in result)
+
+os.remove(self.img3)
+os.remove(self.img2)
+os.remove(self.img1)
+os.remove(self.img0)
+
+# Filters make for funny filenames, so we cannot just use
+# self.imgX to get them
+def get_filename(self, node):
+return self.vm.node_info(node)['image']['filename']
+
+def test_filterless_commit(self):
+self.assert_no_active_block_jobs()
+result = self.vm.qmp('block-commit',
+ job_id='commit',
+ device='top-filter',
+ top_node='cow-2',
+ base_node='cow-1')
+self.assert_qmp(result, 'return', {})
+self.wait_until_completed(drive='commit')
+
+self.assertIsNotNone(self.vm.node_info('cow-3'))
+self.assertIsNone(self.vm.node_info('cow-2'))
+self.assertIsNo

[Qemu-block] [PATCH v6 33/42] blockdev: Fix active commit choice

2019-08-09 Thread Max Reitz
We have to perform an active commit whenever the top node has a parent
that has taken the WRITE permission on it.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 blockdev.c | 24 +---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index ee8b951154..4e72f6f701 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3356,6 +3356,7 @@ void qmp_block_commit(bool has_job_id, const char 
*job_id, const char *device,
  */
 BlockdevOnError on_error = BLOCKDEV_ON_ERROR_REPORT;
 int job_flags = JOB_DEFAULT;
+uint64_t top_perm, top_shared;
 
 if (!has_speed) {
 speed = 0;
@@ -3468,14 +3469,31 @@ void qmp_block_commit(bool has_job_id, const char 
*job_id, const char *device,
 goto out;
 }
 
-if (top_bs == bs) {
+/*
+ * Active commit is required if and only if someone has taken a
+ * WRITE permission on the top node.  Historically, we have always
+ * used active commit for top nodes, so continue that practice.
+ * (Active commit is never really wrong.)
+ */
+bdrv_get_cumulative_perm(top_bs, &top_perm, &top_shared);
+if (top_perm & BLK_PERM_WRITE ||
+bdrv_skip_rw_filters(top_bs) == bdrv_skip_rw_filters(bs))
+{
 if (has_backing_file) {
 error_setg(errp, "'backing-file' specified,"
  " but 'top' is the active layer");
 goto out;
 }
-commit_active_start(has_job_id ? job_id : NULL, bs, base_bs,
-job_flags, speed, on_error,
+if (!has_job_id) {
+/*
+ * Emulate here what block_job_create() does, because it
+ * is possible that @bs != @top_bs (the block job should
+ * be named after @bs, even if @top_bs is the actual
+ * source)
+ */
+job_id = bdrv_get_device_name(bs);
+}
+commit_active_start(job_id, top_bs, base_bs, job_flags, speed, 
on_error,
 filter_node_name, NULL, NULL, false, &local_err);
 } else {
 BlockDriverState *overlay_bs = bdrv_find_overlay(bs, top_bs);
-- 
2.21.0




[Qemu-block] [PATCH v6 38/42] iotests: Let complete_and_wait() work with commit

2019-08-09 Thread Max Reitz
complete_and_wait() and wait_ready() currently only work for mirror
jobs.  Let them work for active commit jobs, too.

Signed-off-by: Max Reitz 
---
 tests/qemu-iotests/iotests.py | 10 +++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/iotests.py b/tests/qemu-iotests/iotests.py
index 84438e837c..3ef846d1dc 100644
--- a/tests/qemu-iotests/iotests.py
+++ b/tests/qemu-iotests/iotests.py
@@ -761,8 +761,12 @@ class QMPTestCase(unittest.TestCase):
 
 def wait_ready(self, drive='drive0'):
 '''Wait until a block job BLOCK_JOB_READY event'''
-f = {'data': {'type': 'mirror', 'device': drive } }
-event = self.vm.event_wait(name='BLOCK_JOB_READY', match=f)
+event = self.vm.events_wait([
+('BLOCK_JOB_READY',
+ {'data': {'type': 'mirror', 'device': drive } }),
+('BLOCK_JOB_READY',
+ {'data': {'type': 'commit', 'device': drive } })
+])
 
 def wait_ready_and_cancel(self, drive='drive0'):
 self.wait_ready(drive=drive)
@@ -780,7 +784,7 @@ class QMPTestCase(unittest.TestCase):
 self.assert_qmp(result, 'return', {})
 
 event = self.wait_until_completed(drive=drive)
-self.assert_qmp(event, 'data/type', 'mirror')
+self.assertTrue(event['data']['type'] in ['mirror', 'commit'])
 
 def pause_wait(self, job_id='job0'):
 with Timeout(1, "Timeout waiting for job to pause"):
-- 
2.21.0




[Qemu-block] [PATCH v6 35/42] block: Fix check_to_replace_node()

2019-08-09 Thread Max Reitz
Currently, check_to_replace_node() only allows mirror to replace a node
in the chain of the source node, and only if it is the first non-filter
node below the source.  Well, technically, the idea is that you can
exactly replace a quorum child by mirroring from quorum.

This has (probably) two reasons:
(1) We do not want to create loops.
(2) @replaces and @device should have exactly the same content so
replacing them does not cause visible data to change.

This has two issues:
(1) It is overly restrictive.  It is completely fine for @replaces to be
a filter.
(2) It is not restrictive enough.  You can create loops with this as
follows:

$ qemu-img create -f qcow2 /tmp/source.qcow2 64M
$ qemu-system-x86_64 -qmp stdio
{"execute": "qmp_capabilities"}
{"execute": "object-add",
 "arguments": {"qom-type": "throttle-group", "id": "tg0"}}
{"execute": "blockdev-add",
 "arguments": {
 "node-name": "source",
 "driver": "throttle",
 "throttle-group": "tg0",
 "file": {
 "node-name": "filtered",
 "driver": "qcow2",
 "file": {
 "driver": "file",
 "filename": "/tmp/source.qcow2"
 } } } }
{"execute": "drive-mirror",
 "arguments": {
 "job-id": "mirror",
 "device": "source",
 "target": "/tmp/target.qcow2",
 "format": "qcow2",
 "node-name": "target",
 "sync" :"none",
 "replaces": "filtered"
 } }
{"execute": "block-job-complete", "arguments": {"device": "mirror"}}

And qemu crashes because of a stack overflow due to the loop being
created (target's backing file is source, so when it replaces filtered,
it points to itself through source).

(blockdev-mirror can be broken similarly.)

So let us make the checks for the two conditions above explicit, which
makes the whole function exactly as restrictive as it needs to be.

Signed-off-by: Max Reitz 
---
 include/block/block.h |  1 +
 block.c   | 83 +++
 blockdev.c| 34 --
 3 files changed, 110 insertions(+), 8 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index 6ba853fb90..8da706cd89 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -404,6 +404,7 @@ bool bdrv_is_first_non_filter(BlockDriverState *candidate);
 
 /* check if a named node can be replaced when doing drive-mirror */
 BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
+BlockDriverState *backing_bs,
 const char *node_name, Error **errp);
 
 /* async block I/O */
diff --git a/block.c b/block.c
index 915b80153c..4858d3e718 100644
--- a/block.c
+++ b/block.c
@@ -6290,7 +6290,59 @@ bool bdrv_is_first_non_filter(BlockDriverState 
*candidate)
 return false;
 }
 
+static bool is_child_of(BlockDriverState *child, BlockDriverState *parent)
+{
+BdrvChild *c;
+
+if (!parent) {
+return false;
+}
+
+QLIST_FOREACH(c, &parent->children, next) {
+if (c->bs == child || is_child_of(child, c->bs)) {
+return true;
+}
+}
+
+return false;
+}
+
+/*
+ * Return true if there are only filters in [@top, @base).  Note that
+ * this may include quorum (which bdrv_chain_contains() cannot
+ * handle).
+ */
+static bool is_filtered_child(BlockDriverState *top, BlockDriverState *base)
+{
+BdrvChild *c;
+
+if (!top) {
+return false;
+}
+
+if (top == base) {
+return true;
+}
+
+if (!top->drv->is_filter) {
+return false;
+}
+
+QLIST_FOREACH(c, &top->children, next) {
+if (is_filtered_child(c->bs, base)) {
+return true;
+}
+}
+
+return false;
+}
+
+/*
+ * @parent_bs is mirror's source BDS, @backing_bs is the BDS which
+ * will be attached to the target when mirror completes.
+ */
 BlockDriverState *check_to_replace_node(BlockDriverState *parent_bs,
+BlockDriverState *backing_bs,
 const char *node_name, Error **errp)
 {
 BlockDriverState *to_replace_bs = bdrv_find_node(node_name);
@@ -6309,13 +6361,32 @@ BlockDriverState 
*check_to_replace_node(BlockDriverState *parent_bs,
 goto out;
 }
 
-/* We don't want arbitrary node of the BDS chain to be replaced only the 
top
- * most non filter in order to prevent data corruption.
- * Another benefit is that this tests exclude backing files which are
- * blocked by the backing blockers.
+/*
+ * If to_replace_bs is (recursively) a child of backing_bs,
+ * replacing it may create a loop.  We cannot allow that.
  */
-if (!bdrv_recurse_is_first_non_filter(parent_bs, to_replace_bs)) {
-error_setg(errp, "Only top most non filter can be replaced");
+if (to_replace_bs == backing_bs || is_child_of(to_replace_bs, backing_bs)) 
{
+error_setg(errp, "Replacing this node would result in a loop");
+t

[Qemu-block] [PATCH v6 31/42] block: Drop backing_bs()

2019-08-09 Thread Max Reitz
We want to make it explicit where bs->backing is used, and we have done
so.  The old role of backing_bs() is now effectively taken by
bdrv_filtered_cow_bs().

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/block_int.h | 5 -
 1 file changed, 5 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 5bec3361fd..786801c32f 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -932,11 +932,6 @@ typedef enum BlockMirrorBackingMode {
 MIRROR_LEAVE_BACKING_CHAIN,
 } BlockMirrorBackingMode;
 
-static inline BlockDriverState *backing_bs(BlockDriverState *bs)
-{
-return bs->backing ? bs->backing->bs : NULL;
-}
-
 
 /* Essential block drivers which must always be statically linked into qemu, 
and
  * which therefore can be accessed without using bdrv_find_format() */
-- 
2.21.0




[Qemu-block] [PATCH v6 25/42] mirror: Deal with filters

2019-08-09 Thread Max Reitz
This includes some permission limiting (for example, we only need to
take the RESIZE permission for active commits where the base is smaller
than the top).

Signed-off-by: Max Reitz 
---
 block/mirror.c | 117 ++---
 blockdev.c |  47 +---
 2 files changed, 131 insertions(+), 33 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index 54bafdf176..6ddbfb9708 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -42,6 +42,7 @@ typedef struct MirrorBlockJob {
 BlockBackend *target;
 BlockDriverState *mirror_top_bs;
 BlockDriverState *base;
+BlockDriverState *base_overlay;
 
 /* The name of the graph node to replace */
 char *replaces;
@@ -665,8 +666,10 @@ static int mirror_exit_common(Job *job)
  &error_abort);
 if (!abort && s->backing_mode == MIRROR_SOURCE_BACKING_CHAIN) {
 BlockDriverState *backing = s->is_none_mode ? src : s->base;
-if (backing_bs(target_bs) != backing) {
-bdrv_set_backing_hd(target_bs, backing, &local_err);
+BlockDriverState *unfiltered_target = bdrv_skip_rw_filters(target_bs);
+
+if (bdrv_filtered_cow_bs(unfiltered_target) != backing) {
+bdrv_set_backing_hd(unfiltered_target, backing, &local_err);
 if (local_err) {
 error_report_err(local_err);
 ret = -EPERM;
@@ -715,7 +718,7 @@ static int mirror_exit_common(Job *job)
  * valid.
  */
 block_job_remove_all_bdrv(bjob);
-bdrv_replace_node(mirror_top_bs, backing_bs(mirror_top_bs), &error_abort);
+bdrv_replace_node(mirror_top_bs, mirror_top_bs->backing->bs, &error_abort);
 
 /* We just changed the BDS the job BB refers to (with either or both of the
  * bdrv_replace_node() calls), so switch the BB back so the cleanup does
@@ -812,7 +815,8 @@ static int coroutine_fn mirror_dirty_init(MirrorBlockJob *s)
 return 0;
 }
 
-ret = bdrv_is_allocated_above(bs, base, false, offset, bytes, &count);
+ret = bdrv_is_allocated_above(bs, s->base_overlay, true, offset, bytes,
+  &count);
 if (ret < 0) {
 return ret;
 }
@@ -908,7 +912,7 @@ static int coroutine_fn mirror_run(Job *job, Error **errp)
 } else {
 s->target_cluster_size = BDRV_SECTOR_SIZE;
 }
-if (backing_filename[0] && !target_bs->backing &&
+if (backing_filename[0] && !bdrv_backing_chain_next(target_bs) &&
 s->granularity < s->target_cluster_size) {
 s->buf_size = MAX(s->buf_size, s->target_cluster_size);
 s->cow_bitmap = bitmap_new(length);
@@ -1088,8 +1092,9 @@ static void mirror_complete(Job *job, Error **errp)
 if (s->backing_mode == MIRROR_OPEN_BACKING_CHAIN) {
 int ret;
 
-assert(!target->backing);
-ret = bdrv_open_backing_file(target, NULL, "backing", errp);
+assert(!bdrv_backing_chain_next(target));
+ret = bdrv_open_backing_file(bdrv_skip_rw_filters(target), NULL,
+ "backing", errp);
 if (ret < 0) {
 return;
 }
@@ -1531,8 +1536,8 @@ static BlockJob *mirror_start_job(
 MirrorBlockJob *s;
 MirrorBDSOpaque *bs_opaque;
 BlockDriverState *mirror_top_bs;
-bool target_graph_mod;
 bool target_is_backing;
+uint64_t target_perms, target_shared_perms;
 Error *local_err = NULL;
 int ret;
 
@@ -1551,7 +1556,7 @@ static BlockJob *mirror_start_job(
 buf_size = DEFAULT_MIRROR_BUF_SIZE;
 }
 
-if (bs == target) {
+if (bdrv_skip_rw_filters(bs) == bdrv_skip_rw_filters(target)) {
 error_setg(errp, "Can't mirror node into itself");
 return NULL;
 }
@@ -1615,15 +1620,50 @@ static BlockJob *mirror_start_job(
  * In the case of active commit, things look a bit different, though,
  * because the target is an already populated backing file in active use.
  * We can allow anything except resize there.*/
+
+target_perms = BLK_PERM_WRITE;
+target_shared_perms = BLK_PERM_WRITE_UNCHANGED;
+
 target_is_backing = bdrv_chain_contains(bs, target);
-target_graph_mod = (backing_mode != MIRROR_LEAVE_BACKING_CHAIN);
+if (target_is_backing) {
+int64_t bs_size, target_size;
+bs_size = bdrv_getlength(bs);
+if (bs_size < 0) {
+error_setg_errno(errp, -bs_size,
+ "Could not inquire top image size");
+goto fail;
+}
+
+target_size = bdrv_getlength(target);
+if (target_size < 0) {
+error_setg_errno(errp, -target_size,
+ "Could not inquire base image size");
+goto fail;
+}
+
+if (target_size < bs_size) {
+target_perms |= BLK_PERM_RESIZE;
+}
+
+target_shared_perms |= BLK_PERM_CONSISTENT_READ
+|  BLK_PERM_WRITE
+   

[Qemu-block] [PATCH v6 24/42] block: Use child access functions for QAPI queries

2019-08-09 Thread Max Reitz
query-block, query-named-block-nodes, and query-blockstats now return
any filtered child under "backing", not just bs->backing or COW
children.  This is so that filters do not interrupt the reported backing
chain.  This changes the output for iotest 184, as the throttled node
now appears as a backing child.

Signed-off-by: Max Reitz 
---
 block/qapi.c   | 39 +++---
 tests/qemu-iotests/184.out |  7 ++-
 2 files changed, 30 insertions(+), 16 deletions(-)

diff --git a/block/qapi.c b/block/qapi.c
index 9a185cba48..4f59ac1c0f 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -156,9 +156,13 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
 return NULL;
 }
 
-if (bs0->drv && bs0->backing) {
+if (bs0->drv && bdrv_filtered_child(bs0)) {
+/*
+ * Put any filtered child here (for backwards compatibility to when
+ * we put bs0->backing here, which might be any filtered child).
+ */
 info->backing_file_depth++;
-bs0 = bs0->backing->bs;
+bs0 = bdrv_filtered_bs(bs0);
 (*p_image_info)->has_backing_image = true;
 p_image_info = &((*p_image_info)->backing_image);
 } else {
@@ -167,9 +171,8 @@ BlockDeviceInfo *bdrv_block_device_info(BlockBackend *blk,
 
 /* Skip automatically inserted nodes that the user isn't aware of for
  * query-block (blk != NULL), but not for query-named-block-nodes */
-while (blk && bs0->drv && bs0->implicit) {
-bs0 = backing_bs(bs0);
-assert(bs0);
+if (blk) {
+bs0 = bdrv_skip_implicit_filters(bs0);
 }
 }
 
@@ -354,9 +357,9 @@ static void bdrv_query_info(BlockBackend *blk, BlockInfo 
**p_info,
 BlockDriverState *bs = blk_bs(blk);
 char *qdev;
 
-/* Skip automatically inserted nodes that the user isn't aware of */
-while (bs && bs->drv && bs->implicit) {
-bs = backing_bs(bs);
+if (bs) {
+/* Skip automatically inserted nodes that the user isn't aware of */
+bs = bdrv_skip_implicit_filters(bs);
 }
 
 info->device = g_strdup(blk_name(blk));
@@ -513,6 +516,7 @@ static void bdrv_query_blk_stats(BlockDeviceStats *ds, 
BlockBackend *blk)
 static BlockStats *bdrv_query_bds_stats(BlockDriverState *bs,
 bool blk_level)
 {
+BlockDriverState *storage_bs, *filtered_bs;
 BlockStats *s = NULL;
 
 s = g_malloc0(sizeof(*s));
@@ -525,9 +529,8 @@ static BlockStats *bdrv_query_bds_stats(BlockDriverState 
*bs,
 /* Skip automatically inserted nodes that the user isn't aware of in
  * a BlockBackend-level command. Stay at the exact node for a node-level
  * command. */
-while (blk_level && bs->drv && bs->implicit) {
-bs = backing_bs(bs);
-assert(bs);
+if (blk_level) {
+bs = bdrv_skip_implicit_filters(bs);
 }
 
 if (bdrv_get_node_name(bs)[0]) {
@@ -537,14 +540,20 @@ static BlockStats *bdrv_query_bds_stats(BlockDriverState 
*bs,
 
 s->stats->wr_highest_offset = stat64_get(&bs->wr_highest_offset);
 
-if (bs->file) {
+storage_bs = bdrv_storage_bs(bs);
+if (storage_bs) {
 s->has_parent = true;
-s->parent = bdrv_query_bds_stats(bs->file->bs, blk_level);
+s->parent = bdrv_query_bds_stats(storage_bs, blk_level);
 }
 
-if (blk_level && bs->backing) {
+filtered_bs = bdrv_filtered_bs(bs);
+if (blk_level && filtered_bs) {
+/*
+ * Put any filtered child here (for backwards compatibility to when
+ * we put bs0->backing here, which might be any filtered child).
+ */
 s->has_backing = true;
-s->backing = bdrv_query_bds_stats(bs->backing->bs, blk_level);
+s->backing = bdrv_query_bds_stats(filtered_bs, blk_level);
 }
 
 return s;
diff --git a/tests/qemu-iotests/184.out b/tests/qemu-iotests/184.out
index 3deb3cfb94..1d61f7e224 100644
--- a/tests/qemu-iotests/184.out
+++ b/tests/qemu-iotests/184.out
@@ -27,6 +27,11 @@ Testing:
 "iops_rd": 0,
 "detect_zeroes": "off",
 "image": {
+"backing-image": {
+"virtual-size": 1073741824,
+"filename": "null-co://",
+"format": "null-co"
+},
 "virtual-size": 1073741824,
 "filename": "json:{\"throttle-group\": \"group0\", \"driver\": 
\"throttle\", \"file\": {\"driver\": \"null-co\"}}",
 "format": "throttle"
@@ -34,7 +39,7 @@ Testing:
 "iops_wr": 0,
 "ro": false,
 "node-name": "throttle0",
-"backing_file_depth": 0,
+"backing_file_depth": 1,
 "drv": "throttle",
 "iops": 0,
 "bps_wr": 0,
-- 
2.21.0




[Qemu-block] [PATCH v6 22/42] block: Fix bdrv_get_allocated_file_size's fallback

2019-08-09 Thread Max Reitz
If the driver does not implement bdrv_get_allocated_file_size(), we
should fall back to cumulating the allocated size of all non-COW
children instead of just bs->file.

Suggested-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Max Reitz 
---
 block.c | 22 --
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index 1070aa1ba9..6e1ddab056 100644
--- a/block.c
+++ b/block.c
@@ -4650,9 +4650,27 @@ int64_t bdrv_get_allocated_file_size(BlockDriverState 
*bs)
 if (drv->bdrv_get_allocated_file_size) {
 return drv->bdrv_get_allocated_file_size(bs);
 }
-if (bs->file) {
-return bdrv_get_allocated_file_size(bs->file->bs);
+
+if (!QLIST_EMPTY(&bs->children)) {
+BdrvChild *child;
+int64_t child_size, total_size = 0;
+
+QLIST_FOREACH(child, &bs->children, next) {
+if (child == bdrv_filtered_cow_child(bs)) {
+/* Ignore COW backing files */
+continue;
+}
+
+child_size = bdrv_get_allocated_file_size(child->bs);
+if (child_size < 0) {
+return child_size;
+}
+total_size += child_size;
+}
+
+return total_size;
 }
+
 return -ENOTSUP;
 }
 
-- 
2.21.0




[Qemu-block] [PATCH v6 21/42] block: Use CAFs for debug breakpoints

2019-08-09 Thread Max Reitz
When looking for a blkdebug node (which implements debug breakpoints),
use bdrv_primary_bs() to iterate through the graph, because that is
where a blkdebug node would be.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/block.c b/block.c
index a467b175c6..1070aa1ba9 100644
--- a/block.c
+++ b/block.c
@@ -5218,7 +5218,7 @@ int bdrv_debug_breakpoint(BlockDriverState *bs, const 
char *event,
   const char *tag)
 {
 while (bs && bs->drv && !bs->drv->bdrv_debug_breakpoint) {
-bs = bs->file ? bs->file->bs : NULL;
+bs = bdrv_primary_bs(bs);
 }
 
 if (bs && bs->drv && bs->drv->bdrv_debug_breakpoint) {
@@ -5231,7 +5231,7 @@ int bdrv_debug_breakpoint(BlockDriverState *bs, const 
char *event,
 int bdrv_debug_remove_breakpoint(BlockDriverState *bs, const char *tag)
 {
 while (bs && bs->drv && !bs->drv->bdrv_debug_remove_breakpoint) {
-bs = bs->file ? bs->file->bs : NULL;
+bs = bdrv_primary_bs(bs);
 }
 
 if (bs && bs->drv && bs->drv->bdrv_debug_remove_breakpoint) {
@@ -5244,7 +5244,7 @@ int bdrv_debug_remove_breakpoint(BlockDriverState *bs, 
const char *tag)
 int bdrv_debug_resume(BlockDriverState *bs, const char *tag)
 {
 while (bs && (!bs->drv || !bs->drv->bdrv_debug_resume)) {
-bs = bs->file ? bs->file->bs : NULL;
+bs = bdrv_primary_bs(bs);
 }
 
 if (bs && bs->drv && bs->drv->bdrv_debug_resume) {
@@ -5257,7 +5257,7 @@ int bdrv_debug_resume(BlockDriverState *bs, const char 
*tag)
 bool bdrv_debug_is_suspended(BlockDriverState *bs, const char *tag)
 {
 while (bs && bs->drv && !bs->drv->bdrv_debug_is_suspended) {
-bs = bs->file ? bs->file->bs : NULL;
+bs = bdrv_primary_bs(bs);
 }
 
 if (bs && bs->drv && bs->drv->bdrv_debug_is_suspended) {
-- 
2.21.0




[Qemu-block] [PATCH v6 32/42] block: Make bdrv_get_cumulative_perm() public

2019-08-09 Thread Max Reitz
This is useful in other files like blockdev.c to determine for example
whether a node can be written to or not.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/block_int.h | 3 +++
 block.c   | 6 ++
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 786801c32f..c17df3808a 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -1205,6 +1205,9 @@ int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, 
uint64_t shared,
  */
 int bdrv_child_refresh_perms(BlockDriverState *bs, BdrvChild *c, Error **errp);
 
+void bdrv_get_cumulative_perm(BlockDriverState *bs,
+  uint64_t *perm, uint64_t *shared_perm);
+
 /* Default implementation for BlockDriver.bdrv_child_perm() that can be used by
  * block filters: Forward CONSISTENT_READ, WRITE, WRITE_UNCHANGED and RESIZE to
  * all children */
diff --git a/block.c b/block.c
index 6e1ddab056..915b80153c 100644
--- a/block.c
+++ b/block.c
@@ -1713,8 +1713,6 @@ static int bdrv_child_check_perm(BdrvChild *c, 
BlockReopenQueue *q,
  bool *tighten_restrictions, Error **errp);
 static void bdrv_child_abort_perm_update(BdrvChild *c);
 static void bdrv_child_set_perm(BdrvChild *c, uint64_t perm, uint64_t shared);
-static void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
- uint64_t *shared_perm);
 
 typedef struct BlockReopenQueueEntry {
  bool prepared;
@@ -1938,8 +1936,8 @@ static void bdrv_set_perm(BlockDriverState *bs, uint64_t 
cumulative_perms,
 }
 }
 
-static void bdrv_get_cumulative_perm(BlockDriverState *bs, uint64_t *perm,
- uint64_t *shared_perm)
+void bdrv_get_cumulative_perm(BlockDriverState *bs,
+  uint64_t *perm, uint64_t *shared_perm)
 {
 BdrvChild *c;
 uint64_t cumulative_perms = 0;
-- 
2.21.0




[Qemu-block] [PATCH v6 30/42] qemu-img: Use child access functions

2019-08-09 Thread Max Reitz
This changes iotest 204's output, because blkdebug on top of a COW node
used to make qemu-img map disregard the rest of the backing chain (the
backing chain was broken by the filter).  With this patch, the
allocation in the base image is reported correctly.

Signed-off-by: Max Reitz 
---
 qemu-img.c | 33 -
 tests/qemu-iotests/204.out |  1 +
 2 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 79983772de..3b30c5ae70 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1012,7 +1012,7 @@ static int img_commit(int argc, char **argv)
 /* This is different from QMP, which by default uses the deepest file 
in
  * the backing chain (i.e., the very base); however, the traditional
  * behavior of qemu-img commit is using the immediate backing file. */
-base_bs = backing_bs(bs);
+base_bs = bdrv_backing_chain_next(bs);
 if (!base_bs) {
 error_setg(&local_err, "Image does not have a backing file");
 goto done;
@@ -1632,18 +1632,20 @@ static int convert_iteration_sectors(ImgConvertState 
*s, int64_t sector_num)
 if (s->sector_next_status <= sector_num) {
 uint64_t offset = (sector_num - src_cur_offset) * BDRV_SECTOR_SIZE;
 int64_t count;
+BlockDriverState *src_bs = blk_bs(s->src[src_cur]);
+BlockDriverState *base;
+
+if (s->target_has_backing) {
+base = bdrv_filtered_cow_bs(bdrv_skip_rw_filters(src_bs));
+} else {
+base = NULL;
+}
 
 do {
 count = n * BDRV_SECTOR_SIZE;
 
-if (s->target_has_backing) {
-ret = bdrv_block_status(blk_bs(s->src[src_cur]), offset,
-count, &count, NULL, NULL);
-} else {
-ret = bdrv_block_status_above(blk_bs(s->src[src_cur]), NULL,
-  offset, count, &count, NULL,
-  NULL);
-}
+ret = bdrv_block_status_above(src_bs, base, offset, count, &count,
+  NULL, NULL);
 
 if (ret < 0) {
 if (s->salvage) {
@@ -2490,7 +2492,8 @@ static int img_convert(int argc, char **argv)
  * s.target_backing_sectors has to be negative, which it will
  * be automatically).  The backing file length is used only
  * for optimizations, so such a case is not fatal. */
-s.target_backing_sectors = bdrv_nb_sectors(out_bs->backing->bs);
+s.target_backing_sectors =
+bdrv_nb_sectors(bdrv_filtered_cow_bs(out_bs));
 } else {
 s.target_backing_sectors = -1;
 }
@@ -2853,6 +2856,7 @@ static int get_block_status(BlockDriverState *bs, int64_t 
offset,
 
 depth = 0;
 for (;;) {
+bs = bdrv_skip_rw_filters(bs);
 ret = bdrv_block_status(bs, offset, bytes, &bytes, &map, &file);
 if (ret < 0) {
 return ret;
@@ -2861,7 +2865,7 @@ static int get_block_status(BlockDriverState *bs, int64_t 
offset,
 if (ret & (BDRV_BLOCK_ZERO|BDRV_BLOCK_DATA)) {
 break;
 }
-bs = backing_bs(bs);
+bs = bdrv_filtered_cow_bs(bs);
 if (bs == NULL) {
 ret = 0;
 break;
@@ -3216,6 +3220,7 @@ static int img_rebase(int argc, char **argv)
 uint8_t *buf_old = NULL;
 uint8_t *buf_new = NULL;
 BlockDriverState *bs = NULL, *prefix_chain_bs = NULL;
+BlockDriverState *unfiltered_bs;
 char *filename;
 const char *fmt, *cache, *src_cache, *out_basefmt, *out_baseimg;
 int c, flags, src_flags, ret;
@@ -3350,6 +3355,8 @@ static int img_rebase(int argc, char **argv)
 }
 bs = blk_bs(blk);
 
+unfiltered_bs = bdrv_skip_rw_filters(bs);
+
 if (out_basefmt != NULL) {
 if (bdrv_find_format(out_basefmt) == NULL) {
 error_report("Invalid format name: '%s'", out_basefmt);
@@ -3361,7 +3368,7 @@ static int img_rebase(int argc, char **argv)
 /* For safe rebasing we need to compare old and new backing file */
 if (!unsafe) {
 QDict *options = NULL;
-BlockDriverState *base_bs = backing_bs(bs);
+BlockDriverState *base_bs = bdrv_filtered_cow_bs(unfiltered_bs);
 
 if (base_bs) {
 blk_old_backing = blk_new(qemu_get_aio_context(),
@@ -3517,7 +3524,7 @@ static int img_rebase(int argc, char **argv)
  * If cluster wasn't changed since prefix_chain, we don't need
  * to take action
  */
-ret = bdrv_is_allocated_above(backing_bs(bs), prefix_chain_bs,
+ret = bdrv_is_allocated_above(unfiltered_bs, prefix_chain_bs,
   false, offset, n, &n);
 if (ret < 0) {
 error_report("error while reading image metadata: %s",
diff --git a/tests/qemu-i

[Qemu-block] [PATCH v6 29/42] nbd: Use CAF when looking for dirty bitmap

2019-08-09 Thread Max Reitz
When looking for a dirty bitmap to share, we should handle filters by
just including them in the search (so they do not break backing chains).

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Eric Blake 
---
 nbd/server.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/nbd/server.c b/nbd/server.c
index fbd51b48a7..a1c73ce957 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -1511,13 +1511,13 @@ NBDExport *nbd_export_new(BlockDriverState *bs, 
uint64_t dev_offset,
 if (bitmap) {
 BdrvDirtyBitmap *bm = NULL;
 
-while (true) {
+while (bs) {
 bm = bdrv_find_dirty_bitmap(bs, bitmap);
-if (bm != NULL || bs->backing == NULL) {
+if (bm != NULL) {
 break;
 }
 
-bs = bs->backing->bs;
+bs = bdrv_filtered_bs(bs);
 }
 
 if (bm == NULL) {
-- 
2.21.0




[Qemu-block] [PATCH v6 34/42] block: Inline bdrv_co_block_status_from_*()

2019-08-09 Thread Max Reitz
With bdrv_filtered_rw_bs(), we can easily handle this default filter
behavior in bdrv_co_block_status().

blkdebug wants to have an additional assertion, so it keeps its own
implementation, except bdrv_co_block_status_from_file() needs to be
inlined there.

Suggested-by: Eric Blake 
Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/block_int.h | 22 -
 block/blkdebug.c  |  7 --
 block/blklogwrites.c  |  1 -
 block/commit.c|  1 -
 block/copy-on-read.c  |  2 --
 block/io.c| 51 +--
 block/mirror.c|  1 -
 block/throttle.c  |  1 -
 8 files changed, 22 insertions(+), 64 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index c17df3808a..42ee2fcf7f 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -1227,28 +1227,6 @@ void bdrv_format_default_perms(BlockDriverState *bs, 
BdrvChild *c,
uint64_t perm, uint64_t shared,
uint64_t *nperm, uint64_t *nshared);
 
-/*
- * Default implementation for drivers to pass bdrv_co_block_status() to
- * their file.
- */
-int coroutine_fn bdrv_co_block_status_from_file(BlockDriverState *bs,
-bool want_zero,
-int64_t offset,
-int64_t bytes,
-int64_t *pnum,
-int64_t *map,
-BlockDriverState **file);
-/*
- * Default implementation for drivers to pass bdrv_co_block_status() to
- * their backing file.
- */
-int coroutine_fn bdrv_co_block_status_from_backing(BlockDriverState *bs,
-   bool want_zero,
-   int64_t offset,
-   int64_t bytes,
-   int64_t *pnum,
-   int64_t *map,
-   BlockDriverState **file);
 const char *bdrv_get_parent_name(const BlockDriverState *bs);
 void blk_dev_change_media_cb(BlockBackend *blk, bool load, Error **errp);
 bool blk_dev_has_removable_media(BlockBackend *blk);
diff --git a/block/blkdebug.c b/block/blkdebug.c
index 5ae96c52b0..0011e068ce 100644
--- a/block/blkdebug.c
+++ b/block/blkdebug.c
@@ -679,8 +679,11 @@ static int coroutine_fn 
blkdebug_co_block_status(BlockDriverState *bs,
 return err;
 }
 
-return bdrv_co_block_status_from_file(bs, want_zero, offset, bytes,
-  pnum, map, file);
+assert(bs->file && bs->file->bs);
+*pnum = bytes;
+*map = offset;
+*file = bs->file->bs;
+return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID;
 }
 
 static void blkdebug_close(BlockDriverState *bs)
diff --git a/block/blklogwrites.c b/block/blklogwrites.c
index 04d8b33607..8982fd15c4 100644
--- a/block/blklogwrites.c
+++ b/block/blklogwrites.c
@@ -519,7 +519,6 @@ static BlockDriver bdrv_blk_log_writes = {
 .bdrv_co_pwrite_zeroes  = blk_log_writes_co_pwrite_zeroes,
 .bdrv_co_flush_to_disk  = blk_log_writes_co_flush_to_disk,
 .bdrv_co_pdiscard   = blk_log_writes_co_pdiscard,
-.bdrv_co_block_status   = bdrv_co_block_status_from_file,
 
 .is_filter  = true,
 .strong_runtime_opts= blk_log_writes_strong_runtime_opts,
diff --git a/block/commit.c b/block/commit.c
index 40d1c8eeac..c94678bea3 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -252,7 +252,6 @@ static void bdrv_commit_top_child_perm(BlockDriverState 
*bs, BdrvChild *c,
 static BlockDriver bdrv_commit_top = {
 .format_name= "commit_top",
 .bdrv_co_preadv = bdrv_commit_top_preadv,
-.bdrv_co_block_status   = bdrv_co_block_status_from_backing,
 .bdrv_refresh_filename  = bdrv_commit_top_refresh_filename,
 .bdrv_child_perm= bdrv_commit_top_child_perm,
 
diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 16bdf630b6..ad6577d078 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -160,8 +160,6 @@ static BlockDriver bdrv_copy_on_read = {
 .bdrv_eject = cor_eject,
 .bdrv_lock_medium   = cor_lock_medium,
 
-.bdrv_co_block_status   = bdrv_co_block_status_from_file,
-
 .bdrv_recurse_is_first_non_filter   = cor_recurse_is_first_non_filter,
 
 .has_variable_length= true,
diff --git a/block/io.c b/block/io.c
index e222d91893..d7d9757f46 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2028,36 +2028,6 @@ typedef struct BdrvCoBlockStatusData {
 bool done;
 } BdrvCoBlockStatusData;
 
-int coroutine_fn bdrv_co_block_

[Qemu-block] [PATCH v6 36/42] iotests: Add tests for mirror @replaces loops

2019-08-09 Thread Max Reitz
This adds two tests for cases where our old check_to_replace_node()
function failed to detect that executing this job with these parameters
would result in a cyclic graph.

Signed-off-by: Max Reitz 
Reviewed-by: Eric Blake 
---
 tests/qemu-iotests/041 | 124 +
 tests/qemu-iotests/041.out |   4 +-
 2 files changed, 126 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/041 b/tests/qemu-iotests/041
index 26bf1701eb..0c1432f189 100755
--- a/tests/qemu-iotests/041
+++ b/tests/qemu-iotests/041
@@ -1067,5 +1067,129 @@ class TestOrphanedSource(iotests.QMPTestCase):
  target='dest-ro')
 self.assert_qmp(result, 'error/class', 'GenericError')
 
+# Various tests for the @replaces option (independent of quorum)
+class TestReplaces(iotests.QMPTestCase):
+def setUp(self):
+self.vm = iotests.VM()
+self.vm.launch()
+
+def tearDown(self):
+self.vm.shutdown()
+
+def test_drive_mirror_loop(self):
+qemu_img('create', '-f', iotests.imgfmt, test_img, '1M')
+
+result = self.vm.qmp('object-add', qom_type='throttle-group', id='tg')
+self.assert_qmp(result, 'return', {})
+
+result = self.vm.qmp('blockdev-add', **{
+'node-name': 'source',
+'driver': 'throttle',
+'throttle-group': 'tg',
+'file': {
+'node-name': 'filtered',
+'driver': iotests.imgfmt,
+'file': {
+'driver': 'file',
+'filename': test_img
+}
+}
+})
+self.assert_qmp(result, 'return', {})
+
+# Mirror from @source to @target in sync=none, so that @source
+# will be @target's backing file; but replace @filtered.
+# Then, @target's backing file will be @source, whose backing
+# file is now @target instead of @filtered.  That is a loop.
+# (But apart from the loop, replacing @filtered instead of
+# @source is fine, because both are just filtered versions of
+# each other.)
+result = self.vm.qmp('drive-mirror',
+ job_id='mirror',
+ device='source',
+ target=target_img,
+ format=iotests.imgfmt,
+ node_name='target',
+ sync='none',
+ replaces='filtered')
+if 'error' in result:
+# This is the correct result
+self.assert_qmp(result, 'error/class', 'GenericError')
+else:
+# This is wrong, but let's run it to the bitter conclusion
+self.complete_and_wait(drive='mirror')
+# Fail for good measure, although qemu should have crashed
+# anyway
+self.fail('Loop creation was successful')
+
+os.remove(test_img)
+try:
+os.remove(target_img)
+except OSError:
+pass
+
+def test_blockdev_mirror_loop(self):
+qemu_img('create', '-f', iotests.imgfmt, test_img, '1M')
+qemu_img('create', '-f', iotests.imgfmt, target_img, '1M')
+
+result = self.vm.qmp('object-add', qom_type='throttle-group', id='tg')
+self.assert_qmp(result, 'return', {})
+
+result = self.vm.qmp('blockdev-add', **{
+'node-name': 'source',
+'driver': 'throttle',
+'throttle-group': 'tg',
+'file': {
+'node-name': 'middle',
+'driver': 'throttle',
+'throttle-group': 'tg',
+'file': {
+'node-name': 'bottom',
+'driver': iotests.imgfmt,
+'file': {
+'driver': 'file',
+'filename': test_img
+}
+}
+}
+})
+self.assert_qmp(result, 'return', {})
+
+result = self.vm.qmp('blockdev-add', **{
+'node-name': 'target',
+'driver': iotests.imgfmt,
+'file': {
+'driver': 'file',
+'filename': target_img
+},
+'backing': 'middle'
+})
+
+# Mirror from @source to @target.  With blockdev-mirror, the
+# current (old) backing file is retained (which is @middle).
+# By replacing @bottom, @middle's file will be @target, whose
+# backing file is @middle again.  That is a loop.
+# (But apart from the loop, replacing @bottom instead of
+# @source is fine, because both are just filte

[Qemu-block] [PATCH v6 37/42] block: Leave BDS.backing_file constant

2019-08-09 Thread Max Reitz
Parts of the block layer treat BDS.backing_file as if it were whatever
the image header says (i.e., if it is a relative path, it is relative to
the overlay), other parts treat it like a cache for
bs->backing->bs->filename (relative paths are relative to the CWD).
Considering bs->backing->bs->filename exists, let us make it mean the
former.

Among other things, this now allows the user to specify a base when
using qemu-img to commit an image file in a directory that is not the
CWD (assuming, everything uses relative filenames).

Before this patch:

$ ./qemu-img create -f qcow2 foo/bot.qcow2 1M
$ ./qemu-img create -f qcow2 -b bot.qcow2 foo/mid.qcow2
$ ./qemu-img create -f qcow2 -b mid.qcow2 foo/top.qcow2
$ ./qemu-img commit -b mid.qcow2 foo/top.qcow2
qemu-img: Did not find 'mid.qcow2' in the backing chain of 'foo/top.qcow2'
$ ./qemu-img commit -b foo/mid.qcow2 foo/top.qcow2
qemu-img: Did not find 'foo/mid.qcow2' in the backing chain of 'foo/top.qcow2'
$ ./qemu-img commit -b $PWD/foo/mid.qcow2 foo/top.qcow2
qemu-img: Did not find '[...]/foo/mid.qcow2' in the backing chain of 
'foo/top.qcow2'

After this patch:

$ ./qemu-img commit -b mid.qcow2 foo/top.qcow2
Image committed.
$ ./qemu-img commit -b foo/mid.qcow2 foo/top.qcow2
qemu-img: Did not find 'foo/mid.qcow2' in the backing chain of 'foo/top.qcow2'
$ ./qemu-img commit -b $PWD/foo/mid.qcow2 foo/top.qcow2
Image committed.

With this change, bdrv_find_backing_image() must look at whether the
user has overridden a BDS's backing file.  If so, it can no longer use
bs->backing_file, but must instead compare the given filename against
the backing node's filename directly.

Note that this changes the QAPI output for a node's backing_file.  We
had very inconsistent output there (sometimes what the image header
said, sometimes the actual filename of the backing image).  This
inconsistent output was effectively useless, so we have to decide one
way or the other.  Considering that bs->backing_file usually at runtime
contained the path to the image relative to qemu's CWD (or absolute),
this patch changes QAPI's backing_file to always report the
bs->backing->bs->filename from now on.  If you want to receive the image
header information, you have to refer to full-backing-filename.

This necessitates a change to iotest 228.  The interesting information
it really wanted is the image header, and it can get that now, but it
has to use full-backing-filename instead of backing_file.  Because of
this patch's changes to bs->backing_file's behavior, we also need some
reference output changes.

Along with the changes to bs->backing_file, stop updating
BDS.backing_format in bdrv_backing_attach() as well.  This necessitates
a change to the reference output of iotest 191.

iotest 245 changes in behavior: With the backing node no longer
overriding the parent node's backing_file string, you can now omit the
@backing option when reopening a node with neither a default nor a
current backing file even if it used to have a backing node at some
point.

Signed-off-by: Max Reitz 
---
 include/block/block_int.h  | 19 ++-
 block.c| 35 ---
 block/qapi.c   |  7 ---
 tests/qemu-iotests/191.out |  1 -
 tests/qemu-iotests/228 |  6 +++---
 tests/qemu-iotests/228.out |  6 +++---
 tests/qemu-iotests/245 |  4 +++-
 7 files changed, 55 insertions(+), 23 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 42ee2fcf7f..993bafc090 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -784,11 +784,20 @@ struct BlockDriverState {
 bool walking_aio_notifiers; /* to make removal during iteration safe */
 
 char filename[PATH_MAX];
-char backing_file[PATH_MAX]; /* if non zero, the image is a diff of
-this file image */
-/* The backing filename indicated by the image header; if we ever
- * open this file, then this is replaced by the resulting BDS's
- * filename (i.e. after a bdrv_refresh_filename() run). */
+/*
+ * If not empty, this image is a diff in relation to backing_file.
+ * Note that this is the name given in the image header and
+ * therefore may or may not be equal to .backing->bs->filename.
+ * If this field contains a relative path, it is to be resolved
+ * relatively to the overlay's location.
+ */
+char backing_file[PATH_MAX];
+/*
+ * The backing filename indicated by the image header.  Contrary
+ * to backing_file, if we ever open this file, auto_backing_file
+ * is replaced by the resulting BDS's filename (i.e. after a
+ * bdrv_refresh_filename() run).
+ */
 char auto_backing_file[PATH_MAX];
 char backing_format[16]; /* if non-zero and backing_file exists */
 
diff --git a/block.c b/block.c
index 4858d3e718..88533fa0d3 100644
--- a/block.c
+++ b/block.c
@@ -78,6 +78,8 @@ static BlockDriverState *bdrv_open_inherit(const char 
*filena

[Qemu-block] [PATCH v6 26/42] backup: Deal with filters

2019-08-09 Thread Max Reitz
Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/backup.c |  9 +
 blockdev.c | 19 +++
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index ecadb61af3..7854d7575b 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -611,6 +611,7 @@ static int64_t 
backup_calculate_cluster_size(BlockDriverState *target,
 {
 int ret;
 BlockDriverInfo bdi;
+bool target_does_cow = bdrv_backing_chain_next(target);
 
 /*
  * If there is no backing file on the target, we cannot rely on COW if our
@@ -618,7 +619,7 @@ static int64_t 
backup_calculate_cluster_size(BlockDriverState *target,
  * targets with a backing file, try to avoid COW if possible.
  */
 ret = bdrv_get_info(target, &bdi);
-if (ret == -ENOTSUP && !target->backing) {
+if (ret == -ENOTSUP && !target_does_cow) {
 /* Cluster size is not defined */
 warn_report("The target block device doesn't provide "
 "information about the block size and it doesn't have a "
@@ -627,14 +628,14 @@ static int64_t 
backup_calculate_cluster_size(BlockDriverState *target,
 "this default, the backup may be unusable",
 BACKUP_CLUSTER_SIZE_DEFAULT);
 return BACKUP_CLUSTER_SIZE_DEFAULT;
-} else if (ret < 0 && !target->backing) {
+} else if (ret < 0 && !target_does_cow) {
 error_setg_errno(errp, -ret,
 "Couldn't determine the cluster size of the target image, "
 "which has no backing file");
 error_append_hint(errp,
 "Aborting, since this may create an unusable destination image\n");
 return ret;
-} else if (ret < 0 && target->backing) {
+} else if (ret < 0 && target_does_cow) {
 /* Not fatal; just trudge on ahead. */
 return BACKUP_CLUSTER_SIZE_DEFAULT;
 }
@@ -683,7 +684,7 @@ BlockJob *backup_job_create(const char *job_id, 
BlockDriverState *bs,
 return NULL;
 }
 
-if (compress && target->drv->bdrv_co_pwritev_compressed == NULL) {
+if (compress && !bdrv_supports_compressed_writes(target)) {
 error_setg(errp, "Compression is not supported for this drive %s",
bdrv_get_device_name(target));
 return NULL;
diff --git a/blockdev.c b/blockdev.c
index c451f553f7..c6f79b4e0e 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3656,7 +3656,13 @@ static BlockJob *do_drive_backup(DriveBackup *backup, 
JobTxn *txn,
 /* See if we have a backing HD we can use to create our new image
  * on top of. */
 if (backup->sync == MIRROR_SYNC_MODE_TOP) {
-source = backing_bs(bs);
+/*
+ * Backup will not replace the source by the target, so none
+ * of the filters skipped here will be removed (in contrast to
+ * mirror).  Therefore, we can skip all of them when looking
+ * for the first COW relationship.
+ */
+source = bdrv_filtered_cow_bs(bdrv_skip_rw_filters(bs));
 if (!source) {
 backup->sync = MIRROR_SYNC_MODE_FULL;
 }
@@ -3676,9 +3682,14 @@ static BlockJob *do_drive_backup(DriveBackup *backup, 
JobTxn *txn,
 if (backup->mode != NEW_IMAGE_MODE_EXISTING) {
 assert(backup->format);
 if (source) {
-bdrv_refresh_filename(source);
-bdrv_img_create(backup->target, backup->format, source->filename,
-source->drv->format_name, NULL,
+/* Implicit filters should not appear in the filename */
+BlockDriverState *explicit_backing =
+bdrv_skip_implicit_filters(source);
+
+bdrv_refresh_filename(explicit_backing);
+bdrv_img_create(backup->target, backup->format,
+explicit_backing->filename,
+explicit_backing->drv->format_name, NULL,
 size, flags, false, &local_err);
 } else {
 bdrv_img_create(backup->target, backup->format, NULL, NULL, NULL,
-- 
2.21.0




[Qemu-block] [PATCH v6 23/42] blockdev: Use CAF in external_snapshot_prepare()

2019-08-09 Thread Max Reitz
This allows us to differentiate between filters and nodes with COW
backing files: Filters cannot be used as overlays at all (for this
function).

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 blockdev.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/blockdev.c b/blockdev.c
index 29c6c6044a..c540802127 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1664,7 +1664,12 @@ static void external_snapshot_prepare(BlkActionState 
*common,
 goto out;
 }
 
-if (state->new_bs->backing != NULL) {
+if (state->new_bs->drv->is_filter) {
+error_setg(errp, "Filters cannot be used as overlays");
+goto out;
+}
+
+if (bdrv_filtered_cow_child(state->new_bs)) {
 error_setg(errp, "The overlay already has a backing image");
 goto out;
 }
-- 
2.21.0




[Qemu-block] [PATCH v6 28/42] stream: Deal with filters

2019-08-09 Thread Max Reitz
Because of the recent changes that make the stream job independent of
the base node and instead track the node above it, we have to split that
"bottom" node into two cases: The bottom COW node, and the node directly
above the base node (which may be an R/W filter or the bottom COW node).

Signed-off-by: Max Reitz 
---
 qapi/block-core.json |  4 
 block/stream.c   | 52 
 blockdev.c   |  2 +-
 3 files changed, 38 insertions(+), 20 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 38c4dbd7c3..3c54717870 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2516,6 +2516,10 @@
 # On successful completion the image file is updated to drop the backing file
 # and the BLOCK_JOB_COMPLETED event is emitted.
 #
+# In case @device is a filter node, block-stream modifies the first non-filter
+# overlay node below it to point to base's backing node (or NULL if @base was
+# not specified) instead of modifying @device itself.
+#
 # @job-id: identifier for the newly-created block job. If
 #  omitted, the device name will be used. (Since 2.7)
 #
diff --git a/block/stream.c b/block/stream.c
index 4c8b89884a..bd4a351dae 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -31,7 +31,8 @@ enum {
 
 typedef struct StreamBlockJob {
 BlockJob common;
-BlockDriverState *bottom;
+BlockDriverState *bottom_cow_node;
+BlockDriverState *above_base;
 BlockdevOnError on_error;
 char *backing_file_str;
 bool bs_read_only;
@@ -54,7 +55,7 @@ static void stream_abort(Job *job)
 
 if (s->chain_frozen) {
 BlockJob *bjob = &s->common;
-bdrv_unfreeze_chain(blk_bs(bjob->blk), s->bottom);
+bdrv_unfreeze_chain(blk_bs(bjob->blk), s->above_base);
 }
 }
 
@@ -63,14 +64,15 @@ static int stream_prepare(Job *job)
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
 BlockJob *bjob = &s->common;
 BlockDriverState *bs = blk_bs(bjob->blk);
-BlockDriverState *base = backing_bs(s->bottom);
+BlockDriverState *unfiltered_bs = bdrv_skip_rw_filters(bs);
+BlockDriverState *base = bdrv_filtered_bs(s->above_base);
 Error *local_err = NULL;
 int ret = 0;
 
-bdrv_unfreeze_chain(bs, s->bottom);
+bdrv_unfreeze_chain(bs, s->above_base);
 s->chain_frozen = false;
 
-if (bs->backing) {
+if (bdrv_filtered_cow_child(unfiltered_bs)) {
 const char *base_id = NULL, *base_fmt = NULL;
 if (base) {
 base_id = s->backing_file_str;
@@ -78,8 +80,8 @@ static int stream_prepare(Job *job)
 base_fmt = base->drv->format_name;
 }
 }
-bdrv_set_backing_hd(bs, base, &local_err);
-ret = bdrv_change_backing_file(bs, base_id, base_fmt);
+bdrv_set_backing_hd(unfiltered_bs, base, &local_err);
+ret = bdrv_change_backing_file(unfiltered_bs, base_id, base_fmt);
 if (local_err) {
 error_report_err(local_err);
 return -EPERM;
@@ -110,7 +112,8 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
 BlockBackend *blk = s->common.blk;
 BlockDriverState *bs = blk_bs(blk);
-bool enable_cor = !backing_bs(s->bottom);
+BlockDriverState *unfiltered_bs = bdrv_skip_rw_filters(bs);
+bool enable_cor = !bdrv_filtered_bs(s->above_base);
 int64_t len;
 int64_t offset = 0;
 uint64_t delay_ns = 0;
@@ -119,7 +122,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
 int64_t n = 0; /* bytes */
 void *buf;
 
-if (bs == s->bottom) {
+if (unfiltered_bs == s->bottom_cow_node) {
 /* Nothing to stream */
 return 0;
 }
@@ -154,13 +157,14 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
 
 copy = false;
 
-ret = bdrv_is_allocated(bs, offset, STREAM_BUFFER_SIZE, &n);
+ret = bdrv_is_allocated(unfiltered_bs, offset, STREAM_BUFFER_SIZE, &n);
 if (ret == 1) {
 /* Allocated in the top, no need to copy.  */
 } else if (ret >= 0) {
 /* Copy if allocated in the intermediate images.  Limit to the
  * known-unallocated area [offset, offset+n*BDRV_SECTOR_SIZE).  */
-ret = bdrv_is_allocated_above(backing_bs(bs), s->bottom, true,
+ret = bdrv_is_allocated_above(bdrv_filtered_cow_bs(unfiltered_bs),
+  s->bottom_cow_node, true,
   offset, n, &n);
 /* Finish early if end of backing file has been reached */
 if (ret == 0 && n == 0) {
@@ -231,9 +235,16 @@ void stream_start(const char *job_id, BlockDriverState *bs,
 BlockDriverState *iter;
 bool bs_read_only;
 int basic_flags = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
-BlockDriverState *bottom = bdrv_find_overlay(bs, base);
+BlockDriverState *bottom

[Qemu-block] [PATCH v6 13/42] block: Use CAFs in block status functions

2019-08-09 Thread Max Reitz
Use the child access functions in the block status inquiry functions as
appropriate.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/io.c | 19 ++-
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/block/io.c b/block/io.c
index 4d6cf4b3c2..c5a8e3e6a3 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2180,11 +2180,12 @@ static int coroutine_fn 
bdrv_co_block_status(BlockDriverState *bs,
 if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) {
 ret |= BDRV_BLOCK_ALLOCATED;
 } else if (want_zero) {
+BlockDriverState *cow_bs = bdrv_filtered_cow_bs(bs);
+
 if (bdrv_unallocated_blocks_are_zero(bs)) {
 ret |= BDRV_BLOCK_ZERO;
-} else if (bs->backing) {
-BlockDriverState *bs2 = bs->backing->bs;
-int64_t size2 = bdrv_getlength(bs2);
+} else if (cow_bs) {
+int64_t size2 = bdrv_getlength(cow_bs);
 
 if (size2 >= 0 && offset >= size2) {
 ret |= BDRV_BLOCK_ZERO;
@@ -2250,7 +2251,7 @@ static int coroutine_fn 
bdrv_co_block_status_above(BlockDriverState *bs,
 bool first = true;
 
 assert(bs != base);
-for (p = bs; p != base; p = backing_bs(p)) {
+for (p = bs; p != base; p = bdrv_filtered_bs(p)) {
 ret = bdrv_co_block_status(p, want_zero, offset, bytes, pnum, map,
file);
 if (ret < 0) {
@@ -2336,7 +2337,7 @@ int bdrv_block_status_above(BlockDriverState *bs, 
BlockDriverState *base,
 int bdrv_block_status(BlockDriverState *bs, int64_t offset, int64_t bytes,
   int64_t *pnum, int64_t *map, BlockDriverState **file)
 {
-return bdrv_block_status_above(bs, backing_bs(bs),
+return bdrv_block_status_above(bs, bdrv_filtered_bs(bs),
offset, bytes, pnum, map, file);
 }
 
@@ -2346,9 +2347,9 @@ int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, 
int64_t offset,
 int ret;
 int64_t dummy;
 
-ret = bdrv_common_block_status_above(bs, backing_bs(bs), false, offset,
- bytes, pnum ? pnum : &dummy, NULL,
- NULL);
+ret = bdrv_common_block_status_above(bs, bdrv_filtered_bs(bs), false,
+ offset, bytes, pnum ? pnum : &dummy,
+ NULL, NULL);
 if (ret < 0) {
 return ret;
 }
@@ -2411,7 +2412,7 @@ int bdrv_is_allocated_above(BlockDriverState *top,
 break;
 }
 
-intermediate = backing_bs(intermediate);
+intermediate = bdrv_filtered_bs(intermediate);
 }
 
 *pnum = n;
-- 
2.21.0




[Qemu-block] [PATCH v6 27/42] commit: Deal with filters

2019-08-09 Thread Max Reitz
This includes some permission limiting (for example, we only need to
take the RESIZE permission if the base is smaller than the top).

Signed-off-by: Max Reitz 
---
 block/block-backend.c | 16 +---
 block/commit.c| 96 +++
 blockdev.c|  6 ++-
 3 files changed, 85 insertions(+), 33 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index c13c5c83b0..0bc592d023 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2180,11 +2180,17 @@ int blk_commit_all(void)
 AioContext *aio_context = blk_get_aio_context(blk);
 
 aio_context_acquire(aio_context);
-if (blk_is_inserted(blk) && blk->root->bs->backing) {
-int ret = bdrv_commit(blk->root->bs);
-if (ret < 0) {
-aio_context_release(aio_context);
-return ret;
+if (blk_is_inserted(blk)) {
+BlockDriverState *non_filter;
+
+/* Legacy function, so skip implicit filters */
+non_filter = bdrv_skip_implicit_filters(blk->root->bs);
+if (bdrv_filtered_cow_child(non_filter)) {
+int ret = bdrv_commit(non_filter);
+if (ret < 0) {
+aio_context_release(aio_context);
+return ret;
+}
 }
 }
 aio_context_release(aio_context);
diff --git a/block/commit.c b/block/commit.c
index 5a7672c7c7..40d1c8eeac 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -37,6 +37,7 @@ typedef struct CommitBlockJob {
 BlockBackend *top;
 BlockBackend *base;
 BlockDriverState *base_bs;
+BlockDriverState *above_base;
 BlockdevOnError on_error;
 bool base_read_only;
 bool chain_frozen;
@@ -110,7 +111,7 @@ static void commit_abort(Job *job)
  * XXX Can (or should) we somehow keep 'consistent read' blocked even
  * after the failed/cancelled commit job is gone? If we already wrote
  * something to base, the intermediate images aren't valid any more. */
-bdrv_replace_node(s->commit_top_bs, backing_bs(s->commit_top_bs),
+bdrv_replace_node(s->commit_top_bs, s->commit_top_bs->backing->bs,
   &error_abort);
 
 bdrv_unref(s->commit_top_bs);
@@ -174,7 +175,7 @@ static int coroutine_fn commit_run(Job *job, Error **errp)
 break;
 }
 /* Copy if allocated above the base */
-ret = bdrv_is_allocated_above(blk_bs(s->top), blk_bs(s->base), false,
+ret = bdrv_is_allocated_above(blk_bs(s->top), s->above_base, true,
   offset, COMMIT_BUFFER_SIZE, &n);
 copy = (ret == 1);
 trace_commit_one_iteration(s, offset, n, ret);
@@ -267,15 +268,35 @@ void commit_start(const char *job_id, BlockDriverState 
*bs,
 CommitBlockJob *s;
 BlockDriverState *iter;
 BlockDriverState *commit_top_bs = NULL;
+BlockDriverState *filtered_base;
 Error *local_err = NULL;
+int64_t base_size, top_size;
+uint64_t perms, iter_shared_perms;
 int ret;
 
 assert(top != bs);
-if (top == base) {
+if (bdrv_skip_rw_filters(top) == bdrv_skip_rw_filters(base)) {
 error_setg(errp, "Invalid files for merge: top and base are the same");
 return;
 }
 
+base_size = bdrv_getlength(base);
+if (base_size < 0) {
+error_setg_errno(errp, -base_size, "Could not inquire base image 
size");
+return;
+}
+
+top_size = bdrv_getlength(top);
+if (top_size < 0) {
+error_setg_errno(errp, -top_size, "Could not inquire top image size");
+return;
+}
+
+perms = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE;
+if (base_size < top_size) {
+perms |= BLK_PERM_RESIZE;
+}
+
 s = block_job_create(job_id, &commit_job_driver, NULL, bs, 0, BLK_PERM_ALL,
  speed, creation_flags, NULL, NULL, errp);
 if (!s) {
@@ -315,17 +336,43 @@ void commit_start(const char *job_id, BlockDriverState 
*bs,
 
 s->commit_top_bs = commit_top_bs;
 
-/* Block all nodes between top and base, because they will
- * disappear from the chain after this operation. */
-assert(bdrv_chain_contains(top, base));
-for (iter = top; iter != base; iter = backing_bs(iter)) {
-/* XXX BLK_PERM_WRITE needs to be allowed so we don't block ourselves
- * at s->base (if writes are blocked for a node, they are also blocked
- * for its backing file). The other options would be a second filter
- * driver above s->base. */
+/*
+ * Block all nodes between top and base, because they will
+ * disappear from the chain after this operation.
+ * Note that this assumes that the user is fine with removing all
+ * nodes (including R/W filters) between top and base.  Assuring
+ * this is the responsibility of the interface (i.e. whoever calls
+ * commit_start()).
+ */
+s->above_base = bdrv_find_overlay(t

[Qemu-block] [PATCH v6 17/42] block: Use CAFs in bdrv_refresh_limits()

2019-08-09 Thread Max Reitz
Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/io.c | 14 --
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/block/io.c b/block/io.c
index bcc770d336..dca4689b2f 100644
--- a/block/io.c
+++ b/block/io.c
@@ -135,6 +135,8 @@ static void bdrv_merge_limits(BlockLimits *dst, const 
BlockLimits *src)
 void bdrv_refresh_limits(BlockDriverState *bs, Error **errp)
 {
 BlockDriver *drv = bs->drv;
+BlockDriverState *storage_bs = bdrv_storage_bs(bs);
+BlockDriverState *cow_bs = bdrv_filtered_cow_bs(bs);
 Error *local_err = NULL;
 
 memset(&bs->bl, 0, sizeof(bs->bl));
@@ -148,13 +150,13 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error 
**errp)
 drv->bdrv_aio_preadv) ? 1 : 512;
 
 /* Take some limits from the children as a default */
-if (bs->file) {
-bdrv_refresh_limits(bs->file->bs, &local_err);
+if (storage_bs) {
+bdrv_refresh_limits(storage_bs, &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
-bdrv_merge_limits(&bs->bl, &bs->file->bs->bl);
+bdrv_merge_limits(&bs->bl, &storage_bs->bl);
 } else {
 bs->bl.min_mem_alignment = 512;
 bs->bl.opt_mem_alignment = getpagesize();
@@ -163,13 +165,13 @@ void bdrv_refresh_limits(BlockDriverState *bs, Error 
**errp)
 bs->bl.max_iov = IOV_MAX;
 }
 
-if (bs->backing) {
-bdrv_refresh_limits(bs->backing->bs, &local_err);
+if (cow_bs) {
+bdrv_refresh_limits(cow_bs, &local_err);
 if (local_err) {
 error_propagate(errp, local_err);
 return;
 }
-bdrv_merge_limits(&bs->bl, &bs->backing->bs->bl);
+bdrv_merge_limits(&bs->bl, &cow_bs->bl);
 }
 
 /* Then let the driver override it */
-- 
2.21.0




[Qemu-block] [PATCH v6 20/42] block/snapshot: Fix fallback

2019-08-09 Thread Max Reitz
If the top node's driver does not provide snapshot functionality and we
want to fall back to a node down the chain, we need to snapshot all
non-COW children.  For simplicity's sake, just do not fall back if there
is more than one such child.

bdrv_snapshot_goto() becomes a bit weird because we may have to redirect
the actual child pointer, so it only works if the fallback child is
bs->file or bs->backing (and then we have to find out which it is).

Suggested-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Max Reitz 
---
 block/snapshot.c | 100 +--
 1 file changed, 79 insertions(+), 21 deletions(-)

diff --git a/block/snapshot.c b/block/snapshot.c
index f2f48f926a..35403c167f 100644
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -146,6 +146,32 @@ bool bdrv_snapshot_find_by_id_and_name(BlockDriverState 
*bs,
 return ret;
 }
 
+/**
+ * Return the child BDS to which we can fall back if the given BDS
+ * does not support snapshots.
+ * Return NULL if there is no BDS to (safely) fall back to.
+ */
+static BlockDriverState *bdrv_snapshot_fallback(BlockDriverState *bs)
+{
+BlockDriverState *child_bs = NULL;
+BdrvChild *child;
+
+QLIST_FOREACH(child, &bs->children, next) {
+if (child == bdrv_filtered_cow_child(bs)) {
+/* Ignore: COW children need not be included in snapshots */
+continue;
+}
+
+if (child_bs) {
+/* Cannot fall back to a single child if there are multiple */
+return NULL;
+}
+child_bs = child->bs;
+}
+
+return child_bs;
+}
+
 int bdrv_can_snapshot(BlockDriverState *bs)
 {
 BlockDriver *drv = bs->drv;
@@ -154,8 +180,9 @@ int bdrv_can_snapshot(BlockDriverState *bs)
 }
 
 if (!drv->bdrv_snapshot_create) {
-if (bs->file != NULL) {
-return bdrv_can_snapshot(bs->file->bs);
+BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs);
+if (fallback_bs) {
+return bdrv_can_snapshot(fallback_bs);
 }
 return 0;
 }
@@ -167,14 +194,15 @@ int bdrv_snapshot_create(BlockDriverState *bs,
  QEMUSnapshotInfo *sn_info)
 {
 BlockDriver *drv = bs->drv;
+BlockDriverState *fallback_bs = bdrv_snapshot_fallback(bs);
 if (!drv) {
 return -ENOMEDIUM;
 }
 if (drv->bdrv_snapshot_create) {
 return drv->bdrv_snapshot_create(bs, sn_info);
 }
-if (bs->file) {
-return bdrv_snapshot_create(bs->file->bs, sn_info);
+if (fallback_bs) {
+return bdrv_snapshot_create(fallback_bs, sn_info);
 }
 return -ENOTSUP;
 }
@@ -184,6 +212,7 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
Error **errp)
 {
 BlockDriver *drv = bs->drv;
+BlockDriverState *fallback_bs;
 int ret, open_ret;
 
 if (!drv) {
@@ -204,39 +233,66 @@ int bdrv_snapshot_goto(BlockDriverState *bs,
 return ret;
 }
 
-if (bs->file) {
-BlockDriverState *file;
-QDict *options = qdict_clone_shallow(bs->options);
+fallback_bs = bdrv_snapshot_fallback(bs);
+if (fallback_bs) {
+QDict *options;
 QDict *file_options;
 Error *local_err = NULL;
+bool is_backing_child;
+BdrvChild **child_pointer;
+
+/*
+ * We need a pointer to the fallback child pointer, so let us
+ * see whether the child is referenced by a field in the BDS
+ * object.
+ */
+if (fallback_bs == bs->file->bs) {
+is_backing_child = false;
+child_pointer = &bs->file;
+} else if (fallback_bs == bs->backing->bs) {
+is_backing_child = true;
+child_pointer = &bs->backing;
+} else {
+/*
+ * The fallback child is not referenced by a field in the
+ * BDS object.  We cannot go on then.
+ */
+error_setg(errp, "Block driver does not support snapshots");
+return -ENOTSUP;
+}
+
+options = qdict_clone_shallow(bs->options);
 
-file = bs->file->bs;
 /* Prevent it from getting deleted when detached from bs */
-bdrv_ref(file);
+bdrv_ref(fallback_bs);
 
-qdict_extract_subqdict(options, &file_options, "file.");
+qdict_extract_subqdict(options, &file_options,
+   is_backing_child ? "backing." : "file.");
 qobject_unref(file_options);
-qdict_put_str(options, "file", bdrv_get_node_name(file));
+qdict_put_str(options, is_backing_child ? "backing" : "file",
+  bdrv_get_node_name(fallback_bs));
 
 if (drv->bdrv_close) {
 drv->bdrv_close(bs);
 }
-bdrv_unref_child(bs, bs->file);
-bs->file = NULL;
 
-ret = bdrv_snapshot_goto(file, snapshot_id, errp);
+assert(fallback_bs == (*child_pointer)->bs);
+bdrv_unref_child(bs, *child_pointer);
+ 

[Qemu-block] [PATCH v6 19/42] block: Use CAF in bdrv_co_rw_vmstate()

2019-08-09 Thread Max Reitz
If a node whose driver does not provide VM state functions has a
metadata child, the VM state should probably go there; if it is a
filter, the VM state should probably go there.  It follows that we
should generally go down to the primary child.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/io.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/block/io.c b/block/io.c
index dca4689b2f..e222d91893 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2434,6 +2434,7 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector 
*qiov, int64_t pos,
bool is_read)
 {
 BlockDriver *drv = bs->drv;
+BlockDriverState *child_bs = bdrv_primary_bs(bs);
 int ret = -ENOTSUP;
 
 bdrv_inc_in_flight(bs);
@@ -2446,8 +2447,8 @@ bdrv_co_rw_vmstate(BlockDriverState *bs, QEMUIOVector 
*qiov, int64_t pos,
 } else {
 ret = drv->bdrv_save_vmstate(bs, qiov, pos);
 }
-} else if (bs->file) {
-ret = bdrv_co_rw_vmstate(bs->file->bs, qiov, pos, is_read);
+} else if (child_bs) {
+ret = bdrv_co_rw_vmstate(child_bs, qiov, pos, is_read);
 }
 
 bdrv_dec_in_flight(bs);
-- 
2.21.0




[Qemu-block] [PATCH v6 18/42] block: Use CAFs in bdrv_refresh_filename()

2019-08-09 Thread Max Reitz
bdrv_refresh_filename() and the kind of related bdrv_dirname() should
look to the primary child when they wish to copy the underlying file's
filename.

Signed-off-by: Max Reitz 
---
 block.c | 29 +
 1 file changed, 21 insertions(+), 8 deletions(-)

diff --git a/block.c b/block.c
index 064cc99857..a467b175c6 100644
--- a/block.c
+++ b/block.c
@@ -6432,6 +6432,7 @@ void bdrv_refresh_filename(BlockDriverState *bs)
 {
 BlockDriver *drv = bs->drv;
 BdrvChild *child;
+BlockDriverState *primary_child_bs;
 QDict *opts;
 bool backing_overridden;
 bool generate_json_filename; /* Whether our default implementation should
@@ -6500,20 +6501,30 @@ void bdrv_refresh_filename(BlockDriverState *bs)
 qobject_unref(bs->full_open_options);
 bs->full_open_options = opts;
 
+primary_child_bs = bdrv_primary_bs(bs);
+
 if (drv->bdrv_refresh_filename) {
 /* Obsolete information is of no use here, so drop the old file name
  * information before refreshing it */
 bs->exact_filename[0] = '\0';
 
 drv->bdrv_refresh_filename(bs);
-} else if (bs->file) {
-/* Try to reconstruct valid information from the underlying file */
+} else if (primary_child_bs) {
+/*
+ * Try to reconstruct valid information from the underlying
+ * file -- this only works for format nodes (filter nodes
+ * cannot be probed and as such must be selected by the user
+ * either through an options dict, or through a special
+ * filename which the filter driver must construct in its
+ * .bdrv_refresh_filename() implementation).
+ */
 
 bs->exact_filename[0] = '\0';
 
 /*
  * We can use the underlying file's filename if:
  * - it has a filename,
+ * - the current BDS is not a filter,
  * - the file is a protocol BDS, and
  * - opening that file (as this BDS's format) will automatically create
  *   the BDS tree we have right now, that is:
@@ -6522,11 +6533,11 @@ void bdrv_refresh_filename(BlockDriverState *bs)
  *   - no non-file child of this BDS has been overridden by the user
  *   Both of these conditions are represented by 
generate_json_filename.
  */
-if (bs->file->bs->exact_filename[0] &&
-bs->file->bs->drv->bdrv_file_open &&
-!generate_json_filename)
+if (primary_child_bs->exact_filename[0] &&
+primary_child_bs->drv->bdrv_file_open &&
+!drv->is_filter && !generate_json_filename)
 {
-strcpy(bs->exact_filename, bs->file->bs->exact_filename);
+strcpy(bs->exact_filename, primary_child_bs->exact_filename);
 }
 }
 
@@ -6543,6 +6554,7 @@ void bdrv_refresh_filename(BlockDriverState *bs)
 char *bdrv_dirname(BlockDriverState *bs, Error **errp)
 {
 BlockDriver *drv = bs->drv;
+BlockDriverState *child_bs;
 
 if (!drv) {
 error_setg(errp, "Node '%s' is ejected", bs->node_name);
@@ -6553,8 +6565,9 @@ char *bdrv_dirname(BlockDriverState *bs, Error **errp)
 return drv->bdrv_dirname(bs, errp);
 }
 
-if (bs->file) {
-return bdrv_dirname(bs->file->bs, errp);
+child_bs = bdrv_primary_bs(bs);
+if (child_bs) {
+return bdrv_dirname(child_bs, errp);
 }
 
 bdrv_refresh_filename(bs);
-- 
2.21.0




[Qemu-block] [PATCH v6 16/42] block: Flush all children in generic code

2019-08-09 Thread Max Reitz
If the driver does not support .bdrv_co_flush() so bdrv_co_flush()
itself has to flush the children of the given node, it should not flush
just bs->file->bs, but in fact all children.

In any case, the BLKDBG_EVENT() should be emitted on the primary child,
because that is where a blkdebug node would be if there is any.

Suggested-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Max Reitz 
---
 block/io.c | 23 +--
 1 file changed, 17 insertions(+), 6 deletions(-)

diff --git a/block/io.c b/block/io.c
index c5a8e3e6a3..bcc770d336 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2572,6 +2572,8 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
 
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
+BdrvChild *primary_child = bdrv_primary_child(bs);
+BdrvChild *child;
 int current_gen;
 int ret = 0;
 
@@ -2601,7 +2603,7 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 }
 
 /* Write back cached data to the OS even with cache=unsafe */
-BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
+BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_OS);
 if (bs->drv->bdrv_co_flush_to_os) {
 ret = bs->drv->bdrv_co_flush_to_os(bs);
 if (ret < 0) {
@@ -2611,15 +2613,15 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 
 /* But don't actually force it to the disk with cache=unsafe */
 if (bs->open_flags & BDRV_O_NO_FLUSH) {
-goto flush_parent;
+goto flush_children;
 }
 
 /* Check if we really need to flush anything */
 if (bs->flushed_gen == current_gen) {
-goto flush_parent;
+goto flush_children;
 }
 
-BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK);
+BLKDBG_EVENT(primary_child, BLKDBG_FLUSH_TO_DISK);
 if (!bs->drv) {
 /* bs->drv->bdrv_co_flush() might have ejected the BDS
  * (even in case of apparent success) */
@@ -2663,8 +2665,17 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
  * in the case of cache=unsafe, so there are no useless flushes.
  */
-flush_parent:
-ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
+flush_children:
+ret = 0;
+QLIST_FOREACH(child, &bs->children, next) {
+int this_child_ret;
+
+this_child_ret = bdrv_co_flush(child->bs);
+if (!ret) {
+ret = this_child_ret;
+}
+}
+
 out:
 /* Notify any pending flushes that we have completed */
 if (ret == 0) {
-- 
2.21.0




[Qemu-block] [PATCH v6 09/42] block: Include filters when freezing backing chain

2019-08-09 Thread Max Reitz
In order to make filters work in backing chains, the associated
functions must be able to deal with them and freeze all filter links, be
they COW or R/W filter links.

In the process, rename these functions to reflect that they now act on
generalized chains of filter nodes instead of backing chains alone.

While at it, add some comments that note which functions require their
caller to ensure that a given child link is not frozen, and how the
callers do so.

Signed-off-by: Max Reitz 
---
 include/block/block.h | 10 +++---
 block.c   | 81 +--
 block/commit.c|  8 ++---
 block/mirror.c|  4 +--
 block/stream.c|  8 ++---
 5 files changed, 62 insertions(+), 49 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index 50a07c1c33..f6f09b95cd 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -364,11 +364,11 @@ int bdrv_drop_intermediate(BlockDriverState *top, 
BlockDriverState *base,
 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
 BlockDriverState *bs);
 BlockDriverState *bdrv_find_base(BlockDriverState *bs);
-bool bdrv_is_backing_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
-  Error **errp);
-int bdrv_freeze_backing_chain(BlockDriverState *bs, BlockDriverState *base,
-  Error **errp);
-void bdrv_unfreeze_backing_chain(BlockDriverState *bs, BlockDriverState *base);
+bool bdrv_is_chain_frozen(BlockDriverState *bs, BlockDriverState *base,
+  Error **errp);
+int bdrv_freeze_chain(BlockDriverState *bs, BlockDriverState *base,
+  Error **errp);
+void bdrv_unfreeze_chain(BlockDriverState *bs, BlockDriverState *base);
 
 
 typedef struct BdrvCheckResult {
diff --git a/block.c b/block.c
index adf82efb0e..650c00d182 100644
--- a/block.c
+++ b/block.c
@@ -2303,12 +2303,15 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
  * If @new_bs is not NULL, bdrv_check_perm() must be called beforehand, as this
  * function uses bdrv_set_perm() to update the permissions according to the new
  * reference that @new_bs gets.
+ *
+ * Callers must ensure that child->frozen is false.
  */
 static void bdrv_replace_child(BdrvChild *child, BlockDriverState *new_bs)
 {
 BlockDriverState *old_bs = child->bs;
 uint64_t perm, shared_perm;
 
+/* Asserts that child->frozen == false */
 bdrv_replace_child_noperm(child, new_bs);
 
 /*
@@ -2468,6 +2471,7 @@ static void bdrv_detach_child(BdrvChild *child)
 g_free(child);
 }
 
+/* Callers must ensure that child->frozen is false. */
 void bdrv_root_unref_child(BdrvChild *child)
 {
 BlockDriverState *child_bs;
@@ -2477,10 +2481,6 @@ void bdrv_root_unref_child(BdrvChild *child)
 bdrv_unref(child_bs);
 }
 
-/**
- * Clear all inherits_from pointers from children and grandchildren of
- * @root that point to @root, where necessary.
- */
 static void bdrv_unset_inherits_from(BlockDriverState *root, BdrvChild *child)
 {
 BdrvChild *c;
@@ -2505,6 +2505,7 @@ static void bdrv_unset_inherits_from(BlockDriverState 
*root, BdrvChild *child)
 }
 }
 
+/* Callers must ensure that child->frozen is false. */
 void bdrv_unref_child(BlockDriverState *parent, BdrvChild *child)
 {
 if (child == NULL) {
@@ -2548,7 +2549,7 @@ void bdrv_set_backing_hd(BlockDriverState *bs, 
BlockDriverState *backing_hd,
 bool update_inherits_from = bdrv_chain_contains(bs, backing_hd) &&
 bdrv_inherits_from_recursive(backing_hd, bs);
 
-if (bdrv_is_backing_chain_frozen(bs, child_bs(bs->backing), errp)) {
+if (bdrv_is_chain_frozen(bs, child_bs(bs->backing), errp)) {
 return;
 }
 
@@ -2557,6 +2558,7 @@ void bdrv_set_backing_hd(BlockDriverState *bs, 
BlockDriverState *backing_hd,
 }
 
 if (bs->backing) {
+/* Cannot be frozen, we checked that above */
 bdrv_unref_child(bs, bs->backing);
 }
 
@@ -3674,8 +3676,7 @@ static int bdrv_reopen_parse_backing(BDRVReopenState 
*reopen_state,
 return -EPERM;
 }
 /* Check if the backing link that we want to replace is frozen */
-if (bdrv_is_backing_chain_frozen(overlay_bs, backing_bs(overlay_bs),
- errp)) {
+if (bdrv_is_chain_frozen(overlay_bs, backing_bs(overlay_bs), errp)) {
 return -EPERM;
 }
 reopen_state->replace_backing_bs = true;
@@ -4029,6 +4030,7 @@ static void bdrv_close(BlockDriverState *bs)
 
 if (bs->drv) {
 if (bs->drv->bdrv_close) {
+/* Must unfreeze all children, so bdrv_unref_child() works */
 bs->drv->bdrv_close(bs);
 }
 bs->drv = NULL;
@@ -4398,20 +4400,22 @@ BlockDriverState *bdrv_find_base(BlockDriverState *bs)
 }
 
 /*
- * Return true if at least one of the backing links between @bs and
- * @base is frozen. @errp is set if that's the case.
+ *

[Qemu-block] [PATCH v6 06/42] qcow2: Implement .bdrv_storage_child()

2019-08-09 Thread Max Reitz
Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/block/qcow2.c b/block/qcow2.c
index 039bdc2f7e..f8570d6210 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -5086,6 +5086,13 @@ void qcow2_signal_corruption(BlockDriverState *bs, bool 
fatal, int64_t offset,
 s->signaled_corruption = true;
 }
 
+static BdrvChild *qcow2_storage_child(BlockDriverState *bs)
+{
+BDRVQcow2State *s = bs->opaque;
+
+return s->data_file;
+}
+
 static QemuOptsList qcow2_create_opts = {
 .name = "qcow2-create-opts",
 .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head),
@@ -5232,6 +5239,8 @@ BlockDriver bdrv_qcow2 = {
 .bdrv_reopen_bitmaps_rw = qcow2_reopen_bitmaps_rw,
 .bdrv_can_store_new_dirty_bitmap = qcow2_can_store_new_dirty_bitmap,
 .bdrv_remove_persistent_dirty_bitmap = 
qcow2_remove_persistent_dirty_bitmap,
+
+.bdrv_storage_child = qcow2_storage_child,
 };
 
 static void bdrv_qcow2_init(void)
-- 
2.21.0




[Qemu-block] [PATCH v6 04/42] block: Add child access functions

2019-08-09 Thread Max Reitz
There are BDS children that the general block layer code can access,
namely bs->file and bs->backing.  Since the introduction of filters and
external data files, their meaning is not quite clear.  bs->backing can
be a COW source, or it can be an R/W-filtered child; bs->file can be an
R/W-filtered child, it can be data and metadata storage, or it can be
just metadata storage.

This overloading really is not helpful.  This patch adds function that
retrieve the correct child for each exact purpose.  Later patches in
this series will make use of them.  Doing so will allow us to handle
filter nodes and external data files in a meaningful way.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/block_int.h | 57 --
 block.c   | 99 +++
 2 files changed, 153 insertions(+), 3 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 60d9261f8e..6c60abc4c3 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -90,9 +90,11 @@ struct BlockDriver {
 int instance_size;
 
 /* set to true if the BlockDriver is a block filter. Block filters pass
- * certain callbacks that refer to data (see block.c) to their bs->file if
- * the driver doesn't implement them. Drivers that do not wish to forward
- * must implement them and return -ENOTSUP.
+ * certain callbacks that refer to data (see block.c) to their bs->file
+ * or bs->backing (whichever one exists) if the driver doesn't implement
+ * them. Drivers that do not wish to forward must implement them and return
+ * -ENOTSUP.
+ * Note that filters are not allowed to modify data.
  */
 bool is_filter;
 /* for snapshots block filter like Quorum can implement the
@@ -562,6 +564,13 @@ struct BlockDriver {
  * If this pointer is NULL, the array is considered empty.
  * "filename" and "driver" are always considered strong. */
 const char *const *strong_runtime_opts;
+
+/**
+ * Return the data storage child, if there is exactly one.  If
+ * this function is not implemented, the block layer will assume
+ * bs->file to be this child.
+ */
+BdrvChild *(*bdrv_storage_child)(BlockDriverState *bs);
 };
 
 typedef struct BlockLimits {
@@ -1276,4 +1285,46 @@ int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, 
uint64_t src_offset,
 
 int refresh_total_sectors(BlockDriverState *bs, int64_t hint);
 
+BdrvChild *bdrv_filtered_cow_child(BlockDriverState *bs);
+BdrvChild *bdrv_filtered_rw_child(BlockDriverState *bs);
+BdrvChild *bdrv_filtered_child(BlockDriverState *bs);
+BdrvChild *bdrv_metadata_child(BlockDriverState *bs);
+BdrvChild *bdrv_storage_child(BlockDriverState *bs);
+BdrvChild *bdrv_primary_child(BlockDriverState *bs);
+
+static inline BlockDriverState *child_bs(BdrvChild *child)
+{
+return child ? child->bs : NULL;
+}
+
+static inline BlockDriverState *bdrv_filtered_cow_bs(BlockDriverState *bs)
+{
+return child_bs(bdrv_filtered_cow_child(bs));
+}
+
+static inline BlockDriverState *bdrv_filtered_rw_bs(BlockDriverState *bs)
+{
+return child_bs(bdrv_filtered_rw_child(bs));
+}
+
+static inline BlockDriverState *bdrv_filtered_bs(BlockDriverState *bs)
+{
+return child_bs(bdrv_filtered_child(bs));
+}
+
+static inline BlockDriverState *bdrv_metadata_bs(BlockDriverState *bs)
+{
+return child_bs(bdrv_metadata_child(bs));
+}
+
+static inline BlockDriverState *bdrv_storage_bs(BlockDriverState *bs)
+{
+return child_bs(bdrv_storage_child(bs));
+}
+
+static inline BlockDriverState *bdrv_primary_bs(BlockDriverState *bs)
+{
+return child_bs(bdrv_primary_child(bs));
+}
+
 #endif /* BLOCK_INT_H */
diff --git a/block.c b/block.c
index aae3417dd5..f6c5f8c3eb 100644
--- a/block.c
+++ b/block.c
@@ -6556,3 +6556,102 @@ bool bdrv_can_store_new_dirty_bitmap(BlockDriverState 
*bs, const char *name,
 
 return drv->bdrv_can_store_new_dirty_bitmap(bs, name, granularity, errp);
 }
+
+/*
+ * Return the child that @bs acts as an overlay for, and from which data may be
+ * copied in COW or COR operations.  Usually this is the backing file.
+ */
+BdrvChild *bdrv_filtered_cow_child(BlockDriverState *bs)
+{
+if (!bs || !bs->drv) {
+return NULL;
+}
+
+if (bs->drv->is_filter) {
+return NULL;
+}
+
+return bs->backing;
+}
+
+/*
+ * If @bs acts as a pass-through filter for one of its children,
+ * return that child.  "Pass-through" means that write operations to
+ * @bs are forwarded to that child instead of triggering COW.
+ */
+BdrvChild *bdrv_filtered_rw_child(BlockDriverState *bs)
+{
+if (!bs || !bs->drv) {
+return NULL;
+}
+
+if (!bs->drv->is_filter) {
+return NULL;
+}
+
+/* Only one of @backing or @file may be used */
+assert(!(bs->backing && bs->file));
+
+return bs->backing ?: bs->file;
+}
+
+/*
+ * Return any filtered child, independently of how it reacts to write

[Qemu-block] [PATCH v6 15/42] block: Re-evaluate backing file handling in reopen

2019-08-09 Thread Max Reitz
Reopening a node's backing child needs a bit of special handling because
the "backing" child has different defaults than all other children
(among other things).  Adding filter support here is a bit more
difficult than just using the child access functions.  In fact, we often
have to directly use bs->backing because these functions are about the
"backing" child (which may or may not be the COW backing file).

Signed-off-by: Max Reitz 
---
 block.c | 45 ++---
 1 file changed, 38 insertions(+), 7 deletions(-)

diff --git a/block.c b/block.c
index 42abbaf0ba..064cc99857 100644
--- a/block.c
+++ b/block.c
@@ -3660,25 +3660,56 @@ static int bdrv_reopen_parse_backing(BDRVReopenState 
*reopen_state,
 }
 }
 
+/*
+ * Ensure that @bs can really handle backing files, because we are
+ * about to give it one (or swap the existing one)
+ */
+if (bs->drv->is_filter) {
+/* Filters always have a file or a backing child */
+if (!bs->backing) {
+error_setg(errp, "'%s' is a %s filter node that does not support a 
"
+   "backing child", bs->node_name, bs->drv->format_name);
+return -EINVAL;
+}
+} else if (!bs->drv->supports_backing) {
+error_setg(errp, "Driver '%s' of node '%s' does not support backing "
+   "files", bs->drv->format_name, bs->node_name);
+return -EINVAL;
+}
+
 /*
  * Find the "actual" backing file by skipping all links that point
  * to an implicit node, if any (e.g. a commit filter node).
+ * We cannot use any of the bdrv_skip_*() functions here because
+ * those return the first explicit node, while we are looking for
+ * its overlay here.
  */
 overlay_bs = bs;
-while (backing_bs(overlay_bs) && backing_bs(overlay_bs)->implicit) {
-overlay_bs = backing_bs(overlay_bs);
+while (bdrv_filtered_bs(overlay_bs) &&
+   bdrv_filtered_bs(overlay_bs)->implicit)
+{
+overlay_bs = bdrv_filtered_bs(overlay_bs);
 }
 
 /* If we want to replace the backing file we need some extra checks */
-if (new_backing_bs != backing_bs(overlay_bs)) {
+if (new_backing_bs != bdrv_filtered_bs(overlay_bs)) {
 /* Check for implicit nodes between bs and its backing file */
 if (bs != overlay_bs) {
 error_setg(errp, "Cannot change backing link if '%s' has "
"an implicit backing file", bs->node_name);
 return -EPERM;
 }
-/* Check if the backing link that we want to replace is frozen */
-if (bdrv_is_chain_frozen(overlay_bs, backing_bs(overlay_bs), errp)) {
+/*
+ * Check if the backing link that we want to replace is frozen.
+ * Note that
+ * bdrv_filtered_child(overlay_bs) == overlay_bs->backing,
+ * because we know that overlay_bs == bs, and that @bs
+ * either is an R/W filter that uses ->backing or a COW format
+ * with bs->drv->supports_backing == true.
+ */
+if (bdrv_is_chain_frozen(overlay_bs, child_bs(overlay_bs->backing),
+ errp))
+{
 return -EPERM;
 }
 reopen_state->replace_backing_bs = true;
@@ -3829,7 +3860,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, 
BlockReopenQueue *queue,
  * its metadata. Otherwise the 'backing' option can be omitted.
  */
 if (drv->supports_backing && reopen_state->backing_missing &&
-(backing_bs(reopen_state->bs) || reopen_state->bs->backing_file[0])) {
+(reopen_state->bs->backing || reopen_state->bs->backing_file[0])) {
 error_setg(errp, "backing is missing for '%s'",
reopen_state->bs->node_name);
 ret = -EINVAL;
@@ -3974,7 +4005,7 @@ void bdrv_reopen_commit(BDRVReopenState *reopen_state)
  * from bdrv_set_backing_hd()) has the new values.
  */
 if (reopen_state->replace_backing_bs) {
-BlockDriverState *old_backing_bs = backing_bs(bs);
+BlockDriverState *old_backing_bs = child_bs(bs->backing);
 assert(!old_backing_bs || !old_backing_bs->implicit);
 /* Abort the permission update on the backing bs we're detaching */
 if (old_backing_bs) {
-- 
2.21.0




[Qemu-block] [PATCH v6 08/42] block: bdrv_set_backing_hd() is about bs->backing

2019-08-09 Thread Max Reitz
bdrv_set_backing_hd() is a function that explicitly cares about the
bs->backing child.  Highlight that in its description and use
child_bs(bs->backing) instead of backing_bs(bs) to make it more obvious.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index 486c75d847..adf82efb0e 100644
--- a/block.c
+++ b/block.c
@@ -2539,7 +2539,7 @@ static bool bdrv_inherits_from_recursive(BlockDriverState 
*child,
 }
 
 /*
- * Sets the backing file link of a BDS. A new reference is created; callers
+ * Sets the bs->backing link of a BDS. A new reference is created; callers
  * which don't need their own reference any more must call bdrv_unref().
  */
 void bdrv_set_backing_hd(BlockDriverState *bs, BlockDriverState *backing_hd,
@@ -2548,7 +2548,7 @@ void bdrv_set_backing_hd(BlockDriverState *bs, 
BlockDriverState *backing_hd,
 bool update_inherits_from = bdrv_chain_contains(bs, backing_hd) &&
 bdrv_inherits_from_recursive(backing_hd, bs);
 
-if (bdrv_is_backing_chain_frozen(bs, backing_bs(bs), errp)) {
+if (bdrv_is_backing_chain_frozen(bs, child_bs(bs->backing), errp)) {
 return;
 }
 
-- 
2.21.0




[Qemu-block] [PATCH v6 07/42] block: *filtered_cow_child() for *has_zero_init()

2019-08-09 Thread Max Reitz
bdrv_has_zero_init() and the related bdrv_unallocated_blocks_are_zero()
should use bdrv_filtered_cow_child() if they want to check whether the
given BDS has a COW backing file.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block.c b/block.c
index bfa5e27850..486c75d847 100644
--- a/block.c
+++ b/block.c
@@ -5063,7 +5063,7 @@ int bdrv_has_zero_init(BlockDriverState *bs)
 
 /* If BS is a copy on write image, it is initialized to
the contents of the base image, which may not be zeroes.  */
-if (bs->backing) {
+if (bdrv_filtered_cow_child(bs)) {
 return 0;
 }
 if (bs->drv->bdrv_has_zero_init) {
@@ -5081,7 +5081,7 @@ bool bdrv_unallocated_blocks_are_zero(BlockDriverState 
*bs)
 {
 BlockDriverInfo bdi;
 
-if (bs->backing) {
+if (bdrv_filtered_cow_child(bs)) {
 return false;
 }
 
-- 
2.21.0




[Qemu-block] [PATCH v6 10/42] block: Drop bdrv_is_encrypted()

2019-08-09 Thread Max Reitz
The original purpose of bdrv_is_encrypted() was to inquire whether a BDS
can be used without the user entering a password or not.  It has not
been used for that purpose for quite some time.

Actually, it is not even fit for that purpose, because to answer that
question, it would have recursively query all of the given node's
children.

So now we have to decide in which direction we want to fix
bdrv_is_encrypted(): Recursively query all children, or drop it and just
use bs->encrypted to get the current node's status?

Nowadays, its only purpose is to report through bdrv_query_image_info()
whether the given image is encrypted or not.  For this purpose, it is
probably more interesting to see whether a given node itself is
encrypted or not (otherwise, a management application cannot discern for
certain which nodes are really encrypted and which just have encrypted
children).

Suggested-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Max Reitz 
---
 include/block/block.h | 1 -
 block.c   | 8 
 block/qapi.c  | 2 +-
 3 files changed, 1 insertion(+), 10 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index f6f09b95cd..9cfe77abaf 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -487,7 +487,6 @@ BlockDriverState *bdrv_next(BdrvNextIterator *it);
 void bdrv_next_cleanup(BdrvNextIterator *it);
 
 BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs);
-bool bdrv_is_encrypted(BlockDriverState *bs);
 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
  void *opaque, bool read_only);
 const char *bdrv_get_node_name(const BlockDriverState *bs);
diff --git a/block.c b/block.c
index 650c00d182..415c555bf5 100644
--- a/block.c
+++ b/block.c
@@ -4696,14 +4696,6 @@ bool bdrv_is_sg(BlockDriverState *bs)
 return bs->sg;
 }
 
-bool bdrv_is_encrypted(BlockDriverState *bs)
-{
-if (bs->backing && bs->backing->bs->encrypted) {
-return true;
-}
-return bs->encrypted;
-}
-
 const char *bdrv_get_format_name(BlockDriverState *bs)
 {
 return bs->drv ? bs->drv->format_name : NULL;
diff --git a/block/qapi.c b/block/qapi.c
index 15f1030264..9a185cba48 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -281,7 +281,7 @@ void bdrv_query_image_info(BlockDriverState *bs,
 info->virtual_size= size;
 info->actual_size = bdrv_get_allocated_file_size(bs);
 info->has_actual_size = info->actual_size >= 0;
-if (bdrv_is_encrypted(bs)) {
+if (bs->encrypted) {
 info->encrypted = true;
 info->has_encrypted = true;
 }
-- 
2.21.0




[Qemu-block] [PATCH v6 14/42] block: Use CAFs when working with backing chains

2019-08-09 Thread Max Reitz
Use child access functions when iterating through backing chains so
filters do not break the chain.

Signed-off-by: Max Reitz 
---
 block.c | 40 
 1 file changed, 28 insertions(+), 12 deletions(-)

diff --git a/block.c b/block.c
index 86b84bea21..42abbaf0ba 100644
--- a/block.c
+++ b/block.c
@@ -4376,7 +4376,8 @@ int bdrv_change_backing_file(BlockDriverState *bs,
 }
 
 /*
- * Finds the image layer in the chain that has 'bs' as its backing file.
+ * Finds the image layer in the chain that has 'bs' (or a filter on
+ * top of it) as its backing file.
  *
  * active is the current topmost image.
  *
@@ -4388,11 +4389,18 @@ int bdrv_change_backing_file(BlockDriverState *bs,
 BlockDriverState *bdrv_find_overlay(BlockDriverState *active,
 BlockDriverState *bs)
 {
-while (active && bs != backing_bs(active)) {
-active = backing_bs(active);
+bs = bdrv_skip_rw_filters(bs);
+active = bdrv_skip_rw_filters(active);
+
+while (active) {
+BlockDriverState *next = bdrv_backing_chain_next(active);
+if (bs == next) {
+return active;
+}
+active = next;
 }
 
-return active;
+return NULL;
 }
 
 /* Given a BDS, searches for the base layer. */
@@ -4544,9 +4552,7 @@ int bdrv_drop_intermediate(BlockDriverState *top, 
BlockDriverState *base,
  * other intermediate nodes have been dropped.
  * If 'top' is an implicit node (e.g. "commit_top") we should skip
  * it because no one inherits from it. We use explicit_top for that. */
-while (explicit_top && explicit_top->implicit) {
-explicit_top = backing_bs(explicit_top);
-}
+explicit_top = bdrv_skip_implicit_filters(explicit_top);
 update_inherits_from = bdrv_inherits_from_recursive(base, explicit_top);
 
 /* success - we can delete the intermediate states, and link top->base */
@@ -5014,7 +5020,7 @@ BlockDriverState *bdrv_lookup_bs(const char *device,
 bool bdrv_chain_contains(BlockDriverState *top, BlockDriverState *base)
 {
 while (top && top != base) {
-top = backing_bs(top);
+top = bdrv_filtered_bs(top);
 }
 
 return top != NULL;
@@ -5253,7 +5259,17 @@ BlockDriverState 
*bdrv_find_backing_image(BlockDriverState *bs,
 
 is_protocol = path_has_protocol(backing_file);
 
-for (curr_bs = bs; curr_bs->backing; curr_bs = curr_bs->backing->bs) {
+/*
+ * Being largely a legacy function, skip any filters here
+ * (because filters do not have normal filenames, so they cannot
+ * match anyway; and allowing json:{} filenames is a bit out of
+ * scope).
+ */
+for (curr_bs = bdrv_skip_rw_filters(bs);
+ bdrv_filtered_cow_child(curr_bs) != NULL;
+ curr_bs = bdrv_backing_chain_next(curr_bs))
+{
+BlockDriverState *bs_below = bdrv_backing_chain_next(curr_bs);
 
 /* If either of the filename paths is actually a protocol, then
  * compare unmodified paths; otherwise make paths relative */
@@ -5261,7 +5277,7 @@ BlockDriverState 
*bdrv_find_backing_image(BlockDriverState *bs,
 char *backing_file_full_ret;
 
 if (strcmp(backing_file, curr_bs->backing_file) == 0) {
-retval = curr_bs->backing->bs;
+retval = bs_below;
 break;
 }
 /* Also check against the full backing filename for the image */
@@ -5271,7 +5287,7 @@ BlockDriverState 
*bdrv_find_backing_image(BlockDriverState *bs,
 bool equal = strcmp(backing_file, backing_file_full_ret) == 0;
 g_free(backing_file_full_ret);
 if (equal) {
-retval = curr_bs->backing->bs;
+retval = bs_below;
 break;
 }
 }
@@ -5297,7 +5313,7 @@ BlockDriverState 
*bdrv_find_backing_image(BlockDriverState *bs,
 g_free(filename_tmp);
 
 if (strcmp(backing_file_full, filename_full) == 0) {
-retval = curr_bs->backing->bs;
+retval = bs_below;
 break;
 }
 }
-- 
2.21.0




[Qemu-block] [PATCH v6 12/42] block: Use bdrv_filtered_rw* where obvious

2019-08-09 Thread Max Reitz
Places that use patterns like

if (bs->drv->is_filter && bs->file) {
... something about bs->file->bs ...
}

should be

BlockDriverState *filtered = bdrv_filtered_rw_bs(bs);
if (filtered) {
... something about @filtered ...
}

instead.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block.c| 23 +++
 block/io.c |  5 +++--
 2 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/block.c b/block.c
index 029c809a8e..86b84bea21 100644
--- a/block.c
+++ b/block.c
@@ -556,11 +556,12 @@ int bdrv_create_file(const char *filename, QemuOpts 
*opts, Error **errp)
 int bdrv_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
 {
 BlockDriver *drv = bs->drv;
+BlockDriverState *filtered = bdrv_filtered_rw_bs(bs);
 
 if (drv && drv->bdrv_probe_blocksizes) {
 return drv->bdrv_probe_blocksizes(bs, bsz);
-} else if (drv && drv->is_filter && bs->file) {
-return bdrv_probe_blocksizes(bs->file->bs, bsz);
+} else if (filtered) {
+return bdrv_probe_blocksizes(filtered, bsz);
 }
 
 return -ENOTSUP;
@@ -575,11 +576,12 @@ int bdrv_probe_blocksizes(BlockDriverState *bs, 
BlockSizes *bsz)
 int bdrv_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
 {
 BlockDriver *drv = bs->drv;
+BlockDriverState *filtered = bdrv_filtered_rw_bs(bs);
 
 if (drv && drv->bdrv_probe_geometry) {
 return drv->bdrv_probe_geometry(bs, geo);
-} else if (drv && drv->is_filter && bs->file) {
-return bdrv_probe_geometry(bs->file->bs, geo);
+} else if (filtered) {
+return bdrv_probe_geometry(filtered, geo);
 }
 
 return -ENOTSUP;
@@ -5084,6 +5086,8 @@ int bdrv_has_zero_init_1(BlockDriverState *bs)
 
 int bdrv_has_zero_init(BlockDriverState *bs)
 {
+BlockDriverState *filtered;
+
 if (!bs->drv) {
 return 0;
 }
@@ -5096,8 +5100,10 @@ int bdrv_has_zero_init(BlockDriverState *bs)
 if (bs->drv->bdrv_has_zero_init) {
 return bs->drv->bdrv_has_zero_init(bs);
 }
-if (bs->file && bs->drv->is_filter) {
-return bdrv_has_zero_init(bs->file->bs);
+
+filtered = bdrv_filtered_rw_bs(bs);
+if (filtered) {
+return bdrv_has_zero_init(filtered);
 }
 
 /* safe default */
@@ -5142,8 +5148,9 @@ int bdrv_get_info(BlockDriverState *bs, BlockDriverInfo 
*bdi)
 return -ENOMEDIUM;
 }
 if (!drv->bdrv_get_info) {
-if (bs->file && drv->is_filter) {
-return bdrv_get_info(bs->file->bs, bdi);
+BlockDriverState *filtered = bdrv_filtered_rw_bs(bs);
+if (filtered) {
+return bdrv_get_info(filtered, bdi);
 }
 return -ENOTSUP;
 }
diff --git a/block/io.c b/block/io.c
index 06305c6ea6..4d6cf4b3c2 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3186,8 +3186,9 @@ int coroutine_fn bdrv_co_truncate(BdrvChild *child, 
int64_t offset,
 }
 
 if (!drv->bdrv_co_truncate) {
-if (bs->file && drv->is_filter) {
-ret = bdrv_co_truncate(bs->file, offset, prealloc, errp);
+BdrvChild *filtered = bdrv_filtered_rw_child(bs);
+if (filtered) {
+ret = bdrv_co_truncate(filtered, offset, prealloc, errp);
 goto out;
 }
 error_setg(errp, "Image format driver does not support resize");
-- 
2.21.0




[Qemu-block] [PATCH v6 11/42] block: Add bdrv_supports_compressed_writes()

2019-08-09 Thread Max Reitz
Filters cannot compress data themselves but they have to implement
.bdrv_co_pwritev_compressed() still (or they cannot forward compressed
writes).  Therefore, checking whether
bs->drv->bdrv_co_pwritev_compressed is non-NULL is not sufficient to
know whether the node can actually handle compressed writes.  This
function looks down the filter chain to see whether there is a
non-filter that can actually convert the compressed writes into
compressed data (and thus normal writes).

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/block.h |  1 +
 block.c   | 22 ++
 2 files changed, 23 insertions(+)

diff --git a/include/block/block.h b/include/block/block.h
index 9cfe77abaf..6ba853fb90 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -487,6 +487,7 @@ BlockDriverState *bdrv_next(BdrvNextIterator *it);
 void bdrv_next_cleanup(BdrvNextIterator *it);
 
 BlockDriverState *bdrv_next_monitor_owned(BlockDriverState *bs);
+bool bdrv_supports_compressed_writes(BlockDriverState *bs);
 void bdrv_iterate_format(void (*it)(void *opaque, const char *name),
  void *opaque, bool read_only);
 const char *bdrv_get_node_name(const BlockDriverState *bs);
diff --git a/block.c b/block.c
index 415c555bf5..029c809a8e 100644
--- a/block.c
+++ b/block.c
@@ -4696,6 +4696,28 @@ bool bdrv_is_sg(BlockDriverState *bs)
 return bs->sg;
 }
 
+/**
+ * Return whether the given node supports compressed writes.
+ */
+bool bdrv_supports_compressed_writes(BlockDriverState *bs)
+{
+BlockDriverState *filtered = bdrv_filtered_rw_bs(bs);
+
+if (!bs->drv || !bs->drv->bdrv_co_pwritev_compressed) {
+return false;
+}
+
+if (filtered) {
+/*
+ * Filters can only forward compressed writes, so we have to
+ * check the child.
+ */
+return bdrv_supports_compressed_writes(filtered);
+}
+
+return true;
+}
+
 const char *bdrv_get_format_name(BlockDriverState *bs)
 {
 return bs->drv ? bs->drv->format_name : NULL;
-- 
2.21.0




[Qemu-block] [PATCH v6 00/42] block: Deal with filters

2019-08-09 Thread Max Reitz
Hi,

When we introduced filters, we did it a bit casually.  Sure, we talked a
lot about them before, but that was mostly discussion about where
implicit filters should be added to the graph (note that we currently
only have two implicit filters, those being mirror and commit).  But in
the end, we really just designated some drivers filters (Quorum,
blkdebug, etc.) and added some specifically (throttle, COR), without
really looking through the block layer to see where issues might occur.

It turns out vast areas of the block layer just don’t know about filters
and cannot really handle them.  Many cases will work in practice, in
others, well, too bad, you cannot use some feature because some part
deep inside the block layer looks at your filters and thinks they are
format nodes.

This is one reason why this series is needed.  Over time (since v1), a
second reason has made its way in:

bs->file is not necessarily the place where a node’s data is stored.
qcow2 now has external data files, and currently there is no way for the
general block layer to know that the data is not stored in bs->file.
Right now, I do not think that has any real consequences (all functions
that need access to the actual data storage file should only do so as a
fallback if the driver does not provide some functionality, but qcow2
should provide it all), but it still shows that we need some way to let
the general block layer know about such data files.  (Also, I will need
this for v1 of my “Inquire images’ rotational info” series.)

I won’t go on and on about this series now, I think the patches pretty
much speak for themselves now.  If the cover letter gets too long,
nobody reads it anyway (see previous versions).


*** I’ve based this series on John’s bitmap branch, which I’ve rebased
on my block-next branch. ***

I’ve pushed the patches here:

  https://git.xanclic.moe/XanClic/qemu child-access-functions-v6
  https://github.com/XanClic/qemu child-access-functions-v6

I’ve also pushed the base branch to each of those repos, it’s
“child-access-functions-base”.


v6:
- Patch 9: Rename *freeze_backing_chain (etc.) to *freeze_chain (etc.)
- Patch 10: Drop bdrv_is_encrypted() instead of fixing it the wrong way
- Patch 15: Add a comment on why this works
- Patch 16: Just flush all children instead of one
- Patch 20: We have to snapshot all non-backing children, so both
metadata and storage children; for simplification, just
disallow the fallback path if there is more than one such
child
- Patch 22: bdrv_get_allocated_file_size() should report all children’s
sizes, not just the primary or storage child’s
- Patch 24: Make query-blockstats too report any filtered child under
“backing”
- Patch 25:
  - bdrv_is_allocated_above()’s new @include_base parameter makes things
a bit simpler
  - Forbid mirroring to a filter on top of the base
- Patch 27: bdrv_is_allocated_above()’s new @include_base parameter
makes things a bit simpler
- Patch 28: Requires a few changes to keep stream independent of the
base node
- Patch 30:
  - Sprinkle in a few bdrv_skip_implicit_filters()s
  - Conflicts with the convert --salvage patches


git backport-diff against v5:

Key:
[] : patches are identical
[] : number of functional differences between upstream/downstream patch
[down] : patch is downstream-only
The flags [FC] indicate (F)unctional and (C)ontextual differences, respectively

001/42:[] [-C] 'block: Mark commit and mirror as filter drivers'
002/42:[] [--] 'copy-on-read: Support compressed writes'
003/42:[] [--] 'throttle: Support compressed writes'
004/42:[] [--] 'block: Add child access functions'
005/42:[] [--] 'block: Add chain helper functions'
006/42:[] [--] 'qcow2: Implement .bdrv_storage_child()'
007/42:[] [--] 'block: *filtered_cow_child() for *has_zero_init()'
008/42:[] [--] 'block: bdrv_set_backing_hd() is about bs->backing'
009/42:[0078] [FC] 'block: Include filters when freezing backing chain'
010/42:[down] 'block: Drop bdrv_is_encrypted()'
011/42:[] [-C] 'block: Add bdrv_supports_compressed_writes()'
012/42:[] [--] 'block: Use bdrv_filtered_rw* where obvious'
013/42:[] [-C] 'block: Use CAFs in block status functions'
014/42:[] [--] 'block: Use CAFs when working with backing chains'
015/42:[0017] [FC] 'block: Re-evaluate backing file handling in reopen'
016/42:[down] 'block: Flush all children in generic code'
017/42:[] [--] 'block: Use CAFs in bdrv_refresh_limits()'
018/42:[] [--] 'block: Use CAFs in bdrv_refresh_filename()'
019/42:[] [--] 'block: Use CAF in bdrv_co_rw_vmstate()'
020/42:[down] 'block/snapshot: Fix fallback'
021/42:[] [--] 'block: Use CAFs for debug breakpoints'
022/42:[down] 'block: Fix bdrv_get_allocated_file_size's fallback'
023/42:[] [--] 'blockdev: Use CAF in external_snapshot_prepare()'
024/42:[0012] [FC] 'block: Use child access functions for QAPI queries'
025

[Qemu-block] [PATCH v6 01/42] block: Mark commit and mirror as filter drivers

2019-08-09 Thread Max Reitz
The commit and mirror block nodes are filters, so they should be marked
as such.  (Strictly speaking, BDS.is_filter's documentation states that
a filter's child must be bs->file.  The following patch will relax this
restriction, however.)

Signed-off-by: Max Reitz 
Reviewed-by: Alberto Garcia 
Reviewed-by: Eric Blake 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/commit.c | 2 ++
 block/mirror.c | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/block/commit.c b/block/commit.c
index 408ae15389..19915603ae 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -254,6 +254,8 @@ static BlockDriver bdrv_commit_top = {
 .bdrv_co_block_status   = bdrv_co_block_status_from_backing,
 .bdrv_refresh_filename  = bdrv_commit_top_refresh_filename,
 .bdrv_child_perm= bdrv_commit_top_child_perm,
+
+.is_filter  = true,
 };
 
 void commit_start(const char *job_id, BlockDriverState *bs,
diff --git a/block/mirror.c b/block/mirror.c
index 2b870683f1..a8f2d7a305 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -1507,6 +1507,8 @@ static BlockDriver bdrv_mirror_top = {
 .bdrv_refresh_filename  = bdrv_mirror_top_refresh_filename,
 .bdrv_child_perm= bdrv_mirror_top_child_perm,
 .bdrv_refresh_limits= bdrv_mirror_top_refresh_limits,
+
+.is_filter  = true,
 };
 
 static BlockJob *mirror_start_job(
-- 
2.21.0




[Qemu-block] [PATCH v6 05/42] block: Add chain helper functions

2019-08-09 Thread Max Reitz
Add some helper functions for skipping filters in a chain of block
nodes.

Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/block_int.h |  3 +++
 block.c   | 55 +++
 2 files changed, 58 insertions(+)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 6c60abc4c3..5bec3361fd 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -1291,6 +1291,9 @@ BdrvChild *bdrv_filtered_child(BlockDriverState *bs);
 BdrvChild *bdrv_metadata_child(BlockDriverState *bs);
 BdrvChild *bdrv_storage_child(BlockDriverState *bs);
 BdrvChild *bdrv_primary_child(BlockDriverState *bs);
+BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs);
+BlockDriverState *bdrv_skip_rw_filters(BlockDriverState *bs);
+BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs);
 
 static inline BlockDriverState *child_bs(BdrvChild *child)
 {
diff --git a/block.c b/block.c
index f6c5f8c3eb..bfa5e27850 100644
--- a/block.c
+++ b/block.c
@@ -6655,3 +6655,58 @@ BdrvChild *bdrv_primary_child(BlockDriverState *bs)
 {
 return bdrv_filtered_rw_child(bs) ?: bs->file;
 }
+
+static BlockDriverState *bdrv_skip_filters(BlockDriverState *bs,
+   bool stop_on_explicit_filter)
+{
+BdrvChild *filtered;
+
+if (!bs) {
+return NULL;
+}
+
+while (!(stop_on_explicit_filter && !bs->implicit)) {
+filtered = bdrv_filtered_rw_child(bs);
+if (!filtered) {
+break;
+}
+bs = filtered->bs;
+}
+/*
+ * Note that this treats nodes with bs->drv == NULL as not being
+ * R/W filters (bs->drv == NULL should be replaced by something
+ * else anyway).
+ * The advantage of this behavior is that this function will thus
+ * always return a non-NULL value (given a non-NULL @bs).
+ */
+
+return bs;
+}
+
+/*
+ * Return the first BDS that has not been added implicitly or that
+ * does not have an RW-filtered child down the chain starting from @bs
+ * (including @bs itself).
+ */
+BlockDriverState *bdrv_skip_implicit_filters(BlockDriverState *bs)
+{
+return bdrv_skip_filters(bs, true);
+}
+
+/*
+ * Return the first BDS that does not have an RW-filtered child down
+ * the chain starting from @bs (including @bs itself).
+ */
+BlockDriverState *bdrv_skip_rw_filters(BlockDriverState *bs)
+{
+return bdrv_skip_filters(bs, false);
+}
+
+/*
+ * For a backing chain, return the first non-filter backing image of
+ * the first non-filter image.
+ */
+BlockDriverState *bdrv_backing_chain_next(BlockDriverState *bs)
+{
+return 
bdrv_skip_rw_filters(bdrv_filtered_cow_bs(bdrv_skip_rw_filters(bs)));
+}
-- 
2.21.0




[Qemu-block] [PATCH v6 03/42] throttle: Support compressed writes

2019-08-09 Thread Max Reitz
Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/throttle.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/block/throttle.c b/block/throttle.c
index 0349f42257..958a2bcfa6 100644
--- a/block/throttle.c
+++ b/block/throttle.c
@@ -153,6 +153,15 @@ static int coroutine_fn 
throttle_co_pdiscard(BlockDriverState *bs,
 return bdrv_co_pdiscard(bs->file, offset, bytes);
 }
 
+static int coroutine_fn throttle_co_pwritev_compressed(BlockDriverState *bs,
+   uint64_t offset,
+   uint64_t bytes,
+   QEMUIOVector *qiov)
+{
+return throttle_co_pwritev(bs, offset, bytes, qiov,
+   BDRV_REQ_WRITE_COMPRESSED);
+}
+
 static int throttle_co_flush(BlockDriverState *bs)
 {
 return bdrv_co_flush(bs->file->bs);
@@ -251,6 +260,7 @@ static BlockDriver bdrv_throttle = {
 
 .bdrv_co_pwrite_zeroes  =   throttle_co_pwrite_zeroes,
 .bdrv_co_pdiscard   =   throttle_co_pdiscard,
+.bdrv_co_pwritev_compressed =   throttle_co_pwritev_compressed,
 
 .bdrv_recurse_is_first_non_filter   =   
throttle_recurse_is_first_non_filter,
 
-- 
2.21.0




[Qemu-block] [PATCH v6 02/42] copy-on-read: Support compressed writes

2019-08-09 Thread Max Reitz
Signed-off-by: Max Reitz 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/copy-on-read.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 6631f30205..16bdf630b6 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -113,6 +113,16 @@ static int coroutine_fn cor_co_pdiscard(BlockDriverState 
*bs,
 }
 
 
+static int coroutine_fn cor_co_pwritev_compressed(BlockDriverState *bs,
+  uint64_t offset,
+  uint64_t bytes,
+  QEMUIOVector *qiov)
+{
+return bdrv_co_pwritev(bs->file, offset, bytes, qiov,
+   BDRV_REQ_WRITE_COMPRESSED);
+}
+
+
 static void cor_eject(BlockDriverState *bs, bool eject_flag)
 {
 bdrv_eject(bs->file->bs, eject_flag);
@@ -145,6 +155,7 @@ static BlockDriver bdrv_copy_on_read = {
 .bdrv_co_pwritev= cor_co_pwritev,
 .bdrv_co_pwrite_zeroes  = cor_co_pwrite_zeroes,
 .bdrv_co_pdiscard   = cor_co_pdiscard,
+.bdrv_co_pwritev_compressed = cor_co_pwritev_compressed,
 
 .bdrv_eject = cor_eject,
 .bdrv_lock_medium   = cor_lock_medium,
-- 
2.21.0




Re: [Qemu-block] [Qemu-devel] [PATCH v2 6/7] block/backup: teach backup_cow_with_bounce_buffer to copy more at once

2019-08-09 Thread Eric Blake
On 8/9/19 10:32 AM, Vladimir Sementsov-Ogievskiy wrote:
> backup_cow_with_offload can transfer more than on cluster. Let

s/on/one/

> backup_cow_with_bounce_buffer behave similarly. It reduces number
> of IO and there are no needs to copy cluster by cluster.

It reduces the number of IO requests, since there is no need to copy
cluster by cluster.

> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/backup.c | 29 +++--
>  1 file changed, 15 insertions(+), 14 deletions(-)
> 
> diff --git a/block/backup.c b/block/backup.c
> index d482d93458..155e21d0a3 100644
> --- a/block/backup.c
> +++ b/block/backup.c
> @@ -104,22 +104,25 @@ static int coroutine_fn 
> backup_cow_with_bounce_buffer(BackupBlockJob *job,
>int64_t start,
>int64_t end,
>bool is_write_notifier,
> -  bool *error_is_read,
> -  void **bounce_buffer)
> +  bool *error_is_read)

Why is this signature changing?

>  {
>  int ret;
>  BlockBackend *blk = job->common.blk;
>  int nbytes;
>  int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
> +void *bounce_buffer;
>  
>  assert(QEMU_IS_ALIGNED(start, job->cluster_size));
> -bdrv_reset_dirty_bitmap(job->copy_bitmap, start, job->cluster_size);
> -nbytes = MIN(job->cluster_size, job->len - start);
> -if (!*bounce_buffer) {
> -*bounce_buffer = blk_blockalign(blk, job->cluster_size);

Pre-patch, you allocate the bounce_buffer at most once (but limited to a
cluster size), post-patch, you are now allocating and freeing a bounce
buffer every iteration through.  There may be fewer calls because you
are allocating something bigger, but still, isn't it a good goal to try
and allocate the bounce buffer as few times as possible and reuse it,
rather than getting a fresh one each iteration?

> +
> +nbytes = MIN(end - start, job->len - start);

What is the largest this will be? Will it exhaust memory on a 32-bit or
otherwise memory-constrained system, where you should have some maximum
cap for how large the bounce buffer will be while still getting better
performance than one cluster at a time and not slowing things down by
over-sizing malloc?  64M, maybe?

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [Qemu-devel] [PATCH v7 7/9] qemu/units: add SI decimal units

2019-08-09 Thread Peter Maydell
On Fri, 9 Aug 2019 at 16:39, Eric Blake  wrote:
> Also, would it be worth swapping out existing constants in the code base
> that should instead be using these macros, so that they actually have a
> use and so that we can see whether using them adds legibility?
>
> For example, block/nvme.c, block/qapi.c, block/sheepdog.c, blockdev.c,
> util/async.c, util/oslib-win32.c, util/qemu-thread-posix.c,
> util/qemu-timer.c all seem to be dealing with conversions between
> seconds and subdivisions thereof, where constants 100 or larger are
> in use and could be rewritten with these.

I'm not sure that it would be more readable to replace
10LL with SI_G -- I would tend to assume the latter
would be 2^30. Using "1000LL * 1000 * 1000" inline at
the point of use, or better still abstracting any particular use
into something more semantically meaningful as we already
do with NANOSECONDS_PER_SECOND, would be my personal preference.

thanks
-- PMM



Re: [Qemu-block] [PATCH v7 7/9] qemu/units: add SI decimal units

2019-08-09 Thread Eric Blake
On 6/18/19 6:43 AM, Vladimir Sementsov-Ogievskiy wrote:
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  include/qemu/units.h | 7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/include/qemu/units.h b/include/qemu/units.h
> index 692db3fbb2..52ccc7445c 100644
> --- a/include/qemu/units.h
> +++ b/include/qemu/units.h
> @@ -17,4 +17,11 @@
>  #define PiB (INT64_C(1) << 50)
>  #define EiB (INT64_C(1) << 60)
>  
> +#define SI_k 1000LL
> +#define SI_M 100LL
> +#define SI_G 10LL
> +#define SI_T 1LL
> +#define SI_P 1000LL
> +#define SI_E 100LL

Looks correct; and it's the sort of thing that if we do once here, we
don't have to repeat elsewhere. But bike-shedding a bit, would it be any
easier to read as:

#define SI_k 1000LL
#define SI_M (1000LL * 1000)
#define SI_G (1000LL * 1000 * 1000)
...

or even:

#define SI_k 1000LL
#define SI_M (SI_k * 1000)
#define SI_G (SI_M * 1000)
...

Also, would it be worth swapping out existing constants in the code base
that should instead be using these macros, so that they actually have a
use and so that we can see whether using them adds legibility?

For example, block/nvme.c, block/qapi.c, block/sheepdog.c, blockdev.c,
util/async.c, util/oslib-win32.c, util/qemu-thread-posix.c,
util/qemu-timer.c all seem to be dealing with conversions between
seconds and subdivisions thereof, where constants 100 or larger are
in use and could be rewritten with these.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 0/7] backup improvements

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
09.08.2019 18:32, Vladimir Sementsov-Ogievskiy wrote:
> Hi all!
> 
> There are some fixes and refactorings I need on my way to resend
> my backup-top series. It's obvious now that I need to share copying
> code between backup and backup-top, as backup copying code becomes
> smarter and more complicated. So the goal of the series is to make copying
> code more share-able.
> 
> It's based on John's bitmaps branch, rebased on master.

No, it's based directly on John's bitmaps branch:
Based-on: https://github.com/jnsnow/qemu bitmaps

> 
> v2 (by Max's comments):
> 
> 02: Add Max's r-b
> 03: - split out backup changes to 03
>  - handle common max_transfer = 0 case
> 04: splat out from 02
> 06: fix allocation size
> 07: - rebase on 06 changes
>  - add Max's r-b
> 
> two patches are dropped or postponed for the next step
> 
> Vladimir Sementsov-Ogievskiy (7):
>block/backup: deal with zero detection
>block/backup: refactor write_flags
>block/io: handle alignment and max_transfer for copy_range
>block/backup: drop handling of max_transfer for copy_range
>block/backup: fix backup_cow_with_offload for last cluster
>block/backup: teach backup_cow_with_bounce_buffer to copy more at once
>block/backup: merge duplicated logic into backup_do_cow
> 
>   block/backup.c | 125 +++--
>   block/io.c |  48 +++
>   blockdev.c |   8 ++--
>   3 files changed, 91 insertions(+), 90 deletions(-)
> 


-- 
Best regards,
Vladimir


[Qemu-block] [PATCH v2 4/7] block/backup: drop handling of max_transfer for copy_range

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
Since previous commit, copy_range supports max_transfer, so we don't
need to handle it by hand.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/backup.c | 11 ++-
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index c6a3b2b7bb..228ba9423c 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -54,7 +54,6 @@ typedef struct BackupBlockJob {
 QLIST_HEAD(, CowRequest) inflight_reqs;
 
 bool use_copy_range;
-int64_t copy_range_size;
 
 BdrvRequestFlags write_flags;
 bool initializing_bitmap;
@@ -156,12 +155,11 @@ static int coroutine_fn 
backup_cow_with_offload(BackupBlockJob *job,
 int ret;
 int nr_clusters;
 BlockBackend *blk = job->common.blk;
-int nbytes;
+int nbytes = end - start;
 int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
 
-assert(QEMU_IS_ALIGNED(job->copy_range_size, job->cluster_size));
+assert(end - start < INT_MAX);
 assert(QEMU_IS_ALIGNED(start, job->cluster_size));
-nbytes = MIN(job->copy_range_size, end - start);
 nr_clusters = DIV_ROUND_UP(nbytes, job->cluster_size);
 bdrv_reset_dirty_bitmap(job->copy_bitmap, start,
 job->cluster_size * nr_clusters);
@@ -756,11 +754,6 @@ BlockJob *backup_job_create(const char *job_id, 
BlockDriverState *bs,
 job->copy_bitmap = copy_bitmap;
 copy_bitmap = NULL;
 job->use_copy_range = !compress; /* compression isn't supported for it */
-job->copy_range_size = MIN_NON_ZERO(blk_get_max_transfer(job->common.blk),
-blk_get_max_transfer(job->target));
-job->copy_range_size = MAX(job->cluster_size,
-   QEMU_ALIGN_UP(job->copy_range_size,
- job->cluster_size));
 
 /* Required permissions are already taken with target's blk_new() */
 block_job_add_bdrv(&job->common, "target", target, 0, BLK_PERM_ALL,
-- 
2.18.0




[Qemu-block] [PATCH v2 3/7] block/io: handle alignment and max_transfer for copy_range

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
copy_range ignores these limitations, let's improve it.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/io.c | 48 
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/block/io.c b/block/io.c
index 06305c6ea6..a5efb2200f 100644
--- a/block/io.c
+++ b/block/io.c
@@ -3005,11 +3005,24 @@ static int coroutine_fn bdrv_co_copy_range_internal(
 {
 BdrvTrackedRequest req;
 int ret;
+uint32_t align = MAX(src->bs->bl.request_alignment,
+ dst->bs->bl.request_alignment);
+uint32_t max_transfer =
+QEMU_ALIGN_DOWN(MIN_NON_ZERO(MIN_NON_ZERO(src->bs->bl.max_transfer,
+  
dst->bs->bl.max_transfer),
+ INT_MAX), align);
 
 /* TODO We can support BDRV_REQ_NO_FALLBACK here */
 assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
 assert(!(write_flags & BDRV_REQ_NO_FALLBACK));
 
+if (max_transfer == 0 && bytes > 0) {
+/*
+ * For example, if source max_transfer is smaller than target 
alignment.
+ */
+return -ENOTSUP;
+}
+
 if (!dst || !dst->bs) {
 return -ENOMEDIUM;
 }
@@ -3031,7 +3044,10 @@ static int coroutine_fn bdrv_co_copy_range_internal(
 
 if (!src->bs->drv->bdrv_co_copy_range_from
 || !dst->bs->drv->bdrv_co_copy_range_to
-|| src->bs->encrypted || dst->bs->encrypted) {
+|| src->bs->encrypted || dst->bs->encrypted ||
+!QEMU_IS_ALIGNED(src_offset, src->bs->bl.request_alignment) ||
+!QEMU_IS_ALIGNED(dst_offset, dst->bs->bl.request_alignment) ||
+!QEMU_IS_ALIGNED(bytes, align)) {
 return -ENOTSUP;
 }
 
@@ -3046,11 +3062,22 @@ static int coroutine_fn bdrv_co_copy_range_internal(
 wait_serialising_requests(&req);
 }
 
-ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
-src, src_offset,
-dst, dst_offset,
-bytes,
-read_flags, write_flags);
+while (bytes) {
+int num = MIN(bytes, max_transfer);
+
+ret = src->bs->drv->bdrv_co_copy_range_from(src->bs,
+src, src_offset,
+dst, dst_offset,
+num,
+read_flags,
+write_flags);
+if (ret < 0) {
+break;
+}
+bytes -= num;
+src_offset += num;
+dst_offset += num;
+}
 
 tracked_request_end(&req);
 bdrv_dec_in_flight(src->bs);
@@ -3060,12 +3087,17 @@ static int coroutine_fn bdrv_co_copy_range_internal(
   BDRV_TRACKED_WRITE);
 ret = bdrv_co_write_req_prepare(dst, dst_offset, bytes, &req,
 write_flags);
-if (!ret) {
+while (!ret && bytes) {
+int num = MIN(bytes, max_transfer);
+
 ret = dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
   src, src_offset,
   dst, dst_offset,
-  bytes,
+  num,
   read_flags, write_flags);
+bytes -= num;
+src_offset += num;
+dst_offset += num;
 }
 bdrv_co_write_req_finish(dst, dst_offset, bytes, &req, ret);
 tracked_request_end(&req);
-- 
2.18.0




[Qemu-block] [PATCH v2 1/7] block/backup: deal with zero detection

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
We have detect_zeroes option, so at least for blockdev-backup user
should define it if zero-detection is needed. For drive-backup leave
detection enabled by default but do it through existing option instead
of open-coding.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Max Reitz 
---
 block/backup.c | 15 ++-
 blockdev.c |  8 
 2 files changed, 10 insertions(+), 13 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index adc4d44244..d815436455 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -113,7 +113,10 @@ static int coroutine_fn 
backup_cow_with_bounce_buffer(BackupBlockJob *job,
 BlockBackend *blk = job->common.blk;
 int nbytes;
 int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
-int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0;
+int write_flags =
+(job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0) |
+(job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0);
+
 
 assert(QEMU_IS_ALIGNED(start, job->cluster_size));
 bdrv_reset_dirty_bitmap(job->copy_bitmap, start, job->cluster_size);
@@ -131,14 +134,8 @@ static int coroutine_fn 
backup_cow_with_bounce_buffer(BackupBlockJob *job,
 goto fail;
 }
 
-if (buffer_is_zero(*bounce_buffer, nbytes)) {
-ret = blk_co_pwrite_zeroes(job->target, start,
-   nbytes, write_flags | BDRV_REQ_MAY_UNMAP);
-} else {
-ret = blk_co_pwrite(job->target, start,
-nbytes, *bounce_buffer, write_flags |
-(job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0));
-}
+ret = blk_co_pwrite(job->target, start, nbytes, *bounce_buffer,
+write_flags);
 if (ret < 0) {
 trace_backup_do_cow_write_fail(job, start, ret);
 if (error_is_read) {
diff --git a/blockdev.c b/blockdev.c
index 29c6c6044a..2d7e7be538 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3613,7 +3613,7 @@ static BlockJob *do_drive_backup(DriveBackup *backup, 
JobTxn *txn,
 BlockDriverState *source = NULL;
 BlockJob *job = NULL;
 AioContext *aio_context;
-QDict *options = NULL;
+QDict *options;
 Error *local_err = NULL;
 int flags;
 int64_t size;
@@ -3686,10 +3686,10 @@ static BlockJob *do_drive_backup(DriveBackup *backup, 
JobTxn *txn,
 goto out;
 }
 
+options = qdict_new();
+qdict_put_str(options, "discard", "unmap");
+qdict_put_str(options, "detect-zeroes", "unmap");
 if (backup->format) {
-if (!options) {
-options = qdict_new();
-}
 qdict_put_str(options, "driver", backup->format);
 }
 
-- 
2.18.0




[Qemu-block] [PATCH v2 6/7] block/backup: teach backup_cow_with_bounce_buffer to copy more at once

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
backup_cow_with_offload can transfer more than on cluster. Let
backup_cow_with_bounce_buffer behave similarly. It reduces number
of IO and there are no needs to copy cluster by cluster.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/backup.c | 29 +++--
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index d482d93458..155e21d0a3 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -104,22 +104,25 @@ static int coroutine_fn 
backup_cow_with_bounce_buffer(BackupBlockJob *job,
   int64_t start,
   int64_t end,
   bool is_write_notifier,
-  bool *error_is_read,
-  void **bounce_buffer)
+  bool *error_is_read)
 {
 int ret;
 BlockBackend *blk = job->common.blk;
 int nbytes;
 int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
+void *bounce_buffer;
 
 assert(QEMU_IS_ALIGNED(start, job->cluster_size));
-bdrv_reset_dirty_bitmap(job->copy_bitmap, start, job->cluster_size);
-nbytes = MIN(job->cluster_size, job->len - start);
-if (!*bounce_buffer) {
-*bounce_buffer = blk_blockalign(blk, job->cluster_size);
+
+nbytes = MIN(end - start, job->len - start);
+bounce_buffer = blk_try_blockalign(blk, nbytes);
+if (!bounce_buffer) {
+return -ENOMEM;
 }
 
-ret = blk_co_pread(blk, start, nbytes, *bounce_buffer, read_flags);
+bdrv_reset_dirty_bitmap(job->copy_bitmap, start, end - start);
+
+ret = blk_co_pread(blk, start, nbytes, bounce_buffer, read_flags);
 if (ret < 0) {
 trace_backup_do_cow_read_fail(job, start, ret);
 if (error_is_read) {
@@ -128,7 +131,7 @@ static int coroutine_fn 
backup_cow_with_bounce_buffer(BackupBlockJob *job,
 goto fail;
 }
 
-ret = blk_co_pwrite(job->target, start, nbytes, *bounce_buffer,
+ret = blk_co_pwrite(job->target, start, nbytes, bounce_buffer,
 job->write_flags);
 if (ret < 0) {
 trace_backup_do_cow_write_fail(job, start, ret);
@@ -138,9 +141,12 @@ static int coroutine_fn 
backup_cow_with_bounce_buffer(BackupBlockJob *job,
 goto fail;
 }
 
+qemu_vfree(bounce_buffer);
 return nbytes;
+
 fail:
 bdrv_set_dirty_bitmap(job->copy_bitmap, start, job->cluster_size);
+qemu_vfree(bounce_buffer);
 return ret;
 
 }
@@ -254,7 +260,6 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
 CowRequest cow_request;
 int ret = 0;
 int64_t start, end; /* bytes */
-void *bounce_buffer = NULL;
 int64_t skip_bytes;
 
 qemu_co_rwlock_rdlock(&job->flush_rwlock);
@@ -303,7 +308,7 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
 if (!job->use_copy_range) {
 ret = backup_cow_with_bounce_buffer(job, start, dirty_end,
 is_write_notifier,
-error_is_read, &bounce_buffer);
+error_is_read);
 }
 if (ret < 0) {
 break;
@@ -318,10 +323,6 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
 ret = 0;
 }
 
-if (bounce_buffer) {
-qemu_vfree(bounce_buffer);
-}
-
 cow_request_end(&cow_request);
 
 trace_backup_do_cow_return(job, offset, bytes, ret);
-- 
2.18.0




[Qemu-block] [PATCH v2 7/7] block/backup: merge duplicated logic into backup_do_cow

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
backup_cow_with_offload and backup_cow_with_bounce_buffer contains a
lot of duplicated logic. Move it into backup_do_cow.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Max Reitz 
---
 block/backup.c | 84 +++---
 1 file changed, 31 insertions(+), 53 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 155e21d0a3..ae780e1260 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -100,85 +100,60 @@ static void cow_request_end(CowRequest *req)
 
 /* Copy range to target with a bounce buffer and return the bytes copied. If
  * error occurred, return a negative error number */
-static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
-  int64_t start,
-  int64_t end,
-  bool is_write_notifier,
-  bool *error_is_read)
+static int coroutine_fn backup_cow_with_bounce_buffer(
+BackupBlockJob *job, int64_t offset, int64_t bytes,
+BdrvRequestFlags read_flags, bool *error_is_read)
 {
-int ret;
+int ret = 0;
 BlockBackend *blk = job->common.blk;
-int nbytes;
-int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
-void *bounce_buffer;
-
-assert(QEMU_IS_ALIGNED(start, job->cluster_size));
+void *bounce_buffer = blk_try_blockalign(blk, bytes);
 
-nbytes = MIN(end - start, job->len - start);
-bounce_buffer = blk_try_blockalign(blk, nbytes);
 if (!bounce_buffer) {
 return -ENOMEM;
 }
 
-bdrv_reset_dirty_bitmap(job->copy_bitmap, start, end - start);
-
-ret = blk_co_pread(blk, start, nbytes, bounce_buffer, read_flags);
+ret = blk_co_pread(blk, offset, bytes, bounce_buffer, read_flags);
 if (ret < 0) {
-trace_backup_do_cow_read_fail(job, start, ret);
+trace_backup_do_cow_read_fail(job, offset, ret);
 if (error_is_read) {
 *error_is_read = true;
 }
-goto fail;
+goto out;
 }
 
-ret = blk_co_pwrite(job->target, start, nbytes, bounce_buffer,
+ret = blk_co_pwrite(job->target, offset, bytes, bounce_buffer,
 job->write_flags);
 if (ret < 0) {
-trace_backup_do_cow_write_fail(job, start, ret);
+trace_backup_do_cow_write_fail(job, offset, ret);
 if (error_is_read) {
 *error_is_read = false;
 }
-goto fail;
+goto out;
 }
 
+out:
 qemu_vfree(bounce_buffer);
-return nbytes;
 
-fail:
-bdrv_set_dirty_bitmap(job->copy_bitmap, start, job->cluster_size);
-qemu_vfree(bounce_buffer);
 return ret;
-
 }
 
 /* Copy range to target and return the bytes copied. If error occurred, return 
a
  * negative error number. */
 static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job,
-int64_t start,
-int64_t end,
-bool is_write_notifier)
+int64_t offset,
+int64_t bytes,
+BdrvRequestFlags read_flags)
 {
 int ret;
-int nr_clusters;
 BlockBackend *blk = job->common.blk;
-int nbytes = MIN(end - start, job->len - start);
-int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
-
-assert(end - start < INT_MAX);
-assert(QEMU_IS_ALIGNED(start, job->cluster_size));
-nr_clusters = DIV_ROUND_UP(nbytes, job->cluster_size);
-bdrv_reset_dirty_bitmap(job->copy_bitmap, start,
-job->cluster_size * nr_clusters);
-ret = blk_co_copy_range(blk, start, job->target, start, nbytes,
+
+ret = blk_co_copy_range(blk, offset, job->target, offset, bytes,
 read_flags, job->write_flags);
 if (ret < 0) {
-trace_backup_do_cow_copy_range_fail(job, start, ret);
-bdrv_set_dirty_bitmap(job->copy_bitmap, start,
-  job->cluster_size * nr_clusters);
-return ret;
+trace_backup_do_cow_copy_range_fail(job, offset, ret);
 }
 
-return nbytes;
+return ret;
 }
 
 /*
@@ -261,6 +236,8 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
 int ret = 0;
 int64_t start, end; /* bytes */
 int64_t skip_bytes;
+BdrvRequestFlags read_flags =
+is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
 
 qemu_co_rwlock_rdlock(&job->flush_rwlock);
 
@@ -274,6 +251,7 @@ static int coroutine_fn backup_do_cow(BackupBlockJob *job,
 
 while (start < end) {
 int64_t dirty_end;
+int64_t cur_bytes;
 
 if (!bdrv_dirty_bitmap_get(job->copy_bitmap, start)) {
 trace_backup_do_cow_skip(job, start);
@@ -297,30 +275,30 @@ s

[Qemu-block] [PATCH v2 5/7] block/backup: fix backup_cow_with_offload for last cluster

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
We shouldn't try to copy bytes beyond EOF. Fix it.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Max Reitz 
---
 block/backup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/backup.c b/block/backup.c
index 228ba9423c..d482d93458 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -155,7 +155,7 @@ static int coroutine_fn 
backup_cow_with_offload(BackupBlockJob *job,
 int ret;
 int nr_clusters;
 BlockBackend *blk = job->common.blk;
-int nbytes = end - start;
+int nbytes = MIN(end - start, job->len - start);
 int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
 
 assert(end - start < INT_MAX);
-- 
2.18.0




[Qemu-block] [PATCH v2 2/7] block/backup: refactor write_flags

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
write flags are constant, let's store it in BackupBlockJob instead of
recalculating. It also makes two boolean fields to be unused, so,
drop them.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: John Snow 
Reviewed-by: Max Reitz 
---
 block/backup.c | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index d815436455..c6a3b2b7bb 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -50,14 +50,13 @@ typedef struct BackupBlockJob {
 uint64_t len;
 uint64_t bytes_read;
 int64_t cluster_size;
-bool compress;
 NotifierWithReturn before_write;
 QLIST_HEAD(, CowRequest) inflight_reqs;
 
 bool use_copy_range;
 int64_t copy_range_size;
 
-bool serialize_target_writes;
+BdrvRequestFlags write_flags;
 bool initializing_bitmap;
 } BackupBlockJob;
 
@@ -113,10 +112,6 @@ static int coroutine_fn 
backup_cow_with_bounce_buffer(BackupBlockJob *job,
 BlockBackend *blk = job->common.blk;
 int nbytes;
 int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
-int write_flags =
-(job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0) |
-(job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0);
-
 
 assert(QEMU_IS_ALIGNED(start, job->cluster_size));
 bdrv_reset_dirty_bitmap(job->copy_bitmap, start, job->cluster_size);
@@ -135,7 +130,7 @@ static int coroutine_fn 
backup_cow_with_bounce_buffer(BackupBlockJob *job,
 }
 
 ret = blk_co_pwrite(job->target, start, nbytes, *bounce_buffer,
-write_flags);
+job->write_flags);
 if (ret < 0) {
 trace_backup_do_cow_write_fail(job, start, ret);
 if (error_is_read) {
@@ -163,7 +158,6 @@ static int coroutine_fn 
backup_cow_with_offload(BackupBlockJob *job,
 BlockBackend *blk = job->common.blk;
 int nbytes;
 int read_flags = is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
-int write_flags = job->serialize_target_writes ? BDRV_REQ_SERIALISING : 0;
 
 assert(QEMU_IS_ALIGNED(job->copy_range_size, job->cluster_size));
 assert(QEMU_IS_ALIGNED(start, job->cluster_size));
@@ -172,7 +166,7 @@ static int coroutine_fn 
backup_cow_with_offload(BackupBlockJob *job,
 bdrv_reset_dirty_bitmap(job->copy_bitmap, start,
 job->cluster_size * nr_clusters);
 ret = blk_co_copy_range(blk, start, job->target, start, nbytes,
-read_flags, write_flags);
+read_flags, job->write_flags);
 if (ret < 0) {
 trace_backup_do_cow_copy_range_fail(job, start, ret);
 bdrv_set_dirty_bitmap(job->copy_bitmap, start,
@@ -748,10 +742,16 @@ BlockJob *backup_job_create(const char *job_id, 
BlockDriverState *bs,
 job->sync_mode = sync_mode;
 job->sync_bitmap = sync_bitmap;
 job->bitmap_mode = bitmap_mode;
-job->compress = compress;
 
-/* Detect image-fleecing (and similar) schemes */
-job->serialize_target_writes = bdrv_chain_contains(target, bs);
+/*
+ * Set write flags:
+ *  1. Detect image-fleecing (and similar) schemes
+ *  2. Handle compression
+ */
+job->write_flags =
+(bdrv_chain_contains(target, bs) ? BDRV_REQ_SERIALISING : 0) |
+(compress ? BDRV_REQ_WRITE_COMPRESSED : 0);
+
 job->cluster_size = cluster_size;
 job->copy_bitmap = copy_bitmap;
 copy_bitmap = NULL;
-- 
2.18.0




[Qemu-block] [PATCH v2 0/7] backup improvements

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
Hi all!

There are some fixes and refactorings I need on my way to resend
my backup-top series. It's obvious now that I need to share copying
code between backup and backup-top, as backup copying code becomes
smarter and more complicated. So the goal of the series is to make copying
code more share-able.

It's based on John's bitmaps branch, rebased on master.

v2 (by Max's comments):

02: Add Max's r-b
03: - split out backup changes to 03
- handle common max_transfer = 0 case
04: splat out from 02
06: fix allocation size
07: - rebase on 06 changes
- add Max's r-b

two patches are dropped or postponed for the next step

Vladimir Sementsov-Ogievskiy (7):
  block/backup: deal with zero detection
  block/backup: refactor write_flags
  block/io: handle alignment and max_transfer for copy_range
  block/backup: drop handling of max_transfer for copy_range
  block/backup: fix backup_cow_with_offload for last cluster
  block/backup: teach backup_cow_with_bounce_buffer to copy more at once
  block/backup: merge duplicated logic into backup_do_cow

 block/backup.c | 125 +++--
 block/io.c |  48 +++
 blockdev.c |   8 ++--
 3 files changed, 91 insertions(+), 90 deletions(-)

-- 
2.18.0




Re: [Qemu-block] [PATCH v7 5/9] block/nbd: refactor nbd connection parameters

2019-08-09 Thread Eric Blake
On 6/18/19 6:43 AM, Vladimir Sementsov-Ogievskiy wrote:
> We'll need some connection parameters to be available all the time to
> implement nbd reconnect. So, let's refactor them: define additional
> parameters in BDRVNBDState, drop them from function parameters, drop
> nbd_client_init and separate options parsing instead from nbd_open.
> 
> Signed-off-by: Vladimir Sementsov-Ogievskiy 
> ---
>  block/nbd.c | 125 +++-
>  1 file changed, 64 insertions(+), 61 deletions(-)
> 

> @@ -1659,20 +1630,19 @@ static int nbd_open(BlockDriverState *bs, QDict 
> *options, int flags,
>  error_setg(errp, "TLS only supported over IP sockets");
>  goto error;
>  }
> -hostname = s->saddr->u.inet.host;
> +s->hostname = s->saddr->u.inet.host;
>  }
>  
> -/* NBD handshake */
> -ret = nbd_client_init(bs, s->saddr, s->export, tlscreds, hostname,
> -  qemu_opt_get(opts, "x-dirty-bitmap"),
> -  qemu_opt_get_number(opts, "reconnect-delay", 0),
> -  errp);
> +s->x_dirty_bitmap = g_strdup(qemu_opt_get(opts, "x-dirty-bitmap"));
> +s->reconnect_delay = qemu_opt_get_number(opts, "reconnect-delay", 0);
> +
> +ret = 0;
>  
>   error:
> -if (tlscreds) {
> -object_unref(OBJECT(tlscreds));
> -}
>  if (ret < 0) {
> +if (s->tlscreds) {
> +object_unref(OBJECT(s->tlscreds));
> +}

Pre-existing, but object_unref(NULL) is safe, making this a useless 'if'.


> @@ -1726,9 +1725,13 @@ static void nbd_close(BlockDriverState *bs)
>  
>  nbd_client_close(bs);
>  
> +if (s->tlscreds) {
> +object_unref(OBJECT(s->tlscreds));
> +}

and copied here.

>  qapi_free_SocketAddress(s->saddr);
>  g_free(s->export);
>  g_free(s->tlscredsid);
> +g_free(s->x_dirty_bitmap);

Do we need to duplicate s->x_dirty_bitmap with s->info.x_dirty_bitmap,
or make the latter be const char * and reuse the pointer from the former
rather than strdup'ing it? (I don't think it affects the refactoring
done in this patch, but is possibly worth a separate cleanup patch).

I can fix up the two useless 'if's.

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v3 07/33] automatically add vmstate for reset support in devices

2019-08-09 Thread Dr. David Alan Gilbert
* Peter Maydell (peter.mayd...@linaro.org) wrote:
> On Thu, 8 Aug 2019 at 16:42, Dr. David Alan Gilbert  
> wrote:
> >
> > * Peter Maydell (peter.mayd...@linaro.org) wrote:
> > > On Mon, 29 Jul 2019 at 15:59, Damien Hedde  
> > > wrote:
> > > >
> > > > This add the reset related sections for every QOM
> > > > device.
> > >
> > > A bit more detail in the commit message would help, I think --
> > > this is adding extra machinery which has to copy and modify
> > > the VMStateDescription passed in by the device in order to
> > > add the subsection that handles reset.
> > >
> > > I've added Dave Gilbert to the already long cc list since this
> > > is migration related.
> >
> > I don't like dynamically modifying all the vmsds.
> 
> Yeah, I'm not a huge fan of it either, but it does mean that
> the state gets migrated and we don't have a pile of really
> easy to miss bugs where we forgot to say "this device needs to
> migrate the reset state" and it means we don't have every
> device we ever write in the future needing to say "this device
> needs to migrate reset state"...
> 
> > Aren't you going to have to understand each devices reset behaviour
> > and make sure it does something sane? e.g. it might have a postload
> > that registers a timer or something that you wouldn't want to do if it's
> > in reset.
> >
> > The easiest way is to write a macro that you can easily add to devices
> > you've checked subsection list (like the way we have a
> > VMSTATE_USB_DEVICE).
> 
> One problem is that as soon as somebody writes a USB controller
> or PCI controller model that can be held in reset, every
> device that could sat behind it automatically now could find
> that it's migrated while it's in reset.

I'm not convinced though that they'll all be fixed by adding this
subsection; I think some of those devices you're going to have to
look at and see what they do.

Dave

> thanks
> -- PMM
--
Dr. David Alan Gilbert / dgilb...@redhat.com / Manchester, UK



[Qemu-block] backup bug or question

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
Hi!

Hmm, hacking around backup I have a question:

What prevents guest write request after job_start but before setting
write notifier?

code path:

qmp_drive_backup or transaction with backup

job_start
   aio_co_enter(job_co_entry) /* may only schedule execution, isn't it ? */



job_co_entry
job_pause_point() /* it definitely yields, isn't it bad? */
job->driver->run() /* backup_run */



backup_run()
bdrv_add_before_write_notifier()

...

And what guarantees we give to the user? Is it guaranteed that write notifier is
set when qmp command returns?

And I guess, if we start several backups in a transaction it should be 
guaranteed
that the set of backups is consistent and correspond to one point in time...

-- 
Best regards,
Vladimir


Re: [Qemu-block] [PATCH v3 07/33] automatically add vmstate for reset support in devices

2019-08-09 Thread Juan Quintela
Damien Hedde  wrote:
> On 8/9/19 12:32 PM, Peter Maydell wrote:
>> On Fri, 9 Aug 2019 at 11:29, Damien Hedde  wrote:
>>>
>>> One way to keep the feature without copy-pasting vmsd would be to add
>>> a new vmstate_register with an additional argument to pass the base
>>> class vmsd section and handle the whole thing there.
>> 
>> If we have a vmstate section which contains no actual data,
>> only subsections with 'needed' functions, is it migration
>> compatible with previous versions in the same way that
>> tacking a subsection onto an existing function is?
>
> I don't think so because of the naming schema. I had to forge the
> correct name for the reset subsection for every device.
> Each subsection must be named after its parent section plus '/something'.

That bit is easy.  You jsut named it: "parent_name/subsection_name", no?

Later, Juan.



Re: [Qemu-block] [PATCH 4/8] block/backup: improve unallocated clusters skipping

2019-08-09 Thread Max Reitz
On 09.08.19 14:47, Vladimir Sementsov-Ogievskiy wrote:
> 09.08.2019 15:25, Max Reitz wrote:
>> On 09.08.19 09:50, Vladimir Sementsov-Ogievskiy wrote:
>>> 07.08.2019 21:01, Max Reitz wrote:
 On 07.08.19 10:07, Vladimir Sementsov-Ogievskiy wrote:
> Limit block_status querying to request bounds on write notifier to
> avoid extra seeking.

 I don’t understand this reasoning.  Checking whether something is
 allocated for qcow2 should just mean an L2 cache lookup.  Which we have
 to do anyway when we try to copy data off the source.
>>>
>>> But for raw it's seeking.
>>
>> (1) That’s a bug in block_status then, isn’t it?
>>
>> file-posix cannot determine the allocation status, or rather, everything
>> is allocated.  bdrv_co_block_status() should probably pass @want_zero on
>> to the driver’s implementation, and file-posix should just
>> unconditionally return DATA if it’s false.
>>
>> (2) Why would you even use sync=top for raw nodes?
>>
> 
> As I described in parallel letters, raw was bad example. NBD is good.

Does NBD support backing files?

> Anyway, now I'm refactoring cluster skipping more deeply for v2.
> 
> About top-mode: finally block-status should be used to improve other
> modes too. In virtuozzo we skip unallocated for full mode too, for example.

But this patch here is about sync=top.

Skipping is an optimization, the block_status querying here happens
because copying anything that isn’t allocated in the top layer would be
wrong.

Max

> Unfortunately, backup is most long-term thing to upstream for me..



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [Qemu-devel] [PATCH] qemu-img convert: Deprecate using -n and -o together

2019-08-09 Thread Eric Blake
On 8/9/19 4:11 AM, Kevin Wolf wrote:
> bdrv_create options specified with -o have no effect when skipping image
> creation with -n, so this doesn't make sense. Warn against the misuse
> and deprecate the combination so we can make it a hard error later.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  qemu-img.c   | 5 +
>  qemu-deprecated.texi | 7 +++
>  2 files changed, 12 insertions(+)
> 
> diff --git a/qemu-img.c b/qemu-img.c
> index 79983772de..d9321f6418 100644
> --- a/qemu-img.c
> +++ b/qemu-img.c
> @@ -2231,6 +2231,11 @@ static int img_convert(int argc, char **argv)
>  goto fail_getopt;
>  }
>  
> +if (skip_create && options) {
> +warn_report("-o has no effect when skipping image creation");
> +warn_report("This will become an error in future QEMU versions.");

It looks a bit odd to have two output lines, where one is a phrase and
the other is a full sentence. But I don't have any better ideas how to
represent it, so

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3226
Virtualization:  qemu.org | libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v3 07/33] automatically add vmstate for reset support in devices

2019-08-09 Thread Juan Quintela
Peter Maydell  wrote:
> On Fri, 9 Aug 2019 at 11:29, Damien Hedde  wrote:
>>
>> One way to keep the feature without copy-pasting vmsd would be to add
>> a new vmstate_register with an additional argument to pass the base
>> class vmsd section and handle the whole thing there.
>
> If we have a vmstate section which contains no actual data,
> only subsections with 'needed' functions, is it migration
> compatible with previous versions in the same way that
> tacking a subsection onto an existing function is?

Not now, but we can make it happen.

Right now, we first send the VMSD "header", and then we sent whatever
data is in it.

We can it make work the way you want.  Not sure if that is the best way,
though.  Sections that can(or can't) be there make just reasoning about
the stream more difficult.

Later, Juan.



Re: [Qemu-block] [PATCH] qemu-img convert: Deprecate using -n and -o together

2019-08-09 Thread Max Reitz
On 09.08.19 11:11, Kevin Wolf wrote:
> bdrv_create options specified with -o have no effect when skipping image
> creation with -n, so this doesn't make sense. Warn against the misuse
> and deprecate the combination so we can make it a hard error later.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  qemu-img.c   | 5 +
>  qemu-deprecated.texi | 7 +++
>  2 files changed, 12 insertions(+)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH 4/8] block/backup: improve unallocated clusters skipping

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
09.08.2019 15:25, Max Reitz wrote:
> On 09.08.19 09:50, Vladimir Sementsov-Ogievskiy wrote:
>> 07.08.2019 21:01, Max Reitz wrote:
>>> On 07.08.19 10:07, Vladimir Sementsov-Ogievskiy wrote:
 Limit block_status querying to request bounds on write notifier to
 avoid extra seeking.
>>>
>>> I don’t understand this reasoning.  Checking whether something is
>>> allocated for qcow2 should just mean an L2 cache lookup.  Which we have
>>> to do anyway when we try to copy data off the source.
>>
>> But for raw it's seeking.
> 
> (1) That’s a bug in block_status then, isn’t it?
> 
> file-posix cannot determine the allocation status, or rather, everything
> is allocated.  bdrv_co_block_status() should probably pass @want_zero on
> to the driver’s implementation, and file-posix should just
> unconditionally return DATA if it’s false.
> 
> (2) Why would you even use sync=top for raw nodes?
> 

As I described in parallel letters, raw was bad example. NBD is good.
Anyway, now I'm refactoring cluster skipping more deeply for v2.

About top-mode: finally block-status should be used to improve other
modes too. In virtuozzo we skip unallocated for full mode too, for example.
Unfortunately, backup is most long-term thing to upstream for me..

-- 
Best regards,
Vladimir


Re: [Qemu-block] [PATCH 4/8] block/backup: improve unallocated clusters skipping

2019-08-09 Thread Max Reitz
On 09.08.19 09:50, Vladimir Sementsov-Ogievskiy wrote:
> 07.08.2019 21:01, Max Reitz wrote:
>> On 07.08.19 10:07, Vladimir Sementsov-Ogievskiy wrote:
>>> Limit block_status querying to request bounds on write notifier to
>>> avoid extra seeking.
>>
>> I don’t understand this reasoning.  Checking whether something is
>> allocated for qcow2 should just mean an L2 cache lookup.  Which we have
>> to do anyway when we try to copy data off the source.
> 
> But for raw it's seeking.

(1) That’s a bug in block_status then, isn’t it?

file-posix cannot determine the allocation status, or rather, everything
is allocated.  bdrv_co_block_status() should probably pass @want_zero on
to the driver’s implementation, and file-posix should just
unconditionally return DATA if it’s false.

(2) Why would you even use sync=top for raw nodes?

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v3 05/33] Switch to new api in qdev/bus

2019-08-09 Thread Cédric Le Goater


>>> So.. is this change in the device_reset() signature really necessary?
>>> Even if there are compelling reasons to handle warm reset in the new
>>> API, that doesn't been you need to change device_reset() itself from
>>> its established meaning of a cold (i.e. as per power cycle) reset.
>>> Warm resets are generally called in rather more specific circumstances
>>> (often under guest software direction) so it seems likely that users
>>> would want to engage with the new reset API directly.  Or we could
>>> just create a device_warm_reset() wrapper.  That would also avoid the
>>> bare boolean parameter, which is not great for readability (you have
>>> to look up the signature to have any idea what it means).
>>
>> I've added device_reset_cold/warm wrapper functions to avoid having to
>> pass the boolean parameter. it seems I forgot to use them in qdev.c
>> I suppose, like you said, we could live with
>> + no function with the boolean parameter
>> + device_reset doing cold reset
>> + device_reset_warm (or device_warm_reset) for the warm version
> 
> Ok, good.
> 
> I'm afraid the whole series still makes me pretty uncomfortable,
> though, since the whole "warm reset" concept still seems way to vague
> to me.

Isn't the reset after the CAS negotiation sequence between the hypervisor
and the pseries machine some sort of warm reset driven by SW ? 


C.  



Re: [Qemu-block] [Qemu-devel] [PATCH v3 05/33] Switch to new api in qdev/bus

2019-08-09 Thread Peter Maydell
On Fri, 9 Aug 2019 at 01:10, David Gibson  wrote:
>
> On Wed, Jul 31, 2019 at 01:31:28PM +0200, Philippe Mathieu-Daudé wrote:
> > On 7/31/19 11:29 AM, Damien Hedde wrote:
> > > On 7/31/19 8:05 AM, David Gibson wrote:
> > >> On Mon, Jul 29, 2019 at 04:56:26PM +0200, Damien Hedde wrote:
> > >>> @@ -922,7 +906,7 @@ static void device_set_realized(Object *obj, bool 
> > >>> value, Error **errp)
> > >>>  }
> > >>>  }
> > >>>  if (dev->hotplugged) {
> > >>> -device_legacy_reset(dev);
> > >>> +device_reset(dev, true);
> > >>
> > >> So.. is this change in the device_reset() signature really necessary?
> > >> Even if there are compelling reasons to handle warm reset in the new
> > >> API, that doesn't been you need to change device_reset() itself from
> > >> its established meaning of a cold (i.e. as per power cycle) reset.

So, I don't think that actually is the established meaning of
device_reset(). I think our current semantics for this function are
"reset of some sort, probably cold, but in practice called in
lots of places which really wanted some other kind of reset,
because our current reset semantics are not well-defined".

For instance in s390-pci-inst.c the handling of CLP_SET_DISABLE_PCI_FN
calls device_reset() on a device. I bet that's not really intentionally
trying to model "we powered it off and on again".
hw/scsi/vmw_pvscsi.c uses device_reset() in its handling of
the guest "reset the SCSI bus" command. I know that isn't literally
powering off the SCSI disks and powering them on again.

The advantage of an actual API change here is that it means we go
and look at all the call sites and find out what the semantics
they actually wanted were, and hopefully that then feeds into the
design of the new API and we make sure we can handle those
semantics in a non-confused way.

> > >> Warm resets are generally called in rather more specific circumstances
> > >> (often under guest software direction) so it seems likely that users
> > >> would want to engage with the new reset API directly.  Or we could
> > >> just create a device_warm_reset() wrapper.  That would also avoid the
> > >> bare boolean parameter, which is not great for readability (you have
> > >> to look up the signature to have any idea what it means).
> >
> > If the boolean is not meaningful, we can use an enum...
>
> That's certainly better, but I'm not seeing a compelling reason not to
> have separate function names.  It's just as clear and means less churn.

One advantage of an enum is that we have an extendable API,
so we could say something like "all devices support reset types
'cold' and 'warm', but individual devices or families of devices
can also support other kinds of reset". So the relevant s390
devices could directly support the kinds of reset currently
enumerated by the enum s390_reset, and then if you know you're
dealing with an s390 device you can ask it to reset with the
right semantics rather than fudging it with one of the generic ones.
Or devices with "if I'm reset by the guest writing to a register then
I reset less stuff than a reset via external reset line" have a
way to model that.

thanks
-- PMM



Re: [Qemu-block] [PATCH v3 07/33] automatically add vmstate for reset support in devices

2019-08-09 Thread Damien Hedde


On 8/9/19 12:32 PM, Peter Maydell wrote:
> On Fri, 9 Aug 2019 at 11:29, Damien Hedde  wrote:
>>
>> One way to keep the feature without copy-pasting vmsd would be to add
>> a new vmstate_register with an additional argument to pass the base
>> class vmsd section and handle the whole thing there.
> 
> If we have a vmstate section which contains no actual data,
> only subsections with 'needed' functions, is it migration
> compatible with previous versions in the same way that
> tacking a subsection onto an existing function is?

I don't think so because of the naming schema. I had to forge the
correct name for the reset subsection for every device.
Each subsection must be named after its parent section plus '/something'.

--
Damien



Re: [Qemu-block] [PATCH v3 07/33] automatically add vmstate for reset support in devices

2019-08-09 Thread Peter Maydell
On Fri, 9 Aug 2019 at 11:29, Damien Hedde  wrote:
>
> One way to keep the feature without copy-pasting vmsd would be to add
> a new vmstate_register with an additional argument to pass the base
> class vmsd section and handle the whole thing there.

If we have a vmstate section which contains no actual data,
only subsections with 'needed' functions, is it migration
compatible with previous versions in the same way that
tacking a subsection onto an existing function is?

thanks
-- PMM



Re: [Qemu-block] [PATCH v3 07/33] automatically add vmstate for reset support in devices

2019-08-09 Thread Damien Hedde



On 8/9/19 12:07 PM, Peter Maydell wrote:
> On Thu, 8 Aug 2019 at 16:42, Dr. David Alan Gilbert  
> wrote:
>>
>> * Peter Maydell (peter.mayd...@linaro.org) wrote:
>>> On Mon, 29 Jul 2019 at 15:59, Damien Hedde  
>>> wrote:

 This add the reset related sections for every QOM
 device.
>>>
>>> A bit more detail in the commit message would help, I think --
>>> this is adding extra machinery which has to copy and modify
>>> the VMStateDescription passed in by the device in order to
>>> add the subsection that handles reset.
>>>
>>> I've added Dave Gilbert to the already long cc list since this
>>> is migration related.
>>
>> I don't like dynamically modifying all the vmsds.
> 
> Yeah, I'm not a huge fan of it either, but it does mean that
> the state gets migrated and we don't have a pile of really
> easy to miss bugs where we forgot to say "this device needs to
> migrate the reset state" and it means we don't have every
> device we ever write in the future needing to say "this device
> needs to migrate reset state"...
> 
>> Aren't you going to have to understand each devices reset behaviour
>> and make sure it does something sane? e.g. it might have a postload
>> that registers a timer or something that you wouldn't want to do if it's
>> in reset.
>>
>> The easiest way is to write a macro that you can easily add to devices
>> you've checked subsection list (like the way we have a
>> VMSTATE_USB_DEVICE).
> 
> One problem is that as soon as somebody writes a USB controller
> or PCI controller model that can be held in reset, every
> device that could sat behind it automatically now could find
> that it's migrated while it's in reset.
> 

One way to keep the feature without copy-pasting vmsd would be to add
a new vmstate_register with an additional argument to pass the base
class vmsd section and handle the whole thing there.

--
Damien



Re: [Qemu-block] [PATCH 4/8] block/backup: improve unallocated clusters skipping

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
09.08.2019 12:12, Vladimir Sementsov-Ogievskiy wrote:
> 09.08.2019 10:50, Vladimir Sementsov-Ogievskiy wrote:
>> 07.08.2019 21:01, Max Reitz wrote:
>>> On 07.08.19 10:07, Vladimir Sementsov-Ogievskiy wrote:
 Limit block_status querying to request bounds on write notifier to
 avoid extra seeking.
>>>
>>> I don’t understand this reasoning.  Checking whether something is
>>> allocated for qcow2 should just mean an L2 cache lookup.  Which we have
>>> to do anyway when we try to copy data off the source.
>>
>> But for raw it's seeking. I think we just shouldn't do any unnecessary 
>> operations
>> in copy-before-write handling. So instead of calling
>> block_status(request_start, disk_end) I think it's better to call
>> block_status(request_start, request_end). And it is more applicable for 
>> reusing this
>> code too.
>>
>>>
>>> I could understand saying this makes the code easier, but I actually
>>> don’t think it does.  If you implemented checking the allocation status
>>> only for areas where the bitmap is reset (which I think this patch
>>> should), then it’d just duplicate the existing loop.
>>>
 Signed-off-by: Vladimir Sementsov-Ogievskiy 
 ---
   block/backup.c | 38 +-
   1 file changed, 21 insertions(+), 17 deletions(-)

 diff --git a/block/backup.c b/block/backup.c
 index 11e27c844d..a4d37d2d62 100644
 --- a/block/backup.c
 +++ b/block/backup.c
>>>
>>> [...]
>>>
 @@ -267,6 +267,18 @@ static int coroutine_fn backup_do_cow(BackupBlockJob 
 *job,
   wait_for_overlapping_requests(job, start, end);
   cow_request_begin(&cow_request, job, start, end);
 +    if (job->initializing_bitmap) {
 +    int64_t off, chunk;
 +
 +    for (off = offset; offset < end; offset += chunk) {
>>>
>>> This is what I’m referring to, I think this loop should skip areas that
>>> are clean.
>>
>> Agree, will do.
> 
> Hmm, I remembered that I thought of it, but decided that it may be not 
> efficient if
> bitmap fragmentation is higher than block-status fragmentation. So, if we 
> don't know
> what is better, let's keep simpler code.

Hmm2, but I see a compromise (may be you meant exactly this): not using next 
zero as limit
(but always use request_end), but before each iteration skip to next_dirty.

> 
>>
>>>
 +    ret = backup_bitmap_reset_unallocated(job, off, end - off, 
 &chunk);
 +    if (ret < 0) {
 +    chunk = job->cluster_size;
 +    }
 +    }
 +    }
 +    ret = 0;
 +
   while (start < end) {
   int64_t dirty_end;
 @@ -276,15 +288,6 @@ static int coroutine_fn backup_do_cow(BackupBlockJob 
 *job,
   continue; /* already copied */
   }
 -    if (job->initializing_bitmap) {
 -    ret = backup_bitmap_reset_unallocated(job, start, 
 &skip_bytes);
 -    if (ret == 0) {
 -    trace_backup_do_cow_skip_range(job, start, skip_bytes);
 -    start += skip_bytes;
 -    continue;
 -    }
 -    }
 -
   dirty_end = bdrv_dirty_bitmap_next_zero(job->copy_bitmap, start,
   end - start);
   if (dirty_end < 0) {
>>>
>>> Hm, you resolved that conflict differently from me.
>>>
>>> I decided the bdrv_dirty_bitmap_next_zero() call should go before the
>>> backup_bitmap_reset_unallocated() call so that we can then do a
>>>
>>>    dirty_end = MIN(dirty_end, start + skip_bytes);
>>>
>>> because we probably don’t want to copy anything past what
>>> backup_bitmap_reset_unallocated() has inquired.
>>>
>>>
>>> But then again this patch eliminates the difference anyway...
>>>  >
 @@ -546,7 +549,8 @@ static int coroutine_fn backup_run(Job *job, Error 
 **errp)
   goto out;
   }
 -    ret = backup_bitmap_reset_unallocated(s, offset, &count);
 +    ret = backup_bitmap_reset_unallocated(s, offset, s->len - 
 offset,
 +  &count);
   if (ret < 0) {
   goto out;
   }

>>>
>>>
>>
>>
> 
> 


-- 
Best regards,
Vladimir


Re: [Qemu-block] [PATCH 4/8] block/backup: improve unallocated clusters skipping

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
09.08.2019 10:50, Vladimir Sementsov-Ogievskiy wrote:
> 07.08.2019 21:01, Max Reitz wrote:
>> On 07.08.19 10:07, Vladimir Sementsov-Ogievskiy wrote:
>>> Limit block_status querying to request bounds on write notifier to
>>> avoid extra seeking.
>>
>> I don’t understand this reasoning.  Checking whether something is
>> allocated for qcow2 should just mean an L2 cache lookup.  Which we have
>> to do anyway when we try to copy data off the source.
> 
> But for raw it's seeking. I think we just shouldn't do any unnecessary 
> operations
> in copy-before-write handling. So instead of calling
> block_status(request_start, disk_end) I think it's better to call
> block_status(request_start, request_end). And it is more applicable for 
> reusing this
> code too.

oops, and seek lack the limit anyway.

But anyway, we have API with count limit and it seems natural to assume that 
some
driver may do more calculations for bigger request. So smaller request is good 
for
copy-before-write operation when we are in a harry to unfreeze guest write as 
soon
as possible.

Hmm, example of such drive may be NBD, which can concatenate block-status 
results of
exported disk.


> 
>>
>> I could understand saying this makes the code easier, but I actually
>> don’t think it does.  If you implemented checking the allocation status
>> only for areas where the bitmap is reset (which I think this patch
>> should), then it’d just duplicate the existing loop.
>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy 
>>> ---
>>>   block/backup.c | 38 +-
>>>   1 file changed, 21 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/block/backup.c b/block/backup.c
>>> index 11e27c844d..a4d37d2d62 100644
>>> --- a/block/backup.c
>>> +++ b/block/backup.c
>>
>> [...]
>>
>>> @@ -267,6 +267,18 @@ static int coroutine_fn backup_do_cow(BackupBlockJob 
>>> *job,
>>>   wait_for_overlapping_requests(job, start, end);
>>>   cow_request_begin(&cow_request, job, start, end);
>>> +    if (job->initializing_bitmap) {
>>> +    int64_t off, chunk;
>>> +
>>> +    for (off = offset; offset < end; offset += chunk) {
>>
>> This is what I’m referring to, I think this loop should skip areas that
>> are clean.
> 
> Agree, will do.
> 
>>
>>> +    ret = backup_bitmap_reset_unallocated(job, off, end - off, 
>>> &chunk);
>>> +    if (ret < 0) {
>>> +    chunk = job->cluster_size;
>>> +    }
>>> +    }
>>> +    }
>>> +    ret = 0;
>>> +
>>>   while (start < end) {
>>>   int64_t dirty_end;
>>> @@ -276,15 +288,6 @@ static int coroutine_fn backup_do_cow(BackupBlockJob 
>>> *job,
>>>   continue; /* already copied */
>>>   }
>>> -    if (job->initializing_bitmap) {
>>> -    ret = backup_bitmap_reset_unallocated(job, start, &skip_bytes);
>>> -    if (ret == 0) {
>>> -    trace_backup_do_cow_skip_range(job, start, skip_bytes);
>>> -    start += skip_bytes;
>>> -    continue;
>>> -    }
>>> -    }
>>> -
>>>   dirty_end = bdrv_dirty_bitmap_next_zero(job->copy_bitmap, start,
>>>   end - start);
>>>   if (dirty_end < 0) {
>>
>> Hm, you resolved that conflict differently from me.
>>
>> I decided the bdrv_dirty_bitmap_next_zero() call should go before the
>> backup_bitmap_reset_unallocated() call so that we can then do a
>>
>>    dirty_end = MIN(dirty_end, start + skip_bytes);
>>
>> because we probably don’t want to copy anything past what
>> backup_bitmap_reset_unallocated() has inquired.
>>
>>
>> But then again this patch eliminates the difference anyway...
>>  >
>>> @@ -546,7 +549,8 @@ static int coroutine_fn backup_run(Job *job, Error 
>>> **errp)
>>>   goto out;
>>>   }
>>> -    ret = backup_bitmap_reset_unallocated(s, offset, &count);
>>> +    ret = backup_bitmap_reset_unallocated(s, offset, s->len - 
>>> offset,
>>> +  &count);
>>>   if (ret < 0) {
>>>   goto out;
>>>   }
>>>
>>
>>
> 
> 


-- 
Best regards,
Vladimir


Re: [Qemu-block] [PATCH v3 07/33] automatically add vmstate for reset support in devices

2019-08-09 Thread Peter Maydell
On Thu, 8 Aug 2019 at 16:42, Dr. David Alan Gilbert  wrote:
>
> * Peter Maydell (peter.mayd...@linaro.org) wrote:
> > On Mon, 29 Jul 2019 at 15:59, Damien Hedde  
> > wrote:
> > >
> > > This add the reset related sections for every QOM
> > > device.
> >
> > A bit more detail in the commit message would help, I think --
> > this is adding extra machinery which has to copy and modify
> > the VMStateDescription passed in by the device in order to
> > add the subsection that handles reset.
> >
> > I've added Dave Gilbert to the already long cc list since this
> > is migration related.
>
> I don't like dynamically modifying all the vmsds.

Yeah, I'm not a huge fan of it either, but it does mean that
the state gets migrated and we don't have a pile of really
easy to miss bugs where we forgot to say "this device needs to
migrate the reset state" and it means we don't have every
device we ever write in the future needing to say "this device
needs to migrate reset state"...

> Aren't you going to have to understand each devices reset behaviour
> and make sure it does something sane? e.g. it might have a postload
> that registers a timer or something that you wouldn't want to do if it's
> in reset.
>
> The easiest way is to write a macro that you can easily add to devices
> you've checked subsection list (like the way we have a
> VMSTATE_USB_DEVICE).

One problem is that as soon as somebody writes a USB controller
or PCI controller model that can be held in reset, every
device that could sat behind it automatically now could find
that it's migrated while it's in reset.

thanks
-- PMM



Re: [Qemu-block] [Qemu-devel] [PATCH v3 12/33] hw/pci/: remove qdev/qbus_reset_all call

2019-08-09 Thread Damien Hedde



On 8/7/19 5:31 PM, Peter Maydell wrote:
> On Mon, 29 Jul 2019 at 15:59, Damien Hedde  wrote:
>>
>> Replace deprecated qdev/bus_reset_all by device/bus_reset_warm.
>>
>> This does not impact the behavior.
>>
>> Signed-off-by: Damien Hedde 
> 
> I'll come back to patches 12-28 later. They're all ok
> in principle, we just need to check that in each individual
> case:
>  * we've made the right choice of cold vs warm reset
>  * we're ok to switch to 'reset including children' from
>the legacy 'reset not including children' semantics

I'm working on a patch reroll to fix what's been reviewed so far. Should
I put aside the patches 12-28 for now ? They could be part of 1 or 2
separate following series or I can re-add them later on when we agree on
the api.

--
Damien



Re: [Qemu-block] [Qemu-devel] [PATCH v3 02/33] add temporary device_legacy_reset function to replace device_reset

2019-08-09 Thread Damien Hedde


On 8/7/19 4:27 PM, Peter Maydell wrote:
> On Mon, 29 Jul 2019 at 15:58, Damien Hedde  wrote:
>>
>> Provide a temporary function doing what device_reset does to do the
>> transition with Resettable API which will trigger a prototype change
>> of device_reset.
> 
> The other point here is that device_legacy_reset() resets
> only that device, not any of its qbus children, right?
> So the new function which we eventually replace the callsites
> with also has different semantics, which is why we do the
> changes one by one in patches 10-28.

Yes, for device_reset there is a change of scope.

> 
> So you could add:
> 
> The new resettable API function also has different semantics
> (resetting child buses as well as the specified device).
> Subsequent commits will make the changeover for each callsite
> individually; once that is complete device_legacy_reset() will be
> removed.

sure

> 
>> Signed-off-by: Damien Hedde 
> 
> I agree with David that patch 3 could be squashed into this one.

ok

> 
> If you do that and tweak the commit message you can have
> Reviewed-by: Peter Maydell 
> 
> thanks
> -- PMM
> 



Re: [Qemu-block] [PATCH 4/8] block/backup: improve unallocated clusters skipping

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
09.08.2019 10:50, Vladimir Sementsov-Ogievskiy wrote:
> 07.08.2019 21:01, Max Reitz wrote:
>> On 07.08.19 10:07, Vladimir Sementsov-Ogievskiy wrote:
>>> Limit block_status querying to request bounds on write notifier to
>>> avoid extra seeking.
>>
>> I don’t understand this reasoning.  Checking whether something is
>> allocated for qcow2 should just mean an L2 cache lookup.  Which we have
>> to do anyway when we try to copy data off the source.
> 
> But for raw it's seeking. I think we just shouldn't do any unnecessary 
> operations
> in copy-before-write handling. So instead of calling
> block_status(request_start, disk_end) I think it's better to call
> block_status(request_start, request_end). And it is more applicable for 
> reusing this
> code too.
> 
>>
>> I could understand saying this makes the code easier, but I actually
>> don’t think it does.  If you implemented checking the allocation status
>> only for areas where the bitmap is reset (which I think this patch
>> should), then it’d just duplicate the existing loop.
>>
>>> Signed-off-by: Vladimir Sementsov-Ogievskiy 
>>> ---
>>>   block/backup.c | 38 +-
>>>   1 file changed, 21 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/block/backup.c b/block/backup.c
>>> index 11e27c844d..a4d37d2d62 100644
>>> --- a/block/backup.c
>>> +++ b/block/backup.c
>>
>> [...]
>>
>>> @@ -267,6 +267,18 @@ static int coroutine_fn backup_do_cow(BackupBlockJob 
>>> *job,
>>>   wait_for_overlapping_requests(job, start, end);
>>>   cow_request_begin(&cow_request, job, start, end);
>>> +    if (job->initializing_bitmap) {
>>> +    int64_t off, chunk;
>>> +
>>> +    for (off = offset; offset < end; offset += chunk) {
>>
>> This is what I’m referring to, I think this loop should skip areas that
>> are clean.
> 
> Agree, will do.

Hmm, I remembered that I thought of it, but decided that it may be not 
efficient if
bitmap fragmentation is higher than block-status fragmentation. So, if we don't 
know
what is better, let's keep simpler code.

> 
>>
>>> +    ret = backup_bitmap_reset_unallocated(job, off, end - off, 
>>> &chunk);
>>> +    if (ret < 0) {
>>> +    chunk = job->cluster_size;
>>> +    }
>>> +    }
>>> +    }
>>> +    ret = 0;
>>> +
>>>   while (start < end) {
>>>   int64_t dirty_end;
>>> @@ -276,15 +288,6 @@ static int coroutine_fn backup_do_cow(BackupBlockJob 
>>> *job,
>>>   continue; /* already copied */
>>>   }
>>> -    if (job->initializing_bitmap) {
>>> -    ret = backup_bitmap_reset_unallocated(job, start, &skip_bytes);
>>> -    if (ret == 0) {
>>> -    trace_backup_do_cow_skip_range(job, start, skip_bytes);
>>> -    start += skip_bytes;
>>> -    continue;
>>> -    }
>>> -    }
>>> -
>>>   dirty_end = bdrv_dirty_bitmap_next_zero(job->copy_bitmap, start,
>>>   end - start);
>>>   if (dirty_end < 0) {
>>
>> Hm, you resolved that conflict differently from me.
>>
>> I decided the bdrv_dirty_bitmap_next_zero() call should go before the
>> backup_bitmap_reset_unallocated() call so that we can then do a
>>
>>    dirty_end = MIN(dirty_end, start + skip_bytes);
>>
>> because we probably don’t want to copy anything past what
>> backup_bitmap_reset_unallocated() has inquired.
>>
>>
>> But then again this patch eliminates the difference anyway...
>>  >
>>> @@ -546,7 +549,8 @@ static int coroutine_fn backup_run(Job *job, Error 
>>> **errp)
>>>   goto out;
>>>   }
>>> -    ret = backup_bitmap_reset_unallocated(s, offset, &count);
>>> +    ret = backup_bitmap_reset_unallocated(s, offset, s->len - 
>>> offset,
>>> +  &count);
>>>   if (ret < 0) {
>>>   goto out;
>>>   }
>>>
>>
>>
> 
> 


-- 
Best regards,
Vladimir


[Qemu-block] [PATCH] qemu-img convert: Deprecate using -n and -o together

2019-08-09 Thread Kevin Wolf
bdrv_create options specified with -o have no effect when skipping image
creation with -n, so this doesn't make sense. Warn against the misuse
and deprecate the combination so we can make it a hard error later.

Signed-off-by: Kevin Wolf 
---
 qemu-img.c   | 5 +
 qemu-deprecated.texi | 7 +++
 2 files changed, 12 insertions(+)

diff --git a/qemu-img.c b/qemu-img.c
index 79983772de..d9321f6418 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -2231,6 +2231,11 @@ static int img_convert(int argc, char **argv)
 goto fail_getopt;
 }
 
+if (skip_create && options) {
+warn_report("-o has no effect when skipping image creation");
+warn_report("This will become an error in future QEMU versions.");
+}
+
 s.src_num = argc - optind - 1;
 out_filename = s.src_num >= 1 ? argv[argc - 1] : NULL;
 
diff --git a/qemu-deprecated.texi b/qemu-deprecated.texi
index fff07bb2a3..7673d079c5 100644
--- a/qemu-deprecated.texi
+++ b/qemu-deprecated.texi
@@ -305,6 +305,13 @@ to just export the entire image and then mount only 
/dev/nbd0p1 than
 it is to reinvoke @command{qemu-nbd -c /dev/nbd0} limited to just a
 subset of the image.
 
+@subsection qemu-img convert -n -o (since 4.2.0)
+
+All options specified in @option{-o} are image creation options, so they
+have no effect when used with @option{-n} to skip image creation. This
+combination never made sense and shows that the user misunderstood the
+effect of the options, so this will be made an error in future versions.
+
 @section Build system
 
 @subsection Python 2 support (since 4.1.0)
-- 
2.20.1




Re: [Qemu-block] [PATCH v3 08/33] Add function to control reset with gpio inputs

2019-08-09 Thread Damien Hedde



On 8/9/19 7:51 AM, David Gibson wrote:
> On Wed, Aug 07, 2019 at 11:37:51AM +0100, Peter Maydell wrote:
>> On Wed, 31 Jul 2019 at 07:33, David Gibson  
>> wrote:
>>>
>>> On Mon, Jul 29, 2019 at 04:56:29PM +0200, Damien Hedde wrote:
 It adds the possibility to add 2 gpios to control the warm and cold reset.
 With theses ios, the reset can be maintained during some time.
 Each io is associated with a state to detect level changes.

 Vmstate subsections are also added to the existsing device_reset
 subsection.
>>>
>>> This doesn't seem like a thing that should be present on every single
>>> DeviceState.
>>
>> It's a facility that's going to be useful to multiple different
>> subclasses of DeviceState, so it seems to me cleaner to
>> have base class support for the common feature rather than
>> to reimplement it entirely from scratch in every subclass
>> that wants it.
> 
> Hm, I suppose so.  Would it really have to be from scratch, though?
> Couldn't some suitable helper functions make adding such GPIOs to a
> device pretty straightforward?
> 

This patch does that. A device does have to use the helper to add the
gpio. Either qdev_init_warm_reset_gpio(...) or
qdev_init_cold_reset_gpio(...) , like any another gpio.

The mechanics to control the reset with gpio change is done in the base
class and there is some state pre-allocated (and associated vmstate
description) to it.

If that's a problem I can only provide helpers and let devices handle
state allocation and vmstate addition.

Damien



Re: [Qemu-block] [PATCH v3 14/33] hw/s390x/s390-virtio-ccw.c: remove qdev_reset_all call

2019-08-09 Thread Damien Hedde



On 8/8/19 12:50 PM, Cornelia Huck wrote:
> On Mon, 29 Jul 2019 16:56:35 +0200
> Damien Hedde  wrote:
> 
>> Replace deprecated qdev_reset_all by device_reset_warm.
>>
>> This does not impact the behavior.
> 
> Not so sure about that; see below.

In this case, qdev_reset_all is used. The qdev subtree is already reset.
device_reset_* does keep the same children-then-parent call order for
the legacy reset method. This is why the behavior is unchanged.

I will add this point in the commit message of this patch (and also in
other qdev/qbus_reset_all replacement patches) so it will be more clear.

--
Damien



Re: [Qemu-block] [Qemu-devel] [PATCH] tests/test-hbitmap: test next_zero and _next_dirty_area after truncate

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
08.08.2019 3:04, John Snow wrote:
> 
> 
> On 8/5/19 12:46 PM, Vladimir Sementsov-Ogievskiy wrote:
>> Test that hbitmap_next_zero and hbitmap_next_dirty_area can find things
>> after old bitmap end.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy 
>> ---
>>
>> It's a follow-up for
>>
>>  [PATCH for-4.1] util/hbitmap: update orig_size on truncate
>>
>>   tests/test-hbitmap.c | 22 ++
>>   1 file changed, 22 insertions(+)
>>
>> diff --git a/tests/test-hbitmap.c b/tests/test-hbitmap.c
>> index 592d8219db..eed5d288cb 100644
>> --- a/tests/test-hbitmap.c
>> +++ b/tests/test-hbitmap.c
>> @@ -1004,6 +1004,15 @@ static void test_hbitmap_next_zero_4(TestHBitmapData 
>> *data, const void *unused)
>>   test_hbitmap_next_zero_do(data, 4);
>>   }
>>   
>> +static void test_hbitmap_next_zero_after_truncate(TestHBitmapData *data,
>> +  const void *unused)
>> +{
>> +hbitmap_test_init(data, L1, 0);
>> +hbitmap_test_truncate_impl(data, L1 * 2);
>> +hbitmap_set(data->hb, 0, L1);
>> +test_hbitmap_next_zero_check(data, 0);
>> +}
>> +
>>   static void test_hbitmap_next_dirty_area_check(TestHBitmapData *data,
>>  uint64_t offset,
>>  uint64_t count)
>> @@ -1104,6 +1113,15 @@ static void 
>> test_hbitmap_next_dirty_area_4(TestHBitmapData *data,
>>   test_hbitmap_next_dirty_area_do(data, 4);
>>   }
>>   
>> +static void test_hbitmap_next_dirty_area_after_truncate(TestHBitmapData 
>> *data,
>> +const void *unused)
>> +{
>> +hbitmap_test_init(data, L1, 0);
>> +hbitmap_test_truncate_impl(data, L1 * 2);
>> +hbitmap_set(data->hb, L1 + 1, 1);
>> +test_hbitmap_next_dirty_area_check(data, 0, UINT64_MAX);
>> +}
>> +
>>   int main(int argc, char **argv)
>>   {
>>   g_test_init(&argc, &argv, NULL);
>> @@ -1169,6 +1187,8 @@ int main(int argc, char **argv)
>>test_hbitmap_next_zero_0);
>>   hbitmap_test_add("/hbitmap/next_zero/next_zero_4",
>>test_hbitmap_next_zero_4);
>> +hbitmap_test_add("/hbitmap/next_zero/next_zero_after_truncate",
>> + test_hbitmap_next_zero_after_truncate);
>>   
>>   hbitmap_test_add("/hbitmap/next_dirty_area/next_dirty_area_0",
>>test_hbitmap_next_dirty_area_0);
>> @@ -1176,6 +1196,8 @@ int main(int argc, char **argv)
>>test_hbitmap_next_dirty_area_1);
>>   hbitmap_test_add("/hbitmap/next_dirty_area/next_dirty_area_4",
>>test_hbitmap_next_dirty_area_4);
>> +
>> hbitmap_test_add("/hbitmap/next_dirty_area/next_dirty_area_after_truncate",
>> + test_hbitmap_next_dirty_area_after_truncate);
>>   
>>   g_test_run();
>>   
>>
> 
> Tested-by: John Snow 
> Reviewed-by: John Snow 
> 
> And staged:
> 
> Thanks, applied to my bitmaps tree:
> 
> https://github.com/jnsnow/qemu/commits/bitmaps

Thanks! Hmm but I don't see the patch at this link, neither 01 and 03 from
"[Qemu-devel] [PATCH 0/3] backup fixes for 4.1?"...


-- 
Best regards,
Vladimir


Re: [Qemu-block] [PATCH 8/8] block/backup: backup_do_cow: use bdrv_dirty_bitmap_next_dirty_area

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
07.08.2019 21:46, Max Reitz wrote:
> On 07.08.19 10:07, Vladimir Sementsov-Ogievskiy wrote:
>> Use effective bdrv_dirty_bitmap_next_dirty_area interface.
>>
>> Signed-off-by: Vladimir Sementsov-Ogievskiy 
>> ---
>>   block/backup.c | 56 ++
>>   1 file changed, 24 insertions(+), 32 deletions(-)
>>
>> diff --git a/block/backup.c b/block/backup.c
>> index f19c9195fe..5ede0c8290 100644
>> --- a/block/backup.c
>> +++ b/block/backup.c
>> @@ -235,25 +235,28 @@ static int coroutine_fn backup_do_cow(BackupBlockJob 
>> *job,
>>   {
>>   CowRequest cow_request;
>>   int ret = 0;
>> -int64_t start, end; /* bytes */
>> +uint64_t off, cur_bytes;
>> +int64_t aligned_offset, aligned_bytes, aligned_end;
>>   BdrvRequestFlags read_flags =
>>   is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0;
>>   
>>   qemu_co_rwlock_rdlock(&job->flush_rwlock);
>>   
>> -start = QEMU_ALIGN_DOWN(offset, job->cluster_size);
>> -end = QEMU_ALIGN_UP(bytes + offset, job->cluster_size);
>> +aligned_offset = QEMU_ALIGN_DOWN(offset, job->cluster_size);
>> +aligned_end = QEMU_ALIGN_UP(bytes + offset, job->cluster_size);
>> +aligned_bytes = aligned_end - aligned_offset;
>>   
>> -trace_backup_do_cow_enter(job, start, offset, bytes);
>> +trace_backup_do_cow_enter(job, aligned_offset, offset, bytes);
>>   
>> -wait_for_overlapping_requests(job, start, end);
>> -cow_request_begin(&cow_request, job, start, end);
>> +wait_for_overlapping_requests(job, aligned_offset, aligned_end);
>> +cow_request_begin(&cow_request, job, aligned_offset, aligned_end);
>>   
>>   if (job->initializing_bitmap) {
>> -int64_t off, chunk;
>> +int64_t chunk;
>>   
>> -for (off = offset; offset < end; offset += chunk) {
>> -ret = backup_bitmap_reset_unallocated(job, off, end - off, 
>> &chunk);
>> +for (off = aligned_offset; off < aligned_end; off += chunk) {
>> +ret = backup_bitmap_reset_unallocated(job, off, aligned_end - 
>> off,
>> +  &chunk);
>>   if (ret < 0) {
>>   chunk = job->cluster_size;
>>   }
>> @@ -261,47 +264,36 @@ static int coroutine_fn backup_do_cow(BackupBlockJob 
>> *job,
>>   }
>>   ret = 0;
>>   
>> -while (start < end) {
>> -int64_t dirty_end;
>> -int64_t cur_bytes;
>> -
>> -if (!bdrv_dirty_bitmap_get(job->copy_bitmap, start)) {
>> -trace_backup_do_cow_skip(job, start);
>> -start += job->cluster_size;
>> -continue; /* already copied */
>> -}
>> -
>> -dirty_end = bdrv_dirty_bitmap_next_zero(job->copy_bitmap, start,
>> -end - start);
>> -if (dirty_end < 0) {
>> -dirty_end = end;
>> -}
>> -
>> -trace_backup_do_cow_process(job, start);
>> -cur_bytes = MIN(dirty_end - start, job->len - start);
>> -bdrv_reset_dirty_bitmap(job->copy_bitmap, start, dirty_end - start);
>> +off = aligned_offset;
>> +cur_bytes = aligned_bytes;
>> +while (bdrv_dirty_bitmap_next_dirty_area(job->copy_bitmap,
>> + &off, &cur_bytes))
>> +{
>> +trace_backup_do_cow_process(job, off);
>> +bdrv_reset_dirty_bitmap(job->copy_bitmap, off, cur_bytes);
>>   
>>   if (job->use_copy_range) {
>> -ret = backup_cow_with_offload(job, start, cur_bytes, 
>> read_flags);
>> +ret = backup_cow_with_offload(job, off, cur_bytes, read_flags);
>>   if (ret < 0) {
>>   job->use_copy_range = false;
>>   }
>>   }
>>   if (!job->use_copy_range) {
>> -ret = backup_cow_with_bounce_buffer(job, start, cur_bytes,
>> +ret = backup_cow_with_bounce_buffer(job, off, cur_bytes,
>>   read_flags, error_is_read);
>>   }
>>   if (ret < 0) {
>> -bdrv_set_dirty_bitmap(job->copy_bitmap, start, dirty_end - 
>> start);
>> +bdrv_set_dirty_bitmap(job->copy_bitmap, off, cur_bytes);
>>   break;
>>   }
>>   
>>   /* Publish progress, guest I/O counts as progress too.  Note that 
>> the
>>* offset field is an opaque progress value, it is not a disk 
>> offset.
>>*/
>> -start += cur_bytes;
>> +off += cur_bytes;
>>   job->bytes_read += cur_bytes;
>>   job_progress_update(&job->common.job, cur_bytes);
>> +cur_bytes = offset + bytes - off;
> 
> Hm, why not aligned_end - off?
> 
> (You could also drop aligned_bytes altogether and always set cur_bytes
> to aligned_end - off.)
> 

Hmm, right.

Thank you for reviewing!

-- 
Best regards,
Vladimir


Re: [Qemu-block] [PATCH 3/8] block/io: handle alignment and max_transfer for copy_range

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
07.08.2019 20:28, Max Reitz wrote:
> On 07.08.19 10:07, Vladimir Sementsov-Ogievskiy wrote:
>> copy_range ignores these limitations, let's improve it. block/backup
>> code handles max_transfer for copy_range by itself, now it's not needed
>> more, drop it.
> 
> Shouldn’t this be two separate patches?

Not a problem, will do.

> 
>> Signed-off-by: Vladimir Sementsov-Ogievskiy 
>> ---
>>   block/backup.c | 11 ++-
>>   block/io.c | 41 +
>>   2 files changed, 35 insertions(+), 17 deletions(-)
> 
> [...]
> 
>> diff --git a/block/io.c b/block/io.c
>> index 06305c6ea6..5abbd0fff2 100644
>> --- a/block/io.c
>> +++ b/block/io.c
>> @@ -3005,6 +3005,12 @@ static int coroutine_fn bdrv_co_copy_range_internal(
>>   {
>>   BdrvTrackedRequest req;
>>   int ret;
>> +uint32_t align = MAX(src->bs->bl.request_alignment,
>> + dst->bs->bl.request_alignment);
>> +uint32_t max_transfer =
>> +
>> QEMU_ALIGN_DOWN(MIN_NON_ZERO(MIN_NON_ZERO(src->bs->bl.max_transfer,
>> +  
>> dst->bs->bl.max_transfer),
>> + INT_MAX), align);
> 
> I suppose the outer QEMU_ALIGN_DOWN() may result in @max_transfer of 0
> (in theory, if one’s max_transfer is smaller than the other’s alignment).
> 
> Not likely, but should still be handled.
> 
> The rest looks good to me.
> 
> Max
> 
>>   /* TODO We can support BDRV_REQ_NO_FALLBACK here */
>>   assert(!(read_flags & BDRV_REQ_NO_FALLBACK));
> 


-- 
Best regards,
Vladimir


Re: [Qemu-block] [PATCH 4/8] block/backup: improve unallocated clusters skipping

2019-08-09 Thread Vladimir Sementsov-Ogievskiy
07.08.2019 21:01, Max Reitz wrote:
> On 07.08.19 10:07, Vladimir Sementsov-Ogievskiy wrote:
>> Limit block_status querying to request bounds on write notifier to
>> avoid extra seeking.
> 
> I don’t understand this reasoning.  Checking whether something is
> allocated for qcow2 should just mean an L2 cache lookup.  Which we have
> to do anyway when we try to copy data off the source.

But for raw it's seeking. I think we just shouldn't do any unnecessary 
operations
in copy-before-write handling. So instead of calling
block_status(request_start, disk_end) I think it's better to call
block_status(request_start, request_end). And it is more applicable for reusing 
this
code too.

> 
> I could understand saying this makes the code easier, but I actually
> don’t think it does.  If you implemented checking the allocation status
> only for areas where the bitmap is reset (which I think this patch
> should), then it’d just duplicate the existing loop.
> 
>> Signed-off-by: Vladimir Sementsov-Ogievskiy 
>> ---
>>   block/backup.c | 38 +-
>>   1 file changed, 21 insertions(+), 17 deletions(-)
>>
>> diff --git a/block/backup.c b/block/backup.c
>> index 11e27c844d..a4d37d2d62 100644
>> --- a/block/backup.c
>> +++ b/block/backup.c
> 
> [...]
> 
>> @@ -267,6 +267,18 @@ static int coroutine_fn backup_do_cow(BackupBlockJob 
>> *job,
>>   wait_for_overlapping_requests(job, start, end);
>>   cow_request_begin(&cow_request, job, start, end);
>>   
>> +if (job->initializing_bitmap) {
>> +int64_t off, chunk;
>> +
>> +for (off = offset; offset < end; offset += chunk) {
> 
> This is what I’m referring to, I think this loop should skip areas that
> are clean.

Agree, will do.

> 
>> +ret = backup_bitmap_reset_unallocated(job, off, end - off, 
>> &chunk);
>> +if (ret < 0) {
>> +chunk = job->cluster_size;
>> +}
>> +}
>> +}
>> +ret = 0;
>> +
>>   while (start < end) {
>>   int64_t dirty_end;
>>   
>> @@ -276,15 +288,6 @@ static int coroutine_fn backup_do_cow(BackupBlockJob 
>> *job,
>>   continue; /* already copied */
>>   }
>>   
>> -if (job->initializing_bitmap) {
>> -ret = backup_bitmap_reset_unallocated(job, start, &skip_bytes);
>> -if (ret == 0) {
>> -trace_backup_do_cow_skip_range(job, start, skip_bytes);
>> -start += skip_bytes;
>> -continue;
>> -}
>> -}
>> -
>>   dirty_end = bdrv_dirty_bitmap_next_zero(job->copy_bitmap, start,
>>   end - start);
>>   if (dirty_end < 0) {
> 
> Hm, you resolved that conflict differently from me.
> 
> I decided the bdrv_dirty_bitmap_next_zero() call should go before the
> backup_bitmap_reset_unallocated() call so that we can then do a
> 
>dirty_end = MIN(dirty_end, start + skip_bytes);
> 
> because we probably don’t want to copy anything past what
> backup_bitmap_reset_unallocated() has inquired.
> 
> 
> But then again this patch eliminates the difference anyway...
>  >
>> @@ -546,7 +549,8 @@ static int coroutine_fn backup_run(Job *job, Error 
>> **errp)
>>   goto out;
>>   }
>>   
>> -ret = backup_bitmap_reset_unallocated(s, offset, &count);
>> +ret = backup_bitmap_reset_unallocated(s, offset, s->len - 
>> offset,
>> +  &count);
>>   if (ret < 0) {
>>   goto out;
>>   }
>>
> 
> 


-- 
Best regards,
Vladimir