[PATCH v15 12/13] block/stream: add s->target_bs

2020-12-15 Thread Vladimir Sementsov-Ogievskiy
Add a direct link to the target bs for convenience and to simplify the
following commit, which will insert the COR filter above the target bs.

This is a part of the original commit written by Andrey.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Max Reitz 
---
 block/stream.c | 23 ++-
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/block/stream.c b/block/stream.c
index 045d6bc76b..626dfa2b22 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -33,6 +33,7 @@ typedef struct StreamBlockJob {
 BlockJob common;
 BlockDriverState *base_overlay; /* COW overlay (stream from this) */
 BlockDriverState *above_base;   /* Node directly above the base */
+BlockDriverState *target_bs;
 BlockdevOnError on_error;
 char *backing_file_str;
 bool bs_read_only;
@@ -53,23 +54,20 @@ static void stream_abort(Job *job)
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
 
 if (s->chain_frozen) {
-BlockJob *bjob = &s->common;
-bdrv_unfreeze_backing_chain(blk_bs(bjob->blk), s->above_base);
+bdrv_unfreeze_backing_chain(s->target_bs, s->above_base);
 }
 }
 
 static int stream_prepare(Job *job)
 {
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
-BlockJob *bjob = &s->common;
-BlockDriverState *bs = blk_bs(bjob->blk);
-BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs);
+BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs);
 BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base);
 BlockDriverState *unfiltered_base = bdrv_skip_filters(base);
 Error *local_err = NULL;
 int ret = 0;
 
-bdrv_unfreeze_backing_chain(bs, s->above_base);
+bdrv_unfreeze_backing_chain(s->target_bs, s->above_base);
 s->chain_frozen = false;
 
 if (bdrv_cow_child(unfiltered_bs)) {
@@ -95,13 +93,12 @@ static void stream_clean(Job *job)
 {
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
 BlockJob *bjob = &s->common;
-BlockDriverState *bs = blk_bs(bjob->blk);
 
 /* Reopen the image back in read-only mode if necessary */
 if (s->bs_read_only) {
 /* Give up write permissions before making it read-only */
 blk_set_perm(bjob->blk, 0, BLK_PERM_ALL, &error_abort);
-bdrv_reopen_set_read_only(bs, true, NULL);
+bdrv_reopen_set_read_only(s->target_bs, true, NULL);
 }
 
 g_free(s->backing_file_str);
@@ -111,8 +108,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
 {
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
 BlockBackend *blk = s->common.blk;
-BlockDriverState *bs = blk_bs(blk);
-BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs);
+BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs);
 bool enable_cor = !bdrv_cow_child(s->base_overlay);
 int64_t len;
 int64_t offset = 0;
@@ -125,7 +121,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
 return 0;
 }
 
-len = bdrv_getlength(bs);
+len = bdrv_getlength(s->target_bs);
 if (len < 0) {
 return len;
 }
@@ -137,7 +133,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
  * account.
  */
 if (enable_cor) {
-bdrv_enable_copy_on_read(bs);
+bdrv_enable_copy_on_read(s->target_bs);
 }
 
 for ( ; offset < len; offset += n) {
@@ -199,7 +195,7 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
 }
 
 if (enable_cor) {
-bdrv_disable_copy_on_read(bs);
+bdrv_disable_copy_on_read(s->target_bs);
 }
 
 /* Do not remove the backing file if an error was there but ignored. */
@@ -314,6 +310,7 @@ void stream_start(const char *job_id, BlockDriverState *bs,
 s->base_overlay = base_overlay;
 s->above_base = above_base;
 s->backing_file_str = g_strdup(backing_file_str);
+s->target_bs = bs;
 s->bs_read_only = bs_read_only;
 s->chain_frozen = true;
 
-- 
2.25.4




[PATCH v15 11/13] iotests: 30: prepare to COR filter insertion by stream job

2020-12-15 Thread Vladimir Sementsov-Ogievskiy
test_stream_parallel runs parallel stream jobs that intersect, so that
the top of one is the base of another. That is OK now, but it would
become a problem once we insert the filter, as one job would want to use
another job's filter as its above_base node.

The correct thing to do is to move to the new interface: a "bottom"
argument instead of base. This guarantees that the jobs' actions don't
intersect.
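
For illustration only (an editor's sketch, not part of the patch), this is
roughly what the reworked invocation looks like when two jobs share one
backing chain; the node names and the chain layout are assumptions modeled
on iotest 030:

# Sketch: assumes a chain base <- node1 <- node2 <- node3 <- node4 is
# already attached to the VM (all names are illustrative).
import iotests

with iotests.VM() as vm:
    vm.launch()
    # Each job streams a disjoint sub-chain: the first copies node1 into
    # node2, the second copies node3 into node4, so neither job needs to
    # touch the other's filter node.
    vm.qmp('block-stream', device='node2', job_id='stream-node2',
           bottom='node1', speed=1024)
    vm.qmp('block-stream', device='node4', job_id='stream-node4',
           bottom='node3', speed=1024)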

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Max Reitz 
---
 tests/qemu-iotests/030 | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030
index dcb4b5d6a6..bd8cf9cff7 100755
--- a/tests/qemu-iotests/030
+++ b/tests/qemu-iotests/030
@@ -245,7 +245,9 @@ class TestParallelOps(iotests.QMPTestCase):
 node_name = 'node%d' % i
 job_id = 'stream-%s' % node_name
 pending_jobs.append(job_id)
-result = self.vm.qmp('block-stream', device=node_name, job_id=job_id, base=self.imgs[i-2], speed=1024)
+result = self.vm.qmp('block-stream', device=node_name,
+ job_id=job_id, bottom=f'node{i-1}',
+ speed=1024)
 self.assert_qmp(result, 'return', {})
 
 for job in pending_jobs:
-- 
2.25.4




Re: [PATCH] block: report errno when flock fcntl fails

2020-12-15 Thread Vladimir Sementsov-Ogievskiy

15.12.2020 22:01, David Edmondson wrote:

When a call to fcntl(2) for the purpose of manipulating file locks
fails, report the error returned by fcntl.

Signed-off-by: David Edmondson


Reviewed-by: Vladimir Sementsov-Ogievskiy 

--
Best regards,
Vladimir



[PATCH v15 13/13] block: apply COR-filter to block-stream jobs

2020-12-15 Thread Vladimir Sementsov-Ogievskiy
From: Andrey Shinkevich 

This patch completes the series with the COR-filter applied to
block-stream operations.

Adding the filter makes it possible, in the future, to implement
discarding of copied regions in backing files during the block-stream
job, to reduce disk overuse (for that we need control over permissions).

Also, the filter is now smart enough to do copy-on-read with a specified
base, so guest reads benefit even when block-streaming only a part of
the backing chain.

Several iotests are slightly modified due to filter insertion.
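
For illustration only (an editor's sketch, not part of the patch): nothing
changes for the user at the QMP level, since the job inserts the
copy-on-read filter by itself; filter-node-name merely gives the filter a
predictable name. The node and job names below are assumptions:

# Sketch: assumes a chain base <- top with node names 'base' and 'top'
# already attached to the VM (names are illustrative).
import iotests

with iotests.VM() as vm:
    vm.launch()
    # The stream job inserts a copy-on-read filter above 'top'; naming it
    # makes the filter easy to spot in the node graph while the job runs.
    # Underscores in keyword arguments are sent as dashes in QMP.
    vm.qmp('block-stream', device='top', job_id='stream0',
           base_node='base', filter_node_name='stream-filter')
    nodes = vm.qmp('query-named-block-nodes')['return']
    print([n['node-name'] for n in nodes])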

Signed-off-by: Andrey Shinkevich 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/stream.c | 105 ++---
 tests/qemu-iotests/030 |   8 +--
 tests/qemu-iotests/141.out |   2 +-
 tests/qemu-iotests/245 |  20 ---
 4 files changed, 80 insertions(+), 55 deletions(-)

diff --git a/block/stream.c b/block/stream.c
index 626dfa2b22..1fa742b0db 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -17,8 +17,10 @@
 #include "block/blockjob_int.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qerror.h"
+#include "qapi/qmp/qdict.h"
 #include "qemu/ratelimit.h"
 #include "sysemu/block-backend.h"
+#include "block/copy-on-read.h"
 
 enum {
 /*
@@ -33,11 +35,11 @@ typedef struct StreamBlockJob {
 BlockJob common;
 BlockDriverState *base_overlay; /* COW overlay (stream from this) */
 BlockDriverState *above_base;   /* Node directly above the base */
+BlockDriverState *cor_filter_bs;
 BlockDriverState *target_bs;
 BlockdevOnError on_error;
 char *backing_file_str;
 bool bs_read_only;
-bool chain_frozen;
 } StreamBlockJob;
 
 static int coroutine_fn stream_populate(BlockBackend *blk,
@@ -45,17 +47,7 @@ static int coroutine_fn stream_populate(BlockBackend *blk,
 {
 assert(bytes < SIZE_MAX);
 
-return blk_co_preadv(blk, offset, bytes, NULL,
- BDRV_REQ_COPY_ON_READ | BDRV_REQ_PREFETCH);
-}
-
-static void stream_abort(Job *job)
-{
-StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
-
-if (s->chain_frozen) {
-bdrv_unfreeze_backing_chain(s->target_bs, s->above_base);
-}
+return blk_co_preadv(blk, offset, bytes, NULL, BDRV_REQ_PREFETCH);
 }
 
 static int stream_prepare(Job *job)
@@ -67,8 +59,9 @@ static int stream_prepare(Job *job)
 Error *local_err = NULL;
 int ret = 0;
 
-bdrv_unfreeze_backing_chain(s->target_bs, s->above_base);
-s->chain_frozen = false;
+/* We should drop the filter at this point, as the filter holds the backing chain */
+bdrv_cor_filter_drop(s->cor_filter_bs);
+s->cor_filter_bs = NULL;
 
 if (bdrv_cow_child(unfiltered_bs)) {
 const char *base_id = NULL, *base_fmt = NULL;
@@ -94,6 +87,11 @@ static void stream_clean(Job *job)
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
 BlockJob *bjob = &s->common;
 
+if (s->cor_filter_bs) {
+bdrv_cor_filter_drop(s->cor_filter_bs);
+s->cor_filter_bs = NULL;
+}
+
 /* Reopen the image back in read-only mode if necessary */
 if (s->bs_read_only) {
 /* Give up write permissions before making it read-only */
@@ -109,7 +107,6 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
 StreamBlockJob *s = container_of(job, StreamBlockJob, common.job);
 BlockBackend *blk = s->common.blk;
 BlockDriverState *unfiltered_bs = bdrv_skip_filters(s->target_bs);
-bool enable_cor = !bdrv_cow_child(s->base_overlay);
 int64_t len;
 int64_t offset = 0;
 uint64_t delay_ns = 0;
@@ -127,15 +124,6 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
 }
 job_progress_set_remaining(&s->common.job, len);
 
-/* Turn on copy-on-read for the whole block device so that guest read
- * requests help us make progress.  Only do this when copying the entire
- * backing chain since the copy-on-read operation does not take base into
- * account.
- */
-if (enable_cor) {
-bdrv_enable_copy_on_read(s->target_bs);
-}
-
 for ( ; offset < len; offset += n) {
 bool copy;
 int ret;
@@ -194,10 +182,6 @@ static int coroutine_fn stream_run(Job *job, Error **errp)
 }
 }
 
-if (enable_cor) {
-bdrv_disable_copy_on_read(s->target_bs);
-}
-
 /* Do not remove the backing file if an error was there but ignored. */
 return error;
 }
@@ -209,7 +193,6 @@ static const BlockJobDriver stream_job_driver = {
 .free  = block_job_free,
 .run   = stream_run,
 .prepare   = stream_prepare,
-.abort = stream_abort,
 .clean = stream_clean,
 .user_resume   = block_job_user_resume,
 },
@@ -228,7 +211,9 @@ void stream_start(const char *job_id, BlockDriverState *bs,
 bool bs_read_only;
 int basic_flags = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
 BlockDriverState *base_overlay;
+BlockDriverState 

[PATCH v15 06/13] iotests: add #310 to test bottom node in COR driver

2020-12-15 Thread Vladimir Sementsov-Ogievskiy
From: Andrey Shinkevich 

The test case #310 is similar to #216 by Max Reitz. The difference is
that test #310 passes a bottom node to the COR filter driver.

Signed-off-by: Andrey Shinkevich 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
  [vsementsov: detach backing to test reads from top, limit to qcow2]
---
 tests/qemu-iotests/310 | 116 +
 tests/qemu-iotests/310.out |  15 +
 tests/qemu-iotests/group   |   1 +
 3 files changed, 132 insertions(+)
 create mode 100755 tests/qemu-iotests/310
 create mode 100644 tests/qemu-iotests/310.out

diff --git a/tests/qemu-iotests/310 b/tests/qemu-iotests/310
new file mode 100755
index 00..a35e8e14f5
--- /dev/null
+++ b/tests/qemu-iotests/310
@@ -0,0 +1,116 @@
+#!/usr/bin/env python3
+#
+# Copy-on-read tests using a COR filter with a bottom node
+#
+# Copyright (C) 2018 Red Hat, Inc.
+# Copyright (c) 2020 Virtuozzo International GmbH
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import iotests
+from iotests import log, qemu_img, qemu_io_silent
+
+# Need backing file support
+iotests.script_initialize(supported_fmts=['qcow2'],
+  supported_platforms=['linux'])
+
+log('')
+log('=== Copy-on-read across nodes ===')
+log('')
+
+# This test is similar to the 216 one by Max Reitz 
+# The difference is that this test case involves a bottom node to the
+# COR filter driver.
+
+with iotests.FilePath('base.img') as base_img_path, \
+ iotests.FilePath('mid.img') as mid_img_path, \
+ iotests.FilePath('top.img') as top_img_path, \
+ iotests.VM() as vm:
+
+log('--- Setting up images ---')
+log('')
+
+assert qemu_img('create', '-f', iotests.imgfmt, base_img_path, '64M') == 0
+assert qemu_io_silent(base_img_path, '-c', 'write -P 1 0M 1M') == 0
+assert qemu_io_silent(base_img_path, '-c', 'write -P 1 3M 1M') == 0
+assert qemu_img('create', '-f', iotests.imgfmt, '-b', base_img_path,
+'-F', iotests.imgfmt, mid_img_path) == 0
+assert qemu_io_silent(mid_img_path,  '-c', 'write -P 3 2M 1M') == 0
+assert qemu_io_silent(mid_img_path,  '-c', 'write -P 3 4M 1M') == 0
+assert qemu_img('create', '-f', iotests.imgfmt, '-b', mid_img_path,
+'-F', iotests.imgfmt, top_img_path) == 0
+assert qemu_io_silent(top_img_path,  '-c', 'write -P 2 1M 1M') == 0
+
+#        0 1 2 3 4
+# top      2
+# mid        3   3
+# base   1     1
+
+log('Done')
+
+log('')
+log('--- Doing COR ---')
+log('')
+
+vm.launch()
+
+log(vm.qmp('blockdev-add',
+   node_name='node0',
+   driver='copy-on-read',
+   bottom='node2',
+   file={
+   'driver': iotests.imgfmt,
+   'file': {
+   'driver': 'file',
+   'filename': top_img_path
+   },
+   'backing': {
+   'node-name': 'node2',
+   'driver': iotests.imgfmt,
+   'file': {
+   'driver': 'file',
+   'filename': mid_img_path
+   },
+   'backing': {
+   'driver': iotests.imgfmt,
+   'file': {
+   'driver': 'file',
+   'filename': base_img_path
+   }
+   },
+   }
+   }))
+
+# Trigger COR
+log(vm.qmp('human-monitor-command',
+   command_line='qemu-io node0 "read 0 5M"'))
+
+vm.shutdown()
+
+log('')
+log('--- Checking COR result ---')
+log('')
+
+# Detach backing to check that we can read the data from the top level now
+assert qemu_img('rebase', '-u', '-b', '', '-f', iotests.imgfmt,
+top_img_path,) == 0
+
+assert qemu_io_silent(top_img_path,  '-c', 'read -P 0 0 1M') == 0
+assert qemu_io_silent(top_img_path,  '-c', 'read -P 2 1M 1M') == 0
+assert qemu_io_silent(top_img_path,  '-c', 'read -P 3 2M 1M') == 0
+assert qemu_io_silent(top_img_path,  '-c', 'read -P 0 3M 1M') == 0
+assert qemu_io_silent(top_img_path,  '-c', 'read -P 3 4M 1M') == 0
+
+log('Done')
diff --git a/tests/qemu-iotests/310.out b/tests/qemu-iotests/310.out
new file mode 100644
index 

[PATCH v15 08/13] copy-on-read: skip non-guest reads if no copy needed

2020-12-15 Thread Vladimir Sementsov-Ogievskiy
From: Andrey Shinkevich 

If the BDRV_REQ_PREFETCH flag is set, skip the idle read/write
operations in the COR driver. This can be exploited to optimize the
COR algorithms. At the moment such a check only takes effect during
the block-stream job.

Add the BDRV_REQ_PREFETCH flag to the supported_read_flags of the
COR filter.

block: Modify the comment for the BDRV_REQ_PREFETCH flag, as we are
going to use it on its own and pass it to the COR filter driver for
further processing.

Signed-off-by: Andrey Shinkevich 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Max Reitz 
---
 include/block/block.h |  8 +---
 block/copy-on-read.c  | 14 ++
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index 8ea794959b..f652f31406 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -81,9 +81,11 @@ typedef enum {
 BDRV_REQ_NO_FALLBACK= 0x100,
 
 /*
- * BDRV_REQ_PREFETCH may be used only together with BDRV_REQ_COPY_ON_READ
- * on read request and means that caller doesn't really need data to be
- * written to qiov parameter which may be NULL.
+ * BDRV_REQ_PREFETCH makes sense only in the context of copy-on-read
+ * (i.e., together with the BDRV_REQ_COPY_ON_READ flag or when a COR
+ * filter is involved), in which case it signals that the COR operation
+ * need not read the data into memory (qiov) but only ensure they are
+ * copied to the top layer (i.e., that COR operation is done).
  */
 BDRV_REQ_PREFETCH  = 0x200,
 /* Mask of valid flags */
diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 71560984f6..9cad9e1b8c 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -50,6 +50,8 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags,
 return -EINVAL;
 }
 
+bs->supported_read_flags = BDRV_REQ_PREFETCH;
+
 bs->supported_write_flags = BDRV_REQ_WRITE_UNCHANGED |
 (BDRV_REQ_FUA & bs->file->bs->supported_write_flags);
 
@@ -172,10 +174,14 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs,
 }
 }
 
-ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset,
-  local_flags);
-if (ret < 0) {
-return ret;
+/* Skip if neither read nor write are needed */
+if ((local_flags & (BDRV_REQ_PREFETCH | BDRV_REQ_COPY_ON_READ)) !=
+BDRV_REQ_PREFETCH) {
+ret = bdrv_co_preadv_part(bs->file, offset, n, qiov, qiov_offset,
+  local_flags);
+if (ret < 0) {
+return ret;
+}
 }
 
 offset += n;
-- 
2.25.4




[PATCH v15 03/13] copy-on-read: add filter drop function

2020-12-15 Thread Vladimir Sementsov-Ogievskiy
From: Andrey Shinkevich 

Provide an API for COR-filter removal. Also, drop the filter child
permissions to an inactive state when the filter node is being
removed.
To insert the filter, the block generic-layer function
bdrv_insert_node() can be used.
The new function bdrv_cor_filter_drop() may be considered an
intermediate solution until the QEMU permission update system has been
overhauled. Then we will be able to implement the API function
bdrv_remove_node() at the block generic layer.

Signed-off-by: Andrey Shinkevich 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Max Reitz 
---
 block/copy-on-read.h | 32 +
 block/copy-on-read.c | 56 
 2 files changed, 88 insertions(+)
 create mode 100644 block/copy-on-read.h

diff --git a/block/copy-on-read.h b/block/copy-on-read.h
new file mode 100644
index 00..7bf405dccd
--- /dev/null
+++ b/block/copy-on-read.h
@@ -0,0 +1,32 @@
+/*
+ * Copy-on-read filter block driver
+ *
+ * The filter driver performs Copy-On-Read (COR) operations
+ *
+ * Copyright (c) 2018-2020 Virtuozzo International GmbH.
+ *
+ * Author:
+ *   Andrey Shinkevich 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef BLOCK_COPY_ON_READ
+#define BLOCK_COPY_ON_READ
+
+#include "block/block_int.h"
+
+void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs);
+
+#endif /* BLOCK_COPY_ON_READ */
diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index cb03e0f2d3..618c4c4f43 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -23,11 +23,20 @@
 #include "qemu/osdep.h"
 #include "block/block_int.h"
 #include "qemu/module.h"
+#include "qapi/error.h"
+#include "block/copy-on-read.h"
+
+
+typedef struct BDRVStateCOR {
+bool active;
+} BDRVStateCOR;
 
 
 static int cor_open(BlockDriverState *bs, QDict *options, int flags,
 Error **errp)
 {
+BDRVStateCOR *state = bs->opaque;
+
 bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
false, errp);
@@ -42,6 +51,13 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags,
 ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
 bs->file->bs->supported_zero_flags);
 
+state->active = true;
+
+/*
+ * We don't need to call bdrv_child_refresh_perms() now as the permissions
+ * will be updated later when the filter node gets its parent.
+ */
+
 return 0;
 }
 
@@ -57,6 +73,17 @@ static void cor_child_perm(BlockDriverState *bs, BdrvChild *c,
uint64_t perm, uint64_t shared,
uint64_t *nperm, uint64_t *nshared)
 {
+BDRVStateCOR *s = bs->opaque;
+
+if (!s->active) {
+/*
+ * While the filter is being removed
+ */
+*nperm = 0;
+*nshared = BLK_PERM_ALL;
+return;
+}
+
 *nperm = perm & PERM_PASSTHROUGH;
 *nshared = (shared & PERM_PASSTHROUGH) | PERM_UNCHANGED;
 
@@ -135,6 +162,7 @@ static void cor_lock_medium(BlockDriverState *bs, bool locked)
 
 static BlockDriver bdrv_copy_on_read = {
 .format_name= "copy-on-read",
+.instance_size  = sizeof(BDRVStateCOR),
 
 .bdrv_open  = cor_open,
 .bdrv_child_perm= cor_child_perm,
@@ -154,6 +182,34 @@ static BlockDriver bdrv_copy_on_read = {
 .is_filter  = true,
 };
 
+
+void bdrv_cor_filter_drop(BlockDriverState *cor_filter_bs)
+{
+BdrvChild *child;
+BlockDriverState *bs;
+BDRVStateCOR *s = cor_filter_bs->opaque;
+
+child = bdrv_filter_child(cor_filter_bs);
+if (!child) {
+return;
+}
+bs = child->bs;
+
+/* Retain the BDS until we complete the graph change. */
+bdrv_ref(bs);
+/* Hold a guest back from writing while permissions are being reset. */
+bdrv_drained_begin(bs);
+/* Drop permissions before the graph change. */
+s->active = false;
+bdrv_child_refresh_perms(cor_filter_bs, child, &error_abort);
+bdrv_replace_node(cor_filter_bs, bs, &error_abort);
+
+bdrv_drained_end(bs);
+bdrv_unref(bs);
+bdrv_unref(cor_filter_bs);
+}
+
+
 static void bdrv_copy_on_read_init(void)
 {
 

[PATCH v15 07/13] block: include supported_read_flags into BDS structure

2020-12-15 Thread Vladimir Sementsov-Ogievskiy
From: Andrey Shinkevich 

Add the new member supported_read_flags to the BlockDriverState
structure. It will control the flags set for copy-on-read operations.
Make the block generic layer evaluate supported read flags before they
go to a block driver.

Suggested-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Andrey Shinkevich 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
 [vsementsov: use assert instead of abort]
Reviewed-by: Max Reitz 
---
 include/block/block_int.h |  4 
 block/io.c| 10 --
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/include/block/block_int.h b/include/block/block_int.h
index 6f778e2517..1f56443440 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -881,6 +881,10 @@ struct BlockDriverState {
 /* I/O Limits */
 BlockLimits bl;
 
+/*
+ * Flags honored during pread
+ */
+unsigned int supported_read_flags;
 /* Flags honored during pwrite (so far: BDRV_REQ_FUA,
  * BDRV_REQ_WRITE_UNCHANGED).
  * If a driver does not support BDRV_REQ_WRITE_UNCHANGED, those
diff --git a/block/io.c b/block/io.c
index 24205f5168..851fe53604 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1431,6 +1431,9 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
 if (flags & BDRV_REQ_COPY_ON_READ) {
 int64_t pnum;
 
+/* The flag BDRV_REQ_COPY_ON_READ has reached its addressee */
+flags &= ~BDRV_REQ_COPY_ON_READ;
+
 ret = bdrv_is_allocated(bs, offset, bytes, &pnum);
 if (ret < 0) {
 goto out;
@@ -1452,9 +1455,11 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
 goto out;
 }
 
+assert(!(flags & ~bs->supported_read_flags));
+
 max_bytes = ROUND_UP(MAX(0, total_bytes - offset), align);
 if (bytes <= max_bytes && bytes <= max_transfer) {
-ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, 0);
+ret = bdrv_driver_preadv(bs, offset, bytes, qiov, qiov_offset, flags);
 goto out;
 }
 
@@ -1467,7 +1472,8 @@ static int coroutine_fn bdrv_aligned_preadv(BdrvChild *child,
 
 ret = bdrv_driver_preadv(bs, offset + bytes - bytes_remaining,
  num, qiov,
- qiov_offset + bytes - bytes_remaining, 0);
+ qiov_offset + bytes - bytes_remaining,
+ flags);
 max_bytes -= num;
 } else {
 num = bytes_remaining;
-- 
2.25.4




[PATCH v15 01/13] copy-on-read: support preadv/pwritev_part functions

2020-12-15 Thread Vladimir Sementsov-Ogievskiy
From: Andrey Shinkevich 

Add support for the recently introduced functions
bdrv_co_preadv_part()
and
bdrv_co_pwritev_part()
to the COR-filter driver.

Signed-off-by: Andrey Shinkevich 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
---
 block/copy-on-read.c | 28 
 1 file changed, 16 insertions(+), 12 deletions(-)

diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 2816e61afe..cb03e0f2d3 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -74,21 +74,25 @@ static int64_t cor_getlength(BlockDriverState *bs)
 }
 
 
-static int coroutine_fn cor_co_preadv(BlockDriverState *bs,
-  uint64_t offset, uint64_t bytes,
-  QEMUIOVector *qiov, int flags)
+static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs,
+   uint64_t offset, uint64_t bytes,
+   QEMUIOVector *qiov,
+   size_t qiov_offset,
+   int flags)
 {
-return bdrv_co_preadv(bs->file, offset, bytes, qiov,
-  flags | BDRV_REQ_COPY_ON_READ);
+return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
+   flags | BDRV_REQ_COPY_ON_READ);
 }
 
 
-static int coroutine_fn cor_co_pwritev(BlockDriverState *bs,
-   uint64_t offset, uint64_t bytes,
-   QEMUIOVector *qiov, int flags)
+static int coroutine_fn cor_co_pwritev_part(BlockDriverState *bs,
+uint64_t offset,
+uint64_t bytes,
+QEMUIOVector *qiov,
+size_t qiov_offset, int flags)
 {
-
-return bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
+return bdrv_co_pwritev_part(bs->file, offset, bytes, qiov, qiov_offset,
+flags);
 }
 
 
@@ -137,8 +141,8 @@ static BlockDriver bdrv_copy_on_read = {
 
 .bdrv_getlength = cor_getlength,
 
-.bdrv_co_preadv = cor_co_preadv,
-.bdrv_co_pwritev= cor_co_pwritev,
+.bdrv_co_preadv_part= cor_co_preadv_part,
+.bdrv_co_pwritev_part   = cor_co_pwritev_part,
 .bdrv_co_pwrite_zeroes  = cor_co_pwrite_zeroes,
 .bdrv_co_pdiscard   = cor_co_pdiscard,
 .bdrv_co_pwritev_compressed = cor_co_pwritev_compressed,
-- 
2.25.4




[PATCH v15 10/13] qapi: block-stream: add "bottom" argument

2020-12-15 Thread Vladimir Sementsov-Ogievskiy
The code already doesn't freeze the base node, and we try to keep it
prepared for the situation when the base node is changed during the
operation. In other words, block-stream doesn't own the base node.

Let's introduce a new interface which should replace the current one and
which fits the code better. Specifying a bottom node instead of a base,
and requiring it to be a non-filter, gives us the following benefits:

 - drop the difference between above_base and base_overlay, which will
   be renamed to just bottom once the old interface is dropped

 - a clean way to work with parallel streams/commits on the same backing
   chain, which otherwise become a problem when we introduce a filter
   for the stream job

 - a cleaner interface. Nobody will be surprised that the base node may
   disappear during block-stream when there is no word about "base" in
   the interface.
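
For illustration only (an editor's sketch, not part of this patch), the two
alternative ways to request the same stream once @bottom exists; the node
names are assumptions:

# Sketch: assumes a chain base <- mid <- top with these node names already
# attached to the VM (names are illustrative).
import iotests

with iotests.VM() as vm:
    vm.launch()
    # Old interface: name the base, i.e. the first node that is NOT copied.
    vm.qmp('block-stream', device='top', job_id='stream-old',
           base_node='base')
    # New interface (an alternative to the call above, not meant to run in
    # parallel with it): name the bottom, i.e. the last node that IS
    # copied. @bottom must not be a filter and cannot be combined with
    # @base or @base-node.
    vm.qmp('block-stream', device='top', job_id='stream-new', bottom='mid')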

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qapi/block-core.json   | 12 ---
 include/block/block_int.h  |  1 +
 block/monitor/block-hmp-cmds.c |  3 +-
 block/stream.c | 50 +++-
 blockdev.c | 59 --
 5 files changed, 94 insertions(+), 31 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index b8094a5ec7..cb0066fd5c 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2517,10 +2517,14 @@
 # @device: the device or node name of the top image
 #
 # @base: the common backing file name.
-#It cannot be set if @base-node is also set.
+#It cannot be set if @base-node or @bottom is also set.
 #
 # @base-node: the node name of the backing file.
-# It cannot be set if @base is also set. (Since 2.8)
+# It cannot be set if @base or @bottom is also set. (Since 2.8)
+#
+# @bottom: the last node in the chain that should be streamed into
+#  top. It cannot be set if @base or @base-node is also set.
+#  It cannot be filter node. (Since 6.0)
 #
 # @backing-file: The backing file string to write into the top
 #image. This filename is not validated.
@@ -2576,8 +2580,8 @@
 ##
 { 'command': 'block-stream',
   'data': { '*job-id': 'str', 'device': 'str', '*base': 'str',
-'*base-node': 'str', '*backing-file': 'str', '*speed': 'int',
-'*on-error': 'BlockdevOnError',
+'*base-node': 'str', '*backing-file': 'str', '*bottom': 'str',
+'*speed': 'int', '*on-error': 'BlockdevOnError',
 '*filter-node-name': 'str',
 '*auto-finalize': 'bool', '*auto-dismiss': 'bool' } }
 
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 1f56443440..4b8aa61fb4 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -1160,6 +1160,7 @@ int is_windows_drive(const char *filename);
  */
 void stream_start(const char *job_id, BlockDriverState *bs,
   BlockDriverState *base, const char *backing_file_str,
+  BlockDriverState *bottom,
   int creation_flags, int64_t speed,
   BlockdevOnError on_error,
   const char *filter_node_name,
diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
index e8a58f326e..afd75ab628 100644
--- a/block/monitor/block-hmp-cmds.c
+++ b/block/monitor/block-hmp-cmds.c
@@ -507,7 +507,8 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict)
 int64_t speed = qdict_get_try_int(qdict, "speed", 0);
 
 qmp_block_stream(true, device, device, base != NULL, base, false, NULL,
- false, NULL, qdict_haskey(qdict, "speed"), speed, true,
+ false, NULL, false, NULL,
+ qdict_haskey(qdict, "speed"), speed, true,
  BLOCKDEV_ON_ERROR_REPORT, false, NULL, false, false, false,
  false, &error);
 
diff --git a/block/stream.c b/block/stream.c
index 6a525a5edf..045d6bc76b 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -221,6 +221,7 @@ static const BlockJobDriver stream_job_driver = {
 
 void stream_start(const char *job_id, BlockDriverState *bs,
   BlockDriverState *base, const char *backing_file_str,
+  BlockDriverState *bottom,
   int creation_flags, int64_t speed,
   BlockdevOnError on_error,
   const char *filter_node_name,
@@ -230,25 +231,42 @@ void stream_start(const char *job_id, BlockDriverState *bs,
 BlockDriverState *iter;
 bool bs_read_only;
 int basic_flags = BLK_PERM_CONSISTENT_READ | BLK_PERM_WRITE_UNCHANGED;
-BlockDriverState *base_overlay = bdrv_find_overlay(bs, base);
+BlockDriverState *base_overlay;
 BlockDriverState *above_base;
 
-if (!base_overlay) {
-error_setg(errp, "'%s' is not in the backing chain of '%s'",
-   base->node_name, bs->node_name);
-return;
-}
+assert(!(base && bottom));
+assert(!(backing_file_str && 

[PATCH v15 09/13] stream: rework backing-file changing

2020-12-15 Thread Vladimir Sementsov-Ogievskiy
From: Andrey Shinkevich 

The stream job, in stream_prepare, calls bdrv_change_backing_file() to
change the backing file in the metadata of bs.

It may use either the backing-file parameter given by the user or just
take the filename of the base at job start.

The backing file format is determined from the base at job finish.

There are some problems with this design; we solve only two of them
with this patch:

1. Consider the scenario with backing-file unset. The current concept of
stream supports changing the base during the job (we don't freeze the
link to the base). So we should not save the base filename at job start,

  - let's determine the name of the base at job finish.

2. Using the direct base to determine the filename and format is not
very good: the base node may be a filter, so its filename may be JSON,
and its format_name is not suitable for storing into the qcow2 metadata
as the backing file format.

  - let's use unfiltered_base
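
For illustration only (an editor's sketch, not part of the patch), how the
backing-file override interacts with the reworked stream_prepare(); the
image paths and node names are assumptions:

# Sketch: assumes base.img <- top.img, attached as nodes 'base' and 'top'
# (paths and names are illustrative).
import iotests
from iotests import qemu_img_pipe

with iotests.VM() as vm:
    vm.launch()
    # Ask for an explicit backing-file string; without it, the name of the
    # (unfiltered) base is determined only when the job finishes.
    vm.qmp('block-stream', device='top', job_id='stream0',
           base_node='base', backing_file='base.img')
    vm.event_wait('BLOCK_JOB_COMPLETED')
    vm.shutdown()

print(qemu_img_pipe('info', 'top.img'))  # backing file: base.img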

Signed-off-by: Andrey Shinkevich 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
  [vsementsov: change commit subject, change logic in stream_prepare]
---
 block/stream.c | 9 +
 blockdev.c | 8 +---
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/block/stream.c b/block/stream.c
index 6e281c71ac..6a525a5edf 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -65,6 +65,7 @@ static int stream_prepare(Job *job)
 BlockDriverState *bs = blk_bs(bjob->blk);
 BlockDriverState *unfiltered_bs = bdrv_skip_filters(bs);
 BlockDriverState *base = bdrv_filter_or_cow_bs(s->above_base);
+BlockDriverState *unfiltered_base = bdrv_skip_filters(base);
 Error *local_err = NULL;
 int ret = 0;
 
@@ -73,10 +74,10 @@ static int stream_prepare(Job *job)
 
 if (bdrv_cow_child(unfiltered_bs)) {
 const char *base_id = NULL, *base_fmt = NULL;
-if (base) {
-base_id = s->backing_file_str;
-if (base->drv) {
-base_fmt = base->drv->format_name;
+if (unfiltered_base) {
+base_id = s->backing_file_str ?: unfiltered_base->filename;
+if (unfiltered_base->drv) {
+base_fmt = unfiltered_base->drv->format_name;
 }
 }
 bdrv_set_backing_hd(unfiltered_bs, base, &local_err);
diff --git a/blockdev.c b/blockdev.c
index c290cb1dca..b58f36fc31 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2510,7 +2510,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
 BlockDriverState *base_bs = NULL;
 AioContext *aio_context;
 Error *local_err = NULL;
-const char *base_name = NULL;
 int job_flags = JOB_DEFAULT;
 
 if (!has_on_error) {
@@ -2538,7 +2537,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
 goto out;
 }
 assert(bdrv_get_aio_context(base_bs) == aio_context);
-base_name = base;
 }
 
 if (has_base_node) {
@@ -2553,7 +2551,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
 }
 assert(bdrv_get_aio_context(base_bs) == aio_context);
 bdrv_refresh_filename(base_bs);
-base_name = base_bs->filename;
 }
 
 /* Check for op blockers in the whole chain between bs and base */
@@ -2573,9 +2570,6 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
 goto out;
 }
 
-/* backing_file string overrides base bs filename */
-base_name = has_backing_file ? backing_file : base_name;
-
 if (has_auto_finalize && !auto_finalize) {
 job_flags |= JOB_MANUAL_FINALIZE;
 }
@@ -2583,7 +2577,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
 job_flags |= JOB_MANUAL_DISMISS;
 }
 
-stream_start(has_job_id ? job_id : NULL, bs, base_bs, base_name,
+stream_start(has_job_id ? job_id : NULL, bs, base_bs, backing_file,
  job_flags, has_speed ? speed : 0, on_error,
  filter_node_name, &local_err);
 if (local_err) {
-- 
2.25.4




[PATCH v15 05/13] qapi: copy-on-read filter: add 'bottom' option

2020-12-15 Thread Vladimir Sementsov-Ogievskiy
From: Andrey Shinkevich 

Add an option to limit copy-on-read operations to a specified sub-chain
of the backing chain, to make the copy-on-read filter useful for the
block-stream job.
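
For illustration only (an editor's sketch, not part of the patch), a
minimal blockdev-add in the spirit of the new iotest 310; the file names
and node names are assumptions:

# Sketch: top.img -> mid.img -> base.img backing chain (all names are
# illustrative). With bottom='mid', data allocated in 'mid' is copied up
# on reads, while data that only lives in 'base' is never copied.
import iotests

with iotests.VM() as vm:
    vm.launch()
    vm.qmp('blockdev-add',
           node_name='cor', driver='copy-on-read', bottom='mid',
           file={'driver': iotests.imgfmt,
                 'file': {'driver': 'file', 'filename': 'top.img'},
                 'backing': {'node-name': 'mid',
                             'driver': iotests.imgfmt,
                             'file': {'driver': 'file',
                                      'filename': 'mid.img'},
                             'backing': {'driver': iotests.imgfmt,
                                         'file': {'driver': 'file',
                                                  'filename': 'base.img'}}}})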

Suggested-by: Max Reitz 
Suggested-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Andrey Shinkevich 
Signed-off-by: Vladimir Sementsov-Ogievskiy 
  [vsementsov: change subject, modified to freeze the chain,
   do some fixes]
---
 qapi/block-core.json | 20 -
 block/copy-on-read.c | 98 +++-
 2 files changed, 115 insertions(+), 3 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 6050cf3c39..b8094a5ec7 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3942,6 +3942,24 @@
   'data': { 'throttle-group': 'str',
 'file' : 'BlockdevRef'
  } }
+
+##
+# @BlockdevOptionsCor:
+#
+# Driver specific block device options for the copy-on-read driver.
+#
+# @bottom: The name of a non-filter node (allocation-bearing layer) that
+#  limits the COR operations in the backing chain (inclusive), so
+#  that no data below this node will be copied by this filter.
+#  If option is absent, the limit is not applied, so that data
+#  from all backing layers may be copied.
+#
+# Since: 6.0
+##
+{ 'struct': 'BlockdevOptionsCor',
+  'base': 'BlockdevOptionsGenericFormat',
+  'data': { '*bottom': 'str' } }
+
 ##
 # @BlockdevOptions:
 #
@@ -3994,7 +4012,7 @@
   'bochs':  'BlockdevOptionsGenericFormat',
   'cloop':  'BlockdevOptionsGenericFormat',
   'compress':   'BlockdevOptionsGenericFormat',
-  'copy-on-read':'BlockdevOptionsGenericFormat',
+  'copy-on-read':'BlockdevOptionsCor',
   'dmg':'BlockdevOptionsGenericFormat',
   'file':   'BlockdevOptionsFile',
   'ftp':'BlockdevOptionsCurlFtp',
diff --git a/block/copy-on-read.c b/block/copy-on-read.c
index 618c4c4f43..71560984f6 100644
--- a/block/copy-on-read.c
+++ b/block/copy-on-read.c
@@ -24,18 +24,24 @@
 #include "block/block_int.h"
 #include "qemu/module.h"
 #include "qapi/error.h"
+#include "qapi/qmp/qdict.h"
 #include "block/copy-on-read.h"
 
 
 typedef struct BDRVStateCOR {
 bool active;
+BlockDriverState *bottom_bs;
+bool chain_frozen;
 } BDRVStateCOR;
 
 
 static int cor_open(BlockDriverState *bs, QDict *options, int flags,
 Error **errp)
 {
+BlockDriverState *bottom_bs = NULL;
 BDRVStateCOR *state = bs->opaque;
+/* Find a bottom node name, if any */
+const char *bottom_node = qdict_get_try_str(options, "bottom");
 
 bs->file = bdrv_open_child(NULL, options, "file", bs, &child_of_bds,
BDRV_CHILD_FILTERED | BDRV_CHILD_PRIMARY,
@@ -51,7 +57,38 @@ static int cor_open(BlockDriverState *bs, QDict *options, int flags,
 ((BDRV_REQ_FUA | BDRV_REQ_MAY_UNMAP | BDRV_REQ_NO_FALLBACK) &
 bs->file->bs->supported_zero_flags);
 
+if (bottom_node) {
+bottom_bs = bdrv_find_node(bottom_node);
+if (!bottom_bs) {
+error_setg(errp, "Bottom node '%s' not found", bottom_node);
+qdict_del(options, "bottom");
+return -EINVAL;
+}
+qdict_del(options, "bottom");
+
+if (!bottom_bs->drv) {
+error_setg(errp, "Bottom node '%s' not opened", bottom_node);
+return -EINVAL;
+}
+
+if (bottom_bs->drv->is_filter) {
+error_setg(errp, "Bottom node '%s' is a filter", bottom_node);
+return -EINVAL;
+}
+
+if (bdrv_freeze_backing_chain(bs, bottom_bs, errp) < 0) {
+return -EINVAL;
+}
+state->chain_frozen = true;
+
+/*
+ * We do freeze the chain, so it shouldn't be removed. Still, storing a
+ * pointer worth bdrv_ref().
+ */
+bdrv_ref(bottom_bs);
+}
 state->active = true;
+state->bottom_bs = bottom_bs;
 
 /*
  * We don't need to call bdrv_child_refresh_perms() now as the permissions
@@ -107,8 +144,46 @@ static int coroutine_fn cor_co_preadv_part(BlockDriverState *bs,
size_t qiov_offset,
int flags)
 {
-return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
-   flags | BDRV_REQ_COPY_ON_READ);
+int64_t n;
+int local_flags;
+int ret;
+BDRVStateCOR *state = bs->opaque;
+
+if (!state->bottom_bs) {
+return bdrv_co_preadv_part(bs->file, offset, bytes, qiov, qiov_offset,
+   flags | BDRV_REQ_COPY_ON_READ);
+}
+
+while (bytes) {
+local_flags = flags;
+
+/* In case of failure, try to copy-on-read anyway */
+ret = bdrv_is_allocated(bs->file->bs, offset, bytes, &n);
+if (ret <= 0) {
+ret = bdrv_is_allocated_above(bdrv_backing_chain_next(bs->file->bs),
+  

[PATCH v15 04/13] qapi: add filter-node-name to block-stream

2020-12-15 Thread Vladimir Sementsov-Ogievskiy
From: Andrey Shinkevich 

Provide the possibility to pass the 'filter-node-name' parameter to the
block-stream job as it is done for the commit block job.
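
For illustration only (an editor's sketch, not part of the patch), the new
parameter in an iotests-style call; the device and filter names are
assumptions:

# Sketch: assumes a node named 'drive0' is already attached to the VM.
# Underscores in keyword arguments are sent as dashes, so
# filter_node_name becomes 'filter-node-name' in the QMP command.
import iotests

with iotests.VM() as vm:
    vm.launch()
    vm.qmp('block-stream', device='drive0', job_id='stream0',
           filter_node_name='stream-filter')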

Signed-off-by: Andrey Shinkevich 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
  [vsementsov: comment indentation, s/Since: 5.2/Since: 6.0/]
Reviewed-by: Max Reitz 
---
 qapi/block-core.json   | 6 ++
 include/block/block_int.h  | 7 ++-
 block/monitor/block-hmp-cmds.c | 4 ++--
 block/stream.c | 4 +++-
 blockdev.c | 4 +++-
 5 files changed, 20 insertions(+), 5 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 04c5196e59..6050cf3c39 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2543,6 +2543,11 @@
 #'stop' and 'enospc' can only be used if the block device
 #supports io-status (see BlockInfo).  Since 1.3.
 #
+# @filter-node-name: the node name that should be assigned to the
+#filter driver that the stream job inserts into the graph
+#above @device. If this option is not given, a node name is
+#autogenerated. (Since: 6.0)
+#
 # @auto-finalize: When false, this job will wait in a PENDING state after it has
 # finished its work, waiting for @block-job-finalize before
 # making any block graph changes.
@@ -2573,6 +2578,7 @@
   'data': { '*job-id': 'str', 'device': 'str', '*base': 'str',
 '*base-node': 'str', '*backing-file': 'str', '*speed': 'int',
 '*on-error': 'BlockdevOnError',
+'*filter-node-name': 'str',
 '*auto-finalize': 'bool', '*auto-dismiss': 'bool' } }
 
 ##
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 1eeafc118c..6f778e2517 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -1142,6 +1142,9 @@ int is_windows_drive(const char *filename);
  *  See @BlockJobCreateFlags
  * @speed: The maximum speed, in bytes per second, or 0 for unlimited.
  * @on_error: The action to take upon error.
+ * @filter_node_name: The node name that should be assigned to the filter
+ *driver that the commit job inserts into the graph above
+ *@bs. NULL means that a node name should be autogenerated.
  * @errp: Error object.
  *
  * Start a streaming operation on @bs.  Clusters that are unallocated
@@ -1154,7 +1157,9 @@ int is_windows_drive(const char *filename);
 void stream_start(const char *job_id, BlockDriverState *bs,
   BlockDriverState *base, const char *backing_file_str,
   int creation_flags, int64_t speed,
-  BlockdevOnError on_error, Error **errp);
+  BlockdevOnError on_error,
+  const char *filter_node_name,
+  Error **errp);
 
 /**
  * commit_start:
diff --git a/block/monitor/block-hmp-cmds.c b/block/monitor/block-hmp-cmds.c
index d15a2be827..e8a58f326e 100644
--- a/block/monitor/block-hmp-cmds.c
+++ b/block/monitor/block-hmp-cmds.c
@@ -508,8 +508,8 @@ void hmp_block_stream(Monitor *mon, const QDict *qdict)
 
 qmp_block_stream(true, device, device, base != NULL, base, false, NULL,
  false, NULL, qdict_haskey(qdict, "speed"), speed, true,
- BLOCKDEV_ON_ERROR_REPORT, false, false, false, false,
- &error);
+ BLOCKDEV_ON_ERROR_REPORT, false, NULL, false, false, false,
+ false, &error);
 
 hmp_handle_error(mon, error);
 }
diff --git a/block/stream.c b/block/stream.c
index 236384f2f7..6e281c71ac 100644
--- a/block/stream.c
+++ b/block/stream.c
@@ -221,7 +221,9 @@ static const BlockJobDriver stream_job_driver = {
 void stream_start(const char *job_id, BlockDriverState *bs,
   BlockDriverState *base, const char *backing_file_str,
   int creation_flags, int64_t speed,
-  BlockdevOnError on_error, Error **errp)
+  BlockdevOnError on_error,
+  const char *filter_node_name,
+  Error **errp)
 {
 StreamBlockJob *s;
 BlockDriverState *iter;
diff --git a/blockdev.c b/blockdev.c
index 412354b4b6..c290cb1dca 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2501,6 +2501,7 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
   bool has_backing_file, const char *backing_file,
   bool has_speed, int64_t speed,
   bool has_on_error, BlockdevOnError on_error,
+  bool has_filter_node_name, const char *filter_node_name,
   bool has_auto_finalize, bool auto_finalize,
   bool has_auto_dismiss, bool auto_dismiss,
   Error **errp)
@@ -2583,7 +2584,8 @@ void qmp_block_stream(bool has_job_id, const char *job_id, const char *device,
 }
 
 stream_start(has_job_id ? 

[PATCH v15 00/13] Apply COR-filter to the block-stream permanently

2020-12-15 Thread Vladimir Sementsov-Ogievskiy
Hi all!

Here is a new version of cor-filter in block-stream series. Main change
is freezing the chain in cor-filter itself.

v15:
02: s/ =  / = /
add Max's r-b
03: add Max's r-b
04: since: 6.0
indent comment
add Max's r-b
05: changed commit msg
wording
document the default
since: 6.0
use bdrv_find_node(), fix errp overwriting
freeze the chain
check bottom is not filter
ref bottom_bs
06: limit to qcow2 to not care
use qemu-img rebase -u -b ''
07: use assert instead of abort
add Max's r-b
08: add Max's r-b
09: changed commit msg (was "stream: skip filters when writing backing file 
name to QCOW2 header")
keep mostly same logic for the case when backing-file is specified, don't 
do bdrv_find_backing_image()
10: don't restrict backing-file for now
11: add Max's r-b
12: add Max's r-b
13: chain is now frozen in filter, so the logic changed around add/remove filter

Andrey Shinkevich (10):
  copy-on-read: support preadv/pwritev_part functions
  block: add API function to insert a node
  copy-on-read: add filter drop function
  qapi: add filter-node-name to block-stream
  qapi: copy-on-read filter: add 'bottom' option
  iotests: add #310 to test bottom node in COR driver
  block: include supported_read_flags into BDS structure
  copy-on-read: skip non-guest reads if no copy needed
  stream: rework backing-file changing
  block: apply COR-filter to block-stream jobs

Vladimir Sementsov-Ogievskiy (3):
  qapi: block-stream: add "bottom" argument
  iotests: 30: prepare to COR filter insertion by stream job
  block/stream: add s->target_bs

 qapi/block-core.json   |  38 ++-
 block/copy-on-read.h   |  32 ++
 include/block/block.h  |  10 +-
 include/block/block_int.h  |  12 ++-
 block.c|  25 +
 block/copy-on-read.c   | 184 +---
 block/io.c |  10 +-
 block/monitor/block-hmp-cmds.c |   7 +-
 block/stream.c | 185 -
 blockdev.c |  69 +---
 tests/qemu-iotests/030 |  12 ++-
 tests/qemu-iotests/141.out |   2 +-
 tests/qemu-iotests/245 |  20 ++--
 tests/qemu-iotests/310 | 116 +
 tests/qemu-iotests/310.out |  15 +++
 tests/qemu-iotests/group   |   1 +
 16 files changed, 608 insertions(+), 130 deletions(-)
 create mode 100644 block/copy-on-read.h
 create mode 100755 tests/qemu-iotests/310
 create mode 100644 tests/qemu-iotests/310.out

-- 
2.25.4




[PATCH v15 02/13] block: add API function to insert a node

2020-12-15 Thread Vladimir Sementsov-Ogievskiy
From: Andrey Shinkevich 

Provide an API for inserting a node into the backing chain.

Suggested-by: Max Reitz 
Signed-off-by: Andrey Shinkevich 
Reviewed-by: Vladimir Sementsov-Ogievskiy 
Reviewed-by: Max Reitz 
---
 include/block/block.h |  2 ++
 block.c   | 25 +
 2 files changed, 27 insertions(+)

diff --git a/include/block/block.h b/include/block/block.h
index 5b81e33e94..8ea794959b 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -360,6 +360,8 @@ void bdrv_append(BlockDriverState *bs_new, BlockDriverState *bs_top,
  Error **errp);
 void bdrv_replace_node(BlockDriverState *from, BlockDriverState *to,
Error **errp);
+BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options,
+   int flags, Error **errp);
 
 int bdrv_parse_aio(const char *mode, int *flags);
 int bdrv_parse_cache_mode(const char *mode, int *flags, bool *writethrough);
diff --git a/block.c b/block.c
index 8f177504d4..8ed9c5eaf9 100644
--- a/block.c
+++ b/block.c
@@ -4704,6 +4704,31 @@ static void bdrv_delete(BlockDriverState *bs)
 g_free(bs);
 }
 
+BlockDriverState *bdrv_insert_node(BlockDriverState *bs, QDict *node_options,
+   int flags, Error **errp)
+{
+BlockDriverState *new_node_bs;
+Error *local_err = NULL;
+
+new_node_bs = bdrv_open(NULL, NULL, node_options, flags, errp);
+if (new_node_bs == NULL) {
+error_prepend(errp, "Could not create node: ");
+return NULL;
+}
+
+bdrv_drained_begin(bs);
+bdrv_replace_node(bs, new_node_bs, &local_err);
+bdrv_drained_end(bs);
+
+if (local_err) {
+bdrv_unref(new_node_bs);
+error_propagate(errp, local_err);
+return NULL;
+}
+
+return new_node_bs;
+}
+
 /*
  * Run consistency checks on an image
  *
-- 
2.25.4




Re: [PATCH v11 00/13] hw/block/nvme: Support Namespace Types and Zoned Namespace Command Set

2020-12-15 Thread Keith Busch
Hi Dmitry,

Looks good to me, thanks for sticking with it.

Reviewed-by: Keith Busch 



[PATCH] block: report errno when flock fcntl fails

2020-12-15 Thread David Edmondson
When a call to fcntl(2) for the purpose of manipulating file locks
fails, report the error returned by fcntl.

Signed-off-by: David Edmondson 
---
 block/file-posix.c | 20 ++--
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 9804681d5c..f866fc9742 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -836,7 +836,7 @@ static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
 if ((perm_lock_bits & bit) && !(locked_perm & bit)) {
 ret = qemu_lock_fd(fd, off, 1, false);
 if (ret) {
-error_setg(errp, "Failed to lock byte %d", off);
+error_setg_errno(errp, -ret, "Failed to lock byte %d", off);
 return ret;
 } else if (s) {
 s->locked_perm |= bit;
@@ -844,7 +844,7 @@ static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
 } else if (unlock && (locked_perm & bit) && !(perm_lock_bits & bit)) {
 ret = qemu_unlock_fd(fd, off, 1);
 if (ret) {
-error_setg(errp, "Failed to unlock byte %d", off);
+error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
 return ret;
 } else if (s) {
 s->locked_perm &= ~bit;
@@ -857,7 +857,7 @@ static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
 if ((shared_perm_lock_bits & bit) && !(locked_shared_perm & bit)) {
 ret = qemu_lock_fd(fd, off, 1, false);
 if (ret) {
-error_setg(errp, "Failed to lock byte %d", off);
+error_setg_errno(errp, -ret, "Failed to lock byte %d", off);
 return ret;
 } else if (s) {
 s->locked_shared_perm |= bit;
@@ -866,7 +866,7 @@ static int raw_apply_lock_bytes(BDRVRawState *s, int fd,
!(shared_perm_lock_bits & bit)) {
 ret = qemu_unlock_fd(fd, off, 1);
 if (ret) {
-error_setg(errp, "Failed to unlock byte %d", off);
+error_setg_errno(errp, -ret, "Failed to unlock byte %d", off);
 return ret;
 } else if (s) {
 s->locked_shared_perm &= ~bit;
@@ -890,9 +890,9 @@ static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
 ret = qemu_lock_fd_test(fd, off, 1, true);
 if (ret) {
 char *perm_name = bdrv_perm_names(p);
-error_setg(errp,
-   "Failed to get \"%s\" lock",
-   perm_name);
+error_setg_errno(errp, -ret,
+ "Failed to get \"%s\" lock",
+ perm_name);
 g_free(perm_name);
 return ret;
 }
@@ -905,9 +905,9 @@ static int raw_check_lock_bytes(int fd, uint64_t perm, uint64_t shared_perm,
 ret = qemu_lock_fd_test(fd, off, 1, true);
 if (ret) {
 char *perm_name = bdrv_perm_names(p);
-error_setg(errp,
-   "Failed to get shared \"%s\" lock",
-   perm_name);
+error_setg_errno(errp, -ret,
+ "Failed to get shared \"%s\" lock",
+ perm_name);
 g_free(perm_name);
 return ret;
 }
-- 
2.29.2




Re: [PATCH v2 09/36] block: return value from bdrv_replace_node()

2020-12-15 Thread Alberto Garcia
On Fri 27 Nov 2020 03:44:55 PM CET, Vladimir Sementsov-Ogievskiy wrote:
> Functions with errp argument are not recommened to be void-functions.
> Improve bdrv_replace_node().
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy 

Reviewed-by: Alberto Garcia 

Berto



Re: [PATCH v2 08/36] block: make bdrv_reopen_{prepare, commit, abort} private

2020-12-15 Thread Alberto Garcia
On Fri 27 Nov 2020 03:44:54 PM CET, Vladimir Sementsov-Ogievskiy via wrote:
> These functions are called only from bdrv_reopen_multiple() in block.c.
> No reason to publish them.
>
> Signed-off-by: Vladimir Sementsov-Ogievskiy 

Reviewed-by: Alberto Garcia 

Berto



Re: [PATCH v2 4/4] block: Close block exports in two steps

2020-12-15 Thread Sergio Lopez
On Tue, Dec 15, 2020 at 04:34:05PM +0100, Kevin Wolf wrote:
> Am 14.12.2020 um 18:05 hat Sergio Lopez geschrieben:
> > There's a cross-dependency between closing the block exports and
> > draining the block layer. The latter needs that we close all export's
> > client connections to ensure they won't queue more requests, but the
> > exports may have coroutines yielding in the block layer, which implies
> > they can't be fully closed until we drain it.
> 
> A coroutine that yielded must have some way to be reentered. So I guess
> the quesiton becomes why they aren't reentered until drain. We do
> process events:
> 
> AIO_WAIT_WHILE(NULL, blk_exp_has_type(type));
> 
> So in theory, anything that would finalise the block export closing
> should still execute.
> 
> What is the difference that drain makes compared to a simple
> AIO_WAIT_WHILE, so that coroutine are reentered during drain, but not
> during AIO_WAIT_WHILE?
> 
> This is an even more interesting question because the NBD server isn't a
> block node nor a BdrvChildClass implementation, so it shouldn't even
> notice a drain operation.

I agree in that this deserves a deeper analysis. I'm going to drop
this patch from the series, and will re-analyze the issue later.

Thanks,
Sergio.


signature.asc
Description: PGP signature


Re: [PATCH v2 2/4] block: Avoid processing BDS twice in bdrv_set_aio_context_ignore()

2020-12-15 Thread Sergio Lopez
On Tue, Dec 15, 2020 at 04:01:19PM +0100, Kevin Wolf wrote:
> Am 15.12.2020 um 14:15 hat Sergio Lopez geschrieben:
> > On Tue, Dec 15, 2020 at 01:12:33PM +0100, Kevin Wolf wrote:
> > > Am 14.12.2020 um 18:05 hat Sergio Lopez geschrieben:
> > > > While processing the parents of a BDS, one of the parents may process
> > > > the child that's doing the tail recursion, which leads to a BDS being
> > > > processed twice. This is especially problematic for the aio_notifiers,
> > > > as they might attempt to work on both the old and the new AIO
> > > > contexts.
> > > > 
> > > > To avoid this, add the BDS pointer to the ignore list, and check the
> > > > child BDS pointer while iterating over the children.
> > > > 
> > > > Signed-off-by: Sergio Lopez 
> > > 
> > > Ugh, so we get a mixed list of BdrvChild and BlockDriverState? :-/
> > 
> > I know, it's effective but quite ugly...
> > 
> > > What is the specific scenario where you saw this breaking? Did you have
> > > multiple BdrvChild connections between two nodes so that we would go to
> > > the parent node through one and then come back to the child node through
> > > the other?
> > 
> > I don't think this is a corner case. If the graph is walked top->down,
> > there's no problem since children are added to the ignore list before
> > getting processed, and siblings don't process each other. But, if the
> > graph is walked bottom->up, a BDS will start processing its parents
> > without adding itself to the ignore list, so there's nothing
> > preventing them from processing it again.
> 
> I don't understand. child is added to ignore before calling the parent
> callback on it, so how can we come back through the same BdrvChild?
> 
> QLIST_FOREACH(child, &bs->parents, next_parent) {
> if (g_slist_find(*ignore, child)) {
> continue;
> }
> assert(child->klass->set_aio_ctx);
> *ignore = g_slist_prepend(*ignore, child);
> child->klass->set_aio_ctx(child, new_context, ignore);
> }

Perhaps I'm missing something, but the way I understand it, that loop
is adding the BdrvChild pointer of each of its parents, but not the
BdrvChild pointer of the BDS that was passed as an argument to
b_s_a_c_i.

> You didn't dump the BdrvChild here. I think that would add some
> information on why we re-entered 0x555ee2fbf660. Maybe you can also add
> bs->drv->format_name for each node to make the scenario less abstract?

I've generated another trace with more data:

bs=0x565505e48030 (backup-top) enter
bs=0x565505e48030 (backup-top) processing children
bs=0x565505e48030 (backup-top) calling bsaci child=0x565505e42090 (child->bs=0x565505e5d420)
bs=0x565505e5d420 (qcow2) enter
bs=0x565505e5d420 (qcow2) processing children
bs=0x565505e5d420 (qcow2) calling bsaci child=0x565505e41ea0 (child->bs=0x565505e52060)
bs=0x565505e52060 (file) enter
bs=0x565505e52060 (file) processing children
bs=0x565505e52060 (file) processing parents
bs=0x565505e52060 (file) processing itself
bs=0x565505e5d420 (qcow2) processing parents
bs=0x565505e5d420 (qcow2) calling set_aio_ctx child=0x5655066a34d0
bs=0x565505fbf660 (qcow2) enter
bs=0x565505fbf660 (qcow2) processing children
bs=0x565505fbf660 (qcow2) calling bsaci child=0x565505e41d20 (child->bs=0x565506bc0c00)
bs=0x565506bc0c00 (file) enter
bs=0x565506bc0c00 (file) processing children
bs=0x565506bc0c00 (file) processing parents
bs=0x565506bc0c00 (file) processing itself
bs=0x565505fbf660 (qcow2) processing parents
bs=0x565505fbf660 (qcow2) calling set_aio_ctx child=0x565505fc7aa0
bs=0x565505fbf660 (qcow2) calling set_aio_ctx child=0x5655068b8510
bs=0x565505e48030 (backup-top) enter
bs=0x565505e48030 (backup-top) processing children
bs=0x565505e48030 (backup-top) calling bsaci child=0x565505e3c450 (child->bs=0x565505fbf660)
bs=0x565505fbf660 (qcow2) enter
bs=0x565505fbf660 (qcow2) processing children
bs=0x565505fbf660 (qcow2) processing parents
bs=0x565505fbf660 (qcow2) processing itself
bs=0x565505e48030 (backup-top) processing parents
bs=0x565505e48030 (backup-top) calling set_aio_ctx child=0x565505e402d0
bs=0x565505e48030 (backup-top) processing itself
bs=0x565505fbf660 (qcow2) processing itself


So it seems this is happening:

backup-top (5e48030) <-| (5)
   ||  |
   || (6) > qcow2 (5fbf660)
   |   ^|
   |   (3) || (4)
   |-> (1) qcow2 (5e5d420) -|-> file (6bc0c00)
   |
   |-> (2) file (5e52060)

backup-top (5e48030), the BDS that was passed as argument in the first
bdrv_set_aio_context_ignore() call, is re-entered when qcow2 (5fbf660)
is processing its parents, and the latter is also re-entered when the
first one starts processing its children again.

Sergio.


signature.asc
Description: PGP signature


Re: [PATCH] hw/block: m25p80: Fix fast read for SST flashes

2020-12-15 Thread Francisco Iglesias
Hello Bin,

On [2020 Dec 12] Sat 17:44:27, Bin Meng wrote:
> Hi Francisco,
> 
> On Sat, Dec 12, 2020 at 5:24 PM Francisco Iglesias
>  wrote:
> >
> > Hi bin,
> >
> > On [2020 Dec 12] Sat 16:16:59, Bin Meng wrote:
> > > Hi Francisco,
> > >
> > > On Sat, Dec 12, 2020 at 12:11 AM Francisco Iglesias
> > >  wrote:
> > > >
> > > > Hello Bin,
> > > >
> > > > On [2020 Dec 11] Fri 23:29:16, Bin Meng wrote:
> > > > > Hi Francisco,
> > > > >
> > > > > On Fri, Dec 11, 2020 at 11:16 PM Francisco Iglesias
> > > > >  wrote:
> > > > > >
> > > > > > Hello Bin,
> > > > > >
> > > > > > On [2020 Dec 11] Fri 14:07:21, Bin Meng wrote:
> > > > > > > Hi Francisco,
> > > > > > >
> > > > > > > On Fri, Dec 4, 2020 at 7:28 PM Francisco Iglesias
> > > > > > >  wrote:
> > > > > > > >
> > > > > > > > Hello Bin,
> > > > > > > >
> > > > > > > > On [2020 Dec 04] Fri 18:52:50, Bin Meng wrote:
> > > > > > > > > Hi Francisco,
> > > > > > > > >
> > > > > > > > > On Fri, Dec 4, 2020 at 6:46 PM Francisco Iglesias
> > > > > > > > >  wrote:
> > > > > > > > > >
> > > > > > > > > > Hello Bin,
> > > > > > > > > >
> > > > > > > > > > On [2020 Dec 04] Fri 15:52:12, Bin Meng wrote:
> > > > > > > > > > > Hi Francisco,
> > > > > > > > > > >
> > > > > > > > > > > On Thu, Dec 3, 2020 at 4:38 PM Francisco Iglesias
> > > > > > > > > > >  wrote:
> > > > > > > > > > > >
> > > > > > > > > > > > Hi Bin and Alistair,
> > > > > > > > > > > >
> > > > > > > > > > > > On [2020 Dec 02] Wed 11:40:11, Alistair Francis wrote:
> > > > > > > > > > > > > On Sun, Nov 29, 2020 at 6:55 PM Bin Meng 
> > > > > > > > > > > > >  wrote:
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > From: Bin Meng 
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > SST flashes require a dummy byte after the address 
> > > > > > > > > > > > > > bits.
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > Signed-off-by: Bin Meng 
> > > > > > > > > > > > >
> > > > > > > > > > > > > I couldn't find a datasheet that says this... But the 
> > > > > > > > > > > > > actual code
> > > > > > > > > > > > > change looks fine, so:
> > > > > > > > > > > > >
> > > > > > > > > > > > > Acked-by: Alistair Francis 
> > > > > > > > > > > > >
> > > > > > > > > > > > > Alistair
> > > > > > > > > > > > >
> > > > > > > > > > > > > > ---
> > > > > > > > > > > > > >
> > > > > > > > > > > > > >  hw/block/m25p80.c | 3 +++
> > > > > > > > > > > > > >  1 file changed, 3 insertions(+)
> > > > > > > > > > > > > >
> > > > > > > > > > > > > > diff --git a/hw/block/m25p80.c b/hw/block/m25p80.c
> > > > > > > > > > > > > > index 483925f..9b36762 100644
> > > > > > > > > > > > > > --- a/hw/block/m25p80.c
> > > > > > > > > > > > > > +++ b/hw/block/m25p80.c
> > > > > > > > > > > > > > @@ -825,6 +825,9 @@ static void 
> > > > > > > > > > > > > > decode_fast_read_cmd(Flash *s)
> > > > > > > > > > > > > >  s->needed_bytes = get_addr_length(s);
> > > > > > > > > > > > > >  switch (get_man(s)) {
> > > > > > > > > > > > > >  /* Dummy cycles - modeled with bytes writes 
> > > > > > > > > > > > > > instead of bits */
> > > > > > > > > > > > > > +case MAN_SST:
> > > > > > > > > > > > > > +s->needed_bytes += 1;
> > > > > > > > > > > >
> > > > > > > > > > > > 1 dummy clk cycle is modelled as 1 byte write (see the 
> > > > > > > > > > > > comment above), so 1
> > > > > > > > > > > > dummy byte (8 dummy clk cycles) will need +8 above.
> > > > > > > > > > >
> > > > > > > > > > > I think you were confused by the WINBOND codes. The 
> > > > > > > > > > > comments are
> > > > > > > > > > > correct. It is modeled with bytes instead of bits, so we 
> > > > > > > > > > > should +=1.
> > > > > > > > > >
> > > > > > > > > > What the comment says is (perhaps not superclear) that 1 
> > > > > > > > > > dummy clock cycle
> > > > > > > > > > is modeled as one 1 byte write into the flash (meaining 
> > > > > > > > > > that 8 byte writes
> > > > > > > > > > are needed for 1 dummy byte). Perhaps it is easier to 
> > > > > > > > > > understand
> > > > > > > > > > looking into how the controllers issue the command towards 
> > > > > > > > > > the flash model
> > > > > > > > > > (for example the xilinx_spips), the start of the FAST_READ 
> > > > > > > > > > cmd is issued
> > > > > > > > > > as writing the following into the flash: 1 byte (cmd), 3 
> > > > > > > > > > bytes (address),
> > > > > > > > > > 8 bytes (8 dummy cycles -> 1 dummy byte).
> > > > > > > > > >
> > > > > > > > >
> > > > > > > > > My interpretation of the comments are opposite: one cycle is 
> > > > > > > > > a bit,
> > > > > > > > > but we are not using bits, instead we are using bytes.
> > > > > > > >
> > > > > > > > Yes, the mentioning of 'bits' in the comment makes it not very 
> > > > > > > > clear at first read.
> > > > > > > > Maybe just bellow would have been better:
> > > > > > > >
> > > > > > > > /* Dummy clock cycles - modeled with bytes writes */
> > > > > > > >
> > > > > > > > >
> > > > > > > > > Testing shows that +=1 is the correct way with the 

Re: [PATCH v11 08/13] block/nvme: Make ZNS-related definitions

2020-12-15 Thread Stefan Hajnoczi
On Wed, Dec 09, 2020 at 07:37:11AM +0100, Klaus Jensen wrote:
> CC for Stefan (nvme block driver co-maintainer).
> 
> On Dec  9 05:04, Dmitry Fomichev wrote:
> > Define values and structures that are needed to support Zoned
> > Namespace Command Set (NVMe TP 4053).
> > 
> > Signed-off-by: Dmitry Fomichev 
> > ---
> >  include/block/nvme.h | 114 ++-
> >  1 file changed, 113 insertions(+), 1 deletion(-)
> > 
> > diff --git a/include/block/nvme.h b/include/block/nvme.h
> > index 29d826ab19..a9165402d6 100644
> > --- a/include/block/nvme.h
> > +++ b/include/block/nvme.h
> > @@ -489,6 +489,9 @@ enum NvmeIoCommands {
> >  NVME_CMD_COMPARE= 0x05,
> >  NVME_CMD_WRITE_ZEROES   = 0x08,
> >  NVME_CMD_DSM= 0x09,
> > +NVME_CMD_ZONE_MGMT_SEND = 0x79,
> > +NVME_CMD_ZONE_MGMT_RECV = 0x7a,
> > +NVME_CMD_ZONE_APPEND= 0x7d,
> >  };
> >  
> >  typedef struct QEMU_PACKED NvmeDeleteQ {
> > @@ -648,9 +651,13 @@ typedef struct QEMU_PACKED NvmeAerResult {
> >  uint8_t resv;
> >  } NvmeAerResult;
> >  
> > +typedef struct QEMU_PACKED NvmeZonedResult {
> > +uint64_t slba;
> > +} NvmeZonedResult;
> > +
> >  typedef struct QEMU_PACKED NvmeCqe {
> >  uint32_t result;
> > -uint32_t rsvd;
> > +uint32_t dw1;
> >  uint16_t sq_head;
> >  uint16_t sq_id;
> >  uint16_t cid;
> > @@ -679,6 +686,7 @@ enum NvmeStatusCodes {
> >  NVME_INVALID_USE_OF_CMB = 0x0012,
> >  NVME_INVALID_PRP_OFFSET = 0x0013,
> >  NVME_CMD_SET_CMB_REJECTED   = 0x002b,
> > +NVME_INVALID_CMD_SET= 0x002c,
> >  NVME_LBA_RANGE  = 0x0080,
> >  NVME_CAP_EXCEEDED   = 0x0081,
> >  NVME_NS_NOT_READY   = 0x0082,
> > @@ -703,6 +711,14 @@ enum NvmeStatusCodes {
> >  NVME_CONFLICTING_ATTRS  = 0x0180,
> >  NVME_INVALID_PROT_INFO  = 0x0181,
> >  NVME_WRITE_TO_RO= 0x0182,
> > +NVME_ZONE_BOUNDARY_ERROR= 0x01b8,
> > +NVME_ZONE_FULL  = 0x01b9,
> > +NVME_ZONE_READ_ONLY = 0x01ba,
> > +NVME_ZONE_OFFLINE   = 0x01bb,
> > +NVME_ZONE_INVALID_WRITE = 0x01bc,
> > +NVME_ZONE_TOO_MANY_ACTIVE   = 0x01bd,
> > +NVME_ZONE_TOO_MANY_OPEN = 0x01be,
> > +NVME_ZONE_INVAL_TRANSITION  = 0x01bf,
> >  NVME_WRITE_FAULT= 0x0280,
> >  NVME_UNRECOVERED_READ   = 0x0281,
> >  NVME_E2E_GUARD_ERROR= 0x0282,
> > @@ -888,6 +904,11 @@ typedef struct QEMU_PACKED NvmeIdCtrl {
> >  uint8_t vs[1024];
> >  } NvmeIdCtrl;
> >  
> > +typedef struct NvmeIdCtrlZoned {
> > +uint8_t zasl;
> > +uint8_t rsvd1[4095];
> > +} NvmeIdCtrlZoned;
> > +
> >  enum NvmeIdCtrlOacs {
> >  NVME_OACS_SECURITY  = 1 << 0,
> >  NVME_OACS_FORMAT= 1 << 1,
> > @@ -1016,6 +1037,12 @@ typedef struct QEMU_PACKED NvmeLBAF {
> >  uint8_t rp;
> >  } NvmeLBAF;
> >  
> > +typedef struct QEMU_PACKED NvmeLBAFE {
> > +uint64_t zsze;
> > +uint8_t zdes;
> > +uint8_t rsvd9[7];
> > +} NvmeLBAFE;
> > +
> >  #define NVME_NSID_BROADCAST 0xffffffff
> >  
> >  typedef struct QEMU_PACKED NvmeIdNs {
> > @@ -1075,10 +1102,24 @@ enum NvmeNsIdentifierType {
> >  
> >  enum NvmeCsi {
> >  NVME_CSI_NVM= 0x00,
> > +NVME_CSI_ZONED  = 0x02,
> >  };
> >  
> >  #define NVME_SET_CSI(vec, csi) (vec |= (uint8_t)(1 << (csi)))
> >  
> > +typedef struct QEMU_PACKED NvmeIdNsZoned {
> > +uint16_t zoc;
> > +uint16_t ozcs;
> > +uint32_t mar;
> > +uint32_t mor;
> > +uint32_t rrl;
> > +uint32_t frl;
> > +uint8_t rsvd20[2796];
> > +NvmeLBAFE   lbafe[16];
> > +uint8_t rsvd3072[768];
> > +uint8_t vs[256];
> > +} NvmeIdNsZoned;
> > +
> >  /*Deallocate Logical Block Features*/
> >  #define NVME_ID_NS_DLFEAT_GUARD_CRC(dlfeat)   ((dlfeat) & 0x10)
> >  #define NVME_ID_NS_DLFEAT_WRITE_ZEROES(dlfeat)((dlfeat) & 0x08)
> > @@ -1111,10 +1152,76 @@ enum NvmeIdNsDps {
> >  DPS_FIRST_EIGHT = 8,
> >  };
> >  
> > +enum NvmeZoneAttr {
> > +NVME_ZA_FINISHED_BY_CTLR = 1 << 0,
> > +NVME_ZA_FINISH_RECOMMENDED   = 1 << 1,
> > +NVME_ZA_RESET_RECOMMENDED= 1 << 2,
> > +NVME_ZA_ZD_EXT_VALID = 1 << 7,
> > +};
> > +
> > +typedef struct QEMU_PACKED NvmeZoneReportHeader {
> > +uint64_t nr_zones;
> > +uint8_t rsvd[56];
> > +} NvmeZoneReportHeader;
> > +
> > +enum NvmeZoneReceiveAction {
> > +NVME_ZONE_REPORT = 0,
> > +NVME_ZONE_REPORT_EXTENDED= 1,
> > +};
> > +
> > +enum NvmeZoneReportType {
> > +NVME_ZONE_REPORT_ALL = 0,
> > +NVME_ZONE_REPORT_EMPTY   = 1,
> > +NVME_ZONE_REPORT_IMPLICITLY_OPEN = 2,
> > +NVME_ZONE_REPORT_EXPLICITLY_OPEN = 3,
> > +NVME_ZONE_REPORT_CLOSED  = 4,
> > +NVME_ZONE_REPORT_FULL= 5,
> > +

Re: [PATCH v3 1/3] docs: generate qemu-storage-daemon-qmp-ref(7) man page

2020-12-15 Thread Kevin Wolf
Am 09.12.2020 um 11:38 hat Stefan Hajnoczi geschrieben:
> Although individual qemu-storage-daemon QMP commands are identical to
> QEMU QMP commands, qemu-storage-daemon only supports a subset of QEMU's
> QMP commands. Generate a manual page of just the commands supported by
> qemu-storage-daemon so that users know exactly what is available in
> qemu-storage-daemon.
> 
> Add an h1 heading in storage-daemon/qapi/qapi-schema.json so that
> block-core.json is at the h2 heading level.
> 
> Signed-off-by: Stefan Hajnoczi 
> ---
>  docs/interop/index.rst   |  1 +
>  docs/interop/qemu-storage-daemon-qmp-ref.rst | 13 +
>  storage-daemon/qapi/qapi-schema.json |  3 +++
>  docs/interop/conf.py |  2 ++
>  docs/meson.build |  1 +
>  5 files changed, 20 insertions(+)
>  create mode 100644 docs/interop/qemu-storage-daemon-qmp-ref.rst
> 
> diff --git a/docs/interop/index.rst b/docs/interop/index.rst
> index cd78d679d8..95d56495f6 100644
> --- a/docs/interop/index.rst
> +++ b/docs/interop/index.rst
> @@ -20,6 +20,7 @@ Contents:
> qemu-ga
> qemu-ga-ref
> qemu-qmp-ref
> +   qemu-storage-daemon-qmp-ref
> vhost-user
> vhost-user-gpu
> vhost-vdpa
> diff --git a/docs/interop/qemu-storage-daemon-qmp-ref.rst 
> b/docs/interop/qemu-storage-daemon-qmp-ref.rst
> new file mode 100644
> index 00..caf9dad23a
> --- /dev/null
> +++ b/docs/interop/qemu-storage-daemon-qmp-ref.rst
> @@ -0,0 +1,13 @@
> +QEMU Storage Daemon QMP Reference Manual
> +
> +========================================
> +..
> +   TODO: the old Texinfo manual used to note that this manual
> +   is GPL-v2-or-later. We should make that reader-visible
> +   both here and in our Sphinx manuals more generally.
> +
> +..
> +   TODO: display the QEMU version, both here and in our Sphinx manuals
> +   more generally.
> +
> +.. qapi-doc:: storage-daemon/qapi/qapi-schema.json

Did you intend to actually merge the TODO comments like this into master
or was this meant to be resolved before you send the series?

Kevin




Re: [PATCH] block/nfs: fix int overflow in nfs_client_open_qdict

2020-12-15 Thread Kevin Wolf
Am 10.12.2020 um 10:00 hat Stefano Garzarella geschrieben:
> On Wed, Dec 09, 2020 at 01:17:35PM +0100, Peter Lieven wrote:
> > nfs_client_open returns the file size in sectors. This effectively
> > makes it impossible to open files larger than 1TB.
> > 
> > Fixes: a1a42af422d46812f1f0cebe6b230c20409a3731
> > Cc: qemu-sta...@nongnu.org
> > Signed-off-by: Peter Lieven 
> > ---
> > block/nfs.c | 2 +-
> > 1 file changed, 1 insertion(+), 1 deletion(-)
> 
> The issue seems to pre-date the commit
> a1a42af422d46812f1f0cebe6b230c20409a3731, but of course that commit touched
> this code and this patch would not apply before it, so it seems okay to me:

I think it's commit c22a0345, which is the one right before a1a42af4.
I'll update the commit message accordingly.

> Reviewed-by: Stefano Garzarella 

Thanks, applied to the block branch.

Kevin




Re: [PATCH] iotests: Quote $cmd in _send_qemu_cmd

2020-12-15 Thread Max Reitz

On 15.12.20 16:49, Kevin Wolf wrote:

[...]


Ah, looks like you don't even need to quote it like cmd="$1"? Maybe I
should learn bash sometime...


No, normally you don’t, which is the reason why the "cmd=${$@: x:y}" 
line wasn’t quoted so far.  (Probably a bug in bash that was then fixed 
in 5.1.)



But yes, this is what I was thinking of.


OK, I’ll cook up a v2 then.

Max




Re: [PATCH v3] hw/block/nand: Decommission the NAND museum

2020-12-15 Thread Kevin Wolf
Am 14.12.2020 um 11:02 hat Peter Maydell geschrieben:
> On Mon, 14 Dec 2020 at 00:26, Philippe Mathieu-Daudé  wrote:
> >
> > This is the QEMU equivalent of this Linux commit (but 7 years later):
> > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=f7025a43a9da2
> >
> > The MTD subsystem has its own small museum of ancient NANDs
> > in a form of the CONFIG_MTD_NAND_MUSEUM_IDS configuration option.
> > The museum contains stone age NANDs with 256 bytes pages, as well
> > as iron age NANDs with 512 bytes per page and up to 8MiB page size.
> >
> > It is with great sorrow that I inform you that the museum is being
> > decommissioned. The MTD subsystem is out of budget for Kconfig
> > options and already has too many of them, and there is a general
> > kernel trend to simplify the configuration menu.
> >
> > We remove the stone age exhibits along with closing the museum,
> > but some of the iron age ones are transferred to the regular NAND
> > depot. Namely, only those which have unique device IDs are
> > transferred, and the ones which have conflicting device IDs are
> > removed.
> >
> > The machines using this device are:
> > - axis-dev88
> > - tosa (via tc6393xb_init)
> > - spitz based (akita, borzoi, terrier)
> >
> > Reviewed-by: Richard Henderson 
> > Signed-off-by: Philippe Mathieu-Daudé 
> > ---
> > v3: Do not manually convert tabs to space to avoid mistakes...
> 
> Reviewed-by: Peter Maydell 

Thanks, applied to the block branch.

Kevin




Re: [PATCH] iotests: Quote $cmd in _send_qemu_cmd

2020-12-15 Thread Kevin Wolf
Am 15.12.2020 um 13:14 hat Max Reitz geschrieben:
> On 15.12.20 12:43, Kevin Wolf wrote:
> > Am 14.12.2020 um 18:58 hat Max Reitz geschrieben:
> > > With bash 5.1, the output of the following script (which creates an
> > > array with a single element, then takes a single-element slice from that
> > > array, and echos the result) changes:
> > > 
> > >a=("double  space")
> > >a=${a[@]:0:1}
> > >echo "$a"
> > > 
> > > from "double space" to "double  space", i.e. all white space is
> > > preserved as-is.  This is probably what we actually want here (judging
> > > from the "...to accommodate pathnames with spaces" comment), but before
> > > 5.1, we have to quote the ${} slice to get the same behavior.
> > > 
> > > In any case, without quoting, the reference output of many iotests is
> > > different between bash 5.1 and pre-5.1, which is not very good.  We
> > > should fix it by quoting here, and then adjusting all the reference
> > > output of all iotests using _send_qemu_cmd accordingly.  (The only
> > > ones we do not need to change are those that do not have multiple
> > > consecutive whitespaces in their _send_qemu_cmd parameters.)
> > > 
> > > Signed-off-by: Max Reitz 
> > > ---
> > > Alternatively, we could explicitly collapse all whitespace sequences
> > > into single spaces, but I believe that would defeat the purpose of
> > > "accomodat[ing] pathnames with spaces".
> > 
> > I agree that quoting is the better alternative.
> > 
> The whole thing shouldn't be needed if callers properly quote
> > filenames. If I understand correctly, these lines just try to merge
> > back filenames that have been split into multiple words because the
> > caller wasn't careful. But we can't reliably do that because we don't
> > know which and how much whitespace was removed.
> 
> No, without quoting here, the slice operation will actually collapse spaces
> inside of array elements, as shown in the commit message
> (a=("x  y") creates an array with a single element that just has two spaces
> in it).  If the caller didn’t quote, then quoting in _send_qemu_cmd wouldn’t
> help either, because the whitespace would no longer be present in $@.

This is exactly what I mean. The whole slicing seems to be an attempt at
reconstructing an original unquoted string, which can't work because you
can't reconstruct the right whitespace.

So if it can't work anyway, we probably shouldn't even attempt to and
just use a single parameter instead of an array slice and require
callers to do proper quoting...
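
For illustration, a rough sketch of that simpler form (not a patch; it
assumes the QEMU_IN fd array and the _timed_wait_for helper keep their
current shape, and it drops the extra options _send_qemu_cmd supports):

    _send_qemu_cmd()
    {
        local handle=$1    # $QEMU_HANDLE
        local cmd=$2       # one properly quoted string from the caller
        local expected=$3  # string to wait for in the output

        echo "${cmd}" >&${QEMU_IN[$handle]}
        _timed_wait_for "$handle" "$expected"
    }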

> Most callers of _send_qemu_cmd actually do something like this:
> 
> _send_qemu_cmd \
> $QEMU_HANDLE \
> "{ 'execute': 'something',
>    'arguments': ... }" \
> 'return'
> 
> So they do quote all of their arguments, and actually only pass three (or
> four, with success_or_failure=y), so basically the slicing is generally
> unnecessary and we’d just need cmd=$1 (not $2, because _send_qemu_cmd likes
> to invoke “shift”).

...which they apparently already do.

> In fact, if you do that on top of this patch (cmd=$1 instead of the fancy
> slice), you’ll see that the only test that then doesn’t pass is 102.  I
> think that’s because it has this one _send_qemu_cmd invocation that forgot
> to pass $QEMU_HANDLE...?
> 
> So, alternatively to quoting the slice, we could also just set cmd=$1 and
> fix 102.

Ah, looks like you don't even need to quote it like cmd="$1"? Maybe I
should learn bash sometime...

But yes, this is what I was thinking of.

Kevin




Re: [PATCH v2 4/4] block: Close block exports in two steps

2020-12-15 Thread Kevin Wolf
Am 14.12.2020 um 18:05 hat Sergio Lopez geschrieben:
> There's a cross-dependency between closing the block exports and
> draining the block layer. The latter requires that we close all exports'
> client connections to ensure they won't queue more requests, but the
> exports may have coroutines yielding in the block layer, which implies
> they can't be fully closed until we drain it.

A coroutine that yielded must have some way to be reentered. So I guess
the question becomes why they aren't reentered until drain. We do
process events:

AIO_WAIT_WHILE(NULL, blk_exp_has_type(type));

So in theory, anything that would finalise the block export closing
should still execute.

What is the difference that drain makes compared to a simple
AIO_WAIT_WHILE, so that coroutines are reentered during drain, but not
during AIO_WAIT_WHILE?

This is an even more interesting question because the NBD server is neither a
block node nor a BdrvChildClass implementation, so it shouldn't even
notice a drain operation.

Kevin

> To break this cross-dependency, this change adds a "bool wait"
> argument to blk_exp_close_all() and blk_exp_close_all_type(), so
> callers can decide whether they want to wait for the exports to be
> fully quiesced, or just return after requesting them to shut down.
> 
> Then, in bdrv_close_all we make two calls, one without waiting to
> close all client connections, and another after draining the block
> layer, this time waiting for the exports to be fully quiesced.
> 
> RHBZ: https://bugzilla.redhat.com/show_bug.cgi?id=1900505
> Signed-off-by: Sergio Lopez 
> ---
>  block.c   | 20 +++-
>  block/export/export.c | 10 ++
>  blockdev-nbd.c|  2 +-
>  include/block/export.h|  4 ++--
>  qemu-nbd.c|  2 +-
>  stubs/blk-exp-close-all.c |  2 +-
>  6 files changed, 30 insertions(+), 10 deletions(-)
> 
> diff --git a/block.c b/block.c
> index bc8a66ab6e..41db70ac07 100644
> --- a/block.c
> +++ b/block.c
> @@ -4472,13 +4472,31 @@ static void bdrv_close(BlockDriverState *bs)
>  void bdrv_close_all(void)
>  {
>  assert(job_next(NULL) == NULL);
> -blk_exp_close_all();
> +
> +/*
> + * There's a cross-dependency between closing the block exports and
> + * draining the block layer. The latter requires that we close all exports'
> + * client connections to ensure they won't queue more requests, but the
> + * exports may have coroutines yielding in the block layer, which implies
> + * they can't be fully closed until we drain it.
> + *
> + * Make a first call to close all export's client connections, without
> + * waiting for each export to be fully quiesced.
> + */
> +blk_exp_close_all(false);
>  
>  /* Drop references from requests still in flight, such as canceled block
>   * jobs whose AIO context has not been polled yet */
>  bdrv_drain_all();
>  
>  blk_remove_all_bs();
> +
> +/*
> + * Make a second call to shut down the exports, this time waiting for 
> them
> + * to be fully quiesced.
> + */
> +blk_exp_close_all(true);
> +
>  blockdev_close_all_bdrv_states();
>  
> >  assert(QTAILQ_EMPTY(&all_bdrv_states));
> diff --git a/block/export/export.c b/block/export/export.c
> index bad6f21b1c..0124ebd9f9 100644
> --- a/block/export/export.c
> +++ b/block/export/export.c
> @@ -280,7 +280,7 @@ static bool blk_exp_has_type(BlockExportType type)
>  }
>  
>  /* type == BLOCK_EXPORT_TYPE__MAX for all types */
> -void blk_exp_close_all_type(BlockExportType type)
> +void blk_exp_close_all_type(BlockExportType type, bool wait)
>  {
>  BlockExport *exp, *next;
>  
> @@ -293,12 +293,14 @@ void blk_exp_close_all_type(BlockExportType type)
>  blk_exp_request_shutdown(exp);
>  }
>  
> -AIO_WAIT_WHILE(NULL, blk_exp_has_type(type));
> +if (wait) {
> +AIO_WAIT_WHILE(NULL, blk_exp_has_type(type));
> +}
>  }
>  
> -void blk_exp_close_all(void)
> +void blk_exp_close_all(bool wait)
>  {
> -blk_exp_close_all_type(BLOCK_EXPORT_TYPE__MAX);
> +blk_exp_close_all_type(BLOCK_EXPORT_TYPE__MAX, wait);
>  }
>  
>  void qmp_block_export_add(BlockExportOptions *export, Error **errp)
> diff --git a/blockdev-nbd.c b/blockdev-nbd.c
> index d8443d235b..d71d4da7c2 100644
> --- a/blockdev-nbd.c
> +++ b/blockdev-nbd.c
> @@ -266,7 +266,7 @@ void qmp_nbd_server_stop(Error **errp)
>  return;
>  }
>  
> -blk_exp_close_all_type(BLOCK_EXPORT_TYPE_NBD);
> +blk_exp_close_all_type(BLOCK_EXPORT_TYPE_NBD, true);
>  
>  nbd_server_free(nbd_server);
>  nbd_server = NULL;
> diff --git a/include/block/export.h b/include/block/export.h
> index 7feb02e10d..71c25928ce 100644
> --- a/include/block/export.h
> +++ b/include/block/export.h
> @@ -83,7 +83,7 @@ BlockExport *blk_exp_find(const char *id);
>  void blk_exp_ref(BlockExport *exp);
>  void blk_exp_unref(BlockExport *exp);
>  void blk_exp_request_shutdown(BlockExport *exp);
> -void 

Re: [PATCH v2 2/4] block: Avoid processing BDS twice in bdrv_set_aio_context_ignore()

2020-12-15 Thread Kevin Wolf
Am 15.12.2020 um 14:15 hat Sergio Lopez geschrieben:
> On Tue, Dec 15, 2020 at 01:12:33PM +0100, Kevin Wolf wrote:
> > Am 14.12.2020 um 18:05 hat Sergio Lopez geschrieben:
> > > While processing the parents of a BDS, one of the parents may process
> > > the child that's doing the tail recursion, which leads to a BDS being
> > > processed twice. This is especially problematic for the aio_notifiers,
> > > as they might attempt to work on both the old and the new AIO
> > > contexts.
> > > 
> > > To avoid this, add the BDS pointer to the ignore list, and check the
> > > child BDS pointer while iterating over the children.
> > > 
> > > Signed-off-by: Sergio Lopez 
> > 
> > Ugh, so we get a mixed list of BdrvChild and BlockDriverState? :-/
> 
> I know, it's effective but quite ugly...
> 
> > What is the specific scenario where you saw this breaking? Did you have
> > multiple BdrvChild connections between two nodes so that we would go to
> > the parent node through one and then come back to the child node through
> > the other?
> 
> I don't think this is a corner case. If the graph is walked top->down,
> there's no problem since children are added to the ignore list before
> getting processed, and siblings don't process each other. But, if the
> graph is walked bottom->up, a BDS will start processing its parents
> without adding itself to the ignore list, so there's nothing
> preventing them from processing it again.

I don't understand. child is added to ignore before calling the parent
callback on it, so how can we come back through the same BdrvChild?

QLIST_FOREACH(child, &bs->parents, next_parent) {
    if (g_slist_find(*ignore, child)) {
        continue;
    }
    assert(child->klass->set_aio_ctx);
    *ignore = g_slist_prepend(*ignore, child);
    child->klass->set_aio_ctx(child, new_context, ignore);
}

> I'm pasting here an annotated trace of bdrv_set_aio_context_ignore I
> generated while triggering the issue:
> 
> <- begin -->
> bdrv_set_aio_context_ignore: bs=0x555ee2e48030 enter
> bdrv_set_aio_context_ignore: bs=0x555ee2e48030 processing children
> bdrv_set_aio_context_ignore: bs=0x555ee2e5d420 enter
> bdrv_set_aio_context_ignore: bs=0x555ee2e5d420 processing children
> bdrv_set_aio_context_ignore: bs=0x555ee2e52060 enter
> bdrv_set_aio_context_ignore: bs=0x555ee2e52060 processing children
> bdrv_set_aio_context_ignore: bs=0x555ee2e52060 processing parents
> bdrv_set_aio_context_ignore: bs=0x555ee2e52060 processing itself
> bdrv_set_aio_context_ignore: bs=0x555ee2e5d420 processing parents
> 
>  - We enter b_s_a_c_i with BDS 2fbf660 the first time:
>  
> bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 enter
> bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 processing children
> 
>  - We enter b_s_a_c_i with BDS 3bc0c00, a child of 2fbf660:
>  
> bdrv_set_aio_context_ignore: bs=0x555ee3bc0c00 enter
> bdrv_set_aio_context_ignore: bs=0x555ee3bc0c00 processing children
> bdrv_set_aio_context_ignore: bs=0x555ee3bc0c00 processing parents

> 
>  - We start processing its parents:
>  
> bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 processing parents
> 
>  - We enter b_s_a_c_i with BDS 2e48030, a parent of 2fbf660:
>  
> bdrv_set_aio_context_ignore: bs=0x555ee2e48030 enter
> bdrv_set_aio_context_ignore: bs=0x555ee2e48030 processing children
> 
>  - We enter b_s_a_c_i with BDS 2fbf660 again, because parent
>    2e48030 didn't find it in the ignore list:
>
> bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 enter
> bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 processing children
> bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 processing parents
> bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 processing itself
> bdrv_set_aio_context_ignore: bs=0x555ee2e48030 processing parents
> bdrv_set_aio_context_ignore: bs=0x555ee2e48030 processing itself
> 
>  - BDS 2fbf660 will be processed here a second time, triggering the
>issue:
>
> bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 processing itself
> <- end -->

You didn't dump the BdrvChild here. I think that would add some
information on why we re-entered 0x555ee2fbf660. Maybe you can also add
bs->drv->format_name for each node to make the scenario less abstract?

So far my reconstruction of the graph is something like this:

0x555ee2e48030 --+
   |  |  |
   |  |  +-> 0x555ee2e5d420 -> 0x555ee2e52060
   v  v  |
0x555ee2fbf660 --+
   |
   +---> 0x555ee3bc0c00

It doesn't look quite trivial, but if 0x555ee2e48030 is the filter node
of a block job, it's not hard to imagine either.

> I suspect this has been happening for a while, and has only surfaced
> now due to the need to run an AIO context BH in an aio_notifier
> function that the "nbd/server: Quiesce coroutines on context switch"
> patch introduces. There the problem is that the first time the
> aio_notifier AIO detach function is called, it works on the old
> context (as it should be), and the second one 

Re: [PATCH v4 23/32] qdev: Move dev->realized check to qdev_property_set()

2020-12-15 Thread Igor Mammedov
On Mon, 14 Dec 2020 12:24:18 -0500
Eduardo Habkost  wrote:

> On Mon, Dec 14, 2020 at 03:55:30PM +0100, Igor Mammedov wrote:
> > On Fri, 11 Dec 2020 17:05:20 -0500
> > Eduardo Habkost  wrote:
> >   
> > > Every single qdev property setter function manually checks
> > > dev->realized.  We can just check dev->realized inside
> > > qdev_property_set() instead.
> > > 
> > > The check is being added as a separate function
> > > (qdev_prop_allow_set()) because it will become a callback later.  
> > 
> > is callback added within this series?
> > and I'd add here what's the purpose of it.  
> 
> It will be added in part 2 of the series.  See v3:
> https://lore.kernel.org/qemu-devel/20201112214350.872250-35-ehabk...@redhat.com/
> 
> I don't know what else I could say about its purpose, in addition
> to what I wrote above, and the comment below[1].
> 
> If you are just curious about the callback and confused because
> it is not anywhere in this series, I can just remove the
> paragraph above from the commit message.  Would that be enough?
Let's remove it for now to avoid confusion.

Reviewed-by: Igor Mammedov 

> 
> >   
> [...]
> > > +/* returns: true if property is allowed to be set, false otherwise */  
> 
> [1] ^^^
> 
> > > +static bool qdev_prop_allow_set(Object *obj, const char *name,
> > > +Error **errp)
> > > +{
> > > +DeviceState *dev = DEVICE(obj);
> > > +
> > > +if (dev->realized) {
> > > +qdev_prop_set_after_realize(dev, name, errp);
> > > +return false;
> > > +}
> > > +return true;
> > > +}
> > > +  
> 




Re: [PATCH v5 0/4] hw/block/m25p80: Numonyx: Fix dummy cycles and check for SPI mode on cmds

2020-12-15 Thread Peter Maydell
On Mon, 16 Nov 2020 at 23:13, Joe Komlodi  wrote:
>
> Changelog:
> v4 -> v5
>  - 3/4: Simplify logic when changing state and checking mode.
>  - 3/4: numonyx_get_mode -> numonyx_mode
>  - 4/4: Reword commit message to include QIO mode.
>
> v3 -> v4
>  - 1/4: Patch changed to change names of register fields to be more accurate.
>  - 1/4: Revert polarity change from v3.
>  - 2/4: Added, fixes polarity of VCFG XIP mode when copied from NVCFG.
>  - 3/4: Removed check_cmd_mode function, each command check is done in 
> decode_new_cmd instead.
>  - 3/4: Add guest error print if JEDEC read is executed in QIO or DIO mode.
>  - 3/4: Don't check PP and PP4, they work regardless of mode. PP4_4 is left 
> as is.
>  - 3/4: Simplify get_mode function.
>  - 4/4: Simplify extract_cfg_num_dummies function.
>  - 4/4: Use switch statement instead of table for cycle retrieving.
>
> v2 -> v3
>  - 1/3: Added, Fixes NVCFG polarity for DIO/QIO.
>  - 2/3: Added, Checks if we can execute the current command in 
> standard/DIO/QIO mode.
>  - 3/3: Was 1/1 in v2.  Added cycle counts for DIO/QIO mode.
>
> v1 -> v2
>  - 1/2: Change function name to be more accurate
>  - 2/2: Dropped
>
> Hi all,
>
> The series fixes the behavior of the dummy cycle register for Numonyx flashes 
> so
> it's closer to how hardware behaves.
> It also checks if a command can be executed in the current SPI mode
> (standard, DIO, or QIO) before extracting dummy cycles for the command.
>
> On hardware, the dummy cycles for fast read commands are set to a specific 
> value
> (8 or 10) if the register is all 0s or 1s.
> If the register value isn't all 0s or 1s, then the flash expects the amount of
> cycles sent to be equal to the count in the register.
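
To make the quoted rule concrete, here is a stand-alone sketch (only an
illustration; the 4-bit field width and the default of 10 cycles are
assumptions, not taken from the series):

    #include <stdint.h>

    static uint8_t numonyx_dummy_cycles(uint8_t dummy_field)
    {
        /* All 0s or all 1s: the flash falls back to a fixed default. */
        if (dummy_field == 0x0 || dummy_field == 0xf) {
            return 10; /* assumed default for fast read */
        }
        /* Otherwise the register value is the exact cycle count expected. */
        return dummy_field;
    }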



Applied to target-arm.next, thanks.

-- PMM



Re: [PATCH v2 2/4] block: Avoid processing BDS twice in bdrv_set_aio_context_ignore()

2020-12-15 Thread Sergio Lopez
On Tue, Dec 15, 2020 at 01:12:33PM +0100, Kevin Wolf wrote:
> Am 14.12.2020 um 18:05 hat Sergio Lopez geschrieben:
> > While processing the parents of a BDS, one of the parents may process
> > the child that's doing the tail recursion, which leads to a BDS being
> > processed twice. This is especially problematic for the aio_notifiers,
> > as they might attempt to work on both the old and the new AIO
> > contexts.
> > 
> > To avoid this, add the BDS pointer to the ignore list, and check the
> > child BDS pointer while iterating over the children.
> > 
> > Signed-off-by: Sergio Lopez 
> 
> Ugh, so we get a mixed list of BdrvChild and BlockDriverState? :-/

I know, it's effective but quite ugly...

> What is the specific scenario where you saw this breaking? Did you have
> multiple BdrvChild connections between two nodes so that we would go to
> the parent node through one and then come back to the child node through
> the other?

I don't think this is a corner case. If the graph is walked top->down,
there's no problem since children are added to the ignore list before
getting processed, and siblings don't process each other. But, if the
graph is walked bottom->up, a BDS will start processing its parents
without adding itself to the ignore list, so there's nothing
preventing them from processing it again.

I'm pasting here an annotated trace of bdrv_set_aio_context_ignore I
generated while triggering the issue:

<- begin -->
bdrv_set_aio_context_ignore: bs=0x555ee2e48030 enter
bdrv_set_aio_context_ignore: bs=0x555ee2e48030 processing children
bdrv_set_aio_context_ignore: bs=0x555ee2e5d420 enter
bdrv_set_aio_context_ignore: bs=0x555ee2e5d420 processing children
bdrv_set_aio_context_ignore: bs=0x555ee2e52060 enter
bdrv_set_aio_context_ignore: bs=0x555ee2e52060 processing children
bdrv_set_aio_context_ignore: bs=0x555ee2e52060 processing parents
bdrv_set_aio_context_ignore: bs=0x555ee2e52060 processing itself
bdrv_set_aio_context_ignore: bs=0x555ee2e5d420 processing parents

 - We enter b_s_a_c_i with BDS 2fbf660 the first time:
 
bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 enter
bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 processing children

 - We enter b_s_a_c_i with BDS 3bc0c00, a child of 2fbf660:
 
bdrv_set_aio_context_ignore: bs=0x555ee3bc0c00 enter
bdrv_set_aio_context_ignore: bs=0x555ee3bc0c00 processing children
bdrv_set_aio_context_ignore: bs=0x555ee3bc0c00 processing parents
bdrv_set_aio_context_ignore: bs=0x555ee3bc0c00 processing itself

 - We start processing its parents:
 
bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 processing parents

 - We enter b_s_a_c_i with BDS 2e48030, a parent of 2fbf660:
 
bdrv_set_aio_context_ignore: bs=0x555ee2e48030 enter
bdrv_set_aio_context_ignore: bs=0x555ee2e48030 processing children

 - We enter b_s_a_c_i with BDS 2fbf660 again, because parent
   2e48030 didn't find it in the ignore list:
   
bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 enter
bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 processing children
bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 processing parents
bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 processing itself
bdrv_set_aio_context_ignore: bs=0x555ee2e48030 processing parents
bdrv_set_aio_context_ignore: bs=0x555ee2e48030 processing itself

 - BDS 2fbf660 will be processed here a second time, triggering the
   issue:
   
bdrv_set_aio_context_ignore: bs=0x555ee2fbf660 processing itself
<- end -->

I suspect this has been happening for a while, and has only surfaced
now due to the need to run an AIO context BH in an aio_notifier
function that the "nbd/server: Quiesce coroutines on context switch"
patch introduces. There the problem is that the first time the
aio_notifier AIO detach function is called, it works on the old
context (as it should be), and the second one works on the new context
(which is wrong).

> Maybe if what we really need to do is not processing every edge once,
> but processing every node once, the list should be changed to contain
> _only_ BDS objects. But then blk_do_set_aio_context() probably won't
> work any more because it can't have blk->root ignored any more...

I tried that in my first attempt and it broke badly. I didn't take a
deeper look at the causes.

> Anyway, if we end up changing what the list contains, the comment needs
> an update, too. Currently it says:
> 
>  * @ignore will accumulate all visited BdrvChild object. The caller is
>  * responsible for freeing the list afterwards.
> 
> Another option: Split the parents QLIST_FOREACH loop in two. First add
> all parent BdrvChild objects to the ignore list, remember which of them
> were newly added, and only after adding all of them call
> child->klass->set_aio_ctx() for each parent that was previously not on
> the ignore list. This will avoid that we come back to the same node
> because all of its incoming edges are ignored now.

I don't think this strategy will fix the issue illustrated in the
trace above, as the 

Re: [PATCH v12 0/7] Introduce 'yank' oob qmp command to recover from hanging qemu

2020-12-15 Thread Markus Armbruster
Lukas Straub  writes:

> Hello Everyone,
> So here is v12.
> @Marc-André Lureau, We still need an ACK for the chardev patch.

Marc-André?  Would be good to get this wrapped before Christmas.




[PATCH v4 4/7] block-backend: Enable retry action on errors

2020-12-15 Thread Jiahui Cen
Enable the retry action when the backend's retry timer is available. It
triggers the timer to perform the device-specific retry action.

Signed-off-by: Jiahui Cen 
Signed-off-by: Ying Fang 
---
 block/block-backend.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index bca7c581ee..9c6e50e568 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1803,6 +1803,9 @@ BlockErrorAction blk_get_error_action(BlockBackend *blk, 
bool is_read,
 return BLOCK_ERROR_ACTION_REPORT;
 case BLOCKDEV_ON_ERROR_IGNORE:
 return BLOCK_ERROR_ACTION_IGNORE;
+case BLOCKDEV_ON_ERROR_RETRY:
+return (blk->retry_timer) ?
+   BLOCK_ERROR_ACTION_RETRY : BLOCK_ERROR_ACTION_REPORT;
 case BLOCKDEV_ON_ERROR_AUTO:
 default:
 abort();
@@ -1850,6 +1853,10 @@ void blk_error_action(BlockBackend *blk, 
BlockErrorAction action,
 qemu_system_vmstop_request_prepare();
 send_qmp_error_event(blk, action, is_read, error);
 qemu_system_vmstop_request(RUN_STATE_IO_ERROR);
+} else if (action == BLOCK_ERROR_ACTION_RETRY) {
+timer_mod(blk->retry_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) +
+blk->retry_interval);
+send_qmp_error_event(blk, action, is_read, error);
 } else {
 send_qmp_error_event(blk, action, is_read, error);
 }
-- 
2.28.0




[PATCH v4 6/7] block: Add error retry param setting

2020-12-15 Thread Jiahui Cen
Add "retry_interval" and "retry_timeout" parameter for drive and device
option. These parameter are valid only when werror/rerror=retry.

eg. --drive file=image,rerror=retry,retry_interval=1000,retry_timeout=5000

Signed-off-by: Jiahui Cen 
Signed-off-by: Ying Fang 
---
 block/block-backend.c  | 13 +++--
 blockdev.c | 50 
 hw/block/block.c   | 10 
 include/hw/block/block.h   |  7 ++-
 include/sysemu/block-backend.h |  5 ++
 5 files changed, 81 insertions(+), 4 deletions(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index f5386fabb9..230b1c65b5 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -35,9 +35,6 @@
 
 static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb);
 
-/* block backend default retry interval */
-#define BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL   1000
-
 typedef struct BlockBackendAioNotifier {
 void (*attached_aio_context)(AioContext *new_context, void *opaque);
 void (*detach_aio_context)(void *opaque);
@@ -1776,6 +1773,16 @@ void blk_drain_all(void)
 bdrv_drain_all_end();
 }
 
+void blk_set_on_error_retry_interval(BlockBackend *blk, int64_t interval)
+{
+blk->retry_interval = interval;
+}
+
+void blk_set_on_error_retry_timeout(BlockBackend *blk, int64_t timeout)
+{
+blk->retry_timeout = timeout;
+}
+
 static bool blk_error_retry_timeout(BlockBackend *blk)
 {
 /* No timeout set, infinite retries. */
diff --git a/blockdev.c b/blockdev.c
index 47c0e6db52..39c0669981 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -489,6 +489,7 @@ static BlockBackend *blockdev_init(const char *file, QDict 
*bs_opts,
 const char *buf;
 int bdrv_flags = 0;
 int on_read_error, on_write_error;
+int64_t retry_interval, retry_timeout;
 bool account_invalid, account_failed;
 bool writethrough, read_only;
 BlockBackend *blk;
@@ -581,6 +582,10 @@ static BlockBackend *blockdev_init(const char *file, QDict 
*bs_opts,
 }
 }
 
+retry_interval = qemu_opt_get_number(opts, "retry_interval",
+ BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL);
+retry_timeout = qemu_opt_get_number(opts, "retry_timeout", 0);
+
 if (snapshot) {
 bdrv_flags |= BDRV_O_SNAPSHOT;
 }
@@ -645,6 +650,11 @@ static BlockBackend *blockdev_init(const char *file, QDict 
*bs_opts,
 
 blk_set_enable_write_cache(blk, !writethrough);
 blk_set_on_error(blk, on_read_error, on_write_error);
+if (on_read_error == BLOCKDEV_ON_ERROR_RETRY ||
+on_write_error == BLOCKDEV_ON_ERROR_RETRY) {
+blk_set_on_error_retry_interval(blk, retry_interval);
+blk_set_on_error_retry_timeout(blk, retry_timeout);
+}
 
 if (!monitor_add_blk(blk, id, errp)) {
 blk_unref(blk);
@@ -771,6 +781,14 @@ QemuOptsList qemu_legacy_drive_opts = {
 .name = "werror",
 .type = QEMU_OPT_STRING,
 .help = "write error action",
+},{
+.name = "retry_interval",
+.type = QEMU_OPT_NUMBER,
+.help = "interval for retry action in millisecond",
+},{
+.name = "retry_timeout",
+.type = QEMU_OPT_NUMBER,
+.help = "timeout for retry action in millisecond",
 },{
 .name = "copy-on-read",
 .type = QEMU_OPT_BOOL,
@@ -793,6 +811,7 @@ DriveInfo *drive_new(QemuOpts *all_opts, BlockInterfaceType 
block_default_type,
 BlockInterfaceType type;
 int max_devs, bus_id, unit_id, index;
 const char *werror, *rerror;
+int64_t retry_interval, retry_timeout;
 bool read_only = false;
 bool copy_on_read;
 const char *filename;
@@ -1004,6 +1023,29 @@ DriveInfo *drive_new(QemuOpts *all_opts, 
BlockInterfaceType block_default_type,
 qdict_put_str(bs_opts, "rerror", rerror);
 }
 
+if (qemu_opt_find(legacy_opts, "retry_interval")) {
+if ((werror == NULL || strcmp(werror, "retry")) &&
+(rerror == NULL || strcmp(rerror, "retry"))) {
+error_setg(errp, "retry_interval is only supported "
+ "by werror/rerror=retry");
+goto fail;
+}
+retry_interval = qemu_opt_get_number(legacy_opts, "retry_interval",
+ BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL);
+qdict_put_int(bs_opts, "retry_interval", retry_interval);
+}
+
+if (qemu_opt_find(legacy_opts, "retry_timeout")) {
+if ((werror == NULL || strcmp(werror, "retry")) &&
+(rerror == NULL || strcmp(rerror, "retry"))) {
+error_setg(errp, "retry_timeout is only supported "
+ "by werror/rerror=retry");
+goto fail;
+}
+retry_timeout = qemu_opt_get_number(legacy_opts, "retry_timeout", 0);
+qdict_put_int(bs_opts, "retry_timeout", retry_timeout);
+}
+
 /* Actual block device init: Functionality shared with blockdev-add */
 blk = 

[PATCH v4 5/7] block-backend: Add timeout support for retry

2020-12-15 Thread Jiahui Cen
Retry should only be triggered when the timeout has not been reached, so
check the timeout before retrying. The device should also reset
retry_start_time after a successful retry.

Signed-off-by: Jiahui Cen 
Signed-off-by: Ying Fang 
---
 block/block-backend.c  | 25 +++-
 include/sysemu/block-backend.h |  1 +
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 9c6e50e568..f5386fabb9 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -1776,6 +1776,29 @@ void blk_drain_all(void)
 bdrv_drain_all_end();
 }
 
+static bool blk_error_retry_timeout(BlockBackend *blk)
+{
+/* No timeout set, infinite retries. */
+if (!blk->retry_timeout) {
+return false;
+}
+
+/* The first time an error occurs. */
+if (!blk->retry_start_time) {
+blk->retry_start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+return false;
+}
+
+return qemu_clock_get_ms(QEMU_CLOCK_REALTIME) > (blk->retry_start_time +
+ blk->retry_timeout);
+}
+
+void blk_error_retry_reset_timeout(BlockBackend *blk)
+{
+if (blk->retry_timer && blk->retry_start_time)
+blk->retry_start_time = 0;
+}
+
 void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
   BlockdevOnError on_write_error)
 {
@@ -1804,7 +1827,7 @@ BlockErrorAction blk_get_error_action(BlockBackend *blk, 
bool is_read,
 case BLOCKDEV_ON_ERROR_IGNORE:
 return BLOCK_ERROR_ACTION_IGNORE;
 case BLOCKDEV_ON_ERROR_RETRY:
-return (blk->retry_timer) ?
+return (blk->retry_timer && !blk_error_retry_timeout(blk)) ?
BLOCK_ERROR_ACTION_RETRY : BLOCK_ERROR_ACTION_REPORT;
 case BLOCKDEV_ON_ERROR_AUTO:
 default:
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index b31144aca9..070eb7786c 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -188,6 +188,7 @@ void blk_inc_in_flight(BlockBackend *blk);
 void blk_dec_in_flight(BlockBackend *blk);
 void blk_drain(BlockBackend *blk);
 void blk_drain_all(void);
+void blk_error_retry_reset_timeout(BlockBackend *blk);
 void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error,
   BlockdevOnError on_write_error);
 BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read);
-- 
2.28.0




[PATCH v4 0/7] block: Add retry for werror=/rerror= mechanism

2020-12-15 Thread Jiahui Cen
A VM in a cloud environment may use a virtual disk as its backend storage,
and there are usually filesystems on the virtual block device. When the
backend storage is temporarily down, any I/O issued to the virtual block
device will cause an error. For example, an error in an ext4 filesystem
would make the filesystem read-only. In a production environment, cloud
backend storage can usually be recovered quickly. For example, an IP-SAN
may go down due to a network failure and come back online soon after the
network is recovered. However, the error in the filesystem may not be
recoverable without a device reattach or a system restart. Thus an I/O
retry mechanism is needed to implement a self-healing system.

This patch series proposes extending the werror=/rerror= mechanism with a
'retry' feature. It can automatically retry failed I/O requests without
reporting the error back to the guest, so the guest gets back to running
smoothly once I/O is recovered.
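
As a usage sketch (the option spellings come from patch 6 of this series;
the image name and values are only examples):

    -drive file=disk.img,rerror=retry,werror=retry,retry_interval=1000,retry_timeout=5000

With these values a failed request is retried every second, and the error
is only reported to the guest once the backend has stayed broken for more
than five seconds.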

v3->v4:
* Adapt to werror=/rerror= mechanism.

v2->v3:
* Add a doc to describe I/O hang.

v1->v2:
* Rebase to fix compile problems.
* Fix incorrect remove of rehandle list.
* Provide rehandle pause interface.

REF: https://lists.gnu.org/archive/html/qemu-devel/2020-10/msg06560.html

Signed-off-by: Jiahui Cen 
Signed-off-by: Ying Fang 

Jiahui Cen (7):
  qapi/block-core: Add retry option for error action
  block-backend: Introduce retry timer
  block-backend: Add device specific retry callback
  block-backend: Enable retry action on errors
  block-backend: Add timeout support for retry
  block: Add error retry param setting
  virtio_blk: Add support for retry on errors

 block/block-backend.c  | 66 
 blockdev.c | 52 +++
 hw/block/block.c   | 10 +++
 hw/block/virtio-blk.c  | 19 +-
 include/hw/block/block.h   |  7 ++-
 include/sysemu/block-backend.h | 10 +++
 qapi/block-core.json   |  4 +-
 7 files changed, 162 insertions(+), 6 deletions(-)

-- 
2.28.0




[PATCH v4 2/7] block-backend: Introduce retry timer

2020-12-15 Thread Jiahui Cen
Add a timer to regularly trigger retry on errors.

Signed-off-by: Jiahui Cen 
Signed-off-by: Ying Fang 
---
 block/block-backend.c | 21 
 1 file changed, 21 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index ce78d30794..fe775ea298 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -35,6 +35,9 @@
 
 static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb);
 
+/* block backend default retry interval */
+#define BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL   1000
+
 typedef struct BlockBackendAioNotifier {
 void (*attached_aio_context)(AioContext *new_context, void *opaque);
 void (*detach_aio_context)(void *opaque);
@@ -95,6 +98,15 @@ struct BlockBackend {
  * Accessed with atomic ops.
  */
 unsigned int in_flight;
+
+/* Timer for retry on errors. */
+QEMUTimer *retry_timer;
+/* Interval in ms to trigger next retry. */
+int64_t retry_interval;
+/* Start time of the first error. Used to check timeout. */
+int64_t retry_start_time;
+/* Retry timeout. 0 represents infinite retry. */
+int64_t retry_timeout;
 };
 
 typedef struct BlockBackendAIOCB {
@@ -345,6 +357,11 @@ BlockBackend *blk_new(AioContext *ctx, uint64_t perm, 
uint64_t shared_perm)
 blk->on_read_error = BLOCKDEV_ON_ERROR_REPORT;
 blk->on_write_error = BLOCKDEV_ON_ERROR_ENOSPC;
 
+blk->retry_timer = NULL;
+blk->retry_interval = BLOCK_BACKEND_DEFAULT_RETRY_INTERVAL;
+blk->retry_start_time = 0;
+blk->retry_timeout = 0;
+
 block_acct_init(&blk->stats);
 
 qemu_co_queue_init(&blk->queued_requests);
@@ -456,6 +473,10 @@ static void blk_delete(BlockBackend *blk)
 QTAILQ_REMOVE(&block_backends, blk, link);
 drive_info_del(blk->legacy_dinfo);
 block_acct_cleanup(&blk->stats);
+if (blk->retry_timer) {
+timer_del(blk->retry_timer);
+timer_free(blk->retry_timer);
+}
 g_free(blk);
 }
 
-- 
2.28.0




[PATCH v4 3/7] block-backend: Add device specific retry callback

2020-12-15 Thread Jiahui Cen
Add retry_request_cb to BlockDevOps to perform the device-specific retry
action. The backend's timer is registered only when the backend is set to
'retry' on errors and the device supports the retry action.

Signed-off-by: Jiahui Cen 
Signed-off-by: Ying Fang 
---
 block/block-backend.c  | 8 
 include/sysemu/block-backend.h | 4 
 2 files changed, 12 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index fe775ea298..bca7c581ee 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -995,6 +995,14 @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps 
*ops,
 blk->dev_ops = ops;
 blk->dev_opaque = opaque;
 
+if ((blk->on_read_error == BLOCKDEV_ON_ERROR_RETRY ||
+ blk->on_write_error == BLOCKDEV_ON_ERROR_RETRY) &&
+ops->retry_request_cb) {
+blk->retry_timer = aio_timer_new(blk->ctx, QEMU_CLOCK_REALTIME,
+ SCALE_MS, ops->retry_request_cb,
+ opaque);
+}
+
 /* Are we currently quiesced? Should we enforce this right now? */
 if (blk->quiesce_counter && ops->drained_begin) {
 ops->drained_begin(opaque);
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 8203d7f6f9..b31144aca9 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -66,6 +66,10 @@ typedef struct BlockDevOps {
  * Runs when the backend's last drain request ends.
  */
 void (*drained_end)(void *opaque);
+/*
+ * Runs when retrying failed requests.
+ */
+void (*retry_request_cb)(void *opaque);
 } BlockDevOps;
 
 /* This struct is embedded in (the private) BlockBackend struct and contains
-- 
2.28.0




[PATCH v4 7/7] virtio_blk: Add support for retry on errors

2020-12-15 Thread Jiahui Cen
Insert failed requests into the device's list for later retry, and handle
the queued requests to implement retry_request_cb.

Signed-off-by: Jiahui Cen 
Signed-off-by: Ying Fang 
---
 hw/block/virtio-blk.c | 19 ---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index bac2d6fa2b..cf8b350eaf 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -108,6 +108,10 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, 
int error,
 block_acct_failed(blk_get_stats(s->blk), &req->acct);
 }
 virtio_blk_free_request(req);
+} else if (action == BLOCK_ERROR_ACTION_RETRY) {
+req->mr_next = NULL;
+req->next = s->rq;
+s->rq = req;
 }
 
 blk_error_action(s->blk, action, is_read, error);
@@ -149,6 +153,7 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
 }
 }
 
+blk_error_retry_reset_timeout(s->blk);
 virtio_blk_req_complete(req, VIRTIO_BLK_S_OK);
 block_acct_done(blk_get_stats(s->blk), &req->acct);
 virtio_blk_free_request(req);
@@ -828,12 +833,12 @@ static void virtio_blk_handle_output(VirtIODevice *vdev, 
VirtQueue *vq)
 
 void virtio_blk_process_queued_requests(VirtIOBlock *s, bool is_bh)
 {
-VirtIOBlockReq *req = s->rq;
+VirtIOBlockReq *req;
 MultiReqBuffer mrb = {};
 
-s->rq = NULL;
-
 aio_context_acquire(blk_get_aio_context(s->conf.conf.blk));
+req = s->rq;
+s->rq = NULL;
 while (req) {
 VirtIOBlockReq *next = req->next;
 if (virtio_blk_handle_request(req, &mrb)) {
@@ -1134,8 +1139,16 @@ static void virtio_blk_resize(void *opaque)
 aio_bh_schedule_oneshot(qemu_get_aio_context(), virtio_resize_cb, vdev);
 }
 
+static void virtio_blk_retry_request(void *opaque)
+{
+VirtIOBlock *s = VIRTIO_BLK(opaque);
+
+virtio_blk_process_queued_requests(s, false);
+}
+
 static const BlockDevOps virtio_block_ops = {
 .resize_cb = virtio_blk_resize,
+.retry_request_cb = virtio_blk_retry_request,
 };
 
 static void virtio_blk_device_realize(DeviceState *dev, Error **errp)
-- 
2.28.0




[PATCH v4 1/7] qapi/block-core: Add retry option for error action

2020-12-15 Thread Jiahui Cen
Add a new error action 'retry' to support retry on errors.

Signed-off-by: Jiahui Cen 
Signed-off-by: Ying Fang 
---
 blockdev.c   | 2 ++
 qapi/block-core.json | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 412354b4b6..47c0e6db52 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -342,6 +342,8 @@ static int parse_block_error_action(const char *buf, bool 
is_read, Error **errp)
 return BLOCKDEV_ON_ERROR_STOP;
 } else if (!strcmp(buf, "report")) {
 return BLOCKDEV_ON_ERROR_REPORT;
+} else if (!strcmp(buf, "retry")) {
+return BLOCKDEV_ON_ERROR_RETRY;
 } else {
 error_setg(errp, "'%s' invalid %s error action",
buf, is_read ? "read" : "write");
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 04c5196e59..ef5492bcdf 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1146,7 +1146,7 @@
 # Since: 1.3
 ##
 { 'enum': 'BlockdevOnError',
-  'data': ['report', 'ignore', 'enospc', 'stop', 'auto'] }
+  'data': ['report', 'ignore', 'enospc', 'stop', 'auto', 'retry'] }
 
 ##
 # @MirrorSyncMode:
@@ -4770,7 +4770,7 @@
 # Since: 2.1
 ##
 { 'enum': 'BlockErrorAction',
-  'data': [ 'ignore', 'report', 'stop' ] }
+  'data': [ 'ignore', 'report', 'stop', 'retry' ] }
 
 
 ##
-- 
2.28.0




Re: [PATCH] iotests: Quote $cmd in _send_qemu_cmd

2020-12-15 Thread Max Reitz

On 15.12.20 12:43, Kevin Wolf wrote:

Am 14.12.2020 um 18:58 hat Max Reitz geschrieben:

With bash 5.1, the output of the following script (which creates an
array with a single element, then takes a single-element slice from that
array, and echos the result) changes:

   a=("double  space")
   a=${a[@]:0:1}
   echo "$a"

from "double space" to "double  space", i.e. all white space is
preserved as-is.  This is probably what we actually want here (judging
from the "...to accommodate pathnames with spaces" comment), but before
5.1, we have to quote the ${} slice to get the same behavior.

In any case, without quoting, the reference output of many iotests is
different between bash 5.1 and pre-5.1, which is not very good.  We
should fix it by quoting here, and then adjusting all the reference
output of all iotests using _send_qemu_cmd accordingly.  (The only
ones we do not need to change are those that do not have multiple
consecutive whitespaces in their _send_qemu_cmd parameters.)

Signed-off-by: Max Reitz 
---
Alternatively, we could explicitly collapse all whitespace sequences
into single spaces, but I believe that would defeat the purpose of
"accomodat[ing] pathnames with spaces".


I agree that quoting is the better alternative.

The whole thing shouldn't be needed if callers properly quote
filenames. If I understand correctly, these lines just try to merge
back filenames that have been split into multiple words because the
caller wasn't careful. But we can't reliably do that because we don't
know which and how much whitespace was removed.


No, without quoting here, the slice operation will actually collapse 
spaces inside of array elements, as shown in the commit message
(a=("x  y") creates an array with a single element that just has two 
spaces in it).  If the caller didn’t quote, then quoting in 
_send_qemu_cmd wouldn’t help either, because the whitespace would no 
longer be present in $@.
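
(As a quick stand-alone check of that behaviour, outside the patch:

    a=("double  space")
    unquoted=${a[@]:0:1}
    quoted="${a[@]:0:1}"
    printf '<%s> <%s>\n' "$unquoted" "$quoted"

prints "<double space> <double  space>" before bash 5.1, while 5.1 keeps
the double space in both.)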


Most callers of _send_qemu_cmd actually do something like this:

_send_qemu_cmd \
$QEMU_HANDLE \
"{ 'execute': 'something',
   'arguments': ... }" \
'return'

So they do quote all of their arguments, and actually only pass three 
(or four, with success_or_failure=y), so basically the slicing is 
generally unnecessary and we’d just need cmd=$1 (not $2, because 
_send_qemu_cmd likes to invoke “shift”).


In fact, if you do that on top of this patch (cmd=$1 instead of the 
fancy slice), you’ll see that the only test that then doesn’t pass is 
102.  I think that’s because it has this one _send_qemu_cmd invocation 
that forgot to pass $QEMU_HANDLE...?


So, alternatively to quoting the slice, we could also just set cmd=$1 
and fix 102.



I used this script to verify that the git-diff only changes whitespace
(except for the hunk in common.qemu):

   https://gist.github.com/XanClic/41cfcc2ac4619421883e8a97790f5c9e


'git diff --word-diff' is good enough for me.


Learn a new thing every day.


This seems to be based on your branch (312.out doesn't exist yet for
me), so instead of merging, you can just have my:

Reviewed-by: Kevin Wolf 


In any case (because perhaps we just want cmd=$1, I don’t know), thanks. :)

Max




Re: [PATCH v2 2/4] block: Avoid processing BDS twice in bdrv_set_aio_context_ignore()

2020-12-15 Thread Kevin Wolf
Am 14.12.2020 um 18:05 hat Sergio Lopez geschrieben:
> While processing the parents of a BDS, one of the parents may process
> the child that's doing the tail recursion, which leads to a BDS being
> processed twice. This is especially problematic for the aio_notifiers,
> as they might attempt to work on both the old and the new AIO
> contexts.
> 
> To avoid this, add the BDS pointer to the ignore list, and check the
> child BDS pointer while iterating over the children.
> 
> Signed-off-by: Sergio Lopez 

Ugh, so we get a mixed list of BdrvChild and BlockDriverState? :-/

What is the specific scenario where you saw this breaking? Did you have
multiple BdrvChild connections between two nodes so that we would go to
the parent node through one and then come back to the child node through
the other?

Maybe if what we really need to do is not processing every edge once,
but processing every node once, the list should be changed to contain
_only_ BDS objects. But then blk_do_set_aio_context() probably won't
work any more because it can't have blk->root ignored any more...
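
(For illustration, a rough sketch of what such a node-based variant could look
like; it glosses over the blk->root problem just mentioned and keeps the
existing function name only for orientation:)

    void bdrv_set_aio_context_ignore(BlockDriverState *bs,
                                     AioContext *new_context, GSList **ignore)
    {
        BdrvChild *child;

        /* *ignore would now track visited nodes, so each BDS is processed once */
        if (g_slist_find(*ignore, bs)) {
            return;
        }
        *ignore = g_slist_prepend(*ignore, bs);

        QLIST_FOREACH(child, &bs->children, next) {
            bdrv_set_aio_context_ignore(child->bs, new_context, ignore);
        }
        QLIST_FOREACH(child, &bs->parents, next_parent) {
            child->klass->set_aio_ctx(child, new_context, ignore);
        }

        /* ... drained section and the actual context switch of bs ... */
    }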

Anyway, if we end up changing what the list contains, the comment needs
an update, too. Currently it says:

 * @ignore will accumulate all visited BdrvChild object. The caller is
 * responsible for freeing the list afterwards.

Another option: Split the parents QLIST_FOREACH loop in two. First add
all parent BdrvChild objects to the ignore list, remember which of them
were newly added, and only after adding all of them call
child->klass->set_aio_ctx() for each parent that was previously not on
the ignore list. This will avoid that we come back to the same node
because all of its incoming edges are ignored now.
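
(A rough sketch of that second option, for illustration only; the "added" list
and its handling are improvised here, not taken from any posted patch:)

    GSList *added = NULL;

    /* Pass one: mark every incoming edge of bs as ignored before notifying
     * any parent */
    QLIST_FOREACH(child, &bs->parents, next_parent) {
        if (g_slist_find(*ignore, child)) {
            continue;
        }
        *ignore = g_slist_prepend(*ignore, child);
        added = g_slist_prepend(added, child);
    }

    /* Pass two: only now let the newly added parents switch their context;
     * any path that leads back to bs finds all of its incoming edges already
     * on the ignore list */
    QLIST_FOREACH(child, &bs->parents, next_parent) {
        if (g_slist_find(added, child)) {
            assert(child->klass->set_aio_ctx);
            child->klass->set_aio_ctx(child, new_context, ignore);
        }
    }

    g_slist_free(added);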

Kevin

>  block.c | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)
> 
> diff --git a/block.c b/block.c
> index f1cedac362..bc8a66ab6e 100644
> --- a/block.c
> +++ b/block.c
> @@ -6465,12 +6465,17 @@ void bdrv_set_aio_context_ignore(BlockDriverState *bs,
>  bdrv_drained_begin(bs);
>  
>  QLIST_FOREACH(child, &bs->children, next) {
> -if (g_slist_find(*ignore, child)) {
> +if (g_slist_find(*ignore, child) || g_slist_find(*ignore, child->bs)) {
>  continue;
>  }
>  *ignore = g_slist_prepend(*ignore, child);
>  bdrv_set_aio_context_ignore(child->bs, new_context, ignore);
>  }
> +/*
> + * Add a reference to this BS to the ignore list, so its
> + * parents won't attempt to process it again.
> + */
> +*ignore = g_slist_prepend(*ignore, bs);
>  QLIST_FOREACH(child, &bs->parents, next_parent) {
>  if (g_slist_find(*ignore, child)) {
>  continue;
> -- 
> 2.26.2
> 




Re: [PATCH v2 1/4] block: Honor blk_set_aio_context() context requirements

2020-12-15 Thread Kevin Wolf
Am 14.12.2020 um 18:05 hat Sergio Lopez geschrieben:
> The documentation for bdrv_set_aio_context_ignore() states this:
> 
>  * The caller must own the AioContext lock for the old AioContext of bs, but it
>  * must not own the AioContext lock for new_context (unless new_context is the
>  * same as the current context of bs).
> 
> As blk_set_aio_context() makes use of this function, this rule also
> applies to it.
> 
> Fix all occurrences where this rule wasn't honored.
> 
> Suggested-by: Kevin Wolf 
> Signed-off-by: Sergio Lopez 

Reviewed-by: Kevin Wolf 




Re: [PATCH] iotests: Quote $cmd in _send_qemu_cmd

2020-12-15 Thread Kevin Wolf
Am 14.12.2020 um 18:58 hat Max Reitz geschrieben:
> With bash 5.1, the output of the following script (which creates an
> array with a single element, then takes a single-element slice from that
> array, and echoes the result) changes:
> 
>   a=("double  space")
>   a=${a[@]:0:1}
>   echo "$a"
> 
> from "double space" to "double  space", i.e. all white space is
> preserved as-is.  This is probably what we actually want here (judging
> from the "...to accommodate pathnames with spaces" comment), but before
> 5.1, we have to quote the ${} slice to get the same behavior.
> 
> In any case, without quoting, the reference output of many iotests is
> different between bash 5.1 and pre-5.1, which is not very good.  We
> should fix it by quoting here, and then adjusting all the reference
> output of all iotests using _send_qemu_cmd accordingly.  (The only
> ones we do not need to change are those that do not have multiple
> consecutive whitespaces in their _send_qemu_cmd parameters.)
> 
> Signed-off-by: Max Reitz 
> ---
> Alternatively, we could explicitly collapse all whitespace sequences
> into single spaces, but I believe that would defeat the purpose of
> "accomodat[ing] pathnames with spaces".

I agree that quoting is the better alternative.

The whole thing shouldn't be needed if callers properly quote
filenames. If I understand correctly, these lines just try to merge
back filenames that have been split into multiple words because the
caller wasn't careful. But we can't reliably do that because we don't
know which and how much whitespace was removed.

> I used this script to verify that the git-diff only changes whitespace
> (except for the hunk in common.qemu):
> 
>   https://gist.github.com/XanClic/41cfcc2ac4619421883e8a97790f5c9e

'git diff --word-diff' is good enough for me.

This seems to be based on your branch (312.out doesn't exist yet for
me), so instead of merging, you can just have my:

Reviewed-by: Kevin Wolf 




Re: [PATCH] iotests/210: Fix reference output

2020-12-15 Thread Kevin Wolf
Am 14.12.2020 um 18:51 hat Max Reitz geschrieben:
> Commit 8b1170012b1 has added a global maximum disk length for the block
> layer, so the error message when creating an overly large disk has
> changed.
> 
> Fixes: 8b1170012b1de6649c66ac1887f4df7e312abf3b
>("block: introduce BDRV_MAX_LENGTH")
> Signed-off-by: Max Reitz 

Thanks, applied to the block branch.

Kevin




Re: [PATCH v5] file-posix: detect the lock using the real file

2020-12-15 Thread Daniel P . Berrangé
On Tue, Dec 15, 2020 at 06:53:56PM +0800, Li Feng wrote:
> This patch addresses the following issue:
> When accessing a volume on an NFS filesystem that does not support file
> locking, tools like qemu-img complain "Failed to lock byte 100".
>
> In the original code, qemu_has_ofd_lock tests the lock on the "/dev/null"
> pseudo-file. However, file.locking is a per-drive property, which depends
> on the underlying filesystem.
>
> In this patch, add a new 'qemu_has_file_lock' to detect whether the file
> itself supports file locking, and disable the lock when the underlying
> filesystem doesn't support it.
> 
> Signed-off-by: Li Feng 
> ---
> v5: simplify the code.
> v4: use the fd as the qemu_has_file_lock argument.
> v3: don't call the qemu_has_ofd_lock, use a new function instead.
> v2: remove the refactoring.
> ---
>  block/file-posix.c   | 61 +++-
>  include/qemu/osdep.h |  1 +
>  util/osdep.c | 14 ++
>  3 files changed, 47 insertions(+), 29 deletions(-)

Reviewed-by: Daniel P. Berrangé 


Regards,
Daniel
-- 
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|




[PATCH v5] file-posix: detect the lock using the real file

2020-12-15 Thread Li Feng
This patch addresses the following issue:
When accessing a volume on an NFS filesystem that does not support file
locking, tools like qemu-img complain "Failed to lock byte 100".

In the original code, qemu_has_ofd_lock tests the lock on the "/dev/null"
pseudo-file. However, file.locking is a per-drive property, which depends
on the underlying filesystem.

In this patch, add a new 'qemu_has_file_lock' to detect whether the file
itself supports file locking, and disable the lock when the underlying
filesystem doesn't support it.

Signed-off-by: Li Feng 
---
v5: simplify the code.
v4: use the fd as the qemu_has_file_lock argument.
v3: don't call the qemu_has_ofd_lock, use a new function instead.
v2: remove the refactoring.
---
 block/file-posix.c   | 61 +++-
 include/qemu/osdep.h |  1 +
 util/osdep.c | 14 ++
 3 files changed, 47 insertions(+), 29 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 806764f7e3..4e00111031 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -584,6 +584,21 @@ static int raw_open_common(BlockDriverState *bs, QDict 
*options,
 s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
 #endif
 
+s->open_flags = open_flags;
+raw_parse_flags(bdrv_flags, &s->open_flags, false);
+
+s->fd = -1;
+fd = qemu_open(filename, s->open_flags, errp);
+ret = fd < 0 ? -errno : 0;
+
+if (ret < 0) {
+if (ret == -EROFS) {
+ret = -EACCES;
+}
+goto fail;
+}
+s->fd = fd;
+
 locking = qapi_enum_parse(&OnOffAuto_lookup,
           qemu_opt_get(opts, "locking"),
           ON_OFF_AUTO_AUTO, &local_err);
@@ -606,7 +621,7 @@ static int raw_open_common(BlockDriverState *bs, QDict 
*options,
 s->use_lock = false;
 break;
 case ON_OFF_AUTO_AUTO:
-s->use_lock = qemu_has_ofd_lock();
+s->use_lock = qemu_has_file_lock(s->fd) && qemu_has_ofd_lock();
 break;
 default:
 abort();
@@ -625,22 +640,6 @@ static int raw_open_common(BlockDriverState *bs, QDict 
*options,
 s->drop_cache = qemu_opt_get_bool(opts, "drop-cache", true);
 s->check_cache_dropped = qemu_opt_get_bool(opts, "x-check-cache-dropped",
false);
-
-s->open_flags = open_flags;
-raw_parse_flags(bdrv_flags, &s->open_flags, false);
-
-s->fd = -1;
-fd = qemu_open(filename, s->open_flags, errp);
-ret = fd < 0 ? -errno : 0;
-
-if (ret < 0) {
-if (ret == -EROFS) {
-ret = -EACCES;
-}
-goto fail;
-}
-s->fd = fd;
-
 /* Check s->open_flags rather than bdrv_flags due to auto-read-only */
 if (s->open_flags & O_RDWR) {
 ret = check_hdev_writable(s->fd);
@@ -2388,6 +2387,7 @@ raw_co_create(BlockdevCreateOptions *options, Error 
**errp)
 int fd;
 uint64_t perm, shared;
 int result = 0;
+bool use_lock;
 
 /* Validate options and set default values */
 assert(options->driver == BLOCKDEV_DRIVER_FILE);
@@ -2428,19 +2428,22 @@ raw_co_create(BlockdevCreateOptions *options, Error 
**errp)
 perm = BLK_PERM_WRITE | BLK_PERM_RESIZE;
 shared = BLK_PERM_ALL & ~BLK_PERM_RESIZE;
 
-/* Step one: Take locks */
-result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
-if (result < 0) {
-goto out_close;
-}
+use_lock = qemu_has_file_lock(fd);
+if (use_lock) {
+/* Step one: Take locks */
+result = raw_apply_lock_bytes(NULL, fd, perm, ~shared, false, errp);
+if (result < 0) {
+goto out_close;
+}
 
-/* Step two: Check that nobody else has taken conflicting locks */
-result = raw_check_lock_bytes(fd, perm, shared, errp);
-if (result < 0) {
-error_append_hint(errp,
-  "Is another process using the image [%s]?\n",
-  file_opts->filename);
-goto out_unlock;
+/* Step two: Check that nobody else has taken conflicting locks */
+result = raw_check_lock_bytes(fd, perm, shared, errp);
+if (result < 0) {
+error_append_hint(errp,
+  "Is another process using the image [%s]?\n",
+  file_opts->filename);
+goto out_unlock;
+}
 }
 
 /* Clear the file by truncating it to 0 */
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index f9ec8c84e9..c7587be99d 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -513,6 +513,7 @@ int qemu_lock_fd(int fd, int64_t start, int64_t len, bool 
exclusive);
 int qemu_unlock_fd(int fd, int64_t start, int64_t len);
 int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive);
 bool qemu_has_ofd_lock(void);
+bool qemu_has_file_lock(int fd);
 #endif
 
 #if defined(__HAIKU__) && defined(__i386__)
diff --git a/util/osdep.c b/util/osdep.c
index 
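
(The util/osdep.c hunk is truncated above. Going by the v4 version quoted
further below and Daniel's request to use F_GETLK unconditionally, the v5
helper presumably looks roughly like this:)

    bool qemu_has_file_lock(int fd)
    {
        int ret;
        struct flock fl = {
            .l_whence = SEEK_SET,
            .l_start  = 0,
            .l_len    = 0,
            .l_type   = F_WRLCK,
        };

        /* Probe with F_GETLK only: this takes no lock, it merely asks whether
         * the filesystem backing fd can answer lock requests at all */
        ret = fcntl(fd, F_GETLK, &fl);
        return ret == 0;
    }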

Re: [PATCH v4] file-posix: detect the lock using the real file

2020-12-15 Thread Li Feng
Daniel P. Berrangé  wrote on Tue, Dec 15, 2020 at 6:08 PM:
>
> On Tue, Dec 15, 2020 at 03:09:28PM +0800, Li Feng wrote:
> > This patch addresses the following issue:
> > When accessing a volume on an NFS filesystem that does not support file
> > locking, tools like qemu-img complain "Failed to lock byte 100".
> >
> > In the original code, qemu_has_ofd_lock tests the lock on the "/dev/null"
> > pseudo-file. However, file.locking is a per-drive property, which depends
> > on the underlying filesystem.
> >
> > In this patch, add a new 'qemu_has_file_lock' to detect whether the file
> > itself supports file locking, and disable the lock when the underlying
> > filesystem doesn't support it.
> >
> > Signed-off-by: Li Feng 
> > ---
> > v4: use the fd as the qemu_has_file_lock argument.
> > v3: don't call the qemu_has_ofd_lock, use a new function instead.
> > v2: remove the refactoring.
> > ---
> >  block/file-posix.c   | 66 +---
> >  include/qemu/osdep.h |  1 +
> >  util/osdep.c | 19 +
> >  3 files changed, 58 insertions(+), 28 deletions(-)
> >
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 806764f7e3..9708212f01 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -584,6 +584,21 @@ static int raw_open_common(BlockDriverState *bs, QDict 
> > *options,
> >  s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
> >  #endif
> >
> > +s->open_flags = open_flags;
> > +raw_parse_flags(bdrv_flags, &s->open_flags, false);
> > +
> > +s->fd = -1;
> > +fd = qemu_open(filename, s->open_flags, errp);
> > +ret = fd < 0 ? -errno : 0;
> > +
> > +if (ret < 0) {
> > +if (ret == -EROFS) {
> > +ret = -EACCES;
> > +}
> > +goto fail;
> > +}
> > +s->fd = fd;
> > +
> >  locking = qapi_enum_parse(&OnOffAuto_lookup,
> >            qemu_opt_get(opts, "locking"),
> >            ON_OFF_AUTO_AUTO, &local_err);
> > @@ -607,6 +622,13 @@ static int raw_open_common(BlockDriverState *bs, QDict 
> > *options,
> >  break;
> >  case ON_OFF_AUTO_AUTO:
> >  s->use_lock = qemu_has_ofd_lock();
> > +if (s->use_lock && !qemu_has_file_lock(s->fd)) {
> > +/*
> > + * When the os supports the OFD lock, but the filesystem 
> > doesn't
> > + * support, just disable the file lock.
> > + */
> > +s->use_lock = false;
> > +}
>
> This should just be
>
>   s->use_lock = qemu_has_file_lock(s->fd) && qemu_has_ofd_lock();
>
Acked.

>
>
> > diff --git a/util/osdep.c b/util/osdep.c
> > index 66d01b9160..07de97e2c5 100644
> > --- a/util/osdep.c
> > +++ b/util/osdep.c
> > @@ -225,6 +225,25 @@ static void qemu_probe_lock_ops(void)
> >  }
> >  }
> >
> > +bool qemu_has_file_lock(int fd)
> > +{
> > +#ifdef F_OFD_SETLK
> > +int cmd = F_OFD_GETLK;
> > +#else
> > +int cmd = F_GETLK;
> > +#endif
>
> This should unconditionally use  F_GETLK.
Acked.

>
> > +int ret;
> > +struct flock fl = {
> > +.l_whence = SEEK_SET,
> > +.l_start  = 0,
> > +.l_len= 0,
> > +.l_type   = F_WRLCK,
> > +};
> > +
> > +ret = fcntl(fd, cmd, &fl);
> > +return ret == 0;
> > +}
>
> Regards,
> Daniel
> --
> |: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
> |: https://libvirt.org -o- https://fstop138.berrange.com :|
> |: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|
>



Re: [PATCH v4] file-posix: detect the lock using the real file

2020-12-15 Thread Daniel P . Berrangé
On Tue, Dec 15, 2020 at 03:09:28PM +0800, Li Feng wrote:
> This patch addresses the following issue:
> When accessing a volume on an NFS filesystem that does not support file
> locking, tools like qemu-img complain "Failed to lock byte 100".
>
> In the original code, qemu_has_ofd_lock tests the lock on the "/dev/null"
> pseudo-file. However, file.locking is a per-drive property, which depends
> on the underlying filesystem.
>
> In this patch, add a new 'qemu_has_file_lock' to detect whether the file
> itself supports file locking, and disable the lock when the underlying
> filesystem doesn't support it.
> 
> Signed-off-by: Li Feng 
> ---
> v4: use the fd as the qemu_has_file_lock argument.
> v3: don't call the qemu_has_ofd_lock, use a new function instead.
> v2: remove the refactoring.
> ---
>  block/file-posix.c   | 66 +---
>  include/qemu/osdep.h |  1 +
>  util/osdep.c | 19 +
>  3 files changed, 58 insertions(+), 28 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 806764f7e3..9708212f01 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -584,6 +584,21 @@ static int raw_open_common(BlockDriverState *bs, QDict 
> *options,
>  s->use_linux_io_uring = (aio == BLOCKDEV_AIO_OPTIONS_IO_URING);
>  #endif
>  
> +s->open_flags = open_flags;
> +raw_parse_flags(bdrv_flags, &s->open_flags, false);
> +
> +s->fd = -1;
> +fd = qemu_open(filename, s->open_flags, errp);
> +ret = fd < 0 ? -errno : 0;
> +
> +if (ret < 0) {
> +if (ret == -EROFS) {
> +ret = -EACCES;
> +}
> +goto fail;
> +}
> +s->fd = fd;
> +
>  locking = qapi_enum_parse(&OnOffAuto_lookup,
>            qemu_opt_get(opts, "locking"),
>            ON_OFF_AUTO_AUTO, &local_err);
> @@ -607,6 +622,13 @@ static int raw_open_common(BlockDriverState *bs, QDict 
> *options,
>  break;
>  case ON_OFF_AUTO_AUTO:
>  s->use_lock = qemu_has_ofd_lock();
> +if (s->use_lock && !qemu_has_file_lock(s->fd)) {
> +/*
> + * When the os supports the OFD lock, but the filesystem doesn't
> + * support, just disable the file lock.
> + */
> +s->use_lock = false;
> +}

This should just be

  s->use_lock = qemu_has_file_lock(s->fd) && qemu_has_ofd_lock();



> diff --git a/util/osdep.c b/util/osdep.c
> index 66d01b9160..07de97e2c5 100644
> --- a/util/osdep.c
> +++ b/util/osdep.c
> @@ -225,6 +225,25 @@ static void qemu_probe_lock_ops(void)
>  }
>  }
>  
> +bool qemu_has_file_lock(int fd)
> +{
> +#ifdef F_OFD_SETLK
> +int cmd = F_OFD_GETLK;
> +#else
> +int cmd = F_GETLK;
> +#endif

This should unconditionally use  F_GETLK.

> +int ret;
> +struct flock fl = {
> +.l_whence = SEEK_SET,
> +.l_start  = 0,
> +.l_len= 0,
> +.l_type   = F_WRLCK,
> +};
> +
> +ret = fcntl(fd, cmd, &fl);
> +return ret == 0;
> +}

Regards,
Daniel
-- 
|: https://berrange.com -o- https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o- https://fstop138.berrange.com :|
|: https://entangle-photo.org -o- https://www.instagram.com/dberrange :|




Re: [PATCH] iotests/210: Fix reference output

2020-12-15 Thread Vladimir Sementsov-Ogievskiy

14.12.2020 20:51, Max Reitz wrote:

Commit 8b1170012b1 has added a global maximum disk length for the block
layer, so the error message when creating an overly large disk has
changed.

Fixes: 8b1170012b1de6649c66ac1887f4df7e312abf3b
("block: introduce BDRV_MAX_LENGTH")
Signed-off-by: Max Reitz


Reviewed-by: Vladimir Sementsov-Ogievskiy 

--
Best regards,
Vladimir