[Qemu-block] [PATCH v2 0/3] block: More complete inactivate/invalidate of graph nodes

2016-05-10 Thread Fam Zheng
v2: Do inactivation in two passes. [Kevin]

For now we only consider the bs->file chain, which is incomplete: for
example, if a qcow2 node is a quorum child, we don't properly invalidate or
inactivate it.  This series recurses into the whole subtree in both
bdrv_invalidate_cache_all and bdrv_inactivate_all. This is also necessary to
make image locking cooperate with migration.


Fam Zheng (3):
  block: Invalidate all children
  block: Drop superfluous invalidating bs->file from drivers
  block: Inactivate all children

 block.c        | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++-----------------
 block/qcow2.c  |  7 -------
 block/qed.c    |  6 ------
 block/quorum.c | 16 ----------------
 4 files changed, 50 insertions(+), 46 deletions(-)

-- 
2.8.2




[Qemu-block] [PATCH v2 2/3] block: Drop superfluous invalidating bs->file from drivers

2016-05-10 Thread Fam Zheng
Now they are invalidated by the block layer, so it's not necessary to
do this in block drivers' implementations of .bdrv_invalidate_cache.

Signed-off-by: Fam Zheng 
---
 block/qcow2.c  |  7 -------
 block/qed.c    |  6 ------
 block/quorum.c | 16 ----------------
 3 files changed, 29 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 470734b..d4431e0 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1757,13 +1757,6 @@ static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp)
 
     qcow2_close(bs);
 
-    bdrv_invalidate_cache(bs->file->bs, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        bs->drv = NULL;
-        return;
-    }
-
     memset(s, 0, sizeof(BDRVQcow2State));
     options = qdict_clone_shallow(bs->options);
 
diff --git a/block/qed.c b/block/qed.c
index 0af5274..9081941 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -1594,12 +1594,6 @@ static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp)
 
     bdrv_qed_close(bs);
 
-    bdrv_invalidate_cache(bs->file->bs, &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        return;
-    }
-
     memset(s, 0, sizeof(BDRVQEDState));
     ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err);
     if (local_err) {
diff --git a/block/quorum.c b/block/quorum.c
index da15465..8f7c099 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -747,21 +747,6 @@ static int64_t quorum_getlength(BlockDriverState *bs)
     return result;
 }
 
-static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp)
-{
-    BDRVQuorumState *s = bs->opaque;
-    Error *local_err = NULL;
-    int i;
-
-    for (i = 0; i < s->num_children; i++) {
-        bdrv_invalidate_cache(s->children[i]->bs, &local_err);
-        if (local_err) {
-            error_propagate(errp, local_err);
-            return;
-        }
-    }
-}
-
 static coroutine_fn int quorum_co_flush(BlockDriverState *bs)
 {
     BDRVQuorumState *s = bs->opaque;
@@ -1070,7 +1055,6 @@ static BlockDriver bdrv_quorum = {
 
     .bdrv_aio_readv         = quorum_aio_readv,
     .bdrv_aio_writev        = quorum_aio_writev,
-    .bdrv_invalidate_cache  = quorum_invalidate_cache,
 
     .bdrv_detach_aio_context    = quorum_detach_aio_context,
     .bdrv_attach_aio_context    = quorum_attach_aio_context,
-- 
2.8.2




[Qemu-block] [PATCH v2 3/3] block: Inactivate all children

2016-05-10 Thread Fam Zheng
Currently we only inactivate the top BDS. Actually bdrv_inactivate
should be the opposite of bdrv_invalidate_cache.

Recurse into the whole subtree instead.

Because a node may have multiple parents, and because once
BDRV_O_INACTIVE is set for a node no further writes to it are allowed,
we cannot interleave the flag setting and the .bdrv_inactivate calls
(which may submit writes to other nodes in the graph) within a single
pass. Therefore two passes are used here.

Signed-off-by: Fam Zheng 
---
 block.c | 47 ++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 36 insertions(+), 11 deletions(-)

diff --git a/block.c b/block.c
index fa8b38f..93481c4 100644
--- a/block.c
+++ b/block.c
@@ -3258,38 +3258,63 @@ void bdrv_invalidate_cache_all(Error **errp)
     }
 }
 
-static int bdrv_inactivate(BlockDriverState *bs)
+static int bdrv_inactivate_recurse(BlockDriverState *bs,
+                                   bool setting_flag)
 {
+    BdrvChild *child;
     int ret;
 
-    if (bs->drv->bdrv_inactivate) {
+    if (!setting_flag && bs->drv->bdrv_inactivate) {
         ret = bs->drv->bdrv_inactivate(bs);
         if (ret < 0) {
             return ret;
         }
     }
 
-    bs->open_flags |= BDRV_O_INACTIVE;
+    QLIST_FOREACH(child, &bs->children, next) {
+        ret = bdrv_inactivate_recurse(child->bs, setting_flag);
+        if (ret < 0) {
+            return ret;
+        }
+    }
+
+    if (setting_flag) {
+        bs->open_flags |= BDRV_O_INACTIVE;
+    }
     return 0;
 }
 
 int bdrv_inactivate_all(void)
 {
     BlockDriverState *bs = NULL;
-    int ret;
+    int ret = 0;
+    int pass;
 
     while ((bs = bdrv_next(bs)) != NULL) {
-        AioContext *aio_context = bdrv_get_aio_context(bs);
+        aio_context_acquire(bdrv_get_aio_context(bs));
+    }
 
-        aio_context_acquire(aio_context);
-        ret = bdrv_inactivate(bs);
-        aio_context_release(aio_context);
-        if (ret < 0) {
-            return ret;
+    /* We do two passes of inactivation. The first pass calls the drivers'
+     * .bdrv_inactivate callbacks recursively so all caches are flushed to
+     * disk; the second pass sets the BDRV_O_INACTIVE flag so that no further
+     * write is allowed. */
+    for (pass = 0; pass < 2; pass++) {
+        bs = NULL;
+        while ((bs = bdrv_next(bs)) != NULL) {
+            ret = bdrv_inactivate_recurse(bs, pass);
+            if (ret < 0) {
+                goto out;
+            }
         }
     }
 
-    return 0;
+out:
+    bs = NULL;
+    while ((bs = bdrv_next(bs)) != NULL) {
+        aio_context_release(bdrv_get_aio_context(bs));
+    }
+
+    return ret;
 }
 
 /**/
-- 
2.8.2




[Qemu-block] [PATCH v2 1/3] block: Invalidate all children

2016-05-10 Thread Fam Zheng
Currently we only recurse to bs->file, which will miss the children in quorum
and VMDK.

Recurse into the whole subtree to avoid that.

Signed-off-by: Fam Zheng 
---
 block.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/block.c b/block.c
index d4939b4..fa8b38f 100644
--- a/block.c
+++ b/block.c
@@ -3201,6 +3201,7 @@ void bdrv_init_with_whitelist(void)
 
 void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
 {
+    BdrvChild *child;
     Error *local_err = NULL;
     int ret;
 
@@ -3215,13 +3216,20 @@ void bdrv_invalidate_cache(BlockDriverState *bs, Error **errp)
 
     if (bs->drv->bdrv_invalidate_cache) {
         bs->drv->bdrv_invalidate_cache(bs, &local_err);
-    } else if (bs->file) {
-        bdrv_invalidate_cache(bs->file->bs, &local_err);
+        if (local_err) {
+            bs->open_flags |= BDRV_O_INACTIVE;
+            error_propagate(errp, local_err);
+            return;
+        }
     }
-    if (local_err) {
-        bs->open_flags |= BDRV_O_INACTIVE;
-        error_propagate(errp, local_err);
-        return;
+
+    QLIST_FOREACH(child, &bs->children, next) {
+        bdrv_invalidate_cache(child->bs, &local_err);
+        if (local_err) {
+            bs->open_flags |= BDRV_O_INACTIVE;
+            error_propagate(errp, local_err);
+            return;
+        }
     }
 
     ret = refresh_total_sectors(bs, bs->total_sectors);
-- 
2.8.2




Re: [Qemu-block] [Qemu-devel] [PATCHv2 0/3] Better 'Force Unit Access' handling

2016-05-10 Thread Fam Zheng
On Tue, 05/03 16:39, Eric Blake wrote:
> I noticed some inconsistencies in FUA handling while working
> with NBD, then Kevin pointed out that my initial attempt wasn't
> quite right for iscsi which also had problems, so this has
> expanded into a series rather than a single patch.
> 
> I'm not sure if this is qemu-stable material at this point.
> 
> Depends on Kevin's block-next branch.
> 
> Also available as a tag at this location:
> git fetch git://repo.or.cz/qemu/ericb.git nbd-fua-v2
> 
> v1 was here:
> https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg04674.html
> 
> diffstat not worth posting, as the series is basically rewritten
> 
> Eric Blake (3):
>   block: Make supported_write_flags a per-bds property
>   block: Honor BDRV_REQ_FUA during write_zeroes
>   nbd: Simplify client FUA handling

Reviewed-by: Fam Zheng 

> 
>  block/nbd-client.h        |  2 +-
>  include/block/block_int.h | 11 +++
>  block/io.c                | 39 ++-
>  block/iscsi.c             | 11 ---
>  block/nbd-client.c        | 11 +++
>  block/nbd.c               | 28 +++-
>  block/raw-posix.c         |  1 +
>  block/raw_bsd.c           |  3 ++-
>  8 files changed, 59 insertions(+), 47 deletions(-)
> 
> -- 
> 2.5.5
> 
> 



Re: [Qemu-block] [Qemu-devel] [PATCH v4 08/27] osdep: Add qemu_lock_fd and qemu_unlock_fd

2016-05-10 Thread Fam Zheng
On Wed, 05/11 08:48, Fam Zheng wrote:
> racy problem. Any suggestion how this could be fixed?

Reading into the subthread I see the answer: the file-private locks look
promising. Will take a look at that! Thanks.

Fam
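
[For context: the "file-private" locks mentioned here became Linux's open
file description (OFD) locks. A minimal sketch of why they help -- purely
illustrative, not from any posted patch; assumes Linux >= 3.15, glibc with
_GNU_SOURCE, and an arbitrary file name:]

    #define _GNU_SOURCE
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        struct flock fl = {
            .l_type = F_WRLCK, .l_whence = SEEK_SET,
            .l_start = 0, .l_len = 0,   /* 0 = whole file */
            .l_pid = 0,                 /* must be 0 for OFD locks */
        };
        int fd1 = open("image.qcow2", O_RDWR);
        int fd2 = open("image.qcow2", O_RDWR);  /* second fd, same file */

        if (fcntl(fd1, F_OFD_SETLK, &fl) == -1) {
            perror("F_OFD_SETLK");
            return 1;
        }
        /* An OFD lock is owned by the open file description, not the
         * process, so closing an unrelated fd does not drop it. */
        close(fd2);
        /* ...fd1 still holds the write lock here... */
        close(fd1);
        return 0;
    }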




Re: [Qemu-block] [Qemu-devel] [PATCH v4 08/27] osdep: Add qemu_lock_fd and qemu_unlock_fd

2016-05-10 Thread Fam Zheng
On Tue, 05/10 09:57, Daniel P. Berrange wrote:
> On Tue, May 10, 2016 at 10:50:40AM +0800, Fam Zheng wrote:
> > They are wrappers of POSIX fcntl file locking, with the additional
> > interception of open/close (through qemu_open and qemu_close) to offer a
> > better semantics that preserves the locks across multiple life cycles of
> > different fds on the same file.  The reason to make this semantics
> > change over the fcntl locks is to make the API cleaner for QEMU
> > subsystems.
> > 
> > More precisely, we remove this "feature" of fcntl lock:
> > 
> > If a process closes any file descriptor referring to a file, then
> > all of the process's locks on that file are released, regardless of
> > the file descriptor(s) on which the locks were obtained.
> > 
> > as long as the fd is always open/closed through qemu_open and
> > qemu_close.
> 
> You're not actually really removing that problem - this is just hacking
> around it in a manner which is racy & susceptible to silent failure.
> 
> 
> > +static int qemu_fd_close(int fd)
> > +{
> > +    LockEntry *ent, *tmp;
> > +    LockRecord *rec;
> > +    char *path;
> > +    int ret;
> > +
> > +    assert(fd_to_path);
> > +    path = g_hash_table_lookup(fd_to_path, GINT_TO_POINTER(fd));
> > +    assert(path);
> > +    g_hash_table_remove(fd_to_path, GINT_TO_POINTER(fd));
> > +    rec = g_hash_table_lookup(lock_map, path);
> > +    ret = close(fd);
> > +
> > +    if (rec) {
> > +        QLIST_FOREACH_SAFE(ent, &rec->records, next, tmp) {
> > +            int r;
> > +            if (ent->fd == fd) {
> > +                QLIST_REMOVE(ent, next);
> > +                g_free(ent);
> > +                continue;
> > +            }
> > +            r = qemu_lock_do(ent->fd, ent->start, ent->len,
> > +                             ent->readonly ? F_RDLCK : F_WRLCK);
> > +            if (r == -1) {
> > +                fprintf(stderr, "Failed to acquire lock on fd %d: %s\n",
> > +                        ent->fd, strerror(errno));
> > +            }
> 
> If another app has acquired the lock between the close and this attempt
> to re-acquire the lock, then QEMU is silently carrying on without any
> lock being held. For something that's intending to provide protection
> against concurrent use I think this is not an acceptable failure
> scenario.
> 
> 
> > +int qemu_open(const char *name, int flags, ...)
> > +{
> > +    int mode = 0;
> > +    int ret;
> > +
> > +    if (flags & O_CREAT) {
> > +        va_list ap;
> > +
> > +        va_start(ap, flags);
> > +        mode = va_arg(ap, int);
> > +        va_end(ap);
> > +    }
> > +    ret = qemu_fd_open(name, flags, mode);
> > +    if (ret >= 0) {
> > +        qemu_fd_add_record(ret, name);
> > +    }
> > +    return ret;
> > +}
> 
> I think the approach you have here is fundamentally not usable with
> fcntl locks, because it is still using the pattern
> 
>    fd = open(path)
>    lock(fd)
>    if failed lock
>        close(fd)
>    ...do stuff.
> 
> 
> As mentioned in previous posting I believe the block layer must be
> checking whether it already has a lock held against "path" *before*
> even attempting to open the file. Once QEMU has the file descriptor
> open it is too later, because closing the FD will release pre-existing
> locks QEMU has.

There will still be valid close() calls that will temporarily release
necessary locks.  You are right that the "close + re-acquire" in
qemu_fd_close() is racy.  Any suggestion how this could be fixed?  Something
like introducing a "close2" syscall that doesn't drop locks on other fds?

Fam
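
[The fcntl() behavior being debated above, as a standalone sketch --
illustrative only, with a hypothetical file name; this is the semantics
quoted from the man page, where closing *any* fd drops *all* of the
process's locks on that file:]

    #include <fcntl.h>
    #include <unistd.h>

    int main(void)
    {
        struct flock fl = {
            .l_type = F_WRLCK, .l_whence = SEEK_SET,
            .l_start = 0, .l_len = 0,   /* 0 = whole file */
        };
        int fd1 = open("image.qcow2", O_RDWR);
        int fd2 = open("image.qcow2", O_RDONLY);  /* e.g. a probe path */

        fcntl(fd1, F_SETLK, &fl);   /* lock taken through fd1 */
        close(fd2);                 /* silently drops fd1's lock too */
        /* Between here and any re-acquisition, another process can take
         * the lock -- the racy window discussed above. */
        close(fd1);
        return 0;
    }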



Re: [Qemu-block] [Nbd] [Qemu-devel] [PATCH] nbd: fix trim/discard commands with a length bigger than NBD_MAX_BUFFER_SIZE

2016-05-10 Thread Eric Blake
On 05/10/2016 10:33 AM, Quentin Casasnovas wrote:

> Looks like there's an easier way:
> 
>  $ qemu-img create -f qcow2 foo.qcow2 10G
>  $ qemu-nbd --discard=on -c /dev/nbd0 foo.qcow2
>  $ mkfs.ext4 /dev/nbd0
>  mke2fs 1.42.13 (17-May-2015)
>  Discarding device blocks: failed - Input/output error

strace on mkfs.ext4 shows:
...
open("/dev/nbd1", O_RDWR|O_EXCL)= 3
stat("/dev/nbd1", {st_mode=S_IFBLK|0660, st_rdev=makedev(43, 1), ...}) = 0
ioctl(3, BLKDISCARDZEROES, [0]) = 0
ioctl(3, BLKROGET, [0]) = 0
uname({sysname="Linux", nodename="red", ...}) = 0
ioctl(3, BLKDISCARD, [0, 100])  = 0
write(1, "Discarding device blocks: ", 26) = 26
write(1, "   4096/2621440", 15   4096/2621440) = 15
write(1, "\10\10\10\10\10\10\10\10\10\10\10\1) = 15
ioctl(3, BLKDISCARD, [100, 8000]) = -1 EIO (Input/output error)
write(1, "   ", 15   ) = 15
write(1, "\10\10\10\10\10\10\10\10\10\10\10\1) = 15
write(1, "failed - ", 9failed - )= 9

so it does indeed look like the smaller request [0, 0x100] succeeded
while the larger [0x100, 0x8000] failed with EIO.  But I
instrumented my qemu-nbd server to output all of the incoming requests
it was receiving from the kernel, and I never saw a mention of
NBD_CMD_TRIM being received.  Maybe it's dependent on what version of
the kernel and/or NBD kernel module you are running?

I also tried fallocate(1) on /dev/nbd1, as in:
 # strace fallocate -o 0 -l 64k -p /dev/nbd1
but it looks like the kernel doesn't yet wire up fallocate(2) to forward
to the appropriate device ioctls and/or NBD_CMD_TRIM, where strace says:

open("/dev/nbd1", O_RDWR)   = 3
fallocate(3, FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE, 0, 65536) = -1
ENODEV (No such device)

The ENODEV is a rather unusual choice of error.

-- 
Eric Blake   eblake redhat com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org
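
[A more direct way to poke at the same failure, sketched from the strace
above -- untested, and the device path and length are placeholders;
BLKDISCARD takes a uint64_t[2] of { offset, length } in bytes:]

    #include <fcntl.h>
    #include <linux/fs.h>     /* BLKDISCARD */
    #include <stdint.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>

    int main(void)
    {
        int fd = open("/dev/nbd0", O_RDWR);
        if (fd < 0) {
            perror("open");
            return 1;
        }
        /* Anything larger than qemu's NBD_MAX_BUFFER_SIZE (32 MB)
         * should reproduce the EIO seen by mkfs.ext4. */
        uint64_t range[2] = { 0, 1ULL << 30 };  /* offset 0, length 1 GB */
        if (ioctl(fd, BLKDISCARD, range) < 0) {
            perror("BLKDISCARD");
        }
        close(fd);
        return 0;
    }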





Re: [Qemu-block] [Nbd] [Qemu-devel] [PATCH] nbd: fix trim/discard commands with a length bigger than NBD_MAX_BUFFER_SIZE

2016-05-10 Thread Paolo Bonzini


On 10/05/2016 17:38, Alex Bligh wrote:
> > and are at the
> > mercy of however the kernel currently decides to split large requests).
> 
> I am surprised TRIM doesn't get broken up the same way READ and WRITE
> do.

The payload of TRIM has constant size, so it makes sense not to break it up.
The kernel then self-imposes a 2GB limit in blkdev_issue_discard.

On the other hand, READ and WRITE of size n can possibly require O(n)
memory.

Paolo



Re: [Qemu-block] [Nbd] [Qemu-devel] [PATCH] nbd: fix trim/discard commands with a length bigger than NBD_MAX_BUFFER_SIZE

2016-05-10 Thread Quentin Casasnovas
On Tue, May 10, 2016 at 05:54:44PM +0200, Quentin Casasnovas wrote:
> On Tue, May 10, 2016 at 09:46:36AM -0600, Eric Blake wrote:
> > On 05/10/2016 09:41 AM, Alex Bligh wrote:
> > > 
> > > On 10 May 2016, at 16:29, Eric Blake  wrote:
> > > 
> > >> So the kernel is currently one of the clients that does NOT honor block
> > >> sizes, and as such, servers should be prepared for ANY size up to
> > >> UINT_MAX (other than DoS handling).
> > > 
> > > Interesting followup question:
> > > 
> > > If the kernel does not fragment TRIM requests at all (in the
> > > same way it fragments read and write requests), I suspect
> > > something bad may happen with TRIM requests over 2^31
> > > in size (particularly over 2^32 in size), as the length
> > > field in nbd only has 32 bits.
> > > 
> > > Whether it supports block size constraints or not, it is
> > > going to need to do *some* breaking up of requests.
> > 
> > Does anyone have an easy way to cause the kernel to request a trim
> > operation that large on a > 4G export?  I'm not familiar enough with
> > EXT4 operation to know what file system operations you can run to
> > ultimately indirectly create a file system trim operation that large.
> > But maybe there is something simpler - does the kernel let you use the
> > fallocate(2) syscall operation with FALLOC_FL_PUNCH_HOLE or
> > FALLOC_FL_ZERO_RANGE on an fd backed by an NBD device?
> > 
> 
> It was fairly reproducible here, we just used a random qcow2 image with
> some Debian minimal system pre-installed, mounted that qcow2 image through
> qemu-nbd then compiled a whole kernel inside it.  Then you can make clean
> and run fstrim on the mount point.  I'm assuming you can go faster than
> that by just writing a big file to the qcow2 image mounted without -o
> discard, delete the big file, then remount with -o discard + run fstrim.
> 

Looks like there's an easier way:

 $ qemu-img create -f qcow2 foo.qcow2 10G
 $ qemu-nbd --discard=on -c /dev/nbd0 foo.qcow2
 $ mkfs.ext4 /dev/nbd0
 mke2fs 1.42.13 (17-May-2015)
 Discarding device blocks: failed - Input/output error
 Creating filesystem with 2621440 4k blocks and 655360 inodes
 Filesystem UUID: 25aeb51f-0dea-4c1d-8b65-61f6bcdf97e9
 Superblock backups stored on blocks:
   32768, 98304, 163840, 229376, 294912, 819200, 884736, 1605632

 Allocating group tables: done
 Writing inode tables: done
 Creating journal (32768 blocks): done
 Writing superblocks and filesystem accounting information: done

Notice the "Discarding device blocks: failed - Input/output error" line; I
bet that mkfs.ext4 is trying to trim all blocks prior to writing the
filesystem, but it gets an I/O error while doing so.  I haven't verified it
is the same problem, but if it isn't, simply mount the resulting filesystem
and run fstrim on it:

 $ mount -o discard /dev/nbd0 /tmp/foo
 $ fstrim /tmp/foo
 fstrim: /tmp/foo: FITRIM ioctl failed: Input/output error

Quentin



[Qemu-block] [PATCH v9 3/3] block: enable testing of LUKS driver with block I/O tests

2016-05-10 Thread Daniel P. Berrange
This adds support for testing the LUKS driver with the block
I/O test framework.

    cd tests/qemu-iotests
    ./check -luks

A handful of test cases are modified to work with luks

 - 004 - whitelist luks format
 - 012 - use TEST_IMG_FILE instead of TEST_IMG for file ops
 - 048 - use TEST_IMG_FILE instead of TEST_IMG for file ops;
         don't assume extended image contents is all zeros,
         explicitly initialize with zeros;
         make file size smaller to avoid having to decrypt
         1 GB of data
 - 052 - don't assume initial image contents is all zeros,
         explicitly initialize with zeros
 - 100 - don't assume initial image contents is all zeros,
         explicitly initialize with zeros

With this patch applied, the results are as follows:

  Passed: 001 002 003 004 005 008 009 010 011 012 021 032 043
  047 048 049 052 087 100 134 143
  Failed: 033 120 140 145
 Skipped: 007 013 014 015 017 018 019 020 022 023 024 025 026
  027 028 029 030 031 034 035 036 037 038 039 040 041
  042 043 044 045 046 047 049 050 051 053 054 055 056
  057 058 059 060 061 062 063 064 065 066 067 068 069
  070 071 072 073 074 075 076 077 078 079 080 081 082
  083 084 085 086 087 088 089 090 091 092 093 094 095
  096 097 098 099 101 102 103 104 105 107 108 109 110
  111 112 113 114 115 116 117 118 119 121 122 123 124
  128 129 130 131 132 133 134 135 136 137 138 139 141
  142 144 146 148 150 152

The reasons for the failed tests are:

 - 033 - needs adapting to use image opts syntax with blkdebug
 and test image in order to correctly set align property
 - 120 - needs adapting to use correct -drive syntax for luks
 - 140 - needs adapting to use correct -drive syntax for luks
 - 145 - needs adapting to use correct -drive syntax for luks

The vast majority of skipped tests are exercising code that is
qcow2 specific, though a couple could probably be usefully
enabled for luks too.

Signed-off-by: Daniel P. Berrange 
---
 tests/qemu-iotests/004       |  2 +-
 tests/qemu-iotests/012       |  5 -
 tests/qemu-iotests/048       | 26 +++---
 tests/qemu-iotests/048.out   |  6 --
 tests/qemu-iotests/052       |  4 +++
 tests/qemu-iotests/052.out   |  4 +++
 tests/qemu-iotests/100       |  7 +++
 tests/qemu-iotests/100.out   | 14 ++
 tests/qemu-iotests/common    |  7 +++
 tests/qemu-iotests/common.rc |  3 +++
 10 files changed, 67 insertions(+), 11 deletions(-)

diff --git a/tests/qemu-iotests/004 b/tests/qemu-iotests/004
index 67e1beb..6f2aa3d 100755
--- a/tests/qemu-iotests/004
+++ b/tests/qemu-iotests/004
@@ -37,7 +37,7 @@ trap "_cleanup; exit \$status" 0 1 2 3 15
 . ./common.rc
 . ./common.filter
 
-_supported_fmt raw qcow qcow2 qed vdi vmdk vhdx
+_supported_fmt raw qcow qcow2 qed vdi vmdk vhdx luks
 _supported_proto generic
 _supported_os Linux
 
diff --git a/tests/qemu-iotests/012 b/tests/qemu-iotests/012
index d1d3f22..01a770d 100755
--- a/tests/qemu-iotests/012
+++ b/tests/qemu-iotests/012
@@ -43,13 +43,16 @@ _supported_fmt generic
 _supported_proto file
 _supported_os Linux
 
+# Remove once all tests are fixed to use TEST_IMG_FILE
+# correctly and common.rc sets it unconditionally
+test -z "$TEST_IMG_FILE" && TEST_IMG_FILE=$TEST_IMG
 
 size=128M
 _make_test_img $size
 
 echo
 echo "== mark image read-only"
-chmod a-w "$TEST_IMG"
+chmod a-w "$TEST_IMG_FILE"
 
 echo
 echo "== read from read-only image"
diff --git a/tests/qemu-iotests/048 b/tests/qemu-iotests/048
index e1eeac2..203c04f 100755
--- a/tests/qemu-iotests/048
+++ b/tests/qemu-iotests/048
@@ -31,13 +31,13 @@ _cleanup()
 {
 echo "Cleanup"
 _cleanup_test_img
-rm "${TEST_IMG2}"
+rm "${TEST_IMG_FILE2}"
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
 
 _compare()
 {
-$QEMU_IMG compare "$@" "$TEST_IMG" "${TEST_IMG2}"
+$QEMU_IMG compare $QEMU_IMG_EXTRA_ARGS "$@" "$TEST_IMG" "${TEST_IMG2}"
 echo $?
 }
 
@@ -46,25 +46,37 @@ _compare()
 . ./common.filter
 . ./common.pattern
 
-_supported_fmt raw qcow qcow2 qed
+_supported_fmt raw qcow qcow2 qed luks
 _supported_proto file
 _supported_os Linux
 
+# Remove once all tests are fixed to use TEST_IMG_FILE
+# correctly and common.rc sets it unconditionally
+test -z "$TEST_IMG_FILE" && TEST_IMG_FILE=$TEST_IMG
+
 # Setup test basic parameters
 TEST_IMG2=$TEST_IMG.2
+TEST_IMG_FILE2=$TEST_IMG_FILE.2
 CLUSTER_SIZE=4096
-size=1024M
+size=128M
 
 _make_test_img $size
 io_pattern write 524288 $CLUSTER_SIZE $CLUSTER_SIZE 4 45
 
 # Compare identical images
-cp "$TEST_IMG" "${TEST_IMG2}"
+cp "$TEST_IMG_FILE" "${TEST_IMG_FILE2}"
 _compare
 _compare -q
 
 # Compare images with different size
-$QEMU_IMG resize -f $IMGFMT "$TEST_IMG" +512M
+if [ "$IMGOPTSSYNTAX" = "true" ]; then
+$QEMU_IMG resize $QEMU_IMG_EXTRA_ARGS "$TEST_IMG" +32M
+else
+$QEMU_IMG resize -f $IMGFMT "$TEST_IMG" +32M
+fi
+# Ensure extended space is zero-initialized
+$QEMU_IO 

[Qemu-block] [PATCH v9 2/3] block: add support for encryption secrets in block I/O tests

2016-05-10 Thread Daniel P. Berrange
The LUKS block driver tests will require the ability to specify
encryption secrets with block devices. This requires using the
--object argument to qemu-img/qemu-io to create a 'secret'
object.

When the IMGKEYSECRET env variable is set, it provides the
password to be associated with a secret called 'keysec0'

The _qemu_img_wrapper function isn't modified as that needs
to cope with differing syntax for subcommands, so can't be
made to use the image opts syntax unconditionally.

Signed-off-by: Daniel P. Berrange 
---
 tests/qemu-iotests/common| 1 +
 tests/qemu-iotests/common.config | 6 ++
 tests/qemu-iotests/common.filter | 3 ++-
 tests/qemu-iotests/common.rc | 9 +++--
 4 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/tests/qemu-iotests/common b/tests/qemu-iotests/common
index fe3b1a0..e87287c 100644
--- a/tests/qemu-iotests/common
+++ b/tests/qemu-iotests/common
@@ -53,6 +53,7 @@ export QEMU_IO_OPTIONS=""
 export CACHEMODE_IS_DEFAULT=true
 export QEMU_OPTIONS="-nodefaults"
 export VALGRIND_QEMU=
+export IMGKEYSECRET=
 export IMGOPTSSYNTAX=false
 
 for r
diff --git a/tests/qemu-iotests/common.config b/tests/qemu-iotests/common.config
index ea698da..f6384fb 100644
--- a/tests/qemu-iotests/common.config
+++ b/tests/qemu-iotests/common.config
@@ -126,6 +126,9 @@ _qemu_io_wrapper()
     local QEMU_IO_ARGS="$QEMU_IO_OPTIONS"
     if [ "$IMGOPTSSYNTAX" = "true" ]; then
         QEMU_IO_ARGS="--image-opts $QEMU_IO_ARGS"
+        if [ -n "$IMGKEYSECRET" ]; then
+            QEMU_IO_ARGS="--object secret,id=keysec0,data=$IMGKEYSECRET $QEMU_IO_ARGS"
+        fi
     fi
 local RETVAL
 (
@@ -161,6 +164,9 @@ export QEMU_NBD=_qemu_nbd_wrapper
 QEMU_IMG_EXTRA_ARGS=
 if [ "$IMGOPTSSYNTAX" = "true" ]; then
 QEMU_IMG_EXTRA_ARGS="--image-opts $QEMU_IMG_EXTRA_ARGS"
+if [ -n "$IMGKEYSECRET" ]; then
+QEMU_IMG_EXTRA_ARGS="--object secret,id=keysec0,data=$IMGKEYSECRET 
$QEMU_IMG_EXTRA_ARGS"
+fi
 fi
 export QEMU_IMG_EXTRA_ARGS
 
diff --git a/tests/qemu-iotests/common.filter b/tests/qemu-iotests/common.filter
index 8a6e1b5..d8188a5 100644
--- a/tests/qemu-iotests/common.filter
+++ b/tests/qemu-iotests/common.filter
@@ -97,7 +97,8 @@ _filter_img_create()
 -e "s# block_state_zero=\\(on\\|off\\)##g" \
 -e "s# log_size=[0-9]\\+##g" \
 -e "s/archipelago:a/TEST_DIR\//g" \
--e "s# refcount_bits=[0-9]\\+##g"
+-e "s# refcount_bits=[0-9]\\+##g" \
+-e "s# key-secret=[a-zA-Z0-9]\\+##g"
 }
 
 _filter_img_info()
diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index 080f1bc..164792d 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -129,6 +129,7 @@ _make_test_img()
 local img_name=""
 local use_backing=0
 local backing_file=""
+local object_options=""
 
 if [ -n "$TEST_IMG_FILE" ]; then
 img_name=$TEST_IMG_FILE
@@ -139,6 +140,10 @@ _make_test_img()
 if [ -n "$IMGOPTS" ]; then
 optstr=$(_optstr_add "$optstr" "$IMGOPTS")
 fi
+if [ -n "$IMGKEYSECRET" ]; then
+object_options="--object secret,id=keysec0,data=$IMGKEYSECRET"
+optstr=$(_optstr_add "$optstr" "key-secret=keysec0")
+fi
 
 if [ "$1" = "-b" ]; then
 use_backing=1
@@ -156,9 +161,9 @@ _make_test_img()
     # XXX(hch): have global image options?
     (
      if [ $use_backing = 1 ]; then
-        $QEMU_IMG create -f $IMGFMT $extra_img_options -b "$backing_file" "$img_name" $image_size 2>&1
+        $QEMU_IMG create $object_options -f $IMGFMT $extra_img_options -b "$backing_file" "$img_name" $image_size 2>&1
      else
-        $QEMU_IMG create -f $IMGFMT $extra_img_options "$img_name" $image_size 2>&1
+        $QEMU_IMG create $object_options -f $IMGFMT $extra_img_options "$img_name" $image_size 2>&1
      fi
     ) | _filter_img_create
 
-- 
2.5.5




[Qemu-block] [PATCH v9 1/3] block: add support for --image-opts in block I/O tests

2016-05-10 Thread Daniel P. Berrange
Currently all block tests use the traditional syntax for images
just specifying a filename. To support the LUKS driver without
resorting to JSON, the tests need to be able to use the new
--image-opts argument to qemu-img and qemu-io.

This introduces a new env variable IMGOPTSSYNTAX. If this is
set to 'true', then qemu-img/qemu-io should use --image-opts.

Signed-off-by: Daniel P. Berrange 
---
 tests/qemu-iotests/039.out   | 20 +++---
 tests/qemu-iotests/061.out   |  8 +++---
 tests/qemu-iotests/137.out   |  4 +--
 tests/qemu-iotests/common|  7 -
 tests/qemu-iotests/common.config | 15 +--
 tests/qemu-iotests/common.rc | 57 +---
 6 files changed, 77 insertions(+), 34 deletions(-)

diff --git a/tests/qemu-iotests/039.out b/tests/qemu-iotests/039.out
index 32c8846..c6e0ac2 100644
--- a/tests/qemu-iotests/039.out
+++ b/tests/qemu-iotests/039.out
@@ -12,9 +12,9 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
 wrote 512/512 bytes at offset 0
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 ./common.config: Killed  ( if [ "${VALGRIND_QEMU}" == "y" ]; then
-exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_ARGS "$@";
 else
-exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+exec "$QEMU_IO_PROG" $QEMU_IO_ARGS "$@";
 fi )
 incompatible_features 0x1
 ERROR cluster 5 refcount=0 reference=1
@@ -51,9 +51,9 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
 wrote 512/512 bytes at offset 0
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 ./common.config: Killed  ( if [ "${VALGRIND_QEMU}" == "y" ]; then
-exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_ARGS "$@";
 else
-exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+exec "$QEMU_IO_PROG" $QEMU_IO_ARGS "$@";
 fi )
 incompatible_features 0x1
 ERROR cluster 5 refcount=0 reference=1
@@ -69,9 +69,9 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
 wrote 512/512 bytes at offset 0
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 ./common.config: Killed  ( if [ "${VALGRIND_QEMU}" == "y" ]; then
-exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_ARGS "$@";
 else
-exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+exec "$QEMU_IO_PROG" $QEMU_IO_ARGS "$@";
 fi )
 incompatible_features 0x0
 No errors were found on the image.
@@ -92,9 +92,9 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
 wrote 512/512 bytes at offset 0
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 ./common.config: Killed  ( if [ "${VALGRIND_QEMU}" == "y" ]; then
-exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_ARGS "$@";
 else
-exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+exec "$QEMU_IO_PROG" $QEMU_IO_ARGS "$@";
 fi )
 incompatible_features 0x1
 ERROR cluster 5 refcount=0 reference=1
@@ -106,9 +106,9 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
 wrote 512/512 bytes at offset 0
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 ./common.config: Killed  ( if [ "${VALGRIND_QEMU}" == "y" ]; then
-exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_ARGS "$@";
 else
-exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+exec "$QEMU_IO_PROG" $QEMU_IO_ARGS "$@";
 fi )
 incompatible_features 0x0
 No errors were found on the image.
diff --git a/tests/qemu-iotests/061.out b/tests/qemu-iotests/061.out
index a03732e..a431b7f 100644
--- a/tests/qemu-iotests/061.out
+++ b/tests/qemu-iotests/061.out
@@ -58,9 +58,9 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
 wrote 131072/131072 bytes at offset 0
 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
 ./common.config: Killed  ( if [ "${VALGRIND_QEMU}" == "y" ]; then
-exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_ARGS "$@";
 else
-exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+exec "$QEMU_IO_PROG" $QEMU_IO_ARGS "$@";
 fi )
 magic 0x514649fb
 

[Qemu-block] [PATCH v9 0/3] Tests for LUKS driver

2016-05-10 Thread Daniel P. Berrange
This series contains the 3 test suite patches that had to be dropped
from the v6 series during merge with the block tree:

 v6: https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg04935.html
 v7: https://lists.gnu.org/archive/html/qemu-devel/2016-03/msg06687.html
 v8: https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg02736.html

Changed in v9:

 - Use QEMU_IMG_EXTRA_ARGS in common.rc
 - Use 'write -z' instead of 'write -P 0'

Changed in v8:

 - Add conversion of 039.out & 137.out to new opts variable name
 - Fix 012 and 048 to not assume TEST_IMG_FILE is always set
 - Clarify why we don't change _qemu_img_wrapper in commit msg

Changed in v7:

 - Avoid setting TEST_IMG_FILE when IMGPROTO=file in common.rc
   for traditional (not --image-opts) variable setup

Daniel P. Berrange (3):
  block: add support for --image-opts in block I/O tests
  block: add support for encryption secrets in block I/O tests
  block: enable testing of LUKS driver with block I/O tests

 tests/qemu-iotests/004           |  2 +-
 tests/qemu-iotests/012           |  5 ++-
 tests/qemu-iotests/039.out       | 20 ++--
 tests/qemu-iotests/048           | 26 +++
 tests/qemu-iotests/048.out       |  6 ++--
 tests/qemu-iotests/052           |  4 +++
 tests/qemu-iotests/052.out       |  4 +++
 tests/qemu-iotests/061.out       |  8 ++---
 tests/qemu-iotests/100           |  7 +++
 tests/qemu-iotests/100.out       | 14 ++
 tests/qemu-iotests/137.out       |  4 +--
 tests/qemu-iotests/common        | 15 -
 tests/qemu-iotests/common.config | 21 ++--
 tests/qemu-iotests/common.filter |  3 +-
 tests/qemu-iotests/common.rc     | 69 ++--
 15 files changed, 160 insertions(+), 48 deletions(-)

-- 
2.5.5




Re: [Qemu-block] [Nbd] [Qemu-devel] [PATCH] nbd: fix trim/discard commands with a length bigger than NBD_MAX_BUFFER_SIZE

2016-05-10 Thread Quentin Casasnovas
On Tue, May 10, 2016 at 04:49:57PM +0100, Alex Bligh wrote:
> 
> On 10 May 2016, at 16:45, Quentin Casasnovas  
> wrote:
> 
> > I'm by no means an expert in this, but why would the kernel break up those
> > TRIM commands?  After all, breaking things up makes sense when the length
> > of the request is big, not that much when it only contains the request
> > header, which is the case for TRIM commands.
> 
> 1. You are assuming that the only reason for limiting the size of operations 
> is
>limiting the data transferred within one request. That is not necessarily
>the case. There are good reasons (if only orthogonality) to have limits
>in place even where no data is transferred.
> 
> 2. As and when the blocksize extension is implemented in the kernel (it isn't
>now), the protocol requires it.
> 
> 3. The maximum length of an NBD trim operation is 2^32. The maximum length
>of a trim operation is larger. Therefore the kernel needs to do at least
>some breaking up.

Alright, I assumed by 'length of an NBD request' the specification was
talking about the length of... well, the request, as opposed to whatever is
in the length field of the request header.

Is there a use case where you'd want to split up a single big TRIM request
into smaller ones (as in, some hardware would not support it or something)?
Even then, it looks like this splitting up would be hardware dependent and
better implemented in block device drivers.

I just find it odd that something that fits inside the length field can't
be used.  I do agree with your point number 3: obviously, if the length
field type doesn't allow something bigger than a u32, then the kernel has
to do some breaking up in that case.

Quentin



Re: [Qemu-block] [Nbd] [Qemu-devel] [PATCH] nbd: fix trim/discard commands with a length bigger than NBD_MAX_BUFFER_SIZE

2016-05-10 Thread Quentin Casasnovas
On Tue, May 10, 2016 at 09:46:36AM -0600, Eric Blake wrote:
> On 05/10/2016 09:41 AM, Alex Bligh wrote:
> > 
> > On 10 May 2016, at 16:29, Eric Blake  wrote:
> > 
> >> So the kernel is currently one of the clients that does NOT honor block
> >> sizes, and as such, servers should be prepared for ANY size up to
> >> UINT_MAX (other than DoS handling).
> > 
> > Interesting followup question:
> > 
> > If the kernel does not fragment TRIM requests at all (in the
> > same way it fragments read and write requests), I suspect
> > something bad may happen with TRIM requests over 2^31
> > in size (particularly over 2^32 in size), as the length
> > field in nbd only has 32 bits.
> > 
> > Whether it supports block size constraints or not, it is
> > going to need to do *some* breaking up of requests.
> 
> Does anyone have an easy way to cause the kernel to request a trim
> operation that large on a > 4G export?  I'm not familiar enough with
> EXT4 operation to know what file system operations you can run to
> ultimately indirectly create a file system trim operation that large.
> But maybe there is something simpler - does the kernel let you use the
> fallocate(2) syscall operation with FALLOC_FL_PUNCH_HOLE or
> FALLOC_FL_ZERO_RANGE on an fd backed by an NBD device?
> 

It was fairly reproducible here, we just used a random qcow2 image with
some Debian minimal system pre-installed, mounted that qcow2 image through
qemu-nbd then compiled a whole kernel inside it.  Then you can make clean
and run fstrim on the mount point.  I'm assuming you can go faster than
that by just writing a big file to the qcow2 image mounted without -o
discard, delete the big file, then remount with -o discard + run fstrim.

Quentin



Re: [Qemu-block] [Nbd] [Qemu-devel] [PATCH] nbd: fix trim/discard commands with a length bigger than NBD_MAX_BUFFER_SIZE

2016-05-10 Thread Quentin Casasnovas
On Tue, May 10, 2016 at 04:38:29PM +0100, Alex Bligh wrote:
> Eric,
> 
> On 10 May 2016, at 16:29, Eric Blake  wrote:
> >>> Maybe we should revisit that in the spec, and/or advertise yet another
> >>> block size (since the maximum size for a trim and/or write_zeroes
> >>> request may indeed be different than the maximum size for a read/write).
> >> 
> >> I think it's up to the server to either handle large requests, or
> >> for the client to break these up.
> > 
> > But the question at hand here is whether we should permit servers to
> > advertise multiple maximum block sizes (one for read/write, another one
> > for trim/write_zero, or even two [at least qemu tracks a separate
> > maximum trim vs. write_zero sizing in its generic block layer]), or
> > merely stick with the current wording that requires clients that honor
> > maximum block size to obey the same maximum for ALL commands, regardless
> > of amount of data sent over the wire.
> 
> In my view, we should not change this. Block sizes maxima are not there
> to support DoS prevention (that's a separate phrase). They are there
> to impose maximum block sizes. Adding a different maximum block size
> for different commands is way too overengineered. There are after
> all reasons (especially without structured replies) why you'd want
> different maximum block sizes for writes and reads. If clients support
> block sizes, they will necessarily have to have the infrastructure
> to break requests up.
> 
> IE maximum block size should continue to mean maximum block size.
> 
> >> 
> >> The core problem here is that the kernel (and, ahem, most servers) are
> >> ignorant of the block size extension, and need to guess how to break
> >> things up. In my view the client (kernel in this case) should
> >> be breaking the trim requests up into whatever size it uses as the
> >> maximum size write requests. But then it would have to know about block
> >> sizes which are in (another) experimental extension.
> > 
> > Correct - no one has yet patched the kernel to honor block sizes
> > advertised through what is currently an experimental extension.
> 
> Unsurprising as it's still experimental; it only settled down a couple
> of weeks ago :-)
> 
> >  (We
> > have ioctl(NBD_SET_BLKSIZE) which can be argued to set the kernel's
> > minimum block size,
> 
> Technically that is 'out of band transmission of block size
> constraints' :-)
> 
> > but I haven't audited whether the kernel actually
> > guarantees that all client requests are sent aligned to the value passed
> > that way - but we have nothing to set the maximum size,
> 
> indeed
> 
> > and are at the
> > mercy of however the kernel currently decides to split large requests).
> 
> I am surprised TRIM doesn't get broken up the same way READ and WRITE
> do.
>

I'm by no means an expert in this, but why would the kernel break up those
TRIM commands?  After all, breaking things up makes sense when the length
of the request is big, not that much when it only contains the request
header, which is the case for TRIM commands.

What am I missing?

Quentin



Re: [Qemu-block] [Nbd] [Qemu-devel] [PATCH] nbd: fix trim/discard commands with a length bigger than NBD_MAX_BUFFER_SIZE

2016-05-10 Thread Alex Bligh

On 10 May 2016, at 16:45, Quentin Casasnovas  
wrote:

> I'm by no means an expert in this, but why would the kernel break up those
> TRIM commands?  After all, breaking things up makes sense when the length
> of the request is big, not that much when it only contains the request
> header, which is the case for TRIM commands.

1. You are assuming that the only reason for limiting the size of operations is
   limiting the data transferred within one request. That is not necessarily
   the case. There are good reasons (if only orthogonality) to have limits
   in place even where no data is transferred.

2. As and when the blocksize extension is implemented in the kernel (it isn't
   now), the protocol requires it.

3. The maximum length of an NBD trim operation is 2^32. The maximum length
   of a trim operation is larger. Therefore the kernel needs to do at least
   some breaking up.

-- 
Alex Bligh
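
[Point 3 in concrete form -- a sketch of the splitting a client has to do
given NBD's 32-bit length field; send_trim() is a hypothetical submission
helper, and the cap could equally be a server-advertised maximum:]

    #include <stdint.h>

    int send_trim(uint64_t offset, uint32_t length);  /* hypothetical */

    int trim_range(uint64_t offset, uint64_t length)
    {
        while (length > 0) {
            /* NBD_CMD_TRIM's length field is only 32 bits wide */
            uint32_t chunk = length > UINT32_MAX ? UINT32_MAX
                                                 : (uint32_t)length;
            int ret = send_trim(offset, chunk);
            if (ret < 0) {
                return ret;
            }
            offset += chunk;
            length -= chunk;
        }
        return 0;
    }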







Re: [Qemu-block] [Nbd] [Qemu-devel] [PATCH] nbd: fix trim/discard commands with a length bigger than NBD_MAX_BUFFER_SIZE

2016-05-10 Thread Alex Bligh

On 10 May 2016, at 16:46, Eric Blake  wrote:

> Does anyone have an easy way to cause the kernel to request a trim
> operation that large on a > 4G export?  I'm not familiar enough with
> EXT4 operation to know what file system operations you can run to
> ultimately indirectly create a file system trim operation that large.
> But maybe there is something simpler - does the kernel let you use the
> fallocate(2) syscall operation with FALLOC_FL_PUNCH_HOLE or
> FALLOC_FL_ZERO_RANGE on an fd backed by an NBD device?

Not tried it, but fallocate(1) with -p ?

http://man7.org/linux/man-pages/man1/fallocate.1.html

As it takes length and offset in TB, PB, EB, ZB and YB, it
seems to be 64 bit aware :-)

--
Alex Bligh








Re: [Qemu-block] [Nbd] [Qemu-devel] [PATCH] nbd: fix trim/discard commands with a length bigger than NBD_MAX_BUFFER_SIZE

2016-05-10 Thread Eric Blake
On 05/10/2016 09:41 AM, Alex Bligh wrote:
> 
> On 10 May 2016, at 16:29, Eric Blake  wrote:
> 
>> So the kernel is currently one of the clients that does NOT honor block
>> sizes, and as such, servers should be prepared for ANY size up to
>> UINT_MAX (other than DoS handling).
> 
> Interesting followup question:
> 
> If the kernel does not fragment TRIM requests at all (in the
> same way it fragments read and write requests), I suspect
> something bad may happen with TRIM requests over 2^31
> in size (particularly over 2^32 in size), as the length
> field in nbd only has 32 bits.
> 
> Whether it supports block size constraints or not, it is
> going to need to do *some* breaking up of requests.

Does anyone have an easy way to cause the kernel to request a trim
operation that large on a > 4G export?  I'm not familiar enough with
EXT4 operation to know what file system operations you can run to
ultimately indirectly create a file system trim operation that large.
But maybe there is something simpler - does the kernel let you use the
fallocate(2) syscall operation with FALLOC_FL_PUNCH_HOLE or
FALLOC_FL_ZERO_RANGE on an fd backed by an NBD device?

-- 
Eric Blake   eblake redhat com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org





Re: [Qemu-block] [Nbd] [Qemu-devel] [PATCH] nbd: fix trim/discard commands with a length bigger than NBD_MAX_BUFFER_SIZE

2016-05-10 Thread Alex Bligh

On 10 May 2016, at 16:29, Eric Blake  wrote:

> So the kernel is currently one of the clients that does NOT honor block
> sizes, and as such, servers should be prepared for ANY size up to
> UINT_MAX (other than DoS handling).

Interesting followup question:

If the kernel does not fragment TRIM requests at all (in the
same way it fragments read and write requests), I suspect
something bad may happen with TRIM requests over 2^31
in size (particularly over 2^32 in size), as the length
field in nbd only has 32 bits.

Whether it supports block size constraints or not, it is
going to need to do *some* breaking up of requests.

--
Alex Bligh








Re: [Qemu-block] [Nbd] [Qemu-devel] [PATCH] nbd: fix trim/discard commands with a length bigger than NBD_MAX_BUFFER_SIZE

2016-05-10 Thread Alex Bligh
Eric,

On 10 May 2016, at 16:29, Eric Blake  wrote:
>>> Maybe we should revisit that in the spec, and/or advertise yet another
>>> block size (since the maximum size for a trim and/or write_zeroes
>>> request may indeed be different than the maximum size for a read/write).
>> 
>> I think it's up to the server to either handle large requests, or
>> for the client to break these up.
> 
> But the question at hand here is whether we should permit servers to
> advertise multiple maximum block sizes (one for read/write, another one
> for trim/write_zero, or even two [at least qemu tracks a separate
> maximum trim vs. write_zero sizing in its generic block layer]), or
> merely stick with the current wording that requires clients that honor
> maximum block size to obey the same maximum for ALL commands, regardless
> of amount of data sent over the wire.

In my view, we should not change this. Block sizes maxima are not there
to support DoS prevention (that's a separate phrase). They are there
to impose maximum block sizes. Adding a different maximum block size
for different commands is way too overengineered. There are after
all reasons (especially without structured replies) why you'd want
different maximum block sizes for writes and reads. If clients support
block sizes, they will necessarily have to have the infrastructure
to break requests up.

IE maximum block size should continue to mean maximum block size.

>> 
>> The core problem here is that the kernel (and, ahem, most servers) are
>> ignorant of the block size extension, and need to guess how to break
>> things up. In my view the client (kernel in this case) should
>> be breaking the trim requests up into whatever size it uses as the
>> maximum size write requests. But then it would have to know about block
>> sizes which are in (another) experimental extension.
> 
> Correct - no one has yet patched the kernel to honor block sizes
> advertised through what is currently an experimental extension.

Unsurprising as it's still experimental; it only settled down a couple
of weeks ago :-)

>  (We
> have ioctl(NBD_SET_BLKSIZE) which can be argued to set the kernel's
> minimum block size,

Technically that is 'out of band transmission of block size
constraints' :-)

> but I haven't audited whether the kernel actually
> guarantees that all client requests are sent aligned to the value passed
> that way - but we have nothing to set the maximum size,

indeed

> and are at the
> mercy of however the kernel currently decides to split large requests).

I am surprised TRIM doesn't get broken up the same way READ and WRITE
do.

> So the kernel is currently one of the clients that does NOT honor block
> sizes, and as such, servers should be prepared for ANY size up to
> UINT_MAX (other than DoS handling).

Or not to permit a connection.

>  My question above only applies to
> clients that use the experimental block size extensions.

Indeed.

>> What surprises me is that a release kernel is using experimental
>> NBD extensions; there's no guarantee these won't change. Or does
>> fstrim work some other way?
> 
> No extension in play.  The kernel is obeying NBD_FLAG_SEND_TRIM, which
> is in the normative standard, and unrelated to the INFO/BLOCK_SIZE
> extensions.

My mistake. I was confusing 'WRITE_ZEROES' with 'TRIM'.

--
Alex Bligh








Re: [Qemu-block] [Nbd] [Qemu-devel] [PATCH] nbd: fix trim/discard commands with a length bigger than NBD_MAX_BUFFER_SIZE

2016-05-10 Thread Eric Blake
On 05/10/2016 09:08 AM, Alex Bligh wrote:
> Eric,
> 
>> Hmm. The current wording of the experimental block size additions does
>> NOT allow the client to send a NBD_CMD_TRIM with a size larger than the
>> maximum NBD_CMD_WRITE:
>> https://github.com/yoe/nbd/blob/extension-info/doc/proto.md#block-size-constraints
> 
> Correct
> 
>> Maybe we should revisit that in the spec, and/or advertise yet another
>> block size (since the maximum size for a trim and/or write_zeroes
>> request may indeed be different than the maximum size for a read/write).
> 
> I think it's up to the server to either handle large requests, or
> for the client to break these up.

But the question at hand here is whether we should permit servers to
advertise multiple maximum block sizes (one for read/write, another one
for trim/write_zero, or even two [at least qemu tracks a separate
maximum trim vs. write_zero sizing in its generic block layer]), or
merely stick with the current wording that requires clients that honor
maximum block size to obey the same maximum for ALL commands, regardless
of amount of data sent over the wire.

> 
> The core problem here is that the kernel (and, ahem, most servers) are
> ignorant of the block size extension, and need to guess how to break
> things up. In my view the client (kernel in this case) should
> be breaking the trim requests up into whatever size it uses as the
> maximum size write requests. But then it would have to know about block
> sizes which are in (another) experimental extension.

Correct - no one has yet patched the kernel to honor block sizes
advertised through what is currently an experimental extension.  (We
have ioctl(NBD_SET_BLKSIZE) which can be argued to set the kernel's
minimum block size, but I haven't audited whether the kernel actually
guarantees that all client requests are sent aligned to the value passed
that way - but we have nothing to set the maximum size, and are at the
mercy of however the kernel currently decides to split large requests).
 So the kernel is currently one of the clients that does NOT honor block
sizes, and as such, servers should be prepared for ANY size up to
UINT_MAX (other than DoS handling).  My question above only applies to
clients that use the experimental block size extensions.

> 
> I've nothing against you fixing it in qemu's server (after all, I don't
> think there is anything to /prohibit/ a server working on something
> larger than the maximum block size), and ...
> 
>> But since the kernel is the one sending the large length request, and
>> since you are right that this is not a denial-of-service in the amount
>> of data being sent in a single NBD message,
> 
> ... currently there isn't actually a maximum block size advertised (that
> being in another experimental addition), so this is just DoS prevention,
> which qemu is free to do how it wishes.

Okay, sounds like that part is uncontroversial, and it is indeed worth
improving qemu's behavior.

> 
> What surprises me is that a release kernel is using experimental
> NBD extensions; there's no guarantee these won't change. Or does
> fstrim work some other way?

No extension in play.  The kernel is obeying NBD_FLAG_SEND_TRIM, which
is in the normative standard, and unrelated to the INFO/BLOCK_SIZE
extensions.

-- 
Eric Blake   eblake redhat com    +1-919-301-3266
Libvirt virtualization library http://libvirt.org





Re: [Qemu-block] [Nbd] [Qemu-devel] [PATCH] nbd: fix trim/discard commands with a length bigger than NBD_MAX_BUFFER_SIZE

2016-05-10 Thread Alex Bligh
Eric,

> Hmm. The current wording of the experimental block size additions does
> NOT allow the client to send a NBD_CMD_TRIM with a size larger than the
> maximum NBD_CMD_WRITE:
> https://github.com/yoe/nbd/blob/extension-info/doc/proto.md#block-size-constraints

Correct

> Maybe we should revisit that in the spec, and/or advertise yet another
> block size (since the maximum size for a trim and/or write_zeroes
> request may indeed be different than the maximum size for a read/write).

I think it's up to the server to either handle large requests, or
for the client to break these up.

The core problem here is that the kernel (and, ahem, most servers) are
ignorant of the block size extension, and need to guess how to break
things up. In my view the client (kernel in this case) should
be breaking the trim requests up into whatever size it uses as the
maximum size write requests. But then it would have to know about block
sizes which are in (another) experimental extension.

I've nothing against you fixing it in qemu's server (after all, I don't
think there is anything to /prohibit/ a server working on something
larger than the maximum block size), and ...

> But since the kernel is the one sending the large length request, and
> since you are right that this is not a denial-of-service in the amount
> of data being sent in a single NBD message,

... currently there isn't actually a maximum block size advertised (that
being in another experimental addition), so this is just DoS prevention,
which qemu is free to do how it wishes.

> I definitely agree that qemu
> would be wise as a quality-of-implementation to allow the larger size,
> for maximum interoperability, even if it exceeds advertised limits (that
> is, when no limits are advertised, we should handle everything possible
> if it is not so large as to be construed a denial-of-service, and
> NBD_CMD_TRIM is not large; and when limits ARE advertised, a client that
> violates limits is out of spec but we can still be liberal and respond
> successfully to such a client rather than having to outright reject it).
> So I think this patch is headed in the right direction.

I'd agree with that.

What surprises me is that a release kernel is using experimental
NBD extensions; there's no guarantee these won't change. Or does
fstrim work some other way?

Alex

> 
>> 
>> and looking at the kernel side implementation of the nbd device
>> (drivers/block/nbd.c) where it only sends the request header with no data
>> for a NBD_CMD_TRIM.
>> 
>> With this fix in, I am now able to run fstrim on my qcow2 images and keep
>> them small (or at least keep their size proportional to the amount of data
>> present on them).
>> 
>> Signed-off-by: Quentin Casasnovas 
>> CC: Paolo Bonzini 
>> CC: 
>> CC: 
>> CC: 
> 
> This is NOT trivial material and should not go in through that tree.
> However, I concur that it qualifies for a backport on a stable branch.
> 
>> ---
>> nbd.c | 8 +++-
>> 1 file changed, 7 insertions(+), 1 deletion(-)
>> 
>> diff --git a/nbd.c b/nbd.c
>> index b3d9654..e733669 100644
>> --- a/nbd.c
>> +++ b/nbd.c
>> @@ -1209,6 +1209,11 @@ static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
>>      return rc;
>>  }
>> 
>> +static bool nbd_should_check_request_size(const struct nbd_request *request)
>> +{
>> +    return (request->type & NBD_CMD_MASK_COMMAND) != NBD_CMD_TRIM;
>> +}
>> +
>>  static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
>>  {
>>      NBDClient *client = req->client;
>> @@ -1227,7 +1232,8 @@ static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *reque
>>          goto out;
>>      }
>> 
>> -    if (request->len > NBD_MAX_BUFFER_SIZE) {
>> +    if (nbd_should_check_request_size(request) &&
>> +        request->len > NBD_MAX_BUFFER_SIZE) {
> 
> I'd rather sort out the implications of this on the NBD protocol before
> taking anything into qemu.  We've got time on our hand, so let's use it
> to get this right.  (That, and I have several pending patches that
> conflict with this as part of adding WRITE_ZEROES and INFO_BLOCK_SIZE
> support, where it may be easier to resubmit this fix on top of my
> pending patches).
> 
>> LOG("len (%u) is larger than max len (%u)",
>> request->len, NBD_MAX_BUFFER_SIZE);
>> rc = -EINVAL;
>> 
> 
> --
> Eric Blake   eblake redhat com    +1-919-301-3266
> Libvirt virtualization library http://libvirt.org
> 

Re: [Qemu-block] [PATCH v7 00/19] block: kill sector-based blk_write/read

2016-05-10 Thread Kevin Wolf
On 06.05.2016 18:26, Eric Blake wrote:
> 2.7 material, depends on Kevin's block-next:
> git://repo.or.cz/qemu/kevin.git block-next
> 
> Previously posted as part of a larger NBD series [1] and as v6 [3].
> Mostly orthogonal to Kevin's recent work to also kill sector
> interfaces from the driver.
> 
> [1] https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg03526.html
> [2] https://lists.gnu.org/archive/html/qemu-devel/2016-05/msg00557.html
> 
> Also available as a tag at this location:
> git fetch git://repo.or.cz/qemu/ericb.git nbd-block-v7

Thanks, fixed up scsi-disk and applied to block-next.

Kevin



Re: [Qemu-block] [PATCH v7 06/19] scsi-disk: Switch to byte-based aio block access

2016-05-10 Thread Kevin Wolf
On 10.05.2016 14:56, Eric Blake wrote:
> On 05/10/2016 02:55 AM, Kevin Wolf wrote:
> > On 06.05.2016 18:26, Eric Blake wrote:
> >> Sector-based blk_aio_readv() and blk_aio_writev() should die; switch
> >> to byte-based blk_aio_preadv() and blk_aio_pwritev() instead.
> >>
> >> As part of the cleanup, scsi_init_iovec() no longer needs to return
> >> a value, and a comment is reworded.
> >>
> >> Signed-off-by: Eric Blake 
> >>
> 
> >> @@ -118,7 +118,6 @@ static uint32_t scsi_init_iovec(SCSIDiskReq *r, size_t size)
> >>  }
> >>  r->iov.iov_len = MIN(r->sector_count * 512, r->buflen);
> >>  qemu_iovec_init_external(&r->qiov, &r->iov, 1);
> >> -return r->qiov.size / 512;
> > 
> > The return value was MIN(r->sector_count, SCSI_DMA_BUF_SIZE / 512).
> > 
> >>  }
> >>
> >>  static void scsi_disk_save_request(QEMUFile *f, SCSIRequest *req)
> 
> >> -n = scsi_init_iovec(r, SCSI_DMA_BUF_SIZE);
> >> +scsi_init_iovec(r, SCSI_DMA_BUF_SIZE);
> >>  block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct,
> >> - n * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
> >> -r->req.aiocb = blk_aio_readv(s->qdev.conf.blk, r->sector, &r->qiov, n,
> >> - scsi_read_complete, r);
> >> + SCSI_DMA_BUF_SIZE, BLOCK_ACCT_READ);
> > 
> > But here you use SCSI_DMA_BUF_SIZE without considering r->sector_count.
> > Is this correct or are requests that are smaller than SCSI_DMA_BUF_SIZE
> > now accounted for more data than they actually read?
> > 
> > Would r->qiov.size be more obviously correct? You did that for writes.
> 
> Yes. Is that something you are able to fix on commit, or should I submit
> a fixup?

Okay, I can fix it up while applying.

Kevin




Re: [Qemu-block] [PATCH v14 0/3] qapi: child add/delete support

2016-05-10 Thread Max Reitz
On 10.05.2016 09:36, Changlong Xie wrote:
> ChangLog:
> v14:
> 1. Address comments from Berto and Max
> p2: introduce bdrv_drained_begin/end, rename last_index, remove redundant
> assert codes
> v13:
> 1. Rebase to the newest codes
> 2. Address comments from Berto and Max
> p1. Add R-B, fix incorrect syntax
> p2. Add missing "qemu/cutils.h" since 2.6, and rewrite quorum_add/del_child
> p3. Remove unnecessary "id", add "since 2.7"
> v11~v12:
> 1. Address comments from Max
> p1. Add R-B
> p2. Add R-B, remove unnecessary "endptr" "value"
> p3. Add R-B
> v10~v11:
> 1. Rebase to the newest codes
> 2. Address comment from Max
> Don't use contractions in error messages,
> p1: Remove R-Bs, and use "BdrvChild *child" in bdrv_del_child
> p2: Fix error logic in get_new_child_index/remove_child_index, and perfect
> child->name parsing
> p3: Make bdrv_find_child return BdrvChild *, and add missing explanation
> 
> v9~v10:
> 1. Rebase to the newest codes
> 2. Address comments from Berto and Max, update the documents in 
> block-core.json 
>and qmp-commands.hx 
> 3. Remove redundant codes in quorum_add_child() and quorum_del_child()
> v8:
> 1. Rebase to the newest codes
> 2. Address the comments from Eric Blake
> v7:
> 1. Remove the qmp command x-blockdev-change's parameter operation according
>to Kevin's comments.
> 2. Remove the hmp command.
> v6:
> 1. Use a single qmp command x-blockdev-change to replace x-blockdev-child-add
>and x-blockdev-child-delete
> v5:
> 1. Address Eric Blake's comments
> v4:
> 1. drop nbd driver's implementation. We can use human-monitor-command
>to do it.
> 2. Rename the command name.
> v3:
> 1. Don't open BDS in bdrv_add_child(). Use the existing BDS which is
>created by the QMP command blockdev-add.
> 2. The driver NBD can support filename, path, host:port now.
> v2:
> 1. Use bdrv_get_device_or_node_name() instead of new function
>bdrv_get_id_or_node_name()
> 2. Update the error message
> 3. Update the documents in block-core.json
> 
> Wen Congyang (3):
>   Add new block driver interface to add/delete a BDS's child
>   quorum: implement bdrv_add_child() and bdrv_del_child()
>   qmp: add monitor command to add/remove a child
> 
>  block.c   | 57 +++---
>  block/quorum.c| 78 +--
>  blockdev.c| 55 +
>  include/block/block.h |  8 +
>  include/block/block_int.h |  5 +++
>  qapi/block-core.json  | 32 +++
>  qmp-commands.hx   | 53 
>  7 files changed, 282 insertions(+), 6 deletions(-)

Thanks, I've applied the series to my block-next branch:

https://github.com/XanClic/qemu/commits/block-next

Max





Re: [Qemu-block] [PATCH v14 2/3] quorum: implement bdrv_add_child() and bdrv_del_child()

2016-05-10 Thread Max Reitz
On 10.05.2016 09:36, Changlong Xie wrote:
> From: Wen Congyang 
> 
> Signed-off-by: Wen Congyang 
> Signed-off-by: zhanghailiang 
> Signed-off-by: Gonglei 
> Signed-off-by: Changlong Xie 
> ---
>  block.c   |  8 +++---
>  block/quorum.c| 78 +--
>  include/block/block.h |  4 +++
>  3 files changed, 84 insertions(+), 6 deletions(-)

Reviewed-by: Max Reitz 





Re: [Qemu-block] [Qemu-devel] [PATCH] nbd: fix trim/discard commands with a length bigger than NBD_MAX_BUFFER_SIZE

2016-05-10 Thread Eric Blake
[adding nbd-devel, qemu-block]

On 05/06/2016 02:45 AM, Quentin Casasnovas wrote:
> When running fstrim on a filesystem mounted through qemu-nbd with
> --discard=on, fstrim would fail with I/O errors:
> 
>   $ fstrim /k/spl/ice/
>   fstrim: /k/spl/ice/: FITRIM ioctl failed: Input/output error
> 
> and qemu-nbd was spitting these:
> 
>   nbd.c:nbd_co_receive_request():L1232: len (94621696) is larger than max len (33554432)
> 

> 
> The length of the request seems huge but this is really just the filesystem
> telling the block device driver that "this length should be trimmed", and,
> unlike for a NBD_CMD_READ or NBD_CMD_WRITE, we'll not try to read/write
> that amount of data from/to the NBD socket.  It is thus safe to remove the
> length check for a NBD_CMD_TRIM.
> 
> I've confirmed this with both the protocol documentation at:
> 
>  https://github.com/yoe/nbd/blob/master/doc/proto.md

Hmm. The current wording of the experimental block size additions does
NOT allow the client to send a NBD_CMD_TRIM with a size larger than the
maximum NBD_CMD_WRITE:
https://github.com/yoe/nbd/blob/extension-info/doc/proto.md#block-size-constraints

Maybe we should revisit that in the spec, and/or advertise yet another
block size (since the maximum size for a trim and/or write_zeroes
request may indeed be different than the maximum size for a read/write).

But since the kernel is the one sending the large length request, and
since you are right that this is not a denial-of-service in the amount
of data being sent in a single NBD message, I definitely agree that qemu
would be wise as a quality-of-implementation to allow the larger size,
for maximum interoperability, even if it exceeds advertised limits (that
is, when no limits are advertised, we should handle everything possible
if it is not so large as to be construed a denial-of-service, and
NBD_CMD_TRIM is not large; and when limits ARE advertised, a client that
violates limits is out of spec but we can still be liberal and respond
successfully to such a client rather than having to outright reject it).
 So I think this patch is headed in the right direction.

> 
> and looking at the kernel side implementation of the nbd device
> (drivers/block/nbd.c) where it only sends the request header with no data
> for a NBD_CMD_TRIM.
> 
> With this fix in, I am now able to run fstrim on my qcow2 images and keep
> them small (or at least keep their size proportional to the amount of data
> present on them).
> 
> Signed-off-by: Quentin Casasnovas 
> CC: Paolo Bonzini 
> CC: 
> CC: 
> CC: 

This is NOT trivial material and should not go in through that tree.
However, I concur that it qualifies for a backport on a stable branch.

> ---
>  nbd.c | 8 +++-
>  1 file changed, 7 insertions(+), 1 deletion(-)
> 
> diff --git a/nbd.c b/nbd.c
> index b3d9654..e733669 100644
> --- a/nbd.c
> +++ b/nbd.c
> @@ -1209,6 +1209,11 @@ static ssize_t nbd_co_send_reply(NBDRequest *req, struct nbd_reply *reply,
>  return rc;
>  }
>  
> +static bool nbd_should_check_request_size(const struct nbd_request *request)
> +{
> +return (request->type & NBD_CMD_MASK_COMMAND) != NBD_CMD_TRIM;
> +}
> +
>  static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
>  {
>  NBDClient *client = req->client;
> @@ -1227,7 +1232,8 @@ static ssize_t nbd_co_receive_request(NBDRequest *req, struct nbd_request *request)
>  goto out;
>  }
>  
> -if (request->len > NBD_MAX_BUFFER_SIZE) {
> +if (nbd_should_check_request_size(request) &&
> +request->len > NBD_MAX_BUFFER_SIZE) {

I'd rather sort out the implications of this on the NBD protocol before
taking anything into qemu.  We've got time on our hand, so let's use it
to get this right.  (That, and I have several pending patches that
conflict with this as part of adding WRITE_ZEROES and INFO_BLOCK_SIZE
support, where it may be easier to resubmit this fix on top of my
pending patches).

>  LOG("len (%u) is larger than max len (%u)",
>  request->len, NBD_MAX_BUFFER_SIZE);
>  rc = -EINVAL;
> 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org





Re: [Qemu-block] [PATCH v7 06/19] scsi-disk: Switch to byte-based aio block access

2016-05-10 Thread Eric Blake
On 05/10/2016 02:55 AM, Kevin Wolf wrote:
> On 06.05.2016 at 18:26, Eric Blake wrote:
>> Sector-based blk_aio_readv() and blk_aio_writev() should die; switch
>> to byte-based blk_aio_preadv() and blk_aio_pwritev() instead.
>>
>> As part of the cleanup, scsi_init_iovec() no longer needs to return
>> a value, and a comment is reworded.
>>
>> Signed-off-by: Eric Blake 
>>

>> @@ -118,7 +118,6 @@ static uint32_t scsi_init_iovec(SCSIDiskReq *r, size_t size)
>>  }
>>  r->iov.iov_len = MIN(r->sector_count * 512, r->buflen);
>>  qemu_iovec_init_external(&r->qiov, &r->iov, 1);
>> -return r->qiov.size / 512;
> 
> The return value was MIN(r->sector_count, SCSI_DMA_BUF_SIZE / 512).
> 
>>  }
>>
>>  static void scsi_disk_save_request(QEMUFile *f, SCSIRequest *req)

>> -n = scsi_init_iovec(r, SCSI_DMA_BUF_SIZE);
>> +scsi_init_iovec(r, SCSI_DMA_BUF_SIZE);
>>  block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct,
>> - n * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
>> -r->req.aiocb = blk_aio_readv(s->qdev.conf.blk, r->sector, &r->qiov, n,
>> - scsi_read_complete, r);
>> + SCSI_DMA_BUF_SIZE, BLOCK_ACCT_READ);
> 
> But here you use SCSI_DMA_BUF_SIZE without considering r->sector_count.
> Is this correct or are requests that are smaller than SCSI_DMA_BUF_SIZE
> now accounted for more data than they actually read?
> 
> Would r->qiov.size be more obviously correct? You did that for writes.

Yes. Is that something you are able to fix on commit, or should I submit
a fixup?
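
For reference, a minimal sketch of the fix being discussed, assuming the
byte-based blk_aio_preadv() interface introduced by this series (the exact
surrounding code in scsi-disk.c may differ):

    /* Sketch only: account exactly the bytes this request will read,
     * which can be less than SCSI_DMA_BUF_SIZE for short requests. */
    scsi_init_iovec(r, SCSI_DMA_BUF_SIZE);
    block_acct_start(blk_get_stats(s->qdev.conf.blk), &r->acct,
                     r->qiov.size, BLOCK_ACCT_READ);
    r->req.aiocb = blk_aio_preadv(s->qdev.conf.blk,
                                  r->sector * BDRV_SECTOR_SIZE,
                                  &r->qiov, 0, scsi_read_complete, r);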

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org





Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Kevin Wolf
On 10.05.2016 at 14:22, Daniel P. Berrange wrote:
> On Tue, May 10, 2016 at 01:11:30PM +0100, Richard W.M. Jones wrote:
> > At no point did I say that it was safe to use libguestfs on live VMs
> > or that you would always get consistent data out.
> > 
> > But the fact that it can fail is understood, the chance of failure is
> > really tiny (it has literally only happened twice that I've read
> > corrupted data, in years of daily use), and the operation is very
> > useful.

Is it understood by you (which I'm happy to believe you) or by all of
your users (which I doubt)?

> > So I think this patch series should either not lock r/o VMs, or should
> > add a nolock flag to override the locking (which libguestfs will
> > always use).
> 
> If QEMU locks r/o disks, then libvirt would likely end up setting the
> "nolock" flag unconditionally too, in order to avoid breaking libguestfs
> and other application usage of libvirt.

It would still have some value as we would protect command line users at
least. Of course I would prefer if libvirt could find a way to protect
their users as well without breaking valid use cases. But if we can
point at libvirt, I guess that's good for qemu at least...

Kevin



Re: [Qemu-block] [PATCHv2 0/3] Better 'Force Unit Access' handling

2016-05-10 Thread Kevin Wolf
On 04.05.2016 at 00:39, Eric Blake wrote:
> I noticed some inconsistencies in FUA handling while working
> with NBD, then Kevin pointed out that my initial attempt wasn't
> quite right for iscsi which also had problems, so this has
> expanded into a series rather than a single patch.
> 
> I'm not sure if this is qemu-stable material at this point.
> 
> Depends on Kevin's block-next branch.
> 
> Also available as a tag at this location:
> git fetch git://repo.or.cz/qemu/ericb.git nbd-fua-v2

Reviewed-by: Kevin Wolf 

Stefan, if you're happy with the series, I can merge it into block-next.

Kevin



Re: [Qemu-block] [PATCH v2 10/13] block: Decouple throttling from BlockDriverState

2016-05-10 Thread Alberto Garcia
On Fri 22 Apr 2016 07:42:39 PM CEST, Kevin Wolf wrote:
> This moves the throttling related part of the BDS life cycle management
> to BlockBackend. The throttling group reference is now kept even when no
> medium is inserted.
>
> With this commit, throttling isn't disabled and then re-enabled any more
> during graph reconfiguration. This fixes the temporary breakage of I/O
> throttling when used with live snapshots or block jobs that manipulate
> the graph.
>
> Signed-off-by: Kevin Wolf 

Reviewed-by: Alberto Garcia 

Berto



Re: [Qemu-block] [PATCH v2 05/13] block: Move throttling fields from BDS to BB

2016-05-10 Thread Alberto Garcia
On Fri 22 Apr 2016 07:42:34 PM CEST, Kevin Wolf wrote:
>  typedef struct BlockBackendPublic {
> -/* I/O throttling */
> +/* I/O throttling.
> + * throttle_state tells us if this BDS has I/O limits configured.
> + * io_limits_disabled tells us if they are currently being enforced */

I overlooked this earlier... this is now in BlockBackend but the comment
still refers to a BDS. There's also a couple more cases of this in
throttle-groups.c.

Berto



Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Daniel P. Berrange
On Tue, May 10, 2016 at 01:11:30PM +0100, Richard W.M. Jones wrote:
> At no point did I say that it was safe to use libguestfs on live VMs
> or that you would always get consistent data out.
> 
> But the fact that it can fail is understood, the chance of failure is
> really tiny (it has literally only happened twice that I've read
> corrupted data, in years of daily use), and the operation is very
> useful.
> 
> So I think this patch series should either not lock r/o VMs, or should
> add a nolock flag to override the locking (which libguestfs will
> always use).

If QEMU locks r/o disks, then libvirt would likely end up setting the
"nolock" flag unconditionally too, in order to avoid breaking libguestfs
and other application usage of libvirt.

Regards,
Daniel
-- 
|: http://berrange.com  -o-http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org  -o- http://virt-manager.org :|
|: http://autobuild.org   -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org   -o-   http://live.gnome.org/gtk-vnc :|



Re: [Qemu-block] [PATCH v2 13/13] block: Don't check throttled reqs in bdrv_requests_pending()

2016-05-10 Thread Alberto Garcia
On Fri 22 Apr 2016 07:42:42 PM CEST, Kevin Wolf wrote:
> Checking whether there are throttled requests requires going to the
> associated BlockBackend, which we want to avoid. All users of
> bdrv_requests_pending() already call bdrv_parent_drained_begin()
> first,

There are a couple of assert(!bdrv_requests_pending()) calls; is it also
the case with them?

Berto



Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Richard W.M. Jones
At no point did I say that it was safe to use libguestfs on live VMs
or that you would always get consistent data out.

But the fact that it can fail is understood, the chance of failure is
really tiny (it has literally only happened twice that I've read
corrupted data, in years of daily use), and the operation is very
useful.

So I think this patch series should either not lock r/o VMs, or should
add a nolock flag to override the locking (which libguestfs will
always use).

That's all I have to say on this.

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
Fedora Windows cross-compiler. Compile Windows programs, test, and
build Windows installers. Over 100 libraries supported.
http://fedoraproject.org/wiki/MinGW



Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Kevin Wolf
On 10.05.2016 at 13:46, Richard W.M. Jones wrote:
> On Tue, May 10, 2016 at 01:08:49PM +0200, Kevin Wolf wrote:
> > Are you saying that libguestfs only allows operations like df on live
> > images, but not e.g. copying files out of the VM?
> [...]
> 
> virt-copy-out will let you copy out files from a live VM.
> 
> There's no difference between "safe" and "unsafe" operations, because
> (a) it depends on unknowable information about the guest -- it's safe
> to read (even write) a filesystem if it's not mounted by the guest,
> and (b) even reading a superblock field from an in-use mounted
> filesystem is subject to an unlikely but possible race.

The consequences of a failure are different. Specifically, in the quote
that you stripped from this email, you said as a justification for
ignoring the possible problems:

> However _reading_ disks doesn't corrupt live VMs.  The worst that
> happens is guestfish will error out or you'll see some inconsistent
> stats from virt-df.

Creating corrupted copies of data is much worse than wrong stats in a
virt-df result, in my opinion.

> Users of libguestfs on live VMs just have to be aware of this, and we
> make them aware over and over again of the potential problems.

The correct way to make them aware is requiring something like a --force
flag. Just writing it in the documentation is not much more than an
excuse to tell the users that it was their own fault.

> Importantly, readonly access won't result in corrupt filesystems in
> the live VM.

Yes, it only results in corrupt data in the copy. Is that much better?

> I'm much more interested in stopping people from writing to live VMs.
> That is a serious problem, results in unrecoverable filesystems and
> near-100% certain data loss [especially with journalled fses], and is
> something that has no (or very very few) valid use cases.  It's also
> something which only qemu is in a position to properly protect
> against.

Nobody is saying that we shouldn't protect against writing to live VMs.

We're discussing that you are objecting to protecting users against
reading from live VMs, which may not be quite as disastrous in many
cases, but can result in corruption, too. And here as well only qemu is
in the position to properly protect against it.

Kevin



Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Richard W.M. Jones
On Tue, May 10, 2016 at 01:08:49PM +0200, Kevin Wolf wrote:
> Are you saying that libguestfs only allows operations like df on live
> images, but not e.g. copying files out of the VM?
[...]

virt-copy-out will let you copy out files from a live VM.

There's no difference between "safe" and "unsafe" operations, because
(a) it depends on unknowable information about the guest -- it's safe
to read (even write) a filesystem if it's not mounted by the guest,
and (b) even reading a superblock field from an in-use mounted
filesystem is subject to an unlikely but possible race.

Users of libguestfs on live VMs just have to be aware of this, and we
make them aware over and over again of the potential problems.
Importantly, readonly access won't result in corrupt filesystems in
the live VM.

I'm much more interested in stopping people from writing to live VMs.
That is a serious problem, results in unrecoverable filesystems and
near-100% certain data loss [especially with journalled fses], and is
something that has no (or very very few) valid use cases.  It's also
something which only qemu is in a position to properly protect
against.

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
virt-top is 'top' for virtual machines.  Tiny program with many
powerful monitoring features, net stats, disk stats, logging, etc.
http://people.redhat.com/~rjones/virt-top



Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Kevin Wolf
On 10.05.2016 at 12:29, Daniel P. Berrange wrote:
> On Tue, May 10, 2016 at 12:07:06PM +0200, Kevin Wolf wrote:
> > On 10.05.2016 at 11:43, Daniel P. Berrange wrote:
> > > On Tue, May 10, 2016 at 11:35:14AM +0200, Kevin Wolf wrote:
> > > > On 10.05.2016 at 11:23, Daniel P. Berrange wrote:
> > > > > On Tue, May 10, 2016 at 11:14:22AM +0200, Kevin Wolf wrote:
> > > > > > On 10.05.2016 at 10:50, Daniel P. Berrange wrote:
> > > > > > > On Tue, May 10, 2016 at 09:43:04AM +0100, Richard W.M. Jones wrote:
> > > > > > > > On Tue, May 10, 2016 at 09:14:26AM +0100, Richard W.M. Jones wrote:
> > > > > > > > > However I didn't test the write-shareable case (the libvirt
> > > > > > > > >  flag which should map to a shared lock -- is 
> > > > > > > > > that right Dan?).
> > > > > > > > 
> > > > > > > > To Dan (mainly): I think setting the  flag in 
> > > > > > > > libvirt only
> > > > > > > > sets cache=unsafe on the qemu drive (it may have other effects 
> > > > > > > > for
> > > > > > > > virtlockd).  Should there be an extra qemu drive flag to 
> > > > > > > > communicate
> > > > > > > > that the drive is write-shareable?
> > > > > > > 
> > > > > > > Sure, if QEMU had a way to indicate that the disk was used in a
> > > > > > > write-sharable mode, libvirt would use it.
> > > > > > > 
> > > > > > > I agree with your general point that we should do no locking for
> > > > > > > read-only access, F_RDLCK for shared-write and F_WRLCK for
> > > > > > > exclusive-write access. I think I made that point similarly 
> > > > > > > against
> > > > > > > an earlier version of this patchset
> > > > > > 
> > > > > > Why should we do no locking for read-only access by default? If an 
> > > > > > image
> > > > > > is written to, read-only accesses are potentially inconsistent and 
> > > > > > we
> > > > > > should protect users against it.
> > > > > > 
> > > > > > The only argument I can see in the old versions of this series is
> > > > > > "libguestfs does it and usually it gets away with it". For me, 
> > > > > > that's
> > > > > > not an argument for generally allowing this, but at most for 
> > > > > > providing a
> > > > > > switch to bypass the locking.
> > > > > > 
> > > > > > Because let's be clear about this: If someone lost data because they
> > > > > > took an inconsistent backup this way and comes to us qemu 
> > > > > > developers,
> > > > > > all we can tell them is "sorry, what you did is not supported". And
> > > > > > that's a pretty strong sign that locking should prevent it by 
> > > > > > default.
> > > > > 
> > > > > We have 3 usage scenarios - readonly-share, writable-shared and
> > > > > writable-exclusive, and only 2 lock types to play with. This series
> > > > > does no locking at all in the writable-shared case, so we still have
> > > > > the problem you describe in that someone opening the image for 
> > > > > readonly
> > > > > access will succeeed even when it is used writable-shared by another
> > > > > process.
> > > > > 
> > > > > So we have to pick a trade-off here. IMHO it is more important to
> > > > > ensure we have locks in both the write-shared & write-exclusive case,
> > > > > as both of those can definitely result in corrupted images if not
> > > > > careful, whereas read-only access merely results in your reading
> > > > > bad data, no risk of corrupting the original image.
> > > > 
> > > > I agree that we want locking for the writable-shared case. That doesn't
> > > > mean no locking for read-only, though. I think read-only and writeable-
> > > > shared are the same thing as far as locking is concerned.
> > > 
> > > It doesn't make much sense to say that it is unsafe to use read-only
> > > in combination with write-exclusive, but then allow read-only with
> > > write-shared. In both cases you have the same scenario that the
> > > read-only app may get inconsistent data when reading.
> > 
> > Doesn't write-shared usually indicate that the contents can actually be
> > consistently read/written to, for example because a cluster filesystem
> > is used? If you couldn't, you would use write-exclusive instead.
> > 
> > In contrast, write-exclusive indicates that correct concurrent access
> > isn't possible, for example because you're using a "normal" filesystem
> > that might additionally keep things in a writeback cache in the guest.
> > You wouldn't use write-shared there.
> > 
> > > > This is the matrix of the allowed combinations (without a manual lock
> > > > override that enables libguestfs to use unsupported cases), as I see it:
> > > > 
> > > >                  wr-excl  wr-shared  read-only
> > > > write-exclusive  no       no         no
> > > > write-shared     no       yes        yes
> > > > read-only        no       yes        yes
> > > > 
> > > > Do you disagree with any of the entries?
> > > 
> > > I would have 'yes' in all the read-only cells.
> > 
> > I'm surprised how low the standards seem to be when we're talking about
> > data integrity. If occasionally losing data is okay, the qemu block
> > layer could be quite a bit simpler.

Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Kevin Wolf
On 10.05.2016 at 12:16, Richard W.M. Jones wrote:
> On Tue, May 10, 2016 at 12:07:06PM +0200, Kevin Wolf wrote:
> > I'm surprised how low the standards seem to be when we're talking about
> > data integrity. If occasionally losing data is okay, the qemu block
> > layer could be quite a bit simpler.
> 
> I welcome this patch because it fixes a real data integrity issue
> which we've seen in the field: people using guestfish (in write mode)
> on live VMs.

Yes, people writing to live disks is a very real issue. They don't only
use guestfish for it, but also qemu-img (e.g. taking snapshots), and
these are the cases that become visible on the qemu mailing list.

But if you imply that the read-only case isn't real, I have to disagree.
Sometimes people also try to copy data out from a live VM with qemu-img
convert, and while this seems to succeed, they may actually have
produced a corrupt copy. This is why I want to protect the read-only
case as well.

> We try our very best to prevent this happening -- for example if you
> use guestfish via libvirt, it will check if the VM is live and refuse
> access.  Though this is not and cannot be bulletproof (since someone
> can start a VM up after we have opened it).  We also have prominent
> warnings in the manual and in the FAQ about this.
> 
> However _reading_ disks doesn't corrupt live VMs.  The worst that
> happens is guestfish will error out or you'll see some inconsistent
> stats from virt-df.

Are you saying that libguestfs only allows operations like df on live
images, but not e.g. copying files out of the VM?

Because if copying data out was allowed, I'd suspect that people would
use it on live VMs and would be surprised if they didn't get what they
expected (which they often only notice when it's too late).

I guess you're right that we can tolerate some df command not being 100%
sure to return the right numbers, but it's a very special case and I
think it's not demanding too much if you need to pass a lock-override
flag when you do something like this, when this can protect people
against accidentally creating corrupted copies.

> None of this has anything to do with data integrity in the qemu block
> layer, and no one is arguing that it should be weakened.

We're talking about real data corruption in both cases.

Kevin



Re: [Qemu-block] [PATCH v4 8/8] linux-aio: share one LinuxAioState within an AioContext

2016-05-10 Thread Paolo Bonzini


On 10/05/2016 11:40, Kevin Wolf wrote:
> > Regarding performance, I'm thinking about a guest with 8 disks (queue
> > depth 32).  The worst case is when the guest submits 32 requests at once
> > but the Linux AIO event limit has already been reached.  Then the disk
> > is starved until other disks' requests complete.
>
> Sounds like a valid concern.

Oh, so you're concerned about the non-dataplane case.  My suspicion is
that, with such a number of outstanding I/O, you probably have bad
performance unless you use dataplane.

Also, aio=threads has had a 64 I/O limit for years and we've never heard
about problems with stuck I/O.  But I agree that it's one area to keep
an eye on.

Paolo



Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Daniel P. Berrange
On Tue, May 10, 2016 at 12:07:06PM +0200, Kevin Wolf wrote:
> On 10.05.2016 at 11:43, Daniel P. Berrange wrote:
> > On Tue, May 10, 2016 at 11:35:14AM +0200, Kevin Wolf wrote:
> > > On 10.05.2016 at 11:23, Daniel P. Berrange wrote:
> > > > On Tue, May 10, 2016 at 11:14:22AM +0200, Kevin Wolf wrote:
> > > > > On 10.05.2016 at 10:50, Daniel P. Berrange wrote:
> > > > > > On Tue, May 10, 2016 at 09:43:04AM +0100, Richard W.M. Jones wrote:
> > > > > > > On Tue, May 10, 2016 at 09:14:26AM +0100, Richard W.M. Jones wrote:
> > > > > > > > However I didn't test the write-shareable case (the libvirt
> > > > > > > >  flag which should map to a shared lock -- is that 
> > > > > > > > right Dan?).
> > > > > > > 
> > > > > > > To Dan (mainly): I think setting the  flag in libvirt 
> > > > > > > only
> > > > > > > sets cache=unsafe on the qemu drive (it may have other effects for
> > > > > > > virtlockd).  Should there be an extra qemu drive flag to 
> > > > > > > communicate
> > > > > > > that the drive is write-shareable?
> > > > > > 
> > > > > > Sure, if QEMU had a way to indicate that the disk was used in a
> > > > > > write-sharable mode, libvirt would use it.
> > > > > > 
> > > > > > I agree with your general point that we should do no locking for
> > > > > > read-only access, F_RDLCK for shared-write and F_WRLCK for
> > > > > > exclusive-write access. I think I made that point similarly against
> > > > > > an earlier version of this patchset
> > > > > 
> > > > > Why should we do no locking for read-only access by default? If an 
> > > > > image
> > > > > is written to, read-only accesses are potentially inconsistent and we
> > > > > should protect users against it.
> > > > > 
> > > > > The only argument I can see in the old versions of this series is
> > > > > "libguestfs does it and usually it gets away with it". For me, that's
> > > > > not an argument for generally allowing this, but at most for 
> > > > > providing a
> > > > > switch to bypass the locking.
> > > > > 
> > > > > Because let's be clear about this: If someone lost data because they
> > > > > took an inconsistent backup this way and comes to us qemu developers,
> > > > > all we can tell them is "sorry, what you did is not supported". And
> > > > > that's a pretty strong sign that locking should prevent it by default.
> > > > 
> > > > We have 3 usage scenarios - readonly-share, writable-shared and
> > > > writable-exclusive, and only 2 lock types to play with. This series
> > > > does no locking at all in the writable-shared case, so we still have
> > > > the problem you describe in that someone opening the image for readonly
> > > > access will succeed even when it is used writable-shared by another
> > > > process.
> > > > 
> > > > So we have to pick a trade-off here. IMHO it is more important to
> > > > ensure we have locks in both the write-shared & write-exclusive case,
> > > > as both of those can definitely result in corrupted images if not
> > > > careful, whereas read-only access merely results in your reading
> > > > bad data, no risk of corrupting the original image.
> > > 
> > > I agree that we want locking for the writable-shared case. That doesn't
> > > mean no locking for read-only, though. I think read-only and writeable-
> > > shared are the same thing as far as locking is concerned.
> > 
> > It doesn't make much sense to say that it is unsafe to use read-only
> > in combination with write-exclusive, but then allow read-only with
> > write-shared. In both cases you have the same scenario that the
> > read-only app may get inconsistent data when reading.
> 
> Doesn't write-shared usually indicate that the contents can actually be
> consistently read/written to, for example because a cluster filesystem
> is used? If you couldn't, you would use write-exclusive instead.
> 
> In contrast, write-exclusive indicates that correct concurrent access
> isn't possible, for example because you're using a "normal" filesystem
> that might additionally keep things in a writeback cache in the guest.
> You wouldn't use write-shared there.
> 
> > > This is the matrix of the allowed combinations (without a manual lock
> > > override that enables libguestfs to use unsupported cases), as I see it:
> > > 
> > >                  wr-excl  wr-shared  read-only
> > > write-exclusive  no       no         no
> > > write-shared     no       yes        yes
> > > read-only        no       yes        yes
> > > 
> > > Do you disagree with any of the entries?
> > 
> > I would have 'yes' in all the read-only cells.
> 
> I'm surprised how low the standards seem to be when we're talking about
> data integrity. If occasionally losing data is okay, the qemu block
> layer could be quite a bit simpler.

To my mind there are two separate issues at play here. Protection against
two processes writing to the same image in a way that causes unrecoverable
corruption. Protection against reading bad data from an image that is
being written to at the same time.

Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Richard W.M. Jones
On Tue, May 10, 2016 at 12:07:06PM +0200, Kevin Wolf wrote:
> I'm surprised how low the standards seem to be when we're talking about
> data integrity. If occasionally losing data is okay, the qemu block
> layer could be quite a bit simpler.

I welcome this patch because it fixes a real data integrity issue
which we've seen in the field: people using guestfish (in write mode)
on live VMs.

We try our very best to prevent this happening -- for example if you
use guestfish via libvirt, it will check if the VM is live and refuse
access.  Though this is not and cannot be bulletproof (since someone
can start a VM up after we have opened it).  We also have prominent
warnings in the manual and in the FAQ about this.

However _reading_ disks doesn't corrupt live VMs.  The worst that
happens is guestfish will error out or you'll see some inconsistent
stats from virt-df.

None of this has anything to do with data integrity in the qemu block
layer, and no one is arguing that it should be weakened.

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
virt-top is 'top' for virtual machines.  Tiny program with many
powerful monitoring features, net stats, disk stats, logging, etc.
http://people.redhat.com/~rjones/virt-top



Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Kevin Wolf
On 10.05.2016 at 11:43, Daniel P. Berrange wrote:
> On Tue, May 10, 2016 at 11:35:14AM +0200, Kevin Wolf wrote:
> > On 10.05.2016 at 11:23, Daniel P. Berrange wrote:
> > > On Tue, May 10, 2016 at 11:14:22AM +0200, Kevin Wolf wrote:
> > > > On 10.05.2016 at 10:50, Daniel P. Berrange wrote:
> > > > > On Tue, May 10, 2016 at 09:43:04AM +0100, Richard W.M. Jones wrote:
> > > > > > On Tue, May 10, 2016 at 09:14:26AM +0100, Richard W.M. Jones wrote:
> > > > > > > However I didn't test the write-shareable case (the libvirt
> > > > > > >  flag which should map to a shared lock -- is that 
> > > > > > > right Dan?).
> > > > > > 
> > > > > > To Dan (mainly): I think setting the  flag in libvirt 
> > > > > > only
> > > > > > sets cache=unsafe on the qemu drive (it may have other effects for
> > > > > > virtlockd).  Should there be an extra qemu drive flag to communicate
> > > > > > that the drive is write-shareable?
> > > > > 
> > > > > Sure, if QEMU had a way to indicate that the disk was used in a
> > > > > write-sharable mode, libvirt would use it.
> > > > > 
> > > > > I agree with your general point that we should do no locking for
> > > > > read-only access, F_RDLCK for shared-write and F_WRLCK for
> > > > > exclusive-write access. I think I made that point similarly against
> > > > > an earlier version of this patchset
> > > > 
> > > > Why should we do no locking for read-only access by default? If an image
> > > > is written to, read-only accesses are potentially inconsistent and we
> > > > should protect users against it.
> > > > 
> > > > The only argument I can see in the old versions of this series is
> > > > "libguestfs does it and usually it gets away with it". For me, that's
> > > > not an argument for generally allowing this, but at most for providing a
> > > > switch to bypass the locking.
> > > > 
> > > > Because let's be clear about this: If someone lost data because they
> > > > took an inconsistent backup this way and comes to us qemu developers,
> > > > all we can tell them is "sorry, what you did is not supported". And
> > > > that's a pretty strong sign that locking should prevent it by default.
> > > 
> > > We have 3 usage scenarios - readonly-share, writable-shared and
> > > writable-exclusive, and only 2 lock types to play with. This series
> > > does no locking at all in the writable-shared case, so we still have
> > > the problem you describe in that someone opening the image for readonly
> > > access will succeed even when it is used writable-shared by another
> > > process.
> > > 
> > > So we have to pick a trade-off here. IMHO it is more important to
> > > ensure we have locks in both the write-shared & write-exclusive case,
> > > as both of those can definitely result in corrupted images if not
> > > careful, whereas read-only access merely results in your reading
> > > bad data, no risk of corrupting the original image.
> > 
> > I agree that we want locking for the writable-shared case. That doesn't
> > mean no locking for read-only, though. I think read-only and writeable-
> > shared are the same thing as far as locking is concerned.
> 
> It doesn't make much sense to say that it is unsafe to use read-only
> in combination with write-exclusive, but then allow read-only with
> write-shared. In both cases you have the same scenario that the
> read-only app may get inconsistent data when reading.

Doesn't write-shared usually indicate that the contents can actually be
consistently read/written to, for example because a cluster filesystem
is used? If you couldn't, you would use write-exclusive instead.

In contrast, write-exclusive indicates that correct concurrent access
isn't possible, for example because you're using a "normal" filesystem
that might additionally keep things in a writeback cache in the guest.
You wouldn't use write-shared there.

> > This is the matrix of the allowed combinations (without a manual lock
> > override that enables libguestfs to use unsupported cases), as I see it:
> > 
> >                  wr-excl  wr-shared  read-only
> > write-exclusive  no       no         no
> > write-shared     no       yes        yes
> > read-only        no       yes        yes
> > 
> > Do you disagree with any of the entries?
> 
> I would have 'yes' in all the read-only cells.

I'm surprised how low the standards seem to be when we're talking about
data integrity. If occasionally losing data is okay, the qemu block
layer could be quite a bit simpler.

Kevin



Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Richard W.M. Jones

On Tue, May 10, 2016 at 11:14:22AM +0200, Kevin Wolf wrote:
> On 10.05.2016 at 10:50, Daniel P. Berrange wrote:
> > On Tue, May 10, 2016 at 09:43:04AM +0100, Richard W.M. Jones wrote:
> > > On Tue, May 10, 2016 at 09:14:26AM +0100, Richard W.M. Jones wrote:
> > > > However I didn't test the write-shareable case (the libvirt
> > > >  flag which should map to a shared lock -- is that right 
> > > > Dan?).
> > > 
> > > To Dan (mainly): I think setting the  flag in libvirt only
> > > sets cache=unsafe on the qemu drive (it may have other effects for
> > > virtlockd).  Should there be an extra qemu drive flag to communicate
> > > that the drive is write-shareable?
> > 
> > Sure, if QEMU had a way to indicate that the disk was used in a
> > write-sharable mode, libvirt would use it.
> > 
> > I agree with your general point that we should do no locking for
> > read-only access, F_RDLCK for shared-write and F_WRLCK for
> > exclusive-write access. I think I made that point similarly against
> > an earlier version of this patchset
> 
> Why should we do no locking for read-only access by default? If an image
> is written to, read-only accesses are potentially inconsistent and we
> should protect users against it.
> 
> The only argument I can see in the old versions of this series is
> "libguestfs does it and usually it gets away with it". For me, that's
> not an argument for generally allowing this, but at most for providing a
> switch to bypass the locking.

Most of the interesting stats derived from disks come from the
superblock (eg. virt-df information) or parts of the filesystem that
rarely change (for example contents of /usr).  I don't claim that this
is always safe, but I do claim that it works almost all the time, and
when it doesn't, it doesn't especially matter because you're
collecting time series data where one missed stat is irrelevant.

> Because let's be clear about this: If someone lost data because they
> took an inconsistent backup this way and comes to us qemu developers,
> all we can tell them is "sorry, what you did is not supported". And
> that's a pretty strong sign that locking should prevent it by default.

Mostly that would happen because someone copies the disk image (eg.
rsync /var/lib/libvirt/images/*.qcow2 /backups).  Nothing in this
patch or anywhere else will protect against that.

Libguestfs has prominent warnings in every manual page about both
writing to live VMs and reading live VMs, eg:
http://libguestfs.org/guestfish.1.html#warning

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
virt-df lists disk usage of guests without needing to install any
software inside the virtual machine.  Supports Linux and Windows.
http://people.redhat.com/~rjones/virt-df/



Re: [Qemu-block] [Qemu-devel] [PATCH v18 7/8] Implement new driver for block replication

2016-05-10 Thread Changlong Xie

On 05/06/2016 11:46 PM, Stefan Hajnoczi wrote:

On Fri, Apr 15, 2016 at 04:10:37PM +0800, Changlong Xie wrote:

+static void replication_close(BlockDriverState *bs)
+{
+    BDRVReplicationState *s = bs->opaque;
+
+    if (s->mode == REPLICATION_MODE_SECONDARY) {
+        g_free(s->top_id);
+    }
+
+    if (s->replication_state == BLOCK_REPLICATION_RUNNING) {
+        replication_stop(s->rs, false, NULL);
+    }


There is a possible use-after-free with s->top_id.  If we free it above
then replication_stop() must not call backup_job_cleanup().  I think it
could call it from replication_stop().

It would be safer to call replication_stop() before freeing s->top_id.
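
A minimal sketch of the reordered close path, based only on the snippet
quoted above (assuming replication_stop() has no other ordering
requirements):

static void replication_close(BlockDriverState *bs)
{
    BDRVReplicationState *s = bs->opaque;

    /* Stop replication first: replication_stop() may still need state
     * such as s->top_id, so free it only afterwards. */
    if (s->replication_state == BLOCK_REPLICATION_RUNNING) {
        replication_stop(s->rs, false, NULL);
    }

    if (s->mode == REPLICATION_MODE_SECONDARY) {
        g_free(s->top_id);
    }
}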



Yes, you are right.


+    top_bs = bdrv_lookup_bs(s->top_id, s->top_id, errp);


Please check that bs is a child of top_bs.  If it is not a child then
strange things could happen, for example the AioContexts might not match
(meaning it's not thread-safe) so this should be forbidden.



Will fix in next version


+    if (!top_bs) {
+        aio_context_release(aio_context);
+        return;
+    }


Error return paths after reopen_backing_file(s, true, &local_err) should
undo the operation.


Will do.




+    bdrv_op_block_all(top_bs, s->blocker);
+    bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
+
+    /*
+     * Must protect backup target if backup job was stopped/cancelled
+     * unexpectedly
+     */
+    bdrv_ref(s->hidden_disk->bs);
+
+    backup_start(s->secondary_disk->bs, s->hidden_disk->bs, 0,
+                 MIRROR_SYNC_MODE_NONE, NULL, BLOCKDEV_ON_ERROR_REPORT,
+                 BLOCKDEV_ON_ERROR_REPORT, backup_job_completed,
+                 s, NULL, &local_err);


Did you run stress tests where the primary is writing to the disk while
the secondary reads from the same sectors?

I thought about this some more and I'm wondering about the following
scenario:

NBD writes to secondary_disk and the guest reads from the disk at the
same time.  There is a coroutine mutex in qcow2.c that protects both
read and write requests, but only until they perform the data I/O.  It
may be possible that the read request from the Secondary VM could be
started before the NBD write but the preadv() syscall isn't entered
because of CPU scheduling decisions.  In the meantime the
secondary_disk->hidden_disk backup operation takes place.  With some
unlucky timing it may be possible for the Secondary VM to read the new
contents from secondary_disk instead of the old contents that were
backed up into hidden_disk.


Thanks for your catch. I'll think about this scenario carefully.



Extra serialization would be needed.
block/backup.c:wait_for_overlapping_requests() and
block/io.c:mark_request_serialising() are good starting points for
solving this.
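
To illustrate the idea generically: a self-contained pthread sketch of
waiting for overlapping in-flight writes. This is not qemu's
coroutine-based machinery, and all names here are made up for the example:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct InFlightWrite {
    uint64_t offset, bytes;
    struct InFlightWrite *next;
} InFlightWrite;

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t writes_changed = PTHREAD_COND_INITIALIZER;
static InFlightWrite *in_flight_writes;

static bool ranges_overlap(uint64_t o1, uint64_t b1,
                           uint64_t o2, uint64_t b2)
{
    return o1 < o2 + b2 && o2 < o1 + b1;
}

/* Block a reader until no in-flight write overlaps [offset, offset+bytes).
 * Writers add themselves to in_flight_writes before issuing I/O and signal
 * writes_changed when they complete and unlink themselves. */
static void wait_for_overlapping_writes(uint64_t offset, uint64_t bytes)
{
    pthread_mutex_lock(&lock);
restart:
    for (InFlightWrite *w = in_flight_writes; w; w = w->next) {
        if (ranges_overlap(offset, bytes, w->offset, w->bytes)) {
            pthread_cond_wait(&writes_changed, &lock);
            goto restart;   /* the list may have changed while waiting */
        }
    }
    pthread_mutex_unlock(&lock);
}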


+    cleanup_imgs();


Please use qtest_add_abrt_handler() so cleanup happens even when SIGABRT
is received.



Surely.

Thanks
-Xie





Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Daniel P. Berrange
On Tue, May 10, 2016 at 11:35:14AM +0200, Kevin Wolf wrote:
> On 10.05.2016 at 11:23, Daniel P. Berrange wrote:
> > On Tue, May 10, 2016 at 11:14:22AM +0200, Kevin Wolf wrote:
> > > On 10.05.2016 at 10:50, Daniel P. Berrange wrote:
> > > > On Tue, May 10, 2016 at 09:43:04AM +0100, Richard W.M. Jones wrote:
> > > > > On Tue, May 10, 2016 at 09:14:26AM +0100, Richard W.M. Jones wrote:
> > > > > > However I didn't test the write-shareable case (the libvirt
> > > > > >  flag which should map to a shared lock -- is that 
> > > > > > right Dan?).
> > > > > 
> > > > > To Dan (mainly): I think setting the  flag in libvirt only
> > > > > sets cache=unsafe on the qemu drive (it may have other effects for
> > > > > virtlockd).  Should there be an extra qemu drive flag to communicate
> > > > > that the drive is write-shareable?
> > > > 
> > > > Sure, if QEMU had a way to indicate that the disk was used in a
> > > > write-sharable mode, libvirt would use it.
> > > > 
> > > > I agree with your general point that we should do no locking for
> > > > read-only access, F_RDLCK for shared-write and F_WRLCK for
> > > > exclusive-write access. I think I made that point similarly against
> > > > an earlier version of this patchset
> > > 
> > > Why should we do no locking for read-only access by default? If an image
> > > is written to, read-only accesses are potentially inconsistent and we
> > > should protect users against it.
> > > 
> > > The only argument I can see in the old versions of this series is
> > > "libguestfs does it and usually it gets away with it". For me, that's
> > > not an argument for generally allowing this, but at most for providing a
> > > switch to bypass the locking.
> > > 
> > > Because let's be clear about this: If someone lost data because they
> > > took an inconsistent backup this way and comes to us qemu developers,
> > > all we can tell them is "sorry, what you did is not supported". And
> > > that's a pretty strong sign that locking should prevent it by default.
> > 
> > We have 3 usage scenarios - readonly-share, writable-shared and
> > writable-exclusive, and only 2 lock types to play with. This series
> > does no locking at all in the writable-shared case, so we still have
> > the problem you describe in that someone opening the image for readonly
> > access will succeed even when it is used writable-shared by another
> > process.
> > 
> > So we have to pick a trade-off here. IMHO it is more important to
> > ensure we have locks in both the write-shared & write-exclusive case,
> > as both of those can definitely result in corrupted images if not
> > careful, whereas read-only access merely results in your reading
> > bad data, no risk of corrupting the original image.
> 
> I agree that we want locking for the writable-shared case. That doesn't
> mean no locking for read-only, though. I think read-only and writeable-
> shared are the same thing as far as locking is concerned.

It doesn't make much sense to say that it is unsafe to use read-only
in combination with write-exclusive, but then allow read-only with
write-shared. In both cases you have the same scenario that the
read-only app may get inconsistent data when reading.

> This is the matrix of the allowed combinations (without a manual lock
> override that enables libguestfs to use unsupported cases), as I see it:
> 
>                  wr-excl  wr-shared  read-only
> write-exclusive  no       no         no
> write-shared     no       yes        yes
> read-only        no       yes        yes
> 
> Do you disagree with any of the entries?

I would have 'yes' in all the read-only cells.

Regards,
Daniel
-- 
|: http://berrange.com  -o-http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org  -o- http://virt-manager.org :|
|: http://autobuild.org   -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org   -o-   http://live.gnome.org/gtk-vnc :|



Re: [Qemu-block] [PATCH v4 8/8] linux-aio: share one LinuxAioState within an AioContext

2016-05-10 Thread Kevin Wolf
On 10.05.2016 at 11:30, Stefan Hajnoczi wrote:
> On Mon, May 09, 2016 at 06:31:44PM +0200, Paolo Bonzini wrote:
> > On 19/04/2016 11:09, Stefan Hajnoczi wrote:
> > >> > This has better performance because it executes fewer system calls
> > >> > and does not use a bottom half per disk.
> > > Each aio_context_t is initialized for 128 in-flight requests in
> > > laio_init().
> > > 
> > > Will it be possible to hit the limit now that all drives share the same
> > > aio_context_t?
> > 
> > It was also possible before, because the virtqueue can be bigger than
> > 128 items; that's why there is logic to submit I/O requests after an
> > io_get_events.  As usual when the answer seems trivial, am I
> > misunderstanding your question?
> 
> I'm concerned about a performance regression rather than correctness.
> 
> But looking at linux-aio.c there *is* a correctness problem:
> 
>   static void ioq_submit(struct qemu_laio_state *s)
>   {
>       int ret, len;
>       struct qemu_laiocb *aiocb;
>       struct iocb *iocbs[MAX_QUEUED_IO];
>       QSIMPLEQ_HEAD(, qemu_laiocb) completed;
>
>       do {
>           len = 0;
>           QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
>               iocbs[len++] = &aiocb->iocb;
>               if (len == MAX_QUEUED_IO) {
>                   break;
>               }
>           }
>
>           ret = io_submit(s->ctx, len, iocbs);
>           if (ret == -EAGAIN) {
>               break;
>           }
>           if (ret < 0) {
>               abort();
>           }
>
>           s->io_q.n -= ret;
>           aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
>           QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
>       } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
>       s->io_q.blocked = (s->io_q.n > 0);
>   }
> 
> io_submit() may have submitted some of the requests when -EAGAIN is
> returned.  QEMU gets no indication of which requests were submitted.

My understanding (which is based on the manpage rather than code) is
that -EAGAIN is only returned if no request could be submitted. In other
cases, the number of submitted requests is returned (similar to how
short reads work).
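
Under that reading, submission code must be prepared for short
submissions. A sketch of how the loop could advance past partially
submitted batches (a hypothetical helper, not the actual qemu code):

#include <libaio.h>

/* Sketch: io_submit() returns how many iocbs were queued on success, or
 * -errno if the first one fails; -EAGAIN therefore means nothing (more)
 * could be queued and the rest must be retried after reaping events. */
static int submit_all(io_context_t ctx, struct iocb **iocbs, int len)
{
    int done = 0;

    while (done < len) {
        int ret = io_submit(ctx, len - done, iocbs + done);
        if (ret == -EAGAIN) {
            break;              /* resubmit the remainder later */
        }
        if (ret < 0) {
            return done ? done : ret;   /* hard error on first iocb */
        }
        done += ret;            /* short submission: advance and retry */
    }
    return done;
}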

> It may be possible to dig around in the s->ctx rings to find out or we
> need to keep track of the number of in-flight requests so we can
> prevent ever hitting EAGAIN.
> 
> ioq_submit() pretends that no requests were submitted on -EAGAIN and
> will submit them again next time.  This could result in double
> completions.

Did you check in the code that this can happen?

> Regarding performance, I'm thinking about a guest with 8 disks (queue
> depth 32).  The worst case is when the guest submits 32 requests at once
> but the Linux AIO event limit has already been reached.  Then the disk
> is starved until other disks' requests complete.

Sounds like a valid concern.

Kevin




Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Kevin Wolf
On 10.05.2016 at 11:23, Daniel P. Berrange wrote:
> On Tue, May 10, 2016 at 11:14:22AM +0200, Kevin Wolf wrote:
> > On 10.05.2016 at 10:50, Daniel P. Berrange wrote:
> > > On Tue, May 10, 2016 at 09:43:04AM +0100, Richard W.M. Jones wrote:
> > > > On Tue, May 10, 2016 at 09:14:26AM +0100, Richard W.M. Jones wrote:
> > > > > However I didn't test the write-shareable case (the libvirt
> > > > >  flag which should map to a shared lock -- is that right 
> > > > > Dan?).
> > > > 
> > > > To Dan (mainly): I think setting the  flag in libvirt only
> > > > sets cache=unsafe on the qemu drive (it may have other effects for
> > > > virtlockd).  Should there be an extra qemu drive flag to communicate
> > > > that the drive is write-shareable?
> > > 
> > > Sure, if QEMU had a way to indicate that the disk was used in a
> > > write-sharable mode, libvirt would use it.
> > > 
> > > I agree with your general point that we should do no locking for
> > > read-only access, F_RDLCK for shared-write and F_WRLCK for
> > > exclusive-write access. I think I made that point similarly against
> > > an earlier version of this patchset
> > 
> > Why should we do no locking for read-only access by default? If an image
> > is written to, read-only accesses are potentially inconsistent and we
> > should protect users against it.
> > 
> > The only argument I can see in the old versions of this series is
> > "libguestfs does it and usually it gets away with it". For me, that's
> > not an argument for generally allowing this, but at most for providing a
> > switch to bypass the locking.
> > 
> > Because let's be clear about this: If someone lost data because they
> > took an inconsistent backup this way and comes to us qemu developers,
> > all we can tell them is "sorry, what you did is not supported". And
> > that's a pretty strong sign that locking should prevent it by default.
> 
> We have 3 usage scenarios - readonly-share, writable-shared and
> writable-exclusive, and only 2 lock types to play with. This series
> does no locking at all in the writable-shared case, so we still have
> the problem you describe in that someone opening the image for readonly
> access will succeed even when it is used writable-shared by another
> process.
> 
> So we have to pick a trade-off here. IMHO it is more important to
> ensure we have locks in both the write-shared & write-exclusive case,
> as both of those can definitely result in corrupted images if not
> careful, whereas read-only access merely results in your reading
> bad data, no risk of corrupting the original image.

I agree that we want locking for the writable-shared case. That doesn't
mean no locking for read-only, though. I think read-only and writeable-
shared are the same thing as far as locking is concerned.

This is the matrix of the allowed combinations (without a manual lock
override that enables libguestfs to use unsupported cases), as I see it:

                 wr-excl  wr-shared  read-only
write-exclusive  no       no         no
write-shared     no       yes        yes
read-only        no       yes        yes

Do you disagree with any of the entries?

Otherwise, this suggests that write-exclusive is F_WRLCK and both
write-shared and read-only are F_RDLCK.
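
As an illustration, that mapping could look like this with POSIX advisory
locks (a sketch only, not the actual implementation in this series):

#include <fcntl.h>
#include <stdbool.h>

/* Exclusive writers take F_WRLCK; shared writers and readers all take
 * F_RDLCK, so they are compatible with each other but conflict with an
 * exclusive writer. Returns -1 with errno EAGAIN/EACCES on conflict. */
static int image_lock(int fd, bool write_exclusive)
{
    struct flock fl = {
        .l_type   = write_exclusive ? F_WRLCK : F_RDLCK,
        .l_whence = SEEK_SET,
        .l_start  = 0,
        .l_len    = 0,          /* 0 means "to end of file" */
    };

    return fcntl(fd, F_SETLK, &fl);
}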

Kevin



Re: [Qemu-block] [PATCH v4 8/8] linux-aio: share one LinuxAioState within an AioContext

2016-05-10 Thread Stefan Hajnoczi
On Mon, May 09, 2016 at 06:31:44PM +0200, Paolo Bonzini wrote:
> On 19/04/2016 11:09, Stefan Hajnoczi wrote:
> >> > This has better performance because it executes fewer system calls
> >> > and does not use a bottom half per disk.
> > Each aio_context_t is initialized for 128 in-flight requests in
> > laio_init().
> > 
> > Will it be possible to hit the limit now that all drives share the same
> > aio_context_t?
> 
> It was also possible before, because the virtqueue can be bigger than
> 128 items; that's why there is logic to submit I/O requests after an
> io_get_events.  As usual when the answer seems trivial, am I
> misunderstanding your question?

I'm concerned about a performance regression rather than correctness.

But looking at linux-aio.c there *is* a correctness problem:

  static void ioq_submit(struct qemu_laio_state *s)
  {
      int ret, len;
      struct qemu_laiocb *aiocb;
      struct iocb *iocbs[MAX_QUEUED_IO];
      QSIMPLEQ_HEAD(, qemu_laiocb) completed;

      do {
          len = 0;
          QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) {
              iocbs[len++] = &aiocb->iocb;
              if (len == MAX_QUEUED_IO) {
                  break;
              }
          }

          ret = io_submit(s->ctx, len, iocbs);
          if (ret == -EAGAIN) {
              break;
          }
          if (ret < 0) {
              abort();
          }

          s->io_q.n -= ret;
          aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb);
          QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed);
      } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending));
      s->io_q.blocked = (s->io_q.n > 0);
  }

io_submit() may have submitted some of the requests when -EAGAIN is
returned.  QEMU gets no indication of which requests were submitted.  It
may be possible to dig around in the s->ctx rings to find out or we need
to keep track of the number of in-flight requests so we can prevent ever
hitting EAGAIN.

ioq_submit() pretends that no requests were submitted on -EAGAIN and
will submit them again next time.  This could result in double
completions.
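
One way to avoid the problem, along the lines of the in-flight tracking
idea above (a sketch with illustrative names, not the actual fix):

    #include <libaio.h>

    #define MAX_EVENTS 128  /* the laio_init() limit mentioned above */

    /* Never hand io_submit() more iocbs than the context has free event
     * slots, so our own queue depth cannot trigger -EAGAIN. */
    static int ioq_submit_capped(io_context_t ctx, struct iocb **iocbs,
                                 int len, int *in_flight)
    {
        int room = MAX_EVENTS - *in_flight;
        int n = len < room ? len : room;
        int ret;

        if (n == 0) {
            return 0;           /* ring full: wait for completions first */
        }
        ret = io_submit(ctx, n, iocbs);
        if (ret > 0) {
            *in_flight += ret;  /* decremented again on completion */
        }
        return ret;
    }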

Regarding performance, I'm thinking about a guest with 8 disks (queue
depth 32), i.e. up to 8 * 32 = 256 potential requests against the
128-event limit.  The worst case is when the guest submits 32 requests
at once but the Linux AIO event limit has already been reached.  Then
the disk is starved until other disks' requests complete.

Stefan




Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Daniel P. Berrange
On Tue, May 10, 2016 at 11:14:22AM +0200, Kevin Wolf wrote:
> Am 10.05.2016 um 10:50 hat Daniel P. Berrange geschrieben:
> > On Tue, May 10, 2016 at 09:43:04AM +0100, Richard W.M. Jones wrote:
> > > On Tue, May 10, 2016 at 09:14:26AM +0100, Richard W.M. Jones wrote:
> > > > However I didn't test the write-shareable case (the libvirt
> > > > <shareable/> flag which should map to a shared lock -- is that right
> > > > Dan?).
> > > 
> > > To Dan (mainly): I think setting the <shareable/> flag in libvirt only
> > > sets cache=unsafe on the qemu drive (it may have other effects for
> > > virtlockd).  Should there be an extra qemu drive flag to communicate
> > > that the drive is write-shareable?
> > 
> > Sure, if QEMU had a way to indicate that the disk was used in a
> > write-sharable mode, libvirt would use it.
> > 
> > I agree with your general point that we should do no locking for
> > read-only access, F_RDLCK for shared-write and F_WRLCK for
> > exclusive-write access. I think I made that point similarly against
> > an earlier version of this patchset
> 
> Why should we do no locking for read-only access by default? If an image
> is written to, read-only accesses are potentially inconsistent and we
> should protect users against it.
> 
> The only argument I can see in the old versions of this series is
> "libguestfs does it and usually it gets away with it". For me, that's
> not an argument for generally allowing this, but at most for providing a
> switch to bypass the locking.
> 
> Because let's be clear about this: If someone lost data because they
> took an inconsistent backup this way and comes to us qemu developers,
> all we can tell them is "sorry, what you did is not supported". And
> that's a pretty strong sign that locking should prevent it by default.

We have 3 usage scenarios - readonly-share, writable-shared and
writable-exclusive, and only 2 lock types to play with. This series
does no locking at all in the writable-shared case, so we still have
the problem you describe in that someone opening the image for readonly
access will succeed even when it is used writable-shared by another
process.

So we have to pick a trade-off here. IMHO it is more important to
ensure we have locks in both the write-shared & write-exclusive case,
as both of those can definitely result in corrupted images if not
careful, whereas read-only access merely results in your reading
bad data, no risk of corrupting the original image.

Regards,
Daniel
-- 
|: http://berrange.com  -o-http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org  -o- http://virt-manager.org :|
|: http://autobuild.org   -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org   -o-   http://live.gnome.org/gtk-vnc :|



Re: [Qemu-block] [PATCH v4 08/27] osdep: Add qemu_lock_fd and qemu_unlock_fd

2016-05-10 Thread Daniel P. Berrange
On Tue, May 10, 2016 at 10:06:35AM +0100, Richard W.M. Jones wrote:
> On Tue, May 10, 2016 at 09:57:48AM +0100, Daniel P. Berrange wrote:
> > On Tue, May 10, 2016 at 10:50:40AM +0800, Fam Zheng wrote:
> > > They are wrappers of POSIX fcntl file locking, with the additional
> > > interception of open/close (through qemu_open and qemu_close) to offer a
> > > better semantics that preserves the locks across multiple life cycles of
> > > different fds on the same file.  The reason to make this semantics
> > > change over the fcntl locks is to make the API cleaner for QEMU
> > > subsystems.
> > > 
> > > More precisely, we remove this "feature" of fcntl lock:
> > > 
> > > If a process closes any file descriptor referring to a file, then
> > > all of the process's locks on that file are released, regardless of
> > > the file descriptor(s) on which the locks were obtained.
> > > 
> > > as long as the fd is always open/closed through qemu_open and
> > > qemu_close.
> > 
> > You're not actually really removing that problem - this is just hacking
> > around it in a manner which is racy & susceptible to silent failure.
> 
> Whatever happened to file-private locks (https://lwn.net/Articles/586904/)?
> My very recent glibc doesn't appear to include them, unless the
> final standard used something different from `F_SETLKPW'.

They merged in 3.15, but the constant names got renamed just after merge :-)
Look for F_OFD_SETLKW instead


commit 0d3f7a2dd2f5cf9642982515e020c1aee2cf7af6
Author: Jeff Layton 
Date:   Tue Apr 22 08:23:58 2014 -0400

locks: rename file-private locks to "open file description locks"

File-private locks have been merged into Linux for v3.15, and *now*
people are commenting that the name and macro definitions for the new
file-private locks suck.

...and I can't even disagree. The names and command macros do suck.

We're going to have to live with these for a long time, so it's
important that we be happy with the names before we're stuck with them.
The consensus on the lists so far is that they should be rechristened as
"open file description locks".

The name isn't a big deal for the kernel, but the command macros are not
visually distinct enough from the traditional POSIX lock macros. The
glibc and documentation folks are recommending that we change them to
look like F_OFD_{GETLK|SETLK|SETLKW}. That lessens the chance that a
programmer will typo one of the commands wrong, and also makes it easier
to spot this difference when reading code.

This patch makes the following changes that I think are necessary before
v3.15 ships:

1) rename the command macros to their new names. These end up in the uapi
   headers and so are part of the external-facing API. It turns out that
   glibc doesn't actually use the fcntl.h uapi header, but it's hard to
   be sure that something else won't. Changing it now is safest.

2) make the /proc/locks output display these as type "OFDLCK"
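
For reference, a hedged usage sketch of these locks (standard fcntl(2)
interface; the single-byte range is an arbitrary choice here):

    #define _GNU_SOURCE     /* F_OFD_SETLKW needs this on glibc */
    #include <fcntl.h>
    #include <unistd.h>

    /* The lock belongs to the open file description, so closing some
     * other fd on the same file does not drop it, unlike classic
     * process-owned POSIX locks. */
    static int ofd_lock_byte(int fd, short type /* F_RDLCK or F_WRLCK */)
    {
        struct flock fl = {
            .l_type   = type,
            .l_whence = SEEK_SET,
            .l_start  = 0,
            .l_len    = 1,
        };
        return fcntl(fd, F_OFD_SETLKW, &fl);
    }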


Regards,
Daniel
-- 
|: http://berrange.com  -o-http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org  -o- http://virt-manager.org :|
|: http://autobuild.org   -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org   -o-   http://live.gnome.org/gtk-vnc :|



Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Kevin Wolf
Am 10.05.2016 um 10:50 hat Daniel P. Berrange geschrieben:
> On Tue, May 10, 2016 at 09:43:04AM +0100, Richard W.M. Jones wrote:
> > On Tue, May 10, 2016 at 09:14:26AM +0100, Richard W.M. Jones wrote:
> > > However I didn't test the write-shareable case (the libvirt
> > > <shareable/> flag which should map to a shared lock -- is that right
> > > Dan?).
> > 
> > To Dan (mainly): I think setting the <shareable/> flag in libvirt only
> > sets cache=unsafe on the qemu drive (it may have other effects for
> > virtlockd).  Should there be an extra qemu drive flag to communicate
> > that the drive is write-shareable?
> 
> Sure, if QEMU had a way to indicate that the disk was used in a
> write-sharable mode, libvirt would use it.
> 
> I agree with your general point that we should do no locking for
> read-only access, F_RDLCK for shared-write and F_WRLCK for
> exclusive-write access. I think I made that point similarly against
> an earlier version of this patchset

Why should we do no locking for read-only access by default? If an image
is written to, read-only accesses are potentially inconsistent and we
should protect users against it.

The only argument I can see in the old versions of this series is
"libguestfs does it and usually it gets away with it". For me, that's
not an argument for generally allowing this, but at most for providing a
switch to bypass the locking.

Because let's be clear about this: If someone lost data because they
took an inconsistent backup this way and comes to us qemu developers,
all we can tell them is "sorry, what you did is not supported". And
that's a pretty strong sign that locking should prevent it by default.

Kevin



Re: [Qemu-block] [PATCH v4 08/27] osdep: Add qemu_lock_fd and qemu_unlock_fd

2016-05-10 Thread Richard W.M. Jones
On Tue, May 10, 2016 at 09:57:48AM +0100, Daniel P. Berrange wrote:
> On Tue, May 10, 2016 at 10:50:40AM +0800, Fam Zheng wrote:
> > They are wrappers of POSIX fcntl file locking, with the additional
> > interception of open/close (through qemu_open and qemu_close) to offer a
> > better semantics that preserves the locks across multiple life cycles of
> > different fds on the same file.  The reason to make this semantics
> > change over the fcntl locks is to make the API cleaner for QEMU
> > subsystems.
> > 
> > More precisely, we remove this "feature" of fcntl lock:
> > 
> > If a process closes any file descriptor referring to a file, then
> > all of the process's locks on that file are released, regardless of
> > the file descriptor(s) on which the locks were obtained.
> > 
> > as long as the fd is always open/closed through qemu_open and
> > qemu_close.
> 
> You're not actually really removing that problem - this is just hacking
> around it in a manner which is racy & susceptible to silent failure.

Whatever happened to file-private locks (https://lwn.net/Articles/586904/)?
My very recent glibc doesn't appear to include them, unless the
final standard used something different from `F_SETLKPW'.

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
virt-top is 'top' for virtual machines.  Tiny program with many
powerful monitoring features, net stats, disk stats, logging, etc.
http://people.redhat.com/~rjones/virt-top



Re: [Qemu-block] [PATCH v4 08/27] osdep: Add qemu_lock_fd and qemu_unlock_fd

2016-05-10 Thread Daniel P. Berrange
On Tue, May 10, 2016 at 10:50:40AM +0800, Fam Zheng wrote:
> They are wrappers of POSIX fcntl file locking, with the additional
> interception of open/close (through qemu_open and qemu_close) to offer a
> better semantics that preserves the locks across multiple life cycles of
> different fds on the same file.  The reason to make this semantics
> change over the fcntl locks is to make the API cleaner for QEMU
> subsystems.
> 
> More precisely, we remove this "feature" of fcntl lock:
> 
> If a process closes any file descriptor referring to a file, then
> all of the process's locks on that file are released, regardless of
> the file descriptor(s) on which the locks were obtained.
> 
> as long as the fd is always open/closed through qemu_open and
> qemu_close.

You're not actually really removing that problem - this is just hacking
around it in a manner which is racy & susceptible to silent failure.


> > +static int qemu_fd_close(int fd)
> > +{
> > +    LockEntry *ent, *tmp;
> > +    LockRecord *rec;
> > +    char *path;
> > +    int ret;
> > +
> > +    assert(fd_to_path);
> > +    path = g_hash_table_lookup(fd_to_path, GINT_TO_POINTER(fd));
> > +    assert(path);
> > +    g_hash_table_remove(fd_to_path, GINT_TO_POINTER(fd));
> > +    rec = g_hash_table_lookup(lock_map, path);
> > +    ret = close(fd);
> > +
> > +    if (rec) {
> > +        QLIST_FOREACH_SAFE(ent, &rec->records, next, tmp) {
> > +            int r;
> > +            if (ent->fd == fd) {
> > +                QLIST_REMOVE(ent, next);
> > +                g_free(ent);
> > +                continue;
> > +            }
> > +            r = qemu_lock_do(ent->fd, ent->start, ent->len,
> > +                             ent->readonly ? F_RDLCK : F_WRLCK);
> > +            if (r == -1) {
> > +                fprintf(stderr, "Failed to acquire lock on fd %d: %s\n",
> > +                        ent->fd, strerror(errno));
> > +            }

If another app has acquired the lock between the close and this attempt
to re-acquire the lock, then QEMU is silently carrying on without any
lock being held. For something that's intending to provide protection
against concurrent use I think this is not an acceptable failure
scenario.
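
As a timeline, the window being described looks like this (illustrative
comment, not code from the series):

    /*
     *   QEMU:          close(fd)        kernel drops ALL locks on the file
     *   other process: lock + write     succeeds, image now in use
     *   QEMU:          re-lock ent->fd  fails; qemu_fd_close() only prints
     *                                   to stderr and carries on unlocked
     */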


> > +int qemu_open(const char *name, int flags, ...)
> > +{
> > +    int mode = 0;
> > +    int ret;
> > +
> > +    if (flags & O_CREAT) {
> > +        va_list ap;
> > +
> > +        va_start(ap, flags);
> > +        mode = va_arg(ap, int);
> > +        va_end(ap);
> > +    }
> > +    ret = qemu_fd_open(name, flags, mode);
> > +    if (ret >= 0) {
> > +        qemu_fd_add_record(ret, name);
> > +    }
> > +    return ret;
> > +}

I think the approach you have here is fundamentally not usable with
fcntl locks, because it is still using the pattern

   fd = open(path)
   lock(fd)
   if failed lock
       close(fd)
   ...do stuff.


As mentioned in previous posting I believe the block layer must be
checking whether it already has a lock held against "path" *before*
even attempting to open the file. Once QEMU has the file descriptor
open it is too later, because closing the FD will release pre-existing
locks QEMU has.
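
A minimal sketch of that check-before-open pattern (the registry lookup
is a hypothetical helper, not an existing QEMU function):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdbool.h>

    bool lock_registry_contains(const char *path);  /* hypothetical */

    /* Refuse to open a path we already hold a lock on, so we never
     * create a second fd whose close() would drop the existing locks. */
    static int locked_open(const char *path, int flags)
    {
        if (lock_registry_contains(path)) {
            errno = EBUSY;
            return -1;
        }
        return open(path, flags);
    }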

Regards,
Daniel
-- 
|: http://berrange.com  -o-http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org  -o- http://virt-manager.org :|
|: http://autobuild.org   -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org   -o-   http://live.gnome.org/gtk-vnc :|



Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Daniel P. Berrange
On Tue, May 10, 2016 at 09:43:04AM +0100, Richard W.M. Jones wrote:
> On Tue, May 10, 2016 at 09:14:26AM +0100, Richard W.M. Jones wrote:
> > However I didn't test the write-shareable case (the libvirt
> > <shareable/> flag which should map to a shared lock -- is that right Dan?).
> 
> To Dan (mainly): I think setting the <shareable/> flag in libvirt only
> sets cache=unsafe on the qemu drive (it may have other effects for
> virtlockd).  Should there be an extra qemu drive flag to communicate
> that the drive is write-shareable?

Sure, if QEMU had a way to indicate that the disk was used in a
write-sharable mode, libvirt would use it.

I agree with your general point that we should do no locking for
read-only access, F_RDLCK for shared-write and F_WRLCK for
exclusive-write access. I think I made that point similarly against
an earlier version of this patchset

Regards,
Daniel
-- 
|: http://berrange.com  -o-http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org  -o- http://virt-manager.org :|
|: http://autobuild.org   -o- http://search.cpan.org/~danberr/ :|
|: http://entangle-photo.org   -o-   http://live.gnome.org/gtk-vnc :|



Re: [Qemu-block] [PATCH v13 2/3] quorum: implement bdrv_add_child() and bdrv_del_child()

2016-05-10 Thread Alberto Garcia
On Tue 10 May 2016 10:39:21 AM CEST, Kevin Wolf wrote:
>>    s->children = g_renew(BdrvChild *, s->children, ++s->num_children);
>>    s->children[s->num_children] = child;
>
> Without having checked the context, this code is not equivalent. You
> need to access s->children[s->num_children - 1] now in the second
> line.

Yeah, you are both right, sorry for the mistake!

Berto



Re: [Qemu-block] [Qemu-devel] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Richard W.M. Jones
On Tue, May 10, 2016 at 09:14:26AM +0100, Richard W.M. Jones wrote:
> However I didn't test the write-shareable case (the libvirt
> <shareable/> flag which should map to a shared lock -- is that right Dan?).

To Dan (mainly): I think setting the <shareable/> flag in libvirt only
sets cache=unsafe on the qemu drive (it may have other effects for
virtlockd).  Should there be an extra qemu drive flag to communicate
that the drive is write-shareable?

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
libguestfs lets you edit virtual machines.  Supports shell scripting,
bindings from many languages.  http://libguestfs.org



Re: [Qemu-block] [PATCH v13 2/3] quorum: implement bdrv_add_child() and bdrv_del_child()

2016-05-10 Thread Kevin Wolf
Am 09.05.2016 um 17:52 hat Alberto Garcia geschrieben:
> On Wed 13 Apr 2016 10:33:08 AM CEST, Changlong Xie wrote:
> 
> Sorry for the late reply!
> 
> The patch looks good, I have some additional comments on top of what Max
> wrote, nothing serious though :)
> 
> > @@ -67,6 +68,9 @@ typedef struct QuorumVotes {
> >  typedef struct BDRVQuorumState {
> >  BdrvChild **children;  /* children BlockDriverStates */
> >  int num_children;  /* children count */
> > +uint64_t last_index;   /* indicate the child role name of the last
> > +* element of children array
> > +*/
> 
> I think you can use a regular 'unsigned' here, it's simpler and more
> efficient. We're not going to have 2^32 Quorum children, are we? :)
> 
> > +static void quorum_add_child(BlockDriverState *bs, BlockDriverState 
> > *child_bs,
> > + Error **errp)
> > +{
> > +BDRVQuorumState *s = bs->opaque;
> > +BdrvChild *child;
> > +char indexstr[32];
> > +int ret;
> > +
> > +assert(s->num_children <= INT_MAX / sizeof(BdrvChild *) &&
> > +   s->last_index <= UINT64_MAX);
> 
> That last condition has no practical effect. last_index is a uint64_t so
> s->last_index <= UINT64_MAX is always going to be true.
> 
> > +s->children = g_renew(BdrvChild *, s->children, s->num_children + 1);
> > +s->children[s->num_children++] = child;
> 
> Slightly simpler way:
> 
>s->children = g_renew(BdrvChild *, s->children, ++s->num_children);
>s->children[s->num_children] = child;

Without having checked the context, this code is not equivalent. You
need to access s->children[s->num_children - 1] now in the second line.
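
In other words, the corrected form of the suggestion would be:

    s->children = g_renew(BdrvChild *, s->children, ++s->num_children);
    s->children[s->num_children - 1] = child;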

> But this is not very important, so you can leave it as it is now if you
> prefer.
> 
> > +/* child->name is "children.%d" */
> > +assert(!strncmp(child->name, "children.", 9));
> 
> I actually don't think there's anything wrong with this assertion, but
> if you decide to keep it you can use g_str_has_prefix() instead, which
> is a bit easier and more readable.

There's also good old strstart() from cutils.c.
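
For example (sketch; strstart() returns non-zero on a prefix match and
can optionally hand back a pointer to the tail):

    const char *index_str;

    if (strstart(child->name, "children.", &index_str)) {
        /* index_str now points at the numeric part of the name */
    }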

Kevin



Re: [Qemu-block] [Qemu-devel] [PATCH v5 0/6] qemu-io: UI enhancements

2016-05-10 Thread Kevin Wolf
Am 09.05.2016 um 20:23 hat Max Reitz geschrieben:
> On 08.05.2016 05:35, Eric Blake wrote:
> > On 05/07/2016 09:16 PM, Eric Blake wrote:
> >> While working on NBD, I found myself cursing the qemu-io UI for
> >> not letting me test various scenarios, particularly after fixing
> >> NBD to serve at byte granularity [1].  And in the process of
> >> writing these qemu-io enhancements, I also managed to flush out
> >> several other bugs in the block layer proper, with fixes posted
> >> separately, such as loss of BDRV_REQ_FUA during write_zeroes [2]
> > 
> > Apologies to reviewers for botching Kevin's email address in my 'git
> > send-email' invocation; you may want to double check your To/CC lines
> > before hitting send.
> 
> Well, I had even read this heads-up but of course I forgot it by the
> time I got to reviewing...
> 
> So let's at least get a
> 
> Series: Reviewed-by: Max Reitz 
> 
> here with the correct CC.

The only practical problem is that you get some bounced emails. As long
as qemu-block is CCed, I'm fine.

Kevin




Re: [Qemu-block] [PATCH 2/2] block: Inactivate all children

2016-05-10 Thread Kevin Wolf
Am 10.05.2016 um 05:23 hat Fam Zheng geschrieben:
> On Fri, 05/06 09:49, Kevin Wolf wrote:
> > Am 05.05.2016 um 02:32 hat Fam Zheng geschrieben:
> > > On Wed, 05/04 12:12, Kevin Wolf wrote:
> > > > Am 19.04.2016 um 03:42 hat Fam Zheng geschrieben:
> > > > > Currently we only inactivate the top BDS. Actually bdrv_inactivate
> > > > > should be the opposite of bdrv_invalidate_cache.
> > > > > 
> > > > > Recurse into the whole subtree instead.
> > > > > 
> > > > > Signed-off-by: Fam Zheng 
> > > > 
> > > > Did you actually test this?
> > > > 
> > > > I would expect that bs->drv->bdrv_inactivate() fails now (as in
> > > > assertion failure) if it has anything to flush to the image because
> > > > bs->file has already been inactivated before. I think children need to be
> > > > inactivated after their parents.
> > > 
> > > OK, my test apparently failed to trigger that bdrv_pwritev() path. Good catch!
> > > 
> > > > 
> > > > Nodes with multiple parents could actually become even more
> > > > interesting...
> > > 
> > > I'll make it a two-pass recursion: one pass for calling
> > > drv->bdrv_inactivate and the other for setting BDRV_O_INACTIVE.
> > 
> > Though that would assume that the .bdrv_inactivate() implementation of
> > drivers doesn't already bring the BDS into a state where further writes
> > aren't possible. I'm not sure if that's a good assumption to make, even
> > though it's currently true for qcow2.
> > 
> > For example, imagine we went forward with format-based image locking.
> > The first .bdrv_inactivate() would then already release the lock, we
> > can't continue writing after that.
> 
> we only need to make sure the caches of all images are flushed when
> bdrv_inactivate_all() returns, and similarly, that the cache of one image is
> flushed when .bdrv_inactivate() returns.  The releasing of the lock is an
> explicit callback and should be placed in bdrv_inactivate() right above the
> setting of BDRV_O_INACTIVE.  This is the case in my image locking series.

Fair enough. My series didn't have a separate callback, but with yours
that should be working.

So is the semantics of .bdrv_inactivate() basically "bdrv_flush, and I
really mean it"?
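
For illustration, a rough sketch of the two-pass recursion discussed in
this thread (names approximate the QEMU block layer; this is not the
final patch):

    /* Pass 0: let drivers flush via .bdrv_inactivate() while writes to
     * other nodes are still allowed.  Pass 1: set BDRV_O_INACTIVE
     * everywhere, after which no node accepts further writes. */
    static int bdrv_inactivate_recurse(BlockDriverState *bs, int pass)
    {
        BdrvChild *child;
        int ret;

        if (pass == 0 && bs->drv && bs->drv->bdrv_inactivate) {
            ret = bs->drv->bdrv_inactivate(bs);
            if (ret < 0) {
                return ret;
            }
        } else if (pass == 1) {
            bs->open_flags |= BDRV_O_INACTIVE;
        }

        QLIST_FOREACH(child, &bs->children, next) {
            ret = bdrv_inactivate_recurse(child->bs, pass);
            if (ret < 0) {
                return ret;
            }
        }
        return 0;
    }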

> > Maybe we need something like an "active reference counter", and we
> > decrement that for all children and only call their .bdrv_inactivate()
> > when it arrives at 0.
> 
> That should work, but the effect of the counters are local to one invocation 
> of
> bdrv_inactivate_all(), and is not really necessary if we do as above.

Agreed.

Kevin



Re: [Qemu-block] [PATCH v4 00/27] block: Lock images when opening

2016-05-10 Thread Richard W.M. Jones
On Tue, May 10, 2016 at 10:50:32AM +0800, Fam Zheng wrote:
> v4: Don't lock RO image. [Rich]

I applied this on top of HEAD and added some debugging messages to
block/raw-posix.c so I can see when files are being opened and locked,
and it does appear to do the right thing for read-only and
write-exclusive files, ie. it applies an exclusive lock when opening a
file for write-exclusive, and no lock when opening a file for read.

However I didn't test the write-shareable case (the libvirt
<shareable/> flag which should map to a shared lock -- is that right Dan?).

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
virt-top is 'top' for virtual machines.  Tiny program with many
powerful monitoring features, net stats, disk stats, logging, etc.
http://people.redhat.com/~rjones/virt-top



Re: [Qemu-block] [PATCH v4 08/27] osdep: Add qemu_lock_fd and qemu_unlock_fd

2016-05-10 Thread Richard W.M. Jones
On Tue, May 10, 2016 at 10:50:40AM +0800, Fam Zheng wrote:
> +int qemu_lock_fd(int fd, int64_t start, int64_t len, bool readonly)

I find this new API to be very unintuitive.  When I was reading the
other code in block/raw-posix.c I had to refer back to this file to
find out what all the integers meant.

It is also misleading.  There aren't really such things as
"readonly locks", or "read locks" or "write locks".  I know that
(some) POSIX APIs use these terms, but the terms are wrong.

There are shared locks, and there are exclusive locks.  The locks have
nothing to do with reading or writing.  In fact the locks only apply
when writing, and are to do with whether you want to allow multiple
writers at the same time (shared lock), or only a single writer
(exclusive lock).

Anyway, I think the boolean "readonly" should be replaced by
some flag like:

#define QEMU_LOCK_SHARED 1
#define QEMU_LOCK_EXCLUSIVE 2

or similar.
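
The prototype might then become something like (purely illustrative,
reusing the flag names above):

    #define QEMU_LOCK_SHARED    1
    #define QEMU_LOCK_EXCLUSIVE 2

    int qemu_lock_fd(int fd, int64_t start, int64_t len, int flags);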

Not sure about the start/len.  Do we need them in the API at all given
that we've decided to lock a particular byte of the file?

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
virt-builder quickly builds VMs from scratch
http://libguestfs.org/virt-builder.1.html



Re: [Qemu-block] [PATCH v14 2/3] quorum: implement bdrv_add_child() and bdrv_del_child()

2016-05-10 Thread Alberto Garcia
On Tue 10 May 2016 09:36:38 AM CEST, Changlong Xie wrote:
> From: Wen Congyang 
>
> Signed-off-by: Wen Congyang 
> Signed-off-by: zhanghailiang 
> Signed-off-by: Gonglei 
> Signed-off-by: Changlong Xie 

Reviewed-by: Alberto Garcia 

Berto



[Qemu-block] [PATCH v14 3/3] qmp: add monitor command to add/remove a child

2016-05-10 Thread Changlong Xie
From: Wen Congyang 

The new QMP command name is x-blockdev-change. It's just for adding/removing
a quorum child now, and doesn't support all kinds of children, all kinds of
operations, or all block drivers. So it is experimental for now.

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Signed-off-by: Changlong Xie 
Reviewed-by: Max Reitz 
Reviewed-by: Alberto Garcia 
---
 blockdev.c   | 55 
 qapi/block-core.json | 32 ++
 qmp-commands.hx  | 53 ++
 3 files changed, 140 insertions(+)

diff --git a/blockdev.c b/blockdev.c
index f1f520a..d16d449 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -4092,6 +4092,61 @@ out:
 aio_context_release(aio_context);
 }
 
+static BdrvChild *bdrv_find_child(BlockDriverState *parent_bs,
+  const char *child_name)
+{
+BdrvChild *child;
+
+QLIST_FOREACH(child, &parent_bs->children, next) {
+if (strcmp(child->name, child_name) == 0) {
+return child;
+}
+}
+
+return NULL;
+}
+
+void qmp_x_blockdev_change(const char *parent, bool has_child,
+   const char *child, bool has_node,
+   const char *node, Error **errp)
+{
+BlockDriverState *parent_bs, *new_bs = NULL;
+BdrvChild *p_child;
+
+parent_bs = bdrv_lookup_bs(parent, parent, errp);
+if (!parent_bs) {
+return;
+}
+
+if (has_child == has_node) {
+if (has_child) {
+error_setg(errp, "The parameters child and node are in conflict");
+} else {
+error_setg(errp, "Either child or node must be specified");
+}
+return;
+}
+
+if (has_child) {
+p_child = bdrv_find_child(parent_bs, child);
+if (!p_child) {
+error_setg(errp, "Node '%s' does not have child '%s'",
+   parent, child);
+return;
+}
+bdrv_del_child(parent_bs, p_child, errp);
+}
+
+if (has_node) {
+new_bs = bdrv_find_node(node);
+if (!new_bs) {
+error_setg(errp, "Node '%s' not found", node);
+return;
+}
+bdrv_add_child(parent_bs, new_bs, errp);
+}
+}
+
 BlockJobInfoList *qmp_query_block_jobs(Error **errp)
 {
 BlockJobInfoList *head = NULL, **p_next = &head;
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 1d09079..98a20d2 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2556,3 +2556,35 @@
 ##
 { 'command': 'block-set-write-threshold',
   'data': { 'node-name': 'str', 'write-threshold': 'uint64' } }
+
+##
+# @x-blockdev-change
+#
+# Dynamically reconfigure the block driver state graph. It can be used
+# to add, remove, insert or replace a graph node. Currently only the
+# Quorum driver implements this feature to add or remove its child. This
+# is useful to fix a broken quorum child.
+#
+# If @node is specified, it will be inserted under @parent. @child
+# may not be specified in this case. If both @parent and @child are
+# specified but @node is not, @child will be detached from @parent.
+#
+# @parent: the id or name of the parent node.
+#
+# @child: #optional the name of a child under the given parent node.
+#
+# @node: #optional the name of the node that will be added.
+#
+# Note: this command is experimental, and its API is not stable. It
+# does not support all kinds of operations, all kinds of children, nor
+# all block drivers.
+#
+# Warning: The data in a new quorum child MUST be consistent with that of
+# the rest of the array.
+#
+# Since: 2.7
+##
+{ 'command': 'x-blockdev-change',
+  'data' : { 'parent': 'str',
+ '*child': 'str',
+ '*node': 'str' } }
diff --git a/qmp-commands.hx b/qmp-commands.hx
index de896a5..94847e5 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -4398,6 +4398,59 @@ Example:
 EQMP
 
 {
+.name   = "x-blockdev-change",
+.args_type  = "parent:B,child:B?,node:B?",
+.mhandler.cmd_new = qmp_marshal_x_blockdev_change,
+},
+
+SQMP
+x-blockdev-change
+-
+
+Dynamically reconfigure the block driver state graph. It can be used
+to add, remove, insert or replace a graph node. Currently only the
+Quorum driver implements this feature to add or remove its child. This
+is useful to fix a broken quorum child.
+
+If @node is specified, it will be inserted under @parent. @child
+may not be specified in this case. If both @parent and @child are
+specified but @node is not, @child will be detached from @parent.
+
+Arguments:
+- "parent": the id or name of the parent node (json-string)
+- "child": the name of a child under the given parent node (json-string, 
optional)
+- "node": the 

[Qemu-block] [PATCH v14 0/3] qapi: child add/delete support

2016-05-10 Thread Changlong Xie
ChangLog:
v14:
1. Address comments from Berto and Max
p2: introduce bdrv_drained_begin/end, rename last_index, remove redundant
assert codes
v13:
1. Rebase to the newest codes
2. Address comments from Berto and Max
p1. Add R-B, fix incorrect syntax
p2. Add missing "qemu/cutils.h" since 2.6, and rewrite quorum_add/del_child
p3. Remove unnecessary "id", add "since 2.7"
v11~v12:
1. Address comments from Max
p1. Add R-B
p2. Add R-B, remove unnecessary "endptr" "value"
p3. Add R-B
v10~v11:
1. Rebase to the newest codes
2. Address comment from Max
Don't use contractions in error messages,
p1: Remove R-Bs, and use "BdrvChild *child" in bdrv_del_child
p2: Fix error logic in get_new_child_index/remove_child_index, and improve
child->name parsing
p3: Make bdrv_find_child return BdrvChild *, and add missing explanation

v9~v10:
1. Rebase to the newest codes
2. Address comments from Berto and Max, update the documents in block-core.json 
   and qmp-commands.hx 
3. Remove redundant codes in quorum_add_child() and quorum_del_child()
v8:
1. Rebase to the newest codes
2. Address the comments from Eric Blake
v7:
1. Remove the qmp command x-blockdev-change's parameter operation according
   to Kevin's comments.
2. Remove the hmp command.
v6:
1. Use a single qmp command x-blockdev-change to replace x-blockdev-child-add
   and x-blockdev-child-delete
v5:
1. Address Eric Blake's comments
v4:
1. drop nbd driver's implementation. We can use human-monitor-command
   to do it.
2. Rename the command name.
v3:
1. Don't open BDS in bdrv_add_child(). Use the existing BDS which is
   created by the QMP command blockdev-add.
2. The driver NBD can support filename, path, host:port now.
v2:
1. Use bdrv_get_device_or_node_name() instead of new function
   bdrv_get_id_or_node_name()
2. Update the error message
3. Update the documents in block-core.json

Wen Congyang (3):
  Add new block driver interface to add/delete a BDS's child
  quorum: implement bdrv_add_child() and bdrv_del_child()
  qmp: add monitor command to add/remove a child

 block.c   | 57 +++---
 block/quorum.c| 78 +--
 blockdev.c| 55 +
 include/block/block.h |  8 +
 include/block/block_int.h |  5 +++
 qapi/block-core.json  | 32 +++
 qmp-commands.hx   | 53 
 7 files changed, 282 insertions(+), 6 deletions(-)

-- 
1.9.3






[Qemu-block] [PATCH v14 1/3] Add new block driver interface to add/delete a BDS's child

2016-05-10 Thread Changlong Xie
From: Wen Congyang 

In some cases, we want to take a quorum child offline, and take
another child online.

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Signed-off-by: Changlong Xie 
Reviewed-by: Max Reitz 
Reviewed-by: Alberto Garcia 
---
 block.c   | 49 +++
 include/block/block.h |  4 
 include/block/block_int.h |  5 +
 3 files changed, 58 insertions(+)

diff --git a/block.c b/block.c
index d4939b4..68cd3b2 100644
--- a/block.c
+++ b/block.c
@@ -3981,3 +3981,52 @@ void bdrv_refresh_filename(BlockDriverState *bs)
 QDECREF(json);
 }
 }
+
+/*
+ * Hot add/remove a BDS's child. So the user can take a child offline when
+ * it is broken and take a new child online
+ */
+void bdrv_add_child(BlockDriverState *parent_bs, BlockDriverState *child_bs,
+Error **errp)
+{
+
+if (!parent_bs->drv || !parent_bs->drv->bdrv_add_child) {
+error_setg(errp, "The node %s does not support adding a child",
+   bdrv_get_device_or_node_name(parent_bs));
+return;
+}
+
+if (!QLIST_EMPTY(&child_bs->parents)) {
+error_setg(errp, "The node %s already has a parent",
+   child_bs->node_name);
+return;
+}
+
+parent_bs->drv->bdrv_add_child(parent_bs, child_bs, errp);
+}
+
+void bdrv_del_child(BlockDriverState *parent_bs, BdrvChild *child, Error 
**errp)
+{
+BdrvChild *tmp;
+
+if (!parent_bs->drv || !parent_bs->drv->bdrv_del_child) {
+error_setg(errp, "The node %s does not support removing a child",
+   bdrv_get_device_or_node_name(parent_bs));
+return;
+}
+
+QLIST_FOREACH(tmp, &parent_bs->children, next) {
+if (tmp == child) {
+break;
+}
+}
+
+if (!tmp) {
+error_setg(errp, "The node %s does not have a child named %s",
+   bdrv_get_device_or_node_name(parent_bs),
+   bdrv_get_device_or_node_name(child->bs));
+return;
+}
+
+parent_bs->drv->bdrv_del_child(parent_bs, child, errp);
+}
diff --git a/include/block/block.h b/include/block/block.h
index 3a73137..694ca76 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -541,4 +541,8 @@ void bdrv_drained_begin(BlockDriverState *bs);
  */
 void bdrv_drained_end(BlockDriverState *bs);
 
+void bdrv_add_child(BlockDriverState *parent, BlockDriverState *child,
+Error **errp);
+void bdrv_del_child(BlockDriverState *parent, BdrvChild *child, Error **errp);
+
 #endif
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 10d8759..7e238a0 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -317,6 +317,11 @@ struct BlockDriver {
  */
 void (*bdrv_drain)(BlockDriverState *bs);
 
+void (*bdrv_add_child)(BlockDriverState *parent, BlockDriverState *child,
+   Error **errp);
+void (*bdrv_del_child)(BlockDriverState *parent, BdrvChild *child,
+   Error **errp);
+
 QLIST_ENTRY(BlockDriver) list;
 };
 
-- 
1.9.3






[Qemu-block] [PATCH v14 2/3] quorum: implement bdrv_add_child() and bdrv_del_child()

2016-05-10 Thread Changlong Xie
From: Wen Congyang 

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Signed-off-by: Changlong Xie 
---
 block.c   |  8 +++---
 block/quorum.c| 78 +--
 include/block/block.h |  4 +++
 3 files changed, 84 insertions(+), 6 deletions(-)

diff --git a/block.c b/block.c
index 68cd3b2..4bdc6b3 100644
--- a/block.c
+++ b/block.c
@@ -1176,10 +1176,10 @@ BdrvChild *bdrv_root_attach_child(BlockDriverState *child_bs,
 return child;
 }
 
-static BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
-BlockDriverState *child_bs,
-const char *child_name,
-const BdrvChildRole *child_role)
+BdrvChild *bdrv_attach_child(BlockDriverState *parent_bs,
+ BlockDriverState *child_bs,
+ const char *child_name,
+ const BdrvChildRole *child_role)
 {
 BdrvChild *child = bdrv_root_attach_child(child_bs, child_name,
                                           child_role);
 QLIST_INSERT_HEAD(&parent_bs->children, child, next);
diff --git a/block/quorum.c b/block/quorum.c
index da15465..7cf49a8 100644
--- a/block/quorum.c
+++ b/block/quorum.c
@@ -14,6 +14,7 @@
  */
 
 #include "qemu/osdep.h"
+#include "qemu/cutils.h"
 #include "block/block_int.h"
 #include "qapi/qmp/qbool.h"
 #include "qapi/qmp/qdict.h"
@@ -67,6 +68,9 @@ typedef struct QuorumVotes {
 typedef struct BDRVQuorumState {
 BdrvChild **children;  /* children BlockDriverStates */
 int num_children;  /* children count */
+unsigned next_child_index;  /* the index of the next child that should
+ * be added
+ */
 int threshold; /* if less than threshold children reads gave the
 * same result a quorum error occurs.
 */
@@ -898,9 +902,9 @@ static int quorum_open(BlockDriverState *bs, QDict 
*options, int flags,
 ret = -EINVAL;
 goto exit;
 }
-if (s->num_children < 2) {
+if (s->num_children < 1) {
 error_setg(&local_err,
-   "Number of provided children must be greater than 1");
+   "Number of provided children must be 1 or more");
 ret = -EINVAL;
 goto exit;
 }
@@ -964,6 +968,7 @@ static int quorum_open(BlockDriverState *bs, QDict 
*options, int flags,
 
 opened[i] = true;
 }
+s->next_child_index = s->num_children;
 
 g_free(opened);
 goto exit;
@@ -1020,6 +1025,72 @@ static void quorum_attach_aio_context(BlockDriverState 
*bs,
 }
 }
 
+static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs,
+ Error **errp)
+{
+BDRVQuorumState *s = bs->opaque;
+BdrvChild *child;
+char indexstr[32];
+int ret;
+
+assert(s->num_children <= INT_MAX / sizeof(BdrvChild *));
+if (s->num_children == INT_MAX / sizeof(BdrvChild *) ||
+s->next_child_index == UINT_MAX) {
+error_setg(errp, "Too many children");
+return;
+}
+
+ret = snprintf(indexstr, 32, "children.%u", s->next_child_index);
+if (ret < 0 || ret >= 32) {
+error_setg(errp, "cannot generate child name");
+return;
+}
+s->next_child_index++;
+
+bdrv_drained_begin(bs);
+
+/* We can safely add the child now */
+bdrv_ref(child_bs);
+child = bdrv_attach_child(bs, child_bs, indexstr, &child_format);
+s->children = g_renew(BdrvChild *, s->children, s->num_children + 1);
+s->children[s->num_children++] = child;
+
+bdrv_drained_end(bs);
+}
+
+static void quorum_del_child(BlockDriverState *bs, BdrvChild *child,
+ Error **errp)
+{
+BDRVQuorumState *s = bs->opaque;
+int i;
+
+for (i = 0; i < s->num_children; i++) {
+if (s->children[i] == child) {
+break;
+}
+}
+
+/* we have checked it in bdrv_del_child() */
+assert(i < s->num_children);
+
+if (s->num_children <= s->threshold) {
+error_setg(errp,
+"The number of children cannot be lower than the vote threshold 
%d",
+s->threshold);
+return;
+}
+
+bdrv_drained_begin(bs);
+
+/* We can safely remove this child now */
+memmove(&s->children[i], &s->children[i + 1],
+(s->num_children - i - 1) * sizeof(BdrvChild *));
+s->children = g_renew(BdrvChild *, s->children, --s->num_children);
+bdrv_unref_child(bs, child);
+
+bdrv_drained_end(bs);
+}
+
 static void quorum_refresh_filename(BlockDriverState *bs, QDict *options)
 {
 BDRVQuorumState *s = bs->opaque;
@@ -1075,6 +1146,9 @@ static BlockDriver bdrv_quorum = {
 .bdrv_detach_aio_context= 

Re: [Qemu-block] [PATCH v13 2/3] quorum: implement bdrv_add_child() and bdrv_del_child()

2016-05-10 Thread Changlong Xie

On 05/09/2016 11:52 PM, Alberto Garcia wrote:

On Wed 13 Apr 2016 10:33:08 AM CEST, Changlong Xie wrote:

Sorry for the late reply!



Never mind : )


The patch looks good, I have some additional comments on top of what Max
wrote, nothing serious though :)


@@ -67,6 +68,9 @@ typedef struct QuorumVotes {
  typedef struct BDRVQuorumState {
  BdrvChild **children;  /* children BlockDriverStates */
  int num_children;  /* children count */
+uint64_t last_index;   /* indicate the child role name of the last
+* element of children array
+*/


I think you can use a regular 'unsigned' here, it's simpler and more
efficient. We're not going to have 2^32 Quorum children, are we? :)



Actually, I tried to use 'unsigned' here in my first version. But thinking
that if someone did a crazy child add/delete test (adding 10 children per
second), it would overflow in about 2^32/(60*60*24*365*10) = 13 years, I
chose uint64_t (2^64 is big enough) here.
Now, I agree with you, it's overwrought. Will use 'unsigned' in the next
version.



+static void quorum_add_child(BlockDriverState *bs, BlockDriverState *child_bs,
+ Error **errp)
+{
+BDRVQuorumState *s = bs->opaque;
+BdrvChild *child;
+char indexstr[32];
+int ret;
+
+assert(s->num_children <= INT_MAX / sizeof(BdrvChild *) &&
+   s->last_index <= UINT64_MAX);


That last condition has no practical effect. last_index is a uint64_t so
s->last_index <= UINT64_MAX is always going to be true.


Yes, it's redundant code.




+s->children = g_renew(BdrvChild *, s->children, s->num_children + 1);
+s->children[s->num_children++] = child;


Slightly simpler way:

s->children = g_renew(BdrvChild *, s->children, ++s->num_children);
s->children[s->num_children] = child;


That would overflow the array; it should be s->children[s->num_children - 1]
in the second line. I'll keep my original one.




But this is not very important, so you can leave it as it is now if you
prefer.


+/* child->name is "children.%d" */
+assert(!strncmp(child->name, "children.", 9));


I actually don't think there's anything wrong with this assertion, but
if you decide to keep it you can use g_str_has_prefix() instead, which
is a bit easier and more readable.



Just as Max said, it's an extra check, and I will remove it.


+/* We can safely remove this child now */
+memmove(&s->children[i], &s->children[i + 1],
+(s->num_children - i - 1) * sizeof(BdrvChild *));
+s->children = g_renew(BdrvChild *, s->children, --s->num_children);
+bdrv_unref_child(bs, child);


Question: do we want to decrement last_index if 'i' is the last
children? Something like:



I agree with Max, there seems to be no benefit here (although it would save
index numbers if i == s->num_children - 1).


Thanks
-Xie


if (i == s->num_children - 1) {
    s->last_index--;
} else {
    memmove(...)
}
s->children = g_renew(...)

Berto

