date:20170119

To avoid image lock failures.

Signed-off-by: Fam Zheng 
---
 tests/qemu-iotests/172 | 53 --
 tests/qemu-iotests/172.out | 50 ++-
 2 files changed, 54 insertions(+), 49 deletions(-)

diff --git a/tests/qemu-iotests/172 b/tests/qemu-iotests/172
index 1b7d3a1..f9d4cff 100755
--- a/tests/qemu-iotests/172
+++ b/tests/qemu-iotests/172
@@ -86,6 +86,9 @@ size=720k
 
 _make_test_img $size
 
+TEST_IMG="$TEST_IMG.2" _make_test_img $size
+TEST_IMG="$TEST_IMG.3" _make_test_img $size
+
 # Default drive semantics:
 #
 # By default you get a single empty floppy drive. You can override it with
@@ -105,7 +108,7 @@ echo === Using -fda/-fdb options ===
 
 check_floppy_qtree -fda "$TEST_IMG"
 check_floppy_qtree -fdb "$TEST_IMG"
-check_floppy_qtree -fda "$TEST_IMG" -fdb "$TEST_IMG"
+check_floppy_qtree -fda "$TEST_IMG" -fdb "$TEST_IMG.2"
 
 
 echo
@@ -114,7 +117,7 @@ echo === Using -drive options ===
 
 check_floppy_qtree -drive if=floppy,file="$TEST_IMG"
 check_floppy_qtree -drive if=floppy,file="$TEST_IMG",index=1
-check_floppy_qtree -drive if=floppy,file="$TEST_IMG" -drive 
if=floppy,file="$TEST_IMG",index=1
+check_floppy_qtree -drive if=floppy,file="$TEST_IMG" -drive 
if=floppy,file="$TEST_IMG.2",index=1
 
 echo
 echo
@@ -122,7 +125,7 @@ echo === Using -drive if=none and -global ===
 
 check_floppy_qtree -drive if=none,file="$TEST_IMG" -global isa-fdc.driveA=none0
 check_floppy_qtree -drive if=none,file="$TEST_IMG" -global isa-fdc.driveB=none0
-check_floppy_qtree -drive if=none,file="$TEST_IMG" -drive 
if=none,file="$TEST_IMG" \
+check_floppy_qtree -drive if=none,file="$TEST_IMG" -drive 
if=none,file="$TEST_IMG.2" \
-global isa-fdc.driveA=none0 -global isa-fdc.driveB=none1
 
 echo
@@ -131,7 +134,7 @@ echo === Using -drive if=none and -device ===
 
 check_floppy_qtree -drive if=none,file="$TEST_IMG" -device floppy,drive=none0
 check_floppy_qtree -drive if=none,file="$TEST_IMG" -device 
floppy,drive=none0,unit=1
-check_floppy_qtree -drive if=none,file="$TEST_IMG" -drive 
if=none,file="$TEST_IMG" \
+check_floppy_qtree -drive if=none,file="$TEST_IMG",file.disable-lock=on -drive 
if=none,file="$TEST_IMG.2" \
-device floppy,drive=none0 -device floppy,drive=none1,unit=1
 
 echo
@@ -139,58 +142,58 @@ echo
 echo === Mixing -fdX and -global ===
 
 # Working
-check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG" -global 
isa-fdc.driveB=none0
-check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG" -global 
isa-fdc.driveA=none0
+check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -global 
isa-fdc.driveB=none0
+check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -global 
isa-fdc.driveA=none0
 
 # Conflicting (-fdX wins)
-check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG" -global 
isa-fdc.driveA=none0
-check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG" -global 
isa-fdc.driveB=none0
+check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -global 
isa-fdc.driveA=none0
+check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -global 
isa-fdc.driveB=none0
 
 echo
 echo
 echo === Mixing -fdX and -device ===
 
 # Working
-check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG" -device 
floppy,drive=none0
-check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG" -device 
floppy,drive=none0,unit=1
+check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -device 
floppy,drive=none0
+check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -device 
floppy,drive=none0,unit=1
 
-check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG" -device 
floppy,drive=none0
-check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG" -device 
floppy,drive=none0,unit=0
+check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -device 
floppy,drive=none0
+check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -device 
floppy,drive=none0,unit=0
 
 # Conflicting
-check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG" -device 
floppy,drive=none0,unit=0
-check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG" -device 
floppy,drive=none0,unit=1
+check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -device 
floppy,drive=none0,unit=0
+check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -device 
floppy,drive=none0,unit=1
 
 echo
 echo
 echo === Mixing -drive and -device ===
 
 # Working
-check_floppy_qtree -drive if=floppy,file="$TEST_IMG" -drive 
if=none,file="$TEST_IMG" -device floppy,drive=none0
-check_floppy_qtree -drive if=floppy,file="$TEST_IMG" -drive 
if=none,file="$TEST_IMG" -device floppy,drive=none0,unit=1
+check_floppy_qtree -drive if=floppy,file="$TEST_IMG" -drive 
if=none,file="$TEST_IMG.2" -device floppy,drive=none0
+check_floppy_qtree -drive if=floppy,f

[Qemu-block] [PATCH v11 12/16] tests: Use null-co:// instead of /dev/null as the dummy image

Signed-off-by: Fam Zheng 
---
 tests/drive_del-test.c| 2 +-
 tests/nvme-test.c | 2 +-
 tests/usb-hcd-uhci-test.c | 2 +-
 tests/usb-hcd-xhci-test.c | 2 +-
 tests/virtio-blk-test.c   | 2 +-
 tests/virtio-scsi-test.c  | 4 ++--
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/drive_del-test.c b/tests/drive_del-test.c
index 121b9c9..2175139 100644
--- a/tests/drive_del-test.c
+++ b/tests/drive_del-test.c
@@ -92,7 +92,7 @@ static void test_after_failed_device_add(void)
 static void test_drive_del_device_del(void)
 {
 /* Start with a drive used by a device that unplugs instantaneously */
-qtest_start("-drive if=none,id=drive0,file=/dev/null,format=raw"
+qtest_start("-drive if=none,id=drive0,file=null-co://,format=raw"
 " -device virtio-scsi-pci"
 " -device scsi-hd,drive=drive0,id=dev0");
 
diff --git a/tests/nvme-test.c b/tests/nvme-test.c
index c8bece4..7674a44 100644
--- a/tests/nvme-test.c
+++ b/tests/nvme-test.c
@@ -22,7 +22,7 @@ int main(int argc, char **argv)
 g_test_init(&argc, &argv, NULL);
 qtest_add_func("/nvme/nop", nop);
 
-qtest_start("-drive id=drv0,if=none,file=/dev/null,format=raw "
+qtest_start("-drive id=drv0,if=none,file=null-co://,format=raw "
 "-device nvme,drive=drv0,serial=foo");
 ret = g_test_run();
 
diff --git a/tests/usb-hcd-uhci-test.c b/tests/usb-hcd-uhci-test.c
index e956b9c..24c3ea9 100644
--- a/tests/usb-hcd-uhci-test.c
+++ b/tests/usb-hcd-uhci-test.c
@@ -78,7 +78,7 @@ int main(int argc, char **argv)
 {
 const char *arch = qtest_get_arch();
 const char *cmd = "-device piix3-usb-uhci,id=uhci,addr=1d.0"
-  " -drive id=drive0,if=none,file=/dev/null,format=raw"
+  " -drive id=drive0,if=none,file=null-co://,format=raw"
   " -device usb-tablet,bus=uhci.0,port=1";
 int ret;
 
diff --git a/tests/usb-hcd-xhci-test.c b/tests/usb-hcd-xhci-test.c
index 22513e9..031764d 100644
--- a/tests/usb-hcd-xhci-test.c
+++ b/tests/usb-hcd-xhci-test.c
@@ -89,7 +89,7 @@ int main(int argc, char **argv)
 qtest_add_func("/xhci/pci/hotplug/usb-uas", test_usb_uas_hotplug);
 
 qtest_start("-device nec-usb-xhci,id=xhci"
-" -drive id=drive0,if=none,file=/dev/null,format=raw");
+" -drive id=drive0,if=none,file=null-co://,format=raw");
 ret = g_test_run();
 qtest_end();
 
diff --git a/tests/virtio-blk-test.c b/tests/virtio-blk-test.c
index 0e32e41..0fc7c99 100644
--- a/tests/virtio-blk-test.c
+++ b/tests/virtio-blk-test.c
@@ -63,7 +63,7 @@ static QOSState *pci_test_start(void)
 const char *arch = qtest_get_arch();
 char *tmp_path;
 const char *cmd = "-drive if=none,id=drive0,file=%s,format=raw "
-  "-drive if=none,id=drive1,file=/dev/null,format=raw "
+  "-drive if=none,id=drive1,file=null-co://,format=raw "
   "-device virtio-blk-pci,id=drv0,drive=drive0,"
   "addr=%x.%x";
 
diff --git a/tests/virtio-scsi-test.c b/tests/virtio-scsi-test.c
index 69220ef..8c27641 100644
--- a/tests/virtio-scsi-test.c
+++ b/tests/virtio-scsi-test.c
@@ -35,7 +35,7 @@ typedef struct {
 static QOSState *qvirtio_scsi_start(const char *extra_opts)
 {
 const char *arch = qtest_get_arch();
-const char *cmd = "-drive id=drv0,if=none,file=/dev/null,format=raw "
+const char *cmd = "-drive id=drv0,if=none,file=null-co://,format=raw "
   "-device virtio-scsi-pci,id=vs0 "
   "-device scsi-hd,bus=vs0.0,drive=drv0 %s";
 
@@ -195,7 +195,7 @@ static void hotplug(void)
 QDict *response;
 QOSState *qs;
 
-qs = qvirtio_scsi_start("-drive 
id=drv1,if=none,file=/dev/null,format=raw");
+qs = qvirtio_scsi_start("-drive 
id=drv1,if=none,file=null-co://,format=raw");
 response = qmp("{\"execute\": \"device_add\","
" \"arguments\": {"
"   \"driver\": \"scsi-hd\","
-- 
2.9.3

[Qemu-block] [PATCH v11 15/16] qcow2: Force "no other writer" lock on bs->file

Writing to the same qcow2 file from two QEMU processes at the same time
will never work correctly, so disable it even when the caller specifies
BDRV_O_RDWR.

Other formats are less vulnerable because they don't have internal
snapshots thus qemu-img is less often misused to create live snapshot.

Signed-off-by: Fam Zheng 
---
 block/qcow2.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/block/qcow2.c b/block/qcow2.c
index 96fb8a8..879361a 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1177,6 +1177,17 @@ static int qcow2_open(BlockDriverState *bs, QDict 
*options, int flags,
 }
 }
 
+if ((flags & BDRV_O_SHARE_RW) && (flags & BDRV_O_RDWR)) {
+/* Shared write is never a good idea for qcow2, override it.
+ * XXX: Use permission propagation and masking mechanism in op blockers
+ * API once it's there. */
+ret = bdrv_reopen(bs->file->bs, flags & ~BDRV_O_SHARE_RW, &local_err);
+if (ret) {
+error_propagate(errp, local_err);
+goto fail;
+}
+}
+
 #ifdef DEBUG_ALLOC
 {
 BdrvCheckResult result = {0};
-- 
2.9.3

[Qemu-block] [PATCH v11 14/16] file-posix: Implement image locking

This implements open flag sensible image locking for local file
and host device protocol.

virtlockd in libvirt locks the first byte, so we start looking at the
file bytes from 1.

Quoting what was proposed by Kevin Wolf , there are
four locking modes by combining two bits (BDRV_O_RDWR and
BDRV_O_SHARE_RW), and implemented by taking two locks:

Lock bytes:

* byte 1: I can't allow other processes to write to the image
* byte 2: I am writing to the image

Lock modes:

* shared writer (BDRV_O_RDWR | BDRV_O_SHARE_RW): Take shared lock on
  byte 2. Test whether byte 1 is locked using an exclusive lock, and
  fail if so.

* exclusive writer (BDRV_O_RDWR only): Take shared lock on byte 2. Test
  whether byte 1 is locked using an exclusive lock, and fail if so. Then
  take shared lock on byte 1. I suppose this is racy, but we can
  probably tolerate that.

* reader that can tolerate writers (BDRV_O_SHARE_RW only): Don't do anything

* reader that can't tolerate writers (neither bit is set): Take shared
  lock on byte 1. Test whether byte 2 is locked, and fail if so.

The complication is in the transactional reopen.  To make the reopen
logic managable, and allow better reuse, the code is internally
organized with a table from old mode to the new one.

Signed-off-by: Fam Zheng 
---
 block/file-posix.c | 668 -
 1 file changed, 665 insertions(+), 3 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 28b47d9..ff95cef 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -131,8 +131,40 @@ do { \
 
 #define MAX_BLOCKSIZE  4096
 
+/* Posix file locking bytes. Libvirt takes byte 0, we start from byte 0x10,
+ * leaving a few more bytes for its future use. */
+#define RAW_LOCK_BYTE_MIN 0x10
+#define RAW_LOCK_BYTE_NO_OTHER_WRITER 0x10
+#define RAW_LOCK_BYTE_WRITE   0x11
+
+/*
+ ** reader that can tolerate writers: Don't do anything
+ *
+ ** reader that can't tolerate writers: Take shared lock on byte 1. Test
+ *  byte 2 is unlocked.
+ *
+ ** shared writer: Take shared lock on byte 2. Test byte 1 is unlocked.
+ *
+ ** exclusive writer: Take exclusive locks on both bytes.
+ */
+
+typedef enum {
+/* Read only and accept other writers. */
+RAW_L_READ_SHARE_RW,
+/* Read only and try to forbid other writers. */
+RAW_L_READ,
+/* Read write and accept other writers. */
+RAW_L_WRITE_SHARE_RW,
+/* Read write and try to forbid other writers. */
+RAW_L_WRITE,
+} BDRVRawLockMode;
+
 typedef struct BDRVRawState {
 int fd;
+/* A dup of @fd to make manipulating lock easier, especially during reopen,
+ * where this will accept BDRVRawReopenState.lock_fd. */
+int lock_fd;
+bool disable_lock;
 int type;
 int open_flags;
 size_t buf_align;
@@ -146,11 +178,15 @@ typedef struct BDRVRawState {
 bool use_linux_aio:1;
 bool has_fallocate;
 bool needs_alignment;
+BDRVRawLockMode cur_lock_mode;
 } BDRVRawState;
 
 typedef struct BDRVRawReopenState {
 int fd;
+/* A dup of @fd used for acquiring lock. */
+int lock_fd;
 int open_flags;
+bool disable_lock;
 } BDRVRawReopenState;
 
 static int fd_open(BlockDriverState *bs);
@@ -368,6 +404,58 @@ static void raw_parse_flags(int bdrv_flags, int 
*open_flags)
 }
 }
 
+static int raw_lock_fd(int fd, BDRVRawLockMode mode, Error **errp)
+{
+int ret;
+assert(fd >= 0);
+switch (mode) {
+case RAW_L_READ_SHARE_RW:
+ret = qemu_unlock_fd(fd, RAW_LOCK_BYTE_MIN, 2);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to unlock fd");
+goto fail;
+}
+break;
+case RAW_L_READ:
+ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to lock share byte");
+goto fail;
+}
+ret = qemu_lock_fd_test(fd, RAW_LOCK_BYTE_WRITE, 1, true);
+if (ret) {
+error_setg_errno(errp, -ret, "Write byte lock is taken");
+goto fail;
+}
+break;
+case RAW_L_WRITE_SHARE_RW:
+ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_WRITE, 1, false);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to lock write byte");
+goto fail;
+}
+ret = qemu_lock_fd_test(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, true);
+if (ret) {
+error_setg_errno(errp, -ret, "Share byte lock is taken");
+goto fail;
+}
+break;
+case RAW_L_WRITE:
+ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_MIN, 2, true);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to lock image");
+goto fail;
+}
+break;
+default:
+abort();
+}
+return 0;
+fail:
+qemu_unlock_fd(fd, RAW_LOCK_BYTE_MIN, 2);
+return ret;
+}
+
 static void raw_parse_filename(const char *filename, QDict *options,

[Qemu-block] [PATCH v11 16/16] tests: Add test-image-lock

Signed-off-by: Fam Zheng 
---
 tests/Makefile.include   |   2 +
 tests/test-image-lock.c  | 200 +++
 tests/test-replication.c |   6 +-
 3 files changed, 205 insertions(+), 3 deletions(-)
 create mode 100644 tests/test-image-lock.c

diff --git a/tests/Makefile.include b/tests/Makefile.include
index 96f5970..7718a9b 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -55,6 +55,7 @@ check-unit-y += tests/test-hbitmap$(EXESUF)
 gcov-files-test-hbitmap-y = blockjob.c
 check-unit-y += tests/test-blockjob$(EXESUF)
 check-unit-y += tests/test-blockjob-txn$(EXESUF)
+check-unit-y += tests/test-image-lock$(EXESUF)
 check-unit-y += tests/test-x86-cpuid$(EXESUF)
 # all code tested by test-x86-cpuid is inside topology.h
 gcov-files-test-x86-cpuid-y =
@@ -516,6 +517,7 @@ tests/test-aio$(EXESUF): tests/test-aio.o 
$(test-block-obj-y)
 tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
 tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) 
$(test-util-obj-y)
 tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o 
$(test-block-obj-y) $(test-util-obj-y)
+tests/test-image-lock$(EXESUF): tests/test-image-lock.o $(test-block-obj-y) 
$(libqos-obj-y)
 tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y)
 tests/test-iov$(EXESUF): tests/test-iov.o $(test-util-obj-y)
 tests/test-hbitmap$(EXESUF): tests/test-hbitmap.o $(test-util-obj-y)
diff --git a/tests/test-image-lock.c b/tests/test-image-lock.c
new file mode 100644
index 000..df86ff0
--- /dev/null
+++ b/tests/test-image-lock.c
@@ -0,0 +1,200 @@
+/*
+ * Image lock tests
+ *
+ * Copyright 2016 Red Hat, Inc.
+ *
+ * Authors:
+ *  Fam Zheng 
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qapi/qmp/qbool.h"
+#include "sysemu/block-backend.h"
+
+#define DEBUG_IMAGE_LOCK_TEST 0
+#define DPRINTF(...) do { \
+if (DEBUG_IMAGE_LOCK_TEST) { \
+printf(__VA_ARGS__); \
+} \
+} while (0)
+
+#define TEST_IMAGE_SIZE 4096
+static char test_image[] = "/tmp/qtest.XX";
+static int test_image_fd;
+
+static BlockBackend *open_test_image(int flags, bool disable_lock)
+{
+QDict *opts = qdict_new();
+
+qdict_set_default_str(opts, "filename", test_image);
+qdict_set_default_str(opts, "driver", "file");
+if (disable_lock) {
+qdict_put(opts, "disable-lock", qbool_from_bool(true));
+}
+
+return blk_new_open(NULL, NULL, opts, flags | BDRV_O_ALLOW_RDWR, NULL);
+}
+
+#define RW true
+#define RO false
+#define SHARE true
+#define EXCLU false
+
+static struct CompatData {
+bool write_1;
+bool share_1;
+bool write_2;
+bool share_2;
+bool compatible;
+} compat_data[] = {
+/* Write 1, Share 1, Write 2, Share 2, Compatible. */
+{ RO,   SHARE,   RO,  SHARE,   true,  },
+{ RO,   SHARE,   RO,  EXCLU,   true,  },
+{ RO,   SHARE,   RW,  SHARE,   true,  },
+{ RO,   SHARE,   RW,  EXCLU,   true,  },
+
+{ RO,   EXCLU,   RO,  EXCLU,   true,  },
+{ RO,   EXCLU,   RW,  SHARE,   false, },
+{ RO,   EXCLU,   RW,  EXCLU,   false, },
+
+{ RW,   SHARE,   RW,  SHARE,   true, },
+{ RW,   SHARE,   RW,  EXCLU,   false, },
+
+{ RW,   EXCLU,   RW,  EXCLU,   false, },
+};
+
+/* Test one combination scenario.
+ *
+ * @flags1: The flags of the first blk.
+ * @flags2: The flags of the second blk.
+ * @disable1: The value for raw-posix disable-lock option of the first blk.
+ * @disable2: The value for raw-posix disable-lock option of the second blk.
+ * @from_reopen: Whether or not the first blk should get flags1 from a reopen.
+ * @initial: The source flags from which the blk1 is reopened, only
+ *effective if @from_reopen is true.
+ */
+static void do_test_compat_one(int flags1, int flags2,
+   bool disable1, bool disable2,
+   bool from_reopen, int initial_flags,
+   bool compatible)
+{
+BlockBackend *blk1, *blk2;
+
+DPRINTF("\n===\ndo test compat one\n");
+DPRINTF("flags %x %x\n", flags1, flags2);
+DPRINTF("disable %d %d\n", disable1, disable2);
+DPRINTF("from reopen %d, initial flags %d\n", from_reopen, initial_flags);
+DPRINTF("compatible %d\n", compatible);
+if (!from_reopen) {
+blk1 = open_test_image(flags1, disable1);
+} else {
+int ret;
+blk1 = open_test_image(initial_flags, disable1);
+BlockReopenQueue *rq = NULL;
+
+rq = bdrv_reopen_queue(rq, blk_bs(blk1), NULL, flags1);
+ret = bdrv_reopen_multiple(blk_get_aio_context(blk1), rq, 
&error_abort);
+g_assert_cmpint(ret, ==, 0);
+}
+g_assert_nonnull(blk1);
+g_assert

[Qemu-block] [PATCH v11 05/16] block: Set "share-rw" flag in drive-backup when sync=none

In this case we may open the source's backing image chain multiple
times. Setting share flag means the new open won't try to acquire or
check any lock, once we implement image locking.

Signed-off-by: Fam Zheng 
---
 blockdev.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/blockdev.c b/blockdev.c
index 245e1e1..c97e97f 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3177,6 +3177,9 @@ static BlockJob *do_drive_backup(DriveBackup *backup, 
BlockJobTxn *txn,
 }
 if (backup->sync == MIRROR_SYNC_MODE_NONE) {
 source = bs;
+/* FIXME: block layer should really open target with BDRV_O_NO_BACKING
+ * and reuse source's backing chain, if they share one. */
+flags |= BDRV_O_SHARE_RW;
 }
 
 size = bdrv_getlength(bs);
-- 
2.9.3

[Qemu-block] [PATCH v11 02/16] block: Define BDRV_O_SHARE_RW

Signed-off-by: Fam Zheng 
---
 include/block/block.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/block/block.h b/include/block/block.h
index 8b0dcda..243839d 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -97,6 +97,8 @@ typedef struct HDGeometry {
   select an appropriate protocol driver,
   ignoring the format layer */
 #define BDRV_O_NO_IO   0x1 /* don't initialize for I/O */
+#define BDRV_O_SHARE_RW0x2 /* accept shared read-write from other users
+  of the image */
 
 #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
 
-- 
2.9.3

[Qemu-block] [PATCH v11 09/16] iotests: 085: Avoid image locking conflict

In the case where we test the expected error when a blockdev-snapshot
target already has a backing image, backing chain is opened multiple
times. This will be a problem when we use image locking, so use a
different backing file that is not already open.

Signed-off-by: Fam Zheng 
---
 tests/qemu-iotests/085 | 32 +++-
 tests/qemu-iotests/085.out |  3 ++-
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/tests/qemu-iotests/085 b/tests/qemu-iotests/085
index c53e97f..c91d78d 100755
--- a/tests/qemu-iotests/085
+++ b/tests/qemu-iotests/085
@@ -87,24 +87,26 @@ function create_group_snapshot()
 }
 
 # ${1}: unique identifier for the snapshot filename
-# ${2}: true: open backing images; false: don't open them (default)
+# ${2}: extra_params to the blockdev-add command
+# ${3}: filename
+function do_blockdev_add()
+{
+cmd="{ 'execute': 'blockdev-add', 'arguments':
+   { 'driver': 'qcow2', 'node-name': 'snap_${1}', ${2}
+ 'file':
+ { 'driver': 'file', 'filename': '${3}',
+   'node-name': 'file_${1}' } } }"
+_send_qemu_cmd $h "${cmd}" "return"
+}
+
+# ${1}: unique identifier for the snapshot filename
 function add_snapshot_image()
 {
-if [ "${2}" = "true" ]; then
-extra_params=""
-else
-extra_params="'backing': '', "
-fi
 base_image="${TEST_DIR}/$((${1}-1))-${snapshot_virt0}"
 snapshot_file="${TEST_DIR}/${1}-${snapshot_virt0}"
 _make_test_img -b "${base_image}" "$size"
 mv "${TEST_IMG}" "${snapshot_file}"
-cmd="{ 'execute': 'blockdev-add', 'arguments':
-   { 'driver': 'qcow2', 'node-name': 'snap_${1}', ${extra_params}
- 'file':
- { 'driver': 'file', 'filename': '${snapshot_file}',
-   'node-name': 'file_${1}' } } }"
-_send_qemu_cmd $h "${cmd}" "return"
+do_blockdev_add "$1" "'backing': '', " "${snapshot_file}"
 }
 
 # ${1}: unique identifier for the snapshot filename
@@ -222,7 +224,11 @@ echo === Invalid command - snapshot node has a backing 
image ===
 echo
 
 SNAPSHOTS=$((${SNAPSHOTS}+1))
-add_snapshot_image ${SNAPSHOTS} true
+
+_make_test_img "$size"
+mv "${TEST_IMG}" "${TEST_IMG}.base"
+_make_test_img -b "${TEST_IMG}.base" "$size"
+do_blockdev_add ${SNAPSHOTS} "" "${TEST_IMG}"
 blockdev_snapshot ${SNAPSHOTS} error
 
 echo
diff --git a/tests/qemu-iotests/085.out b/tests/qemu-iotests/085.out
index 08e4bb7..7bbf84d 100644
--- a/tests/qemu-iotests/085.out
+++ b/tests/qemu-iotests/085.out
@@ -78,7 +78,8 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 
backing_file=TEST_DIR/
 
 === Invalid command - snapshot node has a backing image ===
 
-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 
backing_file=TEST_DIR/12-snapshot-v0.IMGFMT
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 
backing_file=TEST_DIR/t.IMGFMT.base
 {"return": {}}
 {"error": {"class": "GenericError", "desc": "The snapshot already has a 
backing image"}}
 
-- 
2.9.3

[Qemu-block] [PATCH v11 13/16] tests: Disable image lock in test-replication

The COLO block replication architecture requires one disk to be shared
between primary and secondary, in the test both processes use posix file
protocol (instead of over NBD) so it is affected by image locking.
Disable the lock.

Signed-off-by: Fam Zheng 
---
 tests/test-replication.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/test-replication.c b/tests/test-replication.c
index fac2da3..5bede49 100644
--- a/tests/test-replication.c
+++ b/tests/test-replication.c
@@ -179,7 +179,8 @@ static BlockBackend *start_primary(void)
 char *cmdline;
 
 cmdline = g_strdup_printf("driver=replication,mode=primary,node-name=xxx,"
-  "file.driver=qcow2,file.file.filename=%s"
+  "file.driver=qcow2,file.file.filename=%s,"
+  "file.file.disable-lock=on"
   , p_local_disk);
 opts = qemu_opts_parse_noisily(&qemu_drive_opts, cmdline, false);
 g_free(cmdline);
@@ -310,7 +311,9 @@ static BlockBackend *start_secondary(void)
 Error *local_err = NULL;
 
 /* add s_local_disk and forge S_LOCAL_DISK_ID */
-cmdline = g_strdup_printf("file.filename=%s,driver=qcow2", s_local_disk);
+cmdline = g_strdup_printf("file.filename=%s,driver=qcow2,"
+  "file.file.disable-lock=on",
+  s_local_disk);
 opts = qemu_opts_parse_noisily(&qemu_drive_opts, cmdline, false);
 g_free(cmdline);
 
@@ -331,8 +334,10 @@ static BlockBackend *start_secondary(void)
 /* add S_(ACTIVE/HIDDEN)_DISK and forge S_ID */
 cmdline = g_strdup_printf("driver=replication,mode=secondary,top-id=%s,"
   "file.driver=qcow2,file.file.filename=%s,"
+  "file.file.disable-lock=on",
   "file.backing.driver=qcow2,"
   "file.backing.file.filename=%s,"
+  "file.backing.file.disable-lock=on",
   "file.backing.backing=%s"
   , S_ID, s_active_disk, s_hidden_disk
   , S_LOCAL_DISK_ID);
-- 
2.9.3

[Qemu-block] [PATCH v11 08/16] iotests: 087: Don't attach test image twice

The test scenario doesn't require the same image, instead it focuses on
the duplicated node-name, so use null-co to avoid locking conflict.

Signed-off-by: Fam Zheng 
---
 tests/qemu-iotests/087 | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/qemu-iotests/087 b/tests/qemu-iotests/087
index 9de57dd..6d52f7d 100755
--- a/tests/qemu-iotests/087
+++ b/tests/qemu-iotests/087
@@ -82,8 +82,7 @@ run_qemu -drive 
driver=$IMGFMT,id=disk,node-name=test-node,file="$TEST_IMG" <

[Qemu-block] [PATCH v11 06/16] iotests: 055: Don't attach the drive to vm for drive-backup

Signed-off-by: Fam Zheng 
---
 tests/qemu-iotests/055 | 32 ++--
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/tests/qemu-iotests/055 b/tests/qemu-iotests/055
index 1d3fd04..20a7596 100755
--- a/tests/qemu-iotests/055
+++ b/tests/qemu-iotests/055
@@ -455,17 +455,18 @@ class TestDriveCompression(iotests.QMPTestCase):
 except OSError:
 pass
 
-def do_prepare_drives(self, fmt, args):
+def do_prepare_drives(self, fmt, args, attach):
 self.vm = iotests.VM().add_drive(test_img)
 
 qemu_img('create', '-f', fmt, blockdev_target_img,
  str(TestDriveCompression.image_len), *args)
-self.vm.add_drive(blockdev_target_img, format=fmt)
+if attach:
+self.vm.add_drive(blockdev_target_img, format=fmt)
 
 self.vm.launch()
 
-def do_test_compress_complete(self, cmd, format, **args):
-self.do_prepare_drives(format['type'], format['args'])
+def do_test_compress_complete(self, cmd, format, attach, **args):
+self.do_prepare_drives(format['type'], format['args'], attach)
 
 self.assert_no_active_block_jobs()
 
@@ -481,15 +482,16 @@ class TestDriveCompression(iotests.QMPTestCase):
 
 def test_complete_compress_drive_backup(self):
 for format in TestDriveCompression.fmt_supports_compression:
-self.do_test_compress_complete('drive-backup', format,
+self.do_test_compress_complete('drive-backup', format, 
attach=False,
target=blockdev_target_img, 
mode='existing')
 
 def test_complete_compress_blockdev_backup(self):
 for format in TestDriveCompression.fmt_supports_compression:
-self.do_test_compress_complete('blockdev-backup', format, 
target='drive1')
+self.do_test_compress_complete('blockdev-backup', format,
+   attach=True, target='drive1')
 
-def do_test_compress_cancel(self, cmd, format, **args):
-self.do_prepare_drives(format['type'], format['args'])
+def do_test_compress_cancel(self, cmd, format, attach, **args):
+self.do_prepare_drives(format['type'], format['args'], attach)
 
 self.assert_no_active_block_jobs()
 
@@ -503,15 +505,16 @@ class TestDriveCompression(iotests.QMPTestCase):
 
 def test_compress_cancel_drive_backup(self):
 for format in TestDriveCompression.fmt_supports_compression:
-self.do_test_compress_cancel('drive-backup', format,
+self.do_test_compress_cancel('drive-backup', format, attach=False,
  target=blockdev_target_img, 
mode='existing')
 
 def test_compress_cancel_blockdev_backup(self):
for format in TestDriveCompression.fmt_supports_compression:
-self.do_test_compress_cancel('blockdev-backup', format, 
target='drive1')
+self.do_test_compress_cancel('blockdev-backup', format, 
attach=True,
+ target='drive1')
 
-def do_test_compress_pause(self, cmd, format, **args):
-self.do_prepare_drives(format['type'], format['args'])
+def do_test_compress_pause(self, cmd, format, attach, **args):
+self.do_prepare_drives(format['type'], format['args'], attach)
 
 self.assert_no_active_block_jobs()
 
@@ -543,12 +546,13 @@ class TestDriveCompression(iotests.QMPTestCase):
 
 def test_compress_pause_drive_backup(self):
 for format in TestDriveCompression.fmt_supports_compression:
-self.do_test_compress_pause('drive-backup', format,
+self.do_test_compress_pause('drive-backup', format, attach=False,
 target=blockdev_target_img, 
mode='existing')
 
 def test_compress_pause_blockdev_backup(self):
 for format in TestDriveCompression.fmt_supports_compression:
-self.do_test_compress_pause('blockdev-backup', format, 
target='drive1')
+self.do_test_compress_pause('blockdev-backup', format, attach=True,
+target='drive1')
 
 if __name__ == '__main__':
 iotests.main(supported_fmts=['raw', 'qcow2'])
-- 
2.9.3

[Qemu-block] [PATCH v11 10/16] iotests: 091: Quit QEMU before checking image

Signed-off-by: Fam Zheng 
---
 tests/qemu-iotests/091 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/qemu-iotests/091 b/tests/qemu-iotests/091
index 32bbd56..10ac4a8 100755
--- a/tests/qemu-iotests/091
+++ b/tests/qemu-iotests/091
@@ -95,7 +95,9 @@ echo "vm2: qemu process running successfully"
 echo "vm2: flush io, and quit"
 _send_qemu_cmd $h2 'qemu-io disk flush' "(qemu)"
 _send_qemu_cmd $h2 'quit' ""
+_send_qemu_cmd $h1 'quit' ""
 
+wait
 echo "Check image pattern"
 ${QEMU_IO} -c "read -P 0x22 0 4M" "${TEST_IMG}" | _filter_testdir | 
_filter_qemu_io
 
-- 
2.9.3

[Qemu-block] [PATCH v11 04/16] qemu-img: Set "share-rw" flag in read-only commands

Checking the status of an image when it is being used by guest is
usually useful, and there is no risk of corrupting data, so don't let
the upcoming image locking feature limit this use case.

Signed-off-by: Fam Zheng 
---
 qemu-img.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 5df66fe..6a091e0 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -705,6 +705,10 @@ static int img_check(int argc, char **argv)
 return 1;
 }
 
+if (!(flags & BDRV_O_RDWR)) {
+flags |= BDRV_O_SHARE_RW;
+}
+
 blk = img_open(image_opts, filename, fmt, flags, writethrough, quiet);
 if (!blk) {
 return 1;
@@ -1238,6 +1242,7 @@ static int img_compare(int argc, char **argv)
 goto out3;
 }
 
+flags |= BDRV_O_SHARE_RW;
 blk1 = img_open(image_opts, filename1, fmt1, flags, writethrough, quiet);
 if (!blk1) {
 ret = 2;
@@ -2286,7 +2291,8 @@ static ImageInfoList *collect_image_info_list(bool 
image_opts,
 g_hash_table_insert(filenames, (gpointer)filename, NULL);
 
 blk = img_open(image_opts, filename, fmt,
-   BDRV_O_NO_BACKING | BDRV_O_NO_IO, false, false);
+   BDRV_O_NO_BACKING | BDRV_O_NO_IO | BDRV_O_SHARE_RW,
+   false, false);
 if (!blk) {
 goto err;
 }
@@ -2612,7 +2618,7 @@ static int img_map(int argc, char **argv)
 return 1;
 }
 
-blk = img_open(image_opts, filename, fmt, 0, false, false);
+blk = img_open(image_opts, filename, fmt, BDRV_O_SHARE_RW, false, false);
 if (!blk) {
 return 1;
 }
-- 
2.9.3

[Qemu-block] [PATCH v11 07/16] iotests: 030: Read-only open image for getting map

Signed-off-by: Fam Zheng 
---
 tests/qemu-iotests/030 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030
index 54db54a..fe0c73f 100755
--- a/tests/qemu-iotests/030
+++ b/tests/qemu-iotests/030
@@ -114,7 +114,7 @@ class TestSingleDrive(iotests.QMPTestCase):
 self.assert_no_active_block_jobs()
 
 # The image map is empty before the operation
-empty_map = qemu_io('-f', iotests.imgfmt, '-c', 'map', test_img)
+empty_map = qemu_io('-f', iotests.imgfmt, '-c', 'map', '-r', test_img)
 
 # This is a no-op: no data should ever be copied from the base image
 result = self.vm.qmp('block-stream', device='drive0', base=mid_img)
@@ -125,7 +125,7 @@ class TestSingleDrive(iotests.QMPTestCase):
 self.assert_no_active_block_jobs()
 self.vm.shutdown()
 
-self.assertEqual(qemu_io('-f', iotests.imgfmt, '-c', 'map', test_img),
+self.assertEqual(qemu_io('-f', iotests.imgfmt, '-c', 'map', '-r', 
test_img),
  empty_map, 'image file map changed after a no-op')
 
 def test_stream_partial(self):
-- 
2.9.3

[Qemu-block] [PATCH v11 01/16] osdep: Add qemu_lock_fd and qemu_unlock_fd

They are wrappers of POSIX fcntl "file private locking", with a
convenient "try lock" wrapper implemented with F_OFD_GETLK.

Signed-off-by: Fam Zheng 
---
 include/qemu/osdep.h |  3 +++
 util/osdep.c | 48 
 2 files changed, 51 insertions(+)

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 689f253..e864fe8 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -297,6 +297,9 @@ int qemu_close(int fd);
 #ifndef _WIN32
 int qemu_dup(int fd);
 #endif
+int qemu_lock_fd(int fd, int64_t start, int64_t len, bool exclusive);
+int qemu_unlock_fd(int fd, int64_t start, int64_t len);
+int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive);
 
 #if defined(__HAIKU__) && defined(__i386__)
 #define FMT_pid "%ld"
diff --git a/util/osdep.c b/util/osdep.c
index 06fb1cf..3de4a18 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -140,6 +140,54 @@ static int qemu_parse_fdset(const char *param)
 {
 return qemu_parse_fd(param);
 }
+
+static int qemu_lock_fcntl(int fd, int64_t start, int64_t len, int fl_type)
+{
+#ifdef F_OFD_SETLK
+int ret;
+struct flock fl = {
+.l_whence = SEEK_SET,
+.l_start  = start,
+.l_len= len,
+.l_type   = fl_type,
+};
+ret = fcntl(fd, F_OFD_SETLK, &fl);
+return ret == -1 ? -errno : 0;
+#else
+return -ENOTSUP;
+#endif
+}
+
+int qemu_lock_fd(int fd, int64_t start, int64_t len, bool exclusive)
+{
+return qemu_lock_fcntl(fd, start, len, exclusive ? F_WRLCK : F_RDLCK);
+}
+
+int qemu_unlock_fd(int fd, int64_t start, int64_t len)
+{
+return qemu_lock_fcntl(fd, start, len, F_UNLCK);
+}
+
+int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive)
+{
+#ifdef F_OFD_SETLK
+int ret;
+struct flock fl = {
+.l_whence = SEEK_SET,
+.l_start  = start,
+.l_len= len,
+.l_type   = exclusive ? F_WRLCK : F_RDLCK,
+};
+ret = fcntl(fd, F_OFD_GETLK, &fl);
+if (ret == -1) {
+return -errno;
+} else {
+return fl.l_type == F_UNLCK ? 0 : -EAGAIN;
+}
+#else
+return -ENOTSUP;
+#endif
+}
 #endif
 
 /*
-- 
2.9.3

[Qemu-block] [PATCH v11 03/16] qemu-io: Set "share-rw" flag together with read-only

qemu-io is a low level tool to read or modify guest visible data, which
implies the user knows very well what is being done. Allowing reading
from a locked image is harmless in most cases, so do it.

Signed-off-by: Fam Zheng 
---
 qemu-io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/qemu-io.c b/qemu-io.c
index 23a229f..f000504 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -585,6 +585,8 @@ int main(int argc, char **argv)
 /* open the device */
 if (!readonly) {
 flags |= BDRV_O_RDWR;
+} else {
+flags |= BDRV_O_SHARE_RW;
 }
 
 if ((argc - optind) == 1) {
-- 
2.9.3

[Qemu-block] [PATCH v11 00/16] block: Image locking series

v11: Move lock bytes from 1-2 to 0x10-0x12. [Daniel]

v10: While we still don't have comprehensive propagation mechanism that will be
provided by new op blocker system for "permissive modes", the locking enabled
by default is regardlessly useful and long overdue. So I think we should merge
this for 2.9 and build user options on top later when the op blocker API
settles.

Address comments from Max and Eric:
  - Use F_OFD_GETLK instead of opening r/w for ro images. [Max]
  - Lock both bytes exclusively for non-shared write. [Max]
  - Spell fixes. [Eric]
  - Fix test matrix. [Max]
  - Comment tweaks. [Max]
  - Return code cleanups. [Max]
  - Don't abuse "disable_lock" for migration. [Max]
  - Use bs->exact_filename instead of bs->filename. [Max]
  - Force protect qcow2 concurrent write.
  - Fix indentation. [Max]
  - Always use re-open for lockfd instead of dup. [Max]
  - Fall through to "abort" where "prepare" failed. [Max]
  - Fix option handling in raw_reopen_handle_lock. [Max]
  - Use "error_abort" in commit and abort. [Max]
  - Fix cleanup of raw_reopen_handle_lock() failure. [Max]
  - Add a patch for qcow2 to mask BDRV_O_SHARE_RW if r/w.
  - Rebase and fix new more cases that will be broken by "lock by default".

Fam Zheng (16):
  osdep: Add qemu_lock_fd and qemu_unlock_fd
  block: Define BDRV_O_SHARE_RW
  qemu-io: Set "share-rw" flag together with read-only
  qemu-img: Set "share-rw" flag in read-only commands
  block: Set "share-rw" flag in drive-backup when sync=none
  iotests: 055: Don't attach the drive to vm for drive-backup
  iotests: 030: Read-only open image for getting map
  iotests: 087: Don't attach test image twice
  iotests: 085: Avoid image locking conflict
  iotests: 091: Quit QEMU before checking image
  iotests: 172: Use separate images for multiple devices
  tests: Use null-co:// instead of /dev/null as the dummy image
  tests: Disable image lock in test-replication
  file-posix: Implement image locking
  qcow2: Force "no other writer" lock on bs->file
  tests: Add test-image-lock

 block/file-posix.c | 668 -
 block/qcow2.c  |  11 +
 blockdev.c |   3 +
 include/block/block.h  |   2 +
 include/qemu/osdep.h   |   3 +
 qemu-img.c |  10 +-
 qemu-io.c  |   2 +
 tests/Makefile.include |   2 +
 tests/drive_del-test.c |   2 +-
 tests/nvme-test.c  |   2 +-
 tests/qemu-iotests/030 |   4 +-
 tests/qemu-iotests/055 |  32 ++-
 tests/qemu-iotests/085 |  32 ++-
 tests/qemu-iotests/085.out |   3 +-
 tests/qemu-iotests/087 |   6 +-
 tests/qemu-iotests/091 |   2 +
 tests/qemu-iotests/172 |  53 ++--
 tests/qemu-iotests/172.out |  50 ++--
 tests/test-image-lock.c| 200 ++
 tests/test-replication.c   |   9 +-
 tests/usb-hcd-uhci-test.c  |   2 +-
 tests/usb-hcd-xhci-test.c  |   2 +-
 tests/virtio-blk-test.c|   2 +-
 tests/virtio-scsi-test.c   |   4 +-
 util/osdep.c   |  48 
 25 files changed, 1057 insertions(+), 97 deletions(-)
 create mode 100644 tests/test-image-lock.c

-- 
2.9.3

[Qemu-block] [PATCH v3 3/6] replication: Split out backup_do_checkpoint() from secondary_do_checkpoint()

The helper backup_do_checkpoint() will be used for primary related
codes. Here we split it out from secondary_do_checkpoint().

Besides, it is unnecessary to call backup_do_checkpoint() in
replication starting and normally stop replication path.
We only need call it while do real checkpointing.

Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Changlong Xie 
Signed-off-by: zhanghailiang 
---
 block/replication.c | 36 +++-
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index b96a3e5..3a44728 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -332,20 +332,8 @@ static bool 
replication_recurse_is_first_non_filter(BlockDriverState *bs,
 
 static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
 {
-Error *local_err = NULL;
 int ret;
 
-if (!s->secondary_disk->bs->job) {
-error_setg(errp, "Backup job was cancelled unexpectedly");
-return;
-}
-
-backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-return;
-}
-
 ret = s->active_disk->bs->drv->bdrv_make_empty(s->active_disk->bs);
 if (ret < 0) {
 error_setg(errp, "Cannot make active disk empty");
@@ -558,6 +546,8 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 return;
 }
 block_job_start(job);
+
+secondary_do_checkpoint(s, errp);
 break;
 default:
 aio_context_release(aio_context);
@@ -566,10 +556,6 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 
 s->replication_state = BLOCK_REPLICATION_RUNNING;
 
-if (s->mode == REPLICATION_MODE_SECONDARY) {
-secondary_do_checkpoint(s, errp);
-}
-
 s->error = 0;
 aio_context_release(aio_context);
 }
@@ -579,13 +565,29 @@ static void replication_do_checkpoint(ReplicationState 
*rs, Error **errp)
 BlockDriverState *bs = rs->opaque;
 BDRVReplicationState *s;
 AioContext *aio_context;
+Error *local_err = NULL;
 
 aio_context = bdrv_get_aio_context(bs);
 aio_context_acquire(aio_context);
 s = bs->opaque;
 
-if (s->mode == REPLICATION_MODE_SECONDARY) {
+switch (s->mode) {
+case REPLICATION_MODE_PRIMARY:
+break;
+case REPLICATION_MODE_SECONDARY:
+if (!s->secondary_disk->bs->job) {
+error_setg(errp, "Backup job was cancelled unexpectedly");
+break;
+}
+backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+break;
+}
 secondary_do_checkpoint(s, errp);
+break;
+default:
+abort();
 }
 aio_context_release(aio_context);
 }
-- 
1.8.3.1

[Qemu-block] [PATCH v3 4/6] replication: fix code logic with the new shared_disk option

Some code logic only be needed in non-shared disk, here
we adjust these codes to prepare for shared disk scenario.

Reviewed-by: Stefan Hajnoczi 
Signed-off-by: zhanghailiang 
---
 block/replication.c | 47 ---
 1 file changed, 28 insertions(+), 19 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index 3a44728..70ec08c 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -531,21 +531,28 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 aio_context_release(aio_context);
 return;
 }
-bdrv_op_block_all(top_bs, s->blocker);
-bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
 
-job = backup_job_create(NULL, s->secondary_disk->bs, 
s->hidden_disk->bs,
-0, MIRROR_SYNC_MODE_NONE, NULL, false,
+/*
+ * Only in the case of non-shared disk,
+ * the backup job is in the secondary side
+ */
+if (!s->is_shared_disk) {
+bdrv_op_block_all(top_bs, s->blocker);
+bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
+job = backup_job_create(NULL, s->secondary_disk->bs,
+s->hidden_disk->bs, 0,
+MIRROR_SYNC_MODE_NONE, NULL, false,
 BLOCKDEV_ON_ERROR_REPORT,
 BLOCKDEV_ON_ERROR_REPORT, BLOCK_JOB_INTERNAL,
 backup_job_completed, bs, NULL, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-backup_job_cleanup(bs);
-aio_context_release(aio_context);
-return;
+if (local_err) {
+error_propagate(errp, local_err);
+backup_job_cleanup(bs);
+aio_context_release(aio_context);
+return;
+}
+block_job_start(job);
 }
-block_job_start(job);
 
 secondary_do_checkpoint(s, errp);
 break;
@@ -575,14 +582,16 @@ static void replication_do_checkpoint(ReplicationState 
*rs, Error **errp)
 case REPLICATION_MODE_PRIMARY:
 break;
 case REPLICATION_MODE_SECONDARY:
-if (!s->secondary_disk->bs->job) {
-error_setg(errp, "Backup job was cancelled unexpectedly");
-break;
-}
-backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-break;
+if (!s->is_shared_disk) {
+if (!s->secondary_disk->bs->job) {
+error_setg(errp, "Backup job was cancelled unexpectedly");
+break;
+}
+backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+break;
+}
 }
 secondary_do_checkpoint(s, errp);
 break;
@@ -663,7 +672,7 @@ static void replication_stop(ReplicationState *rs, bool 
failover, Error **errp)
  * before the BDS is closed, because we will access hidden
  * disk, secondary disk in backup_job_completed().
  */
-if (s->secondary_disk->bs->job) {
+if (!s->is_shared_disk && s->secondary_disk->bs->job) {
 block_job_cancel_sync(s->secondary_disk->bs->job);
 }
 
-- 
1.8.3.1

[Qemu-block] [PATCH v3 6/6] nbd/replication: implement .bdrv_get_info() for nbd and replication driver

Without this callback, there will be an error reports in the primary side:
"qemu-system-x86_64: Couldn't determine the cluster size of the target image,
which has no backing file: Operation not supported
Aborting, since this may create an unusable destination image"

For nbd driver, it doesn't have cluster size, so here we return
a fake value for it.

This patch should be dropped if Eric's nbd patch be merged.
https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg03567.html

Cc: Eric Blake 
Signed-off-by: zhanghailiang 
Signed-off-by: Wen Congyang 
---
 block/nbd.c | 12 
 block/replication.c |  6 ++
 2 files changed, 18 insertions(+)

diff --git a/block/nbd.c b/block/nbd.c
index 35f24be..b71a13d 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -43,6 +43,8 @@
 
 #define EN_OPTSTR ":exportname="
 
+#define NBD_FAKE_CLUSTER_SIZE 512
+
 typedef struct BDRVNBDState {
 NBDClientSession client;
 
@@ -552,6 +554,13 @@ static void nbd_refresh_filename(BlockDriverState *bs, 
QDict *options)
 bs->full_open_options = opts;
 }
 
+static int nbd_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+bdi->cluster_size  = NBD_FAKE_CLUSTER_SIZE;
+
+return 0;
+}
+
 static BlockDriver bdrv_nbd = {
 .format_name= "nbd",
 .protocol_name  = "nbd",
@@ -569,6 +578,7 @@ static BlockDriver bdrv_nbd = {
 .bdrv_detach_aio_context= nbd_detach_aio_context,
 .bdrv_attach_aio_context= nbd_attach_aio_context,
 .bdrv_refresh_filename  = nbd_refresh_filename,
+.bdrv_get_info  = nbd_get_info,
 };
 
 static BlockDriver bdrv_nbd_tcp = {
@@ -588,6 +598,7 @@ static BlockDriver bdrv_nbd_tcp = {
 .bdrv_detach_aio_context= nbd_detach_aio_context,
 .bdrv_attach_aio_context= nbd_attach_aio_context,
 .bdrv_refresh_filename  = nbd_refresh_filename,
+.bdrv_get_info  = nbd_get_info,
 };
 
 static BlockDriver bdrv_nbd_unix = {
@@ -607,6 +618,7 @@ static BlockDriver bdrv_nbd_unix = {
 .bdrv_detach_aio_context= nbd_detach_aio_context,
 .bdrv_attach_aio_context= nbd_attach_aio_context,
 .bdrv_refresh_filename  = nbd_refresh_filename,
+.bdrv_get_info  = nbd_get_info,
 };
 
 static void bdrv_nbd_init(void)
diff --git a/block/replication.c b/block/replication.c
index a0b3e41..e4f8069 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -731,6 +731,11 @@ static void replication_stop(ReplicationState *rs, bool 
failover, Error **errp)
 aio_context_release(aio_context);
 }
 
+static int replication_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+return bdrv_get_info(bs->file->bs, bdi);
+}
+
 BlockDriver bdrv_replication = {
 .format_name= "replication",
 .protocol_name  = "replication",
@@ -743,6 +748,7 @@ BlockDriver bdrv_replication = {
 .bdrv_co_readv  = replication_co_readv,
 .bdrv_co_writev = replication_co_writev,
 
+.bdrv_get_info  = replication_get_info,
 .is_filter  = true,
 .bdrv_recurse_is_first_non_filter = 
replication_recurse_is_first_non_filter,
 
-- 
1.8.3.1

[Qemu-block] [PATCH v3 0/6] COLO block replication supports shared disk case

COLO block replication doesn't support the shared disk case,
Here we try to implement it and this is the third version.

Last posted series patches:
https://lists.gnu.org/archive/html/qemu-block/2016-12/msg00039.html
You can refer to the above link if want to test it.

I have uploaded the new version to github:
https://github.com/coloft/qemu/tree/colo-developing-with-shared-disk-2016-1-20

Please review and any commits are welcomed.

Cc: Juan Quintela 
Cc: Amit Shah  
Cc: Dr. David Alan Gilbert (git) 
Cc: eddie.d...@intel.com

v3:
- Fix some comments from Stefan and Eric

v2:
- Drop the patch which add a blk_root() helper
- Fix some comments from Changlong

zhanghailiang (6):
  docs/block-replication: Add description for shared-disk case
  replication: add shared-disk and shared-disk-id options
  replication: Split out backup_do_checkpoint() from
secondary_do_checkpoint()
  replication: fix code logic with the new shared_disk option
  replication: Implement block replication for shared disk case
  nbd/replication: implement .bdrv_get_info() for nbd and replication
driver

 block/nbd.c|  12 
 block/replication.c| 156 +++--
 docs/block-replication.txt | 139 ++--
 qapi/block-core.json   |  10 ++-
 4 files changed, 279 insertions(+), 38 deletions(-)

-- 
1.8.3.1

[Qemu-block] [PATCH v3 2/6] replication: add shared-disk and shared-disk-id options

We use these two options to identify which disk is
shared

Cc: Eric Blake 
Signed-off-by: zhanghailiang 
Signed-off-by: Wen Congyang 
Signed-off-by: Zhang Chen 
---
v3:
- Move g_free(s->shared_disk_id) to the common fail process place (Stefan)
- Fix comments for these two options
---
 block/replication.c  | 37 +
 qapi/block-core.json | 10 +-
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/block/replication.c b/block/replication.c
index 729dd12..b96a3e5 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -25,9 +25,12 @@
 typedef struct BDRVReplicationState {
 ReplicationMode mode;
 int replication_state;
+bool is_shared_disk;
+char *shared_disk_id;
 BdrvChild *active_disk;
 BdrvChild *hidden_disk;
 BdrvChild *secondary_disk;
+BdrvChild *primary_disk;
 char *top_id;
 ReplicationState *rs;
 Error *blocker;
@@ -53,6 +56,9 @@ static void replication_stop(ReplicationState *rs, bool 
failover,
 
 #define REPLICATION_MODE"mode"
 #define REPLICATION_TOP_ID  "top-id"
+#define REPLICATION_SHARED_DISK "shared-disk"
+#define REPLICATION_SHARED_DISK_ID "shared-disk-id"
+
 static QemuOptsList replication_runtime_opts = {
 .name = "replication",
 .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
@@ -65,6 +71,14 @@ static QemuOptsList replication_runtime_opts = {
 .name = REPLICATION_TOP_ID,
 .type = QEMU_OPT_STRING,
 },
+{
+.name = REPLICATION_SHARED_DISK_ID,
+.type = QEMU_OPT_STRING,
+},
+{
+.name = REPLICATION_SHARED_DISK,
+.type = QEMU_OPT_BOOL,
+},
 { /* end of list */ }
 },
 };
@@ -85,6 +99,9 @@ static int replication_open(BlockDriverState *bs, QDict 
*options,
 QemuOpts *opts = NULL;
 const char *mode;
 const char *top_id;
+const char *shared_disk_id;
+BlockBackend *blk;
+BlockDriverState *tmp_bs;
 
 ret = -EINVAL;
 opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
@@ -119,12 +136,31 @@ static int replication_open(BlockDriverState *bs, QDict 
*options,
"The option mode's value should be primary or secondary");
 goto fail;
 }
+s->is_shared_disk = qemu_opt_get_bool(opts, REPLICATION_SHARED_DISK,
+  false);
+if (s->is_shared_disk && (s->mode == REPLICATION_MODE_PRIMARY)) {
+shared_disk_id = qemu_opt_get(opts, REPLICATION_SHARED_DISK_ID);
+if (!shared_disk_id) {
+error_setg(&local_err, "Missing shared disk blk option");
+goto fail;
+}
+s->shared_disk_id = g_strdup(shared_disk_id);
+blk = blk_by_name(s->shared_disk_id);
+if (!blk) {
+error_setg(&local_err, "There is no %s block", s->shared_disk_id);
+goto fail;
+}
+/* We can't access root member of BlockBackend directly */
+tmp_bs = blk_bs(blk);
+s->primary_disk = QLIST_FIRST(&tmp_bs->parents);
+}
 
 s->rs = replication_new(bs, &replication_ops);
 
 ret = 0;
 
 fail:
+g_free(s->shared_disk_id);
 qemu_opts_del(opts);
 error_propagate(errp, local_err);
 
@@ -135,6 +171,7 @@ static void replication_close(BlockDriverState *bs)
 {
 BDRVReplicationState *s = bs->opaque;
 
+g_free(s->shared_disk_id);
 if (s->replication_state == BLOCK_REPLICATION_RUNNING) {
 replication_stop(s->rs, false, NULL);
 }
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 1b3e6eb..e9fecda 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2624,12 +2624,20 @@
 #  node who owns the replication node chain. Must not be given in
 #  primary mode.
 #
+# @shared-disk-id: #optional id of shared disk while is replication mode,
+#  if @shared-disk is true, this option is required (Since: 
2.9)
+#
+# @shared-disk: #optional (The default is false) to indicate whether or not
+#   a disk is shared by primary VM and secondary VM. (Since: 2.9)
+#
 # Since: 2.8
 ##
 { 'struct': 'BlockdevOptionsReplication',
   'base': 'BlockdevOptionsGenericFormat',
   'data': { 'mode': 'ReplicationMode',
-'*top-id': 'str' } }
+'*top-id': 'str',
+'*shared-disk-id': 'str',
+'*shared-disk': 'bool' } }
 
 ##
 # @NFSTransport:
-- 
1.8.3.1

[Qemu-block] [PATCH v3 1/6] docs/block-replication: Add description for shared-disk case

Introuduce the scenario of shared-disk block replication
and how to use it.

Reviewed-by: Changlong Xie 
Reviewed-by: Stefan Hajnoczi 
Signed-off-by: zhanghailiang 
Signed-off-by: Wen Congyang 
Signed-off-by: Zhang Chen 
---
 docs/block-replication.txt | 139 +++--
 1 file changed, 135 insertions(+), 4 deletions(-)

diff --git a/docs/block-replication.txt b/docs/block-replication.txt
index 6bde673..fbfe005 100644
--- a/docs/block-replication.txt
+++ b/docs/block-replication.txt
@@ -24,7 +24,7 @@ only dropped at next checkpoint time. To reduce the network 
transportation
 effort during a vmstate checkpoint, the disk modification operations of
 the Primary disk are asynchronously forwarded to the Secondary node.
 
-== Workflow ==
+== Non-shared disk workflow ==
 The following is the image of block replication workflow:
 
 +--+++
@@ -57,7 +57,7 @@ The following is the image of block replication workflow:
 4) Secondary write requests will be buffered in the Disk buffer and it
will overwrite the existing sector content in the buffer.
 
-== Architecture ==
+== Non-shared disk architecture ==
 We are going to implement block replication from many basic
 blocks that are already in QEMU.
 
@@ -106,6 +106,74 @@ any state that would otherwise be lost by the speculative 
write-through
 of the NBD server into the secondary disk. So before block replication,
 the primary disk and secondary disk should contain the same data.
 
+== Shared Disk Mode Workflow ==
+The following is the image of block replication workflow:
+
++--+++
+|Primary Write Requests||Secondary Write Requests|
++--+++
+  |   |
+  |  (4)
+  |   V
+  |  /-\
+  | (2)Forward and write through | |
+  | +--> | Disk Buffer |
+  | || |
+  | |\-/
+  | |(1)read   |
+  | |  |
+   (3)write   | |  | backing file
+  V |  |
+ +-+   |
+ | Shared Disk | <-+
+ +-+
+
+1) Primary writes will read original data and forward it to Secondary
+   QEMU.
+2) Before Primary write requests are written to Shared disk, the
+   original sector content will be read from Shared disk and
+   forwarded and buffered in the Disk buffer on the secondary site,
+   but it will not overwrite the existing sector content (it could be
+   from either "Secondary Write Requests" or previous COW of "Primary
+   Write Requests") in the Disk buffer.
+3) Primary write requests will be written to Shared disk.
+4) Secondary write requests will be buffered in the Disk buffer and it
+   will overwrite the existing sector content in the buffer.
+
+== Shared Disk Mode Architecture ==
+We are going to implement block replication from many basic
+blocks that are already in QEMU.
+ virtio-blk ||   
.--
+ /  ||   | 
Secondary
+/   ||   
'--
+   /|| 
virtio-blk
+  / || 
 |
+  | ||   
replication(5)
+  |NBD  >   NBD   (2)  
 |
+  |  client ||server ---> hidden disk <-- 
active disk(4)
+  | ^   ||  |
+  |  replication(1) ||  |
+  | |   ||  |
+  |   +-'   ||  |
+ (3)  |drive-backup sync=none   ||  |
+. |   +-+   ||  |
+Primary | | |   ||   backing|
+' | |   ||  |
+  V |   |
+   +---+|
+   |

[Qemu-block] [PATCH v3 5/6] replication: Implement block replication for shared disk case

Just as the scenario of non-shared disk block replication,
we are going to implement block replication from many basic
blocks that are already in QEMU.
The architecture is:

 virtio-blk ||   
.--
 /  ||   | 
Secondary
/   ||   
'--
   /|| 
virtio-blk
  / ||  
|
  | ||   
replication(5)
  |NBD  >   NBD   (2)   
|
  |  client ||server ---> hidden disk <-- 
active disk(4)
  | ^   ||  |
  |  replication(1) ||  |
  | |   ||  |
  |   +-'   ||  |
 (3)  |drive-backup sync=none   ||  |
. |   +-+   ||  |
Primary | | |   ||   backing|
' | |   ||  |
  V |   |
   +---+|
   |   shared disk | <--+
   +---+

1) Primary writes will read original data and forward it to Secondary
   QEMU.
2) The hidden-disk is created automatically. It buffers the original content
   that is modified by the primary VM. It should also be an empty disk, and
   the driver supports bdrv_make_empty() and backing file.
3) Primary write requests will be written to Shared disk.
4) Secondary write requests will be buffered in the active disk and it
   will overwrite the existing sector content in the buffer.

Signed-off-by: zhanghailiang 
Signed-off-by: Wen Congyang 
Signed-off-by: Zhang Chen 
---
 block/replication.c | 48 ++--
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index 70ec08c..a0b3e41 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -233,7 +233,7 @@ static coroutine_fn int 
replication_co_readv(BlockDriverState *bs,
  QEMUIOVector *qiov)
 {
 BDRVReplicationState *s = bs->opaque;
-BdrvChild *child = s->secondary_disk;
+BdrvChild *child = s->is_shared_disk ? s->primary_disk : s->secondary_disk;
 BlockJob *job = NULL;
 CowRequest req;
 int ret;
@@ -415,7 +415,12 @@ static void backup_job_completed(void *opaque, int ret)
 s->error = -EIO;
 }
 
-backup_job_cleanup(bs);
+if (s->mode == REPLICATION_MODE_PRIMARY) {
+s->replication_state = BLOCK_REPLICATION_DONE;
+s->error = 0;
+} else {
+backup_job_cleanup(bs);
+}
 }
 
 static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
@@ -467,6 +472,19 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 
 switch (s->mode) {
 case REPLICATION_MODE_PRIMARY:
+if (s->is_shared_disk) {
+job = backup_job_create(NULL, s->primary_disk->bs, bs, 0,
+MIRROR_SYNC_MODE_NONE, NULL, false, BLOCKDEV_ON_ERROR_REPORT,
+BLOCKDEV_ON_ERROR_REPORT, BLOCK_JOB_INTERNAL,
+backup_job_completed, bs, NULL, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+backup_job_cleanup(bs);
+aio_context_release(aio_context);
+return;
+}
+block_job_start(job);
+}
 break;
 case REPLICATION_MODE_SECONDARY:
 s->active_disk = bs->file;
@@ -485,7 +503,8 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 }
 
 s->secondary_disk = s->hidden_disk->bs->backing;
-if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
+if (!s->secondary_disk->bs ||
+(!s->is_shared_disk && !bdrv_has_blk(s->secondary_disk->bs))) {
 error_setg(errp, "The secondary disk doesn't have block backend");
 aio_context_release(aio_context);
 return;
@@ -580,11 +599,24 @@ static void replication_do_checkpoint(ReplicationState 
*rs, Error **errp)
 
 switch (s->mode) {
 case REPLICATION_MODE_PRIMARY:
+if (s->is_shared_disk) {
+if (!s->primary_disk->bs->job) {
+error_setg(errp, "Primary backup job was cancelled"
+   " unexpectedly");
+break

Re: [Qemu-block] [PATCH RFC v2 1/6] docs/block-replication: Add description for shared-disk case

2017-01-19 Thread Hailiang Zhang


On 2017/1/20 0:41, Stefan Hajnoczi wrote:

On Thu, Jan 19, 2017 at 10:50:19AM +0800, Hailiang Zhang wrote:

On 2017/1/13 21:41, Stefan Hajnoczi wrote:

On Mon, Dec 05, 2016 at 04:34:59PM +0800, zhanghailiang wrote:

+Issue qmp command:
+  { 'execute': 'blockdev-add',
+'arguments': {
+'driver': 'replication',
+'node-name': 'rep',
+'mode': 'primary',
+'shared-disk-id': 'primary_disk0',
+'shared-disk': true,
+'file': {
+'driver': 'nbd',
+'export': 'hidden_disk0',
+'server': {
+'type': 'inet',
+'data': {
+'host': 'xxx.xxx.xxx.xxx',
+'port': 'yyy'
+}
+}


block/nbd.c does have good error handling and recovery in case there is
a network issue.  There are no reconnection attempts or timeouts that
deal with a temporary loss of network connectivity.

This is a general problem with block/nbd.c and not something to solve in
this patch series.  I'm just mentioning it because it may affect COLO
replication.

I'm sure these limitations in block/nbd.c can be fixed but it will take
some effort.  Maybe block/sheepdog.c, net/socket.c, and other network
code could also benefit from generic network connection recovery.



Hmm, good suggestion, but IMHO, here, COLO is a little different from
other scenes, if the reconnection method has been implemented,
it still needs a mechanism to identify the temporary loss of network
connection or real broken in network connection.

I did a simple test, just ifconfig down the network card that be used
by block replication, It seems that NBD in qemu doesn't has a ability to
find the connection has been broken, there was no error reports
and COLO just got stuck in vm_stop() where it called aio_poll().


Yes, this is the vm_stop() problem again.  There is no reliable way to
cancel I/O requests so instead QEMU waits...forever.  A solution is
needed so COLO doesn't hang on network failure.



Yes, COLO needs to detect this situation and cancel the requests in a proper
way.


I'm not sure how to solve the problem.  The secondary still has the last
successful checkpoint so it could resume instead of waiting for the
current checkpoint to commit.

There may still be NBD I/O in flight, so the would need to drain it or
fence storage to prevent interference once the secondary VM is running.



Agreed, we need to think this carefully. We'll put these reliabilities
developing in future after COLO's basic function completed.

Thanks,
Hailiang


Stefan

Re: [Qemu-block] [Qemu-devel] [PATCH v10 14/16] file-posix: Implement image locking

On Thu, 01/19 15:49, Daniel P. Berrange wrote:
> On Thu, Jan 19, 2017 at 10:38:14PM +0800, Fam Zheng wrote:
> > This implements open flag sensible image locking for local file
> > and host device protocol.
> > 
> > virtlockd in libvirt locks the first byte, so we start looking at the
> > file bytes from 1.
> > 
> > Quoting what was proposed by Kevin Wolf , there are
> > four locking modes by combining two bits (BDRV_O_RDWR and
> > BDRV_O_SHARE_RW), and implemented by taking two locks:
> > 
> > Lock bytes:
> > 
> > * byte 1: I can't allow other processes to write to the image
> > * byte 2: I am writing to the image
> > 
> > Lock modes:
> > 
> > * shared writer (BDRV_O_RDWR | BDRV_O_SHARE_RW): Take shared lock on
> >   byte 2. Test whether byte 1 is locked using an exclusive lock, and
> >   fail if so.
> > 
> > * exclusive writer (BDRV_O_RDWR only): Take shared lock on byte 2. Test
> >   whether byte 1 is locked using an exclusive lock, and fail if so. Then
> >   take shared lock on byte 1. I suppose this is racy, but we can
> >   probably tolerate that.
> > 
> > * reader that can tolerate writers (BDRV_O_SHARE_RW only): Don't do anything
> > 
> > * reader that can't tolerate writers (neither bit is set): Take shared
> >   lock on byte 1. Test whether byte 2 is locked, and fail if so.
> 
> Ahh, using two bytes is an interesting technique for mapping the four
> different access methods onto the more limit fcntl lock semantics. We
> might want to copy this approach in libvirt too
> 
> > +/* Posix file locking bytes. Libvirt takes byte 0, so start from byte 1. */
> > +#define RAW_LOCK_BYTE_MIN 1
> > +#define RAW_LOCK_BYTE_NO_OTHER_WRITER 1
> > +#define RAW_LOCK_BYTE_WRITE   2
> 
> ...would you mind if QEMU started from say byte 10, leaving the first 10
> reserved for libvirt uses. This lets libvirt have a continuous space for
> its own usage if we want to use more bytes

That's easy, will do it. (Actually the descriptions above are a bit stale
because exclusive writers now take two bytes exclusively and the lock testings
are done with F_OFD_GETLK so that RO readers can test against shared locks
without acquiring a write lock, which requires O_RDWR).

Fam

[Qemu-block] [PATCH v2 0/3] scsi-generic and BLKSECTGET

In the Linux kernel, I see two (three) places where the BLKSECTGET ioctl is
handled:

(1) block/(compat_)ioctl.c -- (compat_)blkdev_ioctl
(2) drivers/scsi/sg.c -- sg_ioctl

The former has been around forever[1], and returns a short value measured in
sectors.  A sector is generally assumed to be 512 bytes.

The latter has been around for slightly less than forever[2], and returns an
int that measures the value in bytes.  A change to return the block count
was brought up a few years ago[3] and nacked.

As a convenient example, if I use the blockdev tool to drive the ioctl to a
SCSI disk and its scsi-generic equivalent, I get different results:

  # lsscsi -g
  [0:0:8:1077166114]diskIBM  2107900  .217  /dev/sda /dev/sg0
  # blockdev --getmaxsect /dev/sda
  2560
  # blockdev --getmaxsect /dev/sg0
  20

Now, the value for /dev/sda looks "correct" to me.

  # cd /sys/devices/css0/0.0.0125/0.0.1f69/host0/rport-0\:0-8/
  # cd target0\:0\:8/0\:0\:8\:1077166114/
  # cat block/sda/queue/max_sectors_kb
  1280
  # cat block/sda/queue/hw_sector_size
  512

And the math checks out:

  max_sectors_kb * 1024 / hw_sector_size == getmaxsect
  -OR-
  1280 * 1024 / 512 = 2560

For /dev/sg0, it appears the answer is coming from the sg_ioctl result
which is already multiplied by the block size, and then looking at only the
upper half (short) of the returned big-endian fullword:

  (1280 * 1024 / 512) * 512 = 1310720 = x0014 => x0014 = 20

The reason for all this?  Well, QEMU recently added a BLKSECTGET ioctl
call[4] which we see during guest boot.  This code presumes the value is in
blocks/sectors, and converts it to bytes[5].  Not that this matters, because
the short/int discrepancy gives us "zero" on s390x.

Also, that code doesn't execute for scsi-generic devices, so the conversion
to bytes is correct, but I'd like to extend this code to interrogate
scsi-generic devices as well.  This is important because libvirt converts
a specified virtio-scsi device to its /dev/sgX address for the QEMU
commandline.

So, do I have to code around the different field sizes (int vs short) as
well as scaling (bytes vs blocks)?  Obviously doable, but looking at the
resulting commits, I find myself feeling a little ill.

Changes:
 v1->v2:
  - Patch 3, make hdev_get_max_transfer_length return bytes [Fam Zheng]

[1] The initial kernel git commit
[2] kernel commit 44ec95425c1d9dce6e4638c29e4362cfb44814e7
[3] https://lkml.org/lkml/2012/6/27/78
[4] qemu commit 6f6071745bd0366221f5a0160ed7d18d0e38b9f7
[5] qemu commit 5def6b80e1eca696c1fc6099e7f4d36729686402 

Eric Farman (3):
  hw/scsi: Fix debug message of cdb structure in scsi-generic
  block: Fix target variable of BLKSECTGET ioctl
  block: get max_transfer limit for char (scsi-generic) devices

 block/file-posix.c | 17 +++--
 hw/scsi/scsi-generic.c |  5 +++--
 include/block/block.h  |  1 +
 3 files changed, 15 insertions(+), 8 deletions(-)

-- 
2.8.4

[Qemu-block] [PATCH v2 1/3] hw/scsi: Fix debug message of cdb structure in scsi-generic

When running with debug enabled, the scsi-generic cdb that is
dumped skips byte 0 of the command, which is the opcode.  This
makes identifying which command is being issued/completed a
little difficult.  Example:

  0x00 0x00 0x01 0x00 0x00
  scsi-generic: scsi_read_data 0x0
  scsi-generic: Data ready tag=0x0 len=164
  scsi-generic: scsi_read_data 0x0
  scsi-generic: Command complete 0x0x10a42c60 tag=0x0 status=0

Improve this by adding a message prior to the loop, similar to
what exists for scsi-disk.  Clean up a few other messages to be
more explicit of what is being represented.  Example:

  scsi-generic: Command: data=0x12 0x00 0x00 0x01 0x00 0x00
  scsi-generic: scsi_read_data tag=0x0
  scsi-generic: Data ready tag=0x0 len=164
  scsi-generic: scsi_read_data tag=0x0
  scsi-generic: Command complete 0x0x10a452d0 tag=0x0 status=0

Signed-off-by: Eric Farman 
---
 hw/scsi/scsi-generic.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/hw/scsi/scsi-generic.c b/hw/scsi/scsi-generic.c
index 7a588a7..92f091a 100644
--- a/hw/scsi/scsi-generic.c
+++ b/hw/scsi/scsi-generic.c
@@ -246,7 +246,7 @@ static void scsi_read_data(SCSIRequest *req)
 SCSIDevice *s = r->req.dev;
 int ret;
 
-DPRINTF("scsi_read_data 0x%x\n", req->tag);
+DPRINTF("scsi_read_data tag=0x%x\n", req->tag);
 
 /* The request is used as the AIO opaque value, so add a ref.  */
 scsi_req_ref(&r->req);
@@ -294,7 +294,7 @@ static void scsi_write_data(SCSIRequest *req)
 SCSIDevice *s = r->req.dev;
 int ret;
 
-DPRINTF("scsi_write_data 0x%x\n", req->tag);
+DPRINTF("scsi_write_data tag=0x%x\n", req->tag);
 if (r->len == 0) {
 r->len = r->buflen;
 scsi_req_data(&r->req, r->len);
@@ -329,6 +329,7 @@ static int32_t scsi_send_command(SCSIRequest *req, uint8_t 
*cmd)
 int ret;
 
 #ifdef DEBUG_SCSI
+DPRINTF("Command: data=0x%02x", cmd[0]);
 {
 int i;
 for (i = 1; i < r->req.cmd.len; i++) {
-- 
2.8.4

[Qemu-block] [PATCH v2 3/3] block: get max_transfer limit for char (scsi-generic) devices

Commit 6f607174 introduced a routine to get the maximum number
of bytes for a single I/O transfer for block devices, however
scsi generic devices are character devices, not block.  Add
a condition for this, such that scsi generic devices can view
the same data.

Some tweaking of data is required, because the block and sg
ioctls return values in different scales (sectors versus
bytes).  So adjust hdev_get_max_transfer_length such that it
always returns a value in bytes.

Signed-off-by: Eric Farman 
---
 block/file-posix.c| 10 ++
 include/block/block.h |  1 +
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 2115155..94068ca 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -657,9 +657,11 @@ static int hdev_get_max_transfer_length(BlockDriverState 
*bs, int fd)
 int max_sectors = 0;
 short max_sectors_short = 0;
 if (bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
+/* sg returns a value in bytes */
 return max_sectors;
 } else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors_short) == 0) {
-return max_sectors_short;
+/* block returns a value in sectors */
+return max_sectors_short << BDRV_SECTOR_BITS;
 } else {
 return -errno;
 }
@@ -674,10 +676,10 @@ static void raw_refresh_limits(BlockDriverState *bs, 
Error **errp)
 struct stat st;
 
 if (!fstat(s->fd, &st)) {
-if (S_ISBLK(st.st_mode)) {
+if (S_ISBLK(st.st_mode) || S_ISCHR(st.st_mode)) {
 int ret = hdev_get_max_transfer_length(bs, s->fd);
-if (ret > 0 && ret <= BDRV_REQUEST_MAX_SECTORS) {
-bs->bl.max_transfer = pow2floor(ret << BDRV_SECTOR_BITS);
+if (ret > 0 && ret <= BDRV_REQUEST_MAX_BYTES) {
+bs->bl.max_transfer = pow2floor(ret);
 }
 }
 }
diff --git a/include/block/block.h b/include/block/block.h
index 8b0dcda..4e81f20 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -116,6 +116,7 @@ typedef struct HDGeometry {
 
 #define BDRV_REQUEST_MAX_SECTORS MIN(SIZE_MAX >> BDRV_SECTOR_BITS, \
  INT_MAX >> BDRV_SECTOR_BITS)
+#define BDRV_REQUEST_MAX_BYTES (BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS)
 
 /*
  * Allocation status flags
-- 
2.8.4

[Qemu-block] [PATCH v2 2/3] block: Fix target variable of BLKSECTGET ioctl

Commit 6f607174 introduced a routine to call the kernel BLKSECTGET
ioctl, which stores the result back to user space.  However, the
size of the data returned depends on the routine handling the ioctl.
The (compat_)blkdev_ioctl returns a short, while sg_ioctl returns
an int.  Thus, on big-endian systems, we can find ourselves
accidentally shifting the result to a much larger value.
(On s390x, a short is 16 bits while an int is 32 bits.)

Signed-off-by: Eric Farman 
---
 block/file-posix.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 28b47d9..2115155 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -651,12 +651,15 @@ static void raw_reopen_abort(BDRVReopenState *state)
 state->opaque = NULL;
 }
 
-static int hdev_get_max_transfer_length(int fd)
+static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd)
 {
 #ifdef BLKSECTGET
 int max_sectors = 0;
-if (ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
+short max_sectors_short = 0;
+if (bs->sg && ioctl(fd, BLKSECTGET, &max_sectors) == 0) {
 return max_sectors;
+} else if (!bs->sg && ioctl(fd, BLKSECTGET, &max_sectors_short) == 0) {
+return max_sectors_short;
 } else {
 return -errno;
 }
@@ -672,7 +675,7 @@ static void raw_refresh_limits(BlockDriverState *bs, Error 
**errp)
 
 if (!fstat(s->fd, &st)) {
 if (S_ISBLK(st.st_mode)) {
-int ret = hdev_get_max_transfer_length(s->fd);
+int ret = hdev_get_max_transfer_length(bs, s->fd);
 if (ret > 0 && ret <= BDRV_REQUEST_MAX_SECTORS) {
 bs->bl.max_transfer = pow2floor(ret << BDRV_SECTOR_BITS);
 }
-- 
2.8.4

Re: [Qemu-block] [Qemu-devel] [PATCH v10 14/16] file-posix: Implement image locking

2017-01-19 Thread Richard W.M. Jones

On Thu, Jan 19, 2017 at 10:19:29AM -0600, Eric Blake wrote:
> On 01/19/2017 09:49 AM, Daniel P. Berrange wrote:
> > On Thu, Jan 19, 2017 at 10:38:14PM +0800, Fam Zheng wrote:
> >> This implements open flag sensible image locking for local file
> >> and host device protocol.
> >>
> >> virtlockd in libvirt locks the first byte, so we start looking at the
> >> file bytes from 1.
> >>
> >> Quoting what was proposed by Kevin Wolf , there are
> >> four locking modes by combining two bits (BDRV_O_RDWR and
> >> BDRV_O_SHARE_RW), and implemented by taking two locks:
> >>
> 
> >> +/* Posix file locking bytes. Libvirt takes byte 0, so start from byte 1. 
> >> */
> >> +#define RAW_LOCK_BYTE_MIN 1
> >> +#define RAW_LOCK_BYTE_NO_OTHER_WRITER 1
> >> +#define RAW_LOCK_BYTE_WRITE   2
> > 
> > ...would you mind if QEMU started from say byte 10, leaving the first 10
> > reserved for libvirt uses. This lets libvirt have a continuous space for
> > its own usage if we want to use more bytes
> 
> Thankfully, fcntl() locks can be taken beyond end-of-file, so I think
> we're okay making an arbitrarily larger range of bytes reserved for each
> process, even in the unlikely corner case of passing files smaller than
> 512 bytes.

Not so unlikely.  libguestfs actually detects and works around this
case because qemu was (possibly still is) very broken when you pass
small files.

https://github.com/libguestfs/libguestfs/blob/master/src/drives.c#L395-L441

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
libguestfs lets you edit virtual machines.  Supports shell scripting,
bindings from many languages.  http://libguestfs.org

Re: [Qemu-block] [PATCH v2 2/6] qdict: Add convenience helpers for wrapped puts

2017-01-19 Thread Stefan Hajnoczi

On Wed, Jan 18, 2017 at 10:16:49AM -0600, Eric Blake wrote:
> Quite a few users of qdict_put() were manually wrapping a
> non-QObject. We can make such call-sites shorter, by providing
> common macros to do the tedious work.  Also shorten nearby
> qdict_put_obj(,,QOBJECT()) sequences.
> 
> Signed-off-by: Eric Blake 
> Reviewed-by: Alberto Garcia 
> 
> ---
> 
> v2: rebase to current master
> 
> I'm okay if you want me to break this patch into smaller pieces.
> ---
>  include/qapi/qmp/qdict.h|   8 +++
>  block.c |  59 +++-
>  block/archipelago.c |   4 +-
>  block/blkdebug.c|   6 +-
>  block/blkverify.c   |  11 ++-
>  block/curl.c|   2 +-
>  block/file-posix.c  |   8 +--
>  block/file-win32.c  |   4 +-
>  block/iscsi.c   |   2 +-
>  block/nbd.c |  41 ++-
>  block/nfs.c |  43 +---
>  block/null.c|   2 +-
>  block/qcow2.c   |   4 +-
>  block/quorum.c  |  13 ++--
>  block/ssh.c |  16 ++---
>  block/vvfat.c   |  10 +--
>  blockdev.c  |  28 
>  hw/block/xen_disk.c |   2 +-
>  hw/usb/xen-usb.c|  12 ++--
>  monitor.c   |  18 ++---
>  qapi/qmp-event.c|   2 +-
>  qemu-img.c  |   6 +-
>  qemu-io.c   |   2 +-
>  qemu-nbd.c  |   2 +-
>  qobject/qdict.c |   2 +-
>  target/s390x/cpu_models.c   |   4 +-
>  tests/check-qdict.c | 132 
> ++--
>  tests/test-qmp-commands.c   |  30 
>  tests/test-qmp-event.c  |  30 
>  tests/test-qobject-output-visitor.c |   6 +-
>  util/qemu-option.c  |   6 +-
>  31 files changed, 245 insertions(+), 270 deletions(-)

Reviewed-by: Stefan Hajnoczi 


signature.asc
Description: PGP signature

Re: [Qemu-block] [PATCH v6 0/2] allow blockdev-add for NFS

Am 19.01.2017 um 18:08 schrieb Kevin Wolf:

Am 19.01.2017 um 16:58 hat Peter Lieven geschrieben:

Am 19.01.2017 um 16:55 schrieb Kevin Wolf:

Am 19.01.2017 um 16:44 hat Peter Lieven geschrieben:

Am 19.01.2017 um 16:42 schrieb Kevin Wolf:

Am 19.01.2017 um 16:34 hat Peter Lieven geschrieben:

Am 19.01.2017 um 16:20 schrieb Kevin Wolf:

Am 19.01.2017 um 15:59 hat Eric Blake geschrieben:

On 01/19/2017 08:30 AM, Peter Lieven wrote:

qemu-img: Could not open
'nfs://172.21.200.61/vcore-dev-cdrom/templates/VC_debian8-20170116.qcow2,linux?readahead=131072':
Block protocol 'nfs' doesn't support the option 'readahead-size'

Please let me know if the below fix would be correct:

No, this needs to be fixed the other way round: runtime_opts must use
the names as specified in the schema, and nfs_client_open() must access
them as such. Without that, blockdev-add can't work (and the command
line only with the "wrong" old option names from the URL, whereas it
should be using the same names as the QAPI schema).

Shouldn't we support both for backwards compatiblity.?

blockdev-add only needs to support the modern naming. But yes,
preserving back-compat spelling of the command-line spellings, as well
as matching blockdev-add spellings, is desirable.

We only just added the individual command line options, previously it
only supported the URL.

It's true that we have the messed up version of the options in 2.8, so
strictly speaking we would break compatibility with a release, but it's
only one release, it's only the nfs driver, and the documentation of the
options is the schema, which had the right option names even in 2.8
(they just didn't work).

So I wouldn't feel bad about removing the wrong names in this specific
case.

So want exactly do you want to do? Fix the names in the QAPI schema
to use the old naming?

No, fix the command line to use the names in the QAPI schema.

The option names from the URL were never supposed to be supported on the
command line.

Okay, so no backwards compatiblity? I actually used the options on the command
line...

Well, do you _need_ compatibility?

It can certainly be done, but as the (wrong) options on the command line
have only existed since November and were never documented, I wouldn't
bother unless there's a good reason.

Every Qemu before 2.8.0 was working with sth like:

qemu -cdrom nfs://10.0.0.1/expory/my.iso?readahead=131072

That will keep working. We're not changing the URL parsing, just the
runtime_opts and its accesses in nfs_client_open(). The translation in
nfs_parse_uri() stays intact with the fixes.

What will stop working (and only worked in 2.8.0) is this:

qemu -drive
media=cdrom,driver=nfs,server.host=10.0.0.1,path=export/my.iso,readahead=131072

Also, I think the fixes should be Cc: qemu-stable, so that 2.8.1 will
work correctly again.

Okay, I hope I got it. Will send a patch tomorrow.

Peter

Re: [Qemu-block] [PATCH v6 0/2] allow blockdev-add for NFS

Am 19.01.2017 um 16:58 hat Peter Lieven geschrieben:
> Am 19.01.2017 um 16:55 schrieb Kevin Wolf:
> >Am 19.01.2017 um 16:44 hat Peter Lieven geschrieben:
> >>Am 19.01.2017 um 16:42 schrieb Kevin Wolf:
> >>>Am 19.01.2017 um 16:34 hat Peter Lieven geschrieben:
> Am 19.01.2017 um 16:20 schrieb Kevin Wolf:
> >Am 19.01.2017 um 15:59 hat Eric Blake geschrieben:
> >>On 01/19/2017 08:30 AM, Peter Lieven wrote:
> >qemu-img: Could not open
> >'nfs://172.21.200.61/vcore-dev-cdrom/templates/VC_debian8-20170116.qcow2,linux?readahead=131072':
> >Block protocol 'nfs' doesn't support the option 'readahead-size'
> >
> >Please let me know if the below fix would be correct:
> No, this needs to be fixed the other way round: runtime_opts must use
> the names as specified in the schema, and nfs_client_open() must 
> access
> them as such. Without that, blockdev-add can't work (and the command
> line only with the "wrong" old option names from the URL, whereas it
> should be using the same names as the QAPI schema).
> >>>Shouldn't we support both for backwards compatiblity.?
> >>blockdev-add only needs to support the modern naming.  But yes,
> >>preserving back-compat spelling of the command-line spellings, as well
> >>as matching blockdev-add spellings, is desirable.
> >We only just added the individual command line options, previously it
> >only supported the URL.
> >
> >It's true that we have the messed up version of the options in 2.8, so
> >strictly speaking we would break compatibility with a release, but it's
> >only one release, it's only the nfs driver, and the documentation of the
> >options is the schema, which had the right option names even in 2.8
> >(they just didn't work).
> >
> >So I wouldn't feel bad about removing the wrong names in this specific
> >case.
> So want exactly do you want to do? Fix the names in the QAPI schema
> to use the old naming?
> >>>No, fix the command line to use the names in the QAPI schema.
> >>>
> >>>The option names from the URL were never supposed to be supported on the
> >>>command line.
> >>Okay, so no backwards compatiblity? I actually used the options on the 
> >>command line...
> >Well, do you _need_ compatibility?
> >
> >It can certainly be done, but as the (wrong) options on the command line
> >have only existed since November and were never documented, I wouldn't
> >bother unless there's a good reason.
> 
> Every Qemu before 2.8.0 was working with sth like:
> 
> qemu -cdrom nfs://10.0.0.1/expory/my.iso?readahead=131072

That will keep working. We're not changing the URL parsing, just the
runtime_opts and its accesses in nfs_client_open(). The translation in
nfs_parse_uri() stays intact with the fixes.

What will stop working (and only worked in 2.8.0) is this:

qemu -drive 
media=cdrom,driver=nfs,server.host=10.0.0.1,path=export/my.iso,readahead=131072

Also, I think the fixes should be Cc: qemu-stable, so that 2.8.1 will
work correctly again.

Kevin

Re: [Qemu-block] [Qemu-devel] qemu-img: complains about missing iscsi option group

On Thu, Jan 19, 2017 at 05:57:45PM +0100, Peter Lieven wrote:
> Am 19.01.2017 um 17:38 schrieb Daniel P. Berrange:
> > On Thu, Jan 19, 2017 at 05:30:20PM +0100, Peter Lieven wrote:
> > > Hi all,
> > > 
> > > 
> > > since commit f57b4b5 (blockdev: prepare iSCSI block driver for dynamic 
> > > loading)
> > > 
> > > qemu-img complains about a missing option group several times.
> > > 
> > > 
> > > ~/git/qemu$ ./qemu-img info 
> > > iscsi://172.21.200.56:3260/iqn.2001-05.com.equallogic:0-8a0906-69d384e0a-aa3004e55e15878d-00lieven-data/0
> > > qemu-img: There is no option group 'iscsi'
> > > qemu-img: There is no option group 'iscsi'
> > > qemu-img: There is no option group 'iscsi'
> > > qemu-img: There is no option group 'iscsi'
> > > image: 
> > > iscsi://172.21.200.56:3260/iqn.2001-05.com.equallogic:0-8a0906-69d384e0a-aa3004e55e15878d-00lieven-data/0
> > > file format: raw
> > > virtual size: 25G (26848788480 bytes)
> > > disk size: unavailable
> > > cluster_size: 15728640
> > > 
> > > I don't know whats the proper way to fix this?
> > Move the iscsi option registration out of vl.c and back into a shared file
> > in block/ such that all programs see the options, not just the emulators.
> 
> Any preference which file? It seems no other block driver registers options.

ISCSI is somewhat bizarre in that it has a separate global -iscsi arg
instead of just accepting all args via the -drive spec like every other
block driver does, so there's no precedent to follow.

So suggest we just need a block/iscsi-global.c to deal with this.

Regards,
Daniel
-- 
|: http://berrange.com  -o-http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org  -o- http://virt-manager.org :|
|: http://entangle-photo.org   -o-http://search.cpan.org/~danberr/ :|

Re: [Qemu-block] [Qemu-devel] qemu-img: complains about missing iscsi option group


Am 19.01.2017 um 17:38 schrieb Daniel P. Berrange:

On Thu, Jan 19, 2017 at 05:30:20PM +0100, Peter Lieven wrote:

Hi all,


since commit f57b4b5 (blockdev: prepare iSCSI block driver for dynamic loading)

qemu-img complains about a missing option group several times.


~/git/qemu$ ./qemu-img info 
iscsi://172.21.200.56:3260/iqn.2001-05.com.equallogic:0-8a0906-69d384e0a-aa3004e55e15878d-00lieven-data/0
qemu-img: There is no option group 'iscsi'
qemu-img: There is no option group 'iscsi'
qemu-img: There is no option group 'iscsi'
qemu-img: There is no option group 'iscsi'
image: 
iscsi://172.21.200.56:3260/iqn.2001-05.com.equallogic:0-8a0906-69d384e0a-aa3004e55e15878d-00lieven-data/0
file format: raw
virtual size: 25G (26848788480 bytes)
disk size: unavailable
cluster_size: 15728640

I don't know whats the proper way to fix this?

Move the iscsi option registration out of vl.c and back into a shared file
in block/ such that all programs see the options, not just the emulators.


Any preference which file? It seems no other block driver registers options.

Thanks,
Peter

[Qemu-block] [PATCH V2] qemu-img: optimize is_allocated_sectors_min

the current implementation always splits requests if a buffer
begins or ends with zeroes independent of the length of the
zero area. Change this to really only split off zero areas
that have at least a length of 'min' bytes.

Signed-off-by: Peter Lieven 
---
 qemu-img.c | 44 ++--
 1 file changed, 14 insertions(+), 30 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 5df66fe..8e7357d 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1010,45 +1010,29 @@ static int is_allocated_sectors(const uint8_t *buf, int 
n, int *pnum)
 }
 
 /*
- * Like is_allocated_sectors, but if the buffer starts with a used sector,
- * up to 'min' consecutive sectors containing zeros are ignored. This avoids
- * breaking up write requests for only small sparse areas.
+ * Like is_allocated_sectors, but only at least 'min' consecutive sectors
+ * containing zeros are considered unallocated. This avoids breaking up write
+ * requests for only small sparse areas.
  */
 static int is_allocated_sectors_min(const uint8_t *buf, int n, int *pnum,
-int min)
+int min)
 {
-int ret;
-int num_checked, num_used;
-
-if (n < min) {
-min = n;
-}
-
-ret = is_allocated_sectors(buf, n, pnum);
-if (!ret) {
-return ret;
-}
-
-num_used = *pnum;
-buf += BDRV_SECTOR_SIZE * *pnum;
-n -= *pnum;
-num_checked = num_used;
+int num_used = 0;
 
 while (n > 0) {
-ret = is_allocated_sectors(buf, n, pnum);
-
-buf += BDRV_SECTOR_SIZE * *pnum;
-n -= *pnum;
-num_checked += *pnum;
-if (ret) {
-num_used = num_checked;
-} else if (*pnum >= min) {
+if (!is_allocated_sectors(buf, n, pnum) && *pnum >= min) {
 break;
 }
+num_used += *pnum;
+buf += BDRV_SECTOR_SIZE * *pnum;
+n -= *pnum;
 }
 
-*pnum = num_used;
-return 1;
+if (num_used) {
+*pnum = num_used;
+}
+
+return !!num_used;
 }
 
 /*
-- 
1.9.1

Re: [Qemu-block] [PATCH RFC v2 1/6] docs/block-replication: Add description for shared-disk case

2017-01-19 Thread Stefan Hajnoczi

On Thu, Jan 19, 2017 at 10:50:19AM +0800, Hailiang Zhang wrote:
> On 2017/1/13 21:41, Stefan Hajnoczi wrote:
> > On Mon, Dec 05, 2016 at 04:34:59PM +0800, zhanghailiang wrote:
> > > +Issue qmp command:
> > > +  { 'execute': 'blockdev-add',
> > > +'arguments': {
> > > +'driver': 'replication',
> > > +'node-name': 'rep',
> > > +'mode': 'primary',
> > > +'shared-disk-id': 'primary_disk0',
> > > +'shared-disk': true,
> > > +'file': {
> > > +'driver': 'nbd',
> > > +'export': 'hidden_disk0',
> > > +'server': {
> > > +'type': 'inet',
> > > +'data': {
> > > +'host': 'xxx.xxx.xxx.xxx',
> > > +'port': 'yyy'
> > > +}
> > > +}
> > 
> > block/nbd.c does have good error handling and recovery in case there is
> > a network issue.  There are no reconnection attempts or timeouts that
> > deal with a temporary loss of network connectivity.
> > 
> > This is a general problem with block/nbd.c and not something to solve in
> > this patch series.  I'm just mentioning it because it may affect COLO
> > replication.
> > 
> > I'm sure these limitations in block/nbd.c can be fixed but it will take
> > some effort.  Maybe block/sheepdog.c, net/socket.c, and other network
> > code could also benefit from generic network connection recovery.
> > 
> 
> Hmm, good suggestion, but IMHO, here, COLO is a little different from
> other scenes, if the reconnection method has been implemented,
> it still needs a mechanism to identify the temporary loss of network
> connection or real broken in network connection.
> 
> I did a simple test, just ifconfig down the network card that be used
> by block replication, It seems that NBD in qemu doesn't has a ability to
> find the connection has been broken, there was no error reports
> and COLO just got stuck in vm_stop() where it called aio_poll().

Yes, this is the vm_stop() problem again.  There is no reliable way to
cancel I/O requests so instead QEMU waits...forever.  A solution is
needed so COLO doesn't hang on network failure.

I'm not sure how to solve the problem.  The secondary still has the last
successful checkpoint so it could resume instead of waiting for the
current checkpoint to commit.

There may still be NBD I/O in flight, so the would need to drain it or
fence storage to prevent interference once the secondary VM is running.

Stefan


signature.asc
Description: PGP signature

Re: [Qemu-block] [Qemu-devel] qemu-img: complains about missing iscsi option group

On Thu, Jan 19, 2017 at 05:30:20PM +0100, Peter Lieven wrote:
> Hi all,
> 
> 
> since commit f57b4b5 (blockdev: prepare iSCSI block driver for dynamic 
> loading)
> 
> qemu-img complains about a missing option group several times.
> 
> 
> ~/git/qemu$ ./qemu-img info 
> iscsi://172.21.200.56:3260/iqn.2001-05.com.equallogic:0-8a0906-69d384e0a-aa3004e55e15878d-00lieven-data/0
> qemu-img: There is no option group 'iscsi'
> qemu-img: There is no option group 'iscsi'
> qemu-img: There is no option group 'iscsi'
> qemu-img: There is no option group 'iscsi'
> image: 
> iscsi://172.21.200.56:3260/iqn.2001-05.com.equallogic:0-8a0906-69d384e0a-aa3004e55e15878d-00lieven-data/0
> file format: raw
> virtual size: 25G (26848788480 bytes)
> disk size: unavailable
> cluster_size: 15728640
> 
> I don't know whats the proper way to fix this?

Move the iscsi option registration out of vl.c and back into a shared file
in block/ such that all programs see the options, not just the emulators.


Regards,
Daniel
-- 
|: http://berrange.com  -o-http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org  -o- http://virt-manager.org :|
|: http://entangle-photo.org   -o-http://search.cpan.org/~danberr/ :|

[Qemu-block] qemu-img: complains about missing iscsi option group


Hi all,


since commit f57b4b5 (blockdev: prepare iSCSI block driver for dynamic loading)

qemu-img complains about a missing option group several times.


~/git/qemu$ ./qemu-img info 
iscsi://172.21.200.56:3260/iqn.2001-05.com.equallogic:0-8a0906-69d384e0a-aa3004e55e15878d-00lieven-data/0
qemu-img: There is no option group 'iscsi'
qemu-img: There is no option group 'iscsi'
qemu-img: There is no option group 'iscsi'
qemu-img: There is no option group 'iscsi'
image: 
iscsi://172.21.200.56:3260/iqn.2001-05.com.equallogic:0-8a0906-69d384e0a-aa3004e55e15878d-00lieven-data/0
file format: raw
virtual size: 25G (26848788480 bytes)
disk size: unavailable
cluster_size: 15728640

I don't know whats the proper way to fix this?


Thanks,

Peter

Re: [Qemu-block] [PATCH] qemu-img: optimize is_allocated_sectors_min


Please ignore this one The code was suboptimal. Will send v2.

Peter

Am 19.01.2017 um 16:56 schrieb Peter Lieven:

the current implementation always splits requests if a buffer
begins or ends with zeroes independent of the length of the
zero area. Change this to really only split off zero areas
that have at least a length of 'min' bytes.

Signed-off-by: Peter Lieven 
---
  qemu-img.c | 38 --
  1 file changed, 12 insertions(+), 26 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 5df66fe..046696f 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1010,45 +1010,31 @@ static int is_allocated_sectors(const uint8_t *buf, int 
n, int *pnum)
  }
  
  /*

- * Like is_allocated_sectors, but if the buffer starts with a used sector,
- * up to 'min' consecutive sectors containing zeros are ignored. This avoids
- * breaking up write requests for only small sparse areas.
+ * Like is_allocated_sectors, but only at least 'min' consecutive sectors
+ * containing zeros are considered unallocated. This avoids breaking up write
+ * requests for only small sparse areas.
   */
  static int is_allocated_sectors_min(const uint8_t *buf, int n, int *pnum,
-int min)
+int min)
  {
-int ret;
-int num_checked, num_used;
-
-if (n < min) {
-min = n;
-}
-
-ret = is_allocated_sectors(buf, n, pnum);
-if (!ret) {
-return ret;
-}
-
-num_used = *pnum;
-buf += BDRV_SECTOR_SIZE * *pnum;
-n -= *pnum;
-num_checked = num_used;
+int ret, num_checked = 0, num_used = 0;
  
  while (n > 0) {

  ret = is_allocated_sectors(buf, n, pnum);
-
  buf += BDRV_SECTOR_SIZE * *pnum;
  n -= *pnum;
  num_checked += *pnum;
-if (ret) {
-num_used = num_checked;
-} else if (*pnum >= min) {
+if (!ret && *pnum >= min) {
  break;
  }
+num_used = num_checked;
  }
  
-*pnum = num_used;

-return 1;
+if (num_used) {
+*pnum = num_used;
+}
+
+return !!num_used;
  }
  
  /*

[Qemu-block] [PATCH] qemu-img: optimize is_allocated_sectors_min

the current implementation always splits requests if a buffer
begins or ends with zeroes independent of the length of the
zero area. Change this to really only split off zero areas
that have at least a length of 'min' bytes.

Signed-off-by: Peter Lieven 
---
 qemu-img.c | 38 --
 1 file changed, 12 insertions(+), 26 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 5df66fe..046696f 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1010,45 +1010,31 @@ static int is_allocated_sectors(const uint8_t *buf, int 
n, int *pnum)
 }
 
 /*
- * Like is_allocated_sectors, but if the buffer starts with a used sector,
- * up to 'min' consecutive sectors containing zeros are ignored. This avoids
- * breaking up write requests for only small sparse areas.
+ * Like is_allocated_sectors, but only at least 'min' consecutive sectors
+ * containing zeros are considered unallocated. This avoids breaking up write
+ * requests for only small sparse areas.
  */
 static int is_allocated_sectors_min(const uint8_t *buf, int n, int *pnum,
-int min)
+int min)
 {
-int ret;
-int num_checked, num_used;
-
-if (n < min) {
-min = n;
-}
-
-ret = is_allocated_sectors(buf, n, pnum);
-if (!ret) {
-return ret;
-}
-
-num_used = *pnum;
-buf += BDRV_SECTOR_SIZE * *pnum;
-n -= *pnum;
-num_checked = num_used;
+int ret, num_checked = 0, num_used = 0;
 
 while (n > 0) {
 ret = is_allocated_sectors(buf, n, pnum);
-
 buf += BDRV_SECTOR_SIZE * *pnum;
 n -= *pnum;
 num_checked += *pnum;
-if (ret) {
-num_used = num_checked;
-} else if (*pnum >= min) {
+if (!ret && *pnum >= min) {
 break;
 }
+num_used = num_checked;
 }
 
-*pnum = num_used;
-return 1;
+if (num_used) {
+*pnum = num_used;
+}
+
+return !!num_used;
 }
 
 /*
-- 
1.9.1

Re: [Qemu-block] [PATCH v6 0/2] allow blockdev-add for NFS

Am 19.01.2017 um 16:55 schrieb Kevin Wolf:

Am 19.01.2017 um 16:44 hat Peter Lieven geschrieben:

Am 19.01.2017 um 16:42 schrieb Kevin Wolf:

Am 19.01.2017 um 16:34 hat Peter Lieven geschrieben:

Am 19.01.2017 um 16:20 schrieb Kevin Wolf:

Am 19.01.2017 um 15:59 hat Eric Blake geschrieben:

On 01/19/2017 08:30 AM, Peter Lieven wrote:

qemu-img: Could not open
'nfs://172.21.200.61/vcore-dev-cdrom/templates/VC_debian8-20170116.qcow2,linux?readahead=131072':
Block protocol 'nfs' doesn't support the option 'readahead-size'

Please let me know if the below fix would be correct:

Shouldn't we support both for backwards compatiblity.?

blockdev-add only needs to support the modern naming. But yes,
preserving back-compat spelling of the command-line spellings, as well
as matching blockdev-add spellings, is desirable.

We only just added the individual command line options, previously it
only supported the URL.

So I wouldn't feel bad about removing the wrong names in this specific
case.

So want exactly do you want to do? Fix the names in the QAPI schema
to use the old naming?

No, fix the command line to use the names in the QAPI schema.

The option names from the URL were never supposed to be supported on the
command line.

Okay, so no backwards compatiblity? I actually used the options on the command
line...

Well, do you _need_ compatibility?

It can certainly be done, but as the (wrong) options on the command line
have only existed since November and were never documented, I wouldn't
bother unless there's a good reason.

Every Qemu before 2.8.0 was working with sth like:

qemu -cdrom nfs://10.0.0.1/expory/my.iso?readahead=131072

Peter

Re: [Qemu-block] [Qemu-devel] [PATCH v10 14/16] file-posix: Implement image locking

On 01/19/2017 09:49 AM, Daniel P. Berrange wrote:
> On Thu, Jan 19, 2017 at 10:38:14PM +0800, Fam Zheng wrote:
>> This implements open flag sensible image locking for local file
>> and host device protocol.
>>
>> virtlockd in libvirt locks the first byte, so we start looking at the
>> file bytes from 1.
>>
>> Quoting what was proposed by Kevin Wolf , there are
>> four locking modes by combining two bits (BDRV_O_RDWR and
>> BDRV_O_SHARE_RW), and implemented by taking two locks:
>>

>> +/* Posix file locking bytes. Libvirt takes byte 0, so start from byte 1. */
>> +#define RAW_LOCK_BYTE_MIN 1
>> +#define RAW_LOCK_BYTE_NO_OTHER_WRITER 1
>> +#define RAW_LOCK_BYTE_WRITE   2
> 
> ...would you mind if QEMU started from say byte 10, leaving the first 10
> reserved for libvirt uses. This lets libvirt have a continuous space for
> its own usage if we want to use more bytes

Thankfully, fcntl() locks can be taken beyond end-of-file, so I think
we're okay making an arbitrarily larger range of bytes reserved for each
process, even in the unlikely corner case of passing files smaller than
512 bytes.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-block] [PATCH v6 0/2] allow blockdev-add for NFS

Am 19.01.2017 um 16:20 schrieb Kevin Wolf:

Am 19.01.2017 um 15:59 hat Eric Blake geschrieben:

On 01/19/2017 08:30 AM, Peter Lieven wrote:

qemu-img: Could not open
'nfs://172.21.200.61/vcore-dev-cdrom/templates/VC_debian8-20170116.qcow2,linux?readahead=131072':
Block protocol 'nfs' doesn't support the option 'readahead-size'

Please let me know if the below fix would be correct:

Shouldn't we support both for backwards compatiblity.?

blockdev-add only needs to support the modern naming. But yes,
preserving back-compat spelling of the command-line spellings, as well
as matching blockdev-add spellings, is desirable.

We only just added the individual command line options, previously it
only supported the URL.

So I wouldn't feel bad about removing the wrong names in this specific
case.

So want exactly do you want to do? Fix the names in the QAPI schema
to use the old naming?

Peter

Re: [Qemu-block] [Qemu-devel] [PATCH v10 14/16] file-posix: Implement image locking

On Thu, Jan 19, 2017 at 10:38:14PM +0800, Fam Zheng wrote:
> This implements open flag sensible image locking for local file
> and host device protocol.
> 
> virtlockd in libvirt locks the first byte, so we start looking at the
> file bytes from 1.
> 
> Quoting what was proposed by Kevin Wolf , there are
> four locking modes by combining two bits (BDRV_O_RDWR and
> BDRV_O_SHARE_RW), and implemented by taking two locks:
> 
> Lock bytes:
> 
> * byte 1: I can't allow other processes to write to the image
> * byte 2: I am writing to the image
> 
> Lock modes:
> 
> * shared writer (BDRV_O_RDWR | BDRV_O_SHARE_RW): Take shared lock on
>   byte 2. Test whether byte 1 is locked using an exclusive lock, and
>   fail if so.
> 
> * exclusive writer (BDRV_O_RDWR only): Take shared lock on byte 2. Test
>   whether byte 1 is locked using an exclusive lock, and fail if so. Then
>   take shared lock on byte 1. I suppose this is racy, but we can
>   probably tolerate that.
> 
> * reader that can tolerate writers (BDRV_O_SHARE_RW only): Don't do anything
> 
> * reader that can't tolerate writers (neither bit is set): Take shared
>   lock on byte 1. Test whether byte 2 is locked, and fail if so.

Ahh, using two bytes is an interesting technique for mapping the four
different access methods onto the more limit fcntl lock semantics. We
might want to copy this approach in libvirt too

> +/* Posix file locking bytes. Libvirt takes byte 0, so start from byte 1. */
> +#define RAW_LOCK_BYTE_MIN 1
> +#define RAW_LOCK_BYTE_NO_OTHER_WRITER 1
> +#define RAW_LOCK_BYTE_WRITE   2

...would you mind if QEMU started from say byte 10, leaving the first 10
reserved for libvirt uses. This lets libvirt have a continuous space for
its own usage if we want to use more bytes

Regards,
Daniel
-- 
|: http://berrange.com  -o-http://www.flickr.com/photos/dberrange/ :|
|: http://libvirt.org  -o- http://virt-manager.org :|
|: http://entangle-photo.org   -o-http://search.cpan.org/~danberr/ :|

Re: [Qemu-block] [PATCH v6 0/2] allow blockdev-add for NFS

Am 19.01.2017 um 15:59 hat Eric Blake geschrieben:
> On 01/19/2017 08:30 AM, Peter Lieven wrote:
> >>> qemu-img: Could not open
> >>> 'nfs://172.21.200.61/vcore-dev-cdrom/templates/VC_debian8-20170116.qcow2,linux?readahead=131072':
> >>> Block protocol 'nfs' doesn't support the option 'readahead-size'
> >>>
> >>> Please let me know if the below fix would be correct:
> >> No, this needs to be fixed the other way round: runtime_opts must use
> >> the names as specified in the schema, and nfs_client_open() must access
> >> them as such. Without that, blockdev-add can't work (and the command
> >> line only with the "wrong" old option names from the URL, whereas it
> >> should be using the same names as the QAPI schema).
> > 
> > Shouldn't we support both for backwards compatiblity.?
> 
> blockdev-add only needs to support the modern naming.  But yes,
> preserving back-compat spelling of the command-line spellings, as well
> as matching blockdev-add spellings, is desirable.

We only just added the individual command line options, previously it
only supported the URL.

It's true that we have the messed up version of the options in 2.8, so
strictly speaking we would break compatibility with a release, but it's
only one release, it's only the nfs driver, and the documentation of the
options is the schema, which had the right option names even in 2.8
(they just didn't work).

So I wouldn't feel bad about removing the wrong names in this specific
case.

Kevin

pgp5pL45qgXtr.pgp
Description: PGP signature

Re: [Qemu-block] [PATCH v6 0/2] allow blockdev-add for NFS


Am 19.01.2017 um 16:42 schrieb Kevin Wolf:

Am 19.01.2017 um 16:34 hat Peter Lieven geschrieben:

Am 19.01.2017 um 16:20 schrieb Kevin Wolf:

Am 19.01.2017 um 15:59 hat Eric Blake geschrieben:

On 01/19/2017 08:30 AM, Peter Lieven wrote:

qemu-img: Could not open
'nfs://172.21.200.61/vcore-dev-cdrom/templates/VC_debian8-20170116.qcow2,linux?readahead=131072':
Block protocol 'nfs' doesn't support the option 'readahead-size'

Please let me know if the below fix would be correct:

No, this needs to be fixed the other way round: runtime_opts must use
the names as specified in the schema, and nfs_client_open() must access
them as such. Without that, blockdev-add can't work (and the command
line only with the "wrong" old option names from the URL, whereas it
should be using the same names as the QAPI schema).

Shouldn't we support both for backwards compatiblity.?

blockdev-add only needs to support the modern naming.  But yes,
preserving back-compat spelling of the command-line spellings, as well
as matching blockdev-add spellings, is desirable.

We only just added the individual command line options, previously it
only supported the URL.

It's true that we have the messed up version of the options in 2.8, so
strictly speaking we would break compatibility with a release, but it's
only one release, it's only the nfs driver, and the documentation of the
options is the schema, which had the right option names even in 2.8
(they just didn't work).

So I wouldn't feel bad about removing the wrong names in this specific
case.

So want exactly do you want to do? Fix the names in the QAPI schema
to use the old naming?

No, fix the command line to use the names in the QAPI schema.

The option names from the URL were never supposed to be supported on the
command line.

Kevin


This is what I wanted to do (before I asked ;-)):

diff --git a/block/nfs.c b/block/nfs.c
index baaecff..7c997cd 100644
--- a/block/nfs.c
+++ b/block/nfs.c
@@ -119,19 +119,24 @@ static int nfs_parse_uri(const char *filename, QDict 
*options, Error **errp)
qp->p[i].name);
 goto out;
 }
-if (!strcmp(qp->p[i].name, "uid")) {
+if (!strcmp(qp->p[i].name, "uid") ||
+!strcmp(qp->p[i].name, "user")) {
 qdict_put(options, "user",
   qstring_from_str(qp->p[i].value));
-} else if (!strcmp(qp->p[i].name, "gid")) {
+} else if (!strcmp(qp->p[i].name, "gid") ||
+   !strcmp(qp->p[i].name, "group")) {
 qdict_put(options, "group",
   qstring_from_str(qp->p[i].value));
-} else if (!strcmp(qp->p[i].name, "tcp-syncnt")) {
+} else if (!strcmp(qp->p[i].name, "tcp-syncnt") ||
+   !strcmp(qp->p[i].name, "tcp-syn-count")) {
 qdict_put(options, "tcp-syn-count",
   qstring_from_str(qp->p[i].value));
-} else if (!strcmp(qp->p[i].name, "readahead")) {
+} else if (!strcmp(qp->p[i].name, "readahead") ||
+   !strcmp(qp->p[i].name, "readahead-size")) {
 qdict_put(options, "readahead-size",
   qstring_from_str(qp->p[i].value));
-} else if (!strcmp(qp->p[i].name, "pagecache")) {
+} else if (!strcmp(qp->p[i].name, "pagecache") ||
+   !strcmp(qp->p[i].name, "page-cache-size")) {
 qdict_put(options, "page-cache-size",
   qstring_from_str(qp->p[i].value));
 } else if (!strcmp(qp->p[i].name, "debug")) {
@@ -359,27 +364,27 @@ static QemuOptsList runtime_opts = {
 .help = "Path of the image on the host",
 },
 {
-.name = "uid",
+.name = "user",
 .type = QEMU_OPT_NUMBER,
 .help = "UID value to use when talking to the server",
 },
 {
-.name = "gid",
+.name = "group",
 .type = QEMU_OPT_NUMBER,
 .help = "GID value to use when talking to the server",
 },
 {
-.name = "tcp-syncnt",
+.name = "tcp-syn-count",
 .type = QEMU_OPT_NUMBER,
 .help = "Number of SYNs to send during the session establish",
 },
 {
-.name = "readahead",
+.name = "readahead-size",
 .type = QEMU_OPT_NUMBER,
 .help = "Set the readahead size in bytes",
 },
 {
-.name = "pagecache",
+.name = "page-cache-size",
 .type = QEMU_OPT_NUMBER,
 .help = "Set the pagecache size in bytes",
 },
@@ -508,29 +513,29 @@ static int64_t nfs_client_open(NFSClient *client, QDict 
*options,
 goto fail;
 }

-if (qemu_opt_get(opts, "uid")) {
-client->uid = qemu_opt_get_number(opts, "uid", 0);
+if (qemu_opt_get(opts, "user")) {
+client->uid = qemu_opt_get_number(opts, "user", 0);
 nfs_set_uid(client->context, client->ui

Re: [Qemu-block] [PATCH v6 0/2] allow blockdev-add for NFS

Am 19.01.2017 um 16:42 schrieb Kevin Wolf:

Am 19.01.2017 um 16:34 hat Peter Lieven geschrieben:

Am 19.01.2017 um 16:20 schrieb Kevin Wolf:

Am 19.01.2017 um 15:59 hat Eric Blake geschrieben:

On 01/19/2017 08:30 AM, Peter Lieven wrote:

qemu-img: Could not open
'nfs://172.21.200.61/vcore-dev-cdrom/templates/VC_debian8-20170116.qcow2,linux?readahead=131072':
Block protocol 'nfs' doesn't support the option 'readahead-size'

Please let me know if the below fix would be correct:

Shouldn't we support both for backwards compatiblity.?

blockdev-add only needs to support the modern naming. But yes,
preserving back-compat spelling of the command-line spellings, as well
as matching blockdev-add spellings, is desirable.

We only just added the individual command line options, previously it
only supported the URL.

So I wouldn't feel bad about removing the wrong names in this specific
case.

So want exactly do you want to do? Fix the names in the QAPI schema
to use the old naming?

No, fix the command line to use the names in the QAPI schema.

The option names from the URL were never supposed to be supported on the
command line.

Okay, so no backwards compatiblity? I actually used the options on the command
line...

Peter

Re: [Qemu-block] [PATCH v6 0/2] allow blockdev-add for NFS

On 01/19/2017 08:30 AM, Peter Lieven wrote:
>>> qemu-img: Could not open
>>> 'nfs://172.21.200.61/vcore-dev-cdrom/templates/VC_debian8-20170116.qcow2,linux?readahead=131072':
>>> Block protocol 'nfs' doesn't support the option 'readahead-size'
>>>
>>> Please let me know if the below fix would be correct:
>> No, this needs to be fixed the other way round: runtime_opts must use
>> the names as specified in the schema, and nfs_client_open() must access
>> them as such. Without that, blockdev-add can't work (and the command
>> line only with the "wrong" old option names from the URL, whereas it
>> should be using the same names as the QAPI schema).
> 
> Shouldn't we support both for backwards compatiblity.?

blockdev-add only needs to support the modern naming.  But yes,
preserving back-compat spelling of the command-line spellings, as well
as matching blockdev-add spellings, is desirable.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature

Re: [Qemu-block] [PATCH v6 0/2] allow blockdev-add for NFS

Am 19.01.2017 um 16:44 hat Peter Lieven geschrieben:
> Am 19.01.2017 um 16:42 schrieb Kevin Wolf:
> >Am 19.01.2017 um 16:34 hat Peter Lieven geschrieben:
> >>Am 19.01.2017 um 16:20 schrieb Kevin Wolf:
> >>>Am 19.01.2017 um 15:59 hat Eric Blake geschrieben:
> On 01/19/2017 08:30 AM, Peter Lieven wrote:
> >>>qemu-img: Could not open
> >>>'nfs://172.21.200.61/vcore-dev-cdrom/templates/VC_debian8-20170116.qcow2,linux?readahead=131072':
> >>>Block protocol 'nfs' doesn't support the option 'readahead-size'
> >>>
> >>>Please let me know if the below fix would be correct:
> >>No, this needs to be fixed the other way round: runtime_opts must use
> >>the names as specified in the schema, and nfs_client_open() must access
> >>them as such. Without that, blockdev-add can't work (and the command
> >>line only with the "wrong" old option names from the URL, whereas it
> >>should be using the same names as the QAPI schema).
> >Shouldn't we support both for backwards compatiblity.?
> blockdev-add only needs to support the modern naming.  But yes,
> preserving back-compat spelling of the command-line spellings, as well
> as matching blockdev-add spellings, is desirable.
> >>>We only just added the individual command line options, previously it
> >>>only supported the URL.
> >>>
> >>>It's true that we have the messed up version of the options in 2.8, so
> >>>strictly speaking we would break compatibility with a release, but it's
> >>>only one release, it's only the nfs driver, and the documentation of the
> >>>options is the schema, which had the right option names even in 2.8
> >>>(they just didn't work).
> >>>
> >>>So I wouldn't feel bad about removing the wrong names in this specific
> >>>case.
> >>So want exactly do you want to do? Fix the names in the QAPI schema
> >>to use the old naming?
> >No, fix the command line to use the names in the QAPI schema.
> >
> >The option names from the URL were never supposed to be supported on the
> >command line.
> 
> Okay, so no backwards compatiblity? I actually used the options on the 
> command line...

Well, do you _need_ compatibility?

It can certainly be done, but as the (wrong) options on the command line
have only existed since November and were never documented, I wouldn't
bother unless there's a good reason.

Kevin

Re: [Qemu-block] [PATCH v6 0/2] allow blockdev-add for NFS

Am 19.01.2017 um 16:34 hat Peter Lieven geschrieben:
> Am 19.01.2017 um 16:20 schrieb Kevin Wolf:
> >Am 19.01.2017 um 15:59 hat Eric Blake geschrieben:
> >>On 01/19/2017 08:30 AM, Peter Lieven wrote:
> >qemu-img: Could not open
> >'nfs://172.21.200.61/vcore-dev-cdrom/templates/VC_debian8-20170116.qcow2,linux?readahead=131072':
> >Block protocol 'nfs' doesn't support the option 'readahead-size'
> >
> >Please let me know if the below fix would be correct:
> No, this needs to be fixed the other way round: runtime_opts must use
> the names as specified in the schema, and nfs_client_open() must access
> them as such. Without that, blockdev-add can't work (and the command
> line only with the "wrong" old option names from the URL, whereas it
> should be using the same names as the QAPI schema).
> >>>Shouldn't we support both for backwards compatiblity.?
> >>blockdev-add only needs to support the modern naming.  But yes,
> >>preserving back-compat spelling of the command-line spellings, as well
> >>as matching blockdev-add spellings, is desirable.
> >We only just added the individual command line options, previously it
> >only supported the URL.
> >
> >It's true that we have the messed up version of the options in 2.8, so
> >strictly speaking we would break compatibility with a release, but it's
> >only one release, it's only the nfs driver, and the documentation of the
> >options is the schema, which had the right option names even in 2.8
> >(they just didn't work).
> >
> >So I wouldn't feel bad about removing the wrong names in this specific
> >case.
> 
> So want exactly do you want to do? Fix the names in the QAPI schema
> to use the old naming?

No, fix the command line to use the names in the QAPI schema.

The option names from the URL were never supposed to be supported on the
command line.

Kevin

[Qemu-block] [PATCH v10 15/16] qcow2: Force "no other writer" lock on bs->file

Writing to the same qcow2 file from two QEMU processes at the same time
will never work correctly, so disable it even when the caller specifies
BDRV_O_RDWR.

Other formats are less vulnerable because they don't have internal
snapshots thus qemu-img is less often misused to create live snapshot.

Signed-off-by: Fam Zheng 
---
 block/qcow2.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/block/qcow2.c b/block/qcow2.c
index 96fb8a8..879361a 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1177,6 +1177,17 @@ static int qcow2_open(BlockDriverState *bs, QDict 
*options, int flags,
 }
 }
 
+if ((flags & BDRV_O_SHARE_RW) && (flags & BDRV_O_RDWR)) {
+/* Shared write is never a good idea for qcow2, override it.
+ * XXX: Use permission propagation and masking mechanism in op blockers
+ * API once it's there. */
+ret = bdrv_reopen(bs->file->bs, flags & ~BDRV_O_SHARE_RW, &local_err);
+if (ret) {
+error_propagate(errp, local_err);
+goto fail;
+}
+}
+
 #ifdef DEBUG_ALLOC
 {
 BdrvCheckResult result = {0};
-- 
2.9.3

[Qemu-block] [PATCH v10 13/16] tests: Disable image lock in test-replication

The COLO block replication architecture requires one disk to be shared
between primary and secondary, in the test both processes use posix file
protocol (instead of over NBD) so it is affected by image locking.
Disable the lock.

Signed-off-by: Fam Zheng 
---
 tests/test-replication.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/tests/test-replication.c b/tests/test-replication.c
index fac2da3..5bede49 100644
--- a/tests/test-replication.c
+++ b/tests/test-replication.c
@@ -179,7 +179,8 @@ static BlockBackend *start_primary(void)
 char *cmdline;
 
 cmdline = g_strdup_printf("driver=replication,mode=primary,node-name=xxx,"
-  "file.driver=qcow2,file.file.filename=%s"
+  "file.driver=qcow2,file.file.filename=%s,"
+  "file.file.disable-lock=on"
   , p_local_disk);
 opts = qemu_opts_parse_noisily(&qemu_drive_opts, cmdline, false);
 g_free(cmdline);
@@ -310,7 +311,9 @@ static BlockBackend *start_secondary(void)
 Error *local_err = NULL;
 
 /* add s_local_disk and forge S_LOCAL_DISK_ID */
-cmdline = g_strdup_printf("file.filename=%s,driver=qcow2", s_local_disk);
+cmdline = g_strdup_printf("file.filename=%s,driver=qcow2,"
+  "file.file.disable-lock=on",
+  s_local_disk);
 opts = qemu_opts_parse_noisily(&qemu_drive_opts, cmdline, false);
 g_free(cmdline);
 
@@ -331,8 +334,10 @@ static BlockBackend *start_secondary(void)
 /* add S_(ACTIVE/HIDDEN)_DISK and forge S_ID */
 cmdline = g_strdup_printf("driver=replication,mode=secondary,top-id=%s,"
   "file.driver=qcow2,file.file.filename=%s,"
+  "file.file.disable-lock=on",
   "file.backing.driver=qcow2,"
   "file.backing.file.filename=%s,"
+  "file.backing.file.disable-lock=on",
   "file.backing.backing=%s"
   , S_ID, s_active_disk, s_hidden_disk
   , S_LOCAL_DISK_ID);
-- 
2.9.3

[Qemu-block] [PATCH v10 11/16] iotests: 172: Use separate images for multiple devices

To avoid image lock failures.

Signed-off-by: Fam Zheng 
---
 tests/qemu-iotests/172 | 53 --
 tests/qemu-iotests/172.out | 50 ++-
 2 files changed, 54 insertions(+), 49 deletions(-)

diff --git a/tests/qemu-iotests/172 b/tests/qemu-iotests/172
index 1b7d3a1..f9d4cff 100755
--- a/tests/qemu-iotests/172
+++ b/tests/qemu-iotests/172
@@ -86,6 +86,9 @@ size=720k
 
 _make_test_img $size
 
+TEST_IMG="$TEST_IMG.2" _make_test_img $size
+TEST_IMG="$TEST_IMG.3" _make_test_img $size
+
 # Default drive semantics:
 #
 # By default you get a single empty floppy drive. You can override it with
@@ -105,7 +108,7 @@ echo === Using -fda/-fdb options ===
 
 check_floppy_qtree -fda "$TEST_IMG"
 check_floppy_qtree -fdb "$TEST_IMG"
-check_floppy_qtree -fda "$TEST_IMG" -fdb "$TEST_IMG"
+check_floppy_qtree -fda "$TEST_IMG" -fdb "$TEST_IMG.2"
 
 
 echo
@@ -114,7 +117,7 @@ echo === Using -drive options ===
 
 check_floppy_qtree -drive if=floppy,file="$TEST_IMG"
 check_floppy_qtree -drive if=floppy,file="$TEST_IMG",index=1
-check_floppy_qtree -drive if=floppy,file="$TEST_IMG" -drive 
if=floppy,file="$TEST_IMG",index=1
+check_floppy_qtree -drive if=floppy,file="$TEST_IMG" -drive 
if=floppy,file="$TEST_IMG.2",index=1
 
 echo
 echo
@@ -122,7 +125,7 @@ echo === Using -drive if=none and -global ===
 
 check_floppy_qtree -drive if=none,file="$TEST_IMG" -global isa-fdc.driveA=none0
 check_floppy_qtree -drive if=none,file="$TEST_IMG" -global isa-fdc.driveB=none0
-check_floppy_qtree -drive if=none,file="$TEST_IMG" -drive 
if=none,file="$TEST_IMG" \
+check_floppy_qtree -drive if=none,file="$TEST_IMG" -drive 
if=none,file="$TEST_IMG.2" \
-global isa-fdc.driveA=none0 -global isa-fdc.driveB=none1
 
 echo
@@ -131,7 +134,7 @@ echo === Using -drive if=none and -device ===
 
 check_floppy_qtree -drive if=none,file="$TEST_IMG" -device floppy,drive=none0
 check_floppy_qtree -drive if=none,file="$TEST_IMG" -device 
floppy,drive=none0,unit=1
-check_floppy_qtree -drive if=none,file="$TEST_IMG" -drive 
if=none,file="$TEST_IMG" \
+check_floppy_qtree -drive if=none,file="$TEST_IMG",file.disable-lock=on -drive 
if=none,file="$TEST_IMG.2" \
-device floppy,drive=none0 -device floppy,drive=none1,unit=1
 
 echo
@@ -139,58 +142,58 @@ echo
 echo === Mixing -fdX and -global ===
 
 # Working
-check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG" -global 
isa-fdc.driveB=none0
-check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG" -global 
isa-fdc.driveA=none0
+check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -global 
isa-fdc.driveB=none0
+check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -global 
isa-fdc.driveA=none0
 
 # Conflicting (-fdX wins)
-check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG" -global 
isa-fdc.driveA=none0
-check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG" -global 
isa-fdc.driveB=none0
+check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -global 
isa-fdc.driveA=none0
+check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -global 
isa-fdc.driveB=none0
 
 echo
 echo
 echo === Mixing -fdX and -device ===
 
 # Working
-check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG" -device 
floppy,drive=none0
-check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG" -device 
floppy,drive=none0,unit=1
+check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -device 
floppy,drive=none0
+check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -device 
floppy,drive=none0,unit=1
 
-check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG" -device 
floppy,drive=none0
-check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG" -device 
floppy,drive=none0,unit=0
+check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -device 
floppy,drive=none0
+check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -device 
floppy,drive=none0,unit=0
 
 # Conflicting
-check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG" -device 
floppy,drive=none0,unit=0
-check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG" -device 
floppy,drive=none0,unit=1
+check_floppy_qtree -fda "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -device 
floppy,drive=none0,unit=0
+check_floppy_qtree -fdb "$TEST_IMG" -drive if=none,file="$TEST_IMG.2" -device 
floppy,drive=none0,unit=1
 
 echo
 echo
 echo === Mixing -drive and -device ===
 
 # Working
-check_floppy_qtree -drive if=floppy,file="$TEST_IMG" -drive 
if=none,file="$TEST_IMG" -device floppy,drive=none0
-check_floppy_qtree -drive if=floppy,file="$TEST_IMG" -drive 
if=none,file="$TEST_IMG" -device floppy,drive=none0,unit=1
+check_floppy_qtree -drive if=floppy,file="$TEST_IMG" -drive 
if=none,file="$TEST_IMG.2" -device floppy,drive=none0
+check_floppy_qtree -drive if=floppy,f

Re: [Qemu-block] [Qemu-devel] [PATCH v2 4/6] fdc-test: Avoid deprecated 'change' command

On 01/19/2017 03:30 AM, Markus Armbruster wrote:
> Eric Blake  writes:
> 
>> Use the preferred blockdev-change-medium command instead.
>>
>> Signed-off-by: Eric Blake 
>> Reviewed-by: John Snow 
>> ---
>>  tests/fdc-test.c | 5 +++--
>>  1 file changed, 3 insertions(+), 2 deletions(-)
>>
>> diff --git a/tests/fdc-test.c b/tests/fdc-test.c
>> index 738c6b4..f5ff68d 100644
>> --- a/tests/fdc-test.c
>> +++ b/tests/fdc-test.c
>> @@ -298,8 +298,9 @@ static void test_media_insert(void)
>>
>>  /* Insert media in drive. DSKCHK should not be reset until a step pulse
>>   * is sent. */
>> -qmp_discard_response("{'execute':'change', 'arguments':{"
>> - " 'device':'floppy0', 'target': %s, 'arg': 'raw' 
>> }}",
>> +qmp_discard_response("{'execute':'blockdev-change-medium', 
>> 'arguments':{"
>> + " 'device':'floppy0', 'filename': %s, "
>> + "'format': 'raw' }}",
>>   test_image);
>>
>>  dir = inb(FLOPPY_BASE + reg_dir);
> 
> 'device' is deprecated.  Can we use 'id' here?

I'll give it a shot.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature

[Qemu-block] [PATCH v10 12/16] tests: Use null-co:// instead of /dev/null as the dummy image

Signed-off-by: Fam Zheng 
---
 tests/drive_del-test.c| 2 +-
 tests/nvme-test.c | 2 +-
 tests/usb-hcd-uhci-test.c | 2 +-
 tests/usb-hcd-xhci-test.c | 2 +-
 tests/virtio-blk-test.c   | 2 +-
 tests/virtio-scsi-test.c  | 4 ++--
 6 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tests/drive_del-test.c b/tests/drive_del-test.c
index 121b9c9..2175139 100644
--- a/tests/drive_del-test.c
+++ b/tests/drive_del-test.c
@@ -92,7 +92,7 @@ static void test_after_failed_device_add(void)
 static void test_drive_del_device_del(void)
 {
 /* Start with a drive used by a device that unplugs instantaneously */
-qtest_start("-drive if=none,id=drive0,file=/dev/null,format=raw"
+qtest_start("-drive if=none,id=drive0,file=null-co://,format=raw"
 " -device virtio-scsi-pci"
 " -device scsi-hd,drive=drive0,id=dev0");
 
diff --git a/tests/nvme-test.c b/tests/nvme-test.c
index c8bece4..7674a44 100644
--- a/tests/nvme-test.c
+++ b/tests/nvme-test.c
@@ -22,7 +22,7 @@ int main(int argc, char **argv)
 g_test_init(&argc, &argv, NULL);
 qtest_add_func("/nvme/nop", nop);
 
-qtest_start("-drive id=drv0,if=none,file=/dev/null,format=raw "
+qtest_start("-drive id=drv0,if=none,file=null-co://,format=raw "
 "-device nvme,drive=drv0,serial=foo");
 ret = g_test_run();
 
diff --git a/tests/usb-hcd-uhci-test.c b/tests/usb-hcd-uhci-test.c
index e956b9c..24c3ea9 100644
--- a/tests/usb-hcd-uhci-test.c
+++ b/tests/usb-hcd-uhci-test.c
@@ -78,7 +78,7 @@ int main(int argc, char **argv)
 {
 const char *arch = qtest_get_arch();
 const char *cmd = "-device piix3-usb-uhci,id=uhci,addr=1d.0"
-  " -drive id=drive0,if=none,file=/dev/null,format=raw"
+  " -drive id=drive0,if=none,file=null-co://,format=raw"
   " -device usb-tablet,bus=uhci.0,port=1";
 int ret;
 
diff --git a/tests/usb-hcd-xhci-test.c b/tests/usb-hcd-xhci-test.c
index 22513e9..031764d 100644
--- a/tests/usb-hcd-xhci-test.c
+++ b/tests/usb-hcd-xhci-test.c
@@ -89,7 +89,7 @@ int main(int argc, char **argv)
 qtest_add_func("/xhci/pci/hotplug/usb-uas", test_usb_uas_hotplug);
 
 qtest_start("-device nec-usb-xhci,id=xhci"
-" -drive id=drive0,if=none,file=/dev/null,format=raw");
+" -drive id=drive0,if=none,file=null-co://,format=raw");
 ret = g_test_run();
 qtest_end();
 
diff --git a/tests/virtio-blk-test.c b/tests/virtio-blk-test.c
index 0e32e41..0fc7c99 100644
--- a/tests/virtio-blk-test.c
+++ b/tests/virtio-blk-test.c
@@ -63,7 +63,7 @@ static QOSState *pci_test_start(void)
 const char *arch = qtest_get_arch();
 char *tmp_path;
 const char *cmd = "-drive if=none,id=drive0,file=%s,format=raw "
-  "-drive if=none,id=drive1,file=/dev/null,format=raw "
+  "-drive if=none,id=drive1,file=null-co://,format=raw "
   "-device virtio-blk-pci,id=drv0,drive=drive0,"
   "addr=%x.%x";
 
diff --git a/tests/virtio-scsi-test.c b/tests/virtio-scsi-test.c
index 69220ef..8c27641 100644
--- a/tests/virtio-scsi-test.c
+++ b/tests/virtio-scsi-test.c
@@ -35,7 +35,7 @@ typedef struct {
 static QOSState *qvirtio_scsi_start(const char *extra_opts)
 {
 const char *arch = qtest_get_arch();
-const char *cmd = "-drive id=drv0,if=none,file=/dev/null,format=raw "
+const char *cmd = "-drive id=drv0,if=none,file=null-co://,format=raw "
   "-device virtio-scsi-pci,id=vs0 "
   "-device scsi-hd,bus=vs0.0,drive=drv0 %s";
 
@@ -195,7 +195,7 @@ static void hotplug(void)
 QDict *response;
 QOSState *qs;
 
-qs = qvirtio_scsi_start("-drive 
id=drv1,if=none,file=/dev/null,format=raw");
+qs = qvirtio_scsi_start("-drive 
id=drv1,if=none,file=null-co://,format=raw");
 response = qmp("{\"execute\": \"device_add\","
" \"arguments\": {"
"   \"driver\": \"scsi-hd\","
-- 
2.9.3

[Qemu-block] [PATCH v10 10/16] iotests: 091: Quit QEMU before checking image

Signed-off-by: Fam Zheng 
---
 tests/qemu-iotests/091 | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/qemu-iotests/091 b/tests/qemu-iotests/091
index 32bbd56..10ac4a8 100755
--- a/tests/qemu-iotests/091
+++ b/tests/qemu-iotests/091
@@ -95,7 +95,9 @@ echo "vm2: qemu process running successfully"
 echo "vm2: flush io, and quit"
 _send_qemu_cmd $h2 'qemu-io disk flush' "(qemu)"
 _send_qemu_cmd $h2 'quit' ""
+_send_qemu_cmd $h1 'quit' ""
 
+wait
 echo "Check image pattern"
 ${QEMU_IO} -c "read -P 0x22 0 4M" "${TEST_IMG}" | _filter_testdir | 
_filter_qemu_io
 
-- 
2.9.3

[Qemu-block] [PATCH v10 09/16] iotests: 085: Avoid image locking conflict

In the case where we test the expected error when a blockdev-snapshot
target already has a backing image, backing chain is opened multiple
times. This will be a problem when we use image locking, so use a
different backing file that is not already open.

Signed-off-by: Fam Zheng 
---
 tests/qemu-iotests/085 | 32 +++-
 tests/qemu-iotests/085.out |  3 ++-
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/tests/qemu-iotests/085 b/tests/qemu-iotests/085
index c53e97f..c91d78d 100755
--- a/tests/qemu-iotests/085
+++ b/tests/qemu-iotests/085
@@ -87,24 +87,26 @@ function create_group_snapshot()
 }
 
 # ${1}: unique identifier for the snapshot filename
-# ${2}: true: open backing images; false: don't open them (default)
+# ${2}: extra_params to the blockdev-add command
+# ${3}: filename
+function do_blockdev_add()
+{
+cmd="{ 'execute': 'blockdev-add', 'arguments':
+   { 'driver': 'qcow2', 'node-name': 'snap_${1}', ${2}
+ 'file':
+ { 'driver': 'file', 'filename': '${3}',
+   'node-name': 'file_${1}' } } }"
+_send_qemu_cmd $h "${cmd}" "return"
+}
+
+# ${1}: unique identifier for the snapshot filename
 function add_snapshot_image()
 {
-if [ "${2}" = "true" ]; then
-extra_params=""
-else
-extra_params="'backing': '', "
-fi
 base_image="${TEST_DIR}/$((${1}-1))-${snapshot_virt0}"
 snapshot_file="${TEST_DIR}/${1}-${snapshot_virt0}"
 _make_test_img -b "${base_image}" "$size"
 mv "${TEST_IMG}" "${snapshot_file}"
-cmd="{ 'execute': 'blockdev-add', 'arguments':
-   { 'driver': 'qcow2', 'node-name': 'snap_${1}', ${extra_params}
- 'file':
- { 'driver': 'file', 'filename': '${snapshot_file}',
-   'node-name': 'file_${1}' } } }"
-_send_qemu_cmd $h "${cmd}" "return"
+do_blockdev_add "$1" "'backing': '', " "${snapshot_file}"
 }
 
 # ${1}: unique identifier for the snapshot filename
@@ -222,7 +224,11 @@ echo === Invalid command - snapshot node has a backing 
image ===
 echo
 
 SNAPSHOTS=$((${SNAPSHOTS}+1))
-add_snapshot_image ${SNAPSHOTS} true
+
+_make_test_img "$size"
+mv "${TEST_IMG}" "${TEST_IMG}.base"
+_make_test_img -b "${TEST_IMG}.base" "$size"
+do_blockdev_add ${SNAPSHOTS} "" "${TEST_IMG}"
 blockdev_snapshot ${SNAPSHOTS} error
 
 echo
diff --git a/tests/qemu-iotests/085.out b/tests/qemu-iotests/085.out
index 08e4bb7..7bbf84d 100644
--- a/tests/qemu-iotests/085.out
+++ b/tests/qemu-iotests/085.out
@@ -78,7 +78,8 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 
backing_file=TEST_DIR/
 
 === Invalid command - snapshot node has a backing image ===
 
-Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 
backing_file=TEST_DIR/12-snapshot-v0.IMGFMT
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 
backing_file=TEST_DIR/t.IMGFMT.base
 {"return": {}}
 {"error": {"class": "GenericError", "desc": "The snapshot already has a 
backing image"}}
 
-- 
2.9.3

Re: [Qemu-block] [Qemu-devel] [PATCH v2 2/6] qdict: Add convenience helpers for wrapped puts

On 01/19/2017 03:25 AM, Markus Armbruster wrote:
> Eric Blake  writes:
> 
>> Quite a few users of qdict_put() were manually wrapping a
>> non-QObject. We can make such call-sites shorter, by providing
>> common macros to do the tedious work.  Also shorten nearby
>> qdict_put_obj(,,QOBJECT()) sequences.
>>
>> Signed-off-by: Eric Blake 
>> Reviewed-by: Alberto Garcia 
>>
>> ---
>>
>> v2: rebase to current master
>>
>> I'm okay if you want me to break this patch into smaller pieces.
> 
> I guess I'm okay with a single piece, but I'd like to know how you did
> the conversion.  Coccinelle?  Manually?

Manual, via grepping for put_obj.*QOBJECT. I'll see if I can do the same
under Coccinelle (at which point, committing the script will make it
easier to rerun cleanups if later code reintroduces poor usage
patterns), so maybe I have a v3 coming up.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature

[Qemu-block] [PATCH v10 05/16] block: Set "share-rw" flag in drive-backup when sync=none

In this case we may open the source's backing image chain multiple
times. Setting share flag means the new open won't try to acquire or
check any lock, once we implement image locking.

Signed-off-by: Fam Zheng 
---
 blockdev.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/blockdev.c b/blockdev.c
index 245e1e1..c97e97f 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3177,6 +3177,9 @@ static BlockJob *do_drive_backup(DriveBackup *backup, 
BlockJobTxn *txn,
 }
 if (backup->sync == MIRROR_SYNC_MODE_NONE) {
 source = bs;
+/* FIXME: block layer should really open target with BDRV_O_NO_BACKING
+ * and reuse source's backing chain, if they share one. */
+flags |= BDRV_O_SHARE_RW;
 }
 
 size = bdrv_getlength(bs);
-- 
2.9.3

[Qemu-block] [PATCH v10 06/16] iotests: 055: Don't attach the drive to vm for drive-backup

Signed-off-by: Fam Zheng 
---
 tests/qemu-iotests/055 | 32 ++--
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/tests/qemu-iotests/055 b/tests/qemu-iotests/055
index 1d3fd04..20a7596 100755
--- a/tests/qemu-iotests/055
+++ b/tests/qemu-iotests/055
@@ -455,17 +455,18 @@ class TestDriveCompression(iotests.QMPTestCase):
 except OSError:
 pass
 
-def do_prepare_drives(self, fmt, args):
+def do_prepare_drives(self, fmt, args, attach):
 self.vm = iotests.VM().add_drive(test_img)
 
 qemu_img('create', '-f', fmt, blockdev_target_img,
  str(TestDriveCompression.image_len), *args)
-self.vm.add_drive(blockdev_target_img, format=fmt)
+if attach:
+self.vm.add_drive(blockdev_target_img, format=fmt)
 
 self.vm.launch()
 
-def do_test_compress_complete(self, cmd, format, **args):
-self.do_prepare_drives(format['type'], format['args'])
+def do_test_compress_complete(self, cmd, format, attach, **args):
+self.do_prepare_drives(format['type'], format['args'], attach)
 
 self.assert_no_active_block_jobs()
 
@@ -481,15 +482,16 @@ class TestDriveCompression(iotests.QMPTestCase):
 
 def test_complete_compress_drive_backup(self):
 for format in TestDriveCompression.fmt_supports_compression:
-self.do_test_compress_complete('drive-backup', format,
+self.do_test_compress_complete('drive-backup', format, 
attach=False,
target=blockdev_target_img, 
mode='existing')
 
 def test_complete_compress_blockdev_backup(self):
 for format in TestDriveCompression.fmt_supports_compression:
-self.do_test_compress_complete('blockdev-backup', format, 
target='drive1')
+self.do_test_compress_complete('blockdev-backup', format,
+   attach=True, target='drive1')
 
-def do_test_compress_cancel(self, cmd, format, **args):
-self.do_prepare_drives(format['type'], format['args'])
+def do_test_compress_cancel(self, cmd, format, attach, **args):
+self.do_prepare_drives(format['type'], format['args'], attach)
 
 self.assert_no_active_block_jobs()
 
@@ -503,15 +505,16 @@ class TestDriveCompression(iotests.QMPTestCase):
 
 def test_compress_cancel_drive_backup(self):
 for format in TestDriveCompression.fmt_supports_compression:
-self.do_test_compress_cancel('drive-backup', format,
+self.do_test_compress_cancel('drive-backup', format, attach=False,
  target=blockdev_target_img, 
mode='existing')
 
 def test_compress_cancel_blockdev_backup(self):
for format in TestDriveCompression.fmt_supports_compression:
-self.do_test_compress_cancel('blockdev-backup', format, 
target='drive1')
+self.do_test_compress_cancel('blockdev-backup', format, 
attach=True,
+ target='drive1')
 
-def do_test_compress_pause(self, cmd, format, **args):
-self.do_prepare_drives(format['type'], format['args'])
+def do_test_compress_pause(self, cmd, format, attach, **args):
+self.do_prepare_drives(format['type'], format['args'], attach)
 
 self.assert_no_active_block_jobs()
 
@@ -543,12 +546,13 @@ class TestDriveCompression(iotests.QMPTestCase):
 
 def test_compress_pause_drive_backup(self):
 for format in TestDriveCompression.fmt_supports_compression:
-self.do_test_compress_pause('drive-backup', format,
+self.do_test_compress_pause('drive-backup', format, attach=False,
 target=blockdev_target_img, 
mode='existing')
 
 def test_compress_pause_blockdev_backup(self):
 for format in TestDriveCompression.fmt_supports_compression:
-self.do_test_compress_pause('blockdev-backup', format, 
target='drive1')
+self.do_test_compress_pause('blockdev-backup', format, attach=True,
+target='drive1')
 
 if __name__ == '__main__':
 iotests.main(supported_fmts=['raw', 'qcow2'])
-- 
2.9.3

[Qemu-block] [PATCH v10 02/16] block: Define BDRV_O_SHARE_RW

Signed-off-by: Fam Zheng 
---
 include/block/block.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/block/block.h b/include/block/block.h
index 8b0dcda..243839d 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -97,6 +97,8 @@ typedef struct HDGeometry {
   select an appropriate protocol driver,
   ignoring the format layer */
 #define BDRV_O_NO_IO   0x1 /* don't initialize for I/O */
+#define BDRV_O_SHARE_RW0x2 /* accept shared read-write from other users
+  of the image */
 
 #define BDRV_O_CACHE_MASK  (BDRV_O_NOCACHE | BDRV_O_NO_FLUSH)
 
-- 
2.9.3

[Qemu-block] [PATCH v10 07/16] iotests: 030: Read-only open image for getting map

Signed-off-by: Fam Zheng 
---
 tests/qemu-iotests/030 | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/qemu-iotests/030 b/tests/qemu-iotests/030
index 54db54a..fe0c73f 100755
--- a/tests/qemu-iotests/030
+++ b/tests/qemu-iotests/030
@@ -114,7 +114,7 @@ class TestSingleDrive(iotests.QMPTestCase):
 self.assert_no_active_block_jobs()
 
 # The image map is empty before the operation
-empty_map = qemu_io('-f', iotests.imgfmt, '-c', 'map', test_img)
+empty_map = qemu_io('-f', iotests.imgfmt, '-c', 'map', '-r', test_img)
 
 # This is a no-op: no data should ever be copied from the base image
 result = self.vm.qmp('block-stream', device='drive0', base=mid_img)
@@ -125,7 +125,7 @@ class TestSingleDrive(iotests.QMPTestCase):
 self.assert_no_active_block_jobs()
 self.vm.shutdown()
 
-self.assertEqual(qemu_io('-f', iotests.imgfmt, '-c', 'map', test_img),
+self.assertEqual(qemu_io('-f', iotests.imgfmt, '-c', 'map', '-r', 
test_img),
  empty_map, 'image file map changed after a no-op')
 
 def test_stream_partial(self):
-- 
2.9.3

[Qemu-block] [PATCH v10 01/16] osdep: Add qemu_lock_fd and qemu_unlock_fd

They are wrappers of POSIX fcntl "file private locking", with a
convenient "try lock" wrapper implemented with F_OFD_GETLK.

Signed-off-by: Fam Zheng 
---
 include/qemu/osdep.h |  3 +++
 util/osdep.c | 48 
 2 files changed, 51 insertions(+)

diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index 689f253..e864fe8 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -297,6 +297,9 @@ int qemu_close(int fd);
 #ifndef _WIN32
 int qemu_dup(int fd);
 #endif
+int qemu_lock_fd(int fd, int64_t start, int64_t len, bool exclusive);
+int qemu_unlock_fd(int fd, int64_t start, int64_t len);
+int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive);
 
 #if defined(__HAIKU__) && defined(__i386__)
 #define FMT_pid "%ld"
diff --git a/util/osdep.c b/util/osdep.c
index 06fb1cf..3de4a18 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -140,6 +140,54 @@ static int qemu_parse_fdset(const char *param)
 {
 return qemu_parse_fd(param);
 }
+
+static int qemu_lock_fcntl(int fd, int64_t start, int64_t len, int fl_type)
+{
+#ifdef F_OFD_SETLK
+int ret;
+struct flock fl = {
+.l_whence = SEEK_SET,
+.l_start  = start,
+.l_len= len,
+.l_type   = fl_type,
+};
+ret = fcntl(fd, F_OFD_SETLK, &fl);
+return ret == -1 ? -errno : 0;
+#else
+return -ENOTSUP;
+#endif
+}
+
+int qemu_lock_fd(int fd, int64_t start, int64_t len, bool exclusive)
+{
+return qemu_lock_fcntl(fd, start, len, exclusive ? F_WRLCK : F_RDLCK);
+}
+
+int qemu_unlock_fd(int fd, int64_t start, int64_t len)
+{
+return qemu_lock_fcntl(fd, start, len, F_UNLCK);
+}
+
+int qemu_lock_fd_test(int fd, int64_t start, int64_t len, bool exclusive)
+{
+#ifdef F_OFD_SETLK
+int ret;
+struct flock fl = {
+.l_whence = SEEK_SET,
+.l_start  = start,
+.l_len= len,
+.l_type   = exclusive ? F_WRLCK : F_RDLCK,
+};
+ret = fcntl(fd, F_OFD_GETLK, &fl);
+if (ret == -1) {
+return -errno;
+} else {
+return fl.l_type == F_UNLCK ? 0 : -EAGAIN;
+}
+#else
+return -ENOTSUP;
+#endif
+}
 #endif
 
 /*
-- 
2.9.3

[Qemu-block] [PATCH v10 00/16] block: Image locking series

v10: While we still don't have comprehensive propagation mechanism that will be
provided by new op blocker system for "permissive modes", the locking enabled
by default is regardlessly useful and long overdue. So I think we should merge
this for 2.9 and build user options on top later when the op blocker API
settles.

Address comments from Max and Eric:
  - Use F_OFD_GETLK instead of opening r/w for ro images. [Max]
  - Lock both bytes exclusively for non-shared write. [Max]
  - Spell fixes. [Eric]
  - Fix test matrix. [Max]
  - Comment tweaks. [Max]
  - Return code cleanups. [Max]
  - Don't abuse "disable_lock" for migration. [Max]
  - Use bs->exact_filename instead of bs->filename. [Max]
  - Force protect qcow2 concurrent write.
  - Fix indentation. [Max]
  - Always use re-open for lockfd instead of dup. [Max]
  - Fall through to "abort" where "prepare" failed. [Max]
  - Fix option handling in raw_reopen_handle_lock. [Max]
  - Use "error_abort" in commit and abort. [Max]
  - Fix cleanup of raw_reopen_handle_lock() failure. [Max]
  - Add a patch for qcow2 to mask BDRV_O_SHARE_RW if r/w.
  - Rebase and fix new more cases that will be broken by "lock by default".

Fam Zheng (16):
  osdep: Add qemu_lock_fd and qemu_unlock_fd
  block: Define BDRV_O_SHARE_RW
  qemu-io: Set "share-rw" flag together with read-only
  qemu-img: Set "share-rw" flag in read-only commands
  block: Set "share-rw" flag in drive-backup when sync=none
  iotests: 055: Don't attach the drive to vm for drive-backup
  iotests: 030: Read-only open image for getting map
  iotests: 087: Don't attach test image twice
  iotests: 085: Avoid image locking conflict
  iotests: 091: Quit QEMU before checking image
  iotests: 172: Use separate images for multiple devices
  tests: Use null-co:// instead of /dev/null as the dummy image
  tests: Disable image lock in test-replication
  file-posix: Implement image locking
  qcow2: Force "no other writer" lock on bs->file
  tests: Add test-image-lock

 block/file-posix.c | 667 -
 block/qcow2.c  |  11 +
 blockdev.c |   3 +
 include/block/block.h  |   2 +
 include/qemu/osdep.h   |   3 +
 qemu-img.c |  10 +-
 qemu-io.c  |   2 +
 tests/Makefile.include |   2 +
 tests/drive_del-test.c |   2 +-
 tests/nvme-test.c  |   2 +-
 tests/qemu-iotests/030 |   4 +-
 tests/qemu-iotests/055 |  32 ++-
 tests/qemu-iotests/085 |  32 ++-
 tests/qemu-iotests/085.out |   3 +-
 tests/qemu-iotests/087 |   6 +-
 tests/qemu-iotests/091 |   2 +
 tests/qemu-iotests/172 |  53 ++--
 tests/qemu-iotests/172.out |  50 ++--
 tests/test-image-lock.c| 200 ++
 tests/test-replication.c   |   9 +-
 tests/usb-hcd-uhci-test.c  |   2 +-
 tests/usb-hcd-xhci-test.c  |   2 +-
 tests/virtio-blk-test.c|   2 +-
 tests/virtio-scsi-test.c   |   4 +-
 util/osdep.c   |  48 
 25 files changed, 1056 insertions(+), 97 deletions(-)
 create mode 100644 tests/test-image-lock.c

-- 
2.9.3

[Qemu-block] [PATCH] migration: re-active images while migration been canceled after inactive them

commit fe904ea8242cbae2d7e69c052c754b8f5f1ba1d6 fixed a case
which migration aborted QEMU because it didn't regain the control
of images while some errors happened.

Actually, there are another two cases can trigger the same error reports:
" bdrv_co_do_pwritev: Assertion `!(bs->open_flags & 0x0800)' failed",

Case 1, codes path:
migration_thread()
migration_completion()
bdrv_inactivate_all() > inactivate images
qemu_savevm_state_complete_precopy()
socket_writev_buffer() > error because destination fails
qemu_fflush() ---> set error on migration stream
-> qmp_migrate_cancel() -> user cancelled migration 
concurrently
-> migrate_set_state() --> set migrate CANCELLIN
migration_completion() -> go on to fail_invalidate
if (s->state == MIGRATION_STATUS_ACTIVE) -> Jump this branch

Case 2, codes path:
migration_thread()
migration_completion()
bdrv_inactivate_all() > inactivate images
migreation_completion() finished
-> qmp_migrate_cancel() -> user cancelled migration 
concurrently
qemu_mutex_lock_iothread();
qemu_bh_schedule (s->cleanup_bh);

As we can see from above, qmp_migrate_cancel can slip in whenever
migration_thread does not hold the global lock. If this happens after
bdrv_inactive_all() been called, the above error reports will appear.

To prevent this, we can call bdrv_invalidate_cache_all() in qmp_migrate_cancel()
directly if we find images become inactive.

Signed-off-by: zhanghailiang 
---
Hi,

I have sent another patch before to fix this problem, but didn't cover
all the scenes, and there are some discussions about this problem,
For more detail, please refer to
https://lists.gnu.org/archive/html/qemu-block/2016-12/msg3.html
---
 include/migration/migration.h |  3 +++
 migration/migration.c | 13 +
 2 files changed, 16 insertions(+)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index c309d23..2d5b724 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -177,6 +177,9 @@ struct MigrationState
 /* Flag set once the migration thread is running (and needs joining) */
 bool migration_thread_running;
 
+/* Flag set once the migration thread called bdrv_inactivate_all */
+bool block_inactive;
+
 /* Queue of outstanding page requests from the destination */
 QemuMutex src_page_req_mutex;
 QSIMPLEQ_HEAD(src_page_requests, MigrationSrcPageRequest) 
src_page_requests;
diff --git a/migration/migration.c b/migration/migration.c
index f498ab8..9defb3e 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1006,6 +1006,16 @@ static void migrate_fd_cancel(MigrationState *s)
 if (s->state == MIGRATION_STATUS_CANCELLING && f) {
 qemu_file_shutdown(f);
 }
+if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
+Error *local_err = NULL;
+
+bdrv_invalidate_cache_all(&local_err);
+if (local_err) {
+error_report_err(local_err);
+} else {
+s->block_inactive = false;
+}
+}
 }
 
 void add_migration_state_change_notifier(Notifier *notify)
@@ -1705,6 +1715,7 @@ static void migration_completion(MigrationState *s, int 
current_active_state,
 if (ret >= 0) {
 qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
 qemu_savevm_state_complete_precopy(s->to_dst_file, false);
+s->block_inactive = true;
 }
 }
 qemu_mutex_unlock_iothread();
@@ -1758,6 +1769,8 @@ fail_invalidate:
 bdrv_invalidate_cache_all(&local_err);
 if (local_err) {
 error_report_err(local_err);
+} else {
+s->block_inactive = false;
 }
 }
 
-- 
1.8.3.1

Re: [Qemu-block] [Qemu-devel] [PATCH 0/2] block: Fix iotests 059

On 01/19/2017 07:07 AM, Fam Zheng wrote:
> The 059.out went out of sync, bring it back in line.
> 
> Fam Zheng (2):
>   qapi: Tweak error message of bdrv_query_image_info
>   iotests: Fix reference output for 059
> 

Reviewed-by: Eric Blake 

>  block/qapi.c   | 4 ++--
>  tests/qemu-iotests/059.out | 2 +-
>  2 files changed, 3 insertions(+), 3 deletions(-)
> 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature

[Qemu-block] [PATCH v10 14/16] file-posix: Implement image locking

This implements open flag sensible image locking for local file
and host device protocol.

virtlockd in libvirt locks the first byte, so we start looking at the
file bytes from 1.

Quoting what was proposed by Kevin Wolf , there are
four locking modes by combining two bits (BDRV_O_RDWR and
BDRV_O_SHARE_RW), and implemented by taking two locks:

Lock bytes:

* byte 1: I can't allow other processes to write to the image
* byte 2: I am writing to the image

Lock modes:

* shared writer (BDRV_O_RDWR | BDRV_O_SHARE_RW): Take shared lock on
  byte 2. Test whether byte 1 is locked using an exclusive lock, and
  fail if so.

* exclusive writer (BDRV_O_RDWR only): Take shared lock on byte 2. Test
  whether byte 1 is locked using an exclusive lock, and fail if so. Then
  take shared lock on byte 1. I suppose this is racy, but we can
  probably tolerate that.

* reader that can tolerate writers (BDRV_O_SHARE_RW only): Don't do anything

* reader that can't tolerate writers (neither bit is set): Take shared
  lock on byte 1. Test whether byte 2 is locked, and fail if so.

The complication is in the transactional reopen.  To make the reopen
logic managable, and allow better reuse, the code is internally
organized with a table from old mode to the new one.

Signed-off-by: Fam Zheng 
---
 block/file-posix.c | 667 -
 1 file changed, 664 insertions(+), 3 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 28b47d9..fabce04 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -131,8 +131,39 @@ do { \
 
 #define MAX_BLOCKSIZE  4096
 
+/* Posix file locking bytes. Libvirt takes byte 0, so start from byte 1. */
+#define RAW_LOCK_BYTE_MIN 1
+#define RAW_LOCK_BYTE_NO_OTHER_WRITER 1
+#define RAW_LOCK_BYTE_WRITE   2
+
+/*
+ ** reader that can tolerate writers: Don't do anything
+ *
+ ** reader that can't tolerate writers: Take shared lock on byte 1. Test
+ *  byte 2 is unlocked.
+ *
+ ** shared writer: Take shared lock on byte 2. Test byte 1 is unlocked.
+ *
+ ** exclusive writer: Take exclusive locks on both bytes.
+ */
+
+typedef enum {
+/* Read only and accept other writers. */
+RAW_L_READ_SHARE_RW,
+/* Read only and try to forbid other writers. */
+RAW_L_READ,
+/* Read write and accept other writers. */
+RAW_L_WRITE_SHARE_RW,
+/* Read write and try to forbid other writers. */
+RAW_L_WRITE,
+} BDRVRawLockMode;
+
 typedef struct BDRVRawState {
 int fd;
+/* A dup of @fd to make manipulating lock easier, especially during reopen,
+ * where this will accept BDRVRawReopenState.lock_fd. */
+int lock_fd;
+bool disable_lock;
 int type;
 int open_flags;
 size_t buf_align;
@@ -146,11 +177,15 @@ typedef struct BDRVRawState {
 bool use_linux_aio:1;
 bool has_fallocate;
 bool needs_alignment;
+BDRVRawLockMode cur_lock_mode;
 } BDRVRawState;
 
 typedef struct BDRVRawReopenState {
 int fd;
+/* A dup of @fd used for acquiring lock. */
+int lock_fd;
 int open_flags;
+bool disable_lock;
 } BDRVRawReopenState;
 
 static int fd_open(BlockDriverState *bs);
@@ -368,6 +403,58 @@ static void raw_parse_flags(int bdrv_flags, int 
*open_flags)
 }
 }
 
+static int raw_lock_fd(int fd, BDRVRawLockMode mode, Error **errp)
+{
+int ret;
+assert(fd >= 0);
+switch (mode) {
+case RAW_L_READ_SHARE_RW:
+ret = qemu_unlock_fd(fd, RAW_LOCK_BYTE_MIN, 2);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to unlock fd");
+goto fail;
+}
+break;
+case RAW_L_READ:
+ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, false);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to lock share byte");
+goto fail;
+}
+ret = qemu_lock_fd_test(fd, RAW_LOCK_BYTE_WRITE, 1, true);
+if (ret) {
+error_setg_errno(errp, -ret, "Write byte lock is taken");
+goto fail;
+}
+break;
+case RAW_L_WRITE_SHARE_RW:
+ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_WRITE, 1, false);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to lock write byte");
+goto fail;
+}
+ret = qemu_lock_fd_test(fd, RAW_LOCK_BYTE_NO_OTHER_WRITER, 1, true);
+if (ret) {
+error_setg_errno(errp, -ret, "Share byte lock is taken");
+goto fail;
+}
+break;
+case RAW_L_WRITE:
+ret = qemu_lock_fd(fd, RAW_LOCK_BYTE_MIN, 2, true);
+if (ret) {
+error_setg_errno(errp, -ret, "Failed to lock image");
+goto fail;
+}
+break;
+default:
+abort();
+}
+return 0;
+fail:
+qemu_unlock_fd(fd, RAW_LOCK_BYTE_MIN, 2);
+return ret;
+}
+
 static void raw_parse_filename(const char *filename, QDict *options,
Error **errp)
 {
@@ -393,10 +480,88 @@ static QemuOpts

[Qemu-block] [PATCH v10 16/16] tests: Add test-image-lock

Signed-off-by: Fam Zheng 
---
 tests/Makefile.include   |   2 +
 tests/test-image-lock.c  | 200 +++
 tests/test-replication.c |   6 +-
 3 files changed, 205 insertions(+), 3 deletions(-)
 create mode 100644 tests/test-image-lock.c

diff --git a/tests/Makefile.include b/tests/Makefile.include
index 96f5970..7718a9b 100644
--- a/tests/Makefile.include
+++ b/tests/Makefile.include
@@ -55,6 +55,7 @@ check-unit-y += tests/test-hbitmap$(EXESUF)
 gcov-files-test-hbitmap-y = blockjob.c
 check-unit-y += tests/test-blockjob$(EXESUF)
 check-unit-y += tests/test-blockjob-txn$(EXESUF)
+check-unit-y += tests/test-image-lock$(EXESUF)
 check-unit-y += tests/test-x86-cpuid$(EXESUF)
 # all code tested by test-x86-cpuid is inside topology.h
 gcov-files-test-x86-cpuid-y =
@@ -516,6 +517,7 @@ tests/test-aio$(EXESUF): tests/test-aio.o 
$(test-block-obj-y)
 tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
 tests/test-blockjob$(EXESUF): tests/test-blockjob.o $(test-block-obj-y) 
$(test-util-obj-y)
 tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o 
$(test-block-obj-y) $(test-util-obj-y)
+tests/test-image-lock$(EXESUF): tests/test-image-lock.o $(test-block-obj-y) 
$(libqos-obj-y)
 tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y)
 tests/test-iov$(EXESUF): tests/test-iov.o $(test-util-obj-y)
 tests/test-hbitmap$(EXESUF): tests/test-hbitmap.o $(test-util-obj-y)
diff --git a/tests/test-image-lock.c b/tests/test-image-lock.c
new file mode 100644
index 000..df86ff0
--- /dev/null
+++ b/tests/test-image-lock.c
@@ -0,0 +1,200 @@
+/*
+ * Image lock tests
+ *
+ * Copyright 2016 Red Hat, Inc.
+ *
+ * Authors:
+ *  Fam Zheng 
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/error-report.h"
+#include "qapi/qmp/qbool.h"
+#include "sysemu/block-backend.h"
+
+#define DEBUG_IMAGE_LOCK_TEST 0
+#define DPRINTF(...) do { \
+if (DEBUG_IMAGE_LOCK_TEST) { \
+printf(__VA_ARGS__); \
+} \
+} while (0)
+
+#define TEST_IMAGE_SIZE 4096
+static char test_image[] = "/tmp/qtest.XX";
+static int test_image_fd;
+
+static BlockBackend *open_test_image(int flags, bool disable_lock)
+{
+QDict *opts = qdict_new();
+
+qdict_set_default_str(opts, "filename", test_image);
+qdict_set_default_str(opts, "driver", "file");
+if (disable_lock) {
+qdict_put(opts, "disable-lock", qbool_from_bool(true));
+}
+
+return blk_new_open(NULL, NULL, opts, flags | BDRV_O_ALLOW_RDWR, NULL);
+}
+
+#define RW true
+#define RO false
+#define SHARE true
+#define EXCLU false
+
+static struct CompatData {
+bool write_1;
+bool share_1;
+bool write_2;
+bool share_2;
+bool compatible;
+} compat_data[] = {
+/* Write 1, Share 1, Write 2, Share 2, Compatible. */
+{ RO,   SHARE,   RO,  SHARE,   true,  },
+{ RO,   SHARE,   RO,  EXCLU,   true,  },
+{ RO,   SHARE,   RW,  SHARE,   true,  },
+{ RO,   SHARE,   RW,  EXCLU,   true,  },
+
+{ RO,   EXCLU,   RO,  EXCLU,   true,  },
+{ RO,   EXCLU,   RW,  SHARE,   false, },
+{ RO,   EXCLU,   RW,  EXCLU,   false, },
+
+{ RW,   SHARE,   RW,  SHARE,   true, },
+{ RW,   SHARE,   RW,  EXCLU,   false, },
+
+{ RW,   EXCLU,   RW,  EXCLU,   false, },
+};
+
+/* Test one combination scenario.
+ *
+ * @flags1: The flags of the first blk.
+ * @flags2: The flags of the second blk.
+ * @disable1: The value for raw-posix disable-lock option of the first blk.
+ * @disable2: The value for raw-posix disable-lock option of the second blk.
+ * @from_reopen: Whether or not the first blk should get flags1 from a reopen.
+ * @initial: The source flags from which the blk1 is reopened, only
+ *effective if @from_reopen is true.
+ */
+static void do_test_compat_one(int flags1, int flags2,
+   bool disable1, bool disable2,
+   bool from_reopen, int initial_flags,
+   bool compatible)
+{
+BlockBackend *blk1, *blk2;
+
+DPRINTF("\n===\ndo test compat one\n");
+DPRINTF("flags %x %x\n", flags1, flags2);
+DPRINTF("disable %d %d\n", disable1, disable2);
+DPRINTF("from reopen %d, initial flags %d\n", from_reopen, initial_flags);
+DPRINTF("compatible %d\n", compatible);
+if (!from_reopen) {
+blk1 = open_test_image(flags1, disable1);
+} else {
+int ret;
+blk1 = open_test_image(initial_flags, disable1);
+BlockReopenQueue *rq = NULL;
+
+rq = bdrv_reopen_queue(rq, blk_bs(blk1), NULL, flags1);
+ret = bdrv_reopen_multiple(blk_get_aio_context(blk1), rq, 
&error_abort);
+g_assert_cmpint(ret, ==, 0);
+}
+g_assert_nonnull(blk1);
+g_assert

Re: [Qemu-block] [PATCH v6 0/2] allow blockdev-add for NFS

Am 18.01.2017 um 10:59 schrieb Kevin Wolf:

Am 17.01.2017 um 16:14 hat Peter Lieven geschrieben:

Am 31.10.2016 um 18:20 schrieb Kevin Wolf:

Am 31.10.2016 um 16:05 hat Ashijeet Acharya geschrieben:

Previously posted series patches:
v5: https://lists.gnu.org/archive/html/qemu-devel/2016-10/msg07580.html
v4: https://lists.gnu.org/archive/html/qemu-devel/2016-10/msg07449.html
v3: https://lists.gnu.org/archive/html/qemu-devel/2016-10/msg06903.html
v2: https://lists.gnu.org/archive/html/qemu-devel/2016-10/msg05844.html
v1: https://lists.gnu.org/archive/html/qemu-devel/2016-10/msg04487.html

This series adds blockdev-add support for NFS block driver.

Thanks, fixed as commented on patch 1 and applied.

Hi,

it seems this series breaks passing options via URI.

1) in nfs_parse_uri

parse_uint_full(qp->p[i].value, NULL, 0)

segfaults, as the routine wants to set *NULL = val.

Yes, you're right.

2) all parameters that have a different names in options and qdict
e.g. readahead-size vs. readahead cannot be passed via URI.

$ qemu-img convert -p
nfs://172.21.200.61/templates/VC_debian8-20170116.qcow2,linux\?readahead=131072
iscsi://172.21.200.56:3260/iqn.2001-05.com.equallogic:0-8a0906-69d384e0a-aa3004e55e15878d-XXX/0

qemu-img: Could not open
'nfs://172.21.200.61/vcore-dev-cdrom/templates/VC_debian8-20170116.qcow2,linux?readahead=131072':
Block protocol 'nfs' doesn't support the option 'readahead-size'

Please let me know if the below fix would be correct:

Shouldn't we support both for backwards compatiblity.?

Peter

[Qemu-block] [PATCH v10 04/16] qemu-img: Set "share-rw" flag in read-only commands

Checking the status of an image when it is being used by guest is
usually useful, and there is no risk of corrupting data, so don't let
the upcoming image locking feature limit this use case.

Signed-off-by: Fam Zheng 
---
 qemu-img.c | 10 --
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 5df66fe..6a091e0 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -705,6 +705,10 @@ static int img_check(int argc, char **argv)
 return 1;
 }
 
+if (!(flags & BDRV_O_RDWR)) {
+flags |= BDRV_O_SHARE_RW;
+}
+
 blk = img_open(image_opts, filename, fmt, flags, writethrough, quiet);
 if (!blk) {
 return 1;
@@ -1238,6 +1242,7 @@ static int img_compare(int argc, char **argv)
 goto out3;
 }
 
+flags |= BDRV_O_SHARE_RW;
 blk1 = img_open(image_opts, filename1, fmt1, flags, writethrough, quiet);
 if (!blk1) {
 ret = 2;
@@ -2286,7 +2291,8 @@ static ImageInfoList *collect_image_info_list(bool 
image_opts,
 g_hash_table_insert(filenames, (gpointer)filename, NULL);
 
 blk = img_open(image_opts, filename, fmt,
-   BDRV_O_NO_BACKING | BDRV_O_NO_IO, false, false);
+   BDRV_O_NO_BACKING | BDRV_O_NO_IO | BDRV_O_SHARE_RW,
+   false, false);
 if (!blk) {
 goto err;
 }
@@ -2612,7 +2618,7 @@ static int img_map(int argc, char **argv)
 return 1;
 }
 
-blk = img_open(image_opts, filename, fmt, 0, false, false);
+blk = img_open(image_opts, filename, fmt, BDRV_O_SHARE_RW, false, false);
 if (!blk) {
 return 1;
 }
-- 
2.9.3

[Qemu-block] [PATCH v10 08/16] iotests: 087: Don't attach test image twice

The test scenario doesn't require the same image, instead it focuses on
the duplicated node-name, so use null-co to avoid locking conflict.

Signed-off-by: Fam Zheng 
---
 tests/qemu-iotests/087 | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/qemu-iotests/087 b/tests/qemu-iotests/087
index 9de57dd..6d52f7d 100755
--- a/tests/qemu-iotests/087
+++ b/tests/qemu-iotests/087
@@ -82,8 +82,7 @@ run_qemu -drive 
driver=$IMGFMT,id=disk,node-name=test-node,file="$TEST_IMG" <

[Qemu-block] [PATCH v10 03/16] qemu-io: Set "share-rw" flag together with read-only

qemu-io is a low level tool to read or modify guest visible data, which
implies the user knows very well what is being done. Allowing reading
from a locked image is harmless in most cases, so do it.

Signed-off-by: Fam Zheng 
---
 qemu-io.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/qemu-io.c b/qemu-io.c
index 23a229f..f000504 100644
--- a/qemu-io.c
+++ b/qemu-io.c
@@ -585,6 +585,8 @@ int main(int argc, char **argv)
 /* open the device */
 if (!readonly) {
 flags |= BDRV_O_RDWR;
+} else {
+flags |= BDRV_O_SHARE_RW;
 }
 
 if ((argc - optind) == 1) {
-- 
2.9.3

Re: [Qemu-block] Performance impact of the qcow2 overlap checks

2017-01-19 Thread Alberto Garcia

On Wed 18 Jan 2017 04:30:49 PM CET, Max Reitz wrote:

>> In other words: the vast majority of the entries in the refcount
>> table are probably not going to be used ever, but we're still
>> checking them for each write request. One user reported a >200%
>> performance increase on a fast SSD when using overlap-check=constant.
>> 
>> I think this is at least worth documenting a bit better (unless
>> there's existing documentation that I have missed), but my main
>> question is: does it make sense to try to optimize these checks, or
>> is it better to simply tell the user to disable them in these
>> scenarios?
>
> I have optimized these checks. See:
>
> http://lists.nongnu.org/archive/html/qemu-block/2015-08/msg00020.html
>
> Feel free to review. If you want, I can rebase the series.
>
> So far nobody has seriously done so because it wasn't considered
> important enough, and the patches are not exactly trivial.

I see. I wonder if it's worth complicating the code so much more instead
of simply disabling the tests.

For the 'refcount-block' check, which is the one that has an immediate
impact in all kinds of images, I was wondering if we could simply use
the image size to determine how many entries need to be checked. This
would keep things much faster unless the image grows abnormally bigger.

Berto

[Qemu-block] [PATCH 0/2] block: Fix iotests 059

The 059.out went out of sync, bring it back in line.

Fam Zheng (2):
  qapi: Tweak error message of bdrv_query_image_info
  iotests: Fix reference output for 059

 block/qapi.c   | 4 ++--
 tests/qemu-iotests/059.out | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

-- 
2.9.3

[Qemu-block] [PATCH 2/2] iotests: Fix reference output for 059

It was broken by efaa7c4eeb7 when it dropped the device name "image"
from BB API.  Now this error message text is updated again, sync it up.

Signed-off-by: Fam Zheng 
---
 tests/qemu-iotests/059.out | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/qemu-iotests/059.out b/tests/qemu-iotests/059.out
index 678adb4..19bd50d 100644
--- a/tests/qemu-iotests/059.out
+++ b/tests/qemu-iotests/059.out
@@ -2361,5 +2361,5 @@ Offset  Length  Mapped to   File
 0x14000 0x1 0x5 
TEST_DIR/iotest-version3-s003.vmdk
 
 === Testing afl image with a very large capacity ===
-qemu-img: Can't get size of device 'image': File too large
+qemu-img: Can't get image size 'TEST_DIR/afl9.IMGFMT': File too large
 *** done
-- 
2.9.3

[Qemu-block] [PATCH 1/2] qapi: Tweak error message of bdrv_query_image_info

@bs doesn't always have a device name, such as when it comes from
"qemu-img info". Report file name instead.

Signed-off-by: Fam Zheng 
---
 block/qapi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/block/qapi.c b/block/qapi.c
index a62e862..6329735 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -237,8 +237,8 @@ void bdrv_query_image_info(BlockDriverState *bs,
 
 size = bdrv_getlength(bs);
 if (size < 0) {
-error_setg_errno(errp, -size, "Can't get size of device '%s'",
- bdrv_get_device_name(bs));
+error_setg_errno(errp, -size, "Can't get image size '%s'",
+ bs->exact_filename);
 goto out;
 }
 
-- 
2.9.3

Re: [Qemu-block] [PATCH v1 11/15] qcow2: convert QCow2 to use QCryptoBlock for encryption