[Qemu-block] [PATCH v4 6/6] nbd/replication: implement .bdrv_get_info() for nbd and replication driver

2017-04-12 Thread zhanghailiang
Without this callback, there will be an error report on the primary side:
"qemu-system-x86_64: Couldn't determine the cluster size of the target image,
which has no backing file: Operation not supported
Aborting, since this may create an unusable destination image"

The nbd driver doesn't have a cluster size, so here we return
a fake value for it.
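
For context, the error quoted above comes from the cluster-size probe that runs
when the internal backup job is created on the primary side, with the
replication/NBD node as the backup target. Below is only a simplified sketch of
that probe (not the exact upstream code; the helper name and the local macro are
made up here), showing why a target that provides neither .bdrv_get_info nor a
backing file makes job creation fail:

#include "qemu/osdep.h"
#include "block/block_int.h"
#include "qapi/error.h"

/* Default used in this sketch when the target reports no cluster size */
#define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)

/*
 * Simplified sketch, not the exact upstream code: if the target provides
 * neither .bdrv_get_info nor a backing file, its cluster size cannot be
 * determined and backup job creation fails with the error quoted above.
 */
static int64_t probe_target_cluster_size(BlockDriverState *target, Error **errp)
{
    BlockDriverInfo bdi;
    int ret = bdrv_get_info(target, &bdi);

    if (ret < 0 && !target->backing) {
        error_setg_errno(errp, -ret,
                         "Couldn't determine the cluster size of the target "
                         "image, which has no backing file");
        return ret;
    } else if (ret < 0 || bdi.cluster_size == 0) {
        /* Unknown size but a backing file exists: fall back to the default */
        return BACKUP_CLUSTER_SIZE_DEFAULT;
    }
    return MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
}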

This patch should be dropped once Eric's nbd patch is merged:
https://lists.gnu.org/archive/html/qemu-block/2017-02/msg00825.html
'[PATCH v4 7/8] nbd: Implement NBD_INFO_BLOCK_SIZE on server'.

Cc: Eric Blake <ebl...@redhat.com>
Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
Signed-off-by: Wen Congyang <we...@cn.fujitsu.com>
---
 block/nbd.c         | 12 ++++++++++++
 block/replication.c |  6 ++++++
 2 files changed, 18 insertions(+)

diff --git a/block/nbd.c b/block/nbd.c
index 814ab26d..fceb14b 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -43,6 +43,8 @@
 
 #define EN_OPTSTR ":exportname="
 
+#define NBD_FAKE_CLUSTER_SIZE 512
+
 typedef struct BDRVNBDState {
 NBDClientSession client;
 
@@ -561,6 +563,13 @@ static void nbd_refresh_filename(BlockDriverState *bs, 
QDict *options)
 bs->full_open_options = opts;
 }
 
+static int nbd_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+bdi->cluster_size  = NBD_FAKE_CLUSTER_SIZE;
+
+return 0;
+}
+
 static BlockDriver bdrv_nbd = {
 .format_name= "nbd",
 .protocol_name  = "nbd",
@@ -578,6 +587,7 @@ static BlockDriver bdrv_nbd = {
 .bdrv_detach_aio_context= nbd_detach_aio_context,
 .bdrv_attach_aio_context= nbd_attach_aio_context,
 .bdrv_refresh_filename  = nbd_refresh_filename,
+.bdrv_get_info  = nbd_get_info,
 };
 
 static BlockDriver bdrv_nbd_tcp = {
@@ -597,6 +607,7 @@ static BlockDriver bdrv_nbd_tcp = {
 .bdrv_detach_aio_context= nbd_detach_aio_context,
 .bdrv_attach_aio_context= nbd_attach_aio_context,
 .bdrv_refresh_filename  = nbd_refresh_filename,
+.bdrv_get_info  = nbd_get_info,
 };
 
 static BlockDriver bdrv_nbd_unix = {
@@ -616,6 +627,7 @@ static BlockDriver bdrv_nbd_unix = {
 .bdrv_detach_aio_context= nbd_detach_aio_context,
 .bdrv_attach_aio_context= nbd_attach_aio_context,
 .bdrv_refresh_filename  = nbd_refresh_filename,
+.bdrv_get_info  = nbd_get_info,
 };
 
 static void bdrv_nbd_init(void)
diff --git a/block/replication.c b/block/replication.c
index fb604e5..7371caa 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -761,6 +761,11 @@ static void replication_stop(ReplicationState *rs, bool 
failover, Error **errp)
 aio_context_release(aio_context);
 }
 
+static int replication_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+return bdrv_get_info(bs->file->bs, bdi);
+}
+
 BlockDriver bdrv_replication = {
 .format_name= "replication",
 .protocol_name  = "replication",
@@ -774,6 +779,7 @@ BlockDriver bdrv_replication = {
 .bdrv_co_readv  = replication_co_readv,
 .bdrv_co_writev = replication_co_writev,
 
+.bdrv_get_info  = replication_get_info,
 .is_filter  = true,
 .bdrv_recurse_is_first_non_filter = 
replication_recurse_is_first_non_filter,
 
-- 
1.8.3.1





[Qemu-block] [PATCH v4 5/6] replication: Implement block replication for shared disk case

2017-04-12 Thread zhanghailiang
Just as in the scenario of non-shared disk block replication,
we are going to implement block replication from many basic
blocks that are already in QEMU.
The architecture is:

         virtio-blk           ||                          .------------
             /                ||                          | Secondary
            /                 ||                          '------------
           /                  ||                            virtio-blk
          /                   ||                                |
          |                   ||                          replication(5)
          |    NBD  ------->  NBD   (2)                         |
          |  client           ||  server ---> hidden disk <-- active disk(4)
          |     ^             ||                   |
          |  replication(1)   ||                   |
          |     |             ||                   |
          |   +-'             ||                   |
 (3)      |   | drive-backup sync=none             |
.---------+   |               ||                   |
Primary   |   |               ||      backing      |
'---------    |               ||                   |
          V   |                                    |
      +-------+-------+                            |
      |  shared disk  | <--------------------------+
      +---------------+

1) Primary writes will read original data and forward it to Secondary
   QEMU.
2) The hidden-disk is created automatically. It buffers the original content
   that is modified by the primary VM. It should also be an empty disk, and
   its driver should support bdrv_make_empty() and backing files.
3) Primary write requests will be written to Shared disk.
4) Secondary write requests will be buffered in the active disk and it
   will overwrite the existing sector content in the buffer.

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
Signed-off-by: Wen Congyang <we...@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangchen.f...@cn.fujitsu.com>
---
v4:
 - Call bdrv_invalidate_cache() when doing a checkpoint for the shared disk
---
 block/replication.c | 58 +++--
 1 file changed, 52 insertions(+), 6 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index 3a35471..fb604e5 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -253,7 +253,7 @@ static coroutine_fn int 
replication_co_readv(BlockDriverState *bs,
  QEMUIOVector *qiov)
 {
 BDRVReplicationState *s = bs->opaque;
-BdrvChild *child = s->secondary_disk;
+BdrvChild *child = s->is_shared_disk ? s->primary_disk : s->secondary_disk;
 BlockJob *job = NULL;
 CowRequest req;
 int ret;
@@ -435,7 +435,12 @@ static void backup_job_completed(void *opaque, int ret)
 s->error = -EIO;
 }
 
-backup_job_cleanup(bs);
+if (s->mode == REPLICATION_MODE_PRIMARY) {
+s->replication_state = BLOCK_REPLICATION_DONE;
+s->error = 0;
+} else {
+backup_job_cleanup(bs);
+}
 }
 
 static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
@@ -487,6 +492,19 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 
 switch (s->mode) {
 case REPLICATION_MODE_PRIMARY:
+if (s->is_shared_disk) {
+job = backup_job_create(NULL, s->primary_disk->bs, bs, 0,
+MIRROR_SYNC_MODE_NONE, NULL, false, BLOCKDEV_ON_ERROR_REPORT,
+BLOCKDEV_ON_ERROR_REPORT, BLOCK_JOB_INTERNAL,
+backup_job_completed, bs, NULL, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+backup_job_cleanup(bs);
+aio_context_release(aio_context);
+return;
+}
+block_job_start(job);
+}
 break;
 case REPLICATION_MODE_SECONDARY:
 s->active_disk = bs->file;
@@ -505,7 +523,8 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 }
 
 s->secondary_disk = s->hidden_disk->bs->backing;
-if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
+if (!s->secondary_disk->bs ||
+(!s->is_shared_disk && !bdrv_has_blk(s->secondary_disk->bs))) {
 error_setg(errp, "The secondary disk doesn't have block backend");
 aio_context_release(aio_context);
 return;
@@ -600,11 +619,24 @@ static void replication_do_checkpoint(ReplicationState 
*r

[Qemu-block] [PATCH v4 4/6] replication: fix code logic with the new shared_disk option

2017-04-12 Thread zhanghailiang
Some code logic is only needed in the non-shared disk case; here
we adjust this code to prepare for the shared disk scenario.

Reviewed-by: Stefan Hajnoczi <stefa...@redhat.com>
Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
---
 block/replication.c | 73 ++---
 1 file changed, 41 insertions(+), 32 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index b021215..3a35471 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -539,33 +539,40 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 return;
 }
 
-/* start backup job now */
-error_setg(&s->blocker,
-   "Block device is in use by internal backup job");
-
-top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
-if (!top_bs || !bdrv_is_root_node(top_bs) ||
-!check_top_bs(top_bs, bs)) {
-error_setg(errp, "No top_bs or it is invalid");
-reopen_backing_file(bs, false, NULL);
-aio_context_release(aio_context);
-return;
-}
-bdrv_op_block_all(top_bs, s->blocker);
-bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
-
-job = backup_job_create(NULL, s->secondary_disk->bs, 
s->hidden_disk->bs,
-0, MIRROR_SYNC_MODE_NONE, NULL, false,
+/*
+ * Only in the case of non-shared disk,
+ * the backup job is in the secondary side
+ */
+if (!s->is_shared_disk) {
+/* start backup job now */
+error_setg(&s->blocker,
+"Block device is in use by internal backup job");
+
+top_bs = bdrv_lookup_bs(s->top_id, s->top_id, NULL);
+if (!top_bs || !bdrv_is_root_node(top_bs) ||
+!check_top_bs(top_bs, bs)) {
+error_setg(errp, "No top_bs or it is invalid");
+reopen_backing_file(bs, false, NULL);
+aio_context_release(aio_context);
+return;
+}
+
+bdrv_op_block_all(top_bs, s->blocker);
+bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
+job = backup_job_create(NULL, s->secondary_disk->bs,
+s->hidden_disk->bs, 0,
+MIRROR_SYNC_MODE_NONE, NULL, false,
 BLOCKDEV_ON_ERROR_REPORT,
 BLOCKDEV_ON_ERROR_REPORT, BLOCK_JOB_INTERNAL,
backup_job_completed, bs, NULL, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-backup_job_cleanup(bs);
-aio_context_release(aio_context);
-return;
+if (local_err) {
+error_propagate(errp, local_err);
+backup_job_cleanup(bs);
+aio_context_release(aio_context);
+return;
+}
+block_job_start(job);
 }
-block_job_start(job);
 
 secondary_do_checkpoint(s, errp);
 break;
@@ -595,14 +602,16 @@ static void replication_do_checkpoint(ReplicationState 
*rs, Error **errp)
 case REPLICATION_MODE_PRIMARY:
 break;
 case REPLICATION_MODE_SECONDARY:
-if (!s->secondary_disk->bs->job) {
-error_setg(errp, "Backup job was cancelled unexpectedly");
-break;
-}
-backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-break;
+if (!s->is_shared_disk) {
+if (!s->secondary_disk->bs->job) {
+error_setg(errp, "Backup job was cancelled unexpectedly");
+break;
+}
+backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+break;
+}
 }
 secondary_do_checkpoint(s, errp);
 break;
@@ -683,7 +692,7 @@ static void replication_stop(ReplicationState *rs, bool 
failover, Error **errp)
  * before the BDS is closed, because we will access hidden
  * disk, secondary disk in backup_job_completed().
  */
-if (s->secondary_disk->bs->job) {
+if (!s->is_shared_disk && s->secondary_disk->bs->job) {
 block_job_cancel_sync(s->secondary_disk->bs->job);
 }
 
-- 
1.8.3.1





[Qemu-block] [PATCH v4 0/6] COLO block replication supports shared disk case

2017-04-12 Thread zhanghailiang
COLO block replication doesn't support the shared disk case.
Here we try to implement it; this is the 4th version.

Please review; any comments are welcome.

Cc: Dr. David Alan Gilbert (git) <dgilb...@redhat.com>
Cc: eddie.d...@intel.com

v4:
- Add proper comment for primary_disk in patch 2 (Stefan)
- Call bdrv_invalidate_cache() when doing a checkpoint for the shared disk in patch 5

v3:
- Fix some comments from Stefan and Eric

v2:
- Drop the patch which add a blk_root() helper
- Fix some comments from Changlong

zhanghailiang (6):
  docs/block-replication: Add description for shared-disk case
  replication: add shared-disk and shared-disk-id options
  replication: Split out backup_do_checkpoint() from
secondary_do_checkpoint()
  replication: fix code logic with the new shared_disk option
  replication: Implement block replication for shared disk case
  nbd/replication: implement .bdrv_get_info() for nbd and replication
driver

 block/nbd.c|  12 +++
 block/replication.c| 198 ++---
 docs/block-replication.txt | 139 ++-
 qapi/block-core.json   |  10 ++-
 4 files changed, 306 insertions(+), 53 deletions(-)

-- 
1.8.3.1





[Qemu-block] [PATCH v4 2/6] replication: add shared-disk and shared-disk-id options

2017-04-12 Thread zhanghailiang
We use these two options to identify which disk is
shared.
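
For reference, on the primary side these options are given when adding the
replication node; the example below is adapted from the test procedure in the
RFC v2 cover letter of this series (host, port, export and device names are
example values only):

drive_add -n buddy driver=replication,mode=primary,file.driver=nbd,file.host=9.42.3.17,file.port=9998,file.export=hidden_disk0,shared-disk-id=primary_disk0,shared-disk=on,node-name=rep

On the secondary side only shared-disk=on is set; shared-disk-id is required
(and checked in replication_open()) only in primary mode.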

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
Signed-off-by: Wen Congyang <we...@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangchen.f...@cn.fujitsu.com>
---
v4:
- Add proper comment for primary_disk (Stefan)
v2:
- Move g_free(s->shared_disk_id) to the common fail process place (Stefan)
- Fix comments for these two options
---
 block/replication.c  | 43 +--
 qapi/block-core.json | 10 +-
 2 files changed, 50 insertions(+), 3 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index bf3c395..418b81b 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -25,9 +25,12 @@
 typedef struct BDRVReplicationState {
 ReplicationMode mode;
 int replication_state;
+bool is_shared_disk;
+char *shared_disk_id;
 BdrvChild *active_disk;
 BdrvChild *hidden_disk;
 BdrvChild *secondary_disk;
+BdrvChild *primary_disk;
 char *top_id;
 ReplicationState *rs;
 Error *blocker;
@@ -53,6 +56,9 @@ static void replication_stop(ReplicationState *rs, bool 
failover,
 
 #define REPLICATION_MODE"mode"
 #define REPLICATION_TOP_ID  "top-id"
+#define REPLICATION_SHARED_DISK "shared-disk"
+#define REPLICATION_SHARED_DISK_ID "shared-disk-id"
+
 static QemuOptsList replication_runtime_opts = {
 .name = "replication",
 .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
@@ -65,6 +71,14 @@ static QemuOptsList replication_runtime_opts = {
 .name = REPLICATION_TOP_ID,
 .type = QEMU_OPT_STRING,
 },
+{
+.name = REPLICATION_SHARED_DISK_ID,
+.type = QEMU_OPT_STRING,
+},
+{
+.name = REPLICATION_SHARED_DISK,
+.type = QEMU_OPT_BOOL,
+},
 { /* end of list */ }
 },
 };
@@ -85,6 +99,9 @@ static int replication_open(BlockDriverState *bs, QDict 
*options,
 QemuOpts *opts = NULL;
 const char *mode;
 const char *top_id;
+const char *shared_disk_id;
+BlockBackend *blk;
+BlockDriverState *tmp_bs;
 
 bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file,
false, errp);
@@ -125,12 +142,33 @@ static int replication_open(BlockDriverState *bs, QDict 
*options,
"The option mode's value should be primary or secondary");
 goto fail;
 }
+s->is_shared_disk = qemu_opt_get_bool(opts, REPLICATION_SHARED_DISK,
+  false);
+if (s->is_shared_disk && (s->mode == REPLICATION_MODE_PRIMARY)) {
+shared_disk_id = qemu_opt_get(opts, REPLICATION_SHARED_DISK_ID);
+if (!shared_disk_id) {
+error_setg(&local_err, "Missing shared disk blk option");
+goto fail;
+}
+s->shared_disk_id = g_strdup(shared_disk_id);
+blk = blk_by_name(s->shared_disk_id);
+if (!blk) {
+error_setg(&local_err, "There is no %s block", s->shared_disk_id);
+goto fail;
+}
+/* We have a BlockBackend for the primary disk but use BdrvChild for
+ * consistency - active_disk, secondary_disk, etc are also BdrvChild.
+ */
+tmp_bs = blk_bs(blk);
+s->primary_disk = QLIST_FIRST(&tmp_bs->parents);
+}
 
 s->rs = replication_new(bs, &replication_ops);
 
-ret = 0;
-
+qemu_opts_del(opts);
+return 0;
 fail:
+g_free(s->shared_disk_id);
 qemu_opts_del(opts);
 error_propagate(errp, local_err);
 
@@ -141,6 +179,7 @@ static void replication_close(BlockDriverState *bs)
 {
 BDRVReplicationState *s = bs->opaque;
 
+g_free(s->shared_disk_id);
 if (s->replication_state == BLOCK_REPLICATION_RUNNING) {
 replication_stop(s->rs, false, NULL);
 }
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 033457c..361c932 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2661,12 +2661,20 @@
 #  node who owns the replication node chain. Must not be given in
 #  primary mode.
 #
+# @shared-disk-id: Id of shared disk while is replication mode, if @shared-disk
+#  is true, this option is required (Since: 2.10)
+#
+# @shared-disk: To indicate whether or not a disk is shared by primary VM
+#   and secondary VM. (The default is false) (Since: 2.10)
+#
 # Since: 2.9
 ##
 { 'struct': 'BlockdevOptionsReplication',
   'base': 'BlockdevOptionsGenericFormat',
   'data': { 'mode': 'ReplicationMode',
-'*top-id': 'str' } }
+'*top-id': 'str',
+'*shared-disk-id': 'str',
+'*shared-disk': 'bool' } }
 
 ##
 # @NFSTransport:
-- 
1.8.3.1





[Qemu-block] [PATCH v4 3/6] replication: Split out backup_do_checkpoint() from secondary_do_checkpoint()

2017-04-12 Thread zhanghailiang
The helper backup_do_checkpoint() will be used by primary-related
code. Here we split it out of secondary_do_checkpoint().

Besides, it is unnecessary to call backup_do_checkpoint() when
starting replication or on the normal stop-replication path.
We only need to call it when doing a real checkpoint.

Reviewed-by: Stefan Hajnoczi <stefa...@redhat.com>
Reviewed-by: Changlong Xie <xiecl.f...@cn.fujitsu.com>
Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
---
 block/replication.c | 36 +++-
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index 418b81b..b021215 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -352,20 +352,8 @@ static bool 
replication_recurse_is_first_non_filter(BlockDriverState *bs,
 
 static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
 {
-Error *local_err = NULL;
 int ret;
 
-if (!s->secondary_disk->bs->job) {
-error_setg(errp, "Backup job was cancelled unexpectedly");
-return;
-}
-
-backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-return;
-}
-
 ret = s->active_disk->bs->drv->bdrv_make_empty(s->active_disk->bs);
 if (ret < 0) {
 error_setg(errp, "Cannot make active disk empty");
@@ -578,6 +566,8 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 return;
 }
 block_job_start(job);
+
+secondary_do_checkpoint(s, errp);
 break;
 default:
 aio_context_release(aio_context);
@@ -586,10 +576,6 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 
 s->replication_state = BLOCK_REPLICATION_RUNNING;
 
-if (s->mode == REPLICATION_MODE_SECONDARY) {
-secondary_do_checkpoint(s, errp);
-}
-
 s->error = 0;
 aio_context_release(aio_context);
 }
@@ -599,13 +585,29 @@ static void replication_do_checkpoint(ReplicationState 
*rs, Error **errp)
 BlockDriverState *bs = rs->opaque;
 BDRVReplicationState *s;
 AioContext *aio_context;
+Error *local_err = NULL;
 
 aio_context = bdrv_get_aio_context(bs);
 aio_context_acquire(aio_context);
 s = bs->opaque;
 
-if (s->mode == REPLICATION_MODE_SECONDARY) {
+switch (s->mode) {
+case REPLICATION_MODE_PRIMARY:
+break;
+case REPLICATION_MODE_SECONDARY:
+if (!s->secondary_disk->bs->job) {
+error_setg(errp, "Backup job was cancelled unexpectedly");
+break;
+}
+backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+break;
+}
 secondary_do_checkpoint(s, errp);
+break;
+default:
+abort();
 }
 aio_context_release(aio_context);
 }
-- 
1.8.3.1





[Qemu-block] [PATCH v4 1/6] docs/block-replication: Add description for shared-disk case

2017-04-12 Thread zhanghailiang
Introduce the scenario of shared-disk block replication
and how to use it.

Reviewed-by: Changlong Xie <xiecl.f...@cn.fujitsu.com>
Reviewed-by: Stefan Hajnoczi <stefa...@redhat.com>
Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
Signed-off-by: Wen Congyang <we...@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangchen.f...@cn.fujitsu.com>
---
 docs/block-replication.txt | 139 +++--
 1 file changed, 135 insertions(+), 4 deletions(-)

diff --git a/docs/block-replication.txt b/docs/block-replication.txt
index 6bde673..fbfe005 100644
--- a/docs/block-replication.txt
+++ b/docs/block-replication.txt
@@ -24,7 +24,7 @@ only dropped at next checkpoint time. To reduce the network 
transportation
 effort during a vmstate checkpoint, the disk modification operations of
 the Primary disk are asynchronously forwarded to the Secondary node.
 
-== Workflow ==
+== Non-shared disk workflow ==
 The following is the image of block replication workflow:
 
 +--+++
@@ -57,7 +57,7 @@ The following is the image of block replication workflow:
 4) Secondary write requests will be buffered in the Disk buffer and it
will overwrite the existing sector content in the buffer.
 
-== Architecture ==
+== Non-shared disk architecture ==
 We are going to implement block replication from many basic
 blocks that are already in QEMU.
 
@@ -106,6 +106,74 @@ any state that would otherwise be lost by the speculative 
write-through
 of the NBD server into the secondary disk. So before block replication,
 the primary disk and secondary disk should contain the same data.
 
+== Shared Disk Mode Workflow ==
+The following is the image of block replication workflow:
+
++--+++
+|Primary Write Requests||Secondary Write Requests|
++--+++
+  |   |
+  |  (4)
+  |   V
+  |  /-\
+  | (2)Forward and write through | |
+  | +--> | Disk Buffer |
+  | || |
+  | |\-/
+  | |(1)read   |
+  | |  |
+   (3)write   | |  | backing file
+  V |  |
+ +-+   |
+ | Shared Disk | <-+
+ +-+
+
+1) Primary writes will read original data and forward it to Secondary
+   QEMU.
+2) Before Primary write requests are written to Shared disk, the
+   original sector content will be read from Shared disk and
+   forwarded and buffered in the Disk buffer on the secondary site,
+   but it will not overwrite the existing sector content (it could be
+   from either "Secondary Write Requests" or previous COW of "Primary
+   Write Requests") in the Disk buffer.
+3) Primary write requests will be written to Shared disk.
+4) Secondary write requests will be buffered in the Disk buffer and it
+   will overwrite the existing sector content in the buffer.
+
+== Shared Disk Mode Architecture ==
+We are going to implement block replication from many basic
+blocks that are already in QEMU.
+ virtio-blk ||   
.--
+ /  ||   | 
Secondary
+/   ||   
'--
+   /|| 
virtio-blk
+  / || 
 |
+  | ||   
replication(5)
+  |NBD  >   NBD   (2)  
 |
+  |  client ||server ---> hidden disk <-- 
active disk(4)
+  | ^   ||  |
+  |  replication(1) ||  |
+  | |   ||  |
+  |   +-'   ||  |
+ (3)  |drive-backup sync=none   ||  |
+. |   +-+   ||

[Qemu-block] [PATCH v2] migration: re-active images while migration been canceled after inactive them

2017-01-24 Thread zhanghailiang
Commit fe904ea8242cbae2d7e69c052c754b8f5f1ba1d6 fixed a case
in which migration aborted QEMU because it didn't regain control
of the images when some errors happened.

Actually, there are another two cases that can trigger the same error report:
" bdrv_co_do_pwritev: Assertion `!(bs->open_flags & 0x0800)' failed",

Case 1, code path:
migration_thread()
migration_completion()
bdrv_inactivate_all() ---> inactivate images
qemu_savevm_state_complete_precopy()
socket_writev_buffer() ---> error because destination fails
qemu_fflush() ---> set error on migration stream
-> qmp_migrate_cancel() ---> user cancelled migration concurrently
-> migrate_set_state() ---> set migrate state to CANCELLING
migration_completion() ---> go on to fail_invalidate
if (s->state == MIGRATION_STATUS_ACTIVE) ---> jump over this branch

Case 2, code path:
migration_thread()
migration_completion()
bdrv_inactivate_all() ---> inactivate images
migration_completion() finished
-> qmp_migrate_cancel() ---> user cancelled migration concurrently
qemu_mutex_lock_iothread();
qemu_bh_schedule(s->cleanup_bh);

As we can see from the above, qmp_migrate_cancel() can slip in whenever
migration_thread() does not hold the global lock. If this happens after
bdrv_inactivate_all() has been called, the above error report will appear.

To prevent this, we can call bdrv_invalidate_cache_all() in qmp_migrate_cancel()
directly if we find that the images have become inactive.

Besides, the bdrv_invalidate_cache_all() call in migration_completion() doesn't
have the protection of the big lock; fix it by adding the missing
qemu_mutex_lock_iothread().

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
---
v2:
 - Fix a bug introduced by commit fe904 which didn't take the big lock
   before calling bdrv_invalidate_cache_all(). (Suggested by Dave)
---
 include/migration/migration.h |  3 +++
 migration/migration.c | 15 +++
 2 files changed, 18 insertions(+)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index c309d23..2d5b724 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -177,6 +177,9 @@ struct MigrationState
 /* Flag set once the migration thread is running (and needs joining) */
 bool migration_thread_running;
 
+/* Flag set once the migration thread called bdrv_inactivate_all */
+bool block_inactive;
+
 /* Queue of outstanding page requests from the destination */
 QemuMutex src_page_req_mutex;
 QSIMPLEQ_HEAD(src_page_requests, MigrationSrcPageRequest) 
src_page_requests;
diff --git a/migration/migration.c b/migration/migration.c
index f498ab8..5b50afe 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1006,6 +1006,16 @@ static void migrate_fd_cancel(MigrationState *s)
 if (s->state == MIGRATION_STATUS_CANCELLING && f) {
 qemu_file_shutdown(f);
 }
+if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
+Error *local_err = NULL;
+
+bdrv_invalidate_cache_all(&local_err);
+if (local_err) {
+error_report_err(local_err);
+} else {
+s->block_inactive = false;
+}
+}
 }
 
 void add_migration_state_change_notifier(Notifier *notify)
@@ -1705,6 +1715,7 @@ static void migration_completion(MigrationState *s, int 
current_active_state,
 if (ret >= 0) {
 qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
 qemu_savevm_state_complete_precopy(s->to_dst_file, false);
+s->block_inactive = true;
 }
 }
 qemu_mutex_unlock_iothread();
@@ -1755,10 +1766,14 @@ fail_invalidate:
 if (s->state == MIGRATION_STATUS_ACTIVE) {
 Error *local_err = NULL;
 
+qemu_mutex_lock_iothread();
 bdrv_invalidate_cache_all(&local_err);
 if (local_err) {
 error_report_err(local_err);
+} else {
+s->block_inactive = false;
 }
+qemu_mutex_unlock_iothread();
 }
 
 fail:
-- 
1.8.3.1





[Qemu-block] [PATCH v3 3/6] replication: Split out backup_do_checkpoint() from secondary_do_checkpoint()

2017-01-19 Thread zhanghailiang
The helper backup_do_checkpoint() will be used by primary-related
code. Here we split it out of secondary_do_checkpoint().

Besides, it is unnecessary to call backup_do_checkpoint() when
starting replication or on the normal stop-replication path.
We only need to call it when doing a real checkpoint.

Reviewed-by: Stefan Hajnoczi <stefa...@redhat.com>
Reviewed-by: Changlong Xie <xiecl.f...@cn.fujitsu.com>
Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
---
 block/replication.c | 36 +++-
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index b96a3e5..3a44728 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -332,20 +332,8 @@ static bool 
replication_recurse_is_first_non_filter(BlockDriverState *bs,
 
 static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
 {
-Error *local_err = NULL;
 int ret;
 
-if (!s->secondary_disk->bs->job) {
-error_setg(errp, "Backup job was cancelled unexpectedly");
-return;
-}
-
-backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-return;
-}
-
 ret = s->active_disk->bs->drv->bdrv_make_empty(s->active_disk->bs);
 if (ret < 0) {
 error_setg(errp, "Cannot make active disk empty");
@@ -558,6 +546,8 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 return;
 }
 block_job_start(job);
+
+secondary_do_checkpoint(s, errp);
 break;
 default:
 aio_context_release(aio_context);
@@ -566,10 +556,6 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 
 s->replication_state = BLOCK_REPLICATION_RUNNING;
 
-if (s->mode == REPLICATION_MODE_SECONDARY) {
-secondary_do_checkpoint(s, errp);
-}
-
 s->error = 0;
 aio_context_release(aio_context);
 }
@@ -579,13 +565,29 @@ static void replication_do_checkpoint(ReplicationState 
*rs, Error **errp)
 BlockDriverState *bs = rs->opaque;
 BDRVReplicationState *s;
 AioContext *aio_context;
+Error *local_err = NULL;
 
 aio_context = bdrv_get_aio_context(bs);
 aio_context_acquire(aio_context);
 s = bs->opaque;
 
-if (s->mode == REPLICATION_MODE_SECONDARY) {
+switch (s->mode) {
+case REPLICATION_MODE_PRIMARY:
+break;
+case REPLICATION_MODE_SECONDARY:
+if (!s->secondary_disk->bs->job) {
+error_setg(errp, "Backup job was cancelled unexpectedly");
+break;
+}
+backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+break;
+}
 secondary_do_checkpoint(s, errp);
+break;
+default:
+abort();
 }
 aio_context_release(aio_context);
 }
-- 
1.8.3.1





[Qemu-block] [PATCH v3 4/6] replication: fix code logic with the new shared_disk option

2017-01-19 Thread zhanghailiang
Some code logic is only needed in the non-shared disk case; here
we adjust this code to prepare for the shared disk scenario.

Reviewed-by: Stefan Hajnoczi <stefa...@redhat.com>
Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
---
 block/replication.c | 47 ---
 1 file changed, 28 insertions(+), 19 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index 3a44728..70ec08c 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -531,21 +531,28 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 aio_context_release(aio_context);
 return;
 }
-bdrv_op_block_all(top_bs, s->blocker);
-bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
 
-job = backup_job_create(NULL, s->secondary_disk->bs, 
s->hidden_disk->bs,
-0, MIRROR_SYNC_MODE_NONE, NULL, false,
+/*
+ * Only in the case of non-shared disk,
+ * the backup job is in the secondary side
+ */
+if (!s->is_shared_disk) {
+bdrv_op_block_all(top_bs, s->blocker);
+bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
+job = backup_job_create(NULL, s->secondary_disk->bs,
+s->hidden_disk->bs, 0,
+MIRROR_SYNC_MODE_NONE, NULL, false,
 BLOCKDEV_ON_ERROR_REPORT,
 BLOCKDEV_ON_ERROR_REPORT, BLOCK_JOB_INTERNAL,
backup_job_completed, bs, NULL, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-backup_job_cleanup(bs);
-aio_context_release(aio_context);
-return;
+if (local_err) {
+error_propagate(errp, local_err);
+backup_job_cleanup(bs);
+aio_context_release(aio_context);
+return;
+}
+block_job_start(job);
 }
-block_job_start(job);
 
 secondary_do_checkpoint(s, errp);
 break;
@@ -575,14 +582,16 @@ static void replication_do_checkpoint(ReplicationState 
*rs, Error **errp)
 case REPLICATION_MODE_PRIMARY:
 break;
 case REPLICATION_MODE_SECONDARY:
-if (!s->secondary_disk->bs->job) {
-error_setg(errp, "Backup job was cancelled unexpectedly");
-break;
-}
-backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-break;
+if (!s->is_shared_disk) {
+if (!s->secondary_disk->bs->job) {
+error_setg(errp, "Backup job was cancelled unexpectedly");
+break;
+}
+backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+break;
+}
 }
 secondary_do_checkpoint(s, errp);
 break;
@@ -663,7 +672,7 @@ static void replication_stop(ReplicationState *rs, bool 
failover, Error **errp)
  * before the BDS is closed, because we will access hidden
  * disk, secondary disk in backup_job_completed().
  */
-if (s->secondary_disk->bs->job) {
+if (!s->is_shared_disk && s->secondary_disk->bs->job) {
 block_job_cancel_sync(s->secondary_disk->bs->job);
 }
 
-- 
1.8.3.1





[Qemu-block] [PATCH v3 0/6] COLO block replication supports shared disk case

2017-01-19 Thread zhanghailiang
COLO block replication doesn't support the shared disk case.
Here we try to implement it; this is the third version.

Last posted series patches:
https://lists.gnu.org/archive/html/qemu-block/2016-12/msg00039.html
You can refer to the above link if you want to test it.

I have uploaded the new version to github:
https://github.com/coloft/qemu/tree/colo-developing-with-shared-disk-2016-1-20

Please review; any comments are welcome.

Cc: Juan Quintela <quint...@redhat.com>
Cc: Amit Shah <amit.s...@redhat.com> 
Cc: Dr. David Alan Gilbert (git) <dgilb...@redhat.com>
Cc: eddie.d...@intel.com

v3:
- Fix some comments from Stefan and Eric

v2:
- Drop the patch which add a blk_root() helper
- Fix some comments from Changlong

zhanghailiang (6):
  docs/block-replication: Add description for shared-disk case
  replication: add shared-disk and shared-disk-id options
  replication: Split out backup_do_checkpoint() from
secondary_do_checkpoint()
  replication: fix code logic with the new shared_disk option
  replication: Implement block replication for shared disk case
  nbd/replication: implement .bdrv_get_info() for nbd and replication
driver

 block/nbd.c|  12 
 block/replication.c| 156 +++--
 docs/block-replication.txt | 139 ++--
 qapi/block-core.json   |  10 ++-
 4 files changed, 279 insertions(+), 38 deletions(-)

-- 
1.8.3.1





[Qemu-block] [PATCH v3 2/6] replication: add shared-disk and shared-disk-id options

2017-01-19 Thread zhanghailiang
We use these two options to identify which disk is
shared.

Cc: Eric Blake <ebl...@redhat.com>
Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
Signed-off-by: Wen Congyang <we...@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangchen.f...@cn.fujitsu.com>
---
v3:
- Move g_free(s->shared_disk_id) to the common fail process place (Stefan)
- Fix comments for these two options
---
 block/replication.c  | 37 +
 qapi/block-core.json | 10 +-
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/block/replication.c b/block/replication.c
index 729dd12..b96a3e5 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -25,9 +25,12 @@
 typedef struct BDRVReplicationState {
 ReplicationMode mode;
 int replication_state;
+bool is_shared_disk;
+char *shared_disk_id;
 BdrvChild *active_disk;
 BdrvChild *hidden_disk;
 BdrvChild *secondary_disk;
+BdrvChild *primary_disk;
 char *top_id;
 ReplicationState *rs;
 Error *blocker;
@@ -53,6 +56,9 @@ static void replication_stop(ReplicationState *rs, bool 
failover,
 
 #define REPLICATION_MODE"mode"
 #define REPLICATION_TOP_ID  "top-id"
+#define REPLICATION_SHARED_DISK "shared-disk"
+#define REPLICATION_SHARED_DISK_ID "shared-disk-id"
+
 static QemuOptsList replication_runtime_opts = {
 .name = "replication",
 .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
@@ -65,6 +71,14 @@ static QemuOptsList replication_runtime_opts = {
 .name = REPLICATION_TOP_ID,
 .type = QEMU_OPT_STRING,
 },
+{
+.name = REPLICATION_SHARED_DISK_ID,
+.type = QEMU_OPT_STRING,
+},
+{
+.name = REPLICATION_SHARED_DISK,
+.type = QEMU_OPT_BOOL,
+},
 { /* end of list */ }
 },
 };
@@ -85,6 +99,9 @@ static int replication_open(BlockDriverState *bs, QDict 
*options,
 QemuOpts *opts = NULL;
 const char *mode;
 const char *top_id;
+const char *shared_disk_id;
+BlockBackend *blk;
+BlockDriverState *tmp_bs;
 
 ret = -EINVAL;
 opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
@@ -119,12 +136,31 @@ static int replication_open(BlockDriverState *bs, QDict 
*options,
"The option mode's value should be primary or secondary");
 goto fail;
 }
+s->is_shared_disk = qemu_opt_get_bool(opts, REPLICATION_SHARED_DISK,
+  false);
+if (s->is_shared_disk && (s->mode == REPLICATION_MODE_PRIMARY)) {
+shared_disk_id = qemu_opt_get(opts, REPLICATION_SHARED_DISK_ID);
+if (!shared_disk_id) {
+error_setg(&local_err, "Missing shared disk blk option");
+goto fail;
+}
+s->shared_disk_id = g_strdup(shared_disk_id);
+blk = blk_by_name(s->shared_disk_id);
+if (!blk) {
+error_setg(&local_err, "There is no %s block", s->shared_disk_id);
+goto fail;
+}
+/* We can't access root member of BlockBackend directly */
+tmp_bs = blk_bs(blk);
+s->primary_disk = QLIST_FIRST(&tmp_bs->parents);
+}
 
 s->rs = replication_new(bs, &replication_ops);
 
 ret = 0;
 
 fail:
+g_free(s->shared_disk_id);
 qemu_opts_del(opts);
 error_propagate(errp, local_err);
 
@@ -135,6 +171,7 @@ static void replication_close(BlockDriverState *bs)
 {
 BDRVReplicationState *s = bs->opaque;
 
+g_free(s->shared_disk_id);
 if (s->replication_state == BLOCK_REPLICATION_RUNNING) {
 replication_stop(s->rs, false, NULL);
 }
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 1b3e6eb..e9fecda 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2624,12 +2624,20 @@
 #  node who owns the replication node chain. Must not be given in
 #  primary mode.
 #
+# @shared-disk-id: #optional id of shared disk while is replication mode,
+#  if @shared-disk is true, this option is required (Since: 
2.9)
+#
+# @shared-disk: #optional (The default is false) to indicate whether or not
+#   a disk is shared by primary VM and secondary VM. (Since: 2.9)
+#
 # Since: 2.8
 ##
 { 'struct': 'BlockdevOptionsReplication',
   'base': 'BlockdevOptionsGenericFormat',
   'data': { 'mode': 'ReplicationMode',
-'*top-id': 'str' } }
+'*top-id': 'str',
+'*shared-disk-id': 'str',
+'*shared-disk': 'bool' } }
 
 ##
 # @NFSTransport:
-- 
1.8.3.1





[Qemu-block] [PATCH v3 1/6] docs/block-replication: Add description for shared-disk case

2017-01-19 Thread zhanghailiang
Introduce the scenario of shared-disk block replication
and how to use it.

Reviewed-by: Changlong Xie <xiecl.f...@cn.fujitsu.com>
Reviewed-by: Stefan Hajnoczi <stefa...@redhat.com>
Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
Signed-off-by: Wen Congyang <we...@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangchen.f...@cn.fujitsu.com>
---
 docs/block-replication.txt | 139 +++--
 1 file changed, 135 insertions(+), 4 deletions(-)

diff --git a/docs/block-replication.txt b/docs/block-replication.txt
index 6bde673..fbfe005 100644
--- a/docs/block-replication.txt
+++ b/docs/block-replication.txt
@@ -24,7 +24,7 @@ only dropped at next checkpoint time. To reduce the network 
transportation
 effort during a vmstate checkpoint, the disk modification operations of
 the Primary disk are asynchronously forwarded to the Secondary node.
 
-== Workflow ==
+== Non-shared disk workflow ==
 The following is the image of block replication workflow:
 
 +--+++
@@ -57,7 +57,7 @@ The following is the image of block replication workflow:
 4) Secondary write requests will be buffered in the Disk buffer and it
will overwrite the existing sector content in the buffer.
 
-== Architecture ==
+== Non-shared disk architecture ==
 We are going to implement block replication from many basic
 blocks that are already in QEMU.
 
@@ -106,6 +106,74 @@ any state that would otherwise be lost by the speculative 
write-through
 of the NBD server into the secondary disk. So before block replication,
 the primary disk and secondary disk should contain the same data.
 
+== Shared Disk Mode Workflow ==
+The following is the image of block replication workflow:
+
++--+++
+|Primary Write Requests||Secondary Write Requests|
++--+++
+  |   |
+  |  (4)
+  |   V
+  |  /-\
+  | (2)Forward and write through | |
+  | +--> | Disk Buffer |
+  | || |
+  | |\-/
+  | |(1)read   |
+  | |  |
+   (3)write   | |  | backing file
+  V |  |
+ +-+   |
+ | Shared Disk | <-+
+ +-+
+
+1) Primary writes will read original data and forward it to Secondary
+   QEMU.
+2) Before Primary write requests are written to Shared disk, the
+   original sector content will be read from Shared disk and
+   forwarded and buffered in the Disk buffer on the secondary site,
+   but it will not overwrite the existing sector content (it could be
+   from either "Secondary Write Requests" or previous COW of "Primary
+   Write Requests") in the Disk buffer.
+3) Primary write requests will be written to Shared disk.
+4) Secondary write requests will be buffered in the Disk buffer and it
+   will overwrite the existing sector content in the buffer.
+
+== Shared Disk Mode Architecture ==
+We are going to implement block replication from many basic
+blocks that are already in QEMU.
+ virtio-blk ||   
.--
+ /  ||   | 
Secondary
+/   ||   
'--
+   /|| 
virtio-blk
+  / || 
 |
+  | ||   
replication(5)
+  |NBD  >   NBD   (2)  
 |
+  |  client ||server ---> hidden disk <-- 
active disk(4)
+  | ^   ||  |
+  |  replication(1) ||  |
+  | |   ||  |
+  |   +-'   ||  |
+ (3)  |drive-backup sync=none   ||  |
+. |   +-+   ||

[Qemu-block] [PATCH v3 5/6] replication: Implement block replication for shared disk case

2017-01-19 Thread zhanghailiang
Just as in the scenario of non-shared disk block replication,
we are going to implement block replication from many basic
blocks that are already in QEMU.
The architecture is:

 virtio-blk ||   
.--
 /  ||   | 
Secondary
/   ||   
'--
   /|| 
virtio-blk
  / ||  
|
  | ||   
replication(5)
  |NBD  >   NBD   (2)   
|
  |  client ||server ---> hidden disk <-- 
active disk(4)
  | ^   ||  |
  |  replication(1) ||  |
  | |   ||  |
  |   +-'   ||  |
 (3)  |drive-backup sync=none   ||  |
. |   +-+   ||  |
Primary | | |   ||   backing|
' | |   ||  |
  V |   |
   +---+|
   |   shared disk | <--+
   +---+

1) Primary writes will read original data and forward it to Secondary
   QEMU.
2) The hidden-disk is created automatically. It buffers the original content
   that is modified by the primary VM. It should also be an empty disk, and
   its driver should support bdrv_make_empty() and backing files.
3) Primary write requests will be written to Shared disk.
4) Secondary write requests will be buffered in the active disk and it
   will overwrite the existing sector content in the buffer.

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
Signed-off-by: Wen Congyang <we...@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangchen.f...@cn.fujitsu.com>
---
 block/replication.c | 48 ++--
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index 70ec08c..a0b3e41 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -233,7 +233,7 @@ static coroutine_fn int 
replication_co_readv(BlockDriverState *bs,
  QEMUIOVector *qiov)
 {
 BDRVReplicationState *s = bs->opaque;
-BdrvChild *child = s->secondary_disk;
+BdrvChild *child = s->is_shared_disk ? s->primary_disk : s->secondary_disk;
 BlockJob *job = NULL;
 CowRequest req;
 int ret;
@@ -415,7 +415,12 @@ static void backup_job_completed(void *opaque, int ret)
 s->error = -EIO;
 }
 
-backup_job_cleanup(bs);
+if (s->mode == REPLICATION_MODE_PRIMARY) {
+s->replication_state = BLOCK_REPLICATION_DONE;
+s->error = 0;
+} else {
+backup_job_cleanup(bs);
+}
 }
 
 static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
@@ -467,6 +472,19 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 
 switch (s->mode) {
 case REPLICATION_MODE_PRIMARY:
+if (s->is_shared_disk) {
+job = backup_job_create(NULL, s->primary_disk->bs, bs, 0,
+MIRROR_SYNC_MODE_NONE, NULL, false, BLOCKDEV_ON_ERROR_REPORT,
+BLOCKDEV_ON_ERROR_REPORT, BLOCK_JOB_INTERNAL,
+backup_job_completed, bs, NULL, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+backup_job_cleanup(bs);
+aio_context_release(aio_context);
+return;
+}
+block_job_start(job);
+}
 break;
 case REPLICATION_MODE_SECONDARY:
 s->active_disk = bs->file;
@@ -485,7 +503,8 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 }
 
 s->secondary_disk = s->hidden_disk->bs->backing;
-if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
+if (!s->secondary_disk->bs ||
+(!s->is_shared_disk && !bdrv_has_blk(s->secondary_disk->bs))) {
 error_setg(errp, "The secondary disk doesn't have block backend");
 aio_context_release(aio_context);
 return;
@@ -580,11 +599,24 @@ static void replication_do_checkpoint(ReplicationState 
*rs, Error **errp)
 
 switch (s->mode) {
 case REPLICATION_MODE_PRIMARY

[Qemu-block] [PATCH] migration: re-active images while migration been canceled after inactive them

2017-01-19 Thread zhanghailiang
Commit fe904ea8242cbae2d7e69c052c754b8f5f1ba1d6 fixed a case
in which migration aborted QEMU because it didn't regain control
of the images when some errors happened.

Actually, there are another two cases that can trigger the same error report:
" bdrv_co_do_pwritev: Assertion `!(bs->open_flags & 0x0800)' failed",

Case 1, code path:
migration_thread()
migration_completion()
bdrv_inactivate_all() ---> inactivate images
qemu_savevm_state_complete_precopy()
socket_writev_buffer() ---> error because destination fails
qemu_fflush() ---> set error on migration stream
-> qmp_migrate_cancel() ---> user cancelled migration concurrently
-> migrate_set_state() ---> set migrate state to CANCELLING
migration_completion() ---> go on to fail_invalidate
if (s->state == MIGRATION_STATUS_ACTIVE) ---> jump over this branch

Case 2, code path:
migration_thread()
migration_completion()
bdrv_inactivate_all() ---> inactivate images
migration_completion() finished
-> qmp_migrate_cancel() ---> user cancelled migration concurrently
qemu_mutex_lock_iothread();
qemu_bh_schedule(s->cleanup_bh);

As we can see from the above, qmp_migrate_cancel() can slip in whenever
migration_thread() does not hold the global lock. If this happens after
bdrv_inactivate_all() has been called, the above error report will appear.

To prevent this, we can call bdrv_invalidate_cache_all() in qmp_migrate_cancel()
directly if we find that the images have become inactive.

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
---
Hi,

I have sent another patch before to fix this problem, but it didn't cover
all the scenarios, and there have been some discussions about this problem.
For more detail, please refer to
https://lists.gnu.org/archive/html/qemu-block/2016-12/msg3.html
---
 include/migration/migration.h |  3 +++
 migration/migration.c | 13 +
 2 files changed, 16 insertions(+)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index c309d23..2d5b724 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -177,6 +177,9 @@ struct MigrationState
 /* Flag set once the migration thread is running (and needs joining) */
 bool migration_thread_running;
 
+/* Flag set once the migration thread called bdrv_inactivate_all */
+bool block_inactive;
+
 /* Queue of outstanding page requests from the destination */
 QemuMutex src_page_req_mutex;
 QSIMPLEQ_HEAD(src_page_requests, MigrationSrcPageRequest) 
src_page_requests;
diff --git a/migration/migration.c b/migration/migration.c
index f498ab8..9defb3e 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -1006,6 +1006,16 @@ static void migrate_fd_cancel(MigrationState *s)
 if (s->state == MIGRATION_STATUS_CANCELLING && f) {
 qemu_file_shutdown(f);
 }
+if (s->state == MIGRATION_STATUS_CANCELLING && s->block_inactive) {
+Error *local_err = NULL;
+
+bdrv_invalidate_cache_all(&local_err);
+if (local_err) {
+error_report_err(local_err);
+} else {
+s->block_inactive = false;
+}
+}
 }
 
 void add_migration_state_change_notifier(Notifier *notify)
@@ -1705,6 +1715,7 @@ static void migration_completion(MigrationState *s, int 
current_active_state,
 if (ret >= 0) {
 qemu_file_set_rate_limit(s->to_dst_file, INT64_MAX);
 qemu_savevm_state_complete_precopy(s->to_dst_file, false);
+s->block_inactive = true;
 }
 }
 qemu_mutex_unlock_iothread();
@@ -1758,6 +1769,8 @@ fail_invalidate:
 bdrv_invalidate_cache_all(&local_err);
 if (local_err) {
 error_report_err(local_err);
+} else {
+s->block_inactive = false;
 }
 }
 
-- 
1.8.3.1





[Qemu-block] [PATCH RFC v2 4/6] replication: fix code logic with the new shared_disk option

2016-12-05 Thread zhanghailiang
Some code logic is only needed in the non-shared disk case; here
we adjust this code to prepare for the shared disk scenario.

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
---
 block/replication.c | 47 ---
 1 file changed, 28 insertions(+), 19 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index 35e9ab3..6574cc2 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -531,21 +531,28 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 aio_context_release(aio_context);
 return;
 }
-bdrv_op_block_all(top_bs, s->blocker);
-bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
 
-job = backup_job_create(NULL, s->secondary_disk->bs, 
s->hidden_disk->bs,
-0, MIRROR_SYNC_MODE_NONE, NULL, false,
+/*
+ * Only in the case of non-shared disk,
+ * the backup job is in the secondary side
+ */
+if (!s->is_shared_disk) {
+bdrv_op_block_all(top_bs, s->blocker);
+bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
+job = backup_job_create(NULL, s->secondary_disk->bs,
+s->hidden_disk->bs, 0,
+MIRROR_SYNC_MODE_NONE, NULL, false,
 BLOCKDEV_ON_ERROR_REPORT,
 BLOCKDEV_ON_ERROR_REPORT, BLOCK_JOB_INTERNAL,
backup_job_completed, bs, NULL, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-backup_job_cleanup(bs);
-aio_context_release(aio_context);
-return;
+if (local_err) {
+error_propagate(errp, local_err);
+backup_job_cleanup(bs);
+aio_context_release(aio_context);
+return;
+}
+block_job_start(job);
 }
-block_job_start(job);
 
 secondary_do_checkpoint(s, errp);
 break;
@@ -575,14 +582,16 @@ static void replication_do_checkpoint(ReplicationState 
*rs, Error **errp)
 case REPLICATION_MODE_PRIMARY:
 break;
 case REPLICATION_MODE_SECONDARY:
-if (!s->secondary_disk->bs->job) {
-error_setg(errp, "Backup job was cancelled unexpectedly");
-break;
-}
-backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-break;
+if (!s->is_shared_disk) {
+if (!s->secondary_disk->bs->job) {
+error_setg(errp, "Backup job was cancelled unexpectedly");
+break;
+}
+backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+break;
+}
 }
 secondary_do_checkpoint(s, errp);
 break;
@@ -663,7 +672,7 @@ static void replication_stop(ReplicationState *rs, bool 
failover, Error **errp)
  * before the BDS is closed, because we will access hidden
  * disk, secondary disk in backup_job_completed().
  */
-if (s->secondary_disk->bs->job) {
+if (!s->is_shared_disk && s->secondary_disk->bs->job) {
 block_job_cancel_sync(s->secondary_disk->bs->job);
 }
 
-- 
1.8.3.1





[Qemu-block] [PATCH RFC v2 0/6] COLO block replication supports shared disk case

2016-12-05 Thread zhanghailiang
COLO block replication doesn't support the shared disk case.
Here we try to implement it.

For the detail of shared-disk scenario, please refer to patch 1.

COLO codes with shared-disk block replication can be found from the link:
https://github.com/coloft/qemu/tree/colo-developing-with-shared-disk-2016-12-5

Test procedures:
1. Secondary:
# x86_64-softmmu/qemu-system-x86_64 -boot c -m 2048 -smp 2 -qmp stdio -vnc :9 
-name secondary -enable-kvm -cpu qemu64,+kvmclock -device piix3-usb-uhci -drive 
if=none,driver=qcow2,file.filename=/mnt/ramfs/hidden_disk.img,id=hidden_disk0,backing.driver=raw,backing.file.filename=/work/kvm/suse11_sp3_64
  -drive 
if=ide,id=active-disk0,driver=replication,mode=secondary,file.driver=qcow2,top-id=active-disk0,file.file.filename=/mnt/ramfs/active_disk.img,file.backing=hidden_disk0,shared-disk=on
 -incoming tcp:0:

Issue qmp commands:
{'execute':'qmp_capabilities'}
{'execute': 'nbd-server-start', 'arguments': {'addr': {'type': 'inet', 'data': 
{'host': '0', 'port': '9998'} } } }
{'execute': 'nbd-server-add', 'arguments': {'device': 'hidden_disk0', 
'writable': true } }

2.Primary:
# x86_64-softmmu/qemu-system-x86_64 -enable-kvm -m 2048 -smp 2 -qmp stdio -vnc 
:9 -name primary -cpu qemu64,+kvmclock -device piix3-usb-uhci -drive 
if=virtio,id=primary_disk0,file.filename=/work/kvm/suse11_sp3_64,driver=raw -S

Issue qmp commands:
{'execute':'qmp_capabilities'}
{'execute': 'human-monitor-command', 'arguments': {'command-line': 'drive_add 
-n buddy 
driver=replication,mode=primary,file.driver=nbd,file.host=9.42.3.17,file.port=9998,file.export=hidden_disk0,shared-disk-id=primary_disk0,shared-disk=on,node-name=rep'}}
{'execute': 'migrate-set-capabilities', 'arguments': {'capabilities': [ 
{'capability': 'x-colo', 'state': true } ] } }
{'execute': 'migrate', 'arguments': {'uri': 'tcp:9.42.3.17:' } }

3. Failover
Secondary side:
Issue qmp commands:
{ 'execute': 'nbd-server-stop' }
{ "execute": "x-colo-lost-heartbeat" }

Please review; any comments are welcome.

Cc: Juan Quintela <quint...@redhat.com>
Cc: Amit Shah <amit.s...@redhat.com> 
Cc: Dr. David Alan Gilbert (git) <dgilb...@redhat.com>
Cc: eddie.d...@intel.com

v2:
- Drop the patch which add a blk_root() helper
- Fix some comments from Changlong

zhanghailiang (6):
  docs/block-replication: Add description for shared-disk case
  replication: add shared-disk and shared-disk-id options
  replication: Split out backup_do_checkpoint() from
secondary_do_checkpoint()
  replication: fix code logic with the new shared_disk option
  replication: Implement block replication for shared disk case
  nbd/replication: implement .bdrv_get_info() for nbd and replication
driver

 block/nbd.c|  12 
 block/replication.c| 156 +++--
 docs/block-replication.txt | 139 ++--
 qapi/block-core.json   |   9 ++-
 4 files changed, 278 insertions(+), 38 deletions(-)

-- 
1.8.3.1





[Qemu-block] [PATCH RFC v2 1/6] docs/block-replication: Add description for shared-disk case

2016-12-05 Thread zhanghailiang
Introduce the scenario of shared-disk block replication
and how to use it.

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
Signed-off-by: Wen Congyang <we...@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangchen.f...@cn.fujitsu.com>
---
v2:
- fix some problems found by Changlong
---
 docs/block-replication.txt | 139 +++--
 1 file changed, 135 insertions(+), 4 deletions(-)

diff --git a/docs/block-replication.txt b/docs/block-replication.txt
index 6bde673..fbfe005 100644
--- a/docs/block-replication.txt
+++ b/docs/block-replication.txt
@@ -24,7 +24,7 @@ only dropped at next checkpoint time. To reduce the network 
transportation
 effort during a vmstate checkpoint, the disk modification operations of
 the Primary disk are asynchronously forwarded to the Secondary node.
 
-== Workflow ==
+== Non-shared disk workflow ==
 The following is the image of block replication workflow:
 
 +--+++
@@ -57,7 +57,7 @@ The following is the image of block replication workflow:
 4) Secondary write requests will be buffered in the Disk buffer and it
will overwrite the existing sector content in the buffer.
 
-== Architecture ==
+== Non-shared disk architecture ==
 We are going to implement block replication from many basic
 blocks that are already in QEMU.
 
@@ -106,6 +106,74 @@ any state that would otherwise be lost by the speculative 
write-through
 of the NBD server into the secondary disk. So before block replication,
 the primary disk and secondary disk should contain the same data.
 
+== Shared Disk Mode Workflow ==
+The following is the image of block replication workflow:
+
++--+++
+|Primary Write Requests||Secondary Write Requests|
++--+++
+  |   |
+  |  (4)
+  |   V
+  |  /-\
+  | (2)Forward and write through | |
+  | +--> | Disk Buffer |
+  | || |
+  | |\-/
+  | |(1)read   |
+  | |  |
+   (3)write   | |  | backing file
+  V |  |
+ +-+   |
+ | Shared Disk | <-+
+ +-+
+
+1) Primary writes will read original data and forward it to Secondary
+   QEMU.
+2) Before Primary write requests are written to Shared disk, the
+   original sector content will be read from Shared disk and
+   forwarded and buffered in the Disk buffer on the secondary site,
+   but it will not overwrite the existing sector content (it could be
+   from either "Secondary Write Requests" or previous COW of "Primary
+   Write Requests") in the Disk buffer.
+3) Primary write requests will be written to Shared disk.
+4) Secondary write requests will be buffered in the Disk buffer and it
+   will overwrite the existing sector content in the buffer.
+
+== Shared Disk Mode Architecture ==
+We are going to implement block replication from many basic
+blocks that are already in QEMU.
+ virtio-blk ||   
.--
+ /  ||   | 
Secondary
+/   ||   
'--
+   /|| 
virtio-blk
+  / || 
 |
+  | ||   
replication(5)
+  |NBD  >   NBD   (2)  
 |
+  |  client ||server ---> hidden disk <-- 
active disk(4)
+  | ^   ||  |
+  |  replication(1) ||  |
+  | |   ||  |
+  |   +-'   ||  |
+ (3)  |drive-backup sync=none   ||  |
+. |   +-+   ||  |
+Primary | | |   ||   backing|
+' |  

[Qemu-block] [PATCH RFC v2 2/6] replication: add shared-disk and shared-disk-id options

2016-12-05 Thread zhanghailiang
We use these two options to identify which disk is
shared.

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
Signed-off-by: Wen Congyang <we...@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangchen.f...@cn.fujitsu.com>
---
v2:
- add these two options for BlockdevOptionsReplication to support
  qmp blockdev-add command.
- fix a memory leak found by Changlong
---
 block/replication.c  | 37 +
 qapi/block-core.json |  9 -
 2 files changed, 45 insertions(+), 1 deletion(-)

diff --git a/block/replication.c b/block/replication.c
index 729dd12..e87ae87 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -25,9 +25,12 @@
 typedef struct BDRVReplicationState {
 ReplicationMode mode;
 int replication_state;
+bool is_shared_disk;
+char *shared_disk_id;
 BdrvChild *active_disk;
 BdrvChild *hidden_disk;
 BdrvChild *secondary_disk;
+BdrvChild *primary_disk;
 char *top_id;
 ReplicationState *rs;
 Error *blocker;
@@ -53,6 +56,9 @@ static void replication_stop(ReplicationState *rs, bool 
failover,
 
 #define REPLICATION_MODE"mode"
 #define REPLICATION_TOP_ID  "top-id"
+#define REPLICATION_SHARED_DISK "shared-disk"
+#define REPLICATION_SHARED_DISK_ID "shared-disk-id"
+
 static QemuOptsList replication_runtime_opts = {
 .name = "replication",
 .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
@@ -65,6 +71,14 @@ static QemuOptsList replication_runtime_opts = {
 .name = REPLICATION_TOP_ID,
 .type = QEMU_OPT_STRING,
 },
+{
+.name = REPLICATION_SHARED_DISK_ID,
+.type = QEMU_OPT_STRING,
+},
+{
+.name = REPLICATION_SHARED_DISK,
+.type = QEMU_OPT_BOOL,
+},
 { /* end of list */ }
 },
 };
@@ -85,6 +99,9 @@ static int replication_open(BlockDriverState *bs, QDict 
*options,
 QemuOpts *opts = NULL;
 const char *mode;
 const char *top_id;
+const char *shared_disk_id;
+BlockBackend *blk;
+BlockDriverState *tmp_bs;
 
 ret = -EINVAL;
opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
@@ -119,6 +136,25 @@ static int replication_open(BlockDriverState *bs, QDict 
*options,
"The option mode's value should be primary or secondary");
 goto fail;
 }
+s->is_shared_disk = qemu_opt_get_bool(opts, REPLICATION_SHARED_DISK,
+  false);
+if (s->is_shared_disk && (s->mode == REPLICATION_MODE_PRIMARY)) {
+shared_disk_id = qemu_opt_get(opts, REPLICATION_SHARED_DISK_ID);
+if (!shared_disk_id) {
+error_setg(&local_err, "Missing shared disk blk");
+goto fail;
+}
+s->shared_disk_id = g_strdup(shared_disk_id);
+blk = blk_by_name(s->shared_disk_id);
+if (!blk) {
+g_free(s->shared_disk_id);
+error_setg(&local_err, "There is no %s block", s->shared_disk_id);
+goto fail;
+}
+/* We can't access root member of BlockBackend directly */
+tmp_bs = blk_bs(blk);
+s->primary_disk = QLIST_FIRST(&tmp_bs->parents);
+}
 
 s->rs = replication_new(bs, &replication_ops);
 
@@ -135,6 +171,7 @@ static void replication_close(BlockDriverState *bs)
 {
 BDRVReplicationState *s = bs->opaque;
 
+g_free(s->shared_disk_id);
 if (s->replication_state == BLOCK_REPLICATION_RUNNING) {
 replication_stop(s->rs, false, NULL);
 }
diff --git a/qapi/block-core.json b/qapi/block-core.json
index c29bef7..52d7e0d 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2232,12 +2232,19 @@
 #  node who owns the replication node chain. Must not be given in
 #  primary mode.
 #
+# @shared-disk-id: #optional The id of shared disk while in replication mode.
+#
+# @shared-disk: #optional To indicate whether or not a disk is shared by
+#   primary VM and secondary VM.
+#
 # Since: 2.8
 ##
 { 'struct': 'BlockdevOptionsReplication',
   'base': 'BlockdevOptionsGenericFormat',
   'data': { 'mode': 'ReplicationMode',
-'*top-id': 'str' } }
+'*top-id': 'str',
+'*shared-disk-id': 'str',
+'*shared-disk': 'bool' } }
 
 ##
 # @NFSTransport
-- 
1.8.3.1
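The lookup in replication_open() above goes from the shared-disk-id string to
a BlockBackend and then to the BdrvChild that attaches it, via the node's
parents list, because BlockBackend's root member is private. A simplified
mock of that indirection (the Mock* types are invented for this example, not
the real block-layer structs) looks like this:

/*
 * Illustrative mock only: the Mock* types are simplified stand-ins invented
 * for this example, not the real BlockBackend/BdrvChild/BlockDriverState.
 */
#include <stdio.h>
#include <string.h>

typedef struct MockBDS MockBDS;

typedef struct MockChild {
    MockBDS *bs;
    struct MockChild *next;       /* next entry in bs->parents */
} MockChild;

struct MockBDS {
    MockChild *parents;           /* every child edge pointing at this node */
};

typedef struct MockBlk {
    const char *name;
    MockChild root;               /* private in the real BlockBackend */
} MockBlk;

static MockBDS shared_bs;
static MockBlk primary_blk = { "primary_disk0", { &shared_bs, NULL } };

static MockBlk *mock_blk_by_name(const char *name)
{
    return strcmp(name, primary_blk.name) == 0 ? &primary_blk : NULL;
}

static MockBDS *mock_blk_bs(MockBlk *blk)
{
    return blk->root.bs;          /* public accessor, like blk_bs() */
}

int main(void)
{
    /* wire the child into the node's parents list, as attaching would do */
    shared_bs.parents = &primary_blk.root;

    MockBlk *blk = mock_blk_by_name("primary_disk0");
    if (!blk) {
        fprintf(stderr, "There is no such block backend\n");
        return 1;
    }

    /* The first parent of the backend's node is the very child that the
     * backend owns, so no access to the private root member is required. */
    MockChild *primary_disk = mock_blk_bs(blk)->parents;
    printf("lookup %s blk->root\n",
           primary_disk == &blk->root ? "matches" : "does not match");
    return 0;
}

This is the same design choice as in the patch: only public accessors are
used, and the BdrvChild is recovered from the node side of the graph.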





[Qemu-block] [PATCH RFC v2 5/6] replication: Implement block replication for shared disk case

2016-12-05 Thread zhanghailiang
Just as with the non-shared disk block replication scenario,
we are going to implement block replication from many basic
blocks that are already in QEMU.
The architecture is:

 virtio-blk ||   .--
 /  ||   | Secondary
/   ||   '--
   /|| virtio-blk
  / ||  |
  | ||   replication(5)
  |NBD  >   NBD   (2)   |
  |  client ||server ---> hidden disk <-- active disk(4)
  | ^   ||  |
  |  replication(1) ||  |
  | |   ||  |
  |   +-'   ||  |
 (3)  |drive-backup sync=none   ||  |
. |   +-+   ||  |
Primary | | |   ||   backing|
' | |   ||  |
  V |   |
   +---+|
   |   shared disk | <--+
   +---+

1) Primary writes will read original data and forward it to Secondary
   QEMU.
2) The hidden-disk is created automatically. It buffers the original content
   that is modified by the primary VM. It should also be an empty disk, and
   its driver must support bdrv_make_empty() and backing files.
3) Primary write requests will be written to Shared disk.
4) Secondary write requests will be buffered in the active disk and it
   will overwrite the existing sector content in the buffer.

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
Signed-off-by: Wen Congyang <we...@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangchen.f...@cn.fujitsu.com>
---
 block/replication.c | 48 ++--
 1 file changed, 42 insertions(+), 6 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index 6574cc2..f416ca5 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -233,7 +233,7 @@ static coroutine_fn int 
replication_co_readv(BlockDriverState *bs,
  QEMUIOVector *qiov)
 {
 BDRVReplicationState *s = bs->opaque;
-BdrvChild *child = s->secondary_disk;
+BdrvChild *child = s->is_shared_disk ? s->primary_disk : s->secondary_disk;
 BlockJob *job = NULL;
 CowRequest req;
 int ret;
@@ -415,7 +415,12 @@ static void backup_job_completed(void *opaque, int ret)
 s->error = -EIO;
 }
 
-backup_job_cleanup(bs);
+if (s->mode == REPLICATION_MODE_PRIMARY) {
+s->replication_state = BLOCK_REPLICATION_DONE;
+s->error = 0;
+} else {
+backup_job_cleanup(bs);
+}
 }
 
 static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
@@ -467,6 +472,19 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 
 switch (s->mode) {
 case REPLICATION_MODE_PRIMARY:
+if (s->is_shared_disk) {
+job = backup_job_create(NULL, s->primary_disk->bs, bs, 0,
+MIRROR_SYNC_MODE_NONE, NULL, false, BLOCKDEV_ON_ERROR_REPORT,
+BLOCKDEV_ON_ERROR_REPORT, BLOCK_JOB_INTERNAL,
+backup_job_completed, bs, NULL, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+backup_job_cleanup(bs);
+aio_context_release(aio_context);
+return;
+}
+block_job_start(job);
+}
 break;
 case REPLICATION_MODE_SECONDARY:
 s->active_disk = bs->file;
@@ -485,7 +503,8 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 }
 
 s->secondary_disk = s->hidden_disk->bs->backing;
-if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
+if (!s->secondary_disk->bs ||
+(!s->is_shared_disk && !bdrv_has_blk(s->secondary_disk->bs))) {
 error_setg(errp, "The secondary disk doesn't have block backend");
 aio_context_release(aio_context);
 return;
@@ -580,11 +599,24 @@ static void replication_do_checkpoint(ReplicationState 
*rs, Error **errp)
 
 switch (s->mode) {
 case REPLICATION_MODE_PRIMARY

[Qemu-block] [PATCH RFC v2 3/6] replication: Split out backup_do_checkpoint() from secondary_do_checkpoint()

2016-12-05 Thread zhanghailiang
The helper backup_do_checkpoint() will be used by primary-related
code. Here we split it out of secondary_do_checkpoint().

Besides, it is unnecessary to call backup_do_checkpoint() on the
replication start path and the normal replication stop path.
We only need to call it when doing a real checkpoint.

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
---
 block/replication.c | 36 +++-
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index e87ae87..35e9ab3 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -332,20 +332,8 @@ static bool 
replication_recurse_is_first_non_filter(BlockDriverState *bs,
 
 static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
 {
-Error *local_err = NULL;
 int ret;
 
-if (!s->secondary_disk->bs->job) {
-error_setg(errp, "Backup job was cancelled unexpectedly");
-return;
-}
-
-backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-return;
-}
-
 ret = s->active_disk->bs->drv->bdrv_make_empty(s->active_disk->bs);
 if (ret < 0) {
 error_setg(errp, "Cannot make active disk empty");
@@ -558,6 +546,8 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 return;
 }
 block_job_start(job);
+
+secondary_do_checkpoint(s, errp);
 break;
 default:
 aio_context_release(aio_context);
@@ -566,10 +556,6 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 
 s->replication_state = BLOCK_REPLICATION_RUNNING;
 
-if (s->mode == REPLICATION_MODE_SECONDARY) {
-secondary_do_checkpoint(s, errp);
-}
-
 s->error = 0;
 aio_context_release(aio_context);
 }
@@ -579,13 +565,29 @@ static void replication_do_checkpoint(ReplicationState 
*rs, Error **errp)
 BlockDriverState *bs = rs->opaque;
 BDRVReplicationState *s;
 AioContext *aio_context;
+Error *local_err = NULL;
 
 aio_context = bdrv_get_aio_context(bs);
 aio_context_acquire(aio_context);
 s = bs->opaque;
 
-if (s->mode == REPLICATION_MODE_SECONDARY) {
+switch (s->mode) {
+case REPLICATION_MODE_PRIMARY:
+break;
+case REPLICATION_MODE_SECONDARY:
+if (!s->secondary_disk->bs->job) {
+error_setg(errp, "Backup job was cancelled unexpectedly");
+break;
+}
+backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+break;
+}
 secondary_do_checkpoint(s, errp);
+break;
+default:
+abort();
 }
 aio_context_release(aio_context);
 }
-- 
1.8.3.1





[Qemu-block] [PATCH RFC v2 6/6] nbd/replication: implement .bdrv_get_info() for nbd and replication driver

2016-12-05 Thread zhanghailiang
Without this callback, there will be an error report on the primary side:
"qemu-system-x86_64: Couldn't determine the cluster size of the target image,
which has no backing file: Operation not supported
Aborting, since this may create an unusable destination image"

The nbd driver doesn't have a cluster size, so here we return
a fake value for it.

This patch should be dropped if Eric's nbd patch is merged.
https://lists.gnu.org/archive/html/qemu-devel/2016-04/msg03567.html

Cc: Eric Blake <ebl...@redhat.com>
Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
Signed-off-by: Wen Congyang <we...@cn.fujitsu.com>
---
 block/nbd.c | 12 
 block/replication.c |  6 ++
 2 files changed, 18 insertions(+)

diff --git a/block/nbd.c b/block/nbd.c
index 35f24be..b71a13d 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -43,6 +43,8 @@
 
 #define EN_OPTSTR ":exportname="
 
+#define NBD_FAKE_CLUSTER_SIZE 512
+
 typedef struct BDRVNBDState {
 NBDClientSession client;
 
@@ -552,6 +554,13 @@ static void nbd_refresh_filename(BlockDriverState *bs, 
QDict *options)
 bs->full_open_options = opts;
 }
 
+static int nbd_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+bdi->cluster_size  = NBD_FAKE_CLUSTER_SIZE;
+
+return 0;
+}
+
 static BlockDriver bdrv_nbd = {
 .format_name= "nbd",
 .protocol_name  = "nbd",
@@ -569,6 +578,7 @@ static BlockDriver bdrv_nbd = {
 .bdrv_detach_aio_context= nbd_detach_aio_context,
 .bdrv_attach_aio_context= nbd_attach_aio_context,
 .bdrv_refresh_filename  = nbd_refresh_filename,
+.bdrv_get_info  = nbd_get_info,
 };
 
 static BlockDriver bdrv_nbd_tcp = {
@@ -588,6 +598,7 @@ static BlockDriver bdrv_nbd_tcp = {
 .bdrv_detach_aio_context= nbd_detach_aio_context,
 .bdrv_attach_aio_context= nbd_attach_aio_context,
 .bdrv_refresh_filename  = nbd_refresh_filename,
+.bdrv_get_info  = nbd_get_info,
 };
 
 static BlockDriver bdrv_nbd_unix = {
@@ -607,6 +618,7 @@ static BlockDriver bdrv_nbd_unix = {
 .bdrv_detach_aio_context= nbd_detach_aio_context,
 .bdrv_attach_aio_context= nbd_attach_aio_context,
 .bdrv_refresh_filename  = nbd_refresh_filename,
+.bdrv_get_info  = nbd_get_info,
 };
 
 static void bdrv_nbd_init(void)
diff --git a/block/replication.c b/block/replication.c
index f416ca5..5f14360 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -731,6 +731,11 @@ static void replication_stop(ReplicationState *rs, bool 
failover, Error **errp)
 aio_context_release(aio_context);
 }
 
+static int replication_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+return bdrv_get_info(bs->file->bs, bdi);
+}
+
 BlockDriver bdrv_replication = {
 .format_name= "replication",
 .protocol_name  = "replication",
@@ -743,6 +748,7 @@ BlockDriver bdrv_replication = {
 .bdrv_co_readv  = replication_co_readv,
 .bdrv_co_writev = replication_co_writev,
 
+.bdrv_get_info  = replication_get_info,
 .is_filter  = true,
 .bdrv_recurse_is_first_non_filter = 
replication_recurse_is_first_non_filter,
 
-- 
1.8.3.1
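For reference, the error quoted above comes from the backup target's
cluster-size probe. The sketch below mimics that decision; the 64 KiB default
and the "larger of default and reported size" rule are assumptions made for
illustration rather than a verbatim copy of block/backup.c, but they show how
even the fake 512-byte value is enough to keep the job from aborting:

/*
 * Illustrative sketch only: the 64 KiB default and the MAX() rule are
 * assumptions for this example, not a verbatim copy of block/backup.c.
 */
#include <stdbool.h>
#include <stdio.h>

#define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)   /* assumed 64 KiB default */
#define NBD_FAKE_CLUSTER_SIZE 512

typedef struct {
    int cluster_size;
} FakeBdi;                     /* stand-in for BlockDriverInfo */

typedef struct {
    bool has_get_info;         /* does the driver implement .bdrv_get_info? */
    bool has_backing_file;
    int cluster_size;
} FakeTarget;                  /* stand-in for the backup target */

static int fake_bdrv_get_info(const FakeTarget *t, FakeBdi *bdi)
{
    if (!t->has_get_info) {
        return -1;             /* "Operation not supported" */
    }
    bdi->cluster_size = t->cluster_size;
    return 0;
}

/* Returns the chosen copy granularity, or -1 for the "Aborting" case. */
static int backup_cluster_size(const FakeTarget *target)
{
    FakeBdi bdi;

    if (fake_bdrv_get_info(target, &bdi) < 0) {
        if (!target->has_backing_file) {
            fprintf(stderr, "Couldn't determine the cluster size of the "
                            "target image, which has no backing file\n");
            return -1;
        }
        return BACKUP_CLUSTER_SIZE_DEFAULT;
    }
    return bdi.cluster_size > BACKUP_CLUSTER_SIZE_DEFAULT
           ? bdi.cluster_size : BACKUP_CLUSTER_SIZE_DEFAULT;
}

int main(void)
{
    FakeTarget nbd_without_cb = { false, false, 0 };
    FakeTarget nbd_with_cb = { true, false, NBD_FAKE_CLUSTER_SIZE };

    printf("without .bdrv_get_info: %d\n", backup_cluster_size(&nbd_without_cb));
    printf("with the fake 512 byte value: %d\n", backup_cluster_size(&nbd_with_cb));
    return 0;
}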





[Qemu-block] [PATCH RFC 7/7] nbd/replication: implement .bdrv_get_info() for nbd and replication driver

2016-10-20 Thread zhanghailiang
Without this callback, there will be an error report on the primary side:
"qemu-system-x86_64: Couldn't determine the cluster size of the target image,
which has no backing file: Operation not supported
Aborting, since this may create an unusable destination image"

The nbd driver doesn't have a cluster size, so here we return
a fake value for it.

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
Signed-off-by: Wen Congyang <we...@cn.fujitsu.com>
---
 block/nbd.c | 12 
 block/replication.c |  6 ++
 2 files changed, 18 insertions(+)

diff --git a/block/nbd.c b/block/nbd.c
index 6bc06d6..96d7023 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -40,6 +40,8 @@
 
 #define EN_OPTSTR ":exportname="
 
+#define NBD_FAKE_CLUSTER_SIZE 512
+
 typedef struct BDRVNBDState {
 NbdClientSession client;
 
@@ -483,6 +485,13 @@ static void nbd_refresh_filename(BlockDriverState *bs, 
QDict *options)
 bs->full_open_options = opts;
 }
 
+static int nbd_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+bdi->cluster_size  = NBD_FAKE_CLUSTER_SIZE;
+
+return 0;
+}
+
 static BlockDriver bdrv_nbd = {
 .format_name= "nbd",
 .protocol_name  = "nbd",
@@ -499,6 +508,7 @@ static BlockDriver bdrv_nbd = {
 .bdrv_detach_aio_context= nbd_detach_aio_context,
 .bdrv_attach_aio_context= nbd_attach_aio_context,
 .bdrv_refresh_filename  = nbd_refresh_filename,
+.bdrv_get_info  = nbd_get_info,
 };
 
 static BlockDriver bdrv_nbd_tcp = {
@@ -517,6 +527,7 @@ static BlockDriver bdrv_nbd_tcp = {
 .bdrv_detach_aio_context= nbd_detach_aio_context,
 .bdrv_attach_aio_context= nbd_attach_aio_context,
 .bdrv_refresh_filename  = nbd_refresh_filename,
+.bdrv_get_info  = nbd_get_info,
 };
 
 static BlockDriver bdrv_nbd_unix = {
@@ -535,6 +546,7 @@ static BlockDriver bdrv_nbd_unix = {
 .bdrv_detach_aio_context= nbd_detach_aio_context,
 .bdrv_attach_aio_context= nbd_attach_aio_context,
 .bdrv_refresh_filename  = nbd_refresh_filename,
+.bdrv_get_info  = nbd_get_info,
 };
 
 static void bdrv_nbd_init(void)
diff --git a/block/replication.c b/block/replication.c
index e66b1ca..14c718e 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -707,6 +707,11 @@ static void replication_stop(ReplicationState *rs, bool 
failover, Error **errp)
 aio_context_release(aio_context);
 }
 
+static int replication_get_info(BlockDriverState *bs, BlockDriverInfo *bdi)
+{
+return bdrv_get_info(bs->file->bs, bdi);
+}
+
 BlockDriver bdrv_replication = {
 .format_name= "replication",
 .protocol_name  = "replication",
@@ -719,6 +724,7 @@ BlockDriver bdrv_replication = {
 .bdrv_co_readv  = replication_co_readv,
 .bdrv_co_writev = replication_co_writev,
 
+.bdrv_get_info  = replication_get_info,
 .is_filter  = true,
 .bdrv_recurse_is_first_non_filter = 
replication_recurse_is_first_non_filter,
 
-- 
1.8.3.1





[Qemu-block] [PATCH RFC 1/7] docs/block-replication: Add description for shared-disk case

2016-10-20 Thread zhanghailiang
Introduce the scenario of shared-disk block replication
and how to use it.

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
Signed-off-by: Wen Congyang <we...@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangchen.f...@cn.fujitsu.com>
---
 docs/block-replication.txt | 131 +++--
 1 file changed, 127 insertions(+), 4 deletions(-)

diff --git a/docs/block-replication.txt b/docs/block-replication.txt
index 6bde673..97fcfc1 100644
--- a/docs/block-replication.txt
+++ b/docs/block-replication.txt
@@ -24,7 +24,7 @@ only dropped at next checkpoint time. To reduce the network 
transportation
 effort during a vmstate checkpoint, the disk modification operations of
 the Primary disk are asynchronously forwarded to the Secondary node.
 
-== Workflow ==
+== Non-shared disk workflow ==
 The following is the image of block replication workflow:
 
 +--+++
@@ -57,7 +57,7 @@ The following is the image of block replication workflow:
 4) Secondary write requests will be buffered in the Disk buffer and it
will overwrite the existing sector content in the buffer.
 
-== Architecture ==
+== Non-shared disk architecture ==
 We are going to implement block replication from many basic
 blocks that are already in QEMU.
 
@@ -106,6 +106,74 @@ any state that would otherwise be lost by the speculative 
write-through
 of the NBD server into the secondary disk. So before block replication,
 the primary disk and secondary disk should contain the same data.
 
+== Shared Disk Mode Workflow ==
+The following is the image of block replication workflow:
+
+         +----------------------+            +------------------------+
+         |Primary Write Requests|            |Secondary Write Requests|
+         +----------------------+            +------------------------+
+                   |                                      |
+                   |                                     (4)
+                   |                                      V
+                   |                              /-------------\
+                   | (2)Forward and write through |             |
+                   | +--------------------------> | Disk Buffer |
+                   | |                            |             |
+                   | |                            \-------------/
+                   | |(1)read                            |
+                   | |                                   |
+          (3)write | |                                   | backing file
+                   V |                                   |
+              +-------------+                            |
+              | Shared Disk | <--------------------------+
+              +-------------+
+
+1) Primary writes will read original data and forward it to Secondary
+   QEMU.
+2) Before Primary write requests are written to Shared disk, the
+   original sector content will be read from Shared disk and
+   forwarded and buffered in the Disk buffer on the secondary site,
+   but it will not overwrite the existing
+   sector content (it could be from either "Secondary Write Requests" or
+   previous COW of "Primary Write Requests") in the Disk buffer.
+3) Primary write requests will be written to Shared disk.
+4) Secondary write requests will be buffered in the Disk buffer and it
+   will overwrite the existing sector content in the buffer.
+
+== Shared Disk Mode Architecture ==
+We are going to implement block replication from many basic
+blocks that are already in QEMU.
+ virtio-blk ||   .--
+ /  ||   | Secondary
+/   ||   '--
+   /|| virtio-blk
+  / ||  |
+  | ||   replication(5)
+  |NBD  >   NBD   (2)   |
+  |  client ||server ---> hidden disk <-- active disk(4)
+  | ^   ||  |
+  |  replication(1) ||  |
+  | |   ||  |
+  |   +-'   ||  |
+ (3)  |drive-backup sync=none   ||  |
+. |   +-+   ||  |
+Primary | | |   ||   backing|
+' |  

[Qemu-block] [PATCH RFC 5/7] replication: fix code logic with the new shared_disk option

2016-10-20 Thread zhanghailiang
Some code logic is only needed in the non-shared disk case; here
we adjust that code to prepare for the shared disk scenario.

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
---
 block/replication.c | 44 ++--
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index d687ffc..39c616d 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -517,15 +517,21 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 bdrv_op_block_all(top_bs, s->blocker);
 bdrv_op_unblock(top_bs, BLOCK_OP_TYPE_DATAPLANE, s->blocker);
 
-backup_start("replication-backup", s->secondary_disk->bs,
- s->hidden_disk->bs, 0, MIRROR_SYNC_MODE_NONE, NULL, false,
- BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
- backup_job_completed, s, NULL, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-backup_job_cleanup(s);
-aio_context_release(aio_context);
-return;
+/*
+ * Only in the case of non-shared disk,
+ * the backup job is in the Slave side
+ */
+if (!s->is_shared_disk) {
+backup_start("replication-backup", s->secondary_disk->bs,
+s->hidden_disk->bs, 0, MIRROR_SYNC_MODE_NONE, NULL, false,
+BLOCKDEV_ON_ERROR_REPORT, BLOCKDEV_ON_ERROR_REPORT,
+backup_job_completed, s, NULL, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+backup_job_cleanup(s);
+aio_context_release(aio_context);
+return;
+}
 }
 
 secondary_do_checkpoint(s, errp);
@@ -556,14 +562,16 @@ static void replication_do_checkpoint(ReplicationState 
*rs, Error **errp)
 case REPLICATION_MODE_PRIMARY:
 break;
 case REPLICATION_MODE_SECONDARY:
-if (!s->secondary_disk->bs->job) {
-error_setg(errp, "Backup job was cancelled unexpectedly");
-break;
-}
-backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-break;
+if (!s->is_shared_disk) {
+if (!s->secondary_disk->bs->job) {
+error_setg(errp, "Backup job was cancelled unexpectedly");
+break;
+}
+backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+break;
+}
 }
 secondary_do_checkpoint(s, errp);
 break;
@@ -644,7 +652,7 @@ static void replication_stop(ReplicationState *rs, bool 
failover, Error **errp)
  * before the BDS is closed, because we will access hidden
  * disk, secondary disk in backup_job_completed().
  */
-if (s->secondary_disk->bs->job) {
+if (!s->is_shared_disk && s->secondary_disk->bs->job) {
 block_job_cancel_sync(s->secondary_disk->bs->job);
 }
 
-- 
1.8.3.1





[Qemu-block] [PATCH RFC 3/7] replication: add shared-disk and shared-disk-id options

2016-10-20 Thread zhanghailiang
We use these two options to identify which disk is shared.

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
Signed-off-by: Wen Congyang <we...@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangchen.f...@cn.fujitsu.com>
---
 block/replication.c | 33 +
 1 file changed, 33 insertions(+)

diff --git a/block/replication.c b/block/replication.c
index 3bd1cf1..2a2fdb2 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -25,9 +25,12 @@
 typedef struct BDRVReplicationState {
 ReplicationMode mode;
 int replication_state;
+bool is_shared_disk;
+char *shared_disk_id;
 BdrvChild *active_disk;
 BdrvChild *hidden_disk;
 BdrvChild *secondary_disk;
+BdrvChild *primary_disk;
 char *top_id;
 ReplicationState *rs;
 Error *blocker;
@@ -53,6 +56,9 @@ static void replication_stop(ReplicationState *rs, bool 
failover,
 
 #define REPLICATION_MODE"mode"
 #define REPLICATION_TOP_ID  "top-id"
+#define REPLICATION_SHARED_DISK "shared-disk"
+#define REPLICATION_SHARED_DISK_ID "shared-disk-id"
+
 static QemuOptsList replication_runtime_opts = {
 .name = "replication",
 .head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
@@ -65,6 +71,14 @@ static QemuOptsList replication_runtime_opts = {
 .name = REPLICATION_TOP_ID,
 .type = QEMU_OPT_STRING,
 },
+{
+.name = REPLICATION_SHARED_DISK_ID,
+.type = QEMU_OPT_STRING,
+},
+{
+.name = REPLICATION_SHARED_DISK,
+.type = QEMU_OPT_BOOL,
+},
 { /* end of list */ }
 },
 };
@@ -85,6 +99,8 @@ static int replication_open(BlockDriverState *bs, QDict 
*options,
 QemuOpts *opts = NULL;
 const char *mode;
 const char *top_id;
+const char *shared_disk_id;
+BlockBackend *blk;
 
 ret = -EINVAL;
 opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
@@ -114,6 +130,22 @@ static int replication_open(BlockDriverState *bs, QDict 
*options,
"The option mode's value should be primary or secondary");
 goto fail;
 }
+s->is_shared_disk = qemu_opt_get_bool(opts, REPLICATION_SHARED_DISK,
+false);
+if (s->is_shared_disk && (s->mode == REPLICATION_MODE_PRIMARY)) {
+shared_disk_id = qemu_opt_get(opts, REPLICATION_SHARED_DISK_ID);
+if (!shared_disk_id) {
+error_setg(&local_err, "Missing shared disk blk");
+goto fail;
+}
+s->shared_disk_id = g_strdup(shared_disk_id);
+blk = blk_by_name(s->shared_disk_id);
+if (!blk) {
+error_setg(&local_err, "There is no %s block", s->shared_disk_id);
+goto fail;
+}
+s->primary_disk = blk_root(blk);
+}
 
 s->rs = replication_new(bs, &replication_ops);
 
@@ -130,6 +162,7 @@ static void replication_close(BlockDriverState *bs)
 {
 BDRVReplicationState *s = bs->opaque;
 
+g_free(s->shared_disk_id);
 if (s->replication_state == BLOCK_REPLICATION_RUNNING) {
 replication_stop(s->rs, false, NULL);
 }
-- 
1.8.3.1





[Qemu-block] [PATCH RFC 6/7] replication: Implement block replication for shared disk case

2016-10-20 Thread zhanghailiang
Just as with the non-shared disk block replication scenario,
we are going to implement block replication from many basic
blocks that are already in QEMU.
The architecture is:

 virtio-blk ||   .--
 /  ||   | Secondary
/   ||   '--
   /|| virtio-blk
  / ||  |
  | ||   replication(5)
  |NBD  >   NBD   (2)   |
  |  client ||server ---> hidden disk <-- active disk(4)
  | ^   ||  |
  |  replication(1) ||  |
  | |   ||  |
  |   +-'   ||  |
 (3)  |drive-backup sync=none   ||  |
. |   +-+   ||  |
Primary | | |   ||   backing|
' | |   ||  |
  V |   |
   +---+|
   |   shared disk | <--+
   +---+

1) Primary writes will read original data and forward it to Secondary
   QEMU.
2) The hidden-disk is created automatically. It buffers the original content
   that is modified by the primary VM. It should also be an empty disk, and
   its driver must support bdrv_make_empty() and backing files.
3) Primary write requests will be written to Shared disk.
4) Secondary write requests will be buffered in the active disk and it
   will overwrite the existing sector content in the buffer.

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
Signed-off-by: Wen Congyang <we...@cn.fujitsu.com>
Signed-off-by: Zhang Chen <zhangchen.f...@cn.fujitsu.com>
---
 block/replication.c | 45 ++---
 1 file changed, 38 insertions(+), 7 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index 39c616d..e66b1ca 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -221,7 +221,7 @@ static coroutine_fn int 
replication_co_readv(BlockDriverState *bs,
  QEMUIOVector *qiov)
 {
 BDRVReplicationState *s = bs->opaque;
-BdrvChild *child = s->secondary_disk;
+BdrvChild *child = s->is_shared_disk ? s->primary_disk : s->secondary_disk;
 BlockJob *job = NULL;
 CowRequest req;
 int ret;
@@ -398,8 +398,12 @@ static void backup_job_completed(void *opaque, int ret)
 /* The backup job is cancelled unexpectedly */
 s->error = -EIO;
 }
-
-backup_job_cleanup(s);
+if (s->mode == REPLICATION_MODE_PRIMARY) {
+s->replication_state = BLOCK_REPLICATION_DONE;
+s->error = 0;
+} else {
+backup_job_cleanup(s);
+}
 }
 
 static bool check_top_bs(BlockDriverState *top_bs, BlockDriverState *bs)
@@ -450,6 +454,15 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 
 switch (s->mode) {
 case REPLICATION_MODE_PRIMARY:
+if (s->is_shared_disk) {
+backup_start("replication-backup", s->primary_disk->bs, bs, 0,
+MIRROR_SYNC_MODE_NONE, NULL, false, BLOCKDEV_ON_ERROR_REPORT,
+BLOCKDEV_ON_ERROR_REPORT, backup_job_completed,
+s, NULL, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+}
+}
 break;
 case REPLICATION_MODE_SECONDARY:
 s->active_disk = bs->file;
@@ -468,7 +481,8 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 }
 
 s->secondary_disk = s->hidden_disk->bs->backing;
-if (!s->secondary_disk->bs || !bdrv_has_blk(s->secondary_disk->bs)) {
+if (!s->secondary_disk->bs ||
+(!s->is_shared_disk && !bdrv_has_blk(s->secondary_disk->bs))) {
 error_setg(errp, "The secondary disk doesn't have block backend");
 aio_context_release(aio_context);
 return;
@@ -560,11 +574,24 @@ static void replication_do_checkpoint(ReplicationState 
*rs, Error **errp)
 
 switch (s->mode) {
 case REPLICATION_MODE_PRIMARY:
+if (s->is_shared_disk) {
+if (!s->primary_dis

[Qemu-block] [PATCH RFC 2/7] block-backend: Introduce blk_root() helper

2016-10-20 Thread zhanghailiang
With this helper function, we can get the BdrvChild struct
from a BlockBackend.

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
---
 block/block-backend.c  | 5 +
 include/sysemu/block-backend.h | 1 +
 2 files changed, 6 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index 1a724a8..66387f0 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -389,6 +389,11 @@ BlockDriverState *blk_bs(BlockBackend *blk)
 return blk->root ? blk->root->bs : NULL;
 }
 
+BdrvChild *blk_root(BlockBackend *blk)
+{
+return blk->root;
+}
+
 static BlockBackend *bdrv_first_blk(BlockDriverState *bs)
 {
 BdrvChild *child;
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index b07159b..867f9f5 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -99,6 +99,7 @@ void blk_remove_bs(BlockBackend *blk);
 void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs);
 bool bdrv_has_blk(BlockDriverState *bs);
 bool bdrv_is_root_node(BlockDriverState *bs);
+BdrvChild *blk_root(BlockBackend *blk);
 
 void blk_set_allow_write_beyond_eof(BlockBackend *blk, bool allow);
 void blk_iostatus_enable(BlockBackend *blk);
-- 
1.8.3.1





[Qemu-block] [PATCH RFC 4/7] replication: Split out backup_do_checkpoint() from secondary_do_checkpoint()

2016-10-20 Thread zhanghailiang
The helper backup_do_checkpoint() will be used by primary-related
code. Here we split it out of secondary_do_checkpoint().

Besides, it is unnecessary to call backup_do_checkpoint() on the
replication start path and the normal replication stop path.
We only need to call it when doing a real checkpoint.

Signed-off-by: zhanghailiang <zhang.zhanghaili...@huawei.com>
---
 block/replication.c | 36 +++-
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/block/replication.c b/block/replication.c
index 2a2fdb2..d687ffc 100644
--- a/block/replication.c
+++ b/block/replication.c
@@ -320,20 +320,8 @@ static bool 
replication_recurse_is_first_non_filter(BlockDriverState *bs,
 
 static void secondary_do_checkpoint(BDRVReplicationState *s, Error **errp)
 {
-Error *local_err = NULL;
 int ret;
 
-if (!s->secondary_disk->bs->job) {
-error_setg(errp, "Backup job was cancelled unexpectedly");
-return;
-}
-
-backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
-if (local_err) {
-error_propagate(errp, local_err);
-return;
-}
-
 ret = s->active_disk->bs->drv->bdrv_make_empty(s->active_disk->bs);
 if (ret < 0) {
 error_setg(errp, "Cannot make active disk empty");
@@ -539,6 +527,8 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 aio_context_release(aio_context);
 return;
 }
+
+secondary_do_checkpoint(s, errp);
 break;
 default:
 aio_context_release(aio_context);
@@ -547,10 +537,6 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,
 
 s->replication_state = BLOCK_REPLICATION_RUNNING;
 
-if (s->mode == REPLICATION_MODE_SECONDARY) {
-secondary_do_checkpoint(s, errp);
-}
-
 s->error = 0;
 aio_context_release(aio_context);
 }
@@ -560,13 +546,29 @@ static void replication_do_checkpoint(ReplicationState 
*rs, Error **errp)
 BlockDriverState *bs = rs->opaque;
 BDRVReplicationState *s;
 AioContext *aio_context;
+Error *local_err = NULL;
 
 aio_context = bdrv_get_aio_context(bs);
 aio_context_acquire(aio_context);
 s = bs->opaque;
 
-if (s->mode == REPLICATION_MODE_SECONDARY) {
+switch (s->mode) {
+case REPLICATION_MODE_PRIMARY:
+break;
+case REPLICATION_MODE_SECONDARY:
+if (!s->secondary_disk->bs->job) {
+error_setg(errp, "Backup job was cancelled unexpectedly");
+break;
+}
+backup_do_checkpoint(s->secondary_disk->bs->job, &local_err);
+if (local_err) {
+error_propagate(errp, local_err);
+break;
+}
 secondary_do_checkpoint(s, errp);
+break;
+default:
+abort();
 }
 aio_context_release(aio_context);
 }
-- 
1.8.3.1
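The reasoning in the commit message above -- reset the copy-before-write state
only at a real checkpoint -- can be seen in a tiny standalone sketch. The names
below (copied, guest_write, do_checkpoint) are invented for illustration and
stand in for the backup job's per-cluster tracking; they are not QEMU code:

/*
 * Illustrative sketch only: "copied" stands in for the backup job's
 * per-cluster tracking; none of these names are QEMU code.
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define NUM_CLUSTERS 8

static bool copied[NUM_CLUSTERS];   /* clusters already forwarded this epoch */
static int forwards;                /* originals sent to the Secondary so far */

/* Copy-before-write: forward the original cluster at most once per epoch. */
static void guest_write(int cluster)
{
    if (!copied[cluster]) {
        copied[cluster] = true;
        forwards++;                 /* read the original and send it out */
    }
    /* ... the write itself then goes to the disk ... */
}

/* A real checkpoint discards the Secondary's buffered state, so the next
 * epoch has to forward originals again; only then is a reset needed.
 * At replication start nothing has been copied yet, so no reset is needed. */
static void do_checkpoint(void)
{
    memset(copied, 0, sizeof(copied));
}

int main(void)
{
    guest_write(2);
    guest_write(2);                 /* same cluster: no second forward */
    printf("forwards in epoch 1: %d\n", forwards);            /* 1 */

    do_checkpoint();

    guest_write(2);                 /* new epoch: forwarded again */
    printf("forwards after the checkpoint: %d\n", forwards);  /* 2 */
    return 0;
}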





[Qemu-block] [PATCH RFC 0/7] COLO block replication supports shared disk case

2016-10-20 Thread zhanghailiang
COLO block replication doesn't support the shared disk case;
here we try to implement it.

Just as with the non-shared disk block replication scenario,
we are going to implement block replication from many basic
blocks that are already in QEMU.
The architecture is:

 virtio-blk ||   .--
 /  ||   | Secondary
/   ||   '--
   /|| virtio-blk
  / ||  |
  | ||   replication(5)
  |NBD  >   NBD   (2)   |
  |  client ||server ---> hidden disk <-- active disk(4)
  | ^   ||  |
  |  replication(1) ||  |
  | |   ||  |
  |   +-'   ||  |
 (3)  |drive-backup sync=none   ||  |
. |   +-+   ||  |
Primary | | |   ||   backing|
' | |   ||  |
  V |   |
   +---+|
   |   shared disk | <--+
   +---+
1) Primary writes will read original data and forward it to Secondary
   QEMU.
2) The hidden-disk buffers the original content that is modified
   by the primary VM. It should also be an empty disk, and
   its driver must support bdrv_make_empty() and backing files.
3) Primary write requests will be written to Shared disk.
4) Secondary write requests will be buffered in the active disk and they
   will overwrite the existing sector content in the buffer.

For more details, please refer to patch 1.

The complete codes can be found from the link:
https://github.com/coloft/qemu/tree/colo-v5.1-developing-COLO-frame-v21-with-shared-disk

Test steps:
1. Secondary:
# x86_64-softmmu/qemu-system-x86_64 -boot c -m 2048 -smp 2 -qmp stdio -vnc :9 
-name secondary -enable-kvm -cpu qemu64,+kvmclock -device piix3-usb-uhci -drive 
if=none,driver=qcow2,file.filename=/mnt/ramfs/hidden_disk.img,id=hidden_disk0,backing.driver=raw,backing.file.filename=/work/kvm/suse11_sp3_64
  -drive 
if=virtio,id=active-disk0,driver=replication,mode=secondary,file.driver=qcow2,top-id=active-disk0,file.file.filename=/mnt/ramfs/active_disk.img,file.backing=hidden_disk0,shared-disk=on
 -incoming tcp:0:

Issue qmp commands:
{'execute':'qmp_capabilities'}
{'execute': 'nbd-server-start', 'arguments': {'addr': {'type': 'inet', 'data': 
{'host': '0', 'port': '9998'} } } }
{'execute': 'nbd-server-add', 'arguments': {'device': 'hidden_disk0', 
'writable': true } }

2.Primary:
# x86_64-softmmu/qemu-system-x86_64 -enable-kvm -m 2048 -smp 2 -qmp stdio -vnc 
:9 -name primary -cpu qemu64,+kvmclock -device piix3-usb-uhci -drive 
if=virtio,id=primary_disk0,file.filename=/work/kvm/suse11_sp3_64,driver=raw -S

Issue qmp commands:
{'execute':'qmp_capabilities'}
{'execute': 'human-monitor-command', 'arguments': {'command-line': 'drive_add 
-n buddy 
driver=replication,mode=primary,file.driver=nbd,file.host=9.42.3.17,file.port=9998,file.export=hidden_disk0,shared-disk-id=primary_disk0,shared-disk=on,node-name=rep'}}
{'execute': 'migrate-set-capabilities', 'arguments': {'capabilities': [ 
{'capability': 'x-colo', 'state': true } ] } }
{'execute': 'migrate', 'arguments': {'uri': 'tcp:9.42.3.17:' } }

3. Failover
Secondary side:
Issue qmp commands:
{ 'execute': 'nbd-server-stop' }
{ "execute": "x-colo-lost-heartbeat" }

Please review and any commits are welcomed.

Cc: Juan Quintela <quint...@redhat.com>
Cc: Amit Shah <amit.s...@redhat.com> 
Cc: Dr. David Alan Gilbert (git) <dgilb...@redhat.com>

zhanghailiang (7):
  docs/block-replication: Add description for shared-disk case
  block-backend: Introduce blk_root() helper
  replication: add shared-disk and shared-disk-id options
  replication: Split out backup_do_checkpoint() from
secondary_do_checkpoint()
  replication: fix code logic with the new shared_disk option
  replication: Implement block replication for shared disk case
  nbd/replication: implement .bdrv_get_info() for nbd and replication
driver

 block/block-backend.c  |   5 ++
 block/nbd.c|  12 
 block/replication.c| 146 +++--
 docs/block-replication.txt | 131 ++--
 include/sy