Re: [Qemu-block] [Qemu-devel] [PATCH v2 1/1] block: clarify error message for qmp-eject

2016-05-20 Thread John Snow


On 05/20/2016 10:48 AM, Markus Armbruster wrote:
> John Snow  writes:
> 
>> If you use HMP's eject but the CDROM tray is locked, you may get a
>> confusing error message informing you that the "tray isn't open."
>>
>> As this is the point of eject, we can do a little better and help
>> clarify that the tray was locked and that it (might) open up later,
>> so try again.
>>
>> It's not ideal, but it makes the semantics of the (legacy) eject
>> command more understandable to end users when they try to use it.
>>
>> Signed-off-by: John Snow 
> [...]
>> @@ -2327,35 +2340,36 @@ void qmp_block_passwd(bool has_device, const char 
>> *device,
>>  aio_context_release(aio_context);
>>  }
>>  
>> -void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
>> -Error **errp)
>> +/**
>> + * returns -errno on fatal error, +errno for non-fatal situations.
>> + * errp will always be set when the return code is negative.
>> + * May return +ENOSYS if the device has no tray,
>> + * or +EINPROGRESS if the tray is locked and the guest has been notified.
>> + */
> 
> Returning or testing for positive errno instead of a negative one is a
> fairly common error.  The more we can restrict use of positive errno
> codes to errno itself, the less likely such errors are.
> 
> Moreover, I feel fatal vs. non-fatal is not for this function to decide.
> It's the caller's business.  I'd return -errno on any error.  If you
> need this function to also set an error, because it can do a better job
> than its callers, then set it on any error.  If a caller wants to
> suppress a certain error, it can simply free the error.  Clean
> separation of concerns, and a simpler interface.
> 
>> +static int do_open_tray(const char *device, bool force, Error **errp)
>>  {
>>  BlockBackend *blk;
>>  bool locked;
>>  
>> -if (!has_force) {
>> -force = false;
>> -}
>> -
>>  blk = blk_by_name(device);
>>  if (!blk) {
>>  error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
>>"Device '%s' not found", device);
>> -return;
>> +return -ENODEV;
>>  }
>>  
>>  if (!blk_dev_has_removable_media(blk)) {
>>  error_setg(errp, "Device '%s' is not removable", device);
>> -return;
>> +return -ENOTSUP;
>>  }
>>  
>>  if (!blk_dev_has_tray(blk)) {
>>  /* Ignore this command on tray-less devices */
>> -return;
>> +return ENOSYS;
>>  }
>>  
>>  if (blk_dev_is_tray_open(blk)) {
>> -return;
>> +return 0;
>>  }
>>  
>>  locked = blk_dev_is_medium_locked(blk);
>> @@ -2366,6 +2380,21 @@ void qmp_blockdev_open_tray(const char *device, bool 
>> has_force, bool force,
>>  if (!locked || force) {
>>  blk_dev_change_media_cb(blk, false);
>>  }
>> +
>> +if (locked && !force) {
>> +return EINPROGRESS;
>> +}
>> +
>> +return 0;
>> +}
>> +
>> +void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
>> +Error **errp)
>> +{
>> +if (!has_force) {
>> +force = false;
>> +}
>> +do_open_tray(device, force, errp);
>>  }
>>  
>>  void qmp_blockdev_close_tray(const char *device, Error **errp)

It already got applied, but I can change it to your preference. (Always
return an -errno and an Error, delete-and-free when we don't care about
it...)

--js



[Qemu-block] [PATCH v7 09/15] block: Simplify drive-mirror

2016-05-20 Thread Eric Blake
Now that we can support boxed commands, use it to greatly
reduce the number of parameters (and likelihood of getting
out of sync) when adjusting drive-mirror parameters.

Signed-off-by: Eric Blake 

---
v7: new patch
---
 qapi/block-core.json | 17 -
 blockdev.c   | 72 ++--
 hmp.c| 27 +---
 3 files changed, 59 insertions(+), 57 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 26f7c0e..885a75a 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1108,6 +1108,21 @@
 #
 # Start mirroring a block device's writes to a new destination.
 #
+# See DriveMirror for parameter descriptions
+#
+# Returns: nothing on success
+#  If @device is not a valid block device, DeviceNotFound
+#
+# Since 1.3
+##
+{ 'command': 'drive-mirror', 'box': true,
+  'data': 'DriveMirror' }
+
+##
+# DriveMirror
+#
+# The parameters for the drive-mirror command.
+#
 # @device:  the name of the device whose writes should be mirrored.
 #
 # @target: the target of the new image. If the file exists, or if it
@@ -1159,7 +1174,7 @@
 #
 # Since 1.3
 ##
-{ 'command': 'drive-mirror',
+{ 'struct': 'DriveMirror',
   'data': { 'device': 'str', 'target': 'str', '*format': 'str',
 '*node-name': 'str', '*replaces': 'str',
 'sync': 'MirrorSyncMode', '*mode': 'NewImageMode',
diff --git a/blockdev.c b/blockdev.c
index b8db496..94850fd 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3457,19 +3457,7 @@ static void blockdev_mirror_common(BlockDriverState *bs,
  block_job_cb, bs, errp);
 }

-void qmp_drive_mirror(const char *device, const char *target,
-  bool has_format, const char *format,
-  bool has_node_name, const char *node_name,
-  bool has_replaces, const char *replaces,
-  enum MirrorSyncMode sync,
-  bool has_mode, enum NewImageMode mode,
-  bool has_speed, int64_t speed,
-  bool has_granularity, uint32_t granularity,
-  bool has_buf_size, int64_t buf_size,
-  bool has_on_source_error, BlockdevOnError 
on_source_error,
-  bool has_on_target_error, BlockdevOnError 
on_target_error,
-  bool has_unmap, bool unmap,
-  Error **errp)
+void qmp_drive_mirror(DriveMirror *arg, Error **errp)
 {
 BlockDriverState *bs;
 BlockBackend *blk;
@@ -3480,11 +3468,12 @@ void qmp_drive_mirror(const char *device, const char 
*target,
 int flags;
 int64_t size;
 int ret;
+const char *format = arg->format;

-blk = blk_by_name(device);
+blk = blk_by_name(arg->device);
 if (!blk) {
 error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
-  "Device '%s' not found", device);
+  "Device '%s' not found", arg->device);
 return;
 }

@@ -3492,24 +3481,25 @@ void qmp_drive_mirror(const char *device, const char 
*target,
 aio_context_acquire(aio_context);

 if (!blk_is_available(blk)) {
-error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, device);
+error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, arg->device);
 goto out;
 }
 bs = blk_bs(blk);
-if (!has_mode) {
-mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS;
+if (!arg->has_mode) {
+arg->mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS;
 }

-if (!has_format) {
-format = mode == NEW_IMAGE_MODE_EXISTING ? NULL : bs->drv->format_name;
+if (!arg->has_format) {
+format = (arg->mode == NEW_IMAGE_MODE_EXISTING
+  ? NULL : bs->drv->format_name);
 }

 flags = bs->open_flags | BDRV_O_RDWR;
 source = backing_bs(bs);
-if (!source && sync == MIRROR_SYNC_MODE_TOP) {
-sync = MIRROR_SYNC_MODE_FULL;
+if (!source && arg->sync == MIRROR_SYNC_MODE_TOP) {
+arg->sync = MIRROR_SYNC_MODE_FULL;
 }
-if (sync == MIRROR_SYNC_MODE_NONE) {
+if (arg->sync == MIRROR_SYNC_MODE_NONE) {
 source = bs;
 }

@@ -3519,18 +3509,18 @@ void qmp_drive_mirror(const char *device, const char 
*target,
 goto out;
 }

-if (has_replaces) {
+if (arg->has_replaces) {
 BlockDriverState *to_replace_bs;
 AioContext *replace_aio_context;
 int64_t replace_size;

-if (!has_node_name) {
+if (!arg->has_node_name) {
 error_setg(errp, "a node-name must be provided when replacing a"
  " named node of the graph");
 goto out;
 }

-to_replace_bs = check_to_replace_node(bs, replaces, &local_err);
+to_replace_bs = check_to_replace_node(bs, arg->replaces, &local_err);

 if (!to_replace_bs) {
 error_propagate(errp, local_err);
@@ -3549,20 +3539,20 @@ void qmp_drive_mirror(const char *device, const char 
*target,
 }
 

[Qemu-block] [PATCH v7 08/15] block: Simplify block_set_io_throttle

2016-05-20 Thread Eric Blake
Now that we can support boxed commands, use it to greatly
reduce the number of parameters (and likelihood of getting
out of sync) when adjusting throttle parameters.

Signed-off-by: Eric Blake 

---
v7: new patch
---
 qapi/block-core.json |  20 --
 blockdev.c   | 111 +++
 hmp.c|  45 +
 3 files changed, 66 insertions(+), 110 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 98a20d2..26f7c0e 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -1312,6 +1312,21 @@
 # the device will be removed from its group and the rest of its
 # members will not be affected. The 'group' parameter is ignored.
 #
+# See BlockIOThrottle for parameter descriptions.
+#
+# Returns: Nothing on success
+#  If @device is not a valid block device, DeviceNotFound
+#
+# Since: 1.1
+##
+{ 'command': 'block_set_io_throttle', 'box': true,
+  'data': 'BlockIOThrottle' }
+
+##
+# BlockIOThrottle
+#
+# The parameters for the block_set_io_throttle command.
+#
 # @device: The name of the device
 #
 # @bps: total throughput limit in bytes per second
@@ -1378,12 +1393,9 @@
 #
 # @group: #optional throttle group name (Since 2.4)
 #
-# Returns: Nothing on success
-#  If @device is not a valid block device, DeviceNotFound
-#
 # Since: 1.1
 ##
-{ 'command': 'block_set_io_throttle',
+{ 'struct': 'BlockIOThrottle',
   'data': { 'device': 'str', 'bps': 'int', 'bps_rd': 'int', 'bps_wr': 'int',
 'iops': 'int', 'iops_rd': 'int', 'iops_wr': 'int',
 '*bps_max': 'int', '*bps_rd_max': 'int',
diff --git a/blockdev.c b/blockdev.c
index cf5afa3..b8db496 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2625,49 +2625,17 @@ fail:
 }

 /* throttling disk I/O limits */
-void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
-   int64_t bps_wr,
-   int64_t iops,
-   int64_t iops_rd,
-   int64_t iops_wr,
-   bool has_bps_max,
-   int64_t bps_max,
-   bool has_bps_rd_max,
-   int64_t bps_rd_max,
-   bool has_bps_wr_max,
-   int64_t bps_wr_max,
-   bool has_iops_max,
-   int64_t iops_max,
-   bool has_iops_rd_max,
-   int64_t iops_rd_max,
-   bool has_iops_wr_max,
-   int64_t iops_wr_max,
-   bool has_bps_max_length,
-   int64_t bps_max_length,
-   bool has_bps_rd_max_length,
-   int64_t bps_rd_max_length,
-   bool has_bps_wr_max_length,
-   int64_t bps_wr_max_length,
-   bool has_iops_max_length,
-   int64_t iops_max_length,
-   bool has_iops_rd_max_length,
-   int64_t iops_rd_max_length,
-   bool has_iops_wr_max_length,
-   int64_t iops_wr_max_length,
-   bool has_iops_size,
-   int64_t iops_size,
-   bool has_group,
-   const char *group, Error **errp)
+void qmp_block_set_io_throttle(BlockIOThrottle *arg, Error **errp)
 {
 ThrottleConfig cfg;
 BlockDriverState *bs;
 BlockBackend *blk;
 AioContext *aio_context;

-blk = blk_by_name(device);
+blk = blk_by_name(arg->device);
 if (!blk) {
 error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
-  "Device '%s' not found", device);
+  "Device '%s' not found", arg->device);
 return;
 }

@@ -2676,59 +2644,59 @@ void qmp_block_set_io_throttle(const char *device, 
int64_t bps, int64_t bps_rd,

 bs = blk_bs(blk);
 if (!bs) {
-error_setg(errp, "Device '%s' has no medium", device);
+error_setg(errp, "Device '%s' has no medium", arg->device);
 goto out;
 }

 throttle_config_init(&cfg);
-cfg.buckets[THROTTLE_BPS_TOTAL].avg = bps;
-cfg.buckets[THROTTLE_BPS_READ].avg  = bps_rd;
-cfg.buckets[THROTTLE_BPS_WRITE].avg = bps_wr;
+cfg.buckets[THROTTLE_BPS_TOTAL].avg = arg->bps;
+cfg.buckets[THROTTLE_BPS_READ].avg  = arg->bps_rd;
+cfg.buckets[THROTTLE_BPS_WRITE].avg = arg->bps_wr;

-cfg.buckets[THROTTLE_OPS_TOTAL].avg = iops;
-cfg.buckets[THROTTLE_OPS_READ].avg  = iops_rd;
-cfg.buckets[THROTTLE_OPS_WRITE].avg = iops_wr;
+cfg.buckets[THROTTLE_OPS_TOTAL].avg = arg->iops;
+cfg.buckets[THROTTLE_OPS_READ].avg  = arg->iops

[Qemu-block] [PATCH v3] block: Fix bdrv_next() memory leak

2016-05-20 Thread Kevin Wolf
The bdrv_next() users all leaked the BdrvNextIterator after completing
the iteration. Simply changing bdrv_next() to free the iterator before
returning NULL at the end of list doesn't work because some callers exit
the loop before looking at all BDSes.

This patch moves the BdrvNextIterator from the heap to the stack of
the caller and switches to a bdrv_first()/bdrv_next() interface for
initialising the iterator.

Signed-off-by: Kevin Wolf 
---

v3:
- block/snapshot.c: With the change to a for loop, bdrv_next() is executed
  unconditionally, even if the exit condition is true. We now need to
  explicitly jump out of the loop if bs should stay the same.

 block.c   | 18 -
 block/block-backend.c | 41 +-
 block/io.c| 10 --
 block/snapshot.c  | 55 ++-
 blockdev.c|  4 ++--
 include/block/block.h | 15 --
 migration/block.c |  4 ++--
 monitor.c |  4 ++--
 qmp.c |  5 ++---
 9 files changed, 92 insertions(+), 64 deletions(-)

diff --git a/block.c b/block.c
index 1205ef8..d287771 100644
--- a/block.c
+++ b/block.c
@@ -3195,9 +3195,9 @@ void bdrv_invalidate_cache_all(Error **errp)
 {
 BlockDriverState *bs;
 Error *local_err = NULL;
-BdrvNextIterator *it = NULL;
+BdrvNextIterator it;
 
-while ((it = bdrv_next(it, &bs)) != NULL) {
+for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 AioContext *aio_context = bdrv_get_aio_context(bs);
 
 aio_context_acquire(aio_context);
@@ -3239,11 +3239,11 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs,
 int bdrv_inactivate_all(void)
 {
 BlockDriverState *bs = NULL;
-BdrvNextIterator *it = NULL;
+BdrvNextIterator it;
 int ret = 0;
 int pass;
 
-while ((it = bdrv_next(it, &bs)) != NULL) {
+for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 aio_context_acquire(bdrv_get_aio_context(bs));
 }
 
@@ -3252,8 +3252,7 @@ int bdrv_inactivate_all(void)
  * the second pass sets the BDRV_O_INACTIVE flag so that no further write
  * is allowed. */
 for (pass = 0; pass < 2; pass++) {
-it = NULL;
-while ((it = bdrv_next(it, &bs)) != NULL) {
+for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 ret = bdrv_inactivate_recurse(bs, pass);
 if (ret < 0) {
 goto out;
@@ -3262,8 +3261,7 @@ int bdrv_inactivate_all(void)
 }
 
 out:
-it = NULL;
-while ((it = bdrv_next(it, &bs)) != NULL) {
+for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 aio_context_release(bdrv_get_aio_context(bs));
 }
 
@@ -3753,10 +3751,10 @@ bool bdrv_recurse_is_first_non_filter(BlockDriverState 
*bs,
 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
 {
 BlockDriverState *bs;
-BdrvNextIterator *it = NULL;
+BdrvNextIterator it;
 
 /* walk down the bs forest recursively */
-while ((it = bdrv_next(it, &bs)) != NULL) {
+for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 bool perm;
 
 /* try to recurse in this top level bs */
diff --git a/block/block-backend.c b/block/block-backend.c
index 6928d61..9f306f8 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -286,25 +286,11 @@ BlockBackend *blk_next(BlockBackend *blk)
: QTAILQ_FIRST(&monitor_block_backends);
 }
 
-struct BdrvNextIterator {
-enum {
-BDRV_NEXT_BACKEND_ROOTS,
-BDRV_NEXT_MONITOR_OWNED,
-} phase;
-BlockBackend *blk;
-BlockDriverState *bs;
-};
-
 /* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
  * the monitor or attached to a BlockBackend */
-BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs)
+BlockDriverState *bdrv_next(BdrvNextIterator *it)
 {
-if (!it) {
-it = g_new(BdrvNextIterator, 1);
-*it = (BdrvNextIterator) {
-.phase = BDRV_NEXT_BACKEND_ROOTS,
-};
-}
+BlockDriverState *bs;
 
 /* First, return all root nodes of BlockBackends. In order to avoid
  * returning a BDS twice when multiple BBs refer to it, we only return it
@@ -312,11 +298,11 @@ BdrvNextIterator *bdrv_next(BdrvNextIterator *it, 
BlockDriverState **bs)
 if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
 do {
 it->blk = blk_all_next(it->blk);
-*bs = it->blk ? blk_bs(it->blk) : NULL;
-} while (it->blk && (*bs == NULL || bdrv_first_blk(*bs) != it->blk));
+bs = it->blk ? blk_bs(it->blk) : NULL;
+} while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
 
-if (*bs) {
-return it;
+if (bs) {
+return bs;
 }
 it->phase = BDRV_NEXT_MONITOR_OWNED;
 }
@@ -326,10 +312,19 @@ BdrvNextIterator *bdrv_next(BdrvNextIterator *it, 
BlockDriverState **bs)
  * by the above block already */
 do 

[Qemu-block] [PATCH v2] block: Fix bdrv_next() memory leak

2016-05-20 Thread Kevin Wolf
The bdrv_next() users all leaked the BdrvNextIterator after completing
the iteration. Simply changing bdrv_next() to free the iterator before
returning NULL at the end of list doesn't work because some callers exit
the loop before looking at all BDSes.

This patch moves the BdrvNextIterator from the heap to the stack of
the caller and switches to a bdrv_first()/bdrv_next() interface for
initialising the iterator.

Signed-off-by: Kevin Wolf 
---
 block.c   | 18 --
 block/block-backend.c | 41 ++---
 block/io.c| 10 --
 block/snapshot.c  | 24 
 blockdev.c|  4 ++--
 include/block/block.h | 15 +--
 migration/block.c |  4 ++--
 monitor.c |  4 ++--
 qmp.c |  5 ++---
 9 files changed, 63 insertions(+), 62 deletions(-)

diff --git a/block.c b/block.c
index 1205ef8..d287771 100644
--- a/block.c
+++ b/block.c
@@ -3195,9 +3195,9 @@ void bdrv_invalidate_cache_all(Error **errp)
 {
 BlockDriverState *bs;
 Error *local_err = NULL;
-BdrvNextIterator *it = NULL;
+BdrvNextIterator it;
 
-while ((it = bdrv_next(it, &bs)) != NULL) {
+for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 AioContext *aio_context = bdrv_get_aio_context(bs);
 
 aio_context_acquire(aio_context);
@@ -3239,11 +3239,11 @@ static int bdrv_inactivate_recurse(BlockDriverState *bs,
 int bdrv_inactivate_all(void)
 {
 BlockDriverState *bs = NULL;
-BdrvNextIterator *it = NULL;
+BdrvNextIterator it;
 int ret = 0;
 int pass;
 
-while ((it = bdrv_next(it, &bs)) != NULL) {
+for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 aio_context_acquire(bdrv_get_aio_context(bs));
 }
 
@@ -3252,8 +3252,7 @@ int bdrv_inactivate_all(void)
  * the second pass sets the BDRV_O_INACTIVE flag so that no further write
  * is allowed. */
 for (pass = 0; pass < 2; pass++) {
-it = NULL;
-while ((it = bdrv_next(it, &bs)) != NULL) {
+for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 ret = bdrv_inactivate_recurse(bs, pass);
 if (ret < 0) {
 goto out;
@@ -3262,8 +3261,7 @@ int bdrv_inactivate_all(void)
 }
 
 out:
-it = NULL;
-while ((it = bdrv_next(it, &bs)) != NULL) {
+for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 aio_context_release(bdrv_get_aio_context(bs));
 }
 
@@ -3753,10 +3751,10 @@ bool bdrv_recurse_is_first_non_filter(BlockDriverState 
*bs,
 bool bdrv_is_first_non_filter(BlockDriverState *candidate)
 {
 BlockDriverState *bs;
-BdrvNextIterator *it = NULL;
+BdrvNextIterator it;
 
 /* walk down the bs forest recursively */
-while ((it = bdrv_next(it, &bs)) != NULL) {
+for (bs = bdrv_first(&it); bs; bs = bdrv_next(&it)) {
 bool perm;
 
 /* try to recurse in this top level bs */
diff --git a/block/block-backend.c b/block/block-backend.c
index 6928d61..9f306f8 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -286,25 +286,11 @@ BlockBackend *blk_next(BlockBackend *blk)
: QTAILQ_FIRST(&monitor_block_backends);
 }
 
-struct BdrvNextIterator {
-enum {
-BDRV_NEXT_BACKEND_ROOTS,
-BDRV_NEXT_MONITOR_OWNED,
-} phase;
-BlockBackend *blk;
-BlockDriverState *bs;
-};
-
 /* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
  * the monitor or attached to a BlockBackend */
-BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs)
+BlockDriverState *bdrv_next(BdrvNextIterator *it)
 {
-if (!it) {
-it = g_new(BdrvNextIterator, 1);
-*it = (BdrvNextIterator) {
-.phase = BDRV_NEXT_BACKEND_ROOTS,
-};
-}
+BlockDriverState *bs;
 
 /* First, return all root nodes of BlockBackends. In order to avoid
  * returning a BDS twice when multiple BBs refer to it, we only return it
@@ -312,11 +298,11 @@ BdrvNextIterator *bdrv_next(BdrvNextIterator *it, 
BlockDriverState **bs)
 if (it->phase == BDRV_NEXT_BACKEND_ROOTS) {
 do {
 it->blk = blk_all_next(it->blk);
-*bs = it->blk ? blk_bs(it->blk) : NULL;
-} while (it->blk && (*bs == NULL || bdrv_first_blk(*bs) != it->blk));
+bs = it->blk ? blk_bs(it->blk) : NULL;
+} while (it->blk && (bs == NULL || bdrv_first_blk(bs) != it->blk));
 
-if (*bs) {
-return it;
+if (bs) {
+return bs;
 }
 it->phase = BDRV_NEXT_MONITOR_OWNED;
 }
@@ -326,10 +312,19 @@ BdrvNextIterator *bdrv_next(BdrvNextIterator *it, 
BlockDriverState **bs)
  * by the above block already */
 do {
 it->bs = bdrv_next_monitor_owned(it->bs);
-*bs = it->bs;
-} while (*bs && bdrv_has_blk(*bs));
+bs = it->bs;
+} while (bs && bdrv_has_blk(bs));
+
+return bs;
+}
+
+BlockDriverState *bdrv_first(Bdrv

Re: [Qemu-block] [Qemu-devel] [PATCH v2 1/1] block: clarify error message for qmp-eject

2016-05-20 Thread Markus Armbruster
John Snow  writes:

> If you use HMP's eject but the CDROM tray is locked, you may get a
> confusing error message informing you that the "tray isn't open."
>
> As this is the point of eject, we can do a little better and help
> clarify that the tray was locked and that it (might) open up later,
> so try again.
>
> It's not ideal, but it makes the semantics of the (legacy) eject
> command more understandable to end users when they try to use it.
>
> Signed-off-by: John Snow 
[...]
> @@ -2327,35 +2340,36 @@ void qmp_block_passwd(bool has_device, const char 
> *device,
>  aio_context_release(aio_context);
>  }
>  
> -void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
> -Error **errp)
> +/**
> + * returns -errno on fatal error, +errno for non-fatal situations.
> + * errp will always be set when the return code is negative.
> + * May return +ENOSYS if the device has no tray,
> + * or +EINPROGRESS if the tray is locked and the guest has been notified.
> + */

Returning or testing for positive errno instead of a negative one is a
fairly common error.  The more we can restrict use of positive errno
codes to errno itself, the less likely such errors are.

Moreover, I feel fatal vs. non-fatal is not for this function to decide.
It's the caller's business.  I'd return -errno on any error.  If you
need this function to also set an error, because it can do a better job
than its callers, then set it on any error.  If a caller wants to
suppress a certain error, it can simply free the error.  Clean
separation of concerns, and a simpler interface.

> +static int do_open_tray(const char *device, bool force, Error **errp)
>  {
>  BlockBackend *blk;
>  bool locked;
>  
> -if (!has_force) {
> -force = false;
> -}
> -
>  blk = blk_by_name(device);
>  if (!blk) {
>  error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
>"Device '%s' not found", device);
> -return;
> +return -ENODEV;
>  }
>  
>  if (!blk_dev_has_removable_media(blk)) {
>  error_setg(errp, "Device '%s' is not removable", device);
> -return;
> +return -ENOTSUP;
>  }
>  
>  if (!blk_dev_has_tray(blk)) {
>  /* Ignore this command on tray-less devices */
> -return;
> +return ENOSYS;
>  }
>  
>  if (blk_dev_is_tray_open(blk)) {
> -return;
> +return 0;
>  }
>  
>  locked = blk_dev_is_medium_locked(blk);
> @@ -2366,6 +2380,21 @@ void qmp_blockdev_open_tray(const char *device, bool 
> has_force, bool force,
>  if (!locked || force) {
>  blk_dev_change_media_cb(blk, false);
>  }
> +
> +if (locked && !force) {
> +return EINPROGRESS;
> +}
> +
> +return 0;
> +}
> +
> +void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
> +Error **errp)
> +{
> +if (!has_force) {
> +force = false;
> +}
> +do_open_tray(device, force, errp);
>  }
>  
>  void qmp_blockdev_close_tray(const char *device, Error **errp)



Re: [Qemu-block] [PULL 21/31] block: Avoid bs->blk in bdrv_next()

2016-05-20 Thread Paolo Bonzini


On 20/05/2016 12:26, Kevin Wolf wrote:
> Hm, we have a few instances where an iterator variable is used for
> multiple loops, so we need to be able to use it in an assignment, i.e.
> it should be a compound literal. On the other hand, I seem to remember
> that compound literals can't be used as initialisers.
> 
> Maybe a bdrv_next_iterator_reset() function then? Which would be like
> first/next, except that it doesn't return the first value yet.

What qemu/queue.h does is is provide both FOO_INITIALIZER and
FOO_INIT(&some_foo).

Thanks,

Paolo



Re: [Qemu-block] [PULL 21/31] block: Avoid bs->blk in bdrv_next()

2016-05-20 Thread Kevin Wolf
Am 20.05.2016 um 11:39 hat Paolo Bonzini geschrieben:
> 
> 
> On 20/05/2016 10:10, Kevin Wolf wrote:
> >> > Already posted a fix. I chose to keep the interface and free the
> >> > BdrvNextIterator inside bdrv_next(), when we return NULL after the last
> >> > element.
> > Oops, should have actually read your email... You're right about callers
> > that prematurely exit the loop, of course.
> > 
> > I still don't really like first/next interfaces, though. Perhaps start
> > the iteration with bs == NULL instead of it == NULL?
> 
> Yet another alternative is to add a BDRV_NEXT_ITERATOR_INITIALIZER
> macro.  I like it because it's less magic than "x is NULL" and because I
> would prefer an interface with just the BdrvNextIterator* as the
> argument to bdrv_next.

Hm, we have a few instances where an iterator variable is used for
multiple loops, so we need to be able to use it in an assignment, i.e.
it should be a compound literal. On the other hand, I seem to remember
that compound literals can't be used as initialisers.

Maybe a bdrv_next_iterator_reset() function then? Which would be like
first/next, except that it doesn't return the first value yet.

Kevin



[Qemu-block] [PATCH V2] block/iscsi: allow caching of the allocation map

2016-05-20 Thread Peter Lieven
until now the allocation map was used only as a hint if a cluster
is allocated or not. If a block was not allocated (or Qemu had
no info about the allocation status) a get_block_status call was
issued to check the allocation status and possibly avoid
a subsequent read of unallocated sectors. If a block known to be
allocated the get_block_status call was omitted. In the other case
a get_block_status call was issued before every read to avoid
the necessity for a consistent allocation map. To avoid the
potential overhead of calling get_block_status for each and
every read request this took only place for the bigger requests.

This patch enhances this mechanism to cache the allocation
status and avoid calling get_block_status for blocks where
the allocation status has been queried before. This allows
for bypassing the read request even for smaller requests and
additionally omits calling get_block_status for known to be
unallocated blocks.

Signed-off-by: Peter Lieven 
---
v1->v2: - add more comments [Fam]
- free allocmap if allocation of allocmap_valid fails [Fam]
- fix indent and whitespace errors [Fam]
- account for cache mode changes on reopen
 block/iscsi.c |  228 -
 1 file changed, 179 insertions(+), 49 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 10f3906..e62a9ee 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2,7 +2,7 @@
  * QEMU Block driver for iSCSI images
  *
  * Copyright (c) 2010-2011 Ronnie Sahlberg 
- * Copyright (c) 2012-2015 Peter Lieven 
+ * Copyright (c) 2012-2016 Peter Lieven 
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to 
deal
@@ -62,7 +62,22 @@ typedef struct IscsiLun {
 struct scsi_inquiry_logical_block_provisioning lbp;
 struct scsi_inquiry_block_limits bl;
 unsigned char *zeroblock;
-unsigned long *allocationmap;
+/* The allocationmap tracks which clusters (pages) on the iSCSI target are
+ * allocated and which are not. In case a target returns zeros for unallocated
+ * pages (iscsilun->lprz) we can directly return zeros instead of reading zeros
+ * over the wire if a read request falls within an unallocated block. As there
+ * are 3 possible states we need 2 bitmaps to track. allocmap_valid keeps track
+ * if Qemus information about a page is valid. allocmap tracks if a page is
+ * allocated or not. In case Qemus has no valid information about a page the
+ * corresponding allocmap entry should be switched to unallocated as well to
+ * force a new lookup of the allocation status as lookups are generally skipped
+ * if a page is suspect to be allocated. If a iSCSI target is opened with
+ * cache.direct = on the allocmap_valid does not exist turning all cached
+ * information invalid so that a fresh lookup is made for any page even if
+ * allocmap entry returns its unallocated. */
+unsigned long *allocmap;
+unsigned long *allocmap_valid;
+long allocmap_size;
 int cluster_sectors;
 bool use_16_for_rw;
 bool write_protected;
@@ -415,37 +430,132 @@ static bool is_request_lun_aligned(int64_t sector_num, 
int nb_sectors,
 return 1;
 }
 
-static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun)
+static void iscsi_allocmap_free(IscsiLun *iscsilun)
 {
-return bitmap_try_new(DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks,
-   iscsilun),
-   iscsilun->cluster_sectors));
+g_free(iscsilun->allocmap);
+g_free(iscsilun->allocmap_valid);
+iscsilun->allocmap = NULL;
+iscsilun->allocmap_valid = NULL;
 }
 
-static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num,
-int nb_sectors)
+
+static int iscsi_allocmap_init(IscsiLun *iscsilun, int open_flags)
 {
-if (iscsilun->allocationmap == NULL) {
-return;
+iscsi_allocmap_free(iscsilun);
+
+iscsilun->allocmap_size =
+DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, iscsilun),
+ iscsilun->cluster_sectors);
+
+iscsilun->allocmap = bitmap_try_new(iscsilun->allocmap_size);
+if (!iscsilun->allocmap) {
+return -ENOMEM;
+}
+
+if (open_flags & BDRV_O_NOCACHE) {
+/* in case that cache.direct = on all allocmap entries are
+ * threated as invalid to force a relookup of the block
+ * status on every read request */
+return 0;
+}
+
+iscsilun->allocmap_valid = bitmap_try_new(iscsilun->allocmap_size);
+if (!iscsilun->allocmap_valid) {
+/* if we are under memory pressure free the allocmap as well */
+iscsi_allocmap_free(iscsilun);
+return -ENOMEM;
 }
-bitmap_set(iscsilun->allocationmap,
-   sector_num / iscsilun->cluster_sectors,
-   DIV_ROUND_UP(nb_sectors, iscsilun->cluster_se

Re: [Qemu-block] [PULL 21/31] block: Avoid bs->blk in bdrv_next()

2016-05-20 Thread Paolo Bonzini


On 20/05/2016 10:10, Kevin Wolf wrote:
>> > Already posted a fix. I chose to keep the interface and free the
>> > BdrvNextIterator inside bdrv_next(), when we return NULL after the last
>> > element.
> Oops, should have actually read your email... You're right about callers
> that prematurely exit the loop, of course.
> 
> I still don't really like first/next interfaces, though. Perhaps start
> the iteration with bs == NULL instead of it == NULL?

Yet another alternative is to add a BDRV_NEXT_ITERATOR_INITIALIZER
macro.  I like it because it's less magic than "x is NULL" and because I
would prefer an interface with just the BdrvNextIterator* as the
argument to bdrv_next.

Thanks,

Paolo



[Qemu-block] Overflow in Virtio-BLK and SCSI Requests?

2016-05-20 Thread Peter Lieven
Hi,

while working at the iSCSI code in Qemu I came across the following line in 
iscsi_aio_ioctl

memcpy(&acb->task->cdb[0], acb->ioh->cmdp, acb->ioh->cmd_len);

Is there anything to ensure that the cmd_len is valid when the requests is e.g. 
coming in via
virtio_blk_handle_scsi ?

It seems that virtio-scsi does not allow to pass ioctls directly from Guest, 
but at least virtio-blk
does. And in virtio-blk it seems the data is blindly copied from 
elem->out_sg[1]. So it would
be possible to overflow the acb->task->cdb. Or am I wrong here?

Peter



Re: [Qemu-block] [PATCH] block: Fix memory leak in bdrv_next()

2016-05-20 Thread Kevin Wolf
Am 20.05.2016 um 09:57 hat Kevin Wolf geschrieben:
> When all BDSes have already been iterated and we return NULL, the
> iterator must be freed, too.
> 
> Signed-off-by: Kevin Wolf 

NACK, this doesn't fix cases where the caller exists loop prematurely.

Kevin



Re: [Qemu-block] [PULL 21/31] block: Avoid bs->blk in bdrv_next()

2016-05-20 Thread Kevin Wolf
Am 20.05.2016 um 10:05 hat Kevin Wolf geschrieben:
> Am 20.05.2016 um 09:54 hat Paolo Bonzini geschrieben:
> > > +/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are 
> > > owned by
> > > + * the monitor or attached to a BlockBackend */
> > > +BdrvNextIterator *bdrv_next(BdrvNextIterator *it, BlockDriverState **bs)
> > > +{
> > > +if (!it) {
> > > +it = g_new(BdrvNextIterator, 1);
> > 
> > Hmm, who frees it?  (Especially if the caller exits the loop
> > prematurely, which means you cannot just free it before returning NULL).
> >  I think it's better to:
> > 
> > - allocate the iterator on the stack and make bdrv_next return a BDS *
> > 
> > - and add a bdrv_first function that does this:
> > 
> > > +*it = (BdrvNextIterator) {
> > > +.phase = BDRV_NEXT_BACKEND_ROOTS,
> > > +};
> > 
> > and then returns bdrv_next(it);
> > 
> > - if desirable add a macro that abstracts the calls to bdrv_first and
> > bdrv_next.
> 
> Already posted a fix. I chose to keep the interface and free the
> BdrvNextIterator inside bdrv_next(), when we return NULL after the last
> element.

Oops, should have actually read your email... You're right about callers
that prematurely exit the loop, of course.

I still don't really like first/next interfaces, though. Perhaps start
the iteration with bs == NULL instead of it == NULL?

Kevin



Re: [Qemu-block] [PULL 21/31] block: Avoid bs->blk in bdrv_next()

2016-05-20 Thread Kevin Wolf
Am 20.05.2016 um 09:54 hat Paolo Bonzini geschrieben:
> 
> 
> On 19/05/2016 17:21, Kevin Wolf wrote:
> > We need to introduce a separate BdrvNextIterator struct that can keep
> > more state than just the current BDS in order to avoid using the bs->blk
> > pointer.
> > 
> > Signed-off-by: Kevin Wolf 
> > Reviewed-by: Max Reitz 
> > ---
> >  block.c| 40 +++
> >  block/block-backend.c  | 72 
> > +-
> >  block/io.c | 13 
> >  block/snapshot.c   | 30 +++---
> >  blockdev.c |  3 +-
> >  include/block/block.h  |  3 +-
> >  include/sysemu/block-backend.h |  1 -
> >  migration/block.c  |  4 ++-
> >  monitor.c  |  6 ++--
> >  qmp.c  |  5 ++-
> >  10 files changed, 102 insertions(+), 75 deletions(-)
> > 
> > diff --git a/block.c b/block.c
> > index fd4cf81..91bf431 100644
> > --- a/block.c
> > +++ b/block.c
> > @@ -2870,25 +2870,6 @@ BlockDriverState *bdrv_next_node(BlockDriverState 
> > *bs)
> >  return QTAILQ_NEXT(bs, node_list);
> >  }
> >  
> > -/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned 
> > by
> > - * the monitor or attached to a BlockBackend */
> > -BlockDriverState *bdrv_next(BlockDriverState *bs)
> > -{
> > -if (!bs || bs->blk) {
> > -bs = blk_next_root_bs(bs);
> > -if (bs) {
> > -return bs;
> > -}
> > -}
> > -
> > -/* Ignore all BDSs that are attached to a BlockBackend here; they have 
> > been
> > - * handled by the above block already */
> > -do {
> > -bs = bdrv_next_monitor_owned(bs);
> > -} while (bs && bs->blk);
> > -return bs;
> > -}
> > -
> >  const char *bdrv_get_node_name(const BlockDriverState *bs)
> >  {
> >  return bs->node_name;
> > @@ -3220,10 +3201,11 @@ void bdrv_invalidate_cache(BlockDriverState *bs, 
> > Error **errp)
> >  
> >  void bdrv_invalidate_cache_all(Error **errp)
> >  {
> > -BlockDriverState *bs = NULL;
> > +BlockDriverState *bs;
> >  Error *local_err = NULL;
> > +BdrvNextIterator *it = NULL;
> >  
> > -while ((bs = bdrv_next(bs)) != NULL) {
> > +while ((it = bdrv_next(it, &bs)) != NULL) {
> >  AioContext *aio_context = bdrv_get_aio_context(bs);
> >  
> >  aio_context_acquire(aio_context);
> > @@ -3265,10 +3247,11 @@ static int bdrv_inactivate_recurse(BlockDriverState 
> > *bs,
> >  int bdrv_inactivate_all(void)
> >  {
> >  BlockDriverState *bs = NULL;
> > +BdrvNextIterator *it = NULL;
> >  int ret = 0;
> >  int pass;
> >  
> > -while ((bs = bdrv_next(bs)) != NULL) {
> > +while ((it = bdrv_next(it, &bs)) != NULL) {
> >  aio_context_acquire(bdrv_get_aio_context(bs));
> >  }
> >  
> > @@ -3277,8 +3260,8 @@ int bdrv_inactivate_all(void)
> >   * the second pass sets the BDRV_O_INACTIVE flag so that no further 
> > write
> >   * is allowed. */
> >  for (pass = 0; pass < 2; pass++) {
> > -bs = NULL;
> > -while ((bs = bdrv_next(bs)) != NULL) {
> > +it = NULL;
> > +while ((it = bdrv_next(it, &bs)) != NULL) {
> >  ret = bdrv_inactivate_recurse(bs, pass);
> >  if (ret < 0) {
> >  goto out;
> > @@ -3287,8 +3270,8 @@ int bdrv_inactivate_all(void)
> >  }
> >  
> >  out:
> > -bs = NULL;
> > -while ((bs = bdrv_next(bs)) != NULL) {
> > +it = NULL;
> > +while ((it = bdrv_next(it, &bs)) != NULL) {
> >  aio_context_release(bdrv_get_aio_context(bs));
> >  }
> >  
> > @@ -3781,10 +3764,11 @@ bool 
> > bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
> >   */
> >  bool bdrv_is_first_non_filter(BlockDriverState *candidate)
> >  {
> > -BlockDriverState *bs = NULL;
> > +BlockDriverState *bs;
> > +BdrvNextIterator *it = NULL;
> >  
> >  /* walk down the bs forest recursively */
> > -while ((bs = bdrv_next(bs)) != NULL) {
> > +while ((it = bdrv_next(it, &bs)) != NULL) {
> >  bool perm;
> >  
> >  /* try to recurse in this top level bs */
> > diff --git a/block/block-backend.c b/block/block-backend.c
> > index 9dcac97..7f2eeb0 100644
> > --- a/block/block-backend.c
> > +++ b/block/block-backend.c
> > @@ -75,6 +75,7 @@ static const AIOCBInfo block_backend_aiocb_info = {
> >  };
> >  
> >  static void drive_info_del(DriveInfo *dinfo);
> > +static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
> >  
> >  /* All BlockBackends */
> >  static QTAILQ_HEAD(, BlockBackend) block_backends =
> > @@ -286,28 +287,50 @@ BlockBackend *blk_next(BlockBackend *blk)
> > : QTAILQ_FIRST(&monitor_block_backends);
> >  }
> >  
> > -/*
> > - * Iterates over all BlockDriverStates which are attached to a 
> > BlockBackend.
> > - * This function is for use by bdrv_next().
> > - *
> > - * @bs must be NULL or a BDS that is attached to a BB.
> > - *

Re: [Qemu-block] [PATCH 1/3] blockdev-backup: Don't move target AioContext if it's attached

2016-05-20 Thread Kevin Wolf
Am 18.05.2016 um 10:24 hat Fam Zheng geschrieben:
> If the BDS is attached, it will want to stay on the AioContext where its
> BlockBackend is. Don't call bdrv_set_aio_context in this case.
> 
> Signed-off-by: Fam Zheng 
> ---
>  blockdev.c | 12 +++-
>  1 file changed, 11 insertions(+), 1 deletion(-)
> 
> diff --git a/blockdev.c b/blockdev.c
> index 1892b8e..eb15593 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -3376,8 +3376,18 @@ void do_blockdev_backup(const char *device, const char 
> *target,
>  }
>  target_bs = blk_bs(target_blk);
>  
> +if (bdrv_get_aio_context(target_bs) != aio_context) {
> +if (!target_bs->blk) {

How should this ever happen when we have target_bs = blk_bs(target_blk)
two lines above?

> +/* The target BDS is not attached, we can safely move it to 
> another
> + * AioContext. */
> +bdrv_set_aio_context(target_bs, aio_context);
> +} else {
> +error_setg(errp, "Target is attached to a different thread from "
> + "source.");
> +goto out;
> +}
> +}

Kevin



[Qemu-block] [PATCH] block: Fix memory leak in bdrv_next()

2016-05-20 Thread Kevin Wolf
When all BDSes have already been iterated and we return NULL, the
iterator must be freed, too.

Signed-off-by: Kevin Wolf 
---
 block/block-backend.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/block/block-backend.c b/block/block-backend.c
index 6928d61..c5fb251 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -329,7 +329,12 @@ BdrvNextIterator *bdrv_next(BdrvNextIterator *it, 
BlockDriverState **bs)
 *bs = it->bs;
 } while (*bs && bdrv_has_blk(*bs));
 
-return *bs ? it : NULL;
+if (*bs) {
+return it;
+} else {
+g_free(it);
+return NULL;
+}
 }
 
 /*
-- 
1.8.3.1




Re: [Qemu-block] [PULL 21/31] block: Avoid bs->blk in bdrv_next()

2016-05-20 Thread Paolo Bonzini


On 19/05/2016 17:21, Kevin Wolf wrote:
> We need to introduce a separate BdrvNextIterator struct that can keep
> more state than just the current BDS in order to avoid using the bs->blk
> pointer.
> 
> Signed-off-by: Kevin Wolf 
> Reviewed-by: Max Reitz 
> ---
>  block.c| 40 +++
>  block/block-backend.c  | 72 
> +-
>  block/io.c | 13 
>  block/snapshot.c   | 30 +++---
>  blockdev.c |  3 +-
>  include/block/block.h  |  3 +-
>  include/sysemu/block-backend.h |  1 -
>  migration/block.c  |  4 ++-
>  monitor.c  |  6 ++--
>  qmp.c  |  5 ++-
>  10 files changed, 102 insertions(+), 75 deletions(-)
> 
> diff --git a/block.c b/block.c
> index fd4cf81..91bf431 100644
> --- a/block.c
> +++ b/block.c
> @@ -2870,25 +2870,6 @@ BlockDriverState *bdrv_next_node(BlockDriverState *bs)
>  return QTAILQ_NEXT(bs, node_list);
>  }
>  
> -/* Iterates over all top-level BlockDriverStates, i.e. BDSs that are owned by
> - * the monitor or attached to a BlockBackend */
> -BlockDriverState *bdrv_next(BlockDriverState *bs)
> -{
> -if (!bs || bs->blk) {
> -bs = blk_next_root_bs(bs);
> -if (bs) {
> -return bs;
> -}
> -}
> -
> -/* Ignore all BDSs that are attached to a BlockBackend here; they have 
> been
> - * handled by the above block already */
> -do {
> -bs = bdrv_next_monitor_owned(bs);
> -} while (bs && bs->blk);
> -return bs;
> -}
> -
>  const char *bdrv_get_node_name(const BlockDriverState *bs)
>  {
>  return bs->node_name;
> @@ -3220,10 +3201,11 @@ void bdrv_invalidate_cache(BlockDriverState *bs, 
> Error **errp)
>  
>  void bdrv_invalidate_cache_all(Error **errp)
>  {
> -BlockDriverState *bs = NULL;
> +BlockDriverState *bs;
>  Error *local_err = NULL;
> +BdrvNextIterator *it = NULL;
>  
> -while ((bs = bdrv_next(bs)) != NULL) {
> +while ((it = bdrv_next(it, &bs)) != NULL) {
>  AioContext *aio_context = bdrv_get_aio_context(bs);
>  
>  aio_context_acquire(aio_context);
> @@ -3265,10 +3247,11 @@ static int bdrv_inactivate_recurse(BlockDriverState 
> *bs,
>  int bdrv_inactivate_all(void)
>  {
>  BlockDriverState *bs = NULL;
> +BdrvNextIterator *it = NULL;
>  int ret = 0;
>  int pass;
>  
> -while ((bs = bdrv_next(bs)) != NULL) {
> +while ((it = bdrv_next(it, &bs)) != NULL) {
>  aio_context_acquire(bdrv_get_aio_context(bs));
>  }
>  
> @@ -3277,8 +3260,8 @@ int bdrv_inactivate_all(void)
>   * the second pass sets the BDRV_O_INACTIVE flag so that no further write
>   * is allowed. */
>  for (pass = 0; pass < 2; pass++) {
> -bs = NULL;
> -while ((bs = bdrv_next(bs)) != NULL) {
> +it = NULL;
> +while ((it = bdrv_next(it, &bs)) != NULL) {
>  ret = bdrv_inactivate_recurse(bs, pass);
>  if (ret < 0) {
>  goto out;
> @@ -3287,8 +3270,8 @@ int bdrv_inactivate_all(void)
>  }
>  
>  out:
> -bs = NULL;
> -while ((bs = bdrv_next(bs)) != NULL) {
> +it = NULL;
> +while ((it = bdrv_next(it, &bs)) != NULL) {
>  aio_context_release(bdrv_get_aio_context(bs));
>  }
>  
> @@ -3781,10 +3764,11 @@ bool 
> bdrv_recurse_is_first_non_filter(BlockDriverState *bs,
>   */
>  bool bdrv_is_first_non_filter(BlockDriverState *candidate)
>  {
> -BlockDriverState *bs = NULL;
> +BlockDriverState *bs;
> +BdrvNextIterator *it = NULL;
>  
>  /* walk down the bs forest recursively */
> -while ((bs = bdrv_next(bs)) != NULL) {
> +while ((it = bdrv_next(it, &bs)) != NULL) {
>  bool perm;
>  
>  /* try to recurse in this top level bs */
> diff --git a/block/block-backend.c b/block/block-backend.c
> index 9dcac97..7f2eeb0 100644
> --- a/block/block-backend.c
> +++ b/block/block-backend.c
> @@ -75,6 +75,7 @@ static const AIOCBInfo block_backend_aiocb_info = {
>  };
>  
>  static void drive_info_del(DriveInfo *dinfo);
> +static BlockBackend *bdrv_first_blk(BlockDriverState *bs);
>  
>  /* All BlockBackends */
>  static QTAILQ_HEAD(, BlockBackend) block_backends =
> @@ -286,28 +287,50 @@ BlockBackend *blk_next(BlockBackend *blk)
> : QTAILQ_FIRST(&monitor_block_backends);
>  }
>  
> -/*
> - * Iterates over all BlockDriverStates which are attached to a BlockBackend.
> - * This function is for use by bdrv_next().
> - *
> - * @bs must be NULL or a BDS that is attached to a BB.
> - */
> -BlockDriverState *blk_next_root_bs(BlockDriverState *bs)
> -{
> +struct BdrvNextIterator {
> +enum {
> +BDRV_NEXT_BACKEND_ROOTS,
> +BDRV_NEXT_MONITOR_OWNED,
> +} phase;
>  BlockBackend *blk;
> +BlockDriverState *bs;
> +};
>  
> -if (bs) {
> -assert(bs->blk);
> -blk = bs->blk;
> -} else {
> - 

[Qemu-block] [PATCH v19 06/10] auto complete active commit

2016-05-20 Thread Changlong Xie
From: Wen Congyang 

Auto complete mirror job in background to prevent from
blocking synchronously

Signed-off-by: Wen Congyang 
Signed-off-by: Changlong Xie 
---
 block/mirror.c| 13 +
 blockdev.c|  2 +-
 include/block/block_int.h |  3 ++-
 qemu-img.c|  2 +-
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/block/mirror.c b/block/mirror.c
index b9986d8..385b189 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -801,7 +801,8 @@ static void mirror_start_job(BlockDriverState *bs, 
BlockDriverState *target,
  BlockCompletionFunc *cb,
  void *opaque, Error **errp,
  const BlockJobDriver *driver,
- bool is_none_mode, BlockDriverState *base)
+ bool is_none_mode, BlockDriverState *base,
+ bool auto_complete)
 {
 MirrorBlockJob *s;
 BlockDriverState *replaced_bs;
@@ -850,6 +851,9 @@ static void mirror_start_job(BlockDriverState *bs, 
BlockDriverState *target,
 s->granularity = granularity;
 s->buf_size = ROUND_UP(buf_size, granularity);
 s->unmap = unmap;
+if (auto_complete) {
+s->should_complete = true;
+}
 
 s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
 if (!s->dirty_bitmap) {
@@ -886,14 +890,15 @@ void mirror_start(BlockDriverState *bs, BlockDriverState 
*target,
 mirror_start_job(bs, target, replaces,
  speed, granularity, buf_size,
  on_source_error, on_target_error, unmap, cb, opaque, errp,
- &mirror_job_driver, is_none_mode, base);
+ &mirror_job_driver, is_none_mode, base, false);
 }
 
 void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
  int64_t speed,
  BlockdevOnError on_error,
  BlockCompletionFunc *cb,
- void *opaque, Error **errp)
+ void *opaque, Error **errp,
+ bool auto_complete)
 {
 int64_t length, base_length;
 int orig_base_flags;
@@ -934,7 +939,7 @@ void commit_active_start(BlockDriverState *bs, 
BlockDriverState *base,
 bdrv_ref(base);
 mirror_start_job(bs, base, NULL, speed, 0, 0,
  on_error, on_error, false, cb, opaque, &local_err,
- &commit_active_job_driver, false, base);
+ &commit_active_job_driver, false, base, auto_complete);
 if (local_err) {
 error_propagate(errp, local_err);
 goto error_restore_flags;
diff --git a/blockdev.c b/blockdev.c
index 40e4e6f..90de201 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -3163,7 +3163,7 @@ void qmp_block_commit(const char *device,
 goto out;
 }
 commit_active_start(bs, base_bs, speed, on_error, block_job_cb,
-bs, &local_err);
+bs, &local_err, false);
 } else {
 commit_start(bs, base_bs, top_bs, speed, on_error, block_job_cb, bs,
  has_backing_file ? backing_file : NULL, &local_err);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index b6f4755..4fc60f7 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -653,13 +653,14 @@ void commit_start(BlockDriverState *bs, BlockDriverState 
*base,
  * @cb: Completion function for the job.
  * @opaque: Opaque pointer value passed to @cb.
  * @errp: Error object.
+ * @auto_complete: Auto complete the job.
  *
  */
 void commit_active_start(BlockDriverState *bs, BlockDriverState *base,
  int64_t speed,
  BlockdevOnError on_error,
  BlockCompletionFunc *cb,
- void *opaque, Error **errp);
+ void *opaque, Error **errp, bool auto_complete);
 /*
  * mirror_start:
  * @bs: Block device to operate on.
diff --git a/qemu-img.c b/qemu-img.c
index 4792366..1e04771 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -911,7 +911,7 @@ static int img_commit(int argc, char **argv)
 };
 
 commit_active_start(bs, base_bs, 0, BLOCKDEV_ON_ERROR_REPORT,
-common_block_job_cb, &cbi, &local_err);
+common_block_job_cb, &cbi, &local_err, false);
 if (local_err) {
 goto done;
 }
-- 
1.9.3






[Qemu-block] [PATCH v19 04/10] Link backup into block core

2016-05-20 Thread Changlong Xie
From: Wen Congyang 

Some programs that add a dependency on it will use
the block layer directly.

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Signed-off-by: Changlong Xie 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Jeff Cody 
---
 block/Makefile.objs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/Makefile.objs b/block/Makefile.objs
index 44a5416..fbfe647 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -22,12 +22,12 @@ block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o
 block-obj-$(CONFIG_LIBSSH2) += ssh.o
 block-obj-y += accounting.o dirty-bitmap.o
 block-obj-y += write-threshold.o
+block-obj-y += backup.o
 
 block-obj-y += crypto.o
 
 common-obj-y += stream.o
 common-obj-y += commit.o
-common-obj-y += backup.o
 
 iscsi.o-cflags := $(LIBISCSI_CFLAGS)
 iscsi.o-libs   := $(LIBISCSI_LIBS)
-- 
1.9.3






[Qemu-block] [PATCH v19 07/10] Introduce new APIs to do replication operation

2016-05-20 Thread Changlong Xie
Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Signed-off-by: Changlong Xie 
---
 Makefile.objs|   1 +
 qapi/block-core.json |  13 
 replication.c| 105 ++
 replication.h| 176 +++
 4 files changed, 295 insertions(+)
 create mode 100644 replication.c
 create mode 100644 replication.h

diff --git a/Makefile.objs b/Makefile.objs
index 8f705f6..30d403f 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -15,6 +15,7 @@ block-obj-$(CONFIG_POSIX) += aio-posix.o
 block-obj-$(CONFIG_WIN32) += aio-win32.o
 block-obj-y += block/
 block-obj-y += qemu-io-cmds.o
+block-obj-y += replication.o
 
 block-obj-m = block/
 
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 98a20d2..e56cdf4 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2032,6 +2032,19 @@
 '*read-pattern': 'QuorumReadPattern' } }
 
 ##
+# @ReplicationMode
+#
+# An enumeration of replication modes.
+#
+# @primary: Primary mode, the vm's state will be sent to secondary QEMU.
+#
+# @secondary: Secondary mode, receive the vm's state from primary QEMU.
+#
+# Since: 2.7
+##
+{ 'enum' : 'ReplicationMode', 'data' : [ 'primary', 'secondary' ] }
+
+##
 # @BlockdevOptions
 #
 # Options for creating a block device.  Many options are available for all
diff --git a/replication.c b/replication.c
new file mode 100644
index 000..03f4a2b
--- /dev/null
+++ b/replication.c
@@ -0,0 +1,105 @@
+/*
+ * Replication filter
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2016 FUJITSU LIMITED
+ *
+ * Author:
+ *   Changlong Xie 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "replication.h"
+
+static QLIST_HEAD(, ReplicationState) replication_states;
+
+ReplicationState *replication_new(void *opaque, ReplicationOps *ops)
+{
+ReplicationState *rs;
+
+assert(ops != NULL);
+rs = g_new0(ReplicationState, 1);
+rs->opaque = opaque;
+rs->ops = ops;
+QLIST_INSERT_HEAD(&replication_states, rs, node);
+
+return rs;
+}
+
+void replication_remove(ReplicationState *rs)
+{
+if (rs) {
+QLIST_REMOVE(rs, node);
+g_free(rs);
+}
+}
+
+/*
+ * The caller of the function MUST make sure vm stopped
+ */
+void replication_start_all(ReplicationMode mode, Error **errp)
+{
+ReplicationState *rs, *next;
+Error *local_err = NULL;
+
+QLIST_FOREACH_SAFE(rs, &replication_states, node, next) {
+if (rs->ops && rs->ops->start) {
+rs->ops->start(rs, mode, &local_err);
+}
+if (local_err) {
+   error_propagate(errp, local_err);
+   return;
+}
+}
+}
+
+void replication_do_checkpoint_all(Error **errp)
+{
+ReplicationState *rs, *next;
+Error *local_err = NULL;
+
+QLIST_FOREACH_SAFE(rs, &replication_states, node, next) {
+if (rs->ops && rs->ops->checkpoint) {
+rs->ops->checkpoint(rs, &local_err);
+}
+if (local_err) {
+   error_propagate(errp, local_err);
+   return;
+}
+}
+}
+
+void replication_get_error_all(Error **errp)
+{
+ReplicationState *rs, *next;
+Error *local_err = NULL;
+
+QLIST_FOREACH_SAFE(rs, &replication_states, node, next) {
+if (rs->ops && rs->ops->get_error) {
+rs->ops->get_error(rs, &local_err);
+}
+if (local_err) {
+   error_propagate(errp, local_err);
+   return;
+}
+}
+}
+
+void replication_stop_all(bool failover, Error **errp)
+{
+ReplicationState *rs, *next;
+Error *local_err = NULL;
+
+QLIST_FOREACH_SAFE(rs, &replication_states, node, next) {
+if (rs->ops && rs->ops->stop) {
+rs->ops->stop(rs, failover, &local_err);
+}
+if (local_err) {
+   error_propagate(errp, local_err);
+   return;
+}
+}
+}
diff --git a/replication.h b/replication.h
new file mode 100644
index 000..d9db696
--- /dev/null
+++ b/replication.h
@@ -0,0 +1,176 @@
+/*
+ * Replication filter
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2016 FUJITSU LIMITED
+ *
+ * Author:
+ *   Changlong Xie 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#ifndef REPLICATION_H
+#define REPLICATION_H
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "sysemu/sysemu.h"
+
+typedef struct ReplicationOps ReplicationOps;
+typedef struct ReplicationState ReplicationState;
+
+/**
+ * SECTION:replication.h
+ * @title:Base Replication System
+ * @short_description: interfaces for handling replication
+ *
+ * The Replication Model provides a framework for handling Replication
+ *
+ * 
+ *   How 

[Qemu-block] [PATCH v19 03/10] Backup: export interfaces for extra serialization

2016-05-20 Thread Changlong Xie
Normal backup(sync='none') workflow:
step 1. NBD peformance I/O write from client to server
   qcow2_co_writev
bdrv_co_writev
 ...
   bdrv_aligned_pwritev
notifier_with_return_list_notify -> backup_do_cow
 bdrv_driver_pwritev // write new contents

step 2. drive-backup sync=none
   backup_do_cow
   {
wait_for_overlapping_requests
cow_request_begin
for(; start < end; start++) {
bdrv_co_readv_no_serialising //read old contents from Secondary disk
bdrv_co_writev // write old contents to hidden-disk
}
cow_request_end
   }

step 3. Then roll back to "step 1" to write new contents to Secondary disk.

And for replication, we must make sure that we only read the old contents from
Secondary disk in order to keep contents consistent.

1) Replication workflow of Secondary
 virtio-blk
  ^
--->  1 NBD   |
   || server   3 replication
   ||^^
   |||   backing backing  |
   ||  Secondary disk 6< hidden-disk 5 < active-disk 4
   ||| ^
   ||'-'
   ||   drive-backup sync=none 2

Hence, we need these interfaces to implement coarse-grained serialization 
between
COW of Secondary disk and the read operation of replication.

Example codes about how to use them:

*#include "block/block_backup.h"

static coroutine_fn int xxx_co_readv()
{
CowRequest req;
BlockJob *job = secondary_disk->bs->job;

if (job) {
  backup_wait_for_overlapping_requests(job, start, end);
  backup_cow_request_begin(&req, job, start, end);
  ret = bdrv_co_readv();
  backup_cow_request_end(&req);
  goto out;
}
ret = bdrv_co_readv();
out:
return ret;
}

Signed-off-by: Changlong Xie 
Signed-off-by: Wen Congyang 
---
 block/backup.c   | 41 ++---
 include/block/block_backup.h | 14 ++
 2 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 93bfd4c..57bcfa3 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -28,13 +28,6 @@
 #define BACKUP_CLUSTER_SIZE_DEFAULT (1 << 16)
 #define SLICE_TIME 1ULL /* ns */
 
-typedef struct CowRequest {
-int64_t start;
-int64_t end;
-QLIST_ENTRY(CowRequest) list;
-CoQueue wait_queue; /* coroutines blocked on this request */
-} CowRequest;
-
 typedef struct BackupBlockJob {
 BlockJob common;
 BlockDriverState *target;
@@ -268,6 +261,40 @@ void backup_do_checkpoint(BlockJob *job, Error **errp)
 bitmap_zero(backup_job->done_bitmap, len);
 }
 
+void backup_wait_for_overlapping_requests(BlockJob *job, int64_t sector_num,
+  int nb_sectors)
+{
+BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
+int64_t sectors_per_cluster = cluster_size_sectors(backup_job);
+int64_t start, end;
+
+assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
+
+start = sector_num / sectors_per_cluster;
+end = DIV_ROUND_UP(sector_num + nb_sectors, sectors_per_cluster);
+wait_for_overlapping_requests(backup_job, start, end);
+}
+
+void backup_cow_request_begin(CowRequest *req, BlockJob *job,
+  int64_t sector_num,
+  int nb_sectors)
+{
+BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
+int64_t sectors_per_cluster = cluster_size_sectors(backup_job);
+int64_t start, end;
+
+assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
+
+start = sector_num / sectors_per_cluster;
+end = DIV_ROUND_UP(sector_num + nb_sectors, sectors_per_cluster);
+cow_request_begin(req, backup_job, start, end);
+}
+
+void backup_cow_request_end(CowRequest *req)
+{
+cow_request_end(req);
+}
+
 static const BlockJobDriver backup_job_driver = {
 .instance_size  = sizeof(BackupBlockJob),
 .job_type   = BLOCK_JOB_TYPE_BACKUP,
diff --git a/include/block/block_backup.h b/include/block/block_backup.h
index 3753bcb..e0e7ce6 100644
--- a/include/block/block_backup.h
+++ b/include/block/block_backup.h
@@ -1,3 +1,17 @@
 #include "block/block_int.h"
 
+typedef struct CowRequest {
+int64_t start;
+int64_t end;
+QLIST_ENTRY(CowRequest) list;
+CoQueue wait_queue; /* coroutines blocked on this request */
+} CowRequest;
+
+void backup_wait_for_overlapping_requests(BlockJob *job, int64_t sector_num,
+  int nb_sectors);
+void backup_cow_request_begin(CowRequest *req, BlockJob *job,
+  int64_t sector_num,
+ 

[Qemu-block] [PATCH v19 08/10] Implement new driver for block replication

2016-05-20 Thread Changlong Xie
From: Wen Congyang 

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Signed-off-by: Changlong Xie 
---
 block/Makefile.objs |   1 +
 block/replication.c | 666 
 2 files changed, 667 insertions(+)
 create mode 100644 block/replication.c

diff --git a/block/Makefile.objs b/block/Makefile.objs
index fbfe647..5e28b45 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -23,6 +23,7 @@ block-obj-$(CONFIG_LIBSSH2) += ssh.o
 block-obj-y += accounting.o dirty-bitmap.o
 block-obj-y += write-threshold.o
 block-obj-y += backup.o
+block-obj-y += replication.o
 
 block-obj-y += crypto.o
 
diff --git a/block/replication.c b/block/replication.c
new file mode 100644
index 000..5228c42
--- /dev/null
+++ b/block/replication.c
@@ -0,0 +1,666 @@
+/*
+ * Replication Block filter
+ *
+ * Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+ * Copyright (c) 2016 Intel Corporation
+ * Copyright (c) 2016 FUJITSU LIMITED
+ *
+ * Author:
+ *   Wen Congyang 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qemu-common.h"
+#include "block/nbd.h"
+#include "block/blockjob.h"
+#include "block/block_int.h"
+#include "block/block_backup.h"
+#include "sysemu/block-backend.h"
+#include "qapi/error.h"
+#include "replication.h"
+
+typedef struct BDRVReplicationState {
+ReplicationMode mode;
+int replication_state;
+BdrvChild *active_disk;
+BdrvChild *hidden_disk;
+BdrvChild *secondary_disk;
+char *top_id;
+ReplicationState *rs;
+Error *blocker;
+int orig_hidden_flags;
+int orig_secondary_flags;
+int error;
+} BDRVReplicationState;
+
+enum {
+BLOCK_REPLICATION_NONE, /* block replication is not started */
+BLOCK_REPLICATION_RUNNING,  /* block replication is running */
+BLOCK_REPLICATION_FAILOVER, /* failover is running in background */
+BLOCK_REPLICATION_FAILOVER_FAILED,  /* failover failed */
+BLOCK_REPLICATION_DONE, /* block replication is done */
+};
+
+static void replication_start(ReplicationState *rs, ReplicationMode mode,
+  Error **errp);
+static void replication_do_checkpoint(ReplicationState *rs, Error **errp);
+static void replication_get_error(ReplicationState *rs, Error **errp);
+static void replication_stop(ReplicationState *rs, bool failover,
+ Error **errp);
+
+#define REPLICATION_MODE"mode"
+#define REPLICATION_TOP_ID  "top-id"
+static QemuOptsList replication_runtime_opts = {
+.name = "replication",
+.head = QTAILQ_HEAD_INITIALIZER(replication_runtime_opts.head),
+.desc = {
+{
+.name = REPLICATION_MODE,
+.type = QEMU_OPT_STRING,
+},
+{
+.name = REPLICATION_TOP_ID,
+.type = QEMU_OPT_STRING,
+},
+{ /* end of list */ }
+},
+};
+
+static ReplicationOps replication_ops = {
+.start = replication_start,
+.checkpoint = replication_do_checkpoint,
+.get_error = replication_get_error,
+.stop = replication_stop,
+};
+
+static int replication_open(BlockDriverState *bs, QDict *options,
+int flags, Error **errp)
+{
+int ret;
+BDRVReplicationState *s = bs->opaque;
+Error *local_err = NULL;
+QemuOpts *opts = NULL;
+const char *mode;
+const char *top_id;
+
+ret = -EINVAL;
+opts = qemu_opts_create(&replication_runtime_opts, NULL, 0, &error_abort);
+qemu_opts_absorb_qdict(opts, options, &local_err);
+if (local_err) {
+goto fail;
+}
+
+mode = qemu_opt_get(opts, REPLICATION_MODE);
+if (!mode) {
+error_setg(&local_err, "Missing the option mode");
+goto fail;
+}
+
+if (!strcmp(mode, "primary")) {
+s->mode = REPLICATION_MODE_PRIMARY;
+} else if (!strcmp(mode, "secondary")) {
+s->mode = REPLICATION_MODE_SECONDARY;
+top_id = qemu_opt_get(opts, REPLICATION_TOP_ID);
+s->top_id = g_strdup(top_id);
+if (!s->top_id) {
+error_setg(&local_err, "Missing the option top-id");
+goto fail;
+}
+} else {
+error_setg(&local_err,
+   "The option mode's value should be primary or secondary");
+goto fail;
+}
+
+s->rs = replication_new(bs, &replication_ops);
+
+ret = 0;
+
+fail:
+qemu_opts_del(opts);
+error_propagate(errp, local_err);
+
+return ret;
+}
+
+static void replication_close(BlockDriverState *bs)
+{
+BDRVReplicationState *s = bs->opaque;
+
+if (s->replication_state == BLOCK_REPLICATION_RUNNING) {
+replication_stop(s->rs, false, NULL);
+}
+
+if (s->mode == REPLICATION_MODE_SECONDARY) {
+g_free(s->top_id);
+}
+
+replication_remove(s->rs);
+}
+
+static int64_t replication_get

[Qemu-block] [PATCH v19 01/10] unblock backup operations in backing file

2016-05-20 Thread Changlong Xie
From: Wen Congyang 

Signed-off-by: Wen Congyang 
Signed-off-by: Changlong Xie 
---
 block.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/block.c b/block.c
index 1205ef8..8c4c2c2 100644
--- a/block.c
+++ b/block.c
@@ -1271,6 +1271,23 @@ void bdrv_set_backing_hd(BlockDriverState *bs, 
BlockDriverState *backing_hd)
 /* Otherwise we won't be able to commit due to check in bdrv_commit */
 bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_COMMIT_TARGET,
 bs->backing_blocker);
+/*
+ * We do backup in 3 ways:
+ * 1. drive backup
+ *The target bs is new opened, and the source is top BDS
+ * 2. blockdev backup
+ *Both the source and the target are top BDSes.
+ * 3. internal backup(used for block replication)
+ *Both the source and the target are backing file
+ *
+ * In case 1 and 2, neither the source nor the target is the backing file.
+ * In case 3, we will block the top BDS, so there is only one block job
+ * for the top BDS and its backing chain.
+ */
+bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_SOURCE,
+bs->backing_blocker);
+bdrv_op_unblock(backing_hd, BLOCK_OP_TYPE_BACKUP_TARGET,
+bs->backing_blocker);
 out:
 bdrv_refresh_limits(bs, NULL);
 }
-- 
1.9.3






[Qemu-block] [PATCH v19 05/10] docs: block replication's description

2016-05-20 Thread Changlong Xie
From: Wen Congyang 

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Signed-off-by: Changlong Xie 
---
 docs/block-replication.txt | 239 +
 1 file changed, 239 insertions(+)
 create mode 100644 docs/block-replication.txt

diff --git a/docs/block-replication.txt b/docs/block-replication.txt
new file mode 100644
index 000..c5fc18b
--- /dev/null
+++ b/docs/block-replication.txt
@@ -0,0 +1,239 @@
+Block replication
+
+Copyright Fujitsu, Corp. 2016
+Copyright (c) 2016 Intel Corporation
+Copyright (c) 2016 HUAWEI TECHNOLOGIES CO., LTD.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.
+See the COPYING file in the top-level directory.
+
+Block replication is used for continuous checkpoints. It is designed
+for COLO (COarse-grain LOck-stepping) where the Secondary VM is running.
+It can also be applied for FT/HA (Fault-tolerance/High Assurance) scenario,
+where the Secondary VM is not running.
+
+This document gives an overview of block replication's design.
+
+== Background ==
+High availability solutions such as micro checkpoint and COLO will do
+consecutive checkpoints. The VM state of the Primary and Secondary VM is
+identical right after a VM checkpoint, but becomes different as the VM
+executes till the next checkpoint. To support disk contents checkpoint,
+the modified disk contents in the Secondary VM must be buffered, and are
+only dropped at next checkpoint time. To reduce the network transportation
+effort during a vmstate checkpoint, the disk modification operations of
+the Primary disk are asynchronously forwarded to the Secondary node.
+
+== Workflow ==
+The following is the image of block replication workflow:
+
++--+++
+|Primary Write Requests||Secondary Write Requests|
++--+++
+  |   |
+  |  (4)
+  |   V
+  |  /-\
+  |  Copy and Forward| |
+  |-(1)--+   | Disk Buffer |
+  |  |   | |
+  | (3)  \-/
+  | speculative  ^
+  |write through(2)
+  |  |   |
+  V  V   |
+   +--+   ++
+   | Primary Disk |   | Secondary Disk |
+   +--+   ++
+
+1) Primary write requests will be copied and forwarded to Secondary
+   QEMU.
+2) Before Primary write requests are written to Secondary disk, the
+   original sector content will be read from Secondary disk and
+   buffered in the Disk buffer, but it will not overwrite the existing
+   sector content (it could be from either "Secondary Write Requests" or
+   previous COW of "Primary Write Requests") in the Disk buffer.
+3) Primary write requests will be written to Secondary disk.
+4) Secondary write requests will be buffered in the Disk buffer and it
+   will overwrite the existing sector content in the buffer.
+
+== Architecture ==
+We are going to implement block replication from many basic
+blocks that are already in QEMU.
+
+ virtio-blk   ||
+ ^||.--
+ |||| Secondary
+1 Quorum  ||'--
+ /  \ ||
+/\||
+   Primary2 filter
+ disk ^
 virtio-blk
+  |
  ^
+3 NBD  --->  3 NBD 
  |
+client|| server
  2 filter
+  ||^  
  ^
+. |||  
  |
+Primary | ||  Secondary disk <- hidden-disk 5 
<- active-disk 4
+' |||  backing^   backing
+  ||| |
+  ||| |
+  ||'-'
+  ||   drive-backup sync=

[Qemu-block] [PATCH v19 10/10] support replication driver in blockdev-add

2016-05-20 Thread Changlong Xie
From: Wen Congyang 

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Signed-off-by: Changlong Xie 
Reviewed-by: Eric Blake 
---
 qapi/block-core.json | 20 ++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index e56cdf4..b9f9839 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -248,6 +248,7 @@
 #   2.3: 'host_floppy' deprecated
 #   2.5: 'host_floppy' dropped
 #   2.6: 'luks' added
+#   2.7: 'replication' added
 #
 # @backing_file: #optional the name of the backing file (for copy-on-write)
 #
@@ -1632,6 +1633,7 @@
 # Drivers that are supported in block device operations.
 #
 # @host_device, @host_cdrom: Since 2.1
+# @replication: Since 2.7
 #
 # Since: 2.0
 ##
@@ -1639,8 +1641,8 @@
   'data': [ 'archipelago', 'blkdebug', 'blkverify', 'bochs', 'cloop',
 'dmg', 'file', 'ftp', 'ftps', 'host_cdrom', 'host_device',
 'http', 'https', 'luks', 'null-aio', 'null-co', 'parallels',
-'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'tftp', 'vdi', 'vhdx',
-'vmdk', 'vpc', 'vvfat' ] }
+'qcow', 'qcow2', 'qed', 'quorum', 'raw', 'replication', 'tftp',
+'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat' ] }
 
 ##
 # @BlockdevOptionsFile
@@ -2045,6 +2047,19 @@
 { 'enum' : 'ReplicationMode', 'data' : [ 'primary', 'secondary' ] }
 
 ##
+# @BlockdevOptionsReplication
+#
+# Driver specific block device options for replication
+#
+# @mode: the replication mode
+#
+# Since: 2.7
+##
+{ 'struct': 'BlockdevOptionsReplication',
+  'base': 'BlockdevOptionsGenericFormat',
+  'data': { 'mode': 'ReplicationMode'  } }
+
+##
 # @BlockdevOptions
 #
 # Options for creating a block device.  Many options are available for all
@@ -2125,6 +2140,7 @@
   'quorum': 'BlockdevOptionsQuorum',
   'raw':'BlockdevOptionsGenericFormat',
 # TODO rbd: Wait for structured options
+  'replication':'BlockdevOptionsReplication',
 # TODO sheepdog: Wait for structured options
 # TODO ssh: Should take InetSocketAddress for 'host'?
   'tftp':   'BlockdevOptionsFile',
-- 
1.9.3






[Qemu-block] [PATCH v19 02/10] Backup: clear all bitmap when doing block checkpoint

2016-05-20 Thread Changlong Xie
From: Wen Congyang 

Signed-off-by: Wen Congyang 
Signed-off-by: zhanghailiang 
Signed-off-by: Gonglei 
Signed-off-by: Changlong Xie 
---
 block/backup.c   | 18 ++
 include/block/block_backup.h |  3 +++
 2 files changed, 21 insertions(+)
 create mode 100644 include/block/block_backup.h

diff --git a/block/backup.c b/block/backup.c
index fec45e8..93bfd4c 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -17,6 +17,7 @@
 #include "block/block.h"
 #include "block/block_int.h"
 #include "block/blockjob.h"
+#include "block/block_backup.h"
 #include "qapi/error.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/ratelimit.h"
@@ -250,6 +251,23 @@ static void backup_abort(BlockJob *job)
 }
 }
 
+void backup_do_checkpoint(BlockJob *job, Error **errp)
+{
+BackupBlockJob *backup_job = container_of(job, BackupBlockJob, common);
+int64_t len;
+
+assert(job->driver->job_type == BLOCK_JOB_TYPE_BACKUP);
+
+if (backup_job->sync_mode != MIRROR_SYNC_MODE_NONE) {
+error_setg(errp, "The backup job only supports block checkpoint in"
+   " sync=none mode");
+return;
+}
+
+len = DIV_ROUND_UP(backup_job->common.len, backup_job->cluster_size);
+bitmap_zero(backup_job->done_bitmap, len);
+}
+
 static const BlockJobDriver backup_job_driver = {
 .instance_size  = sizeof(BackupBlockJob),
 .job_type   = BLOCK_JOB_TYPE_BACKUP,
diff --git a/include/block/block_backup.h b/include/block/block_backup.h
new file mode 100644
index 000..3753bcb
--- /dev/null
+++ b/include/block/block_backup.h
@@ -0,0 +1,3 @@
+#include "block/block_int.h"
+
+void backup_do_checkpoint(BlockJob *job, Error **errp);
-- 
1.9.3






[Qemu-block] [PATCH v19 00/10] Block replication for continuous checkpoints

2016-05-20 Thread Changlong Xie
Block replication is a very important feature which is used for
continuous checkpoints(for example: COLO).

You can get the detailed information about block replication from here:
http://wiki.qemu.org/Features/BlockReplication

Usage:
Please refer to docs/block-replication.txt

You can get the patch here:
https://github.com/Pating/qemu/tree/changlox/block-replication-v19

You can get the patch with framework here:
https://github.com/Pating/qemu/tree/changlox/colo_framework_v18

TODO:
1. Continuous block replication. It will be started after basic functions
   are accepted.

Changs Log:
V19:
1. Rebase to v2.6.0
2. Address comments from stefan
p3: a new patch that export interfaces for extra serialization
p8: 
1. call replication_stop() before freeing s->top_id
2. check top_bs
3. reopen file readonly in error return paths
4. enable extra serialization between read and COW
p9: try to hanlde SIGABRT
V18:
p6: add local_err in all replication callbacks to prevent "errp == NULL"
p7: add missing qemu_iovec_destroy(xxx)
V17:
1. Rebase to the lastest codes 
p2: refactor backup_do_checkpoint addressed comments from Jeff Cody
p4: fix bugs in "drive_add buddy xxx" hmp commands
p6: add "since: 2.7"
p7: fix bug in replication_close(), add missing "qapi/error.h", add 
test-replication 
p8: add "since: 2.7"
V16:
1. Rebase to the newest codes
2. Address comments from Stefan & hailiang
p3: we don't need this patch now
p4: add "top-id" parameters for secondary
p6: fix NULL pointer in replication callbacks, remove unnecessary typedefs, 
add doc comments that explain the semantics of Replication
p7: Refactor AioContext for thread-safe, remove unnecessary get_top_bs()
*Note*: I'm working on replication testcase now, will send out in V17
V15:
1. Rebase to the newest codes
2. Fix typos and coding style addresed Eric's comments
3. Address Stefan's comments
   1) Make backup_do_checkpoint public, drop the changes on BlockJobDriver
   2) Update the message and description for [PATCH 4/9]
   3) Make replication_(start/stop/do_checkpoint)_all as global interfaces
   4) Introduce AioContext lock to protect start/stop/do_checkpoint callbacks
   5) Use BdrvChild instead of holding on to BlockDriverState * pointers
4. Clear BDRV_O_INACTIVE for hidden disk's open_flags since commit 09e0c771  
5. Introduce replication_get_error_all to check replication status
6. Remove useless discard interface
V14:
1. Implement auto complete active commit
2. Implement active commit block job for replication.c
3. Address the comments from Stefan, add replication-specific API and data
   structure, also remove old block layer APIs
V13:
1. Rebase to the newest codes
2. Remove redundant marcos and semicolon in replication.c 
3. Fix typos in block-replication.txt
V12:
1. Rebase to the newest codes
2. Use backing reference to replcace 'allow-write-backing-file'
V11:
1. Reopen the backing file when starting blcok replication if it is not
   opened in R/W mode
2. Unblock BLOCK_OP_TYPE_BACKUP_SOURCE and BLOCK_OP_TYPE_BACKUP_TARGET
   when opening backing file
3. Block the top BDS so there is only one block job for the top BDS and
   its backing chain.
V10:
1. Use blockdev-remove-medium and blockdev-insert-medium to replace backing
   reference.
2. Address the comments from Eric Blake
V9:
1. Update the error messages
2. Rebase to the newest qemu
3. Split child add/delete support. These patches are sent in another patchset.
V8:
1. Address Alberto Garcia's comments
V7:
1. Implement adding/removing quorum child. Remove the option non-connect.
2. Simplify the backing refrence option according to Stefan Hajnoczi's 
suggestion
V6:
1. Rebase to the newest qemu.
V5:
1. Address the comments from Gong Lei
2. Speed the failover up. The secondary vm can take over very quickly even
   if there are too many I/O requests.
V4:
1. Introduce a new driver replication to avoid touch nbd and qcow2.
V3:
1: use error_setg() instead of error_set()
2. Add a new block job API
3. Active disk, hidden disk and nbd target uses the same AioContext
4. Add a testcase to test new hbitmap API
V2:
1. Redesign the secondary qemu(use image-fleecing)
2. Use Error objects to return error message
3. Address the comments from Max Reitz and Eric Blake

Changlong Xie (3):
  Backup: export interfaces for extra serialization
  Introduce new APIs to do replication operation
  tests: add unit test case for replication

Wen Congyang (7):
  unblock backup operations in backing file
  Backup: clear all bitmap when doing block checkpoint
  Link backup into block core
  docs: block replication's description
  auto complete active commit
  Implement new driver for block replication
  support replication driver in blockdev-add

 Makefile.objs|   1 +
 block.c  |  17 ++
 block/Makefile.objs  |   3 +-
 block/backup.c   |  59 +++-
 block/mirror.c   |  13 +-
 block/replication.c  | 666 +++
 blockdev.c   

[Qemu-block] [PATCH v19 09/10] tests: add unit test case for replication

2016-05-20 Thread Changlong Xie
Signed-off-by: Wen Congyang 
Signed-off-by: Changlong Xie 
---
 tests/.gitignore |   1 +
 tests/Makefile   |   4 +
 tests/test-replication.c | 523 +++
 3 files changed, 528 insertions(+)
 create mode 100644 tests/test-replication.c

diff --git a/tests/.gitignore b/tests/.gitignore
index a06a8ba..d22ab06 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -58,6 +58,7 @@ test-qmp-introspect.[ch]
 test-qmp-marshal.c
 test-qmp-output-visitor
 test-rcu-list
+test-replication
 test-rfifolock
 test-string-input-visitor
 test-string-output-visitor
diff --git a/tests/Makefile b/tests/Makefile
index 9e6..9e6d31d 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -103,6 +103,7 @@ check-unit-y += tests/test-crypto-xts$(EXESUF)
 check-unit-y += tests/test-crypto-block$(EXESUF)
 gcov-files-test-logging-y = tests/test-logging.c
 check-unit-y += tests/test-logging$(EXESUF)
+check-unit-y += tests/test-replication$(EXESUF)
 
 check-block-$(CONFIG_POSIX) += tests/qemu-iotests-quick.sh
 
@@ -448,6 +449,9 @@ tests/test-base64$(EXESUF): tests/test-base64.o \
 
 tests/test-logging$(EXESUF): tests/test-logging.o $(test-util-obj-y)
 
+tests/test-replication$(EXESUF): tests/test-replication.o $(test-util-obj-y) \
+   $(test-block-obj-y)
+
 tests/test-qapi-types.c tests/test-qapi-types.h :\
 $(SRC_PATH)/tests/qapi-schema/qapi-schema-test.json 
$(SRC_PATH)/scripts/qapi-types.py $(qapi-py)
$(call quiet-command,$(PYTHON) $(SRC_PATH)/scripts/qapi-types.py \
diff --git a/tests/test-replication.c b/tests/test-replication.c
new file mode 100644
index 000..e998d46
--- /dev/null
+++ b/tests/test-replication.c
@@ -0,0 +1,523 @@
+/*
+ * Block replication tests
+ *
+ * Copyright (c) 2016 FUJITSU LIMITED
+ * Author: Changlong Xie 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or
+ * later.  See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+
+#include "qapi/error.h"
+#include "replication.h"
+#include "block/block_int.h"
+#include "sysemu/block-backend.h"
+
+#define IMG_SIZE (64 * 1024 * 1024)
+
+/* primary */
+#define P_LOCAL_DISK "/tmp/p_local_disk.XX"
+#define P_COMMAND "driver=replication,mode=primary,node-name=xxx,"\
+  "file.driver=qcow2,file.file.filename="P_LOCAL_DISK
+
+/* secondary */
+#define S_LOCAL_DISK "/tmp/s_local_disk.XX"
+#define S_ACTIVE_DISK "/tmp/s_active_disk.XX"
+#define S_HIDDEN_DISK "/tmp/s_hidden_disk.XX"
+#define S_ID "secondary-id"
+#define S_LOCAL_DISK_ID "secondary-local-disk-id"
+#define S_COMMAND1 "file.filename="S_LOCAL_DISK",driver=qcow2"
+#define S_COMMAND2 "driver=replication,mode=secondary,top-id="S_ID","\
+   "file.driver=qcow2,file.file.filename="S_ACTIVE_DISK","\
+   "file.backing.driver=qcow2,file.backing.file.filename="\
+   ""S_HIDDEN_DISK",file.backing.backing="S_LOCAL_DISK_ID
+
+/* FIXME: steal from blockdev.c */
+QemuOptsList qemu_drive_opts = {
+.name = "drive",
+.head = QTAILQ_HEAD_INITIALIZER(qemu_drive_opts.head),
+.desc = {
+{ /* end of list */ }
+},
+};
+
+static void io_read(BlockDriverState *bs, long pattern, int64_t pattern_offset,
+int64_t pattern_count, int64_t offset, int64_t count,
+bool expect_failed)
+{
+char *buf;
+void *cmp_buf;
+int ret;
+
+/* 1. alloc pattern buffer */
+if (pattern) {
+cmp_buf = g_malloc(pattern_count);
+memset(cmp_buf, pattern, pattern_count);
+}
+
+/* 2. alloc read buffer */
+buf = qemu_blockalign(bs, count);
+memset(buf, 0xab, count);
+
+/* 3. do read */
+ret = bdrv_read(bs, offset >> 9, (uint8_t *)buf, count >> 9);
+
+/* 4. assert and compare buf */
+if (expect_failed) {
+g_assert(ret < 0);
+} else {
+g_assert(ret >= 0);
+if (pattern) {
+g_assert(memcmp(buf + pattern_offset, cmp_buf, pattern_count) <= 
0);
+g_free(cmp_buf);
+}
+}
+g_free(buf);
+}
+
+static void io_write(BlockDriverState *bs, long pattern, int64_t pattern_count,
+ int64_t offset, int64_t count, bool expect_failed)
+{
+void *pattern_buf;
+int ret;
+
+/* 1. alloc pattern buffer */
+if (pattern) {
+pattern_buf = g_malloc(pattern_count);
+memset(pattern_buf, pattern, pattern_count);
+}
+
+/* 2. do write */
+if (pattern) {
+ret = bdrv_write(bs, offset >> 9, (uint8_t *)pattern_buf, count >> 9);
+} else {
+ret = bdrv_write_zeroes(bs, offset >> 9, count >> 9, 0);
+}
+
+/* 3. assert */
+if (expect_failed) {
+g_assert(ret < 0);
+} else {
+g_assert(ret >= 0);
+g_free(pattern_buf);
+}
+}
+
+static void prepare_imgs(void)
+{
+Error *local_err = NULL;
+
+/* Primary */
+bdrv_img_create(P_LOCAL_DISK, "qcow2", NULL, NULL, NULL, IMG_SIZE,
+BD