Re: [Qemu-block] [PATCH v19 08/10] Implement new driver for block replication

2016-06-06 Thread Changlong Xie

On 05/20/2016 03:36 PM, Changlong Xie wrote:

+if (!failover) {
+/*
+ * This BDS will be closed, and the job should be completed
+ * before the BDS is closed, because we will access hidden
+ * disk, secondary disk in backup_job_completed().
+ */
+if (s->secondary_disk->bs->job) {
+block_job_cancel_sync(s->secondary_disk->bs->job);
+}
+secondary_do_checkpoint(s, errp);
+s->replication_state = BLOCK_REPLICATION_DONE;
+aio_context_release(aio_context);
+return;
+}
+
+s->replication_state = BLOCK_REPLICATION_FAILOVER;
+if (s->secondary_disk->bs->job) {
+block_job_cancel(s->secondary_disk->bs->job);
+}


Since commit b6d2e599 "block: Convert block job core to BlockBackend", 
blockjob uses BB instead of bdrv_ref(), this introduces unexpected 
Segmentation fault with COLO.


In the below backtrace, you can see that. During failover, s->target was 
changed to an illegal value "0x1e1e1e1e1e1e1e1e" in bakup_complete.
Then the active commit job what also has a pointer that refer to 
s->target will use this illegal pointer. To avoid this, we should use 
"bloc_job_cancel_sync" to ensure backup job complete synchronously.


% MALLOC_PERTURB_=$(($RANDOM % 255 + 1))
% export MALLOC_PERTURB_
% gdb --args ./tests/test-replication
(gdb) b backup_complete
(gdb) r
(gdb) n
(gdb) n
(gdb) watch s->target
(gdb) c
Old value = (BlockBackend *) 0x55f1d990
New value = (BlockBackend *) 0x1e1e1e1e1e1e1e1e
0x75a811eb in memset () from /lib64/libc.so.6
(gdb) bt
#0  0x75a811eb in memset () from /lib64/libc.so.6
#1  0x75a7500e in _int_free () from /lib64/libc.so.6
#2  0x7705bf7f in g_free () from /lib64/libglib-2.0.so.0
#3  0x5557e924 in block_job_unref (job=0x55f1d630) at 
blockjob.c:124
#4  0x5557e9da in block_job_completed_single 
(job=0x55f1d630) at blockjob.c:143
#5  0x5557ecc8 in block_job_completed (job=0x55f1d630, 
ret=0) at blockjob.c:215
#6  0x555e6d49 in backup_complete (job=0x55f1d630, 
opaque=0x55f1dd50) at block/backup.c:325
#7  0x5557f5f4 in block_job_defer_to_main_loop_bh 
(opaque=0x596e1dc0) at blockjob.c:500

#8  0x555747d7 in aio_bh_call (bh=0x596e1c30) at async.c:66
#9  0x55574899 in aio_bh_poll (ctx=0x55ce2d60) at async.c:94
#10 0x55581d4d in aio_dispatch (ctx=0x55ce2d60) at 
aio-posix.c:308
#11 0x555823cb in aio_poll (ctx=0x55ce2d60, blocking=false) 
at aio-posix.c:479
#12 0x555d639b in bdrv_drain_poll (bs=0x55dec210) at 
block/io.c:190
#13 0x555d6566 in bdrv_drained_begin (bs=0x55dec210) at 
block/io.c:240
#14 0x55577261 in bdrv_child_cb_drained_begin 
(child=0x596e1ab0) at block.c:665
#15 0x555d5e81 in bdrv_parent_drained_begin (bs=0x55f0e130) 
at block/io.c:54
#16 0x555d652b in bdrv_drained_begin (bs=0x55f0e130) at 
block/io.c:232
#17 0x55577261 in bdrv_child_cb_drained_begin 
(child=0x596e1810) at block.c:665
#18 0x555d5e81 in bdrv_parent_drained_begin (bs=0x596d7030) 
at block/io.c:54
#19 0x555d652b in bdrv_drained_begin (bs=0x596d7030) at 
block/io.c:232
#20 0x55577261 in bdrv_child_cb_drained_begin 
(child=0x55df8830) at block.c:665
#21 0x555d5e81 in bdrv_parent_drained_begin (bs=0x55df0bb0) 
at block/io.c:54

#22 0x555d66ee in bdrv_drain_all () at block/io.c:301
#23 0x55579f9f in bdrv_reopen_multiple (bs_queue=0x55f1dd30, 
errp=0x7fffd768) at block.c:1953
#24 0x5557a169 in bdrv_reopen (bs=0x55df0bb0, 
bdrv_flags=24578, errp=0x7fffd8e0) at block.c:1994
#25 0x555d54a7 in commit_active_start (bs=0x55f0e130, 
base=0x55df0bb0, speed=0, on_error=BLOCKDEV_ON_ERROR_REPORT, 
cb=0x555e8b97 ,
opaque=0x55dec210, errp=0x7fffd8e0, auto_complete=true) at 
block/mirror.c:901
#26 0x555e8d98 in replication_stop (rs=0x5746f7b0, 
failover=true, errp=0x7fffd8e0) at block/replication.c:623
#27 0x555873d1 in replication_stop_all (failover=true, 
errp=0x7fffd928) at replication.c:98
#28 0x5557406d in test_secondary_start () at 
tests/test-replication.c:403
#29 0x7707a5e1 in g_test_run_suite_internal () from 
/lib64/libglib-2.0.so.0
#30 0x7707a7a6 in g_test_run_suite_internal () from 
/lib64/libglib-2.0.so.0
#31 0x7707a7a6 in g_test_run_suite_internal () from 
/lib64/libglib-2.0.so.0

#32 0x7707ab1b in g_test_run_suite () from /lib64/libglib-2.0.so.0
#33 0x555746c8 in main (argc=1, argv=0x7fffdda8) at 
tests/test-replication.c:545

(gdb) d breakpoints
(gdb) c
Program received signal SIGSEGV, Segmentation fault.
0x555c7d6c in blk_bs (blk=0x1e1e1e1e1e1e1e1e) at 
block/block-backend.c:389

389 return blk->root ? 

Re: [Qemu-block] [PATCH v19 08/10] Implement new driver for block replication

2016-06-06 Thread Changlong Xie

On 05/20/2016 03:36 PM, Changlong Xie wrote:

+
+/*
+ * Must protect backup target if backup job was stopped/cancelled
+ * unexpectedly
+ */
+bdrv_ref(s->hidden_disk->bs);
+
+backup_start(s->secondary_disk->bs, s->hidden_disk->bs, 0,
+ MIRROR_SYNC_MODE_NONE, NULL, BLOCKDEV_ON_ERROR_REPORT,
+ BLOCKDEV_ON_ERROR_REPORT, backup_job_completed,
+ s, NULL, _err);
+if (local_err) {
+error_propagate(errp, local_err);
+backup_job_cleanup(s);
+bdrv_unref(s->hidden_disk->bs);
+aio_context_release(aio_context);
+return;
+}
+break;
+default:
+aio_context_release(aio_context);
+abort();
+}


commit 5c438bc6 introduce BB for I/O, so we don't need protect backup 
target by ourself now.


-/*
- * Must protect backup target if backup job was stopped/cancelled
- * unexpectedly
- */
-bdrv_ref(s->hidden_disk->bs);
-
 backup_start(s->secondary_disk->bs, s->hidden_disk->bs, 0,
  MIRROR_SYNC_MODE_NONE, NULL, 
BLOCKDEV_ON_ERROR_REPORT,

  BLOCKDEV_ON_ERROR_REPORT, backup_job_completed,
@@ -508,7 +502,6 @@ static void replication_start(ReplicationState *rs, 
ReplicationMode mode,

 if (local_err) {
 error_propagate(errp, local_err);
 backup_job_cleanup(s);
-bdrv_unref(s->hidden_disk->bs);
 aio_context_release(aio_context);
 return;
 }

will update in next version.





Re: [Qemu-block] [PATCH v2 5/5] qcow2: Implement .bdrv_co_pwritev()

2016-06-06 Thread Eric Blake
On 06/06/2016 08:59 AM, Kevin Wolf wrote:
> This changes qcow2 to implement the byte-based .bdrv_co_pwritev
> interface rather than the sector-based old one.
> 
> As preallocation uses the same allocation function as normal writes, and
> the interface of that function needs to be changed, it is converted in
> the same patch.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block/qcow2-cluster.c | 13 
>  block/qcow2.c | 89 
> ---
>  block/qcow2.h |  3 +-
>  trace-events  |  6 ++--
>  4 files changed, 52 insertions(+), 59 deletions(-)
> 

Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 4/5] qcow2: Use bytes instead of sectors for QCowL2Meta

2016-06-06 Thread Eric Blake
On 06/06/2016 08:59 AM, Kevin Wolf wrote:
> In preparation for implementing .bdrv_co_pwritev in qcow2.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block/qcow2-cluster.c | 32 
>  block/qcow2.h | 13 +++--
>  2 files changed, 15 insertions(+), 30 deletions(-)
> 

Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH v2 3/5] qcow2: Make copy_sectors() byte based

2016-06-06 Thread Eric Blake
On 06/06/2016 08:59 AM, Kevin Wolf wrote:
> This will allow copy on write operations where the overwritten part of
> the cluster is not aligned to sector boundaries.
> 
> Also rename the function because it has nothing to do with sectors any
> more.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block/qcow2-cluster.c | 54 
> +--
>  1 file changed, 26 insertions(+), 28 deletions(-)
> 

>  
>  if (bs->encrypted) {
>  Error *err = NULL;
> +int sector = (cluster_offset + offset_in_cluster) >> 
> BDRV_SECTOR_BITS;

Potentially the wrong type...

>  assert(s->cipher);
> -if (qcow2_encrypt_sectors(s, start_sect + n_start,
> -  iov.iov_base, iov.iov_base, n,
> -  true, ) < 0) {
> +assert((offset_in_cluster & BDRV_SECTOR_MASK) == 0);

Why is this one true? If I have a cluster of 4 sectors, why must
offset_in_cluster fall within only the first of those sectors?  Are you
missing a ~, to instead be asserting that offset_in_cluster is
sector-aligned?

> +assert((bytes & ~BDRV_SECTOR_MASK) == 0);

This one looks correct, stating that the number of bytes to copy is a
sector multiple.

> +if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base,
> +  bytes >> BDRV_SECTOR_BITS, true, ) < 
> 0) {

...since encryption allows a 64-bit sector number for the case where the
image is larger than 2T.

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [Qemu-devel] coroutines: block: Co-routine re-entered recursively when migrating disk with iothreads

2016-06-06 Thread Fam Zheng
On Mon, 06/06 14:55, Jason J. Herne wrote:
> > I'll see if I can reproduce it here.
> > 
> > Fam
> > 
> 
> Hi Fam,
> Have you had any luck reproducing this?

No I cannot reproduce so far.




Re: [Qemu-block] [PATCH v2 2/5] qcow2: Implement .bdrv_co_preadv()

2016-06-06 Thread Eric Blake
On 06/06/2016 08:59 AM, Kevin Wolf wrote:
> Reading from qcow2 images is now byte granularity.
> 
> Most of the affected code in qcow2 actually gets simpler with this
> change. The only exception is encryption, which is fixed on 512 bytes
> blocks; in order to keep this working, bs->request_alignment is set for
> encrypted images.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block/qcow2-cluster.c |  27 ++---
>  block/qcow2.c | 108 
> +++---
>  block/qcow2.h |   2 +-
>  3 files changed, 72 insertions(+), 65 deletions(-)
> 

> - * on exit, *num is the number of contiguous sectors we can read.
> + * On exit, *bytes is the number of bytes staring at offset that have the 
> same

s/staring/starting/

(although "staring" does make for a rather interesting thought :)

> @@ -1389,26 +1402,24 @@ static coroutine_fn int 
> qcow2_co_readv(BlockDriverState *bs, int64_t sector_num,
>  
>  qemu_co_mutex_lock(>lock);
>  
> -while (remaining_sectors != 0) {
> +while (bytes != 0) {
>  
>  /* prepare next request */
> -cur_nr_sectors = remaining_sectors;
> +cur_bytes = MIN(bytes, INT_MAX);
>  if (s->cipher) {
> -cur_nr_sectors = MIN(cur_nr_sectors,
> -QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors);
> +cur_bytes = MIN(cur_bytes,
> +QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
>  }

Can this ever result in cur_bytes that is not aligned, even though
encryption requires aligned reads?  Do you need to round anything to a
sector boundary in this case?

>  qemu_co_mutex_unlock(>lock);
> -ret = bdrv_co_readv(bs->file->bs,
> -(cluster_offset >> 9) + index_in_cluster,
> -cur_nr_sectors, _qiov);
> +ret = bdrv_co_preadv(bs->file->bs,
> + cluster_offset + offset_in_cluster,
> + cur_bytes, _qiov, 0);
>  qemu_co_mutex_lock(>lock);
>  if (ret < 0) {
>  goto fail;
>  }
>  if (bs->encrypted) {
>  assert(s->cipher);
> +assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0);
> +assert((cur_bytes & (BDRV_SECTOR_SIZE - 1)) == 0);

Hmm, you do check that later on.

So, other than the typo (easy to fix on commit),
Reviewed-by: Eric Blake 

-- 
Eric Blake   eblake redhat com+1-919-301-3266
Libvirt virtualization library http://libvirt.org



signature.asc
Description: OpenPGP digital signature


[Qemu-block] [PATCH] Report error when opening device with locked tray

2016-06-06 Thread Colin Lord
This commit causes qmp_blockdev_change_medium to report an error if an
attempt is made to open a device with a locked tray.

Signed-off-by: Colin Lord 
This is based off my previous patch regarding the do_open_tray function
(currently at v3). Probably should have been submitted as a patch set
but I wasn't thinking that far ahead when I submitted the first patch.
---
 blockdev.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 7dd14b9..8a045d9 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2544,6 +2544,7 @@ void qmp_blockdev_change_medium(const char *device, const 
char *filename,
 BlockBackend *blk;
 BlockDriverState *medium_bs = NULL;
 int bdrv_flags;
+int rc;
 QDict *options = NULL;
 Error *err = NULL;
 
@@ -2598,11 +2599,13 @@ void qmp_blockdev_change_medium(const char *device, 
const char *filename,
 goto fail;
 }
 
-qmp_blockdev_open_tray(device, false, false, );
-if (err) {
+rc = do_open_tray(device, false, );
+if (rc && rc != -ENOSYS) {
 error_propagate(errp, err);
 goto fail;
 }
+error_free(err);
+err = NULL;
 
 qmp_x_blockdev_remove_medium(device, );
 if (err) {
-- 
2.5.5




Re: [Qemu-block] [Qemu-devel] coroutines: block: Co-routine re-entered recursively when migrating disk with iothreads

2016-06-06 Thread Jason J. Herne

On 05/25/2016 04:36 AM, Fam Zheng wrote:

On Tue, 05/24 11:05, Jason J. Herne wrote:

Thread 13 (Thread 0x3ff989ff910 (LWP 29452)):
#0  0x03ff99abe2c0 in raise () from /lib64/libc.so.6
#1  0x03ff99abfc26 in abort () from /lib64/libc.so.6
#2  0x80427d80 in qemu_coroutine_enter (co=0x9c5a4120, opaque=0x0)
at /root/kvmdev/qemu/util/qemu-coroutine.c:112
#3  0x8032246e in nbd_restart_write (opaque=0x9c5897b0) at
/root/kvmdev/qemu/block/nbd-client.c:114
#4  0x802b3a1c in aio_dispatch (ctx=0x9c530770) at
/root/kvmdev/qemu/aio-posix.c:341
#5  0x802b4332 in aio_poll (ctx=0x9c530770, blocking=true) at
/root/kvmdev/qemu/aio-posix.c:479
#6  0x80155aba in iothread_run (opaque=0x9c530200) at
/root/kvmdev/qemu/iothread.c:46
#7  0x03ff99c87c2c in start_thread () from /lib64/libpthread.so.0
#8  0x03ff99b8ec9a in thread_start () from /lib64/libc.so.6


This is the continuation of write request to the NBD target


Thread 1 (Thread 0x3ff9a6f2a90 (LWP 29433)):
#0  0x03ff99c8d68a in pthread_cond_wait@@GLIBC_2.3.2 () from
/lib64/libpthread.so.0
#1  0x8040932e in qemu_cond_wait (cond=0x9c530800, mutex=0x9c5307d0)
at /root/kvmdev/qemu/util/qemu-thread-posix.c:123
#2  0x80426a38 in rfifolock_lock (r=0x9c5307d0) at
/root/kvmdev/qemu/util/rfifolock.c:59
#3  0x802a1f72 in aio_context_acquire (ctx=0x9c530770) at
/root/kvmdev/qemu/async.c:373
#4  0x802b3f54 in aio_poll (ctx=0x9c530770, blocking=true) at
/root/kvmdev/qemu/aio-posix.c:415
#5  0x8031e7ac in bdrv_flush (bs=0x9c59b5c0) at
/root/kvmdev/qemu/block/io.c:2470
#6  0x802a8e6e in bdrv_close (bs=0x9c59b5c0) at
/root/kvmdev/qemu/block.c:2134
#7  0x802a9966 in bdrv_delete (bs=0x9c59b5c0) at
/root/kvmdev/qemu/block.c:2341
#8  0x802ac7c6 in bdrv_unref (bs=0x9c59b5c0) at
/root/kvmdev/qemu/block.c:3376
#9  0x80315340 in mirror_exit (job=0x9c956ed0, opaque=0x9c9570d0) at
/root/kvmdev/qemu/block/mirror.c:494
#10 0x802afb52 in block_job_defer_to_main_loop_bh
(opaque=0x9c90dc10) at /root/kvmdev/qemu/blockjob.c:476


... while this is the completion of mirror. They are not supposed to happen
together. Either the job is completed too early, or the nbd_restart_write
function is invoked incorrectly.

I'll see if I can reproduce it here.

Fam



Hi Fam,
Have you had any luck reproducing this?


--
-- Jason J. Herne (jjhe...@linux.vnet.ibm.com)




[Qemu-block] [PATCH v3] blockdev: clean up error handling in do_open_tray

2016-06-06 Thread Colin Lord
Returns negative error codes and accompanying error messages in cases where
the device has no tray or the tray is locked and isn't forced open. This
extra information should result in better flexibility in functions that
call do_open_tray.

Signed-off-by: Colin Lord 
Suggested by: Markus Armbruster 
---
 blockdev.c | 46 --
 1 file changed, 28 insertions(+), 18 deletions(-)

diff --git a/blockdev.c b/blockdev.c
index 717785e..7dd14b9 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -56,6 +56,8 @@
 static QTAILQ_HEAD(, BlockDriverState) monitor_bdrv_states =
 QTAILQ_HEAD_INITIALIZER(monitor_bdrv_states);
 
+static int do_open_tray(const char *device, bool force, Error **errp);
+
 static const char *const if_name[IF_COUNT] = {
 [IF_NONE] = "none",
 [IF_IDE] = "ide",
@@ -2274,8 +2276,6 @@ exit:
 block_job_txn_unref(block_job_txn);
 }
 
-static int do_open_tray(const char *device, bool force, Error **errp);
-
 void qmp_eject(const char *device, bool has_force, bool force, Error **errp)
 {
 Error *local_err = NULL;
@@ -2286,16 +2286,11 @@ void qmp_eject(const char *device, bool has_force, bool 
force, Error **errp)
 }
 
 rc = do_open_tray(device, force, _err);
-if (local_err) {
+if (rc && rc != -ENOSYS) {
 error_propagate(errp, local_err);
 return;
 }
-
-if (rc == EINPROGRESS) {
-error_setg(errp, "Device '%s' is locked and force was not specified, "
-   "wait for tray to open and try again", device);
-return;
-}
+error_free(local_err);
 
 qmp_x_blockdev_remove_medium(device, errp);
 }
@@ -2324,11 +2319,16 @@ void qmp_block_passwd(bool has_device, const char 
*device,
 aio_context_release(aio_context);
 }
 
-/**
- * returns -errno on fatal error, +errno for non-fatal situations.
- * errp will always be set when the return code is negative.
- * May return +ENOSYS if the device has no tray,
- * or +EINPROGRESS if the tray is locked and the guest has been notified.
+/*
+ * Attempt to open the tray of @device.
+ * If @force, ignore its tray lock.
+ * Else, if the tray is locked, don't open it, but ask the guest to open it.
+ * On error, store an error through @errp and return -errno.
+ * If @device does not exist, return -ENODEV.
+ * If it has no removable media, return -ENOTSUP.
+ * If it has no tray, return -ENOSYS.
+ * If the guest was asked to open the tray, return -EINPROGRESS.
+ * Else, return 0.
  */
 static int do_open_tray(const char *device, bool force, Error **errp)
 {
@@ -2348,8 +2348,8 @@ static int do_open_tray(const char *device, bool force, 
Error **errp)
 }
 
 if (!blk_dev_has_tray(blk)) {
-/* Ignore this command on tray-less devices */
-return ENOSYS;
+error_setg(errp, "Device '%s' does not have a tray", device);
+return -ENOSYS;
 }
 
 if (blk_dev_is_tray_open(blk)) {
@@ -2366,7 +2366,9 @@ static int do_open_tray(const char *device, bool force, 
Error **errp)
 }
 
 if (locked && !force) {
-return EINPROGRESS;
+error_setg(errp, "Device '%s' is locked and force was not specified, "
+   "wait for tray to open and try again", device);
+return -EINPROGRESS;
 }
 
 return 0;
@@ -2375,10 +2377,18 @@ static int do_open_tray(const char *device, bool force, 
Error **errp)
 void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
 Error **errp)
 {
+Error *local_err = NULL;
+int rc;
+
 if (!has_force) {
 force = false;
 }
-do_open_tray(device, force, errp);
+rc = do_open_tray(device, force, _err);
+if (rc && rc != -ENOSYS && rc != -EINPROGRESS) {
+error_propagate(errp, local_err);
+return;
+}
+error_free(local_err);
 }
 
 void qmp_blockdev_close_tray(const char *device, Error **errp)
-- 
2.5.5




[Qemu-block] [PATCH v2 1/5] qcow2: Work with bytes in qcow2_get_cluster_offset()

2016-06-06 Thread Kevin Wolf
This patch changes the units that qcow2_get_cluster_offset() uses
internally, without touching the interface just yet. This will be done
in another patch.

Signed-off-by: Kevin Wolf 
---
 block/qcow2-cluster.c | 44 +++-
 1 file changed, 23 insertions(+), 21 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index d901d89..c5906e6 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -482,29 +482,28 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, 
uint64_t offset,
 unsigned int l2_index;
 uint64_t l1_index, l2_offset, *l2_table;
 int l1_bits, c;
-unsigned int index_in_cluster, nb_clusters;
-uint64_t nb_available, nb_needed;
+unsigned int offset_in_cluster, nb_clusters;
+uint64_t bytes_available, bytes_needed;
 int ret;
+unsigned int bytes;
 
-index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1);
-nb_needed = *num + index_in_cluster;
+assert(*num <= BDRV_REQUEST_MAX_SECTORS);
+bytes = *num * BDRV_SECTOR_SIZE;
 
-l1_bits = s->l2_bits + s->cluster_bits;
-
-/* compute how many bytes there are between the offset and
- * the end of the l1 entry
- */
+offset_in_cluster = offset_into_cluster(s, offset);
+bytes_needed = (uint64_t) bytes + offset_in_cluster;
 
-nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1));
-
-/* compute the number of available sectors */
+l1_bits = s->l2_bits + s->cluster_bits;
 
-nb_available = (nb_available >> 9) + index_in_cluster;
+/* compute how many bytes there are between the start of the cluster
+ * containing offset and the end of the l1 entry */
+bytes_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1))
++ offset_in_cluster;
 
-if (nb_needed > nb_available) {
-nb_needed = nb_available;
+if (bytes_needed > bytes_available) {
+bytes_needed = bytes_available;
 }
-assert(nb_needed <= INT_MAX);
+assert(bytes_needed <= INT_MAX);
 
 *cluster_offset = 0;
 
@@ -542,7 +541,7 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t 
offset,
 *cluster_offset = be64_to_cpu(l2_table[l2_index]);
 
 /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */
-nb_clusters = size_to_clusters(s, nb_needed << 9);
+nb_clusters = size_to_clusters(s, bytes_needed);
 
 ret = qcow2_get_cluster_type(*cluster_offset);
 switch (ret) {
@@ -589,13 +588,16 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, 
uint64_t offset,
 
 qcow2_cache_put(bs, s->l2_table_cache, (void**) _table);
 
-nb_available = (c * s->cluster_sectors);
+bytes_available = (c * s->cluster_size);
 
 out:
-if (nb_available > nb_needed)
-nb_available = nb_needed;
+if (bytes_available > bytes_needed) {
+bytes_available = bytes_needed;
+}
 
-*num = nb_available - index_in_cluster;
+bytes = bytes_available - offset_in_cluster;
+assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
+*num = bytes >> BDRV_SECTOR_BITS;
 
 return ret;
 
-- 
1.8.3.1




[Qemu-block] [PATCH v2 2/5] qcow2: Implement .bdrv_co_preadv()

2016-06-06 Thread Kevin Wolf
Reading from qcow2 images is now byte granularity.

Most of the affected code in qcow2 actually gets simpler with this
change. The only exception is encryption, which is fixed on 512 bytes
blocks; in order to keep this working, bs->request_alignment is set for
encrypted images.

Signed-off-by: Kevin Wolf 
---
 block/qcow2-cluster.c |  27 ++---
 block/qcow2.c | 108 +++---
 block/qcow2.h |   2 +-
 3 files changed, 72 insertions(+), 65 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index c5906e6..48a8e13 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -424,7 +424,8 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs,
  * interface.  This avoids double I/O throttling and request tracking,
  * which can lead to deadlock when block layer copy-on-read is enabled.
  */
-ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, );
+ret = bs->drv->bdrv_co_preadv(bs, (start_sect + n_start) * 
BDRV_SECTOR_SIZE,
+  n * BDRV_SECTOR_SIZE, , 0);
 if (ret < 0) {
 goto out;
 }
@@ -464,19 +465,21 @@ out:
 /*
  * get_cluster_offset
  *
- * For a given offset of the disk image, find the cluster offset in
- * qcow2 file. The offset is stored in *cluster_offset.
+ * For a given offset of the virtual disk, find the cluster type and offset in
+ * the qcow2 file. The offset is stored in *cluster_offset.
  *
- * on entry, *num is the number of contiguous sectors we'd like to
- * access following offset.
+ * On entry, *bytes is the maximum number of contiguous bytes starting at
+ * offset that we are interested in.
  *
- * on exit, *num is the number of contiguous sectors we can read.
+ * On exit, *bytes is the number of bytes staring at offset that have the same
+ * cluster type and (if applicable) are stored contiguously in the image file.
+ * Compressed clusters are always returned one by one.
  *
  * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error
  * cases.
  */
 int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
-int *num, uint64_t *cluster_offset)
+ unsigned int *bytes, uint64_t *cluster_offset)
 {
 BDRVQcow2State *s = bs->opaque;
 unsigned int l2_index;
@@ -485,13 +488,9 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, 
uint64_t offset,
 unsigned int offset_in_cluster, nb_clusters;
 uint64_t bytes_available, bytes_needed;
 int ret;
-unsigned int bytes;
-
-assert(*num <= BDRV_REQUEST_MAX_SECTORS);
-bytes = *num * BDRV_SECTOR_SIZE;
 
 offset_in_cluster = offset_into_cluster(s, offset);
-bytes_needed = (uint64_t) bytes + offset_in_cluster;
+bytes_needed = (uint64_t) *bytes + offset_in_cluster;
 
 l1_bits = s->l2_bits + s->cluster_bits;
 
@@ -595,9 +594,7 @@ out:
 bytes_available = bytes_needed;
 }
 
-bytes = bytes_available - offset_in_cluster;
-assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0);
-*num = bytes >> BDRV_SECTOR_BITS;
+*bytes = bytes_available - offset_in_cluster;
 
 return ret;
 
diff --git a/block/qcow2.c b/block/qcow2.c
index 6f5fb81..545dea0 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -975,6 +975,9 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, 
int flags,
 }
 
 bs->encrypted = 1;
+
+/* Encryption works on a sector granularity */
+bs->request_alignment = BDRV_SECTOR_SIZE;
 }
 
 s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */
@@ -1331,16 +1334,20 @@ static int64_t coroutine_fn 
qcow2_co_get_block_status(BlockDriverState *bs,
 BDRVQcow2State *s = bs->opaque;
 uint64_t cluster_offset;
 int index_in_cluster, ret;
+unsigned int bytes;
 int64_t status = 0;
 
-*pnum = nb_sectors;
+bytes = MIN(INT_MAX, nb_sectors * BDRV_SECTOR_SIZE);
 qemu_co_mutex_lock(>lock);
-ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, _offset);
+ret = qcow2_get_cluster_offset(bs, sector_num << 9, ,
+   _offset);
 qemu_co_mutex_unlock(>lock);
 if (ret < 0) {
 return ret;
 }
 
+*pnum = bytes >> BDRV_SECTOR_BITS;
+
 if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED &&
 !s->cipher) {
 index_in_cluster = sector_num & (s->cluster_sectors - 1);
@@ -1358,28 +1365,34 @@ static int64_t coroutine_fn 
qcow2_co_get_block_status(BlockDriverState *bs,
 
 /* handle reading after the end of the backing file */
 int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
-  int64_t sector_num, int nb_sectors)
+int64_t offset, int bytes)
 {
+uint64_t bs_size = bs->total_sectors * BDRV_SECTOR_SIZE;
 int n1;
-if ((sector_num + nb_sectors) <= bs->total_sectors)
-return nb_sectors;
-if (sector_num >= bs->total_sectors)
+
+if 

[Qemu-block] [PATCH v2 3/5] qcow2: Make copy_sectors() byte based

2016-06-06 Thread Kevin Wolf
This will allow copy on write operations where the overwritten part of
the cluster is not aligned to sector boundaries.

Also rename the function because it has nothing to do with sectors any
more.

Signed-off-by: Kevin Wolf 
---
 block/qcow2-cluster.c | 54 +--
 1 file changed, 26 insertions(+), 28 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 48a8e13..e84d6db 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -390,22 +390,18 @@ int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t 
sector_num,
 return 0;
 }
 
-static int coroutine_fn copy_sectors(BlockDriverState *bs,
- uint64_t start_sect,
- uint64_t cluster_offset,
- int n_start, int n_end)
+static int coroutine_fn do_perform_cow(BlockDriverState *bs,
+   uint64_t src_cluster_offset,
+   uint64_t cluster_offset,
+   int offset_in_cluster,
+   int bytes)
 {
 BDRVQcow2State *s = bs->opaque;
 QEMUIOVector qiov;
 struct iovec iov;
-int n, ret;
-
-n = n_end - n_start;
-if (n <= 0) {
-return 0;
-}
+int ret;
 
-iov.iov_len = n * BDRV_SECTOR_SIZE;
+iov.iov_len = bytes;
 iov.iov_base = qemu_try_blockalign(bs, iov.iov_len);
 if (iov.iov_base == NULL) {
 return -ENOMEM;
@@ -424,18 +420,20 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs,
  * interface.  This avoids double I/O throttling and request tracking,
  * which can lead to deadlock when block layer copy-on-read is enabled.
  */
-ret = bs->drv->bdrv_co_preadv(bs, (start_sect + n_start) * 
BDRV_SECTOR_SIZE,
-  n * BDRV_SECTOR_SIZE, , 0);
+ret = bs->drv->bdrv_co_preadv(bs, src_cluster_offset + offset_in_cluster,
+  bytes, , 0);
 if (ret < 0) {
 goto out;
 }
 
 if (bs->encrypted) {
 Error *err = NULL;
+int sector = (cluster_offset + offset_in_cluster) >> BDRV_SECTOR_BITS;
 assert(s->cipher);
-if (qcow2_encrypt_sectors(s, start_sect + n_start,
-  iov.iov_base, iov.iov_base, n,
-  true, ) < 0) {
+assert((offset_in_cluster & BDRV_SECTOR_MASK) == 0);
+assert((bytes & ~BDRV_SECTOR_MASK) == 0);
+if (qcow2_encrypt_sectors(s, sector, iov.iov_base, iov.iov_base,
+  bytes >> BDRV_SECTOR_BITS, true, ) < 0) {
 ret = -EIO;
 error_free(err);
 goto out;
@@ -443,14 +441,14 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs,
 }
 
 ret = qcow2_pre_write_overlap_check(bs, 0,
-cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE);
+cluster_offset + offset_in_cluster, bytes);
 if (ret < 0) {
 goto out;
 }
 
 BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
-ret = bdrv_co_writev(bs->file->bs, (cluster_offset >> 9) + n_start, n,
- );
+ret = bdrv_co_pwritev(bs->file->bs, cluster_offset + offset_in_cluster,
+  bytes, , 0);
 if (ret < 0) {
 goto out;
 }
@@ -745,9 +743,8 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, 
Qcow2COWRegion *r)
 }
 
 qemu_co_mutex_unlock(>lock);
-ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset,
-   r->offset / BDRV_SECTOR_SIZE,
-   r->offset / BDRV_SECTOR_SIZE + r->nb_sectors);
+ret = do_perform_cow(bs, m->offset, m->alloc_offset,
+ r->offset, r->nb_sectors * BDRV_SECTOR_SIZE);
 qemu_co_mutex_lock(>lock);
 
 if (ret < 0) {
@@ -809,13 +806,14 @@ int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, 
QCowL2Meta *m)
 assert(l2_index + m->nb_clusters <= s->l2_size);
 for (i = 0; i < m->nb_clusters; i++) {
 /* if two concurrent writes happen to the same unallocated cluster
-* each write allocates separate cluster and writes data concurrently.
-* The first one to complete updates l2 table with pointer to its
-* cluster the second one has to do RMW (which is done above by
-* copy_sectors()), update l2 table with its cluster pointer and free
-* old cluster. This is what this loop does */
-if(l2_table[l2_index + i] != 0)
+ * each write allocates separate cluster and writes data concurrently.
+ * The first one to complete updates l2 table with pointer to its
+ * cluster the second one has to do RMW (which is done above by
+ * perform_cow()), update l2 table with its cluster pointer and free
+ * old cluster. This is what this loop does 

[Qemu-block] [PATCH v2 4/5] qcow2: Use bytes instead of sectors for QCowL2Meta

2016-06-06 Thread Kevin Wolf
In preparation for implementing .bdrv_co_pwritev in qcow2.

Signed-off-by: Kevin Wolf 
---
 block/qcow2-cluster.c | 32 
 block/qcow2.h | 13 +++--
 2 files changed, 15 insertions(+), 30 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index e84d6db..0182675 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -738,13 +738,12 @@ static int perform_cow(BlockDriverState *bs, QCowL2Meta 
*m, Qcow2COWRegion *r)
 BDRVQcow2State *s = bs->opaque;
 int ret;
 
-if (r->nb_sectors == 0) {
+if (r->nb_bytes == 0) {
 return 0;
 }
 
 qemu_co_mutex_unlock(>lock);
-ret = do_perform_cow(bs, m->offset, m->alloc_offset,
- r->offset, r->nb_sectors * BDRV_SECTOR_SIZE);
+ret = do_perform_cow(bs, m->offset, m->alloc_offset, r->offset, 
r->nb_bytes);
 qemu_co_mutex_lock(>lock);
 
 if (ret < 0) {
@@ -1195,25 +1194,20 @@ static int handle_alloc(BlockDriverState *bs, uint64_t 
guest_offset,
 /*
  * Save info needed for meta data update.
  *
- * requested_sectors: Number of sectors from the start of the first
+ * requested_bytes: Number of bytes from the start of the first
  * newly allocated cluster to the end of the (possibly shortened
  * before) write request.
  *
- * avail_sectors: Number of sectors from the start of the first
+ * avail_bytes: Number of bytes from the start of the first
  * newly allocated to the end of the last newly allocated cluster.
  *
- * nb_sectors: The number of sectors from the start of the first
+ * nb_bytes: The number of bytes from the start of the first
  * newly allocated cluster to the end of the area that the write
  * request actually writes to (excluding COW at the end)
  */
-int requested_sectors =
-(*bytes + offset_into_cluster(s, guest_offset))
->> BDRV_SECTOR_BITS;
-int avail_sectors = nb_clusters
-<< (s->cluster_bits - BDRV_SECTOR_BITS);
-int alloc_n_start = offset_into_cluster(s, guest_offset)
->> BDRV_SECTOR_BITS;
-int nb_sectors = MIN(requested_sectors, avail_sectors);
+uint64_t requested_bytes = *bytes + offset_into_cluster(s, guest_offset);
+int avail_bytes = MIN(INT_MAX, nb_clusters << s->cluster_bits);
+int nb_bytes = MIN(requested_bytes, avail_bytes);
 QCowL2Meta *old_m = *m;
 
 *m = g_malloc0(sizeof(**m));
@@ -1224,23 +1218,21 @@ static int handle_alloc(BlockDriverState *bs, uint64_t 
guest_offset,
 .alloc_offset   = alloc_cluster_offset,
 .offset = start_of_cluster(s, guest_offset),
 .nb_clusters= nb_clusters,
-.nb_available   = nb_sectors,
 
 .cow_start = {
 .offset = 0,
-.nb_sectors = alloc_n_start,
+.nb_bytes   = offset_into_cluster(s, guest_offset),
 },
 .cow_end = {
-.offset = nb_sectors * BDRV_SECTOR_SIZE,
-.nb_sectors = avail_sectors - nb_sectors,
+.offset = nb_bytes,
+.nb_bytes   = avail_bytes - nb_bytes,
 },
 };
 qemu_co_queue_init(&(*m)->dependent_requests);
 QLIST_INSERT_HEAD(>cluster_allocs, *m, next_in_flight);
 
 *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset);
-*bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE)
- - offset_into_cluster(s, guest_offset));
+*bytes = MIN(*bytes, nb_bytes - offset_into_cluster(s, guest_offset));
 assert(*bytes != 0);
 
 return 1;
diff --git a/block/qcow2.h b/block/qcow2.h
index e2c42d5..175b0c1 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -302,8 +302,8 @@ typedef struct Qcow2COWRegion {
  */
 uint64_toffset;
 
-/** Number of sectors to copy */
-int nb_sectors;
+/** Number of bytes to copy */
+int nb_bytes;
 } Qcow2COWRegion;
 
 /**
@@ -318,12 +318,6 @@ typedef struct QCowL2Meta
 /** Host offset of the first newly allocated cluster */
 uint64_t alloc_offset;
 
-/**
- * Number of sectors from the start of the first allocated cluster to
- * the end of the (possibly shortened) request
- */
-int nb_available;
-
 /** Number of newly allocated clusters */
 int nb_clusters;
 
@@ -471,8 +465,7 @@ static inline uint64_t l2meta_cow_start(QCowL2Meta *m)
 
 static inline uint64_t l2meta_cow_end(QCowL2Meta *m)
 {
-return m->offset + m->cow_end.offset
-+ (m->cow_end.nb_sectors << BDRV_SECTOR_BITS);
+return m->offset + m->cow_end.offset + m->cow_end.nb_bytes;
 }
 
 static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2)
-- 
1.8.3.1




[Qemu-block] [PATCH v2 0/5] qcow2: Implement .bdrv_co_preadv/pwritev

2016-06-06 Thread Kevin Wolf
This series converts qcow2 to the byte-based I/O interfaces. This simplifies
the code by removing many unit conversions, and in the unlikely case of actual
unaligned requests, it even makes the driver work more efficiently by avoiding
read-modify-write.

v2:
- Be more careful with integer overflows [Eric]
- Updated some more comments [Eric]
- Rename copy_sectors -> do_perform_cow [Eric] 
- Write single byte instead of full sector at preallocation end [Eric]

Kevin Wolf (5):
  qcow2: Work with bytes in qcow2_get_cluster_offset()
  qcow2: Implement .bdrv_co_preadv()
  qcow2: Make copy_sectors() byte based
  qcow2: Use bytes instead of sectors for QCowL2Meta
  qcow2: Implement .bdrv_co_pwritev()

 block/qcow2-cluster.c | 146 +
 block/qcow2.c | 197 +-
 block/qcow2.h |  18 ++---
 trace-events  |   6 +-
 4 files changed, 176 insertions(+), 191 deletions(-)

-- 
1.8.3.1




Re: [Qemu-block] [Qemu-devel] [PATCH 5/5] qcow2: Implement .bdrv_co_pwritev

2016-06-06 Thread Kevin Wolf
Am 03.06.2016 um 22:13 hat Eric Blake geschrieben:
> On 06/03/2016 11:21 AM, Kevin Wolf wrote:
> > This changes qcow2 to implement the byte-based .bdrv_co_pwritev
> > interface rather than the sector-based old one.
> > 
> > As preallocation uses the same allocation function as normal writes, and
> > the interface of that function needs to be changed, it is converted in
> > the same patch.
> > 
> > Signed-off-by: Kevin Wolf 
> > ---
> >  block/qcow2-cluster.c | 13 
> >  block/qcow2.c | 82 
> > ---
> >  block/qcow2.h |  3 +-
> >  trace-events  |  6 ++--
> >  4 files changed, 49 insertions(+), 55 deletions(-)
> > 
> 
> > +++ b/block/qcow2-cluster.c
> > @@ -1261,7 +1261,8 @@ fail:
> >   * Return 0 on success and -errno in error cases
> >   */
> >  int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
> > -int *num, uint64_t *host_offset, QCowL2Meta **m)
> > +   unsigned int *bytes, uint64_t *host_offset,
> > +   QCowL2Meta **m)
> >  {
> 
> >  
> > -*num -= remaining >> BDRV_SECTOR_BITS;
> > -assert(*num > 0);
> > +*bytes -= remaining;
> > +assert(*bytes > 0);
> 
> The assertion is now weaker.  Beforehand, num could go negative.  But
> bytes cannot, all it can do is go to 0.  If we wrap around, the
> assertion won't catch it.  Do you want to strengthen it by also
> asserting that bytes < INT_MAX?

Is it useful to assert this, though? We have tons of such loops and we
never assert that cur_bytes <= bytes (which is what you would be
checking effectively) because it's quite obvious.

This assertion was meant to document the less obvious fact that we
always make some progress and never return 0 bytes.

> > +++ b/block/qcow2.c
> > @@ -1544,23 +1544,20 @@ fail:
> >  return ret;
> >  }
> >  
> > -static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
> > -   int64_t sector_num,
> > -   int remaining_sectors,
> > -   QEMUIOVector *qiov)
> > +static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs,
> > +uint64_t offset, uint64_t bytes, QEMUIOVector *qiov, int flags)
> 
> Worth consistent indentation, while touching it?

Will fix.

> > +while (bytes != 0) {
> >  
> >  l2meta = NULL;
> >  
> >  trace_qcow2_writev_start_part(qemu_coroutine_self());
> > -index_in_cluster = sector_num & (s->cluster_sectors - 1);
> > -cur_nr_sectors = remaining_sectors;
> > -if (bs->encrypted &&
> > -cur_nr_sectors >
> > -QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - 
> > index_in_cluster) {
> > -cur_nr_sectors =
> > -QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - 
> > index_in_cluster;
> > +offset_in_cluster = offset_into_cluster(s, offset);
> > +cur_bytes = MIN(bytes, INT_MAX);
> > +if (bs->encrypted) {
> > +cur_bytes = MIN(cur_bytes,
> > +QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
> > +- offset_in_cluster);
> 
> Again, if the block layer learns to auto-fragment, then we should just
> inform the block layer about QCOW_MAX_CRYPT_CLUSTERS as our max_transfer
> limit.
> 
> 
> > @@ -1634,10 +1628,10 @@ static coroutine_fn int 
> > qcow2_co_writev(BlockDriverState *bs,
> >  qemu_co_mutex_unlock(>lock);
> >  BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
> >  trace_qcow2_writev_data(qemu_coroutine_self(),
> > -(cluster_offset >> 9) + index_in_cluster);
> > -ret = bdrv_co_writev(bs->file->bs,
> > - (cluster_offset >> 9) + index_in_cluster,
> > - cur_nr_sectors, _qiov);
> > +cluster_offset + offset_in_cluster);
> > +ret = bdrv_co_pwritev(bs->file->bs,
> > +  cluster_offset + offset_in_cluster,
> > +  cur_bytes, _qiov, 0);
> 
> s/0/flags/, even if .supported_write_flags stays 0 for now.

I don't agree. That might make sense for a passthrough driver like raw,
but I don't see a reason to guess for qcow2 that passing flags down is
the right thing to do. The only flag that we currently have (FUA) isn't
correctly implemented by passing it to the lower layer because qcow2
metadata must be considered.

So I would leave changing flags here for the patch that adds some flag
to .supported_write_flags.

> > @@ -2059,7 +2053,7 @@ static int preallocate(BlockDriverState *bs)
> >  uint8_t buf[BDRV_SECTOR_SIZE];
> >  memset(buf, 0, BDRV_SECTOR_SIZE);
> >  ret = bdrv_write(bs->file->bs,
> > - (host_offset >> BDRV_SECTOR_BITS) + num - 1,
> > + ((host_offset + cur_bytes) >> BDRV_SECTOR_BITS) - 
> > 1,
> >   

Re: [Qemu-block] [PATCH v6 14/22] mirror: Disable image locking on target backing chain

2016-06-06 Thread Max Reitz
On 03.06.2016 10:49, Fam Zheng wrote:
> In sync=none the backing image of s->target is s->common.bs, which could
> be exclusively locked, the image locking wouldn't work here.
> 
> Later we can update completion code to lock it after the replaced node
> has dropped its lock.
> 
> Signed-off-by: Fam Zheng 
> ---
>  blockdev.c | 10 --
>  1 file changed, 8 insertions(+), 2 deletions(-)

Without having reviewed the patch in the new context, when basing your
series on v2 of my "block/mirror: Fix target backing BDS​", all iotests
pass even without this patch.

Without my series, test 041 fails.

So I have reason to hope that I was actually able to make this patch
superfluous.

Max



signature.asc
Description: OpenPGP digital signature


[Qemu-block] [PATCH v2 5/5] qcow2: Implement .bdrv_co_pwritev()

2016-06-06 Thread Kevin Wolf
This changes qcow2 to implement the byte-based .bdrv_co_pwritev
interface rather than the sector-based old one.

As preallocation uses the same allocation function as normal writes, and
the interface of that function needs to be changed, it is converted in
the same patch.

Signed-off-by: Kevin Wolf 
---
 block/qcow2-cluster.c | 13 
 block/qcow2.c | 89 ---
 block/qcow2.h |  3 +-
 trace-events  |  6 ++--
 4 files changed, 52 insertions(+), 59 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 0182675..b97eb8f 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1264,7 +1264,8 @@ fail:
  * Return 0 on success and -errno in error cases
  */
 int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset,
-int *num, uint64_t *host_offset, QCowL2Meta **m)
+   unsigned int *bytes, uint64_t *host_offset,
+   QCowL2Meta **m)
 {
 BDRVQcow2State *s = bs->opaque;
 uint64_t start, remaining;
@@ -1272,13 +1273,11 @@ int qcow2_alloc_cluster_offset(BlockDriverState *bs, 
uint64_t offset,
 uint64_t cur_bytes;
 int ret;
 
-trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num);
-
-assert((offset & ~BDRV_SECTOR_MASK) == 0);
+trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *bytes);
 
 again:
 start = offset;
-remaining = (uint64_t)*num << BDRV_SECTOR_BITS;
+remaining = *bytes;
 cluster_offset = 0;
 *host_offset = 0;
 cur_bytes = 0;
@@ -1364,8 +1363,8 @@ again:
 }
 }
 
-*num -= remaining >> BDRV_SECTOR_BITS;
-assert(*num > 0);
+*bytes -= remaining;
+assert(*bytes > 0);
 assert(*host_offset != 0);
 
 return 0;
diff --git a/block/qcow2.c b/block/qcow2.c
index 545dea0..cb55e2d 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1544,23 +1544,21 @@ fail:
 return ret;
 }
 
-static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
-   int64_t sector_num,
-   int remaining_sectors,
-   QEMUIOVector *qiov)
+static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
+ uint64_t bytes, QEMUIOVector *qiov,
+ int flags)
 {
 BDRVQcow2State *s = bs->opaque;
-int index_in_cluster;
+int offset_in_cluster;
 int ret;
-int cur_nr_sectors; /* number of sectors in current iteration */
+unsigned int cur_bytes; /* number of sectors in current iteration */
 uint64_t cluster_offset;
 QEMUIOVector hd_qiov;
 uint64_t bytes_done = 0;
 uint8_t *cluster_data = NULL;
 QCowL2Meta *l2meta = NULL;
 
-trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num,
- remaining_sectors);
+trace_qcow2_writev_start_req(qemu_coroutine_self(), offset, bytes);
 
 qemu_iovec_init(_qiov, qiov->niov);
 
@@ -1568,22 +1566,21 @@ static coroutine_fn int 
qcow2_co_writev(BlockDriverState *bs,
 
 qemu_co_mutex_lock(>lock);
 
-while (remaining_sectors != 0) {
+while (bytes != 0) {
 
 l2meta = NULL;
 
 trace_qcow2_writev_start_part(qemu_coroutine_self());
-index_in_cluster = sector_num & (s->cluster_sectors - 1);
-cur_nr_sectors = remaining_sectors;
-if (bs->encrypted &&
-cur_nr_sectors >
-QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) {
-cur_nr_sectors =
-QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - 
index_in_cluster;
+offset_in_cluster = offset_into_cluster(s, offset);
+cur_bytes = MIN(bytes, INT_MAX);
+if (bs->encrypted) {
+cur_bytes = MIN(cur_bytes,
+QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size
+- offset_in_cluster);
 }
 
-ret = qcow2_alloc_cluster_offset(bs, sector_num << 9,
-_nr_sectors, _offset, );
+ret = qcow2_alloc_cluster_offset(bs, offset, _bytes,
+ _offset, );
 if (ret < 0) {
 goto fail;
 }
@@ -1591,8 +1588,7 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState 
*bs,
 assert((cluster_offset & 511) == 0);
 
 qemu_iovec_reset(_qiov);
-qemu_iovec_concat(_qiov, qiov, bytes_done,
-cur_nr_sectors * 512);
+qemu_iovec_concat(_qiov, qiov, bytes_done, cur_bytes);
 
 if (bs->encrypted) {
 Error *err = NULL;
@@ -1611,8 +1607,9 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState 
*bs,
QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
 qemu_iovec_to_buf(_qiov, 0, cluster_data, hd_qiov.size);
 
-if (qcow2_encrypt_sectors(s, sector_num, cluster_data,
- 

[Qemu-block] [PATCH v2 3/3] iotests: Add test for post-mirror backing chains

2016-06-06 Thread Max Reitz
Signed-off-by: Max Reitz 
---
 tests/qemu-iotests/155 | 218 +
 tests/qemu-iotests/155.out |   5 ++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 224 insertions(+)
 create mode 100755 tests/qemu-iotests/155
 create mode 100644 tests/qemu-iotests/155.out

diff --git a/tests/qemu-iotests/155 b/tests/qemu-iotests/155
new file mode 100755
index 000..76fdd4f
--- /dev/null
+++ b/tests/qemu-iotests/155
@@ -0,0 +1,218 @@
+#!/usr/bin/env python
+#
+# Test whether the backing BDSs are correct after completion of a
+# mirror block job
+#
+# Copyright (C) 2016 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see .
+#
+
+import os
+import stat
+import time
+import iotests
+from iotests import qemu_img
+
+back0_img = os.path.join(iotests.test_dir, 'back0.' + iotests.imgfmt)
+back1_img = os.path.join(iotests.test_dir, 'back1.' + iotests.imgfmt)
+back2_img = os.path.join(iotests.test_dir, 'back2.' + iotests.imgfmt)
+source_img = os.path.join(iotests.test_dir, 'source.' + iotests.imgfmt)
+target_img = os.path.join(iotests.test_dir, 'target.' + iotests.imgfmt)
+
+class BaseClass(iotests.QMPTestCase):
+def setUp(self):
+qemu_img('create', '-f', iotests.imgfmt, back0_img, '1M')
+qemu_img('create', '-f', iotests.imgfmt, '-b', back0_img, back1_img)
+qemu_img('create', '-f', iotests.imgfmt, '-b', back1_img, back2_img)
+qemu_img('create', '-f', iotests.imgfmt, '-b', back2_img, source_img)
+
+self.vm = iotests.VM()
+self.vm.add_drive(None, '', 'none')
+self.vm.launch()
+
+# Add the BDS via blockdev-add so it stays around after the mirror 
block
+# job has been completed
+result = self.vm.qmp('blockdev-add',
+ options={'node-name': 'source',
+  'driver': iotests.imgfmt,
+  'file': {'driver': 'file',
+   'filename': source_img}})
+self.assert_qmp(result, 'return', {})
+
+result = self.vm.qmp('x-blockdev-insert-medium',
+ device='drive0', node_name='source')
+self.assert_qmp(result, 'return', {})
+
+self.assertIntactSourceBackingChain()
+
+if self.existing:
+if self.target_backing:
+qemu_img('create', '-f', iotests.imgfmt,
+ '-b', self.target_backing, target_img, '1M')
+else:
+qemu_img('create', '-f', iotests.imgfmt, target_img, '1M')
+
+if self.cmd == 'blockdev-mirror':
+result = self.vm.qmp('blockdev-add',
+ options={'node-name': 'target',
+  'driver': iotests.imgfmt,
+  'file': {'driver': 'file',
+   'filename': 
target_img}})
+self.assert_qmp(result, 'return', {})
+
+def tearDown(self):
+self.vm.shutdown()
+os.remove(source_img)
+os.remove(back2_img)
+os.remove(back1_img)
+os.remove(back0_img)
+try:
+os.remove(target_img)
+except OSError:
+pass
+
+def findBlockNode(self, node_name, id=None):
+if id:
+result = self.vm.qmp('query-block')
+for device in result['return']:
+if device['device'] == id:
+if node_name:
+self.assert_qmp(device, 'inserted/node-name', 
node_name)
+return device['inserted']
+else:
+result = self.vm.qmp('query-named-block-nodes')
+for node in result['return']:
+if node['node-name'] == node_name:
+return node
+
+self.fail('Cannot find node %s/%s' % (id, node_name))
+
+def assertIntactSourceBackingChain(self):
+node = self.findBlockNode('source')
+
+self.assert_qmp(node, 'image' + '/backing-image' * 0 + '/filename',
+source_img)
+self.assert_qmp(node, 'image' + '/backing-image' * 1 + '/filename',
+back2_img)
+self.assert_qmp(node, 'image' + '/backing-image' * 2 + '/filename',
+

[Qemu-block] [PATCH v2 2/3] block/mirror: Fix target backing BDS

2016-06-06 Thread Max Reitz
Currently, we are trying to move the backing BDS from the source to the
target in bdrv_replace_in_backing_chain() which is called from
mirror_exit(). However, mirror_complete() already tries to open the
target's backing chain with a call to bdrv_open_backing_file().

First, we should only set the target's backing BDS once. Second, the
mirroring block job has a better idea of what to set it to than the
generic code in bdrv_replace_in_backing_chain() (in fact, the latter's
conditions on when to move the backing BDS from source to target are not
really correct).

Therefore, remove that code from bdrv_replace_in_backing_chain() and
leave it to mirror_complete().

However, mirror_complete() in turn pursues a questionable strategy by
employing bdrv_open_backing_file(): On the one hand, because this may
open the wrong backing file with drive-mirror in "existing" mode, or
because it will not override a possibly wrong backing file in the
blockdev-mirror case.

On the other hand, we want to reuse the existing backing chain of the
source instead of opening everything anew, because the latter results in
having multiple BDSs for a single physical file and thus potentially
concurrent access which we should try to avoid.

Thus, instead of invoking bdrv_open_backing_file(), just set the correct
backing BDS directly via bdrv_set_backing_hd(). Also, do so only when
mirror_complete() is certain to succeed.

In contrast to what bdrv_replace_in_backing_chain() did so far, we do
not need to drop the source's backing file.

Signed-off-by: Max Reitz 
---
 block.c|  8 
 block/mirror.c | 21 +
 2 files changed, 13 insertions(+), 16 deletions(-)

diff --git a/block.c b/block.c
index 16463aa..792f5dd 100644
--- a/block.c
+++ b/block.c
@@ -2288,14 +2288,6 @@ void bdrv_replace_in_backing_chain(BlockDriverState 
*old, BlockDriverState *new)
 
 change_parent_backing_link(old, new);
 
-/* Change backing files if a previously independent node is added to the
- * chain. For active commit, we replace top by its own (indirect) backing
- * file and don't do anything here so we don't build a loop. */
-if (new->backing == NULL && !bdrv_chain_contains(backing_bs(old), new)) {
-bdrv_set_backing_hd(new, backing_bs(old));
-bdrv_set_backing_hd(old, NULL);
-}
-
 bdrv_unref(old);
 }
 
diff --git a/block/mirror.c b/block/mirror.c
index 80fd3c7..217475b 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -742,15 +742,11 @@ static void mirror_set_speed(BlockJob *job, int64_t 
speed, Error **errp)
 static void mirror_complete(BlockJob *job, Error **errp)
 {
 MirrorBlockJob *s = container_of(job, MirrorBlockJob, common);
-Error *local_err = NULL;
-int ret;
+BlockDriverState *src, *target;
+
+src = blk_bs(job->blk);
+target = blk_bs(s->target);
 
-ret = bdrv_open_backing_file(blk_bs(s->target), NULL, "backing",
- _err);
-if (ret < 0) {
-error_propagate(errp, local_err);
-return;
-}
 if (!s->synced) {
 error_setg(errp, QERR_BLOCK_JOB_NOT_READY, job->id);
 return;
@@ -777,6 +773,15 @@ static void mirror_complete(BlockJob *job, Error **errp)
 aio_context_release(replace_aio_context);
 }
 
+/* Now we need to adjust the target's backing BDS. This is not necessary
+ * when having performed a commit operation. */
+if (!bdrv_chain_contains(backing_bs(src), target)) {
+BlockDriverState *backing = s->is_none_mode ? src : s->base;
+if (backing_bs(target) != backing) {
+bdrv_set_backing_hd(target, backing);
+}
+}
+
 s->should_complete = true;
 block_job_enter(>common);
 }
-- 
2.8.3




[Qemu-block] [PATCH v2 1/3] block: Allow replacement of a BDS by its overlay

2016-06-06 Thread Max Reitz
change_parent_backing_link() asserts that the BDS to be replaced is not
used as a backing file. However, we may want to replace a BDS by its
overlay in which case that very link should not be redirected.

Signed-off-by: Max Reitz 
---
 block.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/block.c b/block.c
index f54bc25..16463aa 100644
--- a/block.c
+++ b/block.c
@@ -2224,9 +2224,22 @@ void bdrv_close_all(void)
 static void change_parent_backing_link(BlockDriverState *from,
BlockDriverState *to)
 {
-BdrvChild *c, *next;
+BdrvChild *c, *next, *to_c;
 
 QLIST_FOREACH_SAFE(c, >parents, next_parent, next) {
+if (c->role == _backing) {
+/* Allow @from to be in a backing chain, but only if it is @to's
+ * backing chain. Do not replace @from by @to there. */
+QLIST_FOREACH(to_c, >children, next) {
+if (to_c == c) {
+break;
+}
+}
+if (to_c) {
+continue;
+}
+}
+
 assert(c->role != _backing);
 bdrv_ref(to);
 bdrv_replace_child(c, to);
-- 
2.8.3




[Qemu-block] [PATCH v2 0/3] block/mirror: Fix target backing BDS

2016-06-06 Thread Max Reitz
Issue #1: Sometimes we can have a wrong backing BDS for the target after
a mirror block job. In "existing" mode with drive-mirror, or when using
blockdev-mirror, it's generally the user's fault. In "absolute-paths"
mode this only means that after a sync=full drive-mirror, the target may
have a backing file, but this will not change its visible data, so it's
"fine".

Still, it's ugly.

Issue #2: Currently the backing chain of the target is basically opened
using bdrv_open_backing_file() (except for sometimes™). This results in
multiple BDSs for a single physical file, which is bad. In most use
cases, this is only temporary, but it still is bad.

We can just reuse the existing backing chain of the source, so we should
do so.


Patch 2 fixes the issue. Patch 1 allows change_parent_backing_link() to
replace a BDS by its immediate overlay (which is necessary so that patch
2 can set the source BDS as the backing BDS of the target (sync=none) in
mirror_complete(), i.e. before bdrv_replace_in_backing_chain() is called
in mirror_exit()).

Patch 3 adds a test.


v2:
- Move the whole logic to mirror_complete(). This has the benefit of
  resolving the bdrv_open_backing_file() issue with multiple BDSs being
  open for a single physical file (which is a very real issue when it
  comes to image locking).
- However, this also has the drawback of requiring patch 1. So it needed
  to be added.


git-backport-diff against v1:

Key:
[] : patches are identical
[] : number of functional differences between upstream/downstream patch
[down] : patch is downstream-only
The flags [FC] indicate (F)unctional and (C)ontextual differences, respectively

001/3:[down] 'block: Allow replacement of a BDS by its overlay'
002/3:[0030] [FC] 'block/mirror: Fix target backing BDS'
003/3:[] [--] 'iotests: Add test for post-mirror backing chains'


Max Reitz (3):
  block: Allow replacement of a BDS by its overlay
  block/mirror: Fix target backing BDS
  iotests: Add test for post-mirror backing chains

 block.c|  23 +++--
 block/mirror.c |  21 +++--
 tests/qemu-iotests/155 | 218 +
 tests/qemu-iotests/155.out |   5 ++
 tests/qemu-iotests/group   |   1 +
 5 files changed, 251 insertions(+), 17 deletions(-)
 create mode 100755 tests/qemu-iotests/155
 create mode 100644 tests/qemu-iotests/155.out

-- 
2.8.3




Re: [Qemu-block] [Qemu-devel] [PATCH] scsi-disk: fix reads from scsi-disk devices

2016-06-06 Thread Peter Maydell
On 3 June 2016 at 13:42, Eric Blake  wrote:
> On 06/02/2016 11:17 PM, Mark Cave-Ayland wrote:
>> Commit fcaafb1001b9c42817714dd3b2aadcfdb997b53d accidentally broke reads from
>> scsi-disk devices when being updated from its original form to use the new
>> byte-based block functions. Add the extra missing sector to offset conversion
>> in order to restore read functionality.
>>
>> Signed-off-by: Mark Cave-Ayland 
>> ---
>>  hw/scsi/scsi-disk.c |2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> Reviewed-by: Eric Blake 

Thanks. Applied to master, since it's awkward to leave scsi disks broken...

thanks
-- PMM



[Qemu-block] [PATCH v2 3/5] qemu-img bench: Make start offset configurable

2016-06-06 Thread Kevin Wolf
This patch adds an option the specify the offset of the first request
made by qemu-img bench. This allows to benchmark misaligned requests.

Signed-off-by: Kevin Wolf 
Reviewed-by: Denis V. Lunev 
---
 qemu-img-cmds.hx |  4 ++--
 qemu-img.c   | 23 ---
 qemu-img.texi|  5 +++--
 3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index baca85e..117d0f9 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -10,9 +10,9 @@ STEXI
 ETEXI
 
 DEF("bench", img_bench,
-"bench [-c count] [-d depth] [-f fmt] [-n] [--pattern=pattern] [-q] [-s 
buffer_size] [-t cache] [-w] filename")
+"bench [-c count] [-d depth] [-f fmt] [-n] [-o offset] [--pattern=pattern] 
[-q] [-s buffer_size] [-t cache] [-w] filename")
 STEXI
-@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] 
[--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] 
@var{filename}
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o 
@var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t 
@var{cache}] [-w] @var{filename}
 ETEXI
 
 DEF("check", img_check,
diff --git a/qemu-img.c b/qemu-img.c
index 85d1353..480ef8d 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3515,6 +3515,7 @@ static int img_bench(int argc, char **argv)
 bool is_write = false;
 int count = 75000;
 int depth = 64;
+int64_t offset = 0;
 size_t bufsize = 4096;
 int pattern = 0;
 int64_t image_size;
@@ -3532,7 +3533,7 @@ static int img_bench(int argc, char **argv)
 {"pattern", required_argument, 0, OPTION_PATTERN},
 {0, 0, 0, 0}
 };
-c = getopt_long(argc, argv, "hc:d:f:nqs:t:w", long_options, NULL);
+c = getopt_long(argc, argv, "hc:d:f:no:qs:t:w", long_options, NULL);
 if (c == -1) {
 break;
 }
@@ -3570,6 +3571,19 @@ static int img_bench(int argc, char **argv)
 case 'n':
 flags |= BDRV_O_NATIVE_AIO;
 break;
+case 'o':
+{
+char *end;
+errno = 0;
+offset = qemu_strtosz_suffix(optarg, ,
+ QEMU_STRTOSZ_DEFSUFFIX_B);
+if (offset < 0|| *end) {
+error_report("Invalid offset specified");
+return 1;
+}
+break;
+}
+break;
 case 'q':
 quiet = true;
 break;
@@ -3639,10 +3653,13 @@ static int img_bench(int argc, char **argv)
 .bufsize= bufsize,
 .nrreq  = depth,
 .n  = count,
+.offset = offset,
 .write  = is_write,
 };
-printf("Sending %d %s requests, %d bytes each, %d in parallel\n",
-   data.n, data.write ? "write" : "read", data.bufsize, data.nrreq);
+printf("Sending %d %s requests, %d bytes each, %d in parallel "
+   "(starting at offset %" PRId64 ")\n",
+   data.n, data.write ? "write" : "read", data.bufsize, data.nrreq,
+   data.offset);
 
 data.buf = blk_blockalign(blk, data.nrreq * data.bufsize);
 memset(data.buf, pattern, data.nrreq * data.bufsize);
diff --git a/qemu-img.texi b/qemu-img.texi
index c477fbf..9bffad2 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -131,13 +131,14 @@ Skip the creation of the target volume
 Command description:
 
 @table @option
-@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] 
[--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] 
@var{filename}
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o 
@var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t 
@var{cache}] [-w] @var{filename}
 
 Run a simple sequential I/O benchmark on the specified image. If @code{-w} is
 specified, a write test is performed, otherwise a read test is performed.
 
 A total number of @var{count} I/O requests is performed, each @var{buffer_size}
-bytes in size, and with @var{depth} requests in parallel.
+bytes in size, and with @var{depth} requests in parallel. The first request
+starts at the position given by @var{offset}.
 
 If @code{-n} is specified, the native AIO backend is used if possible. On
 Linux, this option only works if @code{-t none} or @code{-t directsync} is
-- 
1.8.3.1




[Qemu-block] [PATCH v2 2/5] qemu-img bench: Sequential writes

2016-06-06 Thread Kevin Wolf
This extends qemu-img bench with an option that makes it use sequential
writes instead of reads for the test run.

Signed-off-by: Kevin Wolf 
---
 qemu-img-cmds.hx |  4 ++--
 qemu-img.c   | 38 +-
 qemu-img.texi| 13 +
 3 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index f3bd546..baca85e 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -10,9 +10,9 @@ STEXI
 ETEXI
 
 DEF("bench", img_bench,
-"bench [-c count] [-d depth] [-f fmt] [-n] [-q] [-s buffer_size] [-t 
cache] filename")
+"bench [-c count] [-d depth] [-f fmt] [-n] [--pattern=pattern] [-q] [-s 
buffer_size] [-t cache] [-w] filename")
 STEXI
-@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-q] [-s 
@var{buffer_size}] [-t @var{cache}] @var{filename}
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] 
[--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] 
@var{filename}
 ETEXI
 
 DEF("check", img_check,
diff --git a/qemu-img.c b/qemu-img.c
index d471d10..85d1353 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -53,6 +53,7 @@ enum {
 OPTION_BACKING_CHAIN = 257,
 OPTION_OBJECT = 258,
 OPTION_IMAGE_OPTS = 259,
+OPTION_PATTERN = 260,
 };
 
 typedef enum OutputFormat {
@@ -3462,6 +3463,7 @@ out_no_progress:
 typedef struct BenchData {
 BlockBackend *blk;
 uint64_t image_size;
+bool write;
 int bufsize;
 int nrreq;
 int n;
@@ -3487,8 +3489,13 @@ static void bench_cb(void *opaque, int ret)
 }
 
 while (b->n > b->in_flight && b->in_flight < b->nrreq) {
-acb = blk_aio_preadv(b->blk, b->offset, b->qiov, 0,
- bench_cb, b);
+if (b->write) {
+acb = blk_aio_pwritev(b->blk, b->offset, b->qiov, 0,
+  bench_cb, b);
+} else {
+acb = blk_aio_preadv(b->blk, b->offset, b->qiov, 0,
+ bench_cb, b);
+}
 if (!acb) {
 error_report("Failed to issue request");
 exit(EXIT_FAILURE);
@@ -3505,9 +3512,11 @@ static int img_bench(int argc, char **argv)
 const char *fmt = NULL, *filename;
 bool quiet = false;
 bool image_opts = false;
+bool is_write = false;
 int count = 75000;
 int depth = 64;
 size_t bufsize = 4096;
+int pattern = 0;
 int64_t image_size;
 BlockBackend *blk = NULL;
 BenchData data = {};
@@ -3520,9 +3529,10 @@ static int img_bench(int argc, char **argv)
 static const struct option long_options[] = {
 {"help", no_argument, 0, 'h'},
 {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS},
+{"pattern", required_argument, 0, OPTION_PATTERN},
 {0, 0, 0, 0}
 };
-c = getopt_long(argc, argv, "hc:d:f:nqs:t:", long_options, NULL);
+c = getopt_long(argc, argv, "hc:d:f:nqs:t:w", long_options, NULL);
 if (c == -1) {
 break;
 }
@@ -3585,6 +3595,21 @@ static int img_bench(int argc, char **argv)
 goto out;
 }
 break;
+case 'w':
+flags |= BDRV_O_RDWR;
+is_write = true;
+break;
+case OPTION_PATTERN:
+{
+char *end;
+errno = 0;
+pattern = strtoul(optarg, , 0);
+if (errno || *end || pattern > 0xff) {
+error_report("Invalid pattern byte specified");
+return 1;
+}
+break;
+}
 case OPTION_IMAGE_OPTS:
 image_opts = true;
 break;
@@ -3614,11 +3639,14 @@ static int img_bench(int argc, char **argv)
 .bufsize= bufsize,
 .nrreq  = depth,
 .n  = count,
+.write  = is_write,
 };
-printf("Sending %d requests, %d bytes each, %d in parallel\n",
-data.n, data.bufsize, data.nrreq);
+printf("Sending %d %s requests, %d bytes each, %d in parallel\n",
+   data.n, data.write ? "write" : "read", data.bufsize, data.nrreq);
 
 data.buf = blk_blockalign(blk, data.nrreq * data.bufsize);
+memset(data.buf, pattern, data.nrreq * data.bufsize);
+
 data.qiov = g_new(QEMUIOVector, data.nrreq);
 for (i = 0; i < data.nrreq; i++) {
 qemu_iovec_init([i], 1);
diff --git a/qemu-img.texi b/qemu-img.texi
index b6b28e3..c477fbf 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -131,16 +131,21 @@ Skip the creation of the target volume
 Command description:
 
 @table @option
-@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-q] [-s 
@var{buffer_size}] [-t @var{cache}] @var{filename}
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] 
[--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] 
@var{filename}
 
-Run a simple sequential read benchmark on the specified image. A total 

[Qemu-block] [PATCH v2 0/5] block: Introduce qemu-img bench

2016-06-06 Thread Kevin Wolf
After merging Den's qcow2 patch to avoid duplicated flushes, I thought I would
be nice to reproduce the problem and I remembered 'qemu-img bench', which I had
posted before as part of more than one RFC series, but which never made it to
master somehow. So here is a rebased and cleaned up version of it, just by
itself, so that it hopefully can be merged finally.

Of course, I failed to actually reproduce the problem on my laptop. Who knows,
something on my system might be more intelligent about useless flushes, or
maybe I just misunderstood what the problematic scenario looks like at the
block level. Doesn't make the tool less useful, though, and I already did the
rebasing.

v2:
- Added --pattern=... option for writes [Den]
- Added --no-drain option for write+flush tests [Den]

Kevin Wolf (5):
  qemu-img bench
  qemu-img bench: Sequential writes
  qemu-img bench: Make start offset configurable
  qemu-img bench: Implement -S (step size)
  qemu-img bench: Add --flush-interval

 qemu-img-cmds.hx |   6 +
 qemu-img.c   | 329 +++
 qemu-img.texi|  24 
 3 files changed, 359 insertions(+)

-- 
1.8.3.1




[Qemu-block] [PATCH v2 4/5] qemu-img bench: Implement -S (step size)

2016-06-06 Thread Kevin Wolf
With this new option, qemu-img bench can be told to advance the current
offset after each request by a different value than the buffer size.
This is useful for controlling the conditions for cluster allocation in
image formats (e.g. qcow2 cluster allocation with COW in front of the
request, or COW areas that aren't overwritten immediately).

Signed-off-by: Kevin Wolf 
Reviewed-by: Denis V. Lunev 
---
 qemu-img-cmds.hx |  4 ++--
 qemu-img.c   | 25 +
 qemu-img.texi|  6 --
 3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index 117d0f9..05a2991 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -10,9 +10,9 @@ STEXI
 ETEXI
 
 DEF("bench", img_bench,
-"bench [-c count] [-d depth] [-f fmt] [-n] [-o offset] [--pattern=pattern] 
[-q] [-s buffer_size] [-t cache] [-w] filename")
+"bench [-c count] [-d depth] [-f fmt] [-n] [-o offset] [--pattern=pattern] 
[-q] [-s buffer_size] [-S step_size] [-t cache] [-w] filename")
 STEXI
-@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o 
@var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t 
@var{cache}] [-w] @var{filename}
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o 
@var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S 
@var{step_size}] [-t @var{cache}] [-w] @var{filename}
 ETEXI
 
 DEF("check", img_check,
diff --git a/qemu-img.c b/qemu-img.c
index 480ef8d..c5e2638 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3465,6 +3465,7 @@ typedef struct BenchData {
 uint64_t image_size;
 bool write;
 int bufsize;
+int step;
 int nrreq;
 int n;
 uint8_t *buf;
@@ -3501,7 +3502,7 @@ static void bench_cb(void *opaque, int ret)
 exit(EXIT_FAILURE);
 }
 b->in_flight++;
-b->offset += b->bufsize;
+b->offset += b->step;
 b->offset %= b->image_size;
 }
 }
@@ -3518,6 +3519,7 @@ static int img_bench(int argc, char **argv)
 int64_t offset = 0;
 size_t bufsize = 4096;
 int pattern = 0;
+size_t step = 0;
 int64_t image_size;
 BlockBackend *blk = NULL;
 BenchData data = {};
@@ -3533,7 +3535,7 @@ static int img_bench(int argc, char **argv)
 {"pattern", required_argument, 0, OPTION_PATTERN},
 {0, 0, 0, 0}
 };
-c = getopt_long(argc, argv, "hc:d:f:no:qs:t:w", long_options, NULL);
+c = getopt_long(argc, argv, "hc:d:f:no:qs:S:t:w", long_options, NULL);
 if (c == -1) {
 break;
 }
@@ -3601,6 +3603,20 @@ static int img_bench(int argc, char **argv)
 bufsize = sval;
 break;
 }
+case 'S':
+{
+int64_t sval;
+char *end;
+
+sval = qemu_strtosz_suffix(optarg, , QEMU_STRTOSZ_DEFSUFFIX_B);
+if (sval < 0 || sval > INT_MAX || *end) {
+error_report("Invalid step size specified");
+return 1;
+}
+
+step = sval;
+break;
+}
 case 't':
 ret = bdrv_parse_cache_mode(optarg, , );
 if (ret < 0) {
@@ -3651,15 +3667,16 @@ static int img_bench(int argc, char **argv)
 .blk= blk,
 .image_size = image_size,
 .bufsize= bufsize,
+.step   = step ?: bufsize,
 .nrreq  = depth,
 .n  = count,
 .offset = offset,
 .write  = is_write,
 };
 printf("Sending %d %s requests, %d bytes each, %d in parallel "
-   "(starting at offset %" PRId64 ")\n",
+   "(starting at offset %" PRId64 ", step size %d)\n",
data.n, data.write ? "write" : "read", data.bufsize, data.nrreq,
-   data.offset);
+   data.offset, data.step);
 
 data.buf = blk_blockalign(blk, data.nrreq * data.bufsize);
 memset(data.buf, pattern, data.nrreq * data.bufsize);
diff --git a/qemu-img.texi b/qemu-img.texi
index 9bffad2..ccc0b51 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -131,14 +131,16 @@ Skip the creation of the target volume
 Command description:
 
 @table @option
-@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o 
@var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-t 
@var{cache}] [-w] @var{filename}
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o 
@var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S 
@var{step_size}] [-t @var{cache}] [-w] @var{filename}
 
 Run a simple sequential I/O benchmark on the specified image. If @code{-w} is
 specified, a write test is performed, otherwise a read test is performed.
 
 A total number of @var{count} I/O requests is performed, each @var{buffer_size}
 bytes in size, and with @var{depth} requests in parallel. The first request
-starts at the position given by @var{offset}.
+starts at the position given by 

[Qemu-block] [PATCH v2 5/5] qemu-img bench: Add --flush-interval

2016-06-06 Thread Kevin Wolf
This options allows to flush the image periodically during write tests.

Signed-off-by: Kevin Wolf 
---
 qemu-img-cmds.hx |  4 +--
 qemu-img.c   | 95 ++--
 qemu-img.texi|  8 -
 3 files changed, 95 insertions(+), 12 deletions(-)

diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index 05a2991..7e95b2d 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -10,9 +10,9 @@ STEXI
 ETEXI
 
 DEF("bench", img_bench,
-"bench [-c count] [-d depth] [-f fmt] [-n] [-o offset] [--pattern=pattern] 
[-q] [-s buffer_size] [-S step_size] [-t cache] [-w] filename")
+"bench [-c count] [-d depth] [-f fmt] [--flush-interval=flush_interval] 
[-n] [--no-drain] [-o offset] [--pattern=pattern] [-q] [-s buffer_size] [-S 
step_size] [-t cache] [-w] filename")
 STEXI
-@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o 
@var{offset}] [--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S 
@var{step_size}] [-t @var{cache}] [-w] @var{filename}
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] 
[--flush-interval=@var{flush_interval}] [-n] [--no-drain] [-o @var{offset}] 
[--pattern=@var{pattern}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t 
@var{cache}] [-w] @var{filename}
 ETEXI
 
 DEF("check", img_check,
diff --git a/qemu-img.c b/qemu-img.c
index c5e2638..04cddab 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -54,6 +54,8 @@ enum {
 OPTION_OBJECT = 258,
 OPTION_IMAGE_OPTS = 259,
 OPTION_PATTERN = 260,
+OPTION_FLUSH_INTERVAL = 261,
+OPTION_NO_DRAIN = 262,
 };
 
 typedef enum OutputFormat {
@@ -3468,13 +3470,24 @@ typedef struct BenchData {
 int step;
 int nrreq;
 int n;
+int flush_interval;
+bool drain_on_flush;
 uint8_t *buf;
 QEMUIOVector *qiov;
 
 int in_flight;
+bool in_flush;
 uint64_t offset;
 } BenchData;
 
+static void bench_undrained_flush_cb(void *opaque, int ret)
+{
+if (ret < 0) {
+error_report("Failed flush request: %s\n", strerror(-ret));
+exit(EXIT_FAILURE);
+}
+}
+
 static void bench_cb(void *opaque, int ret)
 {
 BenchData *b = opaque;
@@ -3484,9 +3497,39 @@ static void bench_cb(void *opaque, int ret)
 error_report("Failed request: %s\n", strerror(-ret));
 exit(EXIT_FAILURE);
 }
-if (b->in_flight > 0) {
+
+if (b->in_flush) {
+/* Just finished a flush with drained queue: Start next requests */
+assert(b->in_flight == 0);
+b->in_flush = false;
+} else if (b->in_flight > 0) {
+int remaining = b->n - b->in_flight;
+
 b->n--;
 b->in_flight--;
+
+/* Time for flush? Drain queue if requested, then flush */
+if (b->flush_interval && remaining % b->flush_interval == 0) {
+if (!b->in_flight || !b->drain_on_flush) {
+BlockCompletionFunc *cb;
+
+if (b->drain_on_flush) {
+b->in_flush = true;
+cb = bench_cb;
+} else {
+cb = bench_undrained_flush_cb;
+}
+
+acb = blk_aio_flush(b->blk, cb, b);
+if (!acb) {
+error_report("Failed to issue flush request");
+exit(EXIT_FAILURE);
+}
+}
+if (b->drain_on_flush) {
+return;
+}
+}
 }
 
 while (b->n > b->in_flight && b->in_flight < b->nrreq) {
@@ -3520,6 +3563,8 @@ static int img_bench(int argc, char **argv)
 size_t bufsize = 4096;
 int pattern = 0;
 size_t step = 0;
+int flush_interval = 0;
+bool drain_on_flush = true;
 int64_t image_size;
 BlockBackend *blk = NULL;
 BenchData data = {};
@@ -3531,8 +3576,10 @@ static int img_bench(int argc, char **argv)
 for (;;) {
 static const struct option long_options[] = {
 {"help", no_argument, 0, 'h'},
+{"flush-interval", required_argument, 0, OPTION_FLUSH_INTERVAL},
 {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS},
 {"pattern", required_argument, 0, OPTION_PATTERN},
+{"no-drain", no_argument, 0, OPTION_NO_DRAIN},
 {0, 0, 0, 0}
 };
 c = getopt_long(argc, argv, "hc:d:f:no:qs:S:t:w", long_options, NULL);
@@ -3640,6 +3687,20 @@ static int img_bench(int argc, char **argv)
 }
 break;
 }
+case OPTION_FLUSH_INTERVAL:
+{
+char *end;
+errno = 0;
+flush_interval = strtoul(optarg, , 0);
+if (errno || *end || flush_interval > INT_MAX) {
+error_report("Invalid flush interval specified");
+return 1;
+}
+break;
+}
+case OPTION_NO_DRAIN:
+drain_on_flush = false;
+break;
 case OPTION_IMAGE_OPTS:
 image_opts = true;
 break;
@@ 

[Qemu-block] [PATCH v2 1/5] qemu-img bench

2016-06-06 Thread Kevin Wolf
This adds a qemu-img command that allows doing some simple benchmarks
for the block layer without involving guest devices and a real VM.

For the start, this implements only a test of sequential reads.

Signed-off-by: Kevin Wolf 
Reviewed-by: Denis V. Lunev 
---
 qemu-img-cmds.hx |   6 ++
 qemu-img.c   | 190 +++
 qemu-img.texi|  10 +++
 3 files changed, 206 insertions(+)

diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index e7cded6..f3bd546 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -9,6 +9,12 @@ STEXI
 @table @option
 ETEXI
 
+DEF("bench", img_bench,
+"bench [-c count] [-d depth] [-f fmt] [-n] [-q] [-s buffer_size] [-t 
cache] filename")
+STEXI
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-q] [-s 
@var{buffer_size}] [-t @var{cache}] @var{filename}
+ETEXI
+
 DEF("check", img_check,
 "check [-q] [--object objectdef] [--image-opts] [-f fmt] [--output=ofmt] 
[-r [leaks | all]] [-T src_cache] filename")
 STEXI
diff --git a/qemu-img.c b/qemu-img.c
index 4b56ad3..d471d10 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3459,6 +3459,196 @@ out_no_progress:
 return 0;
 }
 
+typedef struct BenchData {
+BlockBackend *blk;
+uint64_t image_size;
+int bufsize;
+int nrreq;
+int n;
+uint8_t *buf;
+QEMUIOVector *qiov;
+
+int in_flight;
+uint64_t offset;
+} BenchData;
+
+static void bench_cb(void *opaque, int ret)
+{
+BenchData *b = opaque;
+BlockAIOCB *acb;
+
+if (ret < 0) {
+error_report("Failed request: %s\n", strerror(-ret));
+exit(EXIT_FAILURE);
+}
+if (b->in_flight > 0) {
+b->n--;
+b->in_flight--;
+}
+
+while (b->n > b->in_flight && b->in_flight < b->nrreq) {
+acb = blk_aio_preadv(b->blk, b->offset, b->qiov, 0,
+ bench_cb, b);
+if (!acb) {
+error_report("Failed to issue request");
+exit(EXIT_FAILURE);
+}
+b->in_flight++;
+b->offset += b->bufsize;
+b->offset %= b->image_size;
+}
+}
+
+static int img_bench(int argc, char **argv)
+{
+int c, ret = 0;
+const char *fmt = NULL, *filename;
+bool quiet = false;
+bool image_opts = false;
+int count = 75000;
+int depth = 64;
+size_t bufsize = 4096;
+int64_t image_size;
+BlockBackend *blk = NULL;
+BenchData data = {};
+int flags = 0;
+bool writethrough;
+struct timeval t1, t2;
+int i;
+
+for (;;) {
+static const struct option long_options[] = {
+{"help", no_argument, 0, 'h'},
+{"image-opts", no_argument, 0, OPTION_IMAGE_OPTS},
+{0, 0, 0, 0}
+};
+c = getopt_long(argc, argv, "hc:d:f:nqs:t:", long_options, NULL);
+if (c == -1) {
+break;
+}
+
+switch (c) {
+case 'h':
+case '?':
+help();
+break;
+case 'c':
+{
+char *end;
+errno = 0;
+count = strtoul(optarg, , 0);
+if (errno || *end || count > INT_MAX) {
+error_report("Invalid request count specified");
+return 1;
+}
+break;
+}
+case 'd':
+{
+char *end;
+errno = 0;
+depth = strtoul(optarg, , 0);
+if (errno || *end || depth > INT_MAX) {
+error_report("Invalid queue depth specified");
+return 1;
+}
+break;
+}
+case 'f':
+fmt = optarg;
+break;
+case 'n':
+flags |= BDRV_O_NATIVE_AIO;
+break;
+case 'q':
+quiet = true;
+break;
+case 's':
+{
+int64_t sval;
+char *end;
+
+sval = qemu_strtosz_suffix(optarg, , QEMU_STRTOSZ_DEFSUFFIX_B);
+if (sval < 0 || sval > INT_MAX || *end) {
+error_report("Invalid buffer size specified");
+return 1;
+}
+
+bufsize = sval;
+break;
+}
+case 't':
+ret = bdrv_parse_cache_mode(optarg, , );
+if (ret < 0) {
+error_report("Invalid cache mode");
+ret = -1;
+goto out;
+}
+break;
+case OPTION_IMAGE_OPTS:
+image_opts = true;
+break;
+}
+}
+
+if (optind != argc - 1) {
+error_exit("Expecting one image file name");
+}
+filename = argv[argc - 1];
+
+blk = img_open(image_opts, filename, fmt, flags, writethrough, quiet);
+if (!blk) {
+ret = -1;
+goto out;
+}
+
+image_size = blk_getlength(blk);
+if (image_size < 0) {
+ret = image_size;
+goto out;
+}
+
+data = (BenchData) {
+.blk

Re: [Qemu-block] [PATCH 5/5] qemu-img bench: Add --flush-interval

2016-06-06 Thread Kevin Wolf
Am 03.06.2016 um 17:16 hat Kevin Wolf geschrieben:
> Am 03.06.2016 um 17:01 hat Denis V. Lunev geschrieben:
> > On 06/03/2016 03:30 PM, Kevin Wolf wrote:
> > >This options allows to flush the image periodically during write tests.
> > >
> > >Signed-off-by: Kevin Wolf 
> > This pattern could be a bit different - you wait requests to finish
> > and after that start flush. In this case there is no gain with my
> > patch ;)
> > 
> > You should treat flush as ordinary write request i.e. place
> > request immediately without waiting for writes to finish.
> > May be this could be specified as an operation mode.
> 
> Hm, okay, I can try that.

Doesn't make a difference for me. But I'll include the option in v2
anyway.

Kevin



Re: [Qemu-block] [PATCH] block: Fix bdrv_all_delete_snapshot() error handling

2016-06-06 Thread Max Reitz
On 06.06.2016 12:56, Kevin Wolf wrote:
> The code to exit the loop after bdrv_snapshot_delete_by_id_or_name()
> returned failure was duplicated. The first copy of it was too early so
> that the AioContext lock would not be freed. This patch removes it so
> that only the second, correct copy remains.
> 
> Signed-off-by: Kevin Wolf 
> ---
>  block/snapshot.c | 3 ---
>  1 file changed, 3 deletions(-)

Reviewed-by: Max Reitz 



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH] iotests: 095: Clean up QEMU before showing image info

2016-06-06 Thread Max Reitz
On 03.06.2016 11:07, Fam Zheng wrote:
> Somehow in my locking series, I missed this case where concurrent access
> to an image is performed, perhaps we can remove this case independently.
> 
> Signed-off-by: Fam Zheng 
> ---
>  tests/qemu-iotests/095 | 2 ++
>  1 file changed, 2 insertions(+)

Thanks, Fam, I've removed the commit message and applied the patch to my
block branch:

https://github.com/XanClic/qemu/commits/block

Max



signature.asc
Description: OpenPGP digital signature


Re: [Qemu-block] [PATCH] iotests: 095: Clean up QEMU before showing image info

2016-06-06 Thread Max Reitz
On 03.06.2016 11:07, Fam Zheng wrote:
> Somehow in my locking series, I missed this case where concurrent access
> to an image is performed, perhaps we can remove this case independently.
> 
> Signed-off-by: Fam Zheng 
> ---
>  tests/qemu-iotests/095 | 2 ++
>  1 file changed, 2 insertions(+)

Thanks, Fam, I've removed the commit message and applied the patch to my
block branch:

https://github.com/XanClic/qemu/commits/block



signature.asc
Description: OpenPGP digital signature


[Qemu-block] [PATCH] block: Fix bdrv_all_delete_snapshot() error handling

2016-06-06 Thread Kevin Wolf
The code to exit the loop after bdrv_snapshot_delete_by_id_or_name()
returned failure was duplicated. The first copy of it was too early so
that the AioContext lock would not be freed. This patch removes it so
that only the second, correct copy remains.

Signed-off-by: Kevin Wolf 
---
 block/snapshot.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/block/snapshot.c b/block/snapshot.c
index 6e6e34f..da89d2b 100644
--- a/block/snapshot.c
+++ b/block/snapshot.c
@@ -409,9 +409,6 @@ int bdrv_all_delete_snapshot(const char *name, 
BlockDriverState **first_bad_bs,
 if (bdrv_can_snapshot(bs) &&
 bdrv_snapshot_find(bs, snapshot, name) >= 0) {
 ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err);
-if (ret < 0) {
-goto fail;
-}
 }
 aio_context_release(ctx);
 if (ret < 0) {
-- 
1.8.3.1




Re: [Qemu-block] [PATCH 4/5] qemu-img bench: Implement -S (step size)

2016-06-06 Thread Denis V. Lunev

On 06/03/2016 03:30 PM, Kevin Wolf wrote:

With this new option, qemu-img bench can be told to advance the current
offset after each request by a different value than the buffer size.
This is useful for controlling the conditions for cluster allocation in
image formats (e.g. qcow2 cluster allocation with COW in front of the
request, or COW areas that aren't overwritten immediately).

Signed-off-by: Kevin Wolf 
---
  qemu-img-cmds.hx |  4 ++--
  qemu-img.c   | 25 +
  qemu-img.texi|  6 --
  3 files changed, 27 insertions(+), 8 deletions(-)

diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index b9b521e..f5d0098 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -10,9 +10,9 @@ STEXI
  ETEXI
  
  DEF("bench", img_bench,

-"bench [-c count] [-d depth] [-f fmt] [-n] [-o offset] [-q] [-s buffer_size] 
[-t cache] [-w] filename")
+"bench [-c count] [-d depth] [-f fmt] [-n] [-o offset] [-q] [-s buffer_size] 
[-S step_size] [-t cache] [-w] filename")
  STEXI
-@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o 
@var{offset}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] @var{filename}
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o 
@var{offset}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] 
[-w] @var{filename}
  ETEXI
  
  DEF("check", img_check,

diff --git a/qemu-img.c b/qemu-img.c
index fcac8b8..ff99181 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3464,6 +3464,7 @@ typedef struct BenchData {
  uint64_t image_size;
  bool write;
  int bufsize;
+int step;
  int nrreq;
  int n;
  uint8_t *buf;
@@ -3500,7 +3501,7 @@ static void bench_cb(void *opaque, int ret)
  exit(EXIT_FAILURE);
  }
  b->in_flight++;
-b->offset += b->bufsize;
+b->offset += b->step;
  b->offset %= b->image_size;
  }
  }
@@ -3516,6 +3517,7 @@ static int img_bench(int argc, char **argv)
  int depth = 64;
  int64_t offset = 0;
  size_t bufsize = 4096;
+size_t step = 0;
  int64_t image_size;
  BlockBackend *blk = NULL;
  BenchData data = {};
@@ -3530,7 +3532,7 @@ static int img_bench(int argc, char **argv)
  {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS},
  {0, 0, 0, 0}
  };
-c = getopt_long(argc, argv, "hc:d:f:no:qs:t:w", long_options, NULL);
+c = getopt_long(argc, argv, "hc:d:f:no:qs:S:t:w", long_options, NULL);
  if (c == -1) {
  break;
  }
@@ -3598,6 +3600,20 @@ static int img_bench(int argc, char **argv)
  bufsize = sval;
  break;
  }
+case 'S':
+{
+int64_t sval;
+char *end;
+
+sval = qemu_strtosz_suffix(optarg, , QEMU_STRTOSZ_DEFSUFFIX_B);
+if (sval < 0 || sval > INT_MAX || *end) {
+error_report("Invalid step size specified");
+return 1;
+}
+
+step = sval;
+break;
+}
  case 't':
  ret = bdrv_parse_cache_mode(optarg, , );
  if (ret < 0) {
@@ -3637,15 +3653,16 @@ static int img_bench(int argc, char **argv)
  .blk= blk,
  .image_size = image_size,
  .bufsize= bufsize,
+.step   = step ?: bufsize,
  .nrreq  = depth,
  .n  = count,
  .offset = offset,
  .write  = is_write,
  };
  printf("Sending %d %s requests, %d bytes each, %d in parallel "
-   "(starting at offset %" PRId64 ")\n",
+   "(starting at offset %" PRId64 ", step size %d)\n",
 data.n, data.write ? "write" : "read", data.bufsize, data.nrreq,
-   data.offset);
+   data.offset, data.step);
  
  data.buf = blk_blockalign(blk, data.nrreq * data.bufsize);

  memset(data.buf, 0, data.nrreq * data.bufsize);
diff --git a/qemu-img.texi b/qemu-img.texi
index c37380e..6b03d3f 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -131,14 +131,16 @@ Skip the creation of the target volume
  Command description:
  
  @table @option

-@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o 
@var{offset}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] @var{filename}
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o 
@var{offset}] [-q] [-s @var{buffer_size}] [-S @var{step_size}] [-t @var{cache}] 
[-w] @var{filename}
  
  Run a simple sequential I/O benchmark on the specified image. If @code{-w} is

  specified, a write test is performed, otherwise a read test is performed.
  
  A total number of @var{count} I/O requests is performed, each @var{buffer_size}

  bytes in size, and with @var{depth} requests in parallel. The first request
-starts at the position given by @var{offset}.
+starts at the position given by @var{offset}, each following request increases
+the current position by @var{step_size}. 

Re: [Qemu-block] [PATCH 3/5] qemu-img bench: Make start offset configurable

2016-06-06 Thread Denis V. Lunev

On 06/03/2016 03:30 PM, Kevin Wolf wrote:

This patch adds an option the specify the offset of the first request
made by qemu-img bench. This allows to benchmark misaligned requests.

Signed-off-by: Kevin Wolf 
---
  qemu-img-cmds.hx |  4 ++--
  qemu-img.c   | 23 ---
  qemu-img.texi|  5 +++--
  3 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index d651674..b9b521e 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -10,9 +10,9 @@ STEXI
  ETEXI
  
  DEF("bench", img_bench,

-"bench [-c count] [-d depth] [-f fmt] [-n] [-q] [-s buffer_size] [-t cache] 
[-w] filename")
+"bench [-c count] [-d depth] [-f fmt] [-n] [-o offset] [-q] [-s buffer_size] 
[-t cache] [-w] filename")
  STEXI
-@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-q] [-s 
@var{buffer_size}] [-t @var{cache}] [-w] @var{filename}
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o 
@var{offset}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] @var{filename}
  ETEXI
  
  DEF("check", img_check,

diff --git a/qemu-img.c b/qemu-img.c
index 142efb1..fcac8b8 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3514,6 +3514,7 @@ static int img_bench(int argc, char **argv)
  bool is_write = false;
  int count = 75000;
  int depth = 64;
+int64_t offset = 0;
  size_t bufsize = 4096;
  int64_t image_size;
  BlockBackend *blk = NULL;
@@ -3529,7 +3530,7 @@ static int img_bench(int argc, char **argv)
  {"image-opts", no_argument, 0, OPTION_IMAGE_OPTS},
  {0, 0, 0, 0}
  };
-c = getopt_long(argc, argv, "hc:d:f:nqs:t:w", long_options, NULL);
+c = getopt_long(argc, argv, "hc:d:f:no:qs:t:w", long_options, NULL);
  if (c == -1) {
  break;
  }
@@ -3567,6 +3568,19 @@ static int img_bench(int argc, char **argv)
  case 'n':
  flags |= BDRV_O_NATIVE_AIO;
  break;
+case 'o':
+{
+char *end;
+errno = 0;
+offset = qemu_strtosz_suffix(optarg, ,
+ QEMU_STRTOSZ_DEFSUFFIX_B);
+if (offset < 0|| *end) {
+error_report("Invalid offset specified");
+return 1;
+}
+break;
+}
+break;
  case 'q':
  quiet = true;
  break;
@@ -3625,10 +3639,13 @@ static int img_bench(int argc, char **argv)
  .bufsize= bufsize,
  .nrreq  = depth,
  .n  = count,
+.offset = offset,
  .write  = is_write,
  };
-printf("Sending %d %s requests, %d bytes each, %d in parallel\n",
-   data.n, data.write ? "write" : "read", data.bufsize, data.nrreq);
+printf("Sending %d %s requests, %d bytes each, %d in parallel "
+   "(starting at offset %" PRId64 ")\n",
+   data.n, data.write ? "write" : "read", data.bufsize, data.nrreq,
+   data.offset);
  
  data.buf = blk_blockalign(blk, data.nrreq * data.bufsize);

  memset(data.buf, 0, data.nrreq * data.bufsize);
diff --git a/qemu-img.texi b/qemu-img.texi
index 34e94db..c37380e 100644
--- a/qemu-img.texi
+++ b/qemu-img.texi
@@ -131,13 +131,14 @@ Skip the creation of the target volume
  Command description:
  
  @table @option

-@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-q] [-s 
@var{buffer_size}] [-t @var{cache}] [-w] @var{filename}
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-o 
@var{offset}] [-q] [-s @var{buffer_size}] [-t @var{cache}] [-w] @var{filename}
  
  Run a simple sequential I/O benchmark on the specified image. If @code{-w} is

  specified, a write test is performed, otherwise a read test is performed.
  
  A total number of @var{count} I/O requests is performed, each @var{buffer_size}

-bytes in size, and with @var{depth} requests in parallel.
+bytes in size, and with @var{depth} requests in parallel. The first request
+starts at the position given by @var{offset}.
  
  If @code{-n} is specified, the native AIO backend is used if possible. On

  Linux, this option only works if @code{-t none} or @code{-t directsync} is

Reviewed-by: Denis V. Lunev 



Re: [Qemu-block] [PATCH 1/5] qemu-img bench

2016-06-06 Thread Denis V. Lunev

On 06/03/2016 03:30 PM, Kevin Wolf wrote:

This adds a qemu-img command that allows doing some simple benchmarks
for the block layer without involving guest devices and a real VM.

For the start, this implements only a test of sequential reads.

Signed-off-by: Kevin Wolf 
---
  qemu-img-cmds.hx |   6 ++
  qemu-img.c   | 190 +++
  qemu-img.texi|  10 +++
  3 files changed, 206 insertions(+)

diff --git a/qemu-img-cmds.hx b/qemu-img-cmds.hx
index e7cded6..f3bd546 100644
--- a/qemu-img-cmds.hx
+++ b/qemu-img-cmds.hx
@@ -9,6 +9,12 @@ STEXI
  @table @option
  ETEXI
  
+DEF("bench", img_bench,

+"bench [-c count] [-d depth] [-f fmt] [-n] [-q] [-s buffer_size] [-t cache] 
filename")
+STEXI
+@item bench [-c @var{count}] [-d @var{depth}] [-f @var{fmt}] [-n] [-q] [-s 
@var{buffer_size}] [-t @var{cache}] @var{filename}
+ETEXI
+
  DEF("check", img_check,
  "check [-q] [--object objectdef] [--image-opts] [-f fmt] [--output=ofmt] [-r 
[leaks | all]] [-T src_cache] filename")
  STEXI
diff --git a/qemu-img.c b/qemu-img.c
index 4b56ad3..d471d10 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -3459,6 +3459,196 @@ out_no_progress:
  return 0;
  }
  
+typedef struct BenchData {

+BlockBackend *blk;
+uint64_t image_size;
+int bufsize;
+int nrreq;
+int n;
+uint8_t *buf;
+QEMUIOVector *qiov;
+
+int in_flight;
+uint64_t offset;
+} BenchData;
+
+static void bench_cb(void *opaque, int ret)
+{
+BenchData *b = opaque;
+BlockAIOCB *acb;
+
+if (ret < 0) {
+error_report("Failed request: %s\n", strerror(-ret));
+exit(EXIT_FAILURE);
+}
+if (b->in_flight > 0) {
+b->n--;
+b->in_flight--;
+}
+
+while (b->n > b->in_flight && b->in_flight < b->nrreq) {
+acb = blk_aio_preadv(b->blk, b->offset, b->qiov, 0,
+ bench_cb, b);
+if (!acb) {
+error_report("Failed to issue request");
+exit(EXIT_FAILURE);
+}
+b->in_flight++;
+b->offset += b->bufsize;
+b->offset %= b->image_size;
+}
+}
+
+static int img_bench(int argc, char **argv)
+{
+int c, ret = 0;
+const char *fmt = NULL, *filename;
+bool quiet = false;
+bool image_opts = false;
+int count = 75000;
+int depth = 64;
+size_t bufsize = 4096;
+int64_t image_size;
+BlockBackend *blk = NULL;
+BenchData data = {};
+int flags = 0;
+bool writethrough;
+struct timeval t1, t2;
+int i;
+
+for (;;) {
+static const struct option long_options[] = {
+{"help", no_argument, 0, 'h'},
+{"image-opts", no_argument, 0, OPTION_IMAGE_OPTS},
+{0, 0, 0, 0}
+};
+c = getopt_long(argc, argv, "hc:d:f:nqs:t:", long_options, NULL);
+if (c == -1) {
+break;
+}
+
+switch (c) {
+case 'h':
+case '?':
+help();
+break;
+case 'c':
+{
+char *end;
+errno = 0;
+count = strtoul(optarg, , 0);
+if (errno || *end || count > INT_MAX) {
+error_report("Invalid request count specified");
+return 1;
+}
+break;
+}
+case 'd':
+{
+char *end;
+errno = 0;
+depth = strtoul(optarg, , 0);
+if (errno || *end || depth > INT_MAX) {
+error_report("Invalid queue depth specified");
+return 1;
+}
+break;
+}
+case 'f':
+fmt = optarg;
+break;
+case 'n':
+flags |= BDRV_O_NATIVE_AIO;
+break;
+case 'q':
+quiet = true;
+break;
+case 's':
+{
+int64_t sval;
+char *end;
+
+sval = qemu_strtosz_suffix(optarg, , QEMU_STRTOSZ_DEFSUFFIX_B);
+if (sval < 0 || sval > INT_MAX || *end) {
+error_report("Invalid buffer size specified");
+return 1;
+}
+
+bufsize = sval;
+break;
+}
+case 't':
+ret = bdrv_parse_cache_mode(optarg, , );
+if (ret < 0) {
+error_report("Invalid cache mode");
+ret = -1;
+goto out;
+}
+break;
+case OPTION_IMAGE_OPTS:
+image_opts = true;
+break;
+}
+}
+
+if (optind != argc - 1) {
+error_exit("Expecting one image file name");
+}
+filename = argv[argc - 1];
+
+blk = img_open(image_opts, filename, fmt, flags, writethrough, quiet);
+if (!blk) {
+ret = -1;
+goto out;
+}
+
+image_size = blk_getlength(blk);
+if (image_size < 0) {
+ret = image_size;
+goto out;
+}
+
+data = (BenchData) {
+   

Re: [Qemu-block] [PULL 01/31] block: Fix bdrv_next() memory leak

2016-06-06 Thread Paolo Bonzini


On 25/05/2016 19:39, Kevin Wolf wrote:
> @@ -395,20 +399,27 @@ int bdrv_all_delete_snapshot(const char *name, 
> BlockDriverState **first_bad_bs,
>  {
>  int ret = 0;
>  BlockDriverState *bs;
> -BdrvNextIterator *it = NULL;
> +BdrvNextIterator it;
>  QEMUSnapshotInfo sn1, *snapshot = 
>  
> -while (ret == 0 && (it = bdrv_next(it, ))) {
> +for (bs = bdrv_first(); bs; bs = bdrv_next()) {
>  AioContext *ctx = bdrv_get_aio_context(bs);
>  
>  aio_context_acquire(ctx);
>  if (bdrv_can_snapshot(bs) &&
>  bdrv_snapshot_find(bs, snapshot, name) >= 0) {
>  ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err);
> +if (ret < 0) {
> +goto fail;
> +}

This is redundant with the check below (and the check below is right;
this one is wrong).

Thanks,

Paolo

>  }
>  aio_context_release(ctx);
> +if (ret < 0) {
> +goto fail;
> +}
>  }
>  
> +fail:
>  *first_bad_bs = bs;
>  return ret;
>  }
> @@ -418,9 +429,9 @@ int bdrv_all_goto_snapshot(const char *name, 
> BlockDriverState **first_bad_bs)
>  {
>  int err = 0;
>  BlockDriverState *bs;
> -BdrvNextIterator *it = NULL;
> +BdrvNextIterator it;
>  
> -while (err == 0 && (it = bdrv_next(it, ))) {
> +for (bs = bdrv_first(); bs; bs = bdrv_next()) {
>  AioContext *ctx = bdrv_get_aio_context(bs);
>  
>  aio_context_acquire(ctx);
> @@ -428,8 +439,12 @@ int bdrv_all_goto_snapshot(const char *name, 
> BlockDriverState **first_bad_bs)
>  err = bdrv_snapshot_goto(bs, name);
>  }
>  aio_context_release(ctx);
> +if (err < 0) {
> +goto fail;
> +}
>  }
>  
> +fail:
>  *first_bad_bs = bs;
>  return err;
>  }
> @@ -439,9 +454,9 @@ int bdrv_all_find_snapshot(const char *name, 
> BlockDriverState **first_bad_bs)
>  QEMUSnapshotInfo sn;
>  int err = 0;
>  BlockDriverState *bs;
> -BdrvNextIterator *it = NULL;
> +BdrvNextIterator it;
>  
> -while (err == 0 && (it = bdrv_next(it, ))) {
> +for (bs = bdrv_first(); bs; bs = bdrv_next()) {
>  AioContext *ctx = bdrv_get_aio_context(bs);
>  
>  aio_context_acquire(ctx);
> @@ -449,8 +464,12 @@ int bdrv_all_find_snapshot(const char *name, 
> BlockDriverState **first_bad_bs)
>  err = bdrv_snapshot_find(bs, , name);
>  }
>  aio_context_release(ctx);
> +if (err < 0) {
> +goto fail;
> +}
>  }
>  
> +fail:
>  *first_bad_bs = bs;
>  return err;
>  }
> @@ -462,9 +481,9 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
>  {
>  int err = 0;
>  BlockDriverState *bs;
> -BdrvNextIterator *it = NULL;
> +BdrvNextIterator it;
>  
> -while (err == 0 && (it = bdrv_next(it, ))) {
> +for (bs = bdrv_first(); bs; bs = bdrv_next()) {
>  AioContext *ctx = bdrv_get_aio_context(bs);
>  
>  aio_context_acquire(ctx);
> @@ -476,24 +495,32 @@ int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn,
>  err = bdrv_snapshot_create(bs, sn);
>  }
>  aio_context_release(ctx);
> +if (err < 0) {
> +goto fail;
> +}
>  }
>  
> +fail:
>  *first_bad_bs = bs;
>  return err;
>  }
>  
>  BlockDriverState *bdrv_all_find_vmstate_bs(void)
>  {
> -bool not_found = true;
>  BlockDriverState *bs;
> -BdrvNextIterator *it = NULL;
> +BdrvNextIterator it;
>  
> -while (not_found && (it = bdrv_next(it, ))) {
> +for (bs = bdrv_first(); bs; bs = bdrv_next()) {
>  AioContext *ctx = bdrv_get_aio_context(bs);
> +bool found;
>  
>  aio_context_acquire(ctx);
> -not_found = !bdrv_can_snapshot(bs);
> +found = bdrv_can_snapshot(bs);
>  aio_context_release(ctx);
> +
> +if (found) {
> +break;
> +}
>  }
>  return bs;
>  }
> diff --git a/blockdev.c b/blockdev.c
> index 40e4e6f..0e37e09 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -4164,9 +4164,9 @@ BlockJobInfoList *qmp_query_block_jobs(Error **errp)
>  {
>  BlockJobInfoList *head = NULL, **p_next = 
>  BlockDriverState *bs;
> -BdrvNextIterator *it = NULL;
> +BdrvNextIterator it;
>  
> -while ((it = bdrv_next(it, ))) {
> +for (bs = bdrv_first(); bs; bs = bdrv_next()) {
>  AioContext *aio_context = bdrv_get_aio_context(bs);
>  
>  aio_context_acquire(aio_context);
> diff --git a/include/block/block.h b/include/block/block.h
> index a8c15e3..770ca26 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -17,7 +17,6 @@ typedef struct BlockJob BlockJob;
>  typedef struct BdrvChild BdrvChild;
>  typedef struct BdrvChildRole BdrvChildRole;
>  typedef struct BlockJobTxn BlockJobTxn;
> -typedef struct BdrvNextIterator BdrvNextIterator;
>  
>  typedef struct BlockDriverInfo {
>  /* in bytes, 0 if irrelevant */
> @@ 

Re: [Qemu-block] [Qemu-devel] [PATCH v2] blockdev: clean up error handling in do_open_tray

2016-06-06 Thread Markus Armbruster
Colin Lord  writes:

> Returns negative error codes and accompanying error messages in cases where
> the device has no tray or the tray is locked and isn't forced open. This
> extra information should result in better flexibility in functions that
> call do_open_tray.
>
> Signed-off-by: Colin Lord 
> ---
> v2: fix function documentation, improve commit wording, and remove
> unnecessary null check
>  blockdev.c | 36 +---
>  1 file changed, 21 insertions(+), 15 deletions(-)
>
> diff --git a/blockdev.c b/blockdev.c
> index 717785e..d50a2a5 100644
> --- a/blockdev.c
> +++ b/blockdev.c
> @@ -2286,17 +2286,14 @@ void qmp_eject(const char *device, bool has_force, 
> bool force, Error **errp)
>  }
>  
>  rc = do_open_tray(device, force, _err);
> -if (local_err) {
> +if (rc == -ENOSYS) {
> +error_free(local_err);
> +local_err = NULL;
> +} else if (local_err) {
>  error_propagate(errp, local_err);
>  return;
>  }

I like to put the error case in a conditional, and leave the normal flow
unindented, because it makes the normal flow easier to follow:

   if (rc && rc != -ENOSYS) {
   error_propagate(errp, local_err);
   return;
   }
   error_free(local_err);

error_free(NULL) is safe.

>  
> -if (rc == EINPROGRESS) {
> -error_setg(errp, "Device '%s' is locked and force was not specified, 
> "
> -   "wait for tray to open and try again", device);
> -return;
> -}
> -
>  qmp_x_blockdev_remove_medium(device, errp);
>  }
>  
> @@ -2325,10 +2322,9 @@ void qmp_block_passwd(bool has_device, const char 
> *device,
>  }
>  
>  /**
> - * returns -errno on fatal error, +errno for non-fatal situations.
> - * errp will always be set when the return code is negative.
> - * May return +ENOSYS if the device has no tray,
> - * or +EINPROGRESS if the tray is locked and the guest has been notified.
> + * returns -errno on all errors, and errp will be set on error
> + * May return the non-fatal error codes -ENOSYS if the device has no tray,
> + * or -EINPROGRESS if the tray is locked and the guest has been notified.
>   */

What does the function do?

What does it return on success?

Suggest imperative mood.

Here's my try:

   /*
* Attempt to open the tray of @device.
* If @force, ignore its tray lock.
* Else, if the tray is locked, don't open it, but ask the guest to
* open it.
* On error, store an error through @errp and return -errno.
* If @device does not exist, return -ENODEV.
* If it has no removable media, return -ENOTSUP.
* If it has no tray, return -ENOSYS.
* If the guest was asked to open the tray, return -EINPROGRESS.
* Else, return 0.
*/

>  static int do_open_tray(const char *device, bool force, Error **errp)
>  {
> @@ -2348,8 +2344,8 @@ static int do_open_tray(const char *device, bool force, 
> Error **errp)
>  }
>  
>  if (!blk_dev_has_tray(blk)) {
> -/* Ignore this command on tray-less devices */
> -return ENOSYS;
> +error_setg(errp, "Device '%s' does not have a tray", device);
> +return -ENOSYS;
>  }
>  
>  if (blk_dev_is_tray_open(blk)) {
> @@ -2366,7 +2362,9 @@ static int do_open_tray(const char *device, bool force, 
> Error **errp)
>  }
>  
>  if (locked && !force) {
> -return EINPROGRESS;
> +error_setg(errp, "Device '%s' is locked and force was not specified, 
> "
> +   "wait for tray to open and try again", device);
> +return -EINPROGRESS;
>  }
>  
>  return 0;
> @@ -2375,10 +2373,18 @@ static int do_open_tray(const char *device, bool 
> force, Error **errp)
>  void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
>  Error **errp)
>  {
> +Error *local_err = NULL;
> +int rc;
> +
>  if (!has_force) {
>  force = false;
>  }
> -do_open_tray(device, force, errp);
> +rc = do_open_tray(device, force, _err);
> +if (rc == -EINPROGRESS || rc == -ENOSYS) {
> +error_free(local_err);
> +} else {
> +error_propagate(errp, local_err);
> +}
>  }

Likewise:

   if (rc && rc != -ENOSYS && rc != EINPROGRESS) {
   error_propagate(errp, local_err);
   return;
   }
   error_free(local_err);

>  
>  void qmp_blockdev_close_tray(const char *device, Error **errp)

While you're cleaning up: mind moving the forward declaration of
do_open_tray() to the beginning?