[Qemu-block] [PATCH 2/2] block: Add copy offloading trace points

2018-07-03 Thread Fam Zheng
A few trace points that can help reveal what is happening in a copy
offloading I/O path.

Signed-off-by: Fam Zheng 
---
 block/file-posix.c | 2 ++
 block/io.c | 2 ++
 block/iscsi.c  | 3 +++
 block/trace-events | 6 ++
 4 files changed, 13 insertions(+)

diff --git a/block/file-posix.c b/block/file-posix.c
index 829ee538d8..d3b1609410 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -1488,6 +1488,8 @@ static ssize_t handle_aiocb_copy_range(RawPosixAIOData 
*aiocb)
 ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
   aiocb->aio_fd2, &out_off,
   bytes, 0);
+trace_copy_file_range(aiocb->bs, aiocb->aio_fildes, in_off,
+  aiocb->aio_fd2, out_off, bytes, 0, ret);
 if (ret == 0) {
 /* No progress (e.g. when beyond EOF), let the caller fall back to
  * buffer I/O. */
diff --git a/block/io.c b/block/io.c
index 8e02f4ab95..df930b982f 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2964,6 +2964,7 @@ int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, 
uint64_t src_offset,
  BdrvChild *dst, uint64_t dst_offset,
  uint64_t bytes, BdrvRequestFlags 
flags)
 {
+trace_bdrv_co_copy_range_from(src, src_offset, dst, dst_offset, bytes, 
flags);
 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
bytes, flags, true);
 }
@@ -2976,6 +2977,7 @@ int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, 
uint64_t src_offset,
BdrvChild *dst, uint64_t dst_offset,
uint64_t bytes, BdrvRequestFlags flags)
 {
+trace_bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, 
flags);
 return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
bytes, flags, false);
 }
diff --git a/block/iscsi.c b/block/iscsi.c
index ead2bd5aa7..118555d051 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -44,6 +44,7 @@
 #include "qapi/qmp/qstring.h"
 #include "crypto/secret.h"
 #include "scsi/utils.h"
+#include "trace.h"
 
 /* Conflict between scsi/utils.h and libiscsi! :( */
 #define SCSI_XFER_NONE ISCSI_XFER_NONE
@@ -2396,6 +2397,8 @@ retry:
 }
 
 out_unlock:
+
+trace_iscsi_xcopy(src_lun, src_offset, dst_lun, dst_offset, bytes, r);
 g_free(iscsi_task.task);
 qemu_mutex_unlock(&dst_lun->mutex);
 g_free(iscsi_task.err_str);
diff --git a/block/trace-events b/block/trace-events
index c35287b48a..1a25a997f2 100644
--- a/block/trace-events
+++ b/block/trace-events
@@ -15,6 +15,8 @@ bdrv_co_preadv(void *bs, int64_t offset, int64_t nbytes, 
unsigned int flags) "bs
 bdrv_co_pwritev(void *bs, int64_t offset, int64_t nbytes, unsigned int flags) 
"bs %p offset %"PRId64" nbytes %"PRId64" flags 0x%x"
 bdrv_co_pwrite_zeroes(void *bs, int64_t offset, int count, int flags) "bs %p 
offset %"PRId64" count %d flags 0x%x"
 bdrv_co_do_copy_on_readv(void *bs, int64_t offset, unsigned int bytes, int64_t 
cluster_offset, int64_t cluster_bytes) "bs %p offset %"PRId64" bytes %u 
cluster_offset %"PRId64" cluster_bytes %"PRId64
+bdrv_co_copy_range_from(void *src, uint64_t src_offset, void *dst, uint64_t 
dst_offset, uint64_t bytes, int flags) "src %p offset %"PRIu64" dst %p offset 
%"PRIu64" bytes %"PRIu64" flags 0x%x"
+bdrv_co_copy_range_to(void *src, uint64_t src_offset, void *dst, uint64_t 
dst_offset, uint64_t bytes, int flags) "src %p offset %"PRIu64" dst %p offset 
%"PRIu64" bytes %"PRIu64" flags 0x%x"
 
 # block/stream.c
 stream_one_iteration(void *s, int64_t offset, uint64_t bytes, int 
is_allocated) "s %p offset %" PRId64 " bytes %" PRIu64 " is_allocated %d"
@@ -57,6 +59,7 @@ qmp_block_stream(void *bs, void *job) "bs %p job %p"
 # block/file-posix.c
 paio_submit_co(int64_t offset, int count, int type) "offset %"PRId64" count %d 
type %d"
 paio_submit(void *acb, void *opaque, int64_t offset, int count, int type) "acb 
%p opaque %p offset %"PRId64" count %d type %d"
+copy_file_range(void *bs, int src, int64_t src_off, int dst, int64_t dst_off, 
int64_t bytes, int flags, int64_t ret) "bs %p src_fd %d offset %"PRIu64" dst_fd 
%d offset %"PRIu64" bytes %"PRIu64" flags %d ret %"PRId64
 
 # block/qcow2.c
 qcow2_writev_start_req(void *co, int64_t offset, int bytes) "co %p offset 0x%" 
PRIx64 " bytes %d"
@@ -150,3 +153,6 @@ nvme_free_req_queue_wait(void *q) "q %p"
 nvme_cmd_map_qiov(void *s, void *cmd, void *req, void *qiov, int entries) "s 
%p cmd %p req %p qiov %p entries %d"
 nvme_cmd_map_qiov_pages(void *s, int i, uint64_t page) "s %p page[%d] 
0x%"PRIx64
 nvme_cmd_map_qiov_iov(void *s, int i, void *page, int pages) "s %p iov[%d] %p 
pages %d"
+
+# block/iscsi.c
+iscsi_xcopy(void *src_lun, uint64_t src_off, void *dst_lun, uint64_t dst_off, 
uint64_t bytes, int ret) "src_lun %p offset %"PRIu64" dst_l

[Qemu-block] [PATCH 1/2] block: Fix dst total_sectors after copy offloading

2018-07-03 Thread Fam Zheng
This was noticed by the new image fleecing tests case 222. The issue is
apparent and we should just do the same right things as in
bdrv_aligned_pwritev.

Reported-by: John Snow 
Signed-off-by: Fam Zheng 
---
 block/io.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/block/io.c b/block/io.c
index 1a2272fad3..8e02f4ab95 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2945,6 +2945,10 @@ static int coroutine_fn 
bdrv_co_copy_range_internal(BdrvChild *src,
   dst, dst_offset,
   bytes, flags);
 }
+if (ret == 0) {
+int64_t end_sector = DIV_ROUND_UP(dst_offset + bytes, 
BDRV_SECTOR_SIZE);
+dst->bs->total_sectors = MAX(dst->bs->total_sectors, end_sector);
+}
 tracked_request_end(&src_req);
 tracked_request_end(&dst_req);
 bdrv_dec_in_flight(src->bs);
-- 
2.17.1




[Qemu-block] [PATCH 0/2] block: Fix dst reading after tail copy offloading

2018-07-03 Thread Fam Zheng
Qcow2 allocates new clusters after the end of the file. If it is the destinaton
of copy offloading, we must adjust dst->bs->total_sectors. Otherwise, further
reads will drop to the "beyond EOF" code path and return zeroes, which problem
is caught by iotests 222.

Follow the logic in the normal write code and update bs->total_sectors after
I/O is done.

While at it, add a few convenient trace points to aid future debug experiences
in the topic.

Fam Zheng (2):
  block: Fix dst total_sectors after copy offloading
  block: Add copy offloading trace points

 block/file-posix.c | 2 ++
 block/io.c | 6 ++
 block/iscsi.c  | 3 +++
 block/trace-events | 6 ++
 4 files changed, 17 insertions(+)

-- 
2.17.1




Re: [Qemu-block] [PULL 3/3] backup: Use copy offloading

2018-07-03 Thread John Snow



On 07/03/2018 12:53 PM, John Snow wrote:
> 
> 
> On 07/02/2018 11:46 PM, Jeff Cody wrote:
>> From: Fam Zheng 
>>
>> The implementation is similar to the 'qemu-img convert'. In the
>> beginning of the job, offloaded copy is attempted. If it fails, further
>> I/O will go through the existing bounce buffer code path.
>>
>> Then, as Kevin pointed out, both this and qemu-img convert can benefit
>> from a local check if one request fails because of, for example, the
>> offset is beyond EOF, but another may well be accepted by the protocol
>> layer. This will be implemented separately.
>>
>> Reviewed-by: Stefan Hajnoczi 
>> Signed-off-by: Fam Zheng 
>> Message-id: 20180703023758.14422-4-f...@redhat.com
>> Signed-off-by: Jeff Cody 
>> ---
>>  block/backup.c | 150 -
>>  block/trace-events |   1 +
>>  2 files changed, 110 insertions(+), 41 deletions(-)
>>
>> diff --git a/block/backup.c b/block/backup.c
>> index d18be40caf..81895ddbe2 100644
>> --- a/block/backup.c
>> +++ b/block/backup.c
>> @@ -45,6 +45,8 @@ typedef struct BackupBlockJob {
>>  QLIST_HEAD(, CowRequest) inflight_reqs;
>>  
>>  HBitmap *copy_bitmap;
>> +bool use_copy_range;
>> +int64_t copy_range_size;
>>  } BackupBlockJob;
>>  
>>  static const BlockJobDriver backup_job_driver;
>> @@ -86,19 +88,101 @@ static void cow_request_end(CowRequest *req)
>>  qemu_co_queue_restart_all(&req->wait_queue);
>>  }
>>  
>> +/* Copy range to target with a bounce buffer and return the bytes copied. If
>> + * error occured, return a negative error number */
>> +static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
>> +  int64_t start,
>> +  int64_t end,
>> +  bool 
>> is_write_notifier,
>> +  bool *error_is_read,
>> +  void **bounce_buffer)
>> +{
>> +int ret;
>> +struct iovec iov;
>> +QEMUIOVector qiov;
>> +BlockBackend *blk = job->common.blk;
>> +int nbytes;
>> +
>> +hbitmap_reset(job->copy_bitmap, start / job->cluster_size, 1);
>> +nbytes = MIN(job->cluster_size, job->len - start);
>> +if (!*bounce_buffer) {
>> +*bounce_buffer = blk_blockalign(blk, job->cluster_size);
>> +}
>> +iov.iov_base = *bounce_buffer;
>> +iov.iov_len = nbytes;
>> +qemu_iovec_init_external(&qiov, &iov, 1);
>> +
>> +ret = blk_co_preadv(blk, start, qiov.size, &qiov,
>> +is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);
>> +if (ret < 0) {
>> +trace_backup_do_cow_read_fail(job, start, ret);
>> +if (error_is_read) {
>> +*error_is_read = true;
>> +}
>> +goto fail;
>> +}
>> +
>> +if (qemu_iovec_is_zero(&qiov)) {
>> +ret = blk_co_pwrite_zeroes(job->target, start,
>> +   qiov.size, BDRV_REQ_MAY_UNMAP);
>> +} else {
>> +ret = blk_co_pwritev(job->target, start,
>> + qiov.size, &qiov,
>> + job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0);
>> +}
>> +if (ret < 0) {
>> +trace_backup_do_cow_write_fail(job, start, ret);
>> +if (error_is_read) {
>> +*error_is_read = false;
>> +}
>> +goto fail;
>> +}
>> +
>> +return nbytes;
>> +fail:
>> +hbitmap_set(job->copy_bitmap, start / job->cluster_size, 1);
>> +return ret;
>> +
>> +}
>> +
>> +/* Copy range to target and return the bytes copied. If error occured, 
>> return a
>> + * negative error number. */
>> +static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job,
>> +int64_t start,
>> +int64_t end,
>> +bool is_write_notifier)
>> +{
>> +int ret;
>> +int nr_clusters;
>> +BlockBackend *blk = job->common.blk;
>> +int nbytes;
>> +
>> +assert(QEMU_IS_ALIGNED(job->copy_range_size, job->cluster_size));
>> +nbytes = MIN(job->copy_range_size, end - start);
>> +nr_clusters = DIV_ROUND_UP(nbytes, job->cluster_size);
>> +hbitmap_reset(job->copy_bitmap, start / job->cluster_size,
>> +  nr_clusters);
>> +ret = blk_co_copy_range(blk, start, job->target, start, nbytes,
>> +is_write_notifier ? BDRV_REQ_NO_SERIALISING : 
>> 0);
>> +if (ret < 0) {
>> +trace_backup_do_cow_copy_range_fail(job, start, ret);
>> +hbitmap_set(job->copy_bitmap, start / job->cluster_size,
>> +nr_clusters);
>> +return ret;
>> +}
>> +
>> +return nbytes;
>> +}
>> +
>>  static int coroutine_fn backup_do_cow(BackupBlockJob *job,
>>int64_t o

[Qemu-block] [PATCH 2/2] block/backup: fix fleecing scheme: use serialized writes

2018-07-03 Thread Vladimir Sementsov-Ogievskiy
Fleecing scheme works as follows: we want a kind of temporary snapshot
of active drive A. We create temporary image B, with B->backing = A.
Then we start backup(sync=none) from A to B. From this point, B reads
as point-in-time snapshot of A (A continues to be active drive,
accepting guest IO).

This scheme needs some additional synchronization between reads from B
and backup COW operations, otherwise, the following situation is
theoretically possible:

(assume B is qcow2, client is NBD client, reading from B)

1. client starts reading and take qcow2 mutex in qcow2_co_preadv, and
   goes up to l2 table loading (assume cache miss)

2) guest write => backup COW => qcow2 write =>
   try to take qcow2 mutex => waiting

3. l2 table loaded, we see that cluster is UNALLOCATED, go to
   "case QCOW2_CLUSTER_UNALLOCATED" and unlock mutex before
   bdrv_co_preadv(bs->backing, ...)

4) aha, mutex unlocked, backup COW continues, and we finally finish
   guest write and change cluster in our active disk A

5. actually, do bdrv_co_preadv(bs->backing, ...) and read
   _new updated_ data.

To avoid this, let's make all COW writes serializing, to not intersect
with reads from B.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/backup.c | 21 ++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/block/backup.c b/block/backup.c
index 81895ddbe2..ab46a7d43d 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -47,6 +47,8 @@ typedef struct BackupBlockJob {
 HBitmap *copy_bitmap;
 bool use_copy_range;
 int64_t copy_range_size;
+
+bool fleecing;
 } BackupBlockJob;
 
 static const BlockJobDriver backup_job_driver;
@@ -102,6 +104,7 @@ static int coroutine_fn 
backup_cow_with_bounce_buffer(BackupBlockJob *job,
 QEMUIOVector qiov;
 BlockBackend *blk = job->common.blk;
 int nbytes;
+int sflags = job->fleecing ? BDRV_REQ_SERIALISING : 0;
 
 hbitmap_reset(job->copy_bitmap, start / job->cluster_size, 1);
 nbytes = MIN(job->cluster_size, job->len - start);
@@ -124,11 +127,12 @@ static int coroutine_fn 
backup_cow_with_bounce_buffer(BackupBlockJob *job,
 
 if (qemu_iovec_is_zero(&qiov)) {
 ret = blk_co_pwrite_zeroes(job->target, start,
-   qiov.size, BDRV_REQ_MAY_UNMAP);
+   qiov.size, sflags | BDRV_REQ_MAY_UNMAP);
 } else {
 ret = blk_co_pwritev(job->target, start,
  qiov.size, &qiov,
- job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0);
+ job->compress ? BDRV_REQ_WRITE_COMPRESSED :
+ sflags);
 }
 if (ret < 0) {
 trace_backup_do_cow_write_fail(job, start, ret);
@@ -614,6 +618,7 @@ BlockJob *backup_job_create(const char *job_id, 
BlockDriverState *bs,
 BlockDriverInfo bdi;
 BackupBlockJob *job = NULL;
 int ret;
+bool fleecing;
 
 assert(bs);
 assert(target);
@@ -668,6 +673,15 @@ BlockJob *backup_job_create(const char *job_id, 
BlockDriverState *bs,
 return NULL;
 }
 
+/* Detect image fleecing */
+fleecing = sync_mode == MIRROR_SYNC_MODE_NONE && target->backing->bs == bs;
+if (fleecing) {
+if (compress) {
+error_setg(errp, "Image fleecing doesn't support compressed 
mode.");
+return NULL;
+}
+}
+
 len = bdrv_getlength(bs);
 if (len < 0) {
 error_setg_errno(errp, -len, "unable to get length for '%s'",
@@ -700,6 +714,7 @@ BlockJob *backup_job_create(const char *job_id, 
BlockDriverState *bs,
 job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ?
sync_bitmap : NULL;
 job->compress = compress;
+job->fleecing = fleecing;
 
 /* If there is no backing file on the target, we cannot rely on COW if our
  * backup cluster size is smaller than the target cluster size. Even for
@@ -727,7 +742,7 @@ BlockJob *backup_job_create(const char *job_id, 
BlockDriverState *bs,
 } else {
 job->cluster_size = MAX(BACKUP_CLUSTER_SIZE_DEFAULT, bdi.cluster_size);
 }
-job->use_copy_range = true;
+job->use_copy_range = !fleecing;
 job->copy_range_size = MIN_NON_ZERO(blk_get_max_transfer(job->common.blk),
 blk_get_max_transfer(job->target));
 job->copy_range_size = MAX(job->cluster_size,
-- 
2.11.1




[Qemu-block] [PATCH 1/2] block: add BDRV_REQ_SERIALISING flag

2018-07-03 Thread Vladimir Sementsov-Ogievskiy
Serialized writes should be used in copy-on-write of backup(sync=none)
for image fleecing scheme.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/block.h | 5 -
 block/io.c| 4 
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/include/block/block.h b/include/block/block.h
index e5c7759a0c..107113aad5 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -58,8 +58,11 @@ typedef enum {
  * content. */
 BDRV_REQ_WRITE_UNCHANGED= 0x40,
 
+/* Force request serializing. Only for writes. */
+BDRV_REQ_SERIALISING= 0x80,
+
 /* Mask of valid flags */
-BDRV_REQ_MASK   = 0x7f,
+BDRV_REQ_MASK   = 0xff,
 } BdrvRequestFlags;
 
 typedef struct BlockSizes {
diff --git a/block/io.c b/block/io.c
index 1a2272fad3..d5ba078514 100644
--- a/block/io.c
+++ b/block/io.c
@@ -1572,6 +1572,10 @@ static int coroutine_fn bdrv_aligned_pwritev(BdrvChild 
*child,
 max_transfer = QEMU_ALIGN_DOWN(MIN_NON_ZERO(bs->bl.max_transfer, INT_MAX),
align);
 
+if (flags & BDRV_REQ_SERIALISING) {
+mark_request_serialising(req, bdrv_get_cluster_size(bs));
+}
+
 waited = wait_serialising_requests(req);
 assert(!waited || !req->serialising);
 assert(req->overlap_offset <= offset);
-- 
2.11.1




[Qemu-block] [PATCH 0/2] fix image fleecing

2018-07-03 Thread Vladimir Sementsov-Ogievskiy
Hi all.

It's a continuation of discussion under
"[PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing" [1].

Here is my try to implement Kevin's idea, that all backup writes (for
fleecing case) should be serialized. (However, I've skipped for now
fixing related permissions).

Looks like these patches may replace patch [1], to make fleecing scheme
safe. But I'm not sure, a look by Kevin is necessary.

A test is still needed, to prove that this patch is necessary and that it
works..

Vladimir Sementsov-Ogievskiy (2):
  block: add BDRV_REQ_SERIALISING flag
  block/backup: fix fleecing scheme: use serialized writes

 include/block/block.h |  5 -
 block/backup.c| 21 ++---
 block/io.c|  4 
 3 files changed, 26 insertions(+), 4 deletions(-)

-- 
2.11.1




Re: [Qemu-block] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing

2018-07-03 Thread Kevin Wolf
Am 03.07.2018 um 18:11 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 03.07.2018 14:15, Kevin Wolf wrote:
> > We'll have to figure out where to fix this problem (or what it really
> > is, once you look more than just at fleecing), but I think requiring the
> > user to add a filter driver to work around missing serialisation in
> > other code, and corrupting their image if they forget to, is not a
> > reasonable solution.
> > 
> > I see at least two things wrong in this context:
> > 
> > * The permissions don't seem to match reality. The NBD server
> >unconditionally shares PERM_WRITE, which is wrong in this case. The
> >client wants to see a point-in-time snapshot that never changes. This
> >should become an option so that it can be properly reflected in the
> >permissions used.
> > 
> > * Once we have proper permissions, the fleecing setup breaks down
> >because the guest needs PERM_WRITE on the backing file, but the
> >fleecing overlay allows that only if the NBD client allows it (which
> >it doesn't for fleecing).
> > 
> >Now we can implement an exception right into backup that installs a
> >backup filter driver between source and target if the source is the
> >backing file of the target. The filter driver would be similar to the
> >commit filter driver in that it simply promises !PERM_WRITE to its
> >parents, but allows PERM_WRITE on the source because it has installed
> >the before_write_notifier that guarantees this condition.
> > 
> >All writes to the target that are made by the backup job in this setup
> >(including before_write_notifier writes) need to be marked as
> >serialising so that any concurrent reads are completed first.
> > 
> > And if we decide to add a target filter to backup, we should probably at
> > the same time use a filter driver for intercepting source writes instead
> > of using before_write_notifier.
> 
> Hmm, is it possible to do all the staff in one super filter driver, which we
> insert into the tree like this:
> 
> top blk    fleecing qcow2
>  +   +
>  |   |backing
>  v <-+
>    super filter
>  +
>  |file
>  v
>    active image
> 
> 
> And super filter do the following:
> 
> 1. copy-on-write, before forwarding write to file, it do serializing write
> to fleecing qcow2

This is where it breaks down. The filter driver in your graph doesn't
know fleecing.qcow2, so it can't write to it. Attaching fleecing.qcow2
as an additional child to the super filter doesn't work either because
you would create a loop then.

I think we need two separate nodes (and probably it's better to have
them managed by a block job so that both together can be checked to
result in a consistent setup).

> 2. fake .bdrv_child_perm for fleecing qcow2, like in block commit
> 
> and no block job is needed.

Kevin



[Qemu-block] [PATCH 2/2] test-bdrv-drain: Test bdrv_append() to drained node

2018-07-03 Thread Kevin Wolf
Signed-off-by: Kevin Wolf 
---
 tests/test-bdrv-drain.c | 43 +++
 1 file changed, 43 insertions(+)

diff --git a/tests/test-bdrv-drain.c b/tests/test-bdrv-drain.c
index 291a050f86..17bb8508ae 100644
--- a/tests/test-bdrv-drain.c
+++ b/tests/test-bdrv-drain.c
@@ -1250,6 +1250,47 @@ static void test_detach_by_driver_cb(void)
 test_detach_indirect(false);
 }
 
+static void test_append_to_drained(void)
+{
+BlockBackend *blk;
+BlockDriverState *base, *overlay;
+BDRVTestState *base_s, *overlay_s;
+
+blk = blk_new(BLK_PERM_ALL, BLK_PERM_ALL);
+base = bdrv_new_open_driver(&bdrv_test, "base", BDRV_O_RDWR, &error_abort);
+base_s = base->opaque;
+blk_insert_bs(blk, base, &error_abort);
+
+overlay = bdrv_new_open_driver(&bdrv_test, "overlay", BDRV_O_RDWR,
+   &error_abort);
+overlay_s = overlay->opaque;
+
+do_drain_begin(BDRV_DRAIN, base);
+g_assert_cmpint(base->quiesce_counter, ==, 1);
+g_assert_cmpint(base_s->drain_count, ==, 1);
+g_assert_cmpint(base->in_flight, ==, 0);
+
+/* Takes ownership of overlay, so we don't have to unref it later */
+bdrv_append(overlay, base, &error_abort);
+g_assert_cmpint(base->in_flight, ==, 0);
+g_assert_cmpint(overlay->in_flight, ==, 0);
+
+g_assert_cmpint(base->quiesce_counter, ==, 1);
+g_assert_cmpint(base_s->drain_count, ==, 1);
+g_assert_cmpint(overlay->quiesce_counter, ==, 1);
+g_assert_cmpint(overlay_s->drain_count, ==, 1);
+
+do_drain_end(BDRV_DRAIN, base);
+
+g_assert_cmpint(base->quiesce_counter, ==, 0);
+g_assert_cmpint(base_s->drain_count, ==, 0);
+g_assert_cmpint(overlay->quiesce_counter, ==, 0);
+g_assert_cmpint(overlay_s->drain_count, ==, 0);
+
+bdrv_unref(base);
+blk_unref(blk);
+}
+
 int main(int argc, char **argv)
 {
 int ret;
@@ -1308,6 +1349,8 @@ int main(int argc, char **argv)
 g_test_add_func("/bdrv-drain/detach/parent_cb", test_detach_by_parent_cb);
 g_test_add_func("/bdrv-drain/detach/driver_cb", test_detach_by_driver_cb);
 
+g_test_add_func("/bdrv-drain/attach/drain", test_append_to_drained);
+
 ret = g_test_run();
 qemu_event_destroy(&done_event);
 return ret;
-- 
2.13.6




[Qemu-block] [PATCH 0/2] block: Fix attaching drained child node

2018-07-03 Thread Kevin Wolf
This fixes the following case that was reported by Max and was caused by
not correctly waiting for activity to cease on the parent node before
attaching a drained child node:

$ ./qemu-img create -f qed foo.qed 64M
Formatting 'foo.qed', fmt=qed size=67108864 cluster_size=65536
$ echo "{'execute':'qmp_capabilities'}
{'execute':'blockdev-snapshot',
 'arguments':{'node':'backing','overlay':'overlay'}}
{'execute':'quit'}" | \
x86_64-softmmu/qemu-system-x86_64 -qmp stdio -nodefaults \
-blockdev "{'node-name':'backing','driver':'null-co'}" \
-blockdev "{'node-name':'overlay','driver':'qed',
'file':{'driver':'file','filename':'foo.qed'}}"
{"QMP": {"version": {"qemu": {"micro": 50, "minor": 12, "major": 2},
"package": "v2.12.0-1422-g0109e7e6f8"}, "capabilities": []}}
{"return": {}}
qemu-system-x86_64: block.c:3434: bdrv_replace_node: Assertion
`!atomic_read(&to->in_flight)' failed.
[1]5252 done echo  |
   5253 abort (core dumped)  x86_64-softmmu/qemu-system-x86_64 -qmp
stdio -nodefaults -blockdev  -blockdev

Kevin Wolf (2):
  block: Poll after drain on attaching a node
  test-bdrv-drain: Test bdrv_append() to drained node

 include/block/block.h |  8 
 include/block/block_int.h |  3 +++
 block.c   |  2 +-
 block/io.c| 26 --
 tests/test-bdrv-drain.c   | 43 +++
 5 files changed, 75 insertions(+), 7 deletions(-)

-- 
2.13.6




[Qemu-block] [PATCH 1/2] block: Poll after drain on attaching a node

2018-07-03 Thread Kevin Wolf
Commit dcf94a23b1 ('block: Don't poll in parent drain callbacks')
removed polling in bdrv_child_cb_drained_begin() on the grounds that the
original bdrv_drain() already will poll and BdrvChildRole.drained_begin
calls must not cause graph changes (and therefore must not call
aio_poll() or the recursion through the graph will break.

This reasoning is correct for calls through bdrv_do_drained_begin().
However, BdrvChildRole.drained_begin is also called when a node that is
already in a drained section (i.e. bdrv_do_drained_begin() has already
returned and therefore can't poll any more) is attached to a new parent.
In this case, we must explicitly poll to have all requests completed
before the drained new child can be attached to the parent.

In bdrv_replace_child_noperm(), we know that we're not inside the
recursion of bdrv_do_drained_begin() because graph changes are not
allowed there, and bdrv_replace_child_noperm() is a graph change. The
call of BdrvChildRole.drained_begin() must therefore be followed by a
BDRV_POLL_WHILE() that waits for the completion of requests.

Reported-by: Max Reitz 
Signed-off-by: Kevin Wolf 
---
 include/block/block.h |  8 
 include/block/block_int.h |  3 +++
 block.c   |  2 +-
 block/io.c| 26 --
 4 files changed, 32 insertions(+), 7 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index bc76b1e59f..706ef009ad 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -569,6 +569,14 @@ void bdrv_parent_drained_begin(BlockDriverState *bs, 
BdrvChild *ignore,
bool ignore_bds_parents);
 
 /**
+ * bdrv_parent_drained_begin_single:
+ *
+ * Begin a quiesced section for the parent of @c. If @poll is true, wait for
+ * any pending activity to cease.
+ */
+void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll);
+
+/**
  * bdrv_parent_drained_end:
  *
  * End a quiesced section of all users of @bs. This is part of
diff --git a/include/block/block_int.h b/include/block/block_int.h
index af71b414be..81cd3db7a9 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -606,6 +606,9 @@ struct BdrvChildRole {
  * requests after returning from .drained_begin() until .drained_end() is
  * called.
  *
+ * These functions must not change the graph (and therefore also must not
+ * call aio_poll(), which could change the graph indirectly).
+ *
  * Note that this can be nested. If drained_begin() was called twice, new
  * I/O is allowed only after drained_end() was called twice, too.
  */
diff --git a/block.c b/block.c
index 961ec97d26..fb1462fbf2 100644
--- a/block.c
+++ b/block.c
@@ -2060,7 +2060,7 @@ static void bdrv_replace_child_noperm(BdrvChild *child,
 }
 assert(num >= 0);
 for (i = 0; i < num; i++) {
-child->role->drained_begin(child);
+bdrv_parent_drained_begin_single(child, true);
 }
 }
 
diff --git a/block/io.c b/block/io.c
index 1a2272fad3..038449f81f 100644
--- a/block/io.c
+++ b/block/io.c
@@ -52,9 +52,7 @@ void bdrv_parent_drained_begin(BlockDriverState *bs, 
BdrvChild *ignore,
 if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
 continue;
 }
-if (c->role->drained_begin) {
-c->role->drained_begin(c);
-}
+bdrv_parent_drained_begin_single(c, false);
 }
 }
 
@@ -73,6 +71,14 @@ void bdrv_parent_drained_end(BlockDriverState *bs, BdrvChild 
*ignore,
 }
 }
 
+static bool bdrv_parent_drained_poll_single(BdrvChild *c)
+{
+if (c->role->drained_poll) {
+return c->role->drained_poll(c);
+}
+return false;
+}
+
 static bool bdrv_parent_drained_poll(BlockDriverState *bs, BdrvChild *ignore,
  bool ignore_bds_parents)
 {
@@ -83,14 +89,22 @@ static bool bdrv_parent_drained_poll(BlockDriverState *bs, 
BdrvChild *ignore,
 if (c == ignore || (ignore_bds_parents && c->role->parent_is_bds)) {
 continue;
 }
-if (c->role->drained_poll) {
-busy |= c->role->drained_poll(c);
-}
+busy |= bdrv_parent_drained_poll_single(c);
 }
 
 return busy;
 }
 
+void bdrv_parent_drained_begin_single(BdrvChild *c, bool poll)
+{
+if (c->role->drained_begin) {
+c->role->drained_begin(c);
+}
+if (poll) {
+BDRV_POLL_WHILE(c->bs, bdrv_parent_drained_poll_single(c));
+}
+}
+
 static void bdrv_merge_limits(BlockLimits *dst, const BlockLimits *src)
 {
 dst->opt_transfer = MAX(dst->opt_transfer, src->opt_transfer);
-- 
2.13.6




Re: [Qemu-block] [PULL 3/3] backup: Use copy offloading

2018-07-03 Thread John Snow



On 07/02/2018 11:46 PM, Jeff Cody wrote:
> From: Fam Zheng 
> 
> The implementation is similar to the 'qemu-img convert'. In the
> beginning of the job, offloaded copy is attempted. If it fails, further
> I/O will go through the existing bounce buffer code path.
> 
> Then, as Kevin pointed out, both this and qemu-img convert can benefit
> from a local check if one request fails because of, for example, the
> offset is beyond EOF, but another may well be accepted by the protocol
> layer. This will be implemented separately.
> 
> Reviewed-by: Stefan Hajnoczi 
> Signed-off-by: Fam Zheng 
> Message-id: 20180703023758.14422-4-f...@redhat.com
> Signed-off-by: Jeff Cody 
> ---
>  block/backup.c | 150 -
>  block/trace-events |   1 +
>  2 files changed, 110 insertions(+), 41 deletions(-)
> 
> diff --git a/block/backup.c b/block/backup.c
> index d18be40caf..81895ddbe2 100644
> --- a/block/backup.c
> +++ b/block/backup.c
> @@ -45,6 +45,8 @@ typedef struct BackupBlockJob {
>  QLIST_HEAD(, CowRequest) inflight_reqs;
>  
>  HBitmap *copy_bitmap;
> +bool use_copy_range;
> +int64_t copy_range_size;
>  } BackupBlockJob;
>  
>  static const BlockJobDriver backup_job_driver;
> @@ -86,19 +88,101 @@ static void cow_request_end(CowRequest *req)
>  qemu_co_queue_restart_all(&req->wait_queue);
>  }
>  
> +/* Copy range to target with a bounce buffer and return the bytes copied. If
> + * error occured, return a negative error number */
> +static int coroutine_fn backup_cow_with_bounce_buffer(BackupBlockJob *job,
> +  int64_t start,
> +  int64_t end,
> +  bool is_write_notifier,
> +  bool *error_is_read,
> +  void **bounce_buffer)
> +{
> +int ret;
> +struct iovec iov;
> +QEMUIOVector qiov;
> +BlockBackend *blk = job->common.blk;
> +int nbytes;
> +
> +hbitmap_reset(job->copy_bitmap, start / job->cluster_size, 1);
> +nbytes = MIN(job->cluster_size, job->len - start);
> +if (!*bounce_buffer) {
> +*bounce_buffer = blk_blockalign(blk, job->cluster_size);
> +}
> +iov.iov_base = *bounce_buffer;
> +iov.iov_len = nbytes;
> +qemu_iovec_init_external(&qiov, &iov, 1);
> +
> +ret = blk_co_preadv(blk, start, qiov.size, &qiov,
> +is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);
> +if (ret < 0) {
> +trace_backup_do_cow_read_fail(job, start, ret);
> +if (error_is_read) {
> +*error_is_read = true;
> +}
> +goto fail;
> +}
> +
> +if (qemu_iovec_is_zero(&qiov)) {
> +ret = blk_co_pwrite_zeroes(job->target, start,
> +   qiov.size, BDRV_REQ_MAY_UNMAP);
> +} else {
> +ret = blk_co_pwritev(job->target, start,
> + qiov.size, &qiov,
> + job->compress ? BDRV_REQ_WRITE_COMPRESSED : 0);
> +}
> +if (ret < 0) {
> +trace_backup_do_cow_write_fail(job, start, ret);
> +if (error_is_read) {
> +*error_is_read = false;
> +}
> +goto fail;
> +}
> +
> +return nbytes;
> +fail:
> +hbitmap_set(job->copy_bitmap, start / job->cluster_size, 1);
> +return ret;
> +
> +}
> +
> +/* Copy range to target and return the bytes copied. If error occured, 
> return a
> + * negative error number. */
> +static int coroutine_fn backup_cow_with_offload(BackupBlockJob *job,
> +int64_t start,
> +int64_t end,
> +bool is_write_notifier)
> +{
> +int ret;
> +int nr_clusters;
> +BlockBackend *blk = job->common.blk;
> +int nbytes;
> +
> +assert(QEMU_IS_ALIGNED(job->copy_range_size, job->cluster_size));
> +nbytes = MIN(job->copy_range_size, end - start);
> +nr_clusters = DIV_ROUND_UP(nbytes, job->cluster_size);
> +hbitmap_reset(job->copy_bitmap, start / job->cluster_size,
> +  nr_clusters);
> +ret = blk_co_copy_range(blk, start, job->target, start, nbytes,
> +is_write_notifier ? BDRV_REQ_NO_SERIALISING : 0);
> +if (ret < 0) {
> +trace_backup_do_cow_copy_range_fail(job, start, ret);
> +hbitmap_set(job->copy_bitmap, start / job->cluster_size,
> +nr_clusters);
> +return ret;
> +}
> +
> +return nbytes;
> +}
> +
>  static int coroutine_fn backup_do_cow(BackupBlockJob *job,
>int64_t offset, uint64_t bytes,
>bool *error_is_read,
>bool is_write_notifier)
>  {
> -BlockBackend *blk

Re: [Qemu-block] [PATCH v3 00/11] NBD reconnect

2018-07-03 Thread Eric Blake

On 07/03/2018 08:46 AM, Vladimir Sementsov-Ogievskiy wrote:

Hi all.

before v4 realization, I'd like to discuss some questions.

Our proposal for v4 is the following:

1. don't reconnect on nbd_open. So, on open we do only one connect 
attempt, and if it fails, open fails.

2. don't configure timeout between attempts. instead do the following:
     1s timeout, then 2s, then 4, 8, 16, and then always 16 until success
3. configure only time, after disconnect, during which requests are 
paused (and after this time, if not connected, they will return EIO). Or 
not configure it for now, make a default of 5 minutes.


Any ideas?


I apologize that I haven't had time to review this series closely.  At 
this point, I think it's missed 3.0, and will have to be 3.1 material. 
Your proposal for exponential backoff on reconnect attempts makes sense; 
beyond that, I haven't reviewed closely enough to know if I have other 
suggestions on how many knobs to expose to the user, vs. how much to 
make automatic.





09.06.2018 18:32, Vladimir Sementsov-Ogievskiy wrote:

Hi all.

Here is NBD reconnect.
The feature realized inside nbd-client driver and works as follows:

There are two parameters: reconnect-attempts and reconnect-timeout.
So, we will try to reconnect in case of initial connection failed or
in case of connection lost. All current and new io operations will wait
until we make reconnect-attempts tries to reconnect. After this, all
requests will fail with EIO, but we will continue trying to reconnect.


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [PATCH 1/2] drity-bitmap: refactor merge: separte can_merge

2018-07-03 Thread Eric Blake

On 07/03/2018 05:53 AM, Vladimir Sementsov-Ogievskiy wrote:

In the subject: s/drity/dirty/, s/separte/separate/


Separate can_merge, to reuse it for transaction support for merge
command.

Also, switch some asserts to errors.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---


Reviewed-by: Eric Blake 

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing

2018-07-03 Thread Vladimir Sementsov-Ogievskiy

03.07.2018 14:15, Kevin Wolf wrote:

Am 02.07.2018 um 14:09 hat Vladimir Sementsov-Ogievskiy geschrieben:

29.06.2018 20:40, Eric Blake wrote:

On 06/29/2018 12:30 PM, John Snow wrote:


On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:

We need to synchronize backup job with reading from fleecing image
like it was done in block/replication.c.

Otherwise, the following situation is theoretically possible:

1. client start reading
2. client understand, that there is no corresponding cluster in
     fleecing image

I don't think the client refocuses the read, but QEMU does. (the client
has no idea if it is reading from the fleecing node or the backing
file.)

... but understood:


3. client is going to read from backing file (i.e. active image)
4. guest writes to active image

My question here is if QEMU will allow reads and writes to interleave in
general. If the client is reading from the backing file, the active
image, will QEMU allow a write to modify that data before we're done
getting that data?


5. this write is stopped by backup(sync=none) and cluster is copied to
     fleecing image
6. guest write continues...
7. and client reads _new_ (or partly new) date from active image


I can't answer this for myself one way or the other right now, but I
imagine you observed a failure in practice in your test setups, which
motivated this patch?

A test or some proof would help justification for this patch. It would
also help prove that it solves what it claims to!

In fact, do we really need a new filter, or do we just need to make the
"sync":"none" blockdev-backup job take the appropriate synchronization
locks?


How? We'll need additional mechanism like serializing requests.. Or a way to
reuse serializing requests. Using backup internal synchronization looks
simpler, and it is already used in block replication.

But it also just an ugly hack that fixes one special case and leaves
everything else broken. replication is usually not a good example for
anything. It always gives me bad surprises when I have to look at it.

We'll have to figure out where to fix this problem (or what it really
is, once you look more than just at fleecing), but I think requiring the
user to add a filter driver to work around missing serialisation in
other code, and corrupting their image if they forget to, is not a
reasonable solution.

I see at least two things wrong in this context:

* The permissions don't seem to match reality. The NBD server
   unconditionally shares PERM_WRITE, which is wrong in this case. The
   client wants to see a point-in-time snapshot that never changes. This
   should become an option so that it can be properly reflected in the
   permissions used.

* Once we have proper permissions, the fleecing setup breaks down
   because the guest needs PERM_WRITE on the backing file, but the
   fleecing overlay allows that only if the NBD client allows it (which
   it doesn't for fleecing).

   Now we can implement an exception right into backup that installs a
   backup filter driver between source and target if the source is the
   backing file of the target. The filter driver would be similar to the
   commit filter driver in that it simply promises !PERM_WRITE to its
   parents, but allows PERM_WRITE on the source because it has installed
   the before_write_notifier that guarantees this condition.

   All writes to the target that are made by the backup job in this setup
   (including before_write_notifier writes) need to be marked as
   serialising so that any concurrent reads are completed first.

And if we decide to add a target filter to backup, we should probably at
the same time use a filter driver for intercepting source writes instead
of using before_write_notifier.


Hmm, is it possible to do all the staff in one super filter driver, 
which we insert into the tree like this:


top blk    fleecing qcow2
 +   +
 |   |backing
 v <-+
   super filter
 +
 |file
 v
   active image


And super filter do the following:

1. copy-on-write, before forwarding write to file, it do serializing 
write to fleecing qcow2

2. fake .bdrv_child_perm for fleecing qcow2, like in block commit

and no block job is needed.



Max, I think you intended to make both source and target children of the
same block job node (or at least for mirror). But wouldn't that create
loops in a setup like this? I think we may need two filters that are
only connected through the block job, but not with block graph edges.

Kevin



--
Best regards,
Vladimir




Re: [Qemu-block] [PATCH v2 1/2] nbd/client: Add x-dirty-bitmap to query bitmap from server

2018-07-03 Thread Eric Blake

On 07/03/2018 04:46 AM, Vladimir Sementsov-Ogievskiy wrote:


  #
+# @x-dirty-bitmap: A "qemu:dirty-bitmap:NAME" string to query in 
place of

+#  traditional "base:allocation" block status (see
+#  NBD_OPT_LIST_META_CONTEXT in the NBD protocol) 
(since 3.0)

+#


"x-dirty-bitmap=qemu:dirty-bitmap:NAME", is a bit strange, looks like it 
should be "x-dirty-bitmap=NAME", and "qemu:dirty-bitmap" added 
automatically. (and you don't check it, so actually this parameter is 
x-meta-context, and user can select any context, for example, 
"base:allocation", so "x-meta-context=qemu:dirty-bitmap:NAME" sounds 
better too). But I'm ok to leave it as is for now, with x- prefix.


Good point on 'x-meta-context' being slightly nicer; but bikeshedding an 
experimental name doesn't really impact the release.


For 3.1, I'd love to have:

qemu-img map --output=json --image-opts driver=nbd,...,bitmap=FOO

automatically connect to qemu:dirty-bitmap:FOO in addition to block 
status, resulting in output for iotest 223 that resembles:


[{ "start": 0, "length": 1048576, "depth": 0, "zero": true, "data": false},
{ "start": 1048576, "length": 1048576, "depth": 0, "zero": false, 
"data": true},
{ "start": 2097152, "length": 2097152, "depth": 0, "zero": false, 
"data": true, "dirty": true}]


(that is, an optional 'dirty' member added to the JSON to identify the 
dirty portions, in addition to everything else already identified).  I 
also want to enhance that test to show that discarding clusters works 
during dirty tracking (that is, a "data":false, "dirty":true entry 
should be demonstrated).


Getting to that point will mean adding a new BDRV_BLOCK_DIRTY bit to the 
output of bdrv_block_status() (even if only the NBD and passthrough 
drivers set it), as well as teaching qemu-img map to optionally honor 
that bit when present.  It also means the NBD client code will be 
subscribing to status from two meta contexts at once, with the second 
being exactly a qemu:dirty-bitmap (rather than a free-for-all string 
that x-dirty-bitmap currently gives us).


It's too late to get these improvements into 3.0, but should be a good 
path forward in the next few months.




@@ -982,9 +983,11 @@ int nbd_client_init(BlockDriverState *bs,
  client->info.request_sizes = true;
  client->info.structured_reply = true;
  client->info.base_allocation = true;
+    client->info.x_dirty_bitmap = g_strdup(x_dirty_bitmap);
  ret = nbd_receive_negotiate(QIO_CHANNEL(sioc), export,
  tlscreds, hostname,
  &client->ioc, &client->info, errp);
+    g_free(client->info.x_dirty_bitmap);


hm, pointer remains invalid. If you want free it here, what is the 
reason to strdup it? At least, it worth to zero out the pointer after 
g_free.. Or, free it not here but in client close.


The g_strdup() is necessary since client.x_dirty_bitmap is 'const char 
*'.  Nothing used the pointer after it is freed, although I could agree 
that assigning it to NULL to make that point obvious wouldn't hurt; 
however, the pull request was already taken as-is, so we can just live 
with it until the 3.1 improvements.




with pointer zeroed or g_free to client destroy procedure,

Reviewed-by: Vladimir Sementsov-Ogievskiy 





--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [PATCH 2/2] qapi: add transaction support for x-block-dirty-bitmap-merge

2018-07-03 Thread Eric Blake

On 07/03/2018 05:53 AM, Vladimir Sementsov-Ogievskiy wrote:

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
  qapi/transaction.json |  2 ++
  blockdev.c| 53 ++-
  2 files changed, 54 insertions(+), 1 deletion(-)


I'll let John handle this one, but it looks reasonable to me.

--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [PATCH v2 2/2] iotests: New test 223 for exporting dirty bitmap over NBD

2018-07-03 Thread Eric Blake

On 07/03/2018 05:09 AM, Vladimir Sementsov-Ogievskiy wrote:

02.07.2018 22:14, Eric Blake wrote:

Although this test is NOT a full test of image fleecing (as it
intentionally uses just a single block device directly exported
over NBD, rather than trying to set up a blockdev-backup job with
multiple BDS involved), it DOES prove that qemu as a server is
able to properly expose a dirty bitmap over NBD.

When coupled with image fleecing, it is then possible for a
third-party client to do an incremental backup by using
qemu-img map with the x-dirty-bitmap option to learn which parts
of the file are dirty (perhaps confusingly, they are the portions
mapped as "data":false - which is part of the reason this is
still in the x- experimental namespace), along with another
normal client (perhaps 'qemu-nbd -c' to expose the server over
/dev/nbd0 and then just use normal I/O on that block device) to
read the dirty sections.




+
+echo
+echo "=== End NBD server ==="
+echo
+
+_send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-remove",
+  "arguments":{"name":"n"}}' "return"
+_send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-stop"}' "return"


blockdev-del is not necessary?


I guess it's symmetric in that since we hotplugged the disk, we would 
also make sure hotunplug works after everything else has quit using it. 
But even the nbd-server-stop is not strictly necessary, since quitting 
qemu should have the same effect.  At this point, I'm not too worried 
about changing the test.




with or without:

Reviewed-by: Vladimir Sementsov-Ogievskiy 


Thanks for the review; the pull request went through without your 
notation being appended, but it never hurts to have additional review 
(and we still have time even after 3.0 soft freeze to fix any important 
bugs in what went in).


--
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org



Re: [Qemu-block] [Qemu-devel] [PATCH v4 2/2] iotests: add 222 to test basic fleecing

2018-07-03 Thread John Snow



On 07/03/2018 05:15 AM, Vladimir Sementsov-Ogievskiy wrote:
> 02.07.2018 22:46, John Snow wrote:
>> Signed-off-by: John Snow 
> 
> 
> Reviewed-by: Vladimir Sementsov-Ogievskiy 
> 

Thanks, and sorry for the 3.0 freeze rush.



Re: [Qemu-block] [PULL 5/5] block: Add blklogwrites

2018-07-03 Thread Kevin Wolf
Am 03.07.2018 um 16:59 hat Kevin Wolf geschrieben:
> From: Aapo Vienamo 
> 
> Implements a block device write logging system, similar to Linux kernel
> device mapper dm-log-writes. The write operations that are performed
> on a block device are logged to a file or another block device. The
> write log format is identical to the dm-log-writes format. Currently,
> log markers are not supported.
> 
> This functionality can be used for crash consistency and fs consistency
> testing. By implementing it in qemu, tests utilizing write logs can be
> be used to test non-Linux drivers and older kernels.
> 
> The driver accepts an optional parameter to set the sector size used
> for logging. This makes the driver require all requests to be aligned
> to this sector size and also makes offsets and sizes of writes in the
> log metadata to be expressed in terms of this value (the log format has
> a granularity of one sector for offsets and sizes). This allows
> accurate logging of writes to guest block devices that have unusual
> sector sizes.
> 
> The implementation is based on the blkverify and blkdebug block
> drivers.
> 
> Signed-off-by: Aapo Vienamo 
> Signed-off-by: Ari Sundholm 
> Signed-off-by: Kevin Wolf 

Note that I saw Ari's v7 right after sending the pull request. It
contains only a few simple bugfixes compared to the version I had
queued, so I quickly squashed the following in and updated the tag.

Kevin


diff --git a/block/blklogwrites.c b/block/blklogwrites.c
index 0748b56d72..47093fadd6 100644
--- a/block/blklogwrites.c
+++ b/block/blklogwrites.c
@@ -53,6 +53,19 @@ typedef struct {
 uint64_t nr_entries;
 } BDRVBlkLogWritesState;
 
+static QemuOptsList runtime_opts = {
+.name = "blklogwrites",
+.head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+.desc = {
+{
+.name = "log-sector-size",
+.type = QEMU_OPT_SIZE,
+.help = "Log sector size",
+},
+{ /* end of list */ }
+},
+};
+
 static inline uint32_t blk_log_writes_log2(uint32_t value)
 {
 assert(value > 0);
@@ -63,9 +76,18 @@ static int blk_log_writes_open(BlockDriverState *bs, QDict 
*options, int flags,
Error **errp)
 {
 BDRVBlkLogWritesState *s = bs->opaque;
+QemuOpts *opts;
 Error *local_err = NULL;
 int ret;
-int64_t log_sector_size = BDRV_SECTOR_SIZE;
+int64_t log_sector_size;
+
+opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
+qemu_opts_absorb_qdict(opts, options, &local_err);
+if (local_err) {
+ret = -EINVAL;
+error_propagate(errp, local_err);
+goto fail;
+}
 
 /* Open the file */
 bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false,
@@ -76,12 +98,10 @@ static int blk_log_writes_open(BlockDriverState *bs, QDict 
*options, int flags,
 goto fail;
 }
 
-if (qdict_haskey(options, "log-sector-size")) {
-log_sector_size = qdict_get_int(options, "log-sector-size");
-qdict_del(options, "log-sector-size");
-}
+log_sector_size = qemu_opt_get_size(opts, "log-sector-size",
+BDRV_SECTOR_SIZE);
 
-if (log_sector_size < 0 || log_sector_size >= (1ull << 32) ||
+if (log_sector_size < 0 || log_sector_size > (1ull << 23) ||
 !is_power_of_2(log_sector_size))
 {
 ret = -EINVAL;
@@ -109,6 +129,7 @@ fail:
 bdrv_unref_child(bs, bs->file);
 bs->file = NULL;
 }
+qemu_opts_del(opts);
 return ret;
 }
 
@@ -144,6 +165,7 @@ static void 
blk_log_writes_refresh_filename(BlockDriverState *bs,
 qobject_ref(s->log_file->bs->full_open_options);
 qdict_put_obj(opts, "log",
   QOBJECT(s->log_file->bs->full_open_options));
+qdict_put_int(opts, "log-sector-size", s->sectorsize);
 
 bs->full_open_options = opts;
 }



Re: [Qemu-block] [PATCH v7 0/2] New block driver: blklogwrites

2018-07-03 Thread Kevin Wolf
Am 03.07.2018 um 16:48 hat Ari Sundholm geschrieben:
> This patch series adds a new block driver, blklogwrites, to QEMU. The
> driver is given two block devices: a raw device backed by an image or a
> host block device, and a log device, typically backed by a file, on
> which writes to the raw device are logged.
> 
> The logging format used is the same as in the dm-log-writes target of
> the Linux kernel device mapper. The log reflects the writes that have
> been performed on the guest block device and flushed. To be strict, the
> log may contain writes that have not been flushed yet, but they are
> technically outside the bounds of the log until the next flush updates
> the metadata in the log superblock. We believe these semantics to be
> useful even though they may not be completely identical to those of
> dm-log-writes.
> 
> This functionality can be used for crash consistency and fs consistency
> testing in filesystem drivers, including on non-Linux guests and older
> guests running Linux versions without support for dm-log-writes. This
> is simple and useful. Admittedly this and more advanced things could
> perhaps be done by extending the quorum driver, but this approach would
> require re-engineering the functionality and involve a more complicated
> setup, so we offer this simple solution which we have found useful
> internally.
> 
> In the first patch of this series, two block permission constants are
> moved from block.c to include/block/block.h to make them available
> outside of block.c. The next patch uses these constants.
> 
> In the second patch, the blklogwrites driver is introduced. The driver
> accepts a target block device to be logged, a log device and an option
> to set the sector size used for logging. All requests are aligned to
> this sector size, and offsets and sizes in the log entries will be
> expressed as a multiple of this value.

Thanks, applied to the block branch.

Kevin



[Qemu-block] [PULL 2/5] qcow2: refactor data compression

2018-07-03 Thread Kevin Wolf
From: Vladimir Sementsov-Ogievskiy 

Make a separate function for compression to be parallelized later.
 - use .avail_out field instead of .next_out to calculate size of
   compressed data. It looks more natural and it allows to keep dest to
   be void pointer
 - set avail_out to be at least one byte less than input, to be sure
   avoid inefficient compression earlier

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Kevin Wolf 
---
 block/qcow2.c | 76 ++-
 1 file changed, 49 insertions(+), 27 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 2f9e58e0c4..67f4fb7c71 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -23,11 +23,14 @@
  */
 
 #include "qemu/osdep.h"
+
+#define ZLIB_CONST
+#include 
+
 #include "block/block_int.h"
 #include "block/qdict.h"
 #include "sysemu/block-backend.h"
 #include "qemu/module.h"
-#include 
 #include "qcow2.h"
 #include "qemu/error-report.h"
 #include "qapi/error.h"
@@ -3650,6 +3653,46 @@ fail:
 return ret;
 }
 
+/*
+ * qcow2_compress()
+ *
+ * @dest - destination buffer, at least of @size-1 bytes
+ * @src - source buffer, @size bytes
+ *
+ * Returns: compressed size on success
+ *  -1 if compression is inefficient
+ *  -2 on any other error
+ */
+static ssize_t qcow2_compress(void *dest, const void *src, size_t size)
+{
+ssize_t ret;
+z_stream strm;
+
+/* best compression, small window, no zlib header */
+memset(&strm, 0, sizeof(strm));
+ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
+   -12, 9, Z_DEFAULT_STRATEGY);
+if (ret != 0) {
+return -2;
+}
+
+strm.avail_in = size;
+strm.next_in = src;
+strm.avail_out = size - 1;
+strm.next_out = dest;
+
+ret = deflate(&strm, Z_FINISH);
+if (ret == Z_STREAM_END) {
+ret = size - 1 - strm.avail_out;
+} else {
+ret = (ret == Z_OK ? -1 : -2);
+}
+
+deflateEnd(&strm);
+
+return ret;
+}
+
 /* XXX: put compressed sectors first, then all the cluster aligned
tables to avoid losing bytes in alignment */
 static coroutine_fn int
@@ -3659,8 +3702,8 @@ qcow2_co_pwritev_compressed(BlockDriverState *bs, 
uint64_t offset,
 BDRVQcow2State *s = bs->opaque;
 QEMUIOVector hd_qiov;
 struct iovec iov;
-z_stream strm;
-int ret, out_len;
+int ret;
+size_t out_len;
 uint8_t *buf, *out_buf;
 int64_t cluster_offset;
 
@@ -3694,32 +3737,11 @@ qcow2_co_pwritev_compressed(BlockDriverState *bs, 
uint64_t offset,
 
 out_buf = g_malloc(s->cluster_size);
 
-/* best compression, small window, no zlib header */
-memset(&strm, 0, sizeof(strm));
-ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION,
-   Z_DEFLATED, -12,
-   9, Z_DEFAULT_STRATEGY);
-if (ret != 0) {
+out_len = qcow2_compress(out_buf, buf, s->cluster_size);
+if (out_len == -2) {
 ret = -EINVAL;
 goto fail;
-}
-
-strm.avail_in = s->cluster_size;
-strm.next_in = (uint8_t *)buf;
-strm.avail_out = s->cluster_size;
-strm.next_out = out_buf;
-
-ret = deflate(&strm, Z_FINISH);
-if (ret != Z_STREAM_END && ret != Z_OK) {
-deflateEnd(&strm);
-ret = -EINVAL;
-goto fail;
-}
-out_len = strm.next_out - out_buf;
-
-deflateEnd(&strm);
-
-if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
+} else if (out_len == -1) {
 /* could not compress: write normal cluster */
 ret = qcow2_co_pwritev(bs, offset, bytes, qiov, 0);
 if (ret < 0) {
-- 
2.13.6




[Qemu-block] [PULL 5/5] block: Add blklogwrites

2018-07-03 Thread Kevin Wolf
From: Aapo Vienamo 

Implements a block device write logging system, similar to Linux kernel
device mapper dm-log-writes. The write operations that are performed
on a block device are logged to a file or another block device. The
write log format is identical to the dm-log-writes format. Currently,
log markers are not supported.

This functionality can be used for crash consistency and fs consistency
testing. By implementing it in qemu, tests utilizing write logs can be
be used to test non-Linux drivers and older kernels.

The driver accepts an optional parameter to set the sector size used
for logging. This makes the driver require all requests to be aligned
to this sector size and also makes offsets and sizes of writes in the
log metadata to be expressed in terms of this value (the log format has
a granularity of one sector for offsets and sizes). This allows
accurate logging of writes to guest block devices that have unusual
sector sizes.

The implementation is based on the blkverify and blkdebug block
drivers.

Signed-off-by: Aapo Vienamo 
Signed-off-by: Ari Sundholm 
Signed-off-by: Kevin Wolf 
---
 qapi/block-core.json |  33 -
 block/blklogwrites.c | 392 +++
 MAINTAINERS  |   6 +
 block/Makefile.objs  |   1 +
 4 files changed, 426 insertions(+), 6 deletions(-)
 create mode 100644 block/blklogwrites.c

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 90e554ed0f..a9eab8cab8 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -2533,16 +2533,17 @@
 # @throttle: Since 2.11
 # @nvme: Since 2.12
 # @copy-on-read: Since 3.0
+# @blklogwrites: Since 3.0
 #
 # Since: 2.9
 ##
 { 'enum': 'BlockdevDriver',
-  'data': [ 'blkdebug', 'blkverify', 'bochs', 'cloop', 'copy-on-read',
-'dmg', 'file', 'ftp', 'ftps', 'gluster', 'host_cdrom',
-'host_device', 'http', 'https', 'iscsi', 'luks', 'nbd', 'nfs',
-'null-aio', 'null-co', 'nvme', 'parallels', 'qcow', 'qcow2', 'qed',
-'quorum', 'raw', 'rbd', 'replication', 'sheepdog', 'ssh',
-'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] }
+  'data': [ 'blkdebug', 'blklogwrites', 'blkverify', 'bochs', 'cloop',
+'copy-on-read', 'dmg', 'file', 'ftp', 'ftps', 'gluster',
+'host_cdrom', 'host_device', 'http', 'https', 'iscsi', 'luks',
+'nbd', 'nfs', 'null-aio', 'null-co', 'nvme', 'parallels', 'qcow',
+'qcow2', 'qed', 'quorum', 'raw', 'rbd', 'replication', 'sheepdog',
+'ssh', 'throttle', 'vdi', 'vhdx', 'vmdk', 'vpc', 'vvfat', 'vxhs' ] 
}
 
 ##
 # @BlockdevOptionsFile:
@@ -3045,6 +3046,25 @@
 '*set-state': ['BlkdebugSetStateOptions'] } }
 
 ##
+# @BlockdevOptionsBlklogwrites:
+#
+# Driver specific block device options for blklogwrites.
+#
+# @file:block device
+#
+# @log: block device used to log writes to @file
+#
+# @log-sector-size: sector size used in logging writes to @file, determines
+#   granularity of offsets and sizes of writes (default: 512)
+#
+# Since: 3.0
+##
+{ 'struct': 'BlockdevOptionsBlklogwrites',
+  'data': { 'file': 'BlockdevRef',
+'log': 'BlockdevRef',
+'*log-sector-size': 'uint32' } }
+
+##
 # @BlockdevOptionsBlkverify:
 #
 # Driver specific block device options for blkverify.
@@ -3563,6 +3583,7 @@
   'discriminator': 'driver',
   'data': {
   'blkdebug':   'BlockdevOptionsBlkdebug',
+  'blklogwrites':'BlockdevOptionsBlklogwrites',
   'blkverify':  'BlockdevOptionsBlkverify',
   'bochs':  'BlockdevOptionsGenericFormat',
   'cloop':  'BlockdevOptionsGenericFormat',
diff --git a/block/blklogwrites.c b/block/blklogwrites.c
new file mode 100644
index 00..0748b56d72
--- /dev/null
+++ b/block/blklogwrites.c
@@ -0,0 +1,392 @@
+/*
+ * Write logging blk driver based on blkverify and blkdebug.
+ *
+ * Copyright (c) 2017 Tuomas Tynkkynen 
+ * Copyright (c) 2018 Aapo Vienamo 
+ * Copyright (c) 2018 Ari Sundholm 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
+#include "block/block_int.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qstring.h"
+#include "qemu/cutils.h"
+#include "qemu/option.h"
+
+/* Disk format stuff - taken from Linux drivers/md/dm-log-writes.c */
+
+#define LOG_FLUSH_FLAG   (1 << 0)
+#define LOG_FUA_FLAG (1 << 1)
+#define LOG_DISCARD_FLAG (1 << 2)
+#define LOG_MARK_FLAG(1 << 3)
+
+#define WRITE_LOG_VERSION 1ULL
+#define WRITE_LOG_MAGIC 0x6a736677736872ULL
+
+/* All fields are little-endian. */
+struct log_write_super {
+uint64_t magic;
+uint64_t version;
+uint64_t nr_entries;
+uint32_t sectorsize;
+} QEMU_PACKED;
+
+struct log_write_entry {
+uint64_t sector;
+uint64_t nr_sectors;
+uint64_t flags;
+

[Qemu-block] [PULL 0/5] Block layer patches

2018-07-03 Thread Kevin Wolf
The following changes since commit a395717cbd26e7593d3c3fe81faca121ec6d13e8:

  Merge remote-tracking branch 'remotes/cody/tags/block-pull-request' into 
staging (2018-07-03 11:49:51 +0100)

are available in the git repository at:

  git://repo.or.cz/qemu/kevin.git tags/for-upstream

for you to fetch changes up to 59738025a1674bb7e07713c3c93ff4fb9c5079f5:

  block: Add blklogwrites (2018-07-03 16:09:48 +0200)


Block layer patches:

- qcow2: Use worker threads for compression to improve performance of
  'qemu-img convert -W' and compressed backup jobs
- blklogwrites: New filter driver to log write requests to an image in
  the dm-log-writes format


Aapo Vienamo (1):
  block: Add blklogwrites

Ari Sundholm (1):
  block: Move two block permission constants to the relevant enum

Vladimir Sementsov-Ogievskiy (3):
  qemu-img: allow compressed not-in-order writes
  qcow2: refactor data compression
  qcow2: add compress threads

 qapi/block-core.json  |  33 -
 block/qcow2.h |   3 +
 include/block/block.h |   7 +
 block.c   |   6 -
 block/blklogwrites.c  | 392 ++
 block/qcow2.c | 136 ++
 qemu-img.c|   5 -
 MAINTAINERS   |   6 +
 block/Makefile.objs   |   1 +
 9 files changed, 545 insertions(+), 44 deletions(-)
 create mode 100644 block/blklogwrites.c



[Qemu-block] [PULL 3/5] qcow2: add compress threads

2018-07-03 Thread Kevin Wolf
From: Vladimir Sementsov-Ogievskiy 

Do data compression in separate threads. This significantly improve
performance for qemu-img convert with -W (allow async writes) and -c
(compressed) options.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Kevin Wolf 
---
 block/qcow2.h |  3 +++
 block/qcow2.c | 62 ++-
 2 files changed, 64 insertions(+), 1 deletion(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index 1c9c0d3631..d6aca687d6 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -326,6 +326,9 @@ typedef struct BDRVQcow2State {
  * override) */
 char *image_backing_file;
 char *image_backing_format;
+
+CoQueue compress_wait_queue;
+int nb_compress_threads;
 } BDRVQcow2State;
 
 typedef struct Qcow2COWRegion {
diff --git a/block/qcow2.c b/block/qcow2.c
index 67f4fb7c71..327685a2f9 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -44,6 +44,7 @@
 #include "qapi/qobject-input-visitor.h"
 #include "qapi/qapi-visit-block-core.h"
 #include "crypto.h"
+#include "block/thread-pool.h"
 
 /*
   Differences with QCOW:
@@ -1544,6 +1545,9 @@ static int coroutine_fn qcow2_do_open(BlockDriverState 
*bs, QDict *options,
 qcow2_check_refcounts(bs, &result, 0);
 }
 #endif
+
+qemu_co_queue_init(&s->compress_wait_queue);
+
 return ret;
 
  fail:
@@ -3693,6 +3697,62 @@ static ssize_t qcow2_compress(void *dest, const void 
*src, size_t size)
 return ret;
 }
 
+#define MAX_COMPRESS_THREADS 4
+
+typedef struct Qcow2CompressData {
+void *dest;
+const void *src;
+size_t size;
+ssize_t ret;
+} Qcow2CompressData;
+
+static int qcow2_compress_pool_func(void *opaque)
+{
+Qcow2CompressData *data = opaque;
+
+data->ret = qcow2_compress(data->dest, data->src, data->size);
+
+return 0;
+}
+
+static void qcow2_compress_complete(void *opaque, int ret)
+{
+qemu_coroutine_enter(opaque);
+}
+
+/* See qcow2_compress definition for parameters description */
+static ssize_t qcow2_co_compress(BlockDriverState *bs,
+ void *dest, const void *src, size_t size)
+{
+BDRVQcow2State *s = bs->opaque;
+BlockAIOCB *acb;
+ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
+Qcow2CompressData arg = {
+.dest = dest,
+.src = src,
+.size = size,
+};
+
+while (s->nb_compress_threads >= MAX_COMPRESS_THREADS) {
+qemu_co_queue_wait(&s->compress_wait_queue, NULL);
+}
+
+s->nb_compress_threads++;
+acb = thread_pool_submit_aio(pool, qcow2_compress_pool_func, &arg,
+ qcow2_compress_complete,
+ qemu_coroutine_self());
+
+if (!acb) {
+s->nb_compress_threads--;
+return -EINVAL;
+}
+qemu_coroutine_yield();
+s->nb_compress_threads--;
+qemu_co_queue_next(&s->compress_wait_queue);
+
+return arg.ret;
+}
+
 /* XXX: put compressed sectors first, then all the cluster aligned
tables to avoid losing bytes in alignment */
 static coroutine_fn int
@@ -3737,7 +3797,7 @@ qcow2_co_pwritev_compressed(BlockDriverState *bs, 
uint64_t offset,
 
 out_buf = g_malloc(s->cluster_size);
 
-out_len = qcow2_compress(out_buf, buf, s->cluster_size);
+out_len = qcow2_co_compress(bs, out_buf, buf, s->cluster_size);
 if (out_len == -2) {
 ret = -EINVAL;
 goto fail;
-- 
2.13.6




[Qemu-block] [PULL 1/5] qemu-img: allow compressed not-in-order writes

2018-07-03 Thread Kevin Wolf
From: Vladimir Sementsov-Ogievskiy 

No reason to forbid them, and they are needed to improve performance
with compress-threads in further patches.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
Signed-off-by: Kevin Wolf 
---
 qemu-img.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index e1a506f7f6..7651d8172c 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -2141,11 +2141,6 @@ static int img_convert(int argc, char **argv)
 goto fail_getopt;
 }
 
-if (!s.wr_in_order && s.compressed) {
-error_report("Out of order write and compress are mutually exclusive");
-goto fail_getopt;
-}
-
 if (tgt_image_opts && !skip_create) {
 error_report("--target-image-opts requires use of -n flag");
 goto fail_getopt;
-- 
2.13.6




[Qemu-block] [PULL 4/5] block: Move two block permission constants to the relevant enum

2018-07-03 Thread Kevin Wolf
From: Ari Sundholm 

This allows using the two constants outside of block.c, which will
happen in a subsequent patch.

Signed-off-by: Ari Sundholm 
Signed-off-by: Kevin Wolf 
---
 include/block/block.h | 7 +++
 block.c   | 6 --
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/include/block/block.h b/include/block/block.h
index e5c7759a0c..bc76b1e59f 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -225,6 +225,13 @@ enum {
 BLK_PERM_GRAPH_MOD  = 0x10,
 
 BLK_PERM_ALL= 0x1f,
+
+DEFAULT_PERM_PASSTHROUGH= BLK_PERM_CONSISTENT_READ
+ | BLK_PERM_WRITE
+ | BLK_PERM_WRITE_UNCHANGED
+ | BLK_PERM_RESIZE,
+
+DEFAULT_PERM_UNCHANGED  = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH,
 };
 
 char *bdrv_perm_names(uint64_t perm);
diff --git a/block.c b/block.c
index 70a46fdd84..961ec97d26 100644
--- a/block.c
+++ b/block.c
@@ -1948,12 +1948,6 @@ int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, 
uint64_t shared,
 return 0;
 }
 
-#define DEFAULT_PERM_PASSTHROUGH (BLK_PERM_CONSISTENT_READ \
- | BLK_PERM_WRITE \
- | BLK_PERM_WRITE_UNCHANGED \
- | BLK_PERM_RESIZE)
-#define DEFAULT_PERM_UNCHANGED (BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH)
-
 void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
const BdrvChildRole *role,
BlockReopenQueue *reopen_queue,
-- 
2.13.6




[Qemu-block] [PATCH v7 2/2] block: Add blklogwrites

2018-07-03 Thread Ari Sundholm
From: Aapo Vienamo 

Implements a block device write logging system, similar to Linux kernel
device mapper dm-log-writes. The write operations that are performed
on a block device are logged to a file or another block device. The
write log format is identical to the dm-log-writes format. Currently,
log markers are not supported.

This functionality can be used for crash consistency and fs consistency
testing. By implementing it in qemu, tests utilizing write logs can be
be used to test non-Linux drivers and older kernels.

The driver accepts an optional parameter to set the sector size used
for logging. This makes the driver require all requests to be aligned
to this sector size and also makes offsets and sizes of writes in the
log metadata to be expressed in terms of this value (the log format has
a granularity of one sector for offsets and sizes). This allows
accurate logging of writes to guest block devices that have unusual
sector sizes.

The implementation is based on the blkverify and blkdebug block
drivers.

Signed-off-by: Aapo Vienamo 
Signed-off-by: Ari Sundholm 
---
 MAINTAINERS  |   6 +
 block/Makefile.objs  |   1 +
 block/blklogwrites.c | 414 +++
 qapi/block-core.json |  33 +++-
 4 files changed, 448 insertions(+), 6 deletions(-)
 create mode 100644 block/blklogwrites.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 42a1892..5af89e7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2051,6 +2051,12 @@ S: Supported
 F: block/quorum.c
 L: qemu-block@nongnu.org
 
+blklogwrites
+M: Ari Sundholm 
+L: qemu-block@nongnu.org
+S: Supported
+F: block/blklogwrites.c
+
 blkverify
 M: Stefan Hajnoczi 
 L: qemu-block@nongnu.org
diff --git a/block/Makefile.objs b/block/Makefile.objs
index 899bfb5..c8337bf 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -5,6 +5,7 @@ block-obj-y += qed-check.o
 block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o
 block-obj-y += quorum.o
 block-obj-y += parallels.o blkdebug.o blkverify.o blkreplay.o
+block-obj-y += blklogwrites.o
 block-obj-y += block-backend.o snapshot.o qapi.o
 block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o
 block-obj-$(CONFIG_POSIX) += file-posix.o
diff --git a/block/blklogwrites.c b/block/blklogwrites.c
new file mode 100644
index 000..47093fa
--- /dev/null
+++ b/block/blklogwrites.c
@@ -0,0 +1,414 @@
+/*
+ * Write logging blk driver based on blkverify and blkdebug.
+ *
+ * Copyright (c) 2017 Tuomas Tynkkynen 
+ * Copyright (c) 2018 Aapo Vienamo 
+ * Copyright (c) 2018 Ari Sundholm 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
+#include "block/block_int.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qstring.h"
+#include "qemu/cutils.h"
+#include "qemu/option.h"
+
+/* Disk format stuff - taken from Linux drivers/md/dm-log-writes.c */
+
+#define LOG_FLUSH_FLAG   (1 << 0)
+#define LOG_FUA_FLAG (1 << 1)
+#define LOG_DISCARD_FLAG (1 << 2)
+#define LOG_MARK_FLAG(1 << 3)
+
+#define WRITE_LOG_VERSION 1ULL
+#define WRITE_LOG_MAGIC 0x6a736677736872ULL
+
+/* All fields are little-endian. */
+struct log_write_super {
+uint64_t magic;
+uint64_t version;
+uint64_t nr_entries;
+uint32_t sectorsize;
+} QEMU_PACKED;
+
+struct log_write_entry {
+uint64_t sector;
+uint64_t nr_sectors;
+uint64_t flags;
+uint64_t data_len;
+} QEMU_PACKED;
+
+/* End of disk format structures. */
+
+typedef struct {
+BdrvChild *log_file;
+uint32_t sectorsize;
+uint32_t sectorbits;
+uint64_t cur_log_sector;
+uint64_t nr_entries;
+} BDRVBlkLogWritesState;
+
+static QemuOptsList runtime_opts = {
+.name = "blklogwrites",
+.head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head),
+.desc = {
+{
+.name = "log-sector-size",
+.type = QEMU_OPT_SIZE,
+.help = "Log sector size",
+},
+{ /* end of list */ }
+},
+};
+
+static inline uint32_t blk_log_writes_log2(uint32_t value)
+{
+assert(value > 0);
+return 31 - clz32(value);
+}
+
+static int blk_log_writes_open(BlockDriverState *bs, QDict *options, int flags,
+   Error **errp)
+{
+BDRVBlkLogWritesState *s = bs->opaque;
+QemuOpts *opts;
+Error *local_err = NULL;
+int ret;
+int64_t log_sector_size;
+
+opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort);
+qemu_opts_absorb_qdict(opts, options, &local_err);
+if (local_err) {
+ret = -EINVAL;
+error_propagate(errp, local_err);
+goto fail;
+}
+
+/* Open the file */
+bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false,
+   &local_err);
+if (local_err) {
+ret = -EINVAL;
+error_propagate(errp, local_err);
+goto fail;
+}
+
+   

[Qemu-block] [PATCH v7 1/2] block: Move two block permission constants to the relevant enum

2018-07-03 Thread Ari Sundholm
This allows using the two constants outside of block.c, which will
happen in a subsequent patch.

Signed-off-by: Ari Sundholm 
---
 block.c   | 6 --
 include/block/block.h | 7 +++
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/block.c b/block.c
index 70a46fd..961ec97 100644
--- a/block.c
+++ b/block.c
@@ -1948,12 +1948,6 @@ int bdrv_child_try_set_perm(BdrvChild *c, uint64_t perm, 
uint64_t shared,
 return 0;
 }
 
-#define DEFAULT_PERM_PASSTHROUGH (BLK_PERM_CONSISTENT_READ \
- | BLK_PERM_WRITE \
- | BLK_PERM_WRITE_UNCHANGED \
- | BLK_PERM_RESIZE)
-#define DEFAULT_PERM_UNCHANGED (BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH)
-
 void bdrv_filter_default_perms(BlockDriverState *bs, BdrvChild *c,
const BdrvChildRole *role,
BlockReopenQueue *reopen_queue,
diff --git a/include/block/block.h b/include/block/block.h
index e5c7759..bc76b1e 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -225,6 +225,13 @@ enum {
 BLK_PERM_GRAPH_MOD  = 0x10,
 
 BLK_PERM_ALL= 0x1f,
+
+DEFAULT_PERM_PASSTHROUGH= BLK_PERM_CONSISTENT_READ
+ | BLK_PERM_WRITE
+ | BLK_PERM_WRITE_UNCHANGED
+ | BLK_PERM_RESIZE,
+
+DEFAULT_PERM_UNCHANGED  = BLK_PERM_ALL & ~DEFAULT_PERM_PASSTHROUGH,
 };
 
 char *bdrv_perm_names(uint64_t perm);
-- 
2.7.4




[Qemu-block] [PATCH v7 0/2] New block driver: blklogwrites

2018-07-03 Thread Ari Sundholm
This patch series adds a new block driver, blklogwrites, to QEMU. The
driver is given two block devices: a raw device backed by an image or a
host block device, and a log device, typically backed by a file, on
which writes to the raw device are logged.

The logging format used is the same as in the dm-log-writes target of
the Linux kernel device mapper. The log reflects the writes that have
been performed on the guest block device and flushed. To be strict, the
log may contain writes that have not been flushed yet, but they are
technically outside the bounds of the log until the next flush updates
the metadata in the log superblock. We believe these semantics to be
useful even though they may not be completely identical to those of
dm-log-writes.

This functionality can be used for crash consistency and fs consistency
testing in filesystem drivers, including on non-Linux guests and older
guests running Linux versions without support for dm-log-writes. This
is simple and useful. Admittedly this and more advanced things could
perhaps be done by extending the quorum driver, but this approach would
require re-engineering the functionality and involve a more complicated
setup, so we offer this simple solution which we have found useful
internally.

In the first patch of this series, two block permission constants are
moved from block.c to include/block/block.h to make them available
outside of block.c. The next patch uses these constants.

In the second patch, the blklogwrites driver is introduced. The driver
accepts a target block device to be logged, a log device and an option
to set the sector size used for logging. All requests are aligned to
this sector size, and offsets and sizes in the log entries will be
expressed as a multiple of this value.

v7:
  - Fix invocation using -drive by re-adding the use of QemuOpts
  - Reduce maximum log sector size to 8 MB
  - Add missing addition of log sector size to full_open_options
  - Rebase on current upstream master

v6:
  - Remove patches containing the mechanism to pass block
configurations to block drivers
  - Replace the use of the above-mentioned mechanism with an option to
specify the sector size used for logging
  - Rebase on current upstream master

v5:
  - Reorder patches and squash blklogwrites driver patches together
  - Rebase on current upstream master
  - No functional changes

v4:
  - Check return value of snprintf() (flagged by patchew)
  - Don't use C99 for loop initial declaration (flagged by patchew, but
wasn't QEMU supposed to be C99? Oh well.)
  - Add proper "Since" fields for blklogwrites in block-core.json
  - Rebase on current upstream master

v3:
  - Rebase on current upstream master

v2:
  - Incorporate review feedback
  - Add patches to apply block configurations in more block device
drivers


Aapo Vienamo (1):
  block: Add blklogwrites

Ari Sundholm (1):
  block: Move two block permission constants to the relevant enum

 MAINTAINERS   |   6 +
 block.c   |   6 -
 block/Makefile.objs   |   1 +
 block/blklogwrites.c  | 414 ++
 include/block/block.h |   7 +
 qapi/block-core.json  |  33 +++-
 6 files changed, 455 insertions(+), 12 deletions(-)
 create mode 100644 block/blklogwrites.c

-- 
2.7.4




Re: [Qemu-block] [PATCH v6 2/2] block: Add blklogwrites

2018-07-03 Thread Ari Sundholm

On 07/03/2018 04:06 PM, Kevin Wolf wrote:

Am 29.06.2018 um 22:47 hat Ari Sundholm geschrieben:

From: Aapo Vienamo 

Implements a block device write logging system, similar to Linux kernel
device mapper dm-log-writes. The write operations that are performed
on a block device are logged to a file or another block device. The
write log format is identical to the dm-log-writes format. Currently,
log markers are not supported.

This functionality can be used for crash consistency and fs consistency
testing. By implementing it in qemu, tests utilizing write logs can be
be used to test non-Linux drivers and older kernels.

The driver accepts an optional parameter to set the sector size used
for logging. This makes the driver require all requests to be aligned
to this sector size and also makes offsets and sizes of writes in the
log metadata to be expressed in terms of this value (the log format has
a granularity of one sector for offsets and sizes). This allows
accurate logging of writes to guest block devices that have unusual
sector sizes.

The implementation is based on the blkverify and blkdebug block
drivers.

Signed-off-by: Aapo Vienamo 
Signed-off-by: Ari Sundholm 
---
  MAINTAINERS  |   6 +
  block/Makefile.objs  |   1 +
  block/blklogwrites.c | 392 +++
  qapi/block-core.json |  33 -
  4 files changed, 426 insertions(+), 6 deletions(-)
  create mode 100644 block/blklogwrites.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 42a1892..5af89e7 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -2051,6 +2051,12 @@ S: Supported
  F: block/quorum.c
  L: qemu-block@nongnu.org
  
+blklogwrites

+M: Ari Sundholm 
+L: qemu-block@nongnu.org
+S: Supported
+F: block/blklogwrites.c
+
  blkverify
  M: Stefan Hajnoczi 
  L: qemu-block@nongnu.org
diff --git a/block/Makefile.objs b/block/Makefile.objs
index 899bfb5..c8337bf 100644
--- a/block/Makefile.objs
+++ b/block/Makefile.objs
@@ -5,6 +5,7 @@ block-obj-y += qed-check.o
  block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o
  block-obj-y += quorum.o
  block-obj-y += parallels.o blkdebug.o blkverify.o blkreplay.o
+block-obj-y += blklogwrites.o
  block-obj-y += block-backend.o snapshot.o qapi.o
  block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o
  block-obj-$(CONFIG_POSIX) += file-posix.o
diff --git a/block/blklogwrites.c b/block/blklogwrites.c
new file mode 100644
index 000..0748b56
--- /dev/null
+++ b/block/blklogwrites.c
@@ -0,0 +1,392 @@
+/*
+ * Write logging blk driver based on blkverify and blkdebug.
+ *
+ * Copyright (c) 2017 Tuomas Tynkkynen 
+ * Copyright (c) 2018 Aapo Vienamo 
+ * Copyright (c) 2018 Ari Sundholm 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ */
+
+#include "qemu/osdep.h"
+#include "qapi/error.h"
+#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
+#include "block/block_int.h"
+#include "qapi/qmp/qdict.h"
+#include "qapi/qmp/qstring.h"
+#include "qemu/cutils.h"
+#include "qemu/option.h"
+
+/* Disk format stuff - taken from Linux drivers/md/dm-log-writes.c */
+
+#define LOG_FLUSH_FLAG   (1 << 0)
+#define LOG_FUA_FLAG (1 << 1)
+#define LOG_DISCARD_FLAG (1 << 2)
+#define LOG_MARK_FLAG(1 << 3)
+
+#define WRITE_LOG_VERSION 1ULL
+#define WRITE_LOG_MAGIC 0x6a736677736872ULL
+
+/* All fields are little-endian. */
+struct log_write_super {
+uint64_t magic;
+uint64_t version;
+uint64_t nr_entries;
+uint32_t sectorsize;
+} QEMU_PACKED;
+
+struct log_write_entry {
+uint64_t sector;
+uint64_t nr_sectors;
+uint64_t flags;
+uint64_t data_len;
+} QEMU_PACKED;
+
+/* End of disk format structures. */
+
+typedef struct {
+BdrvChild *log_file;
+uint32_t sectorsize;
+uint32_t sectorbits;
+uint64_t cur_log_sector;
+uint64_t nr_entries;
+} BDRVBlkLogWritesState;
+
+static inline uint32_t blk_log_writes_log2(uint32_t value)
+{
+assert(value > 0);
+return 31 - clz32(value);
+}
+
+static int blk_log_writes_open(BlockDriverState *bs, QDict *options, int flags,
+   Error **errp)
+{
+BDRVBlkLogWritesState *s = bs->opaque;
+Error *local_err = NULL;
+int ret;
+int64_t log_sector_size = BDRV_SECTOR_SIZE;
+
+/* Open the file */
+bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false,
+   &local_err);
+if (local_err) {
+ret = -EINVAL;
+error_propagate(errp, local_err);
+goto fail;
+}
+
+if (qdict_haskey(options, "log-sector-size")) {
+log_sector_size = qdict_get_int(options, "log-sector-size");


This works with -blockdev and QMP blockdev-add, but not with -drive. The
problem is that the options QDict may contain the option in the proper
data type as specified in the QAPI schema, but it may also contain it as
a string in other code paths.

Other block drivers solve this by using QemuOpts for the driver options,
which c

Re: [Qemu-block] [PATCH v3 00/11] NBD reconnect

2018-07-03 Thread Vladimir Sementsov-Ogievskiy

Hi all.

before v4 realization, I'd like to discuss some questions.

Our proposal for v4 is the following:

1. don't reconnect on nbd_open. So, on open we do only one connect 
attempt, and if it fails, open fails.

2. don't configure timeout between attempts. instead do the following:
    1s timeout, then 2s, then 4, 8, 16, and then always 16 until success
3. configure only time, after disconnect, during which requests are 
paused (and after this time, if not connected, they will return EIO). Or 
not configure it for now, make a default of 5 minutes.


Any ideas?


09.06.2018 18:32, Vladimir Sementsov-Ogievskiy wrote:

Hi all.

Here is NBD reconnect.
The feature realized inside nbd-client driver and works as follows:

There are two parameters: reconnect-attempts and reconnect-timeout.
So, we will try to reconnect in case of initial connection failed or
in case of connection lost. All current and new io operations will wait
until we make reconnect-attempts tries to reconnect. After this, all
requests will fail with EIO, but we will continue trying to reconnect.

v3:
06: fix build error in function 'nbd_co_send_request':
  error: 'i' may be used uninitialized in this function

v2 notes:
Here is v2 of NBD reconnect, but it is very very different from v1, so,
forget about v1.
The series includes my "NBD reconnect: preliminary refactoring", with
changes in 05: leave asserts (Eric).

Vladimir Sementsov-Ogievskiy (11):
   block/nbd-client: split channel errors from export errors
   block/nbd: move connection code from block/nbd to block/nbd-client
   block/nbd-client: split connection from initialization
   block/nbd-client: fix nbd_reply_chunk_iter_receive
   block/nbd-client: don't check ioc
   block/nbd-client: move from quit to state
   block/nbd-client: rename read_reply_co to connection_co
   block/nbd-client: move connecting to connection_co
   block/nbd: add cmdline and qapi parameters for nbd reconnect
   block/nbd-client: nbd reconnect
   iotests: test nbd reconnect

  qapi/block-core.json  |  12 +-
  block/nbd-client.h|  23 ++-
  block/nbd-client.c| 429 ++
  block/nbd.c   |  61 +++---
  tests/qemu-iotests/220|  68 +++
  tests/qemu-iotests/220.out|   7 +
  tests/qemu-iotests/group  |   1 +
  tests/qemu-iotests/iotests.py |   4 +
  8 files changed, 445 insertions(+), 160 deletions(-)
  create mode 100755 tests/qemu-iotests/220
  create mode 100644 tests/qemu-iotests/220.out




--
Best regards,
Vladimir




Re: [Qemu-block] [PATCH v6 2/2] block: Add blklogwrites

2018-07-03 Thread Kevin Wolf
Am 29.06.2018 um 22:47 hat Ari Sundholm geschrieben:
> From: Aapo Vienamo 
> 
> Implements a block device write logging system, similar to Linux kernel
> device mapper dm-log-writes. The write operations that are performed
> on a block device are logged to a file or another block device. The
> write log format is identical to the dm-log-writes format. Currently,
> log markers are not supported.
> 
> This functionality can be used for crash consistency and fs consistency
> testing. By implementing it in qemu, tests utilizing write logs can be
> be used to test non-Linux drivers and older kernels.
> 
> The driver accepts an optional parameter to set the sector size used
> for logging. This makes the driver require all requests to be aligned
> to this sector size and also makes offsets and sizes of writes in the
> log metadata to be expressed in terms of this value (the log format has
> a granularity of one sector for offsets and sizes). This allows
> accurate logging of writes to guest block devices that have unusual
> sector sizes.
> 
> The implementation is based on the blkverify and blkdebug block
> drivers.
> 
> Signed-off-by: Aapo Vienamo 
> Signed-off-by: Ari Sundholm 
> ---
>  MAINTAINERS  |   6 +
>  block/Makefile.objs  |   1 +
>  block/blklogwrites.c | 392 
> +++
>  qapi/block-core.json |  33 -
>  4 files changed, 426 insertions(+), 6 deletions(-)
>  create mode 100644 block/blklogwrites.c
> 
> diff --git a/MAINTAINERS b/MAINTAINERS
> index 42a1892..5af89e7 100644
> --- a/MAINTAINERS
> +++ b/MAINTAINERS
> @@ -2051,6 +2051,12 @@ S: Supported
>  F: block/quorum.c
>  L: qemu-block@nongnu.org
>  
> +blklogwrites
> +M: Ari Sundholm 
> +L: qemu-block@nongnu.org
> +S: Supported
> +F: block/blklogwrites.c
> +
>  blkverify
>  M: Stefan Hajnoczi 
>  L: qemu-block@nongnu.org
> diff --git a/block/Makefile.objs b/block/Makefile.objs
> index 899bfb5..c8337bf 100644
> --- a/block/Makefile.objs
> +++ b/block/Makefile.objs
> @@ -5,6 +5,7 @@ block-obj-y += qed-check.o
>  block-obj-y += vhdx.o vhdx-endian.o vhdx-log.o
>  block-obj-y += quorum.o
>  block-obj-y += parallels.o blkdebug.o blkverify.o blkreplay.o
> +block-obj-y += blklogwrites.o
>  block-obj-y += block-backend.o snapshot.o qapi.o
>  block-obj-$(CONFIG_WIN32) += file-win32.o win32-aio.o
>  block-obj-$(CONFIG_POSIX) += file-posix.o
> diff --git a/block/blklogwrites.c b/block/blklogwrites.c
> new file mode 100644
> index 000..0748b56
> --- /dev/null
> +++ b/block/blklogwrites.c
> @@ -0,0 +1,392 @@
> +/*
> + * Write logging blk driver based on blkverify and blkdebug.
> + *
> + * Copyright (c) 2017 Tuomas Tynkkynen 
> + * Copyright (c) 2018 Aapo Vienamo 
> + * Copyright (c) 2018 Ari Sundholm 
> + *
> + * This work is licensed under the terms of the GNU GPL, version 2 or later.
> + * See the COPYING file in the top-level directory.
> + */
> +
> +#include "qemu/osdep.h"
> +#include "qapi/error.h"
> +#include "qemu/sockets.h" /* for EINPROGRESS on Windows */
> +#include "block/block_int.h"
> +#include "qapi/qmp/qdict.h"
> +#include "qapi/qmp/qstring.h"
> +#include "qemu/cutils.h"
> +#include "qemu/option.h"
> +
> +/* Disk format stuff - taken from Linux drivers/md/dm-log-writes.c */
> +
> +#define LOG_FLUSH_FLAG   (1 << 0)
> +#define LOG_FUA_FLAG (1 << 1)
> +#define LOG_DISCARD_FLAG (1 << 2)
> +#define LOG_MARK_FLAG(1 << 3)
> +
> +#define WRITE_LOG_VERSION 1ULL
> +#define WRITE_LOG_MAGIC 0x6a736677736872ULL
> +
> +/* All fields are little-endian. */
> +struct log_write_super {
> +uint64_t magic;
> +uint64_t version;
> +uint64_t nr_entries;
> +uint32_t sectorsize;
> +} QEMU_PACKED;
> +
> +struct log_write_entry {
> +uint64_t sector;
> +uint64_t nr_sectors;
> +uint64_t flags;
> +uint64_t data_len;
> +} QEMU_PACKED;
> +
> +/* End of disk format structures. */
> +
> +typedef struct {
> +BdrvChild *log_file;
> +uint32_t sectorsize;
> +uint32_t sectorbits;
> +uint64_t cur_log_sector;
> +uint64_t nr_entries;
> +} BDRVBlkLogWritesState;
> +
> +static inline uint32_t blk_log_writes_log2(uint32_t value)
> +{
> +assert(value > 0);
> +return 31 - clz32(value);
> +}
> +
> +static int blk_log_writes_open(BlockDriverState *bs, QDict *options, int 
> flags,
> +   Error **errp)
> +{
> +BDRVBlkLogWritesState *s = bs->opaque;
> +Error *local_err = NULL;
> +int ret;
> +int64_t log_sector_size = BDRV_SECTOR_SIZE;
> +
> +/* Open the file */
> +bs->file = bdrv_open_child(NULL, options, "file", bs, &child_file, false,
> +   &local_err);
> +if (local_err) {
> +ret = -EINVAL;
> +error_propagate(errp, local_err);
> +goto fail;
> +}
> +
> +if (qdict_haskey(options, "log-sector-size")) {
> +log_sector_size = qdict_get_int(options, "log-sector-size");

This works with -blockdev and QMP blockdev-add, but not with -drive. The
problem is that th

[Qemu-block] [PATCH V2] qemu-img: align result of is_allocated_sectors

2018-07-03 Thread Peter Lieven
We currently don't enforce that the sparse segments we detect during convert are
aligned. This leads to unnecessary and costly read-modify-write cycles either
internally in Qemu or in the background on the storage device as nearly all
modern filesystems or hardware have a 4k alignment internally.

The number of RMW cycles when converting an example image [1] to a raw device 
that
has 4k sector size is about 4600 4k read requests to perform a total of about 
15000
write requests. With this path the additional 4600 read requests are eliminated.

[1] 
https://cloud-images.ubuntu.com/releases/16.04/release/ubuntu-16.04-server-cloudimg-amd64-disk1.vmdk

Signed-off-by: Peter Lieven 
---
V1->V2: - take the current sector offset into account [Max]
- try to figure out the target alignment [Max]

 qemu-img.c | 44 ++--
 1 file changed, 34 insertions(+), 10 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index e1a506f..9490a74 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1105,8 +1105,11 @@ static int64_t find_nonzero(const uint8_t *buf, int64_t 
n)
  *
  * 'pnum' is set to the number of sectors (including and immediately following
  * the first one) that are known to be in the same allocated/unallocated state.
+ * The function will try to align 'pnum' to the number of sectors specified
+ * in 'alignment' to avoid unnecassary RMW cycles on modern hardware.
  */
-static int is_allocated_sectors(const uint8_t *buf, int n, int *pnum)
+static int is_allocated_sectors(const uint8_t *buf, int n, int *pnum,
+int64_t sector_num, int alignment)
 {
 bool is_zero;
 int i;
@@ -1115,14 +1118,25 @@ static int is_allocated_sectors(const uint8_t *buf, int 
n, int *pnum)
 *pnum = 0;
 return 0;
 }
-is_zero = buffer_is_zero(buf, 512);
-for(i = 1; i < n; i++) {
-buf += 512;
-if (is_zero != buffer_is_zero(buf, 512)) {
+
+if (sector_num % alignment) {
+n = ROUND_UP(sector_num, alignment) - sector_num;
+}
+
+if (n % alignment) {
+alignment = 1;
+}
+
+n /= alignment;
+
+is_zero = buffer_is_zero(buf, BDRV_SECTOR_SIZE * alignment);
+for (i = 1; i < n; i++) {
+buf += BDRV_SECTOR_SIZE * alignment;
+if (is_zero != buffer_is_zero(buf, BDRV_SECTOR_SIZE * alignment)) {
 break;
 }
 }
-*pnum = i;
+*pnum = i * alignment;
 return !is_zero;
 }
 
@@ -1132,7 +1146,7 @@ static int is_allocated_sectors(const uint8_t *buf, int 
n, int *pnum)
  * breaking up write requests for only small sparse areas.
  */
 static int is_allocated_sectors_min(const uint8_t *buf, int n, int *pnum,
-int min)
+int min, int64_t sector_num, int alignment)
 {
 int ret;
 int num_checked, num_used;
@@ -1141,7 +1155,7 @@ static int is_allocated_sectors_min(const uint8_t *buf, 
int n, int *pnum,
 min = n;
 }
 
-ret = is_allocated_sectors(buf, n, pnum);
+ret = is_allocated_sectors(buf, n, pnum, sector_num, alignment);
 if (!ret) {
 return ret;
 }
@@ -1149,13 +1163,15 @@ static int is_allocated_sectors_min(const uint8_t *buf, 
int n, int *pnum,
 num_used = *pnum;
 buf += BDRV_SECTOR_SIZE * *pnum;
 n -= *pnum;
+sector_num += *pnum;
 num_checked = num_used;
 
 while (n > 0) {
-ret = is_allocated_sectors(buf, n, pnum);
+ret = is_allocated_sectors(buf, n, pnum, sector_num, alignment);
 
 buf += BDRV_SECTOR_SIZE * *pnum;
 n -= *pnum;
+sector_num += *pnum;
 num_checked += *pnum;
 if (ret) {
 num_used = num_checked;
@@ -1560,6 +1576,7 @@ typedef struct ImgConvertState {
 bool wr_in_order;
 bool copy_range;
 int min_sparse;
+int alignment;
 size_t cluster_sectors;
 size_t buf_sectors;
 long num_coroutines;
@@ -1724,7 +1741,8 @@ static int coroutine_fn convert_co_write(ImgConvertState 
*s, int64_t sector_num,
  * zeroed. */
 if (!s->min_sparse ||
 (!s->compressed &&
- is_allocated_sectors_min(buf, n, &n, s->min_sparse)) ||
+ is_allocated_sectors_min(buf, n, &n, s->min_sparse,
+  sector_num, s->alignment)) ||
 (s->compressed &&
  !buffer_is_zero(buf, n * BDRV_SECTOR_SIZE)))
 {
@@ -2373,6 +2391,12 @@ static int img_convert(int argc, char **argv)
 out_bs->bl.pdiscard_alignment >>
 BDRV_SECTOR_BITS)));
 
+/* try to align the write requests to the destination to avoid unnecessary
+ * RMW cycles. */
+s.alignment = MAX(s.min_sparse,
+  DIV_ROUND_UP(out_bs->bl.request_alignment,
+   BDRV_SECTOR_SIZE));
+
 if (skip_create) {
 int64_t output_sectors = blk_nb_sectors(s.target);
 if (output_sectors < 0) {
-- 
2.7.4


Re: [Qemu-block] [PULL 0/3] Block patches

2018-07-03 Thread Peter Maydell
On 3 July 2018 at 04:46, Jeff Cody  wrote:
> The following changes since commit ab08440a4ee09032d1a9cb22fdcab23bc7e1c656:
>
>   Merge remote-tracking branch 'remotes/rth/tags/pull-tcg-20180702' into 
> staging (2018-07-02 17:57:46 +0100)
>
> are available in the Git repository at:
>
>   git://github.com/codyprime/qemu-kvm-jtc.git tags/block-pull-request
>
> for you to fetch changes up to 9ded4a0114968e98b41494fc035ba14f84cdf700:
>
>   backup: Use copy offloading (2018-07-02 23:23:45 -0400)
>
> 
> Block backup patches
> 
>
> Fam Zheng (3):
>   block: Fix parameter checking in bdrv_co_copy_range_internal
>   block: Honour BDRV_REQ_NO_SERIALISING in copy range
>   backup: Use copy offloading
>
>  block/backup.c| 150 ++
>  block/io.c|  35 +-
>  block/trace-events|   1 +
>  include/block/block.h |   5 +-
>  4 files changed, 132 insertions(+), 59 deletions(-)

Applied, thanks.

-- PMM



Re: [Qemu-block] [libvirt] [Qemu-devel] [PULL 25/26] block: Remove deprecated -drive option serial

2018-07-03 Thread Christian Borntraeger



On 07/03/2018 01:35 PM, Peter Maydell wrote:
> On 3 July 2018 at 12:32, Kevin Wolf  wrote:
>> Am 03.07.2018 um 13:22 hat Daniel P. Berrangé geschrieben:
>>> Just posted latest version here:
>>>
>>>   https://www.redhat.com/archives/libvir-list/2018-July/msg00130.html
>>>
>>> It will be in the next release on ~ Aug 1st
>>
>> It would have been a lot nicer to have it the July release because this
>> means that we'll have the released libvirt broken during almost the
>> whole rc phase of QEMU 3.0, but the release is planned for Aug 8th the
>> earliest, so I guess we're still okay. People using QEMU from git will
>> just need libvirt from git as well.
> 
> I'm still not clear what we gain from having a QEMU that's dropped
> a feature that is still used by everything except leading-edge
> not-yet-released versions of libvirt. Is there a strong reason we
> can't just revert the deletion of the deprecated feature for a QEMU
> release or two?

+1




Re: [Qemu-block] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing

2018-07-03 Thread Vladimir Sementsov-Ogievskiy

03.07.2018 14:15, Kevin Wolf wrote:

Am 02.07.2018 um 14:09 hat Vladimir Sementsov-Ogievskiy geschrieben:

29.06.2018 20:40, Eric Blake wrote:

On 06/29/2018 12:30 PM, John Snow wrote:


On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:

We need to synchronize backup job with reading from fleecing image
like it was done in block/replication.c.

Otherwise, the following situation is theoretically possible:

1. client start reading
2. client understand, that there is no corresponding cluster in
     fleecing image

I don't think the client refocuses the read, but QEMU does. (the client
has no idea if it is reading from the fleecing node or the backing
file.)

... but understood:


3. client is going to read from backing file (i.e. active image)
4. guest writes to active image

My question here is if QEMU will allow reads and writes to interleave in
general. If the client is reading from the backing file, the active
image, will QEMU allow a write to modify that data before we're done
getting that data?


5. this write is stopped by backup(sync=none) and cluster is copied to
     fleecing image
6. guest write continues...
7. and client reads _new_ (or partly new) date from active image


I can't answer this for myself one way or the other right now, but I
imagine you observed a failure in practice in your test setups, which
motivated this patch?

A test or some proof would help justification for this patch. It would
also help prove that it solves what it claims to!

In fact, do we really need a new filter, or do we just need to make the
"sync":"none" blockdev-backup job take the appropriate synchronization
locks?


How? We'll need additional mechanism like serializing requests.. Or a way to
reuse serializing requests. Using backup internal synchronization looks
simpler, and it is already used in block replication.

But it also just an ugly hack


agree.


  that fixes one special case and leaves
everything else broken. replication is usually not a good example for
anything. It always gives me bad surprises when I have to look at it.

We'll have to figure out where to fix this problem (or what it really
is, once you look more than just at fleecing), but I think requiring the
user to add a filter driver to work around missing serialisation in
other code, and corrupting their image if they forget to, is not a
reasonable solution.

I see at least two things wrong in this context:

* The permissions don't seem to match reality. The NBD server
   unconditionally shares PERM_WRITE, which is wrong in this case. The
   client wants to see a point-in-time snapshot that never changes. This
   should become an option so that it can be properly reflected in the
   permissions used.

* Once we have proper permissions, the fleecing setup breaks down
   because the guest needs PERM_WRITE on the backing file, but the
   fleecing overlay allows that only if the NBD client allows it (which
   it doesn't for fleecing).

   Now we can implement an exception right into backup that installs a
   backup filter driver between source and target if the source is the
   backing file of the target. The filter driver would be similar to the
   commit filter driver in that it simply promises !PERM_WRITE to its
   parents, but allows PERM_WRITE on the source because it has installed
   the before_write_notifier that guarantees this condition.

   All writes to the target that are made by the backup job in this setup
   (including before_write_notifier writes) need to be marked as
   serialising so that any concurrent reads are completed first.

And if we decide to add a target filter to backup, we should probably at
the same time use a filter driver for intercepting source writes instead
of using before_write_notifier.


and this is the point, where we can drop backup job at all from fleecing 
scheme, as actually backup(sync=none) == such special filter driver




Max, I think you intended to make both source and target children of the
same block job node (or at least for mirror). But wouldn't that create
loops in a setup like this? I think we may need two filters that are
only connected through the block job, but not with block graph edges.

Kevin


--
Best regards,
Vladimir




Re: [Qemu-block] [libvirt] [Qemu-devel] [PULL 25/26] block: Remove deprecated -drive option serial

2018-07-03 Thread Daniel P . Berrangé
On Tue, Jul 03, 2018 at 01:32:29PM +0200, Kevin Wolf wrote:
> Am 03.07.2018 um 13:22 hat Daniel P. Berrangé geschrieben:
> > On Tue, Jul 03, 2018 at 12:53:44PM +0200, Christian Borntraeger wrote:
> > > 
> > > 
> > > On 07/02/2018 10:04 AM, Kevin Wolf wrote:
> > > > Am 25.06.2018 um 13:45 hat Peter Krempa geschrieben:
> > > >> On Mon, Jun 25, 2018 at 13:41:06 +0200, Kevin Wolf wrote:
> > > >>> Am 25.06.2018 um 11:53 hat Daniel P. Berrangé geschrieben:
> > >  On Fri, Jun 22, 2018 at 03:31:46PM +0100, Daniel P. Berrangé wrote:
> > > > On Fri, Jun 22, 2018 at 04:25:13PM +0200, Kevin Wolf wrote:
> > > >> Am 22.06.2018 um 15:36 hat Christian Borntraeger geschrieben:
> > > >>
> > > >> [...]
> > > >>
> > > >>>
> > > >>> Thanks!
> > > >>>
> > > >>> I'll look into werror/rerror support for usb-storage. It shouldn't be
> > > >>> too hard, though it's strictly speaking a separate problem related to
> > > >>> using -blockdev rather than option deprecation.
> > > >>>
> > > >>> If Peter wants to wait for QEMU support before converting 
> > > >>> werror/rerror
> > > >>
> > > >> Definitely. I don't want to keep around yet another hack that will
> > > >> satisfy one specific case and then add another capability for it. We
> > > >> should then gate the moving of the feature based on the presence of
> > > >> werror for usb-storage.
> > > >>
> > > >>> to -device, maybe it would make sense to split your patch for v2 so 
> > > >>> that
> > > >>> geometry and serial can get fixed right away?
> > > >>
> > > >> Yes this can be done right away.
> > > > 
> > > > Has serial/gemoetry been fixed meanwhile and will it make it into the
> > > > next release?
> > > 
> > > I cannot find an archive that has it, but it is on the libvirt mailing
> > > list as "[libvirt] [PATCH v3] qemu: format serial and geometry on 
> > > frontend disk device".
> > > Review seems done, but it has missed libvirt 4.5 which was released today.
> > 
> > Just posted latest version here:
> > 
> >   https://www.redhat.com/archives/libvir-list/2018-July/msg00130.html
> > 
> > It will be in the next release on ~ Aug 1st
> 
> It would have been a lot nicer to have it the July release because this
> means that we'll have the released libvirt broken during almost the
> whole rc phase of QEMU 3.0, but the release is planned for Aug 8th the
> earliest, so I guess we're still okay. People using QEMU from git will
> just need libvirt from git as well.

Yeah, unfortunately i just missed the window of possibility to get it
into yesterday's release.

Fedora at least will be ok, and when other distros do pick up new QEMU
they'll likely pick up the corresponding libvirt release at same time
anyway.

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|



Re: [Qemu-block] [libvirt] [Qemu-devel] [PULL 25/26] block: Remove deprecated -drive option serial

2018-07-03 Thread Peter Maydell
On 3 July 2018 at 12:32, Kevin Wolf  wrote:
> Am 03.07.2018 um 13:22 hat Daniel P. Berrangé geschrieben:
>> Just posted latest version here:
>>
>>   https://www.redhat.com/archives/libvir-list/2018-July/msg00130.html
>>
>> It will be in the next release on ~ Aug 1st
>
> It would have been a lot nicer to have it the July release because this
> means that we'll have the released libvirt broken during almost the
> whole rc phase of QEMU 3.0, but the release is planned for Aug 8th the
> earliest, so I guess we're still okay. People using QEMU from git will
> just need libvirt from git as well.

I'm still not clear what we gain from having a QEMU that's dropped
a feature that is still used by everything except leading-edge
not-yet-released versions of libvirt. Is there a strong reason we
can't just revert the deletion of the deprecated feature for a QEMU
release or two?

thanks
-- PMM



Re: [Qemu-block] [libvirt] [Qemu-devel] [PULL 25/26] block: Remove deprecated -drive option serial

2018-07-03 Thread Kevin Wolf
Am 03.07.2018 um 13:22 hat Daniel P. Berrangé geschrieben:
> On Tue, Jul 03, 2018 at 12:53:44PM +0200, Christian Borntraeger wrote:
> > 
> > 
> > On 07/02/2018 10:04 AM, Kevin Wolf wrote:
> > > Am 25.06.2018 um 13:45 hat Peter Krempa geschrieben:
> > >> On Mon, Jun 25, 2018 at 13:41:06 +0200, Kevin Wolf wrote:
> > >>> Am 25.06.2018 um 11:53 hat Daniel P. Berrangé geschrieben:
> >  On Fri, Jun 22, 2018 at 03:31:46PM +0100, Daniel P. Berrangé wrote:
> > > On Fri, Jun 22, 2018 at 04:25:13PM +0200, Kevin Wolf wrote:
> > >> Am 22.06.2018 um 15:36 hat Christian Borntraeger geschrieben:
> > >>
> > >> [...]
> > >>
> > >>>
> > >>> Thanks!
> > >>>
> > >>> I'll look into werror/rerror support for usb-storage. It shouldn't be
> > >>> too hard, though it's strictly speaking a separate problem related to
> > >>> using -blockdev rather than option deprecation.
> > >>>
> > >>> If Peter wants to wait for QEMU support before converting werror/rerror
> > >>
> > >> Definitely. I don't want to keep around yet another hack that will
> > >> satisfy one specific case and then add another capability for it. We
> > >> should then gate the moving of the feature based on the presence of
> > >> werror for usb-storage.
> > >>
> > >>> to -device, maybe it would make sense to split your patch for v2 so that
> > >>> geometry and serial can get fixed right away?
> > >>
> > >> Yes this can be done right away.
> > > 
> > > Has serial/gemoetry been fixed meanwhile and will it make it into the
> > > next release?
> > 
> > I cannot find an archive that has it, but it is on the libvirt mailing
> > list as "[libvirt] [PATCH v3] qemu: format serial and geometry on frontend 
> > disk device".
> > Review seems done, but it has missed libvirt 4.5 which was released today.
> 
> Just posted latest version here:
> 
>   https://www.redhat.com/archives/libvir-list/2018-July/msg00130.html
> 
> It will be in the next release on ~ Aug 1st

It would have been a lot nicer to have it the July release because this
means that we'll have the released libvirt broken during almost the
whole rc phase of QEMU 3.0, but the release is planned for Aug 8th the
earliest, so I guess we're still okay. People using QEMU from git will
just need libvirt from git as well.

Kevin



Re: [Qemu-block] [libvirt] [Qemu-devel] [PULL 25/26] block: Remove deprecated -drive option serial

2018-07-03 Thread Daniel P . Berrangé
On Tue, Jul 03, 2018 at 12:53:44PM +0200, Christian Borntraeger wrote:
> 
> 
> On 07/02/2018 10:04 AM, Kevin Wolf wrote:
> > Am 25.06.2018 um 13:45 hat Peter Krempa geschrieben:
> >> On Mon, Jun 25, 2018 at 13:41:06 +0200, Kevin Wolf wrote:
> >>> Am 25.06.2018 um 11:53 hat Daniel P. Berrangé geschrieben:
>  On Fri, Jun 22, 2018 at 03:31:46PM +0100, Daniel P. Berrangé wrote:
> > On Fri, Jun 22, 2018 at 04:25:13PM +0200, Kevin Wolf wrote:
> >> Am 22.06.2018 um 15:36 hat Christian Borntraeger geschrieben:
> >>
> >> [...]
> >>
> >>>
> >>> Thanks!
> >>>
> >>> I'll look into werror/rerror support for usb-storage. It shouldn't be
> >>> too hard, though it's strictly speaking a separate problem related to
> >>> using -blockdev rather than option deprecation.
> >>>
> >>> If Peter wants to wait for QEMU support before converting werror/rerror
> >>
> >> Definitely. I don't want to keep around yet another hack that will
> >> satisfy one specific case and then add another capability for it. We
> >> should then gate the moving of the feature based on the presence of
> >> werror for usb-storage.
> >>
> >>> to -device, maybe it would make sense to split your patch for v2 so that
> >>> geometry and serial can get fixed right away?
> >>
> >> Yes this can be done right away.
> > 
> > Has serial/gemoetry been fixed meanwhile and will it make it into the
> > next release?
> 
> I cannot find an archive that has it, but it is on the libvirt mailing
> list as "[libvirt] [PATCH v3] qemu: format serial and geometry on frontend 
> disk device".
> Review seems done, but it has missed libvirt 4.5 which was released today.

Just posted latest version here:

  https://www.redhat.com/archives/libvir-list/2018-July/msg00130.html

It will be in the next release on ~ Aug 1st

Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|



Re: [Qemu-block] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing

2018-07-03 Thread Kevin Wolf
Am 02.07.2018 um 13:57 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 29.06.2018 20:30, John Snow wrote:
> > 
> > On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
> > > We need to synchronize backup job with reading from fleecing image
> > > like it was done in block/replication.c.
> > > 
> > > Otherwise, the following situation is theoretically possible:
> > > 
> > > 1. client start reading
> > > 2. client understand, that there is no corresponding cluster in
> > > fleecing image
> > I don't think the client refocuses the read, but QEMU does. (the client
> > has no idea if it is reading from the fleecing node or the backing file.)
> > 
> > ... but understood:
> > 
> > > 3. client is going to read from backing file (i.e. active image)
> > > 4. guest writes to active image
> > My question here is if QEMU will allow reads and writes to interleave in
> > general. If the client is reading from the backing file, the active
> > image, will QEMU allow a write to modify that data before we're done
> > getting that data?
> 
> If I understand correctly: yes. Physical drives allows intersecting
> operations too and there no guarantee on result. We have serializing
> requests, but in general only unaligned requests are marked serializing.

Copy on read (in the context of image streaming) is the feature which
actually introduced it. It has some similarities with the backup job, so
the idea of using it for backup, too, isn't exactly revolutionary.

Kevin



Re: [Qemu-block] [PATCH v6] crypto: Implement TLS Pre-Shared Keys (PSK).

2018-07-03 Thread Daniel P . Berrangé
On Tue, Jul 03, 2018 at 09:03:03AM +0100, Richard W.M. Jones wrote:
> Pre-Shared Keys (PSK) is a simpler mechanism for enabling TLS
> connections than using certificates.  It requires only a simple secret
> key:
> 
>   $ mkdir -m 0700 /tmp/keys
>   $ psktool -u rjones -p /tmp/keys/keys.psk
>   $ cat /tmp/keys/keys.psk
>   rjones:d543770c15ad93d76443fb56f501a31969235f47e999720ae8d2336f6a13fcbc
> 
> The key can be secretly shared between clients and servers.  Clients
> must specify the directory containing the "keys.psk" file and a
> username (defaults to "qemu").  Servers must specify only the
> directory.
> 
> Example NBD client:
> 
>   $ qemu-img info \
> --object 
> tls-creds-psk,id=tls0,dir=/tmp/keys,username=rjones,endpoint=client \
> --image-opts \
> 
> file.driver=nbd,file.host=localhost,file.port=10809,file.tls-creds=tls0,file.export=/
> 
> Example NBD server using qemu-nbd:
> 
>   $ qemu-nbd -t -x / \
> --object tls-creds-psk,id=tls0,endpoint=server,dir=/tmp/keys \
> --tls-creds tls0 \
> image.qcow2
> 
> Example NBD server using nbdkit:
> 
>   $ nbdkit -n -e / -fv \
> --tls=on --tls-psk=/tmp/keys/keys.psk \
> file file=disk.img
> 
> Signed-off-by: Richard W.M. Jones 
> ---
>  crypto/Makefile.objs   |   1 +
>  crypto/tlscredspsk.c   | 308 +
>  crypto/tlssession.c|  56 +-
>  crypto/trace-events|   3 +
>  include/crypto/tlscredspsk.h   | 106 
>  qemu-doc.texi  |  37 
>  qemu-options.hx|  24 +++
>  tests/Makefile.include |   4 +-
>  tests/crypto-tls-psk-helpers.c |  50 ++
>  tests/crypto-tls-psk-helpers.h |  29 
>  tests/test-crypto-tlssession.c | 185 +---
>  11 files changed, 777 insertions(+), 26 deletions(-)

Signed-off-by: Daniel P. Berrangé 


I'll send a pull request with it shortly


Regards,
Daniel
-- 
|: https://berrange.com  -o-https://www.flickr.com/photos/dberrange :|
|: https://libvirt.org -o-https://fstop138.berrange.com :|
|: https://entangle-photo.org-o-https://www.instagram.com/dberrange :|



Re: [Qemu-block] [PATCH v2 2/3] block/fleecing-filter: new filter driver for fleecing

2018-07-03 Thread Kevin Wolf
Am 02.07.2018 um 14:09 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 29.06.2018 20:40, Eric Blake wrote:
> > On 06/29/2018 12:30 PM, John Snow wrote:
> > > 
> > > 
> > > On 06/29/2018 11:15 AM, Vladimir Sementsov-Ogievskiy wrote:
> > > > We need to synchronize backup job with reading from fleecing image
> > > > like it was done in block/replication.c.
> > > > 
> > > > Otherwise, the following situation is theoretically possible:
> > > > 
> > > > 1. client start reading
> > > > 2. client understand, that there is no corresponding cluster in
> > > >     fleecing image
> > > 
> > > I don't think the client refocuses the read, but QEMU does. (the client
> > > has no idea if it is reading from the fleecing node or the backing
> > > file.)
> > > 
> > > ... but understood:
> > > 
> > > > 3. client is going to read from backing file (i.e. active image)
> > > > 4. guest writes to active image
> > > 
> > > My question here is if QEMU will allow reads and writes to interleave in
> > > general. If the client is reading from the backing file, the active
> > > image, will QEMU allow a write to modify that data before we're done
> > > getting that data?
> > > 
> > > > 5. this write is stopped by backup(sync=none) and cluster is copied to
> > > >     fleecing image
> > > > 6. guest write continues...
> > > > 7. and client reads _new_ (or partly new) date from active image
> > > > 
> > > 
> > > I can't answer this for myself one way or the other right now, but I
> > > imagine you observed a failure in practice in your test setups, which
> > > motivated this patch?
> > > 
> > > A test or some proof would help justification for this patch. It would
> > > also help prove that it solves what it claims to!
> > 
> > In fact, do we really need a new filter, or do we just need to make the
> > "sync":"none" blockdev-backup job take the appropriate synchronization
> > locks?
> > 
> 
> How? We'll need additional mechanism like serializing requests.. Or a way to
> reuse serializing requests. Using backup internal synchronization looks
> simpler, and it is already used in block replication.

But it also just an ugly hack that fixes one special case and leaves
everything else broken. replication is usually not a good example for
anything. It always gives me bad surprises when I have to look at it.

We'll have to figure out where to fix this problem (or what it really
is, once you look more than just at fleecing), but I think requiring the
user to add a filter driver to work around missing serialisation in
other code, and corrupting their image if they forget to, is not a
reasonable solution.

I see at least two things wrong in this context:

* The permissions don't seem to match reality. The NBD server
  unconditionally shares PERM_WRITE, which is wrong in this case. The
  client wants to see a point-in-time snapshot that never changes. This
  should become an option so that it can be properly reflected in the
  permissions used.

* Once we have proper permissions, the fleecing setup breaks down
  because the guest needs PERM_WRITE on the backing file, but the
  fleecing overlay allows that only if the NBD client allows it (which
  it doesn't for fleecing).

  Now we can implement an exception right into backup that installs a
  backup filter driver between source and target if the source is the
  backing file of the target. The filter driver would be similar to the
  commit filter driver in that it simply promises !PERM_WRITE to its
  parents, but allows PERM_WRITE on the source because it has installed
  the before_write_notifier that guarantees this condition.

  All writes to the target that are made by the backup job in this setup
  (including before_write_notifier writes) need to be marked as
  serialising so that any concurrent reads are completed first.

And if we decide to add a target filter to backup, we should probably at
the same time use a filter driver for intercepting source writes instead
of using before_write_notifier.

Max, I think you intended to make both source and target children of the
same block job node (or at least for mirror). But wouldn't that create
loops in a setup like this? I think we may need two filters that are
only connected through the block job, but not with block graph edges.

Kevin



Re: [Qemu-block] [libvirt] [Qemu-devel] [PULL 25/26] block: Remove deprecated -drive option serial

2018-07-03 Thread Christian Borntraeger



On 07/02/2018 10:04 AM, Kevin Wolf wrote:
> Am 25.06.2018 um 13:45 hat Peter Krempa geschrieben:
>> On Mon, Jun 25, 2018 at 13:41:06 +0200, Kevin Wolf wrote:
>>> Am 25.06.2018 um 11:53 hat Daniel P. Berrangé geschrieben:
 On Fri, Jun 22, 2018 at 03:31:46PM +0100, Daniel P. Berrangé wrote:
> On Fri, Jun 22, 2018 at 04:25:13PM +0200, Kevin Wolf wrote:
>> Am 22.06.2018 um 15:36 hat Christian Borntraeger geschrieben:
>>
>> [...]
>>
>>>
>>> Thanks!
>>>
>>> I'll look into werror/rerror support for usb-storage. It shouldn't be
>>> too hard, though it's strictly speaking a separate problem related to
>>> using -blockdev rather than option deprecation.
>>>
>>> If Peter wants to wait for QEMU support before converting werror/rerror
>>
>> Definitely. I don't want to keep around yet another hack that will
>> satisfy one specific case and then add another capability for it. We
>> should then gate the moving of the feature based on the presence of
>> werror for usb-storage.
>>
>>> to -device, maybe it would make sense to split your patch for v2 so that
>>> geometry and serial can get fixed right away?
>>
>> Yes this can be done right away.
> 
> Has serial/gemoetry been fixed meanwhile and will it make it into the
> next release?

I cannot find an archive that has it, but it is on the libvirt mailing
list as "[libvirt] [PATCH v3] qemu: format serial and geometry on frontend disk 
device".
Review seems done, but it has missed libvirt 4.5 which was released today. 

Christian




[Qemu-block] [PATCH 1/2] drity-bitmap: refactor merge: separte can_merge

2018-07-03 Thread Vladimir Sementsov-Ogievskiy
Separate can_merge, to reuse it for transaction support for merge
command.

Also, switch some asserts to errors.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 include/block/dirty-bitmap.h |  3 +++
 include/qemu/hbitmap.h   |  8 
 block/dirty-bitmap.c | 32 +++-
 blockdev.c   | 10 --
 util/hbitmap.c   |  6 +-
 5 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/include/block/dirty-bitmap.h b/include/block/dirty-bitmap.h
index 288dc6adb6..412a333c02 100644
--- a/include/block/dirty-bitmap.h
+++ b/include/block/dirty-bitmap.h
@@ -71,6 +71,9 @@ void bdrv_dirty_bitmap_set_persistance(BdrvDirtyBitmap 
*bitmap,
 void bdrv_dirty_bitmap_set_qmp_locked(BdrvDirtyBitmap *bitmap, bool 
qmp_locked);
 void bdrv_merge_dirty_bitmap(BdrvDirtyBitmap *dest, const BdrvDirtyBitmap *src,
  Error **errp);
+bool bdrv_can_merge_dirty_bitmap(BdrvDirtyBitmap *dst,
+ const BdrvDirtyBitmap *src,
+ Error **errp);
 
 /* Functions that require manual locking.  */
 void bdrv_dirty_bitmap_lock(BdrvDirtyBitmap *bitmap);
diff --git a/include/qemu/hbitmap.h b/include/qemu/hbitmap.h
index ddca52c48e..d08bc20ea3 100644
--- a/include/qemu/hbitmap.h
+++ b/include/qemu/hbitmap.h
@@ -85,6 +85,14 @@ void hbitmap_truncate(HBitmap *hb, uint64_t size);
 bool hbitmap_merge(HBitmap *a, const HBitmap *b);
 
 /**
+ * hbitmap_can_merge:
+ *
+ * Returns same value as hbitmap_merge, but do not do actual merge.
+ *
+ */
+bool hbitmap_can_merge(HBitmap *a, const HBitmap *b);
+
+/**
  * hbitmap_empty:
  * @hb: HBitmap to operate on.
  *
diff --git a/block/dirty-bitmap.c b/block/dirty-bitmap.c
index db1782ec1f..1137224aaa 100644
--- a/block/dirty-bitmap.c
+++ b/block/dirty-bitmap.c
@@ -784,6 +784,30 @@ int64_t bdrv_dirty_bitmap_next_zero(BdrvDirtyBitmap 
*bitmap, uint64_t offset)
 return hbitmap_next_zero(bitmap->bitmap, offset);
 }
 
+bool bdrv_can_merge_dirty_bitmap(BdrvDirtyBitmap *dst,
+ const BdrvDirtyBitmap *src,
+ Error **errp)
+{
+if (bdrv_dirty_bitmap_frozen(dst)) {
+error_setg(errp, "Bitmap '%s' is frozen and cannot be modified",
+   dst->name);
+return false;
+}
+
+if (bdrv_dirty_bitmap_readonly(dst)) {
+error_setg(errp, "Bitmap '%s' is readonly and cannot be modified",
+   dst->name);
+return false;
+}
+
+if (!hbitmap_can_merge(dst->bitmap, src->bitmap)) {
+error_setg(errp, "Bitmaps are incompatible and can't be merged");
+return false;
+}
+
+return true;
+}
+
 void bdrv_merge_dirty_bitmap(BdrvDirtyBitmap *dest, const BdrvDirtyBitmap *src,
  Error **errp)
 {
@@ -792,11 +816,9 @@ void bdrv_merge_dirty_bitmap(BdrvDirtyBitmap *dest, const 
BdrvDirtyBitmap *src,
 
 qemu_mutex_lock(dest->mutex);
 
-assert(bdrv_dirty_bitmap_enabled(dest));
-assert(!bdrv_dirty_bitmap_readonly(dest));
-
-if (!hbitmap_merge(dest->bitmap, src->bitmap)) {
-error_setg(errp, "Bitmaps are incompatible and can't be merged");
+if (bdrv_can_merge_dirty_bitmap(dest, src, errp)) {
+bool ret = hbitmap_merge(dest->bitmap, src->bitmap);
+assert(ret);
 }
 
 qemu_mutex_unlock(dest->mutex);
diff --git a/blockdev.c b/blockdev.c
index 58d7570932..63c4d33124 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -2955,16 +2955,6 @@ void qmp_x_block_dirty_bitmap_merge(const char *node, 
const char *dst_name,
 return;
 }
 
-if (bdrv_dirty_bitmap_frozen(dst)) {
-error_setg(errp, "Bitmap '%s' is frozen and cannot be modified",
-   dst_name);
-return;
-} else if (bdrv_dirty_bitmap_readonly(dst)) {
-error_setg(errp, "Bitmap '%s' is readonly and cannot be modified",
-   dst_name);
-return;
-}
-
 src = bdrv_find_dirty_bitmap(bs, src_name);
 if (!src) {
 error_setg(errp, "Dirty bitmap '%s' not found", src_name);
diff --git a/util/hbitmap.c b/util/hbitmap.c
index bcd304041a..b56377b043 100644
--- a/util/hbitmap.c
+++ b/util/hbitmap.c
@@ -723,6 +723,10 @@ void hbitmap_truncate(HBitmap *hb, uint64_t size)
 }
 }
 
+bool hbitmap_can_merge(HBitmap *a, const HBitmap *b)
+{
+return (a->size == b->size) && (a->granularity == b->granularity);
+}
 
 /**
  * Given HBitmaps A and B, let A := A (BITOR) B.
@@ -736,7 +740,7 @@ bool hbitmap_merge(HBitmap *a, const HBitmap *b)
 int i;
 uint64_t j;
 
-if ((a->size != b->size) || (a->granularity != b->granularity)) {
+if (!hbitmap_can_merge(a, b)) {
 return false;
 }
 
-- 
2.11.1




[Qemu-block] [PATCH 0/2] transaction support for bitmap merge

2018-07-03 Thread Vladimir Sementsov-Ogievskiy
This is a last brick, necessary to play with nbd bitmap export in
conjunction with image fleecing.

Vladimir Sementsov-Ogievskiy (2):
  drity-bitmap: refactor merge: separte can_merge
  qapi: add transaction support for x-block-dirty-bitmap-merge

 qapi/transaction.json|  2 ++
 include/block/dirty-bitmap.h |  3 +++
 include/qemu/hbitmap.h   |  8 ++
 block/dirty-bitmap.c | 32 ++
 blockdev.c   | 63 
 util/hbitmap.c   |  6 -
 6 files changed, 97 insertions(+), 17 deletions(-)

-- 
2.11.1




[Qemu-block] [PATCH 2/2] qapi: add transaction support for x-block-dirty-bitmap-merge

2018-07-03 Thread Vladimir Sementsov-Ogievskiy
Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 qapi/transaction.json |  2 ++
 blockdev.c| 53 ++-
 2 files changed, 54 insertions(+), 1 deletion(-)

diff --git a/qapi/transaction.json b/qapi/transaction.json
index d7e4274550..f9da6ad47f 100644
--- a/qapi/transaction.json
+++ b/qapi/transaction.json
@@ -48,6 +48,7 @@
 # - @block-dirty-bitmap-clear: since 2.5
 # - @x-block-dirty-bitmap-enable: since 3.0
 # - @x-block-dirty-bitmap-disable: since 3.0
+# - @x-block-dirty-bitmap-merge: since 3.0
 # - @blockdev-backup: since 2.3
 # - @blockdev-snapshot: since 2.5
 # - @blockdev-snapshot-internal-sync: since 1.7
@@ -63,6 +64,7 @@
'block-dirty-bitmap-clear': 'BlockDirtyBitmap',
'x-block-dirty-bitmap-enable': 'BlockDirtyBitmap',
'x-block-dirty-bitmap-disable': 'BlockDirtyBitmap',
+   'x-block-dirty-bitmap-merge': 'BlockDirtyBitmapMerge',
'blockdev-backup': 'BlockdevBackup',
'blockdev-snapshot': 'BlockdevSnapshot',
'blockdev-snapshot-internal-sync': 'BlockdevSnapshotInternal',
diff --git a/blockdev.c b/blockdev.c
index 63c4d33124..d0f2d1a4e9 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1946,6 +1946,13 @@ typedef struct BlockDirtyBitmapState {
 bool was_enabled;
 } BlockDirtyBitmapState;
 
+typedef struct BlockDirtyBitmapMergeState {
+BlkActionState common;
+BlockDriverState *bs;
+BdrvDirtyBitmap *dst;
+BdrvDirtyBitmap *src;
+} BlockDirtyBitmapMergeState;
+
 static void block_dirty_bitmap_add_prepare(BlkActionState *common,
Error **errp)
 {
@@ -2112,6 +2119,45 @@ static void 
block_dirty_bitmap_disable_abort(BlkActionState *common)
 }
 }
 
+static void block_dirty_bitmap_merge_prepare(BlkActionState *common,
+ Error **errp)
+{
+BlockDirtyBitmapMerge *action;
+BlockDirtyBitmapMergeState *state = DO_UPCAST(BlockDirtyBitmapMergeState,
+  common, common);
+
+if (action_check_completion_mode(common, errp) < 0) {
+return;
+}
+
+action = common->action->u.x_block_dirty_bitmap_merge.data;
+state->dst = block_dirty_bitmap_lookup(action->node,
+   action->dst_name,
+   &state->bs,
+   errp);
+if (!state->dst) {
+return;
+}
+
+state->src = bdrv_find_dirty_bitmap(state->bs, action->src_name);
+if (!state->src) {
+return;
+}
+
+if (!bdrv_can_merge_dirty_bitmap(state->dst, state->src, errp)) {
+return;
+}
+}
+
+static void block_dirty_bitmap_merge_commit(BlkActionState *common)
+{
+BlockDirtyBitmapMergeState *state = DO_UPCAST(BlockDirtyBitmapMergeState,
+  common, common);
+
+/* success is guaranteed by bdrv_can_merge_dirty_bitmap() */
+bdrv_merge_dirty_bitmap(state->dst, state->src, &error_abort);
+}
+
 static void abort_prepare(BlkActionState *common, Error **errp)
 {
 error_setg(errp, "Transaction aborted using Abort action");
@@ -2182,7 +2228,12 @@ static const BlkActionOps actions[] = {
 .instance_size = sizeof(BlockDirtyBitmapState),
 .prepare = block_dirty_bitmap_disable_prepare,
 .abort = block_dirty_bitmap_disable_abort,
- }
+},
+[TRANSACTION_ACTION_KIND_X_BLOCK_DIRTY_BITMAP_MERGE] = {
+.instance_size = sizeof(BlockDirtyBitmapMergeState),
+.prepare = block_dirty_bitmap_merge_prepare,
+.commit = block_dirty_bitmap_merge_commit,
+}
 };
 
 /**
-- 
2.11.1




Re: [Qemu-block] [Qemu-devel] [PATCH v2 2/2] iotests: New test 223 for exporting dirty bitmap over NBD

2018-07-03 Thread Vladimir Sementsov-Ogievskiy

03.07.2018 00:27, John Snow wrote:


On 07/02/2018 03:14 PM, Eric Blake wrote:

Although this test is NOT a full test of image fleecing (as it
intentionally uses just a single block device directly exported
over NBD, rather than trying to set up a blockdev-backup job with
multiple BDS involved), it DOES prove that qemu as a server is
able to properly expose a dirty bitmap over NBD.

When coupled with image fleecing, it is then possible for a
third-party client to do an incremental backup by using
qemu-img map with the x-dirty-bitmap option to learn which parts
of the file are dirty (perhaps confusingly, they are the portions
mapped as "data":false - which is part of the reason this is
still in the x- experimental namespace), along with another
normal client (perhaps 'qemu-nbd -c' to expose the server over
/dev/nbd0 and then just use normal I/O on that block device) to
read the dirty sections.

Signed-off-by: Eric Blake 
---
  tests/qemu-iotests/223 | 138 +
  tests/qemu-iotests/223.out |  49 
  tests/qemu-iotests/group   |   1 +
  3 files changed, 188 insertions(+)
  create mode 100755 tests/qemu-iotests/223
  create mode 100644 tests/qemu-iotests/223.out

diff --git a/tests/qemu-iotests/223 b/tests/qemu-iotests/223
new file mode 100755
index 000..b63b7a4f9e1
--- /dev/null
+++ b/tests/qemu-iotests/223
@@ -0,0 +1,138 @@
+#!/bin/bash
+#
+# Test reading dirty bitmap over NBD
+#
+# Copyright (C) 2018 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see .
+#
+
+seq="$(basename $0)"
+echo "QA output created by $seq"
+
+here="$PWD"
+status=1 # failure is the default!
+
+_cleanup()
+{
+_cleanup_test_img
+_cleanup_qemu
+rm -f "$TEST_DIR/nbd"
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+. ./common.qemu
+
+_supported_fmt qcow2
+_supported_proto file # uses NBD as well
+_supported_os Linux
+
+function do_run_qemu()
+{
+echo Testing: "$@"
+$QEMU -nographic -qmp stdio -serial none "$@"
+echo
+}
+
+function run_qemu()
+{
+do_run_qemu "$@" 2>&1 | _filter_testdir | _filter_qmp \
+  | _filter_qemu | _filter_imgfmt \
+  | _filter_actual_image_size
+}
+
+echo
+echo "=== Create partially sparse image, then add dirty bitmap ==="
+echo
+
+_make_test_img 4M
+$QEMU_IO -c 'w -P 0x11 1M 2M' "$TEST_IMG" | _filter_qemu_io

- Write 0x11 from [1M, 3M), not recorded by a bitmap.


+run_qemu <
Saving a few precious bytes.


+"persistent": true
+  }
+}
+{ "execute": "quit" }
+EOF
+
+echo
+echo "=== Write part of the file under active bitmap ==="
+echo
+
+$QEMU_IO -c 'w -P 0x22 2M 2M' "$TEST_IMG" | _filter_qemu_io
+

- Write 0x22 to [2M, 4M) recorded by the bitmap.


+echo
+echo "=== End dirty bitmap, and start serving image over NBD ==="
+echo
+
+_launch_qemu 2> >(_filter_nbd)
+
+silent=
+_send_qemu_cmd $QEMU_HANDLE '{"execute":"qmp_capabilities"}' "return"
+_send_qemu_cmd $QEMU_HANDLE '{"execute":"blockdev-add",
+  "arguments":{"driver":"qcow2", "node-name":"n",
+"file":{"driver":"file", "filename":"'"$TEST_IMG"'"}}}' "return"
+_send_qemu_cmd $QEMU_HANDLE '{"execute":"x-block-dirty-bitmap-disable",
+  "arguments":{"node":"n", "name":"b"}}' "return"
+_send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-start",
+  "arguments":{"addr":{"type":"unix",
+"data":{"path":"'"$TEST_DIR/nbd"'"' "return"
+_send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-add",
+  "arguments":{"device":"n"}}' "return"
+_send_qemu_cmd $QEMU_HANDLE '{"execute":"x-nbd-server-add-bitmap",
+  "arguments":{"name":"n", "bitmap":"b"}}' "return"
+

So far, so good.


+echo
+echo "=== Contrast normal status with dirty-bitmap status ==="
+echo
+
+QEMU_IO_OPTIONS=$QEMU_IO_OPTIONS_NO_FMT
+IMG="driver=nbd,export=n,server.type=unix,server.path=$TEST_DIR/nbd"
+$QEMU_IO -r -c 'r -P 0 0 1m' -c 'r -P 0x11 1m 1m' \
+  -c 'r -P 0x22 2m 2m' --image-opts "$IMG" | _filter_qemu_io

Confirming that we've got 0x11 from [1M, 2M) and 0x22 from [2M, 4M).


+$QEMU_IMG map --output=json --image-opts \
+  "$IMG" | _filter_qemu_img_map

Normal allocation map. Ought to show [1M, 4M).


+$QEMU_IMG map --output=json --image-opts \
+  "$IMG,x-dirty-bitmap=qemu:dirty-bitmap:b" | _filter_qemu_img_map

Hacked bitmap allocation map. Ought to show [2M, 4M).


+
+echo

Re: [Qemu-block] [PATCH v2 2/2] iotests: New test 223 for exporting dirty bitmap over NBD

2018-07-03 Thread Vladimir Sementsov-Ogievskiy

02.07.2018 22:14, Eric Blake wrote:

Although this test is NOT a full test of image fleecing (as it
intentionally uses just a single block device directly exported
over NBD, rather than trying to set up a blockdev-backup job with
multiple BDS involved), it DOES prove that qemu as a server is
able to properly expose a dirty bitmap over NBD.

When coupled with image fleecing, it is then possible for a
third-party client to do an incremental backup by using
qemu-img map with the x-dirty-bitmap option to learn which parts
of the file are dirty (perhaps confusingly, they are the portions
mapped as "data":false - which is part of the reason this is
still in the x- experimental namespace), along with another
normal client (perhaps 'qemu-nbd -c' to expose the server over
/dev/nbd0 and then just use normal I/O on that block device) to
read the dirty sections.

Signed-off-by: Eric Blake 
---
  tests/qemu-iotests/223 | 138 +
  tests/qemu-iotests/223.out |  49 
  tests/qemu-iotests/group   |   1 +
  3 files changed, 188 insertions(+)
  create mode 100755 tests/qemu-iotests/223
  create mode 100644 tests/qemu-iotests/223.out

diff --git a/tests/qemu-iotests/223 b/tests/qemu-iotests/223
new file mode 100755
index 000..b63b7a4f9e1
--- /dev/null
+++ b/tests/qemu-iotests/223
@@ -0,0 +1,138 @@
+#!/bin/bash


[...]


+
+echo
+echo "=== End NBD server ==="
+echo
+
+_send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-remove",
+  "arguments":{"name":"n"}}' "return"
+_send_qemu_cmd $QEMU_HANDLE '{"execute":"nbd-server-stop"}' "return"


blockdev-del is not necessary?

with or without:

Reviewed-by: Vladimir Sementsov-Ogievskiy 


--
Best regards,
Vladimir




Re: [Qemu-block] [PATCH] pr-helper: Rework socket path handling

2018-07-03 Thread Paolo Bonzini
On 03/07/2018 11:51, Michal Privoznik wrote:
> When reviewing Paolo's pr-helper patches I've noticed couple of
> problems:
> 
> 1) socket_path needs to be calculated at two different places
> (one for printing out help, the other if socket activation is NOT
> used),
> 
> 2) even though the default socket_path is allocated in
> compute_default_paths() it is the only default path the function
> handles. For instance, pidfile is allocated outside of this
> function. And yet again, at different places than 1)
> 
> Signed-off-by: Michal Privoznik 

Queued, thanks.

Paolo

> ---
>  scsi/qemu-pr-helper.c | 36 ++--
>  1 file changed, 10 insertions(+), 26 deletions(-)
> 
> diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c
> index 0218d65bbf..59df3fd608 100644
> --- a/scsi/qemu-pr-helper.c
> +++ b/scsi/qemu-pr-helper.c
> @@ -76,14 +76,12 @@ static int gid = -1;
>  
>  static void compute_default_paths(void)
>  {
> -if (!socket_path) {
> -socket_path = 
> qemu_get_local_state_pathname("run/qemu-pr-helper.sock");
> -}
> +socket_path = qemu_get_local_state_pathname("run/qemu-pr-helper.sock");
> +pidfile = qemu_get_local_state_pathname("run/qemu-pr-helper.pid");
>  }
>  
>  static void usage(const char *name)
>  {
> -compute_default_paths();
>  (printf) (
>  "Usage: %s [OPTIONS] FILE\n"
>  "Persistent Reservation helper program for QEMU\n"
> @@ -844,19 +842,6 @@ static gboolean accept_client(QIOChannel *ioc, 
> GIOCondition cond, gpointer opaqu
>  return TRUE;
>  }
>  
> -
> -/*
> - * Check socket parameters compatibility when socket activation is used.
> - */
> -static const char *socket_activation_validate_opts(void)
> -{
> -if (socket_path != NULL) {
> -return "Unix socket can't be set when using socket activation";
> -}
> -
> -return NULL;
> -}
> -
>  static void termsig_handler(int signum)
>  {
>  atomic_cmpxchg(&state, RUNNING, TERMINATE);
> @@ -930,6 +915,7 @@ int main(int argc, char **argv)
>  char *trace_file = NULL;
>  bool daemonize = false;
>  bool pidfile_specified = false;
> +bool socket_path_specified = false;
>  unsigned socket_activation;
>  
>  struct sigaction sa_sigterm;
> @@ -946,12 +932,14 @@ int main(int argc, char **argv)
>  qemu_add_opts(&qemu_trace_opts);
>  qemu_init_exec_dir(argv[0]);
>  
> -pidfile = qemu_get_local_state_pathname("run/qemu-pr-helper.pid");
> +compute_default_paths();
>  
>  while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) {
>  switch (ch) {
>  case 'k':
> -socket_path = optarg;
> +g_free(socket_path);
> +socket_path = g_strdup(optarg);
> +socket_path_specified = true;
>  if (socket_path[0] != '/') {
>  error_report("socket path must be absolute");
>  exit(EXIT_FAILURE);
> @@ -1042,10 +1030,9 @@ int main(int argc, char **argv)
>  socket_activation = check_socket_activation();
>  if (socket_activation == 0) {
>  SocketAddress saddr;
> -compute_default_paths();
>  saddr = (SocketAddress){
>  .type = SOCKET_ADDRESS_TYPE_UNIX,
> -.u.q_unix.path = g_strdup(socket_path)
> +.u.q_unix.path = socket_path,
>  };
>  server_ioc = qio_channel_socket_new();
>  if (qio_channel_socket_listen_sync(server_ioc, &saddr, &local_err) < 
> 0) {
> @@ -1053,12 +1040,10 @@ int main(int argc, char **argv)
>  error_report_err(local_err);
>  return 1;
>  }
> -g_free(saddr.u.q_unix.path);
>  } else {
>  /* Using socket activation - check user didn't use -p etc. */
> -const char *err_msg = socket_activation_validate_opts();
> -if (err_msg != NULL) {
> -error_report("%s", err_msg);
> +if (socket_path_specified) {
> +error_report("Unix socket can't be set when using socket 
> activation");
>  exit(EXIT_FAILURE);
>  }
>  
> @@ -1075,7 +1060,6 @@ int main(int argc, char **argv)
>   error_get_pretty(local_err));
>  exit(EXIT_FAILURE);
>  }
> -socket_path = NULL;
>  }
>  
>  if (qemu_init_main_loop(&local_err)) {
> 



[Qemu-block] [PATCH] pr-helper: Rework socket path handling

2018-07-03 Thread Michal Privoznik
When reviewing Paolo's pr-helper patches I've noticed couple of
problems:

1) socket_path needs to be calculated at two different places
(one for printing out help, the other if socket activation is NOT
used),

2) even though the default socket_path is allocated in
compute_default_paths() it is the only default path the function
handles. For instance, pidfile is allocated outside of this
function. And yet again, at different places than 1)

Signed-off-by: Michal Privoznik 
---
 scsi/qemu-pr-helper.c | 36 ++--
 1 file changed, 10 insertions(+), 26 deletions(-)

diff --git a/scsi/qemu-pr-helper.c b/scsi/qemu-pr-helper.c
index 0218d65bbf..59df3fd608 100644
--- a/scsi/qemu-pr-helper.c
+++ b/scsi/qemu-pr-helper.c
@@ -76,14 +76,12 @@ static int gid = -1;
 
 static void compute_default_paths(void)
 {
-if (!socket_path) {
-socket_path = qemu_get_local_state_pathname("run/qemu-pr-helper.sock");
-}
+socket_path = qemu_get_local_state_pathname("run/qemu-pr-helper.sock");
+pidfile = qemu_get_local_state_pathname("run/qemu-pr-helper.pid");
 }
 
 static void usage(const char *name)
 {
-compute_default_paths();
 (printf) (
 "Usage: %s [OPTIONS] FILE\n"
 "Persistent Reservation helper program for QEMU\n"
@@ -844,19 +842,6 @@ static gboolean accept_client(QIOChannel *ioc, 
GIOCondition cond, gpointer opaqu
 return TRUE;
 }
 
-
-/*
- * Check socket parameters compatibility when socket activation is used.
- */
-static const char *socket_activation_validate_opts(void)
-{
-if (socket_path != NULL) {
-return "Unix socket can't be set when using socket activation";
-}
-
-return NULL;
-}
-
 static void termsig_handler(int signum)
 {
 atomic_cmpxchg(&state, RUNNING, TERMINATE);
@@ -930,6 +915,7 @@ int main(int argc, char **argv)
 char *trace_file = NULL;
 bool daemonize = false;
 bool pidfile_specified = false;
+bool socket_path_specified = false;
 unsigned socket_activation;
 
 struct sigaction sa_sigterm;
@@ -946,12 +932,14 @@ int main(int argc, char **argv)
 qemu_add_opts(&qemu_trace_opts);
 qemu_init_exec_dir(argv[0]);
 
-pidfile = qemu_get_local_state_pathname("run/qemu-pr-helper.pid");
+compute_default_paths();
 
 while ((ch = getopt_long(argc, argv, sopt, lopt, &opt_ind)) != -1) {
 switch (ch) {
 case 'k':
-socket_path = optarg;
+g_free(socket_path);
+socket_path = g_strdup(optarg);
+socket_path_specified = true;
 if (socket_path[0] != '/') {
 error_report("socket path must be absolute");
 exit(EXIT_FAILURE);
@@ -1042,10 +1030,9 @@ int main(int argc, char **argv)
 socket_activation = check_socket_activation();
 if (socket_activation == 0) {
 SocketAddress saddr;
-compute_default_paths();
 saddr = (SocketAddress){
 .type = SOCKET_ADDRESS_TYPE_UNIX,
-.u.q_unix.path = g_strdup(socket_path)
+.u.q_unix.path = socket_path,
 };
 server_ioc = qio_channel_socket_new();
 if (qio_channel_socket_listen_sync(server_ioc, &saddr, &local_err) < 
0) {
@@ -1053,12 +1040,10 @@ int main(int argc, char **argv)
 error_report_err(local_err);
 return 1;
 }
-g_free(saddr.u.q_unix.path);
 } else {
 /* Using socket activation - check user didn't use -p etc. */
-const char *err_msg = socket_activation_validate_opts();
-if (err_msg != NULL) {
-error_report("%s", err_msg);
+if (socket_path_specified) {
+error_report("Unix socket can't be set when using socket 
activation");
 exit(EXIT_FAILURE);
 }
 
@@ -1075,7 +1060,6 @@ int main(int argc, char **argv)
  error_get_pretty(local_err));
 exit(EXIT_FAILURE);
 }
-socket_path = NULL;
 }
 
 if (qemu_init_main_loop(&local_err)) {
-- 
2.16.4




Re: [Qemu-block] [PATCH v2 1/2] nbd/client: Add x-dirty-bitmap to query bitmap from server

2018-07-03 Thread Vladimir Sementsov-Ogievskiy

02.07.2018 22:14, Eric Blake wrote:

In order to test that the NBD server is properly advertising
dirty bitmaps, we need a bare minimum client that can request
and read the context.  Since feature freeze for 3.0 is imminent,
this is the smallest workable patch, which replaces the qemu
block status report with the results of the NBD server's dirty
bitmap (making it very easy to use 'qemu-img map --output=json'
to learn where the dirty portions are).  Note that the NBD
protocol defines a dirty section with the same bit but opposite
sense that normal "base:allocation" uses to report an allocated
section; so in qemu-img map output, "data":true corresponds to
clean, "data":false corresponds to dirty.

A more complete solution that allows dirty bitmaps to be queried
at the same time as normal block status will be required before
this addition can lose the x- prefix.  Until then, the fact that
this replaces normal status with dirty status means actions
like 'qemu-img convert' will likely misbehave due to treating
dirty regions of the file as if they are unallocated.

The next patch adds an iotest to exercise this new code.

Signed-off-by: Eric Blake 
---

  qapi/block-core.json |  7 ++-
  block/nbd-client.h   |  1 +
  include/block/nbd.h  |  1 +
  block/nbd-client.c   |  3 +++
  block/nbd.c  | 10 --
  nbd/client.c |  4 ++--
  6 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/qapi/block-core.json b/qapi/block-core.json
index 577ce5e9991..90e554ed0ff 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -3471,12 +3471,17 @@
  #
  # @tls-creds:   TLS credentials ID
  #
+# @x-dirty-bitmap: A "qemu:dirty-bitmap:NAME" string to query in place of
+#  traditional "base:allocation" block status (see
+#  NBD_OPT_LIST_META_CONTEXT in the NBD protocol) (since 3.0)
+#


"x-dirty-bitmap=qemu:dirty-bitmap:NAME", is a bit strange, looks like it 
should be "x-dirty-bitmap=NAME", and "qemu:dirty-bitmap" added 
automatically. (and you don't check it, so actually this parameter is 
x-meta-context, and user can select any context, for example, 
"base:allocation", so "x-meta-context=qemu:dirty-bitmap:NAME" sounds 
better too). But I'm ok to leave it as is for now, with x- prefix.




  # Since: 2.9
  ##
  { 'struct': 'BlockdevOptionsNbd',
'data': { 'server': 'SocketAddress',
  '*export': 'str',
-'*tls-creds': 'str' } }
+'*tls-creds': 'str',
+'*x-dirty-bitmap': 'str' } }

  ##
  # @BlockdevOptionsRaw:
diff --git a/block/nbd-client.h b/block/nbd-client.h
index 0ece76e5aff..cfc90550b99 100644
--- a/block/nbd-client.h
+++ b/block/nbd-client.h
@@ -45,6 +45,7 @@ int nbd_client_init(BlockDriverState *bs,
  const char *export_name,
  QCryptoTLSCreds *tlscreds,
  const char *hostname,
+const char *x_dirty_bitmap,
  Error **errp);
  void nbd_client_close(BlockDriverState *bs);

diff --git a/include/block/nbd.h b/include/block/nbd.h
index daaeae61bf9..4638c839f51 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -259,6 +259,7 @@ static inline bool nbd_reply_type_is_error(int type)
  struct NBDExportInfo {
  /* Set by client before nbd_receive_negotiate() */
  bool request_sizes;
+char *x_dirty_bitmap;

  /* In-out fields, set by client before nbd_receive_negotiate() and
   * updated by server results during nbd_receive_negotiate() */
diff --git a/block/nbd-client.c b/block/nbd-client.c
index 8d69eaaa32f..9686ecbd5ee 100644
--- a/block/nbd-client.c
+++ b/block/nbd-client.c
@@ -970,6 +970,7 @@ int nbd_client_init(BlockDriverState *bs,
  const char *export,
  QCryptoTLSCreds *tlscreds,
  const char *hostname,
+const char *x_dirty_bitmap,
  Error **errp)
  {
  NBDClientSession *client = nbd_get_client_session(bs);
@@ -982,9 +983,11 @@ int nbd_client_init(BlockDriverState *bs,
  client->info.request_sizes = true;
  client->info.structured_reply = true;
  client->info.base_allocation = true;
+client->info.x_dirty_bitmap = g_strdup(x_dirty_bitmap);
  ret = nbd_receive_negotiate(QIO_CHANNEL(sioc), export,
  tlscreds, hostname,
  &client->ioc, &client->info, errp);
+g_free(client->info.x_dirty_bitmap);


hm, pointer remains invalid. If you want free it here, what is the 
reason to strdup it? At least, it worth to zero out the pointer after 
g_free.. Or, free it not here but in client close.


with pointer zeroed or g_free to client destroy procedure,

Reviewed-by: Vladimir Sementsov-Ogievskiy 



--
Best regards,
Vladimir




Re: [Qemu-block] [Qemu-devel] [PATCH v4 2/2] iotests: add 222 to test basic fleecing

2018-07-03 Thread Vladimir Sementsov-Ogievskiy

02.07.2018 22:46, John Snow wrote:

Signed-off-by: John Snow 



Reviewed-by: Vladimir Sementsov-Ogievskiy 

--

Best regards,
Vladimir




[Qemu-block] [PATCH v6] crypto: Implement TLS Pre-Shared Keys (PSK).

2018-07-03 Thread Richard W.M. Jones
Pre-Shared Keys (PSK) is a simpler mechanism for enabling TLS
connections than using certificates.  It requires only a simple secret
key:

  $ mkdir -m 0700 /tmp/keys
  $ psktool -u rjones -p /tmp/keys/keys.psk
  $ cat /tmp/keys/keys.psk
  rjones:d543770c15ad93d76443fb56f501a31969235f47e999720ae8d2336f6a13fcbc

The key can be secretly shared between clients and servers.  Clients
must specify the directory containing the "keys.psk" file and a
username (defaults to "qemu").  Servers must specify only the
directory.

Example NBD client:

  $ qemu-img info \
--object 
tls-creds-psk,id=tls0,dir=/tmp/keys,username=rjones,endpoint=client \
--image-opts \

file.driver=nbd,file.host=localhost,file.port=10809,file.tls-creds=tls0,file.export=/

Example NBD server using qemu-nbd:

  $ qemu-nbd -t -x / \
--object tls-creds-psk,id=tls0,endpoint=server,dir=/tmp/keys \
--tls-creds tls0 \
image.qcow2

Example NBD server using nbdkit:

  $ nbdkit -n -e / -fv \
--tls=on --tls-psk=/tmp/keys/keys.psk \
file file=disk.img

Signed-off-by: Richard W.M. Jones 
---
 crypto/Makefile.objs   |   1 +
 crypto/tlscredspsk.c   | 308 +
 crypto/tlssession.c|  56 +-
 crypto/trace-events|   3 +
 include/crypto/tlscredspsk.h   | 106 
 qemu-doc.texi  |  37 
 qemu-options.hx|  24 +++
 tests/Makefile.include |   4 +-
 tests/crypto-tls-psk-helpers.c |  50 ++
 tests/crypto-tls-psk-helpers.h |  29 
 tests/test-crypto-tlssession.c | 185 +---
 11 files changed, 777 insertions(+), 26 deletions(-)

diff --git a/crypto/Makefile.objs b/crypto/Makefile.objs
index 2b99e08062..756bab111b 100644
--- a/crypto/Makefile.objs
+++ b/crypto/Makefile.objs
@@ -15,6 +15,7 @@ crypto-obj-$(CONFIG_AF_ALG) += cipher-afalg.o
 crypto-obj-$(CONFIG_AF_ALG) += hash-afalg.o
 crypto-obj-y += tlscreds.o
 crypto-obj-y += tlscredsanon.o
+crypto-obj-y += tlscredspsk.o
 crypto-obj-y += tlscredsx509.o
 crypto-obj-y += tlssession.o
 crypto-obj-y += secret.o
diff --git a/crypto/tlscredspsk.c b/crypto/tlscredspsk.c
new file mode 100644
index 00..7be7c8efdd
--- /dev/null
+++ b/crypto/tlscredspsk.c
@@ -0,0 +1,308 @@
+/*
+ * QEMU crypto TLS Pre-Shared Keys (PSK) support
+ *
+ * Copyright (c) 2018 Red Hat, Inc.
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, see .
+ *
+ */
+
+#include "qemu/osdep.h"
+#include "crypto/tlscredspsk.h"
+#include "tlscredspriv.h"
+#include "qapi/error.h"
+#include "qom/object_interfaces.h"
+#include "trace.h"
+
+
+#ifdef CONFIG_GNUTLS
+
+static int
+lookup_key(const char *pskfile, const char *username, gnutls_datum_t *key,
+   Error **errp)
+{
+const size_t ulen = strlen(username);
+GError *gerr = NULL;
+char *content = NULL;
+char **lines = NULL;
+size_t clen = 0, i;
+int ret = -1;
+
+if (!g_file_get_contents(pskfile, &content, &clen, &gerr)) {
+error_setg(errp, "Cannot read PSK file %s: %s",
+   pskfile, gerr->message);
+g_error_free(gerr);
+return -1;
+}
+
+lines = g_strsplit(content, "\n", -1);
+for (i = 0; lines[i] != NULL; ++i) {
+if (strncmp(lines[i], username, ulen) == 0 && lines[i][ulen] == ':') {
+key->data = (unsigned char *) g_strdup(&lines[i][ulen + 1]);
+key->size = strlen(lines[i]) - ulen - 1;
+ret = 0;
+goto out;
+}
+}
+error_setg(errp, "Username %s not found in PSK file %s",
+   username, pskfile);
+
+ out:
+free(content);
+g_strfreev(lines);
+return ret;
+}
+
+static int
+qcrypto_tls_creds_psk_load(QCryptoTLSCredsPSK *creds,
+   Error **errp)
+{
+char *pskfile = NULL, *dhparams = NULL;
+const char *username;
+int ret;
+int rv = -1;
+gnutls_datum_t key = { .data = NULL };
+
+trace_qcrypto_tls_creds_psk_load(creds,
+creds->parent_obj.dir ? creds->parent_obj.dir : "");
+
+if (creds->parent_obj.endpoint == QCRYPTO_TLS_CREDS_ENDPOINT_SERVER) {
+if (creds->username) {
+error_setg(errp, "username should not be set when 
endpoint=server");
+goto cleanup;
+}
+
+if (qcrypto_tls_creds_get_path(&creds->parent_obj,
+   QCRYP

[Qemu-block] [PATCH v6] crypto: Implement TLS Pre-Shared Keys (PSK).

2018-07-03 Thread Richard W.M. Jones
v5 was here:

https://lists.gnu.org/archive/html/qemu-devel/2018-06/msg08491.html
https://lists.gnu.org/archive/html/qemu-devel/2018-07/msg00077.html

v6:

 - Make ECDHE-PSK dependent on GnuTLS >= 3.

 - Retested against nbdkit.

 - Retested with internal unit tests.

Rich.





Re: [Qemu-block] [PATCH v5] crypto: Implement TLS Pre-Shared Keys (PSK).

2018-07-03 Thread Richard W.M. Jones


Hi Dan, I was on holiday yesterday so I guess we've missed the deadline.

In any case I will post v6 with the requested change in a few minutes.

Rich.

-- 
Richard Jones, Virtualization Group, Red Hat http://people.redhat.com/~rjones
Read my programming and virtualization blog: http://rwmj.wordpress.com
virt-df lists disk usage of guests without needing to install any
software inside the virtual machine.  Supports Linux and Windows.
http://people.redhat.com/~rjones/virt-df/