[Qemu-block] [PATCH v5 09/10] block-backend: Add blk_co_copy_range

2018-05-22 Thread Fam Zheng
This is a BlockBackend wrapper around the BDS-level copy_range interface.
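
For illustration, a coroutine_fn caller is expected to use it roughly like
this (hypothetical sketch, not part of this patch; the real user is the
qemu-img convert patch later in this series):

    static int coroutine_fn copy_chunk(BlockBackend *src, BlockBackend *dst,
                                       int64_t offset, int bytes)
    {
        /* The offsets and byte count are validated against both
         * BlockBackends before the request is passed down to
         * bdrv_co_copy_range(). */
        return blk_co_copy_range(src, offset, dst, offset, bytes, 0);
    }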

Signed-off-by: Fam Zheng 
Reviewed-by: Stefan Hajnoczi 
---
 block/block-backend.c  | 18 ++
 include/sysemu/block-backend.h |  4 
 2 files changed, 22 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index 89f47b00ea..d55c328736 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2211,3 +2211,21 @@ void blk_unregister_buf(BlockBackend *blk, void *host)
 {
 bdrv_unregister_buf(blk_bs(blk), host);
 }
+
+int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
+   BlockBackend *blk_out, int64_t off_out,
+   int bytes, BdrvRequestFlags flags)
+{
+int r;
+r = blk_check_byte_request(blk_in, off_in, bytes);
+if (r) {
+return r;
+}
+r = blk_check_byte_request(blk_out, off_out, bytes);
+if (r) {
+return r;
+}
+return bdrv_co_copy_range(blk_in->root, off_in,
+  blk_out->root, off_out,
+  bytes, flags);
+}
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 92ab624fac..8d03d493c2 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -232,4 +232,8 @@ void blk_set_force_allow_inactivate(BlockBackend *blk);
 void blk_register_buf(BlockBackend *blk, void *host, size_t size);
 void blk_unregister_buf(BlockBackend *blk, void *host);
 
+int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
+   BlockBackend *blk_out, int64_t off_out,
+   int bytes, BdrvRequestFlags flags);
+
 #endif
-- 
2.14.3




[Qemu-block] [PATCH v5 10/10] qemu-img: Convert with copy offloading

2018-05-22 Thread Fam Zheng
The new blk_co_copy_range interface offers a more efficient way to copy
data, especially in the case of network-based storage. Make use of it to
allow a faster convert operation.

Since copy offloading cannot do zero detection ('-S') or compression
('-c'), only try it when these options are not used.
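
The retry logic added below can be hard to read because of the goto; a
roughly equivalent unrolled form, purely as an illustration of the intent
(not the actual patch code):

    if (copy_range) {
        ret = convert_co_copy_range(s, sector_num, n);
        if (ret < 0) {
            /* Offloading failed: disable it for the rest of the conversion
             * and redo this chunk with the ordinary read/write path. */
            s->copy_range = false;
            ret = convert_co_read(s, sector_num, n, buf);
            if (ret == 0) {
                ret = convert_co_write(s, sector_num, n, buf, status);
            }
        }
    } else {
        /* ordinary read + write path, as before */
    }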

Signed-off-by: Fam Zheng 
Reviewed-by: Stefan Hajnoczi 
---
 qemu-img.c | 50 --
 1 file changed, 48 insertions(+), 2 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 2b5a5706b6..67f3631a81 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1543,6 +1543,7 @@ typedef struct ImgConvertState {
 bool compressed;
 bool target_has_backing;
 bool wr_in_order;
+bool copy_range;
 int min_sparse;
 size_t cluster_sectors;
 size_t buf_sectors;
@@ -1736,6 +1737,37 @@ static int coroutine_fn convert_co_write(ImgConvertState *s, int64_t sector_num,
 return 0;
 }
 
+static int coroutine_fn convert_co_copy_range(ImgConvertState *s, int64_t sector_num,
+  int nb_sectors)
+{
+int n, ret;
+
+while (nb_sectors > 0) {
+BlockBackend *blk;
+int src_cur;
+int64_t bs_sectors, src_cur_offset;
+int64_t offset;
+
+convert_select_part(s, sector_num, &src_cur, &src_cur_offset);
+offset = (sector_num - src_cur_offset) << BDRV_SECTOR_BITS;
+blk = s->src[src_cur];
+bs_sectors = s->src_sectors[src_cur];
+
+n = MIN(nb_sectors, bs_sectors - (sector_num - src_cur_offset));
+
+ret = blk_co_copy_range(blk, offset, s->target,
+sector_num << BDRV_SECTOR_BITS,
+n << BDRV_SECTOR_BITS, 0);
+if (ret < 0) {
+return ret;
+}
+
+sector_num += n;
+nb_sectors -= n;
+}
+return 0;
+}
+
 static void coroutine_fn convert_co_do_copy(void *opaque)
 {
 ImgConvertState *s = opaque;
@@ -1758,6 +1790,7 @@ static void coroutine_fn convert_co_do_copy(void *opaque)
 int n;
 int64_t sector_num;
 enum ImgConvertBlockStatus status;
+bool copy_range;
 
 qemu_co_mutex_lock(&s->lock);
 if (s->ret != -EINPROGRESS || s->sector_num >= s->total_sectors) {
@@ -1787,7 +1820,9 @@ static void coroutine_fn convert_co_do_copy(void *opaque)
 s->allocated_sectors, 0);
 }
 
-if (status == BLK_DATA) {
+retry:
+copy_range = s->copy_range && s->status == BLK_DATA;
+if (status == BLK_DATA && !copy_range) {
 ret = convert_co_read(s, sector_num, n, buf);
 if (ret < 0) {
 error_report("error while reading sector %" PRId64
@@ -1809,7 +1844,15 @@ static void coroutine_fn convert_co_do_copy(void *opaque)
 }
 
 if (s->ret == -EINPROGRESS) {
-ret = convert_co_write(s, sector_num, n, buf, status);
+if (copy_range) {
+ret = convert_co_copy_range(s, sector_num, n);
+if (ret) {
+s->copy_range = false;
+goto retry;
+}
+} else {
+ret = convert_co_write(s, sector_num, n, buf, status);
+}
 if (ret < 0) {
 error_report("error while writing sector %" PRId64
  ": %s", sector_num, strerror(-ret));
@@ -1932,6 +1975,7 @@ static int img_convert(int argc, char **argv)
 ImgConvertState s = (ImgConvertState) {
 /* Need at least 4k of zeros for sparse detection */
 .min_sparse = 8,
+.copy_range = true,
 .buf_sectors= IO_BUF_SIZE / BDRV_SECTOR_SIZE,
 .wr_in_order= true,
 .num_coroutines = 8,
@@ -1972,6 +2016,7 @@ static int img_convert(int argc, char **argv)
 break;
 case 'c':
 s.compressed = true;
+s.copy_range = false;
 break;
 case 'o':
 if (!is_valid_option_list(optarg)) {
@@ -2013,6 +2058,7 @@ static int img_convert(int argc, char **argv)
 }
 
 s.min_sparse = sval / BDRV_SECTOR_SIZE;
+s.copy_range = false;
 break;
 }
 case 'p':
-- 
2.14.3




[Qemu-block] [PATCH v5 08/10] iscsi: Implement copy offloading

2018-05-22 Thread Fam Zheng
Issue the EXTENDED COPY (LID1) command to implement the copy_range API.

The parameter data construction code is modified from libiscsi's
iscsi-dd.c.
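
For readers unfamiliar with EXTENDED COPY, the parameter data assembled by
the code below has roughly the following layout (my summary of the helpers
in this patch plus SPC-4; not quoted from the patch itself):

    /*
     * 16-byte parameter list header            iscsi_xcopy_populate_header()
     * 32-byte target descriptor (source LUN)   iscsi_populate_target_desc()
     * 32-byte target descriptor (destination)  iscsi_populate_target_desc()
     * block-to-block segment descriptor        iscsi_xcopy_desc_hdr() +
     *   (src/dst indexes, LBAs, block count)   iscsi_xcopy_populate_desc()
     */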

Signed-off-by: Fam Zheng 
Reviewed-by: Stefan Hajnoczi 
---
 block/iscsi.c| 219 +++
 include/scsi/constants.h |   3 +
 2 files changed, 222 insertions(+)

diff --git a/block/iscsi.c b/block/iscsi.c
index 6a365cb07b..5ea75646d9 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -2205,6 +2205,221 @@ static void coroutine_fn iscsi_co_invalidate_cache(BlockDriverState *bs,
 iscsi_allocmap_invalidate(iscsilun);
 }
 
+static int coroutine_fn iscsi_co_copy_range_from(BlockDriverState *bs,
+ BdrvChild *src,
+ uint64_t src_offset,
+ BdrvChild *dst,
+ uint64_t dst_offset,
+ uint64_t bytes,
+ BdrvRequestFlags flags)
+{
+return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, flags);
+}
+
+static struct scsi_task *iscsi_xcopy_task(int param_len)
+{
+struct scsi_task *task;
+
+task = g_new0(struct scsi_task, 1);
+
+task->cdb[0] = EXTENDED_COPY;
+task->cdb[10]= (param_len >> 24) & 0xFF;
+task->cdb[11]= (param_len >> 16) & 0xFF;
+task->cdb[12]= (param_len >> 8) & 0xFF;
+task->cdb[13]= param_len & 0xFF;
+task->cdb_size   = 16;
+task->xfer_dir   = SCSI_XFER_WRITE;
+task->expxferlen = param_len;
+
+return task;
+}
+
+static void iscsi_populate_target_desc(unsigned char *desc, IscsiLun *lun)
+{
+struct scsi_inquiry_device_designator *dd = lun->dd;
+
+memset(desc, 0, 32);
+desc[0] = IDENT_DESCR_TGT_DESCR;
+desc[4] = dd->code_set;
+desc[5] = (dd->designator_type & 0xF)
+| ((dd->association & 3) << 4);
+desc[7] = dd->designator_length;
+memcpy(desc + 8, dd->designator, dd->designator_length);
+
+desc[28] = 0;
+desc[29] = (lun->block_size >> 16) & 0xFF;
+desc[30] = (lun->block_size >> 8) & 0xFF;
+desc[31] = lun->block_size & 0xFF;
+}
+
+static void iscsi_xcopy_desc_hdr(uint8_t *hdr, int dc, int cat, int src_index,
+ int dst_index)
+{
+hdr[0] = 0x02; /* BLK_TO_BLK_SEG_DESCR */
+hdr[1] = ((dc << 1) | cat) & 0xFF;
+hdr[2] = (XCOPY_BLK2BLK_SEG_DESC_SIZE >> 8) & 0xFF;
+/* don't account for the first 4 bytes in descriptor header */
+hdr[3] = (XCOPY_BLK2BLK_SEG_DESC_SIZE - 4 /* SEG_DESC_SRC_INDEX_OFFSET */) & 0xFF;
+hdr[4] = (src_index >> 8) & 0xFF;
+hdr[5] = src_index & 0xFF;
+hdr[6] = (dst_index >> 8) & 0xFF;
+hdr[7] = dst_index & 0xFF;
+}
+
+static void iscsi_xcopy_populate_desc(uint8_t *desc, int dc, int cat,
+  int src_index, int dst_index, int num_blks,
+  uint64_t src_lba, uint64_t dst_lba)
+{
+iscsi_xcopy_desc_hdr(desc, dc, cat, src_index, dst_index);
+
+/* The caller should verify the request size */
+assert(num_blks < 65536);
+desc[10] = (num_blks >> 8) & 0xFF;
+desc[11] = num_blks & 0xFF;
+desc[12] = (src_lba >> 56) & 0xFF;
+desc[13] = (src_lba >> 48) & 0xFF;
+desc[14] = (src_lba >> 40) & 0xFF;
+desc[15] = (src_lba >> 32) & 0xFF;
+desc[16] = (src_lba >> 24) & 0xFF;
+desc[17] = (src_lba >> 16) & 0xFF;
+desc[18] = (src_lba >> 8) & 0xFF;
+desc[19] = src_lba & 0xFF;
+desc[20] = (dst_lba >> 56) & 0xFF;
+desc[21] = (dst_lba >> 48) & 0xFF;
+desc[22] = (dst_lba >> 40) & 0xFF;
+desc[23] = (dst_lba >> 32) & 0xFF;
+desc[24] = (dst_lba >> 24) & 0xFF;
+desc[25] = (dst_lba >> 16) & 0xFF;
+desc[26] = (dst_lba >> 8) & 0xFF;
+desc[27] = dst_lba & 0xFF;
+}
+
+static void iscsi_xcopy_populate_header(unsigned char *buf, int list_id, int str,
+int list_id_usage, int prio,
+int tgt_desc_len,
+int seg_desc_len, int inline_data_len)
+{
+buf[0] = list_id;
+buf[1] = ((str & 1) << 5) | ((list_id_usage & 3) << 3) | (prio & 7);
+buf[2] = (tgt_desc_len >> 8) & 0xFF;
+buf[3] = tgt_desc_len & 0xFF;
+buf[8] = (seg_desc_len >> 24) & 0xFF;
+buf[9] = (seg_desc_len >> 16) & 0xFF;
+buf[10] = (seg_desc_len >> 8) & 0xFF;
+buf[11] = seg_desc_len & 0xFF;
+buf[12] = (inline_data_len >> 24) & 0xFF;
+buf[13] = (inline_data_len >> 16) & 0xFF;
+buf[14] = (inline_data_len >> 8) & 0xFF;
+buf[15] = inline_data_len & 0xFF;
+}
+
+static void iscsi_xcopy_data(struct iscsi_data *data,
+ IscsiLun *src, int64_t src_lba,
+ IscsiLun *dst, int64_t dst_lba,
+ uint16_t num_blocks)
+{
+uint8_t *

[Qemu-block] [PATCH v5 06/10] iscsi: Query and save device designator when opening

2018-05-22 Thread Fam Zheng
The device designator data returned by the INQUIRY command will be needed
to fill in the source/target fields during copy offloading. Query it when
connecting to the target and save the data for later use.
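
As I read iscsi_save_designator() below, the selection rule is (comment
sketch, type names per libiscsi):

    /*
     * - skip designators whose association is not 0 (i.e. not addressing
     *   the logical unit itself);
     * - skip designator types above NAA;
     * - of the remaining ones, keep the highest type, so the preference
     *   order is NAA > EUI-64 > T10 vendor ID > vendor specific.
     */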

Signed-off-by: Fam Zheng 
Reviewed-by: Stefan Hajnoczi 
---
 block/iscsi.c| 41 +
 include/scsi/constants.h |  2 ++
 2 files changed, 43 insertions(+)

diff --git a/block/iscsi.c b/block/iscsi.c
index 3fd7203916..6d0035d4b9 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -68,6 +68,7 @@ typedef struct IscsiLun {
 QemuMutex mutex;
 struct scsi_inquiry_logical_block_provisioning lbp;
 struct scsi_inquiry_block_limits bl;
+struct scsi_inquiry_device_designator *dd;
 unsigned char *zeroblock;
 /* The allocmap tracks which clusters (pages) on the iSCSI target are
  * allocated and which are not. In case a target returns zeros for
@@ -1740,6 +1741,30 @@ static QemuOptsList runtime_opts = {
 },
 };
 
+static void iscsi_save_designator(IscsiLun *lun,
+  struct scsi_inquiry_device_identification *inq_di)
+{
+struct scsi_inquiry_device_designator *desig, *copy = NULL;
+
+for (desig = inq_di->designators; desig; desig = desig->next) {
+if (desig->association ||
+desig->designator_type > SCSI_DESIGNATOR_TYPE_NAA) {
+continue;
+}
+/* NAA works better than T10 vendor ID based designator. */
+if (!copy || copy->designator_type < desig->designator_type) {
+copy = desig;
+}
+}
+if (copy) {
+lun->dd = g_new(struct scsi_inquiry_device_designator, 1);
+*lun->dd = *copy;
+lun->dd->next = NULL;
+lun->dd->designator = g_malloc(copy->designator_length);
+memcpy(lun->dd->designator, copy->designator, copy->designator_length);
+}
+}
+
 static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
   Error **errp)
 {
@@ -1922,6 +1947,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
 struct scsi_task *inq_task;
 struct scsi_inquiry_logical_block_provisioning *inq_lbp;
 struct scsi_inquiry_block_limits *inq_bl;
+struct scsi_inquiry_device_identification *inq_di;
 switch (inq_vpd->pages[i]) {
 case SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING:
 inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1,
@@ -1947,6 +1973,17 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
sizeof(struct scsi_inquiry_block_limits));
 scsi_free_scsi_task(inq_task);
 break;
+case SCSI_INQUIRY_PAGECODE_DEVICE_IDENTIFICATION:
+inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1,
+SCSI_INQUIRY_PAGECODE_DEVICE_IDENTIFICATION,
+(void **) &inq_di, errp);
+if (inq_task == NULL) {
+ret = -EINVAL;
+goto out;
+}
+iscsi_save_designator(iscsilun, inq_di);
+scsi_free_scsi_task(inq_task);
+break;
 default:
 break;
 }
@@ -2003,6 +2040,10 @@ static void iscsi_close(BlockDriverState *bs)
 iscsi_logout_sync(iscsi);
 }
 iscsi_destroy_context(iscsi);
+if (iscsilun->dd) {
+g_free(iscsilun->dd->designator);
+g_free(iscsilun->dd);
+}
 g_free(iscsilun->zeroblock);
 iscsi_allocmap_free(iscsilun);
 qemu_mutex_destroy(&iscsilun->mutex);
diff --git a/include/scsi/constants.h b/include/scsi/constants.h
index a141dd71f8..54733b7110 100644
--- a/include/scsi/constants.h
+++ b/include/scsi/constants.h
@@ -311,4 +311,6 @@
 #define MMC_PROFILE_HDDVD_RW_DL 0x005A
 #define MMC_PROFILE_INVALID 0x
 
+#define IDENT_DESCR_TGT_DESCR 0xE4
+
 #endif
-- 
2.14.3




[Qemu-block] [PATCH v5 04/10] qcow2: Implement copy offloading

2018-05-22 Thread Fam Zheng
The two callbacks are implemented quite similarly to the read/write
functions: bdrv_co_copy_range_from maps for read and calls into bs->file
or bs->backing depending on the allocation status; bdrv_co_copy_range_to
maps for write and calls into bs->file.
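
A summary of the mapping decision implemented by qcow2_co_copy_range_from()
in the hunks below (restated as a comment, not new code):

    /*
     * QCOW2_CLUSTER_UNALLOCATED -> copy from bs->backing at the same offset
     *                              (clamped to the backing file length), or
     *                              turn the range into BDRV_REQ_ZERO_WRITE
     *                              if there is no backing data for it
     * QCOW2_CLUSTER_ZERO_*      -> BDRV_REQ_ZERO_WRITE
     * QCOW2_CLUSTER_COMPRESSED  -> -ENOTSUP, caller falls back to bounce
     *                              buffers
     * QCOW2_CLUSTER_NORMAL      -> copy from bs->file at the mapped host
     *                              cluster offset
     */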

Signed-off-by: Fam Zheng 
---
 block/qcow2.c | 226 ++
 1 file changed, 196 insertions(+), 30 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 6d532470a8..e32a3c1518 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1762,6 +1762,39 @@ static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
 return status;
 }
 
+static coroutine_fn int qcow2_handle_l2meta(BlockDriverState *bs,
+QCowL2Meta **pl2meta,
+bool link_l2)
+{
+int ret = 0;
+QCowL2Meta *l2meta = *pl2meta;
+
+while (l2meta != NULL) {
+QCowL2Meta *next;
+
+if (!ret && link_l2) {
+ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
+if (ret) {
+goto out;
+}
+}
+
+/* Take the request off the list of running requests */
+if (l2meta->nb_clusters != 0) {
+QLIST_REMOVE(l2meta, next_in_flight);
+}
+
+qemu_co_queue_restart_all(&l2meta->dependent_requests);
+
+next = l2meta->next;
+g_free(l2meta);
+l2meta = next;
+}
+out:
+*pl2meta = l2meta;
+return ret;
+}
+
 static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
 uint64_t bytes, QEMUIOVector *qiov,
 int flags)
@@ -2048,24 +2081,9 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
 }
 }
 
-while (l2meta != NULL) {
-QCowL2Meta *next;
-
-ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
-if (ret < 0) {
-goto fail;
-}
-
-/* Take the request off the list of running requests */
-if (l2meta->nb_clusters != 0) {
-QLIST_REMOVE(l2meta, next_in_flight);
-}
-
-qemu_co_queue_restart_all(&l2meta->dependent_requests);
-
-next = l2meta->next;
-g_free(l2meta);
-l2meta = next;
+ret = qcow2_handle_l2meta(bs, &l2meta, true);
+if (ret) {
+goto fail;
 }
 
 bytes -= cur_bytes;
@@ -2076,18 +2094,7 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
 ret = 0;
 
 fail:
-while (l2meta != NULL) {
-QCowL2Meta *next;
-
-if (l2meta->nb_clusters != 0) {
-QLIST_REMOVE(l2meta, next_in_flight);
-}
-qemu_co_queue_restart_all(&l2meta->dependent_requests);
-
-next = l2meta->next;
-g_free(l2meta);
-l2meta = next;
-}
+qcow2_handle_l2meta(bs, &l2meta, false);
 
 qemu_co_mutex_unlock(&s->lock);
 
@@ -3274,6 +3281,163 @@ static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
 return ret;
 }
 
+static int coroutine_fn
+qcow2_co_copy_range_from(BlockDriverState *bs,
+ BdrvChild *src, uint64_t src_offset,
+ BdrvChild *dst, uint64_t dst_offset,
+ uint64_t bytes, BdrvRequestFlags flags)
+{
+BDRVQcow2State *s = bs->opaque;
+int ret;
+unsigned int cur_bytes; /* number of bytes in current iteration */
+BdrvChild *child = NULL;
+
+assert(!bs->encrypted);
+qemu_co_mutex_lock(&s->lock);
+
+while (bytes != 0) {
+uint64_t copy_offset = 0;
+/* prepare next request */
+cur_bytes = MIN(bytes, INT_MAX);
+
+ret = qcow2_get_cluster_offset(bs, src_offset, &cur_bytes, &copy_offset);
+if (ret < 0) {
+goto out;
+}
+
+switch (ret) {
+case QCOW2_CLUSTER_UNALLOCATED:
+if (bs->backing && bs->backing->bs) {
+int64_t backing_length = bdrv_getlength(bs->backing->bs);
+if (src_offset >= backing_length) {
+flags |= BDRV_REQ_ZERO_WRITE;
+} else {
+child = bs->backing;
+cur_bytes = MIN(cur_bytes, backing_length - src_offset);
+copy_offset = src_offset;
+}
+} else {
+flags |= BDRV_REQ_ZERO_WRITE;
+}
+break;
+
+case QCOW2_CLUSTER_ZERO_PLAIN:
+case QCOW2_CLUSTER_ZERO_ALLOC:
+flags |= BDRV_REQ_ZERO_WRITE;
+break;
+
+case QCOW2_CLUSTER_COMPRESSED:
+ret = -ENOTSUP;
+goto out;
+break;
+
+case QCOW2_CLUSTER_NORMAL:
+child = bs->file;
+copy_offset += offset_into_cluster(s, src_offset);
+if ((copy_offset & 511) != 0) {

[Qemu-block] [PATCH v5 05/10] file-posix: Implement bdrv_co_copy_range

2018-05-22 Thread Fam Zheng
With copy_file_range(2), we can implement the bdrv_co_copy_range
semantics.
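
A minimal standalone sketch of the copy_file_range(2) loop this patch wraps,
assuming a libc that exposes the wrapper and a 64-bit off_t (the patch itself
goes through the raw syscall to avoid depending on glibc 2.27):

    #define _GNU_SOURCE
    #include <errno.h>
    #include <unistd.h>

    /* Copy 'len' bytes from in_fd/in_off to out_fd/out_off, retrying short
     * copies; mirrors handle_aiocb_copy_range() below. */
    static int copy_all(int in_fd, off_t in_off, int out_fd, off_t out_off,
                        size_t len)
    {
        while (len > 0) {
            ssize_t n = copy_file_range(in_fd, &in_off, out_fd, &out_off,
                                        len, 0);
            if (n < 0) {
                if (errno == EINTR) {
                    continue;
                }
                return errno == ENOSYS ? -ENOTSUP : -errno;
            }
            if (n == 0) {
                return -ENOTSUP; /* no progress, e.g. past EOF: fall back */
            }
            len -= n;
        }
        return 0;
    }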

Signed-off-by: Fam Zheng 
Reviewed-by: Stefan Hajnoczi 
---
 block/file-posix.c  | 96 +++--
 include/block/raw-aio.h | 10 --
 2 files changed, 101 insertions(+), 5 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 5a602cfe37..550a201750 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -59,6 +59,7 @@
 #ifdef __linux__
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -187,6 +188,8 @@ typedef struct RawPosixAIOData {
 #define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
 off_t aio_offset;
 int aio_type;
+int aio_fd2;
+off_t aio_offset2;
 } RawPosixAIOData;
 
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
@@ -1446,6 +1449,47 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
 return -ENOTSUP;
 }
 
+static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
+ off_t *out_off, size_t len, unsigned int flags)
+{
+#ifdef __NR_copy_file_range
+return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
+   out_off, len, flags);
+#else
+errno = ENOSYS;
+return -1;
+#endif
+}
+
+static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb)
+{
+uint64_t bytes = aiocb->aio_nbytes;
+off_t in_off = aiocb->aio_offset;
+off_t out_off = aiocb->aio_offset2;
+
+while (bytes) {
+ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
+  aiocb->aio_fd2, &out_off,
+  bytes, 0);
+if (ret < 0 && errno == EINTR) {
+continue;
+}
+if (ret < 0) {
+if (errno == ENOSYS) {
+return -ENOTSUP;
+} else {
+return -errno;
+}
+}
+if (!ret) {
+/* No progress (e.g. when beyond EOF), fall back to buffer I/O. */
+return -ENOTSUP;
+}
+bytes -= ret;
+}
+return 0;
+}
+
 static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
 {
 int ret = -EOPNOTSUPP;
@@ -1526,6 +1570,9 @@ static int aio_worker(void *arg)
 case QEMU_AIO_WRITE_ZEROES:
 ret = handle_aiocb_write_zeroes(aiocb);
 break;
+case QEMU_AIO_COPY_RANGE:
+ret = handle_aiocb_copy_range(aiocb);
+break;
 default:
 fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
 ret = -EINVAL;
@@ -1536,9 +1583,10 @@ static int aio_worker(void *arg)
 return ret;
 }
 
-static int paio_submit_co(BlockDriverState *bs, int fd,
-  int64_t offset, QEMUIOVector *qiov,
-  int bytes, int type)
+static int paio_submit_co_full(BlockDriverState *bs, int fd,
+   int64_t offset, int fd2, int64_t offset2,
+   QEMUIOVector *qiov,
+   int bytes, int type)
 {
 RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
 ThreadPool *pool;
@@ -1546,6 +1594,8 @@ static int paio_submit_co(BlockDriverState *bs, int fd,
 acb->bs = bs;
 acb->aio_type = type;
 acb->aio_fildes = fd;
+acb->aio_fd2 = fd2;
+acb->aio_offset2 = offset2;
 
 acb->aio_nbytes = bytes;
 acb->aio_offset = offset;
@@ -1561,6 +1611,13 @@ static int paio_submit_co(BlockDriverState *bs, int fd,
 return thread_pool_submit_co(pool, aio_worker, acb);
 }
 
+static inline int paio_submit_co(BlockDriverState *bs, int fd,
+ int64_t offset, QEMUIOVector *qiov,
+ int bytes, int type)
+{
+return paio_submit_co_full(bs, fd, offset, -1, 0, qiov, bytes, type);
+}
+
 static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
 int64_t offset, QEMUIOVector *qiov, int bytes,
 BlockCompletionFunc *cb, void *opaque, int type)
@@ -2451,6 +2508,35 @@ static void raw_abort_perm_update(BlockDriverState *bs)
 raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
 }
 
+static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs,
+   BdrvChild *src, uint64_t src_offset,
+   BdrvChild *dst, uint64_t dst_offset,
+   uint64_t bytes, BdrvRequestFlags flags)
+{
+return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes, 
flags);
+}
+
+static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
+ BdrvChild *src, uint64_t src_offset,
+ BdrvChild *dst, uint64_t dst_offset,
+ uint64_t bytes, BdrvRequestFlags flags)
+{
+BDRVRawState *s = bs->opaque;
+BDRVRawState *src_s;
+
+assert(dst->bs == bs);
+if (src->bs->drv->bdrv_co

[Qemu-block] [PATCH v5 07/10] iscsi: Create and use iscsi_co_wait_for_task

2018-05-22 Thread Fam Zheng
This loop is repeated a growing number of times. Factor it out into a helper.

Signed-off-by: Fam Zheng 
Reviewed-by: Stefan Hajnoczi 
Reviewed-by: Eric Blake 
---
 block/iscsi.c | 54 +-
 1 file changed, 17 insertions(+), 37 deletions(-)

diff --git a/block/iscsi.c b/block/iscsi.c
index 6d0035d4b9..6a365cb07b 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -556,6 +556,17 @@ static inline bool iscsi_allocmap_is_valid(IscsiLun *iscsilun,
offset / iscsilun->cluster_size) == size);
 }
 
+static void coroutine_fn iscsi_co_wait_for_task(IscsiTask *iTask,
+IscsiLun *iscsilun)
+{
+while (!iTask->complete) {
+iscsi_set_events(iscsilun);
+qemu_mutex_unlock(&iscsilun->mutex);
+qemu_coroutine_yield();
+qemu_mutex_lock(&iscsilun->mutex);
+}
+}
+
 static int coroutine_fn
 iscsi_co_writev(BlockDriverState *bs, int64_t sector_num, int nb_sectors,
 QEMUIOVector *iov, int flags)
@@ -617,12 +628,7 @@ retry:
 scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov,
   iov->niov);
 #endif
-while (!iTask.complete) {
-iscsi_set_events(iscsilun);
-qemu_mutex_unlock(&iscsilun->mutex);
-qemu_coroutine_yield();
-qemu_mutex_lock(&iscsilun->mutex);
-}
+iscsi_co_wait_for_task(&iTask, iscsilun);
 
 if (iTask.task != NULL) {
 scsi_free_scsi_task(iTask.task);
@@ -693,13 +699,7 @@ retry:
 ret = -ENOMEM;
 goto out_unlock;
 }
-
-while (!iTask.complete) {
-iscsi_set_events(iscsilun);
-qemu_mutex_unlock(&iscsilun->mutex);
-qemu_coroutine_yield();
-qemu_mutex_lock(&iscsilun->mutex);
-}
+iscsi_co_wait_for_task(&iTask, iscsilun);
 
 if (iTask.do_retry) {
 if (iTask.task != NULL) {
@@ -863,13 +863,8 @@ retry:
 #if LIBISCSI_API_VERSION < (20160603)
 scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov);
 #endif
-while (!iTask.complete) {
-iscsi_set_events(iscsilun);
-qemu_mutex_unlock(&iscsilun->mutex);
-qemu_coroutine_yield();
-qemu_mutex_lock(&iscsilun->mutex);
-}
 
+iscsi_co_wait_for_task(&iTask, iscsilun);
 if (iTask.task != NULL) {
 scsi_free_scsi_task(iTask.task);
 iTask.task = NULL;
@@ -906,12 +901,7 @@ retry:
 return -ENOMEM;
 }
 
-while (!iTask.complete) {
-iscsi_set_events(iscsilun);
-qemu_mutex_unlock(&iscsilun->mutex);
-qemu_coroutine_yield();
-qemu_mutex_lock(&iscsilun->mutex);
-}
+iscsi_co_wait_for_task(&iTask, iscsilun);
 
 if (iTask.task != NULL) {
 scsi_free_scsi_task(iTask.task);
@@ -1143,12 +1133,7 @@ retry:
 goto out_unlock;
 }
 
-while (!iTask.complete) {
-iscsi_set_events(iscsilun);
-qemu_mutex_unlock(&iscsilun->mutex);
-qemu_coroutine_yield();
-qemu_mutex_lock(&iscsilun->mutex);
-}
+iscsi_co_wait_for_task(&iTask, iscsilun);
 
 if (iTask.task != NULL) {
 scsi_free_scsi_task(iTask.task);
@@ -1244,12 +1229,7 @@ retry:
 return -ENOMEM;
 }
 
-while (!iTask.complete) {
-iscsi_set_events(iscsilun);
-qemu_mutex_unlock(&iscsilun->mutex);
-qemu_coroutine_yield();
-qemu_mutex_lock(&iscsilun->mutex);
-}
+iscsi_co_wait_for_task(&iTask, iscsilun);
 
 if (iTask.status == SCSI_STATUS_CHECK_CONDITION &&
 iTask.task->sense.key == SCSI_SENSE_ILLEGAL_REQUEST &&
-- 
2.14.3




[Qemu-block] [PATCH v5 03/10] raw: Implement copy offloading

2018-05-22 Thread Fam Zheng
Just pass the request down to ->file.

Signed-off-by: Fam Zheng 
Reviewed-by: Stefan Hajnoczi 
---
 block/raw-format.c | 32 
 1 file changed, 32 insertions(+)

diff --git a/block/raw-format.c b/block/raw-format.c
index b69a0674b3..f2e468df6f 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -497,6 +497,36 @@ static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
 return bdrv_probe_geometry(bs->file->bs, geo);
 }
 
+static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs,
+   BdrvChild *src, uint64_t src_offset,
+   BdrvChild *dst, uint64_t dst_offset,
+   uint64_t bytes, BdrvRequestFlags flags)
+{
+int ret;
+
+ret = raw_adjust_offset(bs, &src_offset, bytes, false);
+if (ret) {
+return ret;
+}
+return bdrv_co_copy_range_from(bs->file, src_offset, dst, dst_offset,
+   bytes, flags);
+}
+
+static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
+ BdrvChild *src, uint64_t src_offset,
+ BdrvChild *dst, uint64_t dst_offset,
+ uint64_t bytes, BdrvRequestFlags flags)
+{
+int ret;
+
+ret = raw_adjust_offset(bs, &dst_offset, bytes, true);
+if (ret) {
+return ret;
+}
+return bdrv_co_copy_range_to(src, src_offset, bs->file, dst_offset, bytes,
+ flags);
+}
+
 BlockDriver bdrv_raw = {
 .format_name  = "raw",
 .instance_size= sizeof(BDRVRawState),
@@ -513,6 +543,8 @@ BlockDriver bdrv_raw = {
 .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
 .bdrv_co_pdiscard = &raw_co_pdiscard,
 .bdrv_co_block_status = &raw_co_block_status,
+.bdrv_co_copy_range_from = &raw_co_copy_range_from,
+.bdrv_co_copy_range_to  = &raw_co_copy_range_to,
 .bdrv_truncate= &raw_truncate,
 .bdrv_getlength   = &raw_getlength,
 .has_variable_length  = true,
-- 
2.14.3




[Qemu-block] [PATCH v5 02/10] raw: Check byte range uniformly

2018-05-22 Thread Fam Zheng
We don't verify the request range against s->size in the I/O callbacks
except for raw_co_pwritev. This is wrong (especially for
raw_co_pwrite_zeroes and raw_co_pdiscard), so fix them.
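
As a concrete example of what the common check has to enforce (numbers made
up for illustration):

    /* With the runtime options offset=4096 and size=1048576 (1 MiB):
     * - a request that starts beyond guest offset 1048576, or whose length
     *   crosses that boundary, must fail (-EINVAL for reads, -ENOSPC for
     *   writes);
     * - an accepted request at guest offset X is issued to bs->file at
     *   file offset X + 4096. */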

Signed-off-by: Fam Zheng 
---
 block/raw-format.c | 64 +-
 1 file changed, 39 insertions(+), 25 deletions(-)

diff --git a/block/raw-format.c b/block/raw-format.c
index fe33693a2d..b69a0674b3 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -167,16 +167,37 @@ static void raw_reopen_abort(BDRVReopenState *state)
 state->opaque = NULL;
 }
 
+/* Check and adjust the offset against the 'offset' and 'size' options. */
+static inline int raw_adjust_offset(BlockDriverState *bs, uint64_t *offset,
+uint64_t bytes, bool is_write)
+{
+BDRVRawState *s = bs->opaque;
+
+if (s->has_size && (*offset > s->size || bytes > (s->size - *offset))) {
+/* There's not enough space for the write, or the read request is
+ * out-of-range. Don't read/write anything to prevent leaking out of
+ * the size specified in options. */
+return is_write ? -ENOSPC : -EINVAL;
+}
+
+if (*offset > INT64_MAX - s->offset) {
+return -EINVAL;
+}
+*offset += s->offset;
+
+return 0;
+}
+
 static int coroutine_fn raw_co_preadv(BlockDriverState *bs, uint64_t offset,
   uint64_t bytes, QEMUIOVector *qiov,
   int flags)
 {
-BDRVRawState *s = bs->opaque;
+int ret;
 
-if (offset > UINT64_MAX - s->offset) {
-return -EINVAL;
+ret = raw_adjust_offset(bs, &offset, bytes, false);
+if (ret) {
+return ret;
 }
-offset += s->offset;
 
 BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
 return bdrv_co_preadv(bs->file, offset, bytes, qiov, flags);
@@ -186,23 +207,11 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
uint64_t bytes, QEMUIOVector *qiov,
int flags)
 {
-BDRVRawState *s = bs->opaque;
 void *buf = NULL;
 BlockDriver *drv;
 QEMUIOVector local_qiov;
 int ret;
 
-if (s->has_size && (offset > s->size || bytes > (s->size - offset))) {
-/* There's not enough space for the data. Don't write anything and just
- * fail to prevent leaking out of the size specified in options. */
-return -ENOSPC;
-}
-
-if (offset > UINT64_MAX - s->offset) {
-ret = -EINVAL;
-goto fail;
-}
-
 if (bs->probed && offset < BLOCK_PROBE_BUF_SIZE && bytes) {
 /* Handling partial writes would be a pain - so we just
  * require that guests have 512-byte request alignment if
@@ -237,7 +246,10 @@ static int coroutine_fn raw_co_pwritev(BlockDriverState *bs, uint64_t offset,
 qiov = &local_qiov;
 }
 
-offset += s->offset;
+ret = raw_adjust_offset(bs, &offset, bytes, true);
+if (ret) {
+goto fail;
+}
 
 BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
 ret = bdrv_co_pwritev(bs->file, offset, bytes, qiov, flags);
@@ -267,22 +279,24 @@ static int coroutine_fn raw_co_pwrite_zeroes(BlockDriverState *bs,
  int64_t offset, int bytes,
  BdrvRequestFlags flags)
 {
-BDRVRawState *s = bs->opaque;
-if (offset > UINT64_MAX - s->offset) {
-return -EINVAL;
+int ret;
+
+ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true);
+if (ret) {
+return ret;
 }
-offset += s->offset;
 return bdrv_co_pwrite_zeroes(bs->file, offset, bytes, flags);
 }
 
 static int coroutine_fn raw_co_pdiscard(BlockDriverState *bs,
 int64_t offset, int bytes)
 {
-BDRVRawState *s = bs->opaque;
-if (offset > UINT64_MAX - s->offset) {
-return -EINVAL;
+int ret;
+
+ret = raw_adjust_offset(bs, (uint64_t *)&offset, bytes, true);
+if (ret) {
+return ret;
 }
-offset += s->offset;
 return bdrv_co_pdiscard(bs->file->bs, offset, bytes);
 }
 
-- 
2.14.3




[Qemu-block] [PATCH v5 00/10] qemu-img convert with copy offloading

2018-05-22 Thread Fam Zheng
v5: - Fix raw offset/bytes check for read. [Eric]
- Fix qcow2_handle_l2meta. [Stefan]
- Add coroutine_fn wherever appropriate. [Stefan]

v4: - Fix raw offset and size. [Eric]
- iscsi: Drop unnecessary return values and variables in favor of
  constants. [Stefan]
- qcow2: Handle small backing case. [Stefan]
- file-posix: Translate ENOSYS to ENOTSUP. [Stefan]
- API documentation and commit message. [Stefan]
- Add rev-by to patches 3, 5 - 10. [Stefan, Eric]

This series introduces block layer API for copy offloading and makes use of it
in qemu-img convert.

For now we have implemented the operation in the local file protocol with
copy_file_range(2).  Besides that, it's possible to add similar support to
iscsi, nfs and potentially more.

As far as its usage goes, in addition to qemu-img convert, we can emulate
offloading in scsi-disk (handle EXTENDED COPY command), and use the API in
block jobs too.

Fam Zheng (10):
  block: Introduce API for copy offloading
  raw: Check byte range uniformly
  raw: Implement copy offloading
  qcow2: Implement copy offloading
  file-posix: Implement bdrv_co_copy_range
  iscsi: Query and save device designator when opening
  iscsi: Create and use iscsi_co_wait_for_task
  iscsi: Implement copy offloading
  block-backend: Add blk_co_copy_range
  qemu-img: Convert with copy offloading

 block/block-backend.c  |  18 +++
 block/file-posix.c |  96 -
 block/io.c |  97 +
 block/iscsi.c  | 314 -
 block/qcow2.c  | 226 +
 block/raw-format.c |  96 +
 include/block/block.h  |  32 +
 include/block/block_int.h  |  38 +
 include/block/raw-aio.h|  10 +-
 include/scsi/constants.h   |   5 +
 include/sysemu/block-backend.h |   4 +
 qemu-img.c |  50 ++-
 12 files changed, 887 insertions(+), 99 deletions(-)

-- 
2.14.3




[Qemu-block] [PATCH v5 01/10] block: Introduce API for copy offloading

2018-05-22 Thread Fam Zheng
Introduce the bdrv_co_copy_range() API for copy offloading.  Block
drivers implementing this API support efficient copy operations that
avoid reading each block from the source device and writing it to the
destination devices.  Examples of copy offload primitives are SCSI
EXTENDED COPY and Linux copy_file_range(2).
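
The intended dispatch, as I read the patch below (comment-form sketch, not
code from the patch): the copy first recurses down the source chain, then
down the destination chain, until a protocol driver can perform the actual
offloaded copy.

    /*
     * bdrv_co_copy_range(src, ..., dst, ...)
     *   -> bdrv_co_copy_range_from(src, ...)
     *        src->bs->drv->bdrv_co_copy_range_from()
     *        (format drivers adjust offsets and descend; a protocol driver
     *         eventually calls bdrv_co_copy_range_to())
     *   -> bdrv_co_copy_range_to(..., dst, ...)
     *        dst->bs->drv->bdrv_co_copy_range_to()
     *        (destination protocol driver issues the offloaded copy)
     *
     * If either driver lacks the callback, or an image is encrypted, the
     * request fails with -ENOTSUP and the caller can fall back to a
     * regular read/write copy.
     */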

Signed-off-by: Fam Zheng 
Reviewed-by: Stefan Hajnoczi 
---
 block/io.c| 97 +++
 include/block/block.h | 32 
 include/block/block_int.h | 38 +++
 3 files changed, 167 insertions(+)

diff --git a/block/io.c b/block/io.c
index ca96b487eb..b7beaeeb9f 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2835,3 +2835,100 @@ void bdrv_unregister_buf(BlockDriverState *bs, void *host)
 bdrv_unregister_buf(child->bs, host);
 }
 }
+
+static int coroutine_fn bdrv_co_copy_range_internal(BdrvChild *src,
+uint64_t src_offset,
+BdrvChild *dst,
+uint64_t dst_offset,
+uint64_t bytes,
+BdrvRequestFlags flags,
+bool recurse_src)
+{
+int ret;
+
+if (!src || !dst || !src->bs || !dst->bs) {
+return -ENOMEDIUM;
+}
+ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
+if (ret) {
+return ret;
+}
+
+ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
+if (ret) {
+return ret;
+}
+if (flags & BDRV_REQ_ZERO_WRITE) {
+return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, flags);
+}
+
+if (!src->bs->drv->bdrv_co_copy_range_from
+|| !dst->bs->drv->bdrv_co_copy_range_to
+|| src->bs->encrypted || dst->bs->encrypted) {
+return -ENOTSUP;
+}
+if (recurse_src) {
+return src->bs->drv->bdrv_co_copy_range_from(src->bs,
+ src, src_offset,
+ dst, dst_offset,
+ bytes, flags);
+} else {
+return dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
+   src, src_offset,
+   dst, dst_offset,
+   bytes, flags);
+}
+}
+
+/* Copy range from @src to @dst.
+ *
+ * See the comment of bdrv_co_copy_range for the parameter and return value
+ * semantics. */
+int coroutine_fn bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
+ BdrvChild *dst, uint64_t dst_offset,
+ uint64_t bytes, BdrvRequestFlags flags)
+{
+return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
+   bytes, flags, true);
+}
+
+/* Copy range from @src to @dst.
+ *
+ * See the comment of bdrv_co_copy_range for the parameter and return value
+ * semantics. */
+int coroutine_fn bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
+   BdrvChild *dst, uint64_t dst_offset,
+   uint64_t bytes, BdrvRequestFlags flags)
+{
+return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
+   bytes, flags, false);
+}
+
+int coroutine_fn bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
+BdrvChild *dst, uint64_t dst_offset,
+uint64_t bytes, BdrvRequestFlags flags)
+{
+BdrvTrackedRequest src_req, dst_req;
+BlockDriverState *src_bs = src->bs;
+BlockDriverState *dst_bs = dst->bs;
+int ret;
+
+bdrv_inc_in_flight(src_bs);
+bdrv_inc_in_flight(dst_bs);
+tracked_request_begin(&src_req, src_bs, src_offset,
+  bytes, BDRV_TRACKED_READ);
+tracked_request_begin(&dst_req, dst_bs, dst_offset,
+  bytes, BDRV_TRACKED_WRITE);
+
+wait_serialising_requests(&src_req);
+wait_serialising_requests(&dst_req);
+ret = bdrv_co_copy_range_from(src, src_offset,
+  dst, dst_offset,
+  bytes, flags);
+
+tracked_request_end(&src_req);
+tracked_request_end(&dst_req);
+bdrv_dec_in_flight(src_bs);
+bdrv_dec_in_flight(dst_bs);
+return ret;
+}
diff --git a/include/block/block.h b/include/block/block.h
index 3894edda9d..6cc6c7e699 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -611,4 +611,36 @@ bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
  */
 void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size);
 void bdrv_unregister_

Re: [Qemu-block] [Qemu-devel] [PATCH v2 2/3] qemu-iotests: Filter NFS paths

2018-05-22 Thread Fam Zheng
On Fri, 05/18 16:26, Kevin Wolf wrote:
> NFS paths were only partially filtered in _filter_img_create, _img_info
> and _filter_img_info, resulting in "nfs://127.0.0.1TEST_DIR/t.IMGFMT".
> This adds another replacement to the sed calls that matches the test
> directory not as a host path, but as an NFS URL (the prefix as used for
> $TEST_IMG).
> 
> Signed-off-by: Kevin Wolf 
> ---
>  tests/qemu-iotests/common.filter | 6 --
>  tests/qemu-iotests/common.rc | 8 +++-
>  2 files changed, 11 insertions(+), 3 deletions(-)
> 
> diff --git a/tests/qemu-iotests/common.filter 
> b/tests/qemu-iotests/common.filter
> index c5f4bcf578..f08ee55046 100644
> --- a/tests/qemu-iotests/common.filter
> +++ b/tests/qemu-iotests/common.filter
> @@ -119,7 +119,8 @@ _filter_actual_image_size()
>  # replace driver-specific options in the "Formatting..." line
>  _filter_img_create()
>  {
> -sed -e "s#$IMGPROTO:$TEST_DIR#TEST_DIR#g" \
> +sed -e "s#$REMOTE_TEST_DIR#TEST_DIR#g" \
> +-e "s#$IMGPROTO:$TEST_DIR#TEST_DIR#g" \
>  -e "s#$TEST_DIR#TEST_DIR#g" \
>  -e "s#$IMGFMT#IMGFMT#g" \
>  -e 's#nbd:127.0.0.1:10810#TEST_DIR/t.IMGFMT#g' \
> @@ -154,7 +155,8 @@ _filter_img_info()
>  
>  discard=0
>  regex_json_spec_start='^ *"format-specific": \{'
> -sed -e "s#$IMGPROTO:$TEST_DIR#TEST_DIR#g" \
> +sed -e "s#$REMOTE_TEST_DIR#TEST_DIR#g" \
> +-e "s#$IMGPROTO:$TEST_DIR#TEST_DIR#g" \
>  -e "s#$TEST_DIR#TEST_DIR#g" \
>  -e "s#$IMGFMT#IMGFMT#g" \
>  -e 's#nbd://127.0.0.1:10810$#TEST_DIR/t.IMGFMT#g' \
> diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
> index cb5fa14e7f..d236bb9f15 100644
> --- a/tests/qemu-iotests/common.rc
> +++ b/tests/qemu-iotests/common.rc
> @@ -148,6 +148,7 @@ else
>  TEST_IMG="ssh://127.0.0.1$TEST_IMG_FILE"
>  elif [ "$IMGPROTO" = "nfs" ]; then
>  TEST_IMG_FILE=$TEST_DIR/t.$IMGFMT
> +REMOTE_TEST_DIR="nfs://127.0.0.1$TEST_DIR"
>  TEST_IMG="nfs://127.0.0.1$TEST_IMG_FILE"
>  elif [ "$IMGPROTO" = "vxhs" ]; then
>  TEST_IMG_FILE=$TEST_DIR/t.$IMGFMT
> @@ -173,6 +174,10 @@ if [ ! -d "$TEST_DIR" ]; then
>  exit 1
>  fi
>  
> +if [ -z "$REMOTE_TEST_DIR" ]; then
> +REMOTE_TEST_DIR="$TEST_DIR"

This has one extra level of indentation.

Reviewed-by: Fam Zheng 

> +fi
> +
>  if [ ! -d "$SAMPLE_IMG_DIR" ]; then
>  echo "common.config: Error: \$SAMPLE_IMG_DIR ($SAMPLE_IMG_DIR) is not a 
> directory"
>  exit 1
> @@ -333,7 +338,8 @@ _img_info()
>  discard=0
>  regex_json_spec_start='^ *"format-specific": \{'
>  $QEMU_IMG info $QEMU_IMG_EXTRA_ARGS "$@" "$TEST_IMG" 2>&1 | \
> -sed -e "s#$IMGPROTO:$TEST_DIR#TEST_DIR#g" \
> +sed -e "s#$REMOTE_TEST_DIR#TEST_DIR#g" \
> +-e "s#$IMGPROTO:$TEST_DIR#TEST_DIR#g" \
>  -e "s#$TEST_DIR#TEST_DIR#g" \
>  -e "s#$IMGFMT#IMGFMT#g" \
>  -e "/^disk size:/ D" \
> -- 
> 2.13.6
> 
> 



Re: [Qemu-block] [Qemu-devel] [PATCH v2 3/3] qemu-iotests: 086 doesn't work with NFS

2018-05-22 Thread Fam Zheng
On Fri, 05/18 16:26, Kevin Wolf wrote:
> The reference output file only works for file. 'qemu-img convert -p'
> makes a lot more progress updates for NFS than for file, so disable the
> test for NFS.
> 
> Signed-off-by: Kevin Wolf 

Reviewed-by: Fam Zheng 




Re: [Qemu-block] [Qemu-devel] [PATCH v2 1/3] qemu-iotests: Fix paths for NFS

2018-05-22 Thread Fam Zheng
On Fri, 05/18 16:26, Kevin Wolf wrote:
> Test cases were trying to use nfs:// URLs as local filenames, which made
> every test fail for NFS. With TEST_IMG and TEST_IMG_FILE set like for
> the other protocols, NFS tests can pass again.
> 
> Signed-off-by: Kevin Wolf 

Reviewed-by: Fam Zheng 




Re: [Qemu-block] [Qemu-devel] [PATCH 1/2] sheepdog: cleanup repeated expression

2018-05-22 Thread Fam Zheng
On Tue, 05/22 22:10, Paolo Bonzini wrote:
> The expression "SD_INODE_SIZE - sizeof(inode.data_vdi_id)" already has a macro
> defined for the same value (though with a nicer definition using offsetof).
> Replace it.
> 
> Signed-off-by: Paolo Bonzini 

Reviewed-by: Fam Zheng 




Re: [Qemu-block] [Qemu-devel] storing machine data in qcow images?

2018-05-22 Thread Fam Zheng
On Tue, 05/22 17:02, Kevin Wolf wrote:
> Am 22.05.2018 um 16:19 hat Michael S. Tsirkin geschrieben:
> > On Tue, May 22, 2018 at 09:35:55AM +0200, Gerd Hoffmann wrote:
> > >   Hi,
> > > 
> > > > You must /sometimes/ supply the correct machine type.
> > > > 
> > > > It is quite dependent on the guest OS you have installed, and even
> > > > just how the guest OS is configured.  In general Linux is very
> > > > flexible and can adapt to a wide range of hardware, automatically
> > > > detecting things as needed. It is possible for a sysadmin to build
> > > > a Linux image in a way that would only work with I440FX, but I
> > > > don't think it would be common to see that.
> > > 
> > > I think it would be pretty hard to actually build such an image.
> > > 
> > > The more critical thing for linux guests is the storage driver which
> > > must be included into the initrd so the image can mount the root
> > > filesystem.  And the firmware, bios vs. uefi is more critical than
> > > pc vs. q35.
> > 
> > I think we can start by finding a location to embed a string in a qcow
> > image, add ability for qemu-img to set and get this string.  We can
> > discuss how it's formatted separately.
> 
> If we want it, we'll find a place to store it.
> 
> But the first thing we need is a spec for what's actually in it. Just
> storing a machine type hint would be a one-off hack that wouldn't last
> very long before we want to add the next thing.
> 
> Essentially, what we need is a description of the virtual machine that
> we suggest to use with this image. We can try to reuse something
> existing there, like libvirt XML or OVF, or invent something new (a JSON
> array describing runtime options?). One difference to existing formats
> is probably that we want only frontends and no backends in the
> description.
> 

Do we really need a uniform format, requiring compliance with whatever
standard we choose and verification in the block driver? Or can we get
away with a description field that accepts any text, leaving it to the
user to decide what to put there? In the header we could give the
description a Content-type field that defaults to 'text/plain'; that way
apps can mark the data as "application/ovf" if they want, or whatever the
upper layer decides.

Fam



Re: [Qemu-block] [Qemu-devel] [PATCH 2/2] sheepdog: remove huge BSS object

2018-05-22 Thread Fam Zheng
On Tue, 05/22 22:10, Paolo Bonzini wrote:
> block/sheepdog.o has a 4M static variable that is 90% of QEMU's whole .bss
> section.  Replace it with a heap-allocated block, and make it smaller too
> since only the inode header is actually being used.
> 
> bss size goes down from 4464280 to 269976.
> 
> Signed-off-by: Paolo Bonzini 
> ---
>  block/sheepdog.c | 21 +++--
>  1 file changed, 11 insertions(+), 10 deletions(-)
> 
> diff --git a/block/sheepdog.c b/block/sheepdog.c
> index 23cf5a8430..2068166e3e 100644
> --- a/block/sheepdog.c
> +++ b/block/sheepdog.c
> @@ -2940,13 +2940,14 @@ static int sd_snapshot_list(BlockDriverState *bs, 
> QEMUSnapshotInfo **psn_tab)
>  QEMUSnapshotInfo *sn_tab = NULL;
>  unsigned wlen, rlen;
>  int found = 0;
> -static SheepdogInode inode;
> +SheepdogInode *inode;
>  unsigned long *vdi_inuse;
>  unsigned int start_nr;
>  uint64_t hval;
>  uint32_t vid;
>  
>  vdi_inuse = g_malloc(max);
> +inode = g_malloc(SD_INODE_HEADER_SIZE);

Should you g_free() it before returning?
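
Something along these lines would do (sketch only; I haven't checked how
the existing cleanup paths are structured):

    /* e.g. in the common cleanup path, next to the existing vdi_inuse
     * cleanup (exact structure not visible in this hunk): */
    g_free(inode);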


>  
>  fd = connect_to_sdog(s, &local_err);
>  if (fd < 0) {
> @@ -2989,7 +2990,7 @@ static int sd_snapshot_list(BlockDriverState *bs, 
> QEMUSnapshotInfo **psn_tab)
>  }
>  
>  /* we don't need to read entire object */
> -ret = read_object(fd, s->bs, (char *)&inode,
> +ret = read_object(fd, s->bs, (char *)inode,
>vid_to_vdi_oid(vid),
>0, SD_INODE_HEADER_SIZE, 0,
>s->cache_flags);
> @@ -2998,17 +2999,17 @@ static int sd_snapshot_list(BlockDriverState *bs, 
> QEMUSnapshotInfo **psn_tab)
>  continue;
>  }
>  
> -if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
> -sn_tab[found].date_sec = inode.snap_ctime >> 32;
> -sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
> -sn_tab[found].vm_state_size = inode.vm_state_size;
> -sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
> +if (!strcmp(inode->name, s->name) && is_snapshot(inode)) {
> +sn_tab[found].date_sec = inode->snap_ctime >> 32;
> +sn_tab[found].date_nsec = inode->snap_ctime & 0xffffffff;
> +sn_tab[found].vm_state_size = inode->vm_state_size;
> +sn_tab[found].vm_clock_nsec = inode->vm_clock_nsec;
>  
>  snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str),
> - "%" PRIu32, inode.snap_id);
> + "%" PRIu32, inode->snap_id);
>  pstrcpy(sn_tab[found].name,
> -MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
> -inode.tag);
> +MIN(sizeof(sn_tab[found].name), sizeof(inode->tag)),
> +inode->tag);
>  found++;
>  }
>  }
> -- 
> 2.17.0
> 
> 



Re: [Qemu-block] [Qemu-devel] [PATCH 1/2] sheepdog: cleanup repeated expression

2018-05-22 Thread Philippe Mathieu-Daudé
On 05/22/2018 05:10 PM, Paolo Bonzini wrote:
> The expression "SD_INODE_SIZE - sizeof(inode.data_vdi_id)" already has a macro
> defined for the same value (though with a nicer definition using offsetof).
> Replace it.
> 
> Signed-off-by: Paolo Bonzini 

Reviewed-by: Philippe Mathieu-Daudé 

> ---
>  block/sheepdog.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/block/sheepdog.c b/block/sheepdog.c
> index 4237132419..23cf5a8430 100644
> --- a/block/sheepdog.c
> +++ b/block/sheepdog.c
> @@ -2337,7 +2337,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t 
> offset,
>  }
>  
>  /* we don't need to update entire object */
> -datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
> +datalen = SD_INODE_HEADER_SIZE;
>  s->inode.vdi_size = offset;
>  ret = write_object(fd, s->bs, (char *)&s->inode,
> vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
> @@ -2705,7 +2705,7 @@ static int sd_snapshot_create(BlockDriverState *bs, 
> QEMUSnapshotInfo *sn_info)
>   */
>  strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
>  /* we don't need to update entire object */
> -datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
> +datalen = SD_INODE_HEADER_SIZE;
>  inode = g_malloc(datalen);
>  
>  /* refresh inode. */
> @@ -2991,7 +2991,7 @@ static int sd_snapshot_list(BlockDriverState *bs, 
> QEMUSnapshotInfo **psn_tab)
>  /* we don't need to read entire object */
>  ret = read_object(fd, s->bs, (char *)&inode,
>vid_to_vdi_oid(vid),
> -  0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
> +  0, SD_INODE_HEADER_SIZE, 0,
>s->cache_flags);
>  
>  if (ret) {
> 



Re: [Qemu-block] [Qemu-devel] [PATCH 2/2] sheepdog: remove huge BSS object

2018-05-22 Thread Philippe Mathieu-Daudé
On 05/22/2018 05:10 PM, Paolo Bonzini wrote:
> block/sheepdog.o has a 4M static variable that is 90% of QEMU's whole .bss
> section.  Replace it with a heap-allocated block, and make it smaller too
> since only the inode header is actually being used.
> 
> bss size goes down from 4464280 to 269976.

\o/

> Signed-off-by: Paolo Bonzini 

Reviewed-by: Philippe Mathieu-Daudé 

> ---
>  block/sheepdog.c | 21 +++--
>  1 file changed, 11 insertions(+), 10 deletions(-)
> 
> diff --git a/block/sheepdog.c b/block/sheepdog.c
> index 23cf5a8430..2068166e3e 100644
> --- a/block/sheepdog.c
> +++ b/block/sheepdog.c
> @@ -2940,13 +2940,14 @@ static int sd_snapshot_list(BlockDriverState *bs, 
> QEMUSnapshotInfo **psn_tab)
>  QEMUSnapshotInfo *sn_tab = NULL;
>  unsigned wlen, rlen;
>  int found = 0;
> -static SheepdogInode inode;
> +SheepdogInode *inode;
>  unsigned long *vdi_inuse;
>  unsigned int start_nr;
>  uint64_t hval;
>  uint32_t vid;
>  
>  vdi_inuse = g_malloc(max);
> +inode = g_malloc(SD_INODE_HEADER_SIZE);
>  
>  fd = connect_to_sdog(s, &local_err);
>  if (fd < 0) {
> @@ -2989,7 +2990,7 @@ static int sd_snapshot_list(BlockDriverState *bs, 
> QEMUSnapshotInfo **psn_tab)
>  }
>  
>  /* we don't need to read entire object */
> -ret = read_object(fd, s->bs, (char *)&inode,
> +ret = read_object(fd, s->bs, (char *)inode,
>vid_to_vdi_oid(vid),
>0, SD_INODE_HEADER_SIZE, 0,
>s->cache_flags);
> @@ -2998,17 +2999,17 @@ static int sd_snapshot_list(BlockDriverState *bs, 
> QEMUSnapshotInfo **psn_tab)
>  continue;
>  }
>  
> -if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
> -sn_tab[found].date_sec = inode.snap_ctime >> 32;
> -sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
> -sn_tab[found].vm_state_size = inode.vm_state_size;
> -sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
> +if (!strcmp(inode->name, s->name) && is_snapshot(inode)) {
> +sn_tab[found].date_sec = inode->snap_ctime >> 32;
> +sn_tab[found].date_nsec = inode->snap_ctime & 0xffffffff;
> +sn_tab[found].vm_state_size = inode->vm_state_size;
> +sn_tab[found].vm_clock_nsec = inode->vm_clock_nsec;
>  
>  snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str),
> - "%" PRIu32, inode.snap_id);
> + "%" PRIu32, inode->snap_id);
>  pstrcpy(sn_tab[found].name,
> -MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
> -inode.tag);
> +MIN(sizeof(sn_tab[found].name), sizeof(inode->tag)),
> +inode->tag);
>  found++;
>  }
>  }
> 



[Qemu-block] [PATCH 2/2] sheepdog: remove huge BSS object

2018-05-22 Thread Paolo Bonzini
block/sheepdog.o has a 4M static variable that is 90% of QEMU's whole .bss
section.  Replace it with a heap-allocated block, and make it smaller too
since only the inode header is actually being used.

bss size goes down from 4464280 to 269976.

Signed-off-by: Paolo Bonzini 
---
 block/sheepdog.c | 21 +++--
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 23cf5a8430..2068166e3e 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -2940,13 +2940,14 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
 QEMUSnapshotInfo *sn_tab = NULL;
 unsigned wlen, rlen;
 int found = 0;
-static SheepdogInode inode;
+SheepdogInode *inode;
 unsigned long *vdi_inuse;
 unsigned int start_nr;
 uint64_t hval;
 uint32_t vid;
 
 vdi_inuse = g_malloc(max);
+inode = g_malloc(SD_INODE_HEADER_SIZE);
 
 fd = connect_to_sdog(s, &local_err);
 if (fd < 0) {
@@ -2989,7 +2990,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
 }
 
 /* we don't need to read entire object */
-ret = read_object(fd, s->bs, (char *)&inode,
+ret = read_object(fd, s->bs, (char *)inode,
   vid_to_vdi_oid(vid),
   0, SD_INODE_HEADER_SIZE, 0,
   s->cache_flags);
@@ -2998,17 +2999,17 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
 continue;
 }
 
-if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) {
-sn_tab[found].date_sec = inode.snap_ctime >> 32;
-sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff;
-sn_tab[found].vm_state_size = inode.vm_state_size;
-sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec;
+if (!strcmp(inode->name, s->name) && is_snapshot(inode)) {
+sn_tab[found].date_sec = inode->snap_ctime >> 32;
+sn_tab[found].date_nsec = inode->snap_ctime & 0xffffffff;
+sn_tab[found].vm_state_size = inode->vm_state_size;
+sn_tab[found].vm_clock_nsec = inode->vm_clock_nsec;
 
 snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str),
- "%" PRIu32, inode.snap_id);
+ "%" PRIu32, inode->snap_id);
 pstrcpy(sn_tab[found].name,
-MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)),
-inode.tag);
+MIN(sizeof(sn_tab[found].name), sizeof(inode->tag)),
+inode->tag);
 found++;
 }
 }
-- 
2.17.0




[Qemu-block] [PATCH 1/2] sheepdog: cleanup repeated expression

2018-05-22 Thread Paolo Bonzini
The expression "SD_INODE_SIZE - sizeof(inode.data_vdi_id)" already has a macro
defined for the same value (though with a nicer definition using offsetof).
Replace it.

Signed-off-by: Paolo Bonzini 
---
 block/sheepdog.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/block/sheepdog.c b/block/sheepdog.c
index 4237132419..23cf5a8430 100644
--- a/block/sheepdog.c
+++ b/block/sheepdog.c
@@ -2337,7 +2337,7 @@ static int sd_truncate(BlockDriverState *bs, int64_t offset,
 }
 
 /* we don't need to update entire object */
-datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
+datalen = SD_INODE_HEADER_SIZE;
 s->inode.vdi_size = offset;
 ret = write_object(fd, s->bs, (char *)&s->inode,
vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies,
@@ -2705,7 +2705,7 @@ static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
  */
 strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag));
 /* we don't need to update entire object */
-datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id);
+datalen = SD_INODE_HEADER_SIZE;
 inode = g_malloc(datalen);
 
 /* refresh inode. */
@@ -2991,7 +2991,7 @@ static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab)
 /* we don't need to read entire object */
 ret = read_object(fd, s->bs, (char *)&inode,
   vid_to_vdi_oid(vid),
-  0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0,
+  0, SD_INODE_HEADER_SIZE, 0,
   s->cache_flags);
 
 if (ret) {
-- 
2.17.0





[Qemu-block] [PATCH 0/2] sheepdog: remove huge BSS object

2018-05-22 Thread Paolo Bonzini
block/sheepdog.o has a 4M static variable that is 90% of QEMU's whole .bss
section.  Since it doesn't really have to be static, we can just use a
heap allocated block.  We can actually make it smaller too. :)

Patch 1 is a related cleanup since we're touching that area of the code.

Paolo

Paolo Bonzini (2):
  sheepdog: cleanup repeated expression
  sheepdog: remove huge BSS object

 block/sheepdog.c | 27 ++-
 1 file changed, 14 insertions(+), 13 deletions(-)

-- 
2.17.0




Re: [Qemu-block] [Qemu-devel] [PATCH v2 00/40] Generic background jobs

2018-05-22 Thread Marc-André Lureau
Hi

On Tue, May 22, 2018 at 1:01 PM, Kevin Wolf  wrote:
> Am 18.05.2018 um 20:41 hat Dr. David Alan Gilbert geschrieben:
>> * Kevin Wolf (kw...@redhat.com) wrote:
>> > Before we can make x-blockdev-create a background job, we need to
>> > generalise the job infrastructure so that it can be used without any
>> > associated block node.
>>
>> Is there any relationship between what this does, and what
>> Marc-André's 'monitor: add asynchronous command type' tries to do?
>> (See 20180326150916.9602-1-marcandre.lur...@redhat.com 26th March)
>
> Depends on who you ask. :-)
>
> In a way, yes. Marc-André's async commands are for long-running
> operations and jobs are for long-running operations. However, they are
> for a different kind of "long-running".
>
> Basically, Marc-André's work is for commands that are generally quick,
> but perform some blocking operation. Usually commands that are currently
> implemented synchronously, but always bothered us because they block the
> VM for a while. They take maybe a few seconds at most, but you really
> don't want them to block the guest even for short time.
>
> The important part here is you don't want to modify the operations while
> they're running, they just send their return value when they are done.
> (In the latest version, Marc-André even made sure not to modify the wire
> protocol, so IIUC the commands aren't really async any more in the sense
> that you can have multiple commands running, but they are just
> non-blocking in the sense that the guest can keep running while they are
> doing their work.)
>
> In comparison, jobs are typically really long running, like several
> minutes (like copying a whole disk image). Some of them can even run
> indefinitely (like mirroring) until you explicitly tell them to stop.
> You want to continue using the monitor while they are running, and to be
> able to manage them at runtime (pause/resume/set speed/cancel/...).
>
> So I think both address different problems.

I agree with Kevin that both address different needs and complement
each other. Right now, QMP commands are handled synchronously, so they
block the main thread (or the "OOB" thread). I needed a simple way to
handle them asynchronously, *without* breaking or changing an existing
QMP command API. Job, on the other hand, provides a lot of facilities
that are unnecessary for most short living commands, while adding
complexity, at the protocol level (for instance, by using an extra
"job-id" space and not being tied to the qmp session). Depending on
the facilities Job provide, and the guarantee that must hold when
using it, it may make sense or help to transform an existing command
to an asynchronous version, but I doubt many of the existing commands
will need it. However, if there is a need to introduce job-like
facilities to QMP async (such as running concurrent commands, listing
running commands, being able to cancel them, being notified on
progression etc), then I think we should be careful to use Job. For
now, this is unneeded, as QMP async is internal to qemu, the client
can't run anything  to manage an ongoing async command, so I think
both are improvements and will coexist harmoniously.



Re: [Qemu-block] [PATCH 3/4] nbd/client: Support requests of additional block sizing info

2018-05-22 Thread Vladimir Sementsov-Ogievskiy

02.05.2018 00:13, Eric Blake wrote:

The NBD spec is clarifying [1] that a server may want to advertise
different limits for READ/WRITE (in our case, 32M) than for
TRIM/ZERO (in our case, nearly 4G).  Implement the client
side support for these alternate limits, by always requesting
the new information (a compliant server must ignore the
request if it is unknown), and by validating anything reported
by the server before populating the block layer limits.

[1] https://lists.debian.org/nbd/2018/03/msg00048.html

Signed-off-by: Eric Blake 

---

The given URL for the NBD spec was v3; it will change to be a v4
version of that patch in part to point back to this qemu commit
as a proof of implementation.
---
  include/block/nbd.h |   4 ++
  block/nbd.c |  15 ++-
  nbd/client.c| 116 ++--
  nbd/trace-events|   2 +
  4 files changed, 131 insertions(+), 6 deletions(-)

diff --git a/include/block/nbd.h b/include/block/nbd.h
index cbf51628f78..90ddef32bb3 100644
--- a/include/block/nbd.h
+++ b/include/block/nbd.h
@@ -270,6 +270,10 @@ struct NBDExportInfo {
  uint32_t min_block;
  uint32_t opt_block;
  uint32_t max_block;
+uint32_t min_trim;
+uint32_t max_trim;
+uint32_t min_zero;
+uint32_t max_zero;

  uint32_t meta_base_allocation_id;
  };
diff --git a/block/nbd.c b/block/nbd.c
index 1e2b3ba2d3b..823d10b251d 100644
--- a/block/nbd.c
+++ b/block/nbd.c
@@ -478,8 +478,19 @@ static void nbd_refresh_limits(BlockDriverState *bs, Error **errp)
  uint32_t max = MIN_NON_ZERO(NBD_MAX_BUFFER_SIZE, s->info.max_block);

  bs->bl.request_alignment = min ? min : BDRV_SECTOR_SIZE;
-bs->bl.max_pdiscard = max;
-bs->bl.max_pwrite_zeroes = max;
+if (s->info.max_trim) {
+bs->bl.max_pdiscard = MIN(s->info.max_trim, BDRV_REQUEST_MAX_BYTES);
+} else {
+bs->bl.max_pdiscard = max;
+}
+bs->bl.pdiscard_alignment = s->info.min_trim;
+if (s->info.max_zero) {
+bs->bl.max_pwrite_zeroes = MIN(s->info.max_zero,
+   BDRV_REQUEST_MAX_BYTES);
+} else {
+bs->bl.max_pwrite_zeroes = max;
+}
+bs->bl.pwrite_zeroes_alignment = s->info.min_zero;
  bs->bl.max_transfer = max;


Shouldn't max_transfer be updated too? It looks like it limits all
requests; is that right?


Best regards,
Vladimir




Re: [Qemu-block] [Qemu-devel] storing machine data in qcow images?

2018-05-22 Thread Eduardo Habkost
On Tue, May 22, 2018 at 05:02:21PM +0200, Kevin Wolf wrote:
> On 22.05.2018 at 16:19, Michael S. Tsirkin wrote:
> > On Tue, May 22, 2018 at 09:35:55AM +0200, Gerd Hoffmann wrote:
> > >   Hi,
> > > 
> > > > You must /sometimes/ supply the correct machine type.
> > > > 
> > > > It is quite dependent on the guest OS you have installed, and even
> > > > just how the guest OS is configured.  In general Linux is very
> > > > flexible and can adapt to a wide range of hardware, automatically
> > > > detecting things as needed. It is possible for a sysadmin to build
> > > > a Linux image in a way that would only work with I440FX, but I
> > > > don't think it would be common to see that.
> > > 
> > > I think it would be pretty hard to actually build such an image.
> > > 
> > > The more critical thing for Linux guests is the storage driver which
> > > must be included in the initrd so the image can mount the root
> > > filesystem.  And the firmware, bios vs. uefi, is more critical than
> > > pc vs. q35.
> > 
> > I think we can start by finding a location to embed a string in a qcow
> > image and adding the ability for qemu-img to set and get this string.
> > We can discuss how it's formatted separately.
> 
> If we want it, we'll find a place to store it.
> 
> But the first thing we need is a spec for what's actually in it. Just
> storing a machine type hint would be a one-off hack that wouldn't last
> very long before we want to add the next thing.
> 
> Essentially, what we need is a description of the virtual machine that
> we suggest using with this image. We can try to reuse something that
> already exists, like libvirt XML or OVF, or invent something new (a JSON
> array describing runtime options?). One difference from existing formats
> is probably that we want only frontends and no backends in the
> description.

The OVF virtual hardware description might be appropriate for
defining what's required vs. what's recommended, supporting multiple
machine types, and expressing ranges of valid values for variables.

Pro: management software can reuse exactly the same logic for
qcow2 machine descriptions and OVF machine descriptions.

Con: OVF is a pretty complex specification.

-- 
Eduardo



Re: [Qemu-block] [Qemu-devel] storing machine data in qcow images?

2018-05-22 Thread Kevin Wolf
On 22.05.2018 at 16:19, Michael S. Tsirkin wrote:
> On Tue, May 22, 2018 at 09:35:55AM +0200, Gerd Hoffmann wrote:
> >   Hi,
> > 
> > > You must /sometimes/ supply the correct machine type.
> > > 
> > > It is quite dependent on the guest OS you have installed, and even
> > > just how the guest OS is configured.  In general Linux is very
> > > flexible and can adapt to a wide range of hardware, automatically
> > > detecting things as needed. It is possible for a sysadmin to build
> > > a Linux image in a way that would only work with I440FX, but I
> > > don't think it would be common to see that.
> > 
> > I think it would be pretty hard to actually build such an image.
> > 
> > The more critical thing for Linux guests is the storage driver which
> > must be included in the initrd so the image can mount the root
> > filesystem.  And the firmware, bios vs. uefi, is more critical than
> > pc vs. q35.
> 
> I think we can start by finding a location to embed a string in a qcow
> image and adding the ability for qemu-img to set and get this string.
> We can discuss how it's formatted separately.

If we want it, we'll find a place to store it.

But the first thing we need is a spec for what's actually in it. Just
storing a machine type hint would be a one-off hack that wouldn't last
very long before we want to add the next thing.

Essentially, what we need is a description of the virtual machine that
we suggest using with this image. We can try to reuse something that
already exists, like libvirt XML or OVF, or invent something new (a JSON
array describing runtime options?). One difference from existing formats
is probably that we want only frontends and no backends in the
description.
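
Just to give an idea of the shape such a description could take, here
is a minimal frontend-only sketch (the format and all key names are
made up purely for illustration, not a proposal):

{
    "suggested-machine": {
        "machine-type": "q35",
        "firmware": "uefi",
        "devices": [
            { "driver": "virtio-blk-pci" },
            { "driver": "virtio-net-pci" }
        ]
    }
}

Whether something this small is expressive enough, or whether we end up
needing OVF or libvirt XML after all, is exactly the open question.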

Kevin



Re: [Qemu-block] [Qemu-devel] storing machine data in qcow images?

2018-05-22 Thread Michael S. Tsirkin
On Tue, May 22, 2018 at 09:35:55AM +0200, Gerd Hoffmann wrote:
>   Hi,
> 
> > You must /sometimes/ supply the correct machine type.
> > 
> > It is quite dependent on the guest OS you have installed, and even
> > just how the guest OS is configured.  In general Linux is very
> > flexible and can adapt to a wide range of hardware, automatically
> > detecting things as needed. It is possible for a sysadmin to build
> > a Linux image in a way that would only work with I440FX, but I
> > don't think it would be common to see that.
> 
> I think it would be pretty hard to actually build such an image.
> 
> The more critical thing for Linux guests is the storage driver which
> must be included in the initrd so the image can mount the root
> filesystem.  And the firmware, bios vs. uefi, is more critical than
> pc vs. q35.

I think we can start by finding a location to embed a string in a qcow
image and adding the ability for qemu-img to set and get this string.
We can discuss how it's formatted separately.

> > That said I'm not really convinced that using the qcow2 headers is
> > a good plan. We have many disk image formats in common use, qcow2
> > is just one. Even if the user provides the image in qcow2 format,
> > that doesn't mean that mgmt apps actually store the qcow2 file.
> 
> > I tend to think we'd be better looking at what we can do in the context
> > of an existing standard like OVF rather than inventing something that
> > only works with qcow2. I think it would need to be more expressive than
> > just a single list of key,value pairs for each item.
> 
> Embed OVF metadata in the qcow2 image?
> 
> cheers,
>   Gerd

What would be helpful is if we could tell the user who wonders
how to run an image: "hey, you probably want flags X, Y and Z".
I can see how we could have an option to stick either
a bit of XML there or just some QEMU flags.

-- 
MST



[Qemu-block] [PATCH] nbd/server: fix trace

2018-05-22 Thread Vladimir Sementsov-Ogievskiy
A return code of 1 doesn't mean that we parsed base:allocation. Move
the trace point to the appropriate place.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 nbd/server.c | 7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/nbd/server.c b/nbd/server.c
index 9e1f227178..84381baaa8 100644
--- a/nbd/server.c
+++ b/nbd/server.c
@@ -741,7 +741,10 @@ static int nbd_negotiate_send_meta_context(NBDClient *client,
  * the current name, after the 'base:' portion has been stripped.
  *
  * Return -errno on I/O error, 0 if option was completely handled by
- * sending a reply about inconsistent lengths, or 1 on success. */
+ * sending a reply about inconsistent lengths, or 1 on success.
+ *
+ * Note: return code = 1 doesn't mean that we've parsed the "base:allocation"
+ * namespace. It only means that there were no errors. */
 static int nbd_meta_base_query(NBDClient *client, NBDExportMetaContexts *meta,
uint32_t len, Error **errp)
 {
@@ -768,10 +771,10 @@ static int nbd_meta_base_query(NBDClient *client, NBDExportMetaContexts *meta,
 }
 
 if (strncmp(query, "allocation", alen) == 0) {
+trace_nbd_negotiate_meta_query_parse("base:allocation");
 meta->base_allocation = true;
 }
 
-trace_nbd_negotiate_meta_query_parse("base:allocation");
 return 1;
 }
 
-- 
2.11.1




Re: [Qemu-block] [PATCH] sheepdog: Remove unnecessary NULL check in sd_prealloc()

2018-05-22 Thread Kevin Wolf
On 18.05.2018 at 20:17, Peter Maydell wrote:
> In commit 8b9ad56e9cbfd852a, we removed the code that could result
> in our getting to sd_prealloc()'s out_with_err_set label with a
> NULL blk pointer. That makes the NULL check in the error-handling
> path unnecessary, and Coverity gripes about it (CID 1390636).
> Delete the redundant check.
> 
> Signed-off-by: Peter Maydell 

In fact, it was unnecessary even before that, because blk_unref(NULL)
is valid (it does nothing).

Thanks, applied to the block branch.

Kevin



Re: [Qemu-block] [Qemu-devel] [PATCH v2 00/40] Generic background jobs

2018-05-22 Thread Kevin Wolf
On 18.05.2018 at 20:41, Dr. David Alan Gilbert wrote:
> * Kevin Wolf (kw...@redhat.com) wrote:
> > Before we can make x-blockdev-create a background job, we need to
> > generalise the job infrastructure so that it can be used without any
> > associated block node.
> 
> Is there any relationship between what this does, and what
> Marc-André's 'monitor: add asynchronous command type' tries to do?
> (See 20180326150916.9602-1-marcandre.lur...@redhat.com 26th March)

Depends on who you ask. :-)

In a way, yes. Marc-André's async commands are for long-running
operations and jobs are for long-running operations. However, they are
for a different kind of "long-running".

Basically, Marc-André's work is for commands that are generally quick,
but perform some blocking operation. Usually commands that are currently
implemented synchronously, but always bothered us because they block the
VM for a while. They take maybe a few seconds at most, but you really
don't want them to block the guest even for a short time.

The important part here is that you don't want to modify the operations
while they're running; they just send their return value when they are done.
(In the latest version, Marc-André even made sure not to modify the wire
protocol, so IIUC the commands aren't really async any more in the sense
that you can have multiple commands running, but they are just
non-blocking in the sense that the guest can keep running while they are
doing their work.)

In comparison, jobs are typically really long-running, taking several
minutes (copying a whole disk image, for example). Some of them can even run
indefinitely (like mirroring) until you explicitly tell them to stop.
You want to continue using the monitor while they are running, and to be
able to manage them at runtime (pause/resume/set speed/cancel/...).

So I think both address different problems.
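
To make the contrast concrete, here is a rough sketch of the two at the
QMP level (illustrative only: the first command name is a made-up
stand-in, and the job commands are assumed to take the job identifier
as 'id'):

-> { "execute": "some-nonblocking-command" }
   (the guest keeps running and there is nothing to manage; the reply
   simply arrives once the command has done its work)
<- { "return": {} }

versus a job, which stays around and is managed explicitly:

-> { "execute": "job-pause", "arguments": { "id": "job0" } }
<- { "return": {} }
-> { "execute": "job-resume", "arguments": { "id": "job0" } }
<- { "return": {} }
-> { "execute": "job-cancel", "arguments": { "id": "job0" } }
<- { "return": {} }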

Kevin



Re: [Qemu-block] [Qemu-devel] storing machine data in qcow images?

2018-05-22 Thread Eduardo Habkost
On Tue, May 22, 2018 at 09:35:55AM +0200, Gerd Hoffmann wrote:
>   Hi,
> 
> > You must /sometimes/ supply the correct machine type.
> > 
> > It is quite dependent on the guest OS you have installed, and even
> > just how the guest OS is configured.  In general Linux is very
> > flexible and can adapt to a wide range of hardware, automatically
> > detecting things as needed. It is possible for a sysadmin to build
> > a Linux image in a way that would only work with I440FX, but I
> > don't think it would be common to see that.
> 
> I think it would be pretty hard to actually build such an image.
> 
> The more critical thing for Linux guests is the storage driver which
> must be included in the initrd so the image can mount the root
> filesystem.  And the firmware, bios vs. uefi, is more critical than
> pc vs. q35.
> 
> > That said I'm not really convinced that using the qcow2 headers is
> > a good plan. We have many disk image formats in common use, qcow2
> > is just one. Even if the user provides the image in qcow2 format,
> > that doesn't mean that mgmt apps actually store the qcow2 file.
> 
> > I tend to think we'd be better looking at what we can do in the context
> > of an existing standard like OVF rather than inventing something that
> > only works with qcow2. I think it would need to be more expressive than
> > just a single list of key,value pairs for each item.
> 
> Embed OVF metadata in the qcow2 image?

I'm all for using the same standard for specifying machine hints
in both cases.

Now, is there an existing mechanism for virtual hardware hints
(not requirements) in OVF, or do we have to invent one?

-- 
Eduardo



Re: [Qemu-block] [PATCH v2 37/40] job: Add query-jobs QMP command

2018-05-22 Thread Kevin Wolf
On 18.05.2018 at 20:22, Eric Blake wrote:
> On 05/18/2018 08:21 AM, Kevin Wolf wrote:
> > This adds a minimal query-jobs implementation that shouldn't pose many
> > design questions. It can later be extended to expose more information,
> > and especially job-specific information.
> > 
> > Signed-off-by: Kevin Wolf 
> > ---
> >   qapi/job.json  | 45 +
> >   include/qemu/job.h |  3 +++
> >   job-qmp.c  | 54 ++
> >   job.c  |  2 +-
> >   4 files changed, 103 insertions(+), 1 deletion(-)
> > 
> 
> > +##
> > +# @JobInfo:
> > +#
> > +# Information about a job.
> > +#
> > +# @id:  The job identifier
> 
> > +##
> > +{ 'struct': 'JobInfo',
> > +  'data': { 'id': 'str', 'type': 'JobType', 'status': 'JobStatus',
> > +'current-progress': 'int', 'total-progress': 'int',
> > +'*error': 'str' } }
> 
> Is it worth exposing whether a job is auto-finalize and auto-complete? Goes
> back to the issue of whether clients of the new job API would ever want/need
> to rely on the auto- features; while clients of the old blockjob API that
> get the auto- features by default will never be calling the new query-jobs
> command.

There are probably more fields that could be exposed. For most of them,
it's not obvious whether we want to expose them, so I went for the
minimal useful information here. We can always add more information to a
query command; but we can't take information away later if it turns out
that exposing it was a bad idea.
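
For reference, with the fields above a query-jobs reply would look
roughly like this (the values are made up for illustration):

-> { "execute": "query-jobs" }
<- { "return": [ { "id": "job0",
                   "type": "mirror",
                   "status": "running",
                   "current-progress": 134217728,
                   "total-progress": 1073741824 } ] }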

Kevin



Re: [Qemu-block] [PATCH v2 36/40] job: Add lifecycle QMP commands

2018-05-22 Thread Kevin Wolf
On 18.05.2018 at 20:12, Eric Blake wrote:
> On 05/18/2018 08:21 AM, Kevin Wolf wrote:
> > This adds QMP commands that control the transition between states of the
> > job lifecycle.
> > 
> > Signed-off-by: Kevin Wolf 
> > ---
> >   qapi/job.json |  99 +++
> >   job-qmp.c | 134 ++
> >   MAINTAINERS   |   1 +
> >   Makefile.objs |   1 +
> >   trace-events  |   9 
> >   5 files changed, 244 insertions(+)
> >   create mode 100644 job-qmp.c
> > 
> > +##
> > +# @job-dismiss:
> > +#
> > +# Deletes a job that is in the CONCLUDED state. This command only needs to 
> > be
> > +# run explicitly for jobs that don't have automatic dismiss enabled.
> 
> Did we decide whether it is valid to expect a job with automatic dismiss
> enabled (old-style block jobs) to use the new job control commands?

"job control commands" is a pretty broad term, so I'd say yes, it's
expected that you can use job-* commands on any job. For example,
job-pause/resume make perfect sense on a job with auto-dismiss=true.

> Or would it be reasonable to require that 'job-dismiss' is an error on
> jobs with auto-dismiss enabled

This, too, but only because it implicitly follows from the condition
specified above: You'll never catch a job in the CONCLUDED state when
you have auto-dismiss=true because it will immediately move on to NULL.
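
In other words, with auto-dismiss=false the client ends the job's life
explicitly once it has reached CONCLUDED, roughly like this (a sketch,
assuming the command takes the job identifier as 'id'):

-> { "execute": "job-dismiss", "arguments": { "id": "job0" } }
<- { "return": {} }

With auto-dismiss=true the same call can only fail, because by the time
the client could issue it the job has already moved on to NULL and is
gone.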

> (as in, if you're going to use new style jobs, you are guaranteed to
> also have auto-dismiss false, because we don't expose a way to change
> that flag in new-style jobs; and if you use old style jobs, all
> management of the job should be done through the old interfaces).

I'm not completely convinced that auto-dismiss=false is the only
"correct" setting. That might be the case for libvirt, but possibly not
for simple ad-hoc scripts with lower requirements.

We don't have any new jobs yet, so whether we expose the auto-* flags
there is a decision yet to be made.

Kevin



Re: [Qemu-block] [Qemu-devel] storing machine data in qcow images?

2018-05-22 Thread Gerd Hoffmann
  Hi,

> You must /sometimes/ supply the correct machine type.
> 
> It is quite dependent on the guest OS you have installed, and even
> just how the guest OS is configured.  In general Linux is very
> flexible and can adapt to a wide range of hardware, automatically
> detecting things as needed. It is possible for a sysadmin to build
> a Linux image in a way that would only work with I440FX, but I
> don't think it would be common to see that.

I think it would be pretty hard to actually build such an image.

The more critical thing for Linux guests is the storage driver which
must be included in the initrd so the image can mount the root
filesystem.  And the firmware, bios vs. uefi, is more critical than
pc vs. q35.

> That said I'm not really convinced that using the qcow2 headers is
> a good plan. We have many disk image formats in common use, qcow2
> is just one. Even if the user provides the image in qcow2 format,
> that doesn't mean that mgmt apps actually store the qcow2 file.

> I tend to think we'd be better looking at what we can do in the context
> of an existing standard like OVF rather than inventing something that
> only works with qcow2. I think it would need to be more expressive than
> just a single list of key,value pairs for each item.

Embed OVF metadata in the qcow2 image?

cheers,
  Gerd