[Qemu-block] [RFC PATCH v2 7/7] qemu-img: Convert with copy offloading

2018-04-17 Thread Fam Zheng
The new blk_co_copy_range interface offers a more efficient way to copy
data, especially in the case of network-based storage. Make use of it to
allow a faster convert operation.

Since copy offloading cannot do zero detection ('-S') and compression
('-c'), only try it when these options are not used.

Signed-off-by: Fam Zheng 
---
 qemu-img.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 2 deletions(-)

diff --git a/qemu-img.c b/qemu-img.c
index 855fa52514..268b749592 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -1544,6 +1544,7 @@ typedef struct ImgConvertState {
 bool compressed;
 bool target_has_backing;
 bool wr_in_order;
+bool copy_range;
 int min_sparse;
 size_t cluster_sectors;
 size_t buf_sectors;
@@ -1737,6 +1738,37 @@ static int coroutine_fn convert_co_write(ImgConvertState *s, int64_t sector_num,
 return 0;
 }
 
+static int coroutine_fn convert_co_copy_range(ImgConvertState *s, int64_t sector_num,
+  int nb_sectors)
+{
+int n, ret;
+
+while (nb_sectors > 0) {
+BlockBackend *blk;
+int src_cur;
+int64_t bs_sectors, src_cur_offset;
+int64_t offset;
+
+convert_select_part(s, sector_num, &src_cur, &src_cur_offset);
+offset = (sector_num - src_cur_offset) << BDRV_SECTOR_BITS;
+blk = s->src[src_cur];
+bs_sectors = s->src_sectors[src_cur];
+
+n = MIN(nb_sectors, bs_sectors - (sector_num - src_cur_offset));
+
+ret = blk_co_copy_range(blk, offset, s->target,
+sector_num << BDRV_SECTOR_BITS,
+n << BDRV_SECTOR_BITS, 0);
+if (ret < 0) {
+return ret;
+}
+
+sector_num += n;
+nb_sectors -= n;
+}
+return 0;
+}
+
 static void coroutine_fn convert_co_do_copy(void *opaque)
 {
 ImgConvertState *s = opaque;
@@ -1759,6 +1791,7 @@ static void coroutine_fn convert_co_do_copy(void *opaque)
 int n;
 int64_t sector_num;
 enum ImgConvertBlockStatus status;
+bool copy_range;
 
 qemu_co_mutex_lock(&s->lock);
 if (s->ret != -EINPROGRESS || s->sector_num >= s->total_sectors) {
@@ -1788,7 +1821,9 @@ static void coroutine_fn convert_co_do_copy(void *opaque)
 s->allocated_sectors, 0);
 }
 
-if (status == BLK_DATA) {
+retry:
+copy_range = s->copy_range && s->status == BLK_DATA;
+if (status == BLK_DATA && !copy_range) {
 ret = convert_co_read(s, sector_num, n, buf);
 if (ret < 0) {
 error_report("error while reading sector %" PRId64
@@ -1810,7 +1845,15 @@ static void coroutine_fn convert_co_do_copy(void *opaque)
 }
 
 if (s->ret == -EINPROGRESS) {
-ret = convert_co_write(s, sector_num, n, buf, status);
+if (copy_range) {
+ret = convert_co_copy_range(s, sector_num, n);
+if (ret) {
+s->copy_range = false;
+goto retry;
+}
+} else {
+ret = convert_co_write(s, sector_num, n, buf, status);
+}
 if (ret < 0) {
 error_report("error while writing sector %" PRId64
  ": %s", sector_num, strerror(-ret));
@@ -1933,6 +1976,7 @@ static int img_convert(int argc, char **argv)
 ImgConvertState s = (ImgConvertState) {
 /* Need at least 4k of zeros for sparse detection */
 .min_sparse = 8,
+.copy_range = true,
 .buf_sectors= IO_BUF_SIZE / BDRV_SECTOR_SIZE,
 .wr_in_order= true,
 .num_coroutines = 8,
@@ -1973,6 +2017,7 @@ static int img_convert(int argc, char **argv)
 break;
 case 'c':
 s.compressed = true;
+s.copy_range = false;
 break;
 case 'o':
 if (!is_valid_option_list(optarg)) {
@@ -2014,6 +2059,7 @@ static int img_convert(int argc, char **argv)
 }
 
 s.min_sparse = sval / BDRV_SECTOR_SIZE;
+s.copy_range = false;
 break;
 }
 case 'p':
-- 
2.14.3




[Qemu-block] [RFC PATCH v2 2/7] raw: Implement copy offloading

2018-04-17 Thread Fam Zheng
Just pass down to ->file.

Signed-off-by: Fam Zheng 
---
 block/raw-format.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/block/raw-format.c b/block/raw-format.c
index a378547c99..febddf00c0 100644
--- a/block/raw-format.c
+++ b/block/raw-format.c
@@ -482,6 +482,24 @@ static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo)
 return bdrv_probe_geometry(bs->file->bs, geo);
 }
 
+static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs,
+   BdrvChild *src, uint64_t src_offset,
+   BdrvChild *dst, uint64_t dst_offset,
+   uint64_t bytes, BdrvRequestFlags flags)
+{
+return bdrv_co_copy_range_from(bs->file, src_offset, dst, dst_offset,
+   bytes, flags);
+}
+
+static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
+ BdrvChild *src, uint64_t src_offset,
+ BdrvChild *dst, uint64_t dst_offset,
+ uint64_t bytes, BdrvRequestFlags flags)
+{
+return bdrv_co_copy_range_to(src, src_offset, bs->file, dst_offset, bytes,
+ flags);
+}
+
 BlockDriver bdrv_raw = {
 .format_name  = "raw",
 .instance_size= sizeof(BDRVRawState),
@@ -498,6 +516,8 @@ BlockDriver bdrv_raw = {
 .bdrv_co_pwrite_zeroes = &raw_co_pwrite_zeroes,
 .bdrv_co_pdiscard = &raw_co_pdiscard,
 .bdrv_co_block_status = &raw_co_block_status,
+.bdrv_co_copy_range_from = &raw_co_copy_range_from,
+.bdrv_co_copy_range_to  = &raw_co_copy_range_to,
 .bdrv_truncate= &raw_truncate,
 .bdrv_getlength   = &raw_getlength,
 .has_variable_length  = true,
-- 
2.14.3




[Qemu-block] [RFC PATCH v2 5/7] iscsi: Implement copy offloading

2018-04-17 Thread Fam Zheng
Issue the EXTENDED COPY (LID1) command to implement the copy_range API.

The parameter data construction code is ported from libiscsi's
iscsi-dd.c.
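
(For orientation, a sketch of the parameter list this patch constructs,
following SPC-4 and libiscsi's iscsi-dd.c; the offsets here are meant to
orient the reader, not to restate the spec:)

/* EXTENDED COPY (LID1) parameter list, as built by the code below:
 *
 *   16-byte header:
 *     byte  0      LIST IDENTIFIER
 *     byte  1      STR / LIST ID USAGE / PRIORITY flags
 *     bytes 2-3    target descriptor list length (n_tgt * 32)
 *     bytes 8-11   segment descriptor list length (n_seg * 28)
 *     bytes 12-15  inline data length (0 here)
 *   then one 32-byte identification target descriptor (type 0xE4) each
 *   for source and destination, embedding the designator saved by
 *   iscsi_save_designator() below;
 *   then 28-byte block-to-block segment descriptors (type 0x02) holding
 *   the source/destination LBAs and the number of blocks to copy.
 */
static inline int xcopy_param_len(int n_tgt, int n_seg)
{
    return 16 + n_tgt * 32 + n_seg * 28;
}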

Signed-off-by: Fam Zheng 
---
 block/iscsi.c            | 266 ++++++++++++++++++++++++++++++++++++++++++++++
 include/scsi/constants.h |   3 +++
 2 files changed, 269 insertions(+)

diff --git a/block/iscsi.c b/block/iscsi.c
index f5aecfc883..7d17e03ad3 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -68,6 +68,7 @@ typedef struct IscsiLun {
 QemuMutex mutex;
 struct scsi_inquiry_logical_block_provisioning lbp;
 struct scsi_inquiry_block_limits bl;
+struct scsi_inquiry_device_designator *dd;
 unsigned char *zeroblock;
 /* The allocmap tracks which clusters (pages) on the iSCSI target are
  * allocated and which are not. In case a target returns zeros for
@@ -1740,6 +1741,29 @@ static QemuOptsList runtime_opts = {
 },
 };
 
+static void iscsi_save_designator(IscsiLun *lun,
+  struct scsi_inquiry_device_identification *inq_di)
+{
+struct scsi_inquiry_device_designator *desig, *copy = NULL;
+
+for (desig = inq_di->designators; desig; desig = desig->next) {
+if (desig->association ||
+desig->designator_type > SCSI_DESIGNATOR_TYPE_NAA) {
+continue;
+}
+/* NAA works better than T10 vendor ID based designator. */
+if (!copy || copy->designator_type < desig->designator_type) {
+copy = desig;
+}
+}
+if (copy) {
+lun->dd = g_new(struct scsi_inquiry_device_designator, 1);
+*lun->dd = *copy;
+lun->dd->designator = g_malloc(copy->designator_length);
+memcpy(lun->dd->designator, copy->designator, copy->designator_length);
+}
+}
+
 static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
   Error **errp)
 {
@@ -1922,6 +1946,7 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
 struct scsi_task *inq_task;
 struct scsi_inquiry_logical_block_provisioning *inq_lbp;
 struct scsi_inquiry_block_limits *inq_bl;
+struct scsi_inquiry_device_identification *inq_di;
 switch (inq_vpd->pages[i]) {
 case SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING:
 inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1,
@@ -1947,6 +1972,17 @@ static int iscsi_open(BlockDriverState *bs, QDict *options, int flags,
sizeof(struct scsi_inquiry_block_limits));
 scsi_free_scsi_task(inq_task);
 break;
+case SCSI_INQUIRY_PAGECODE_DEVICE_IDENTIFICATION:
+inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1,
+SCSI_INQUIRY_PAGECODE_DEVICE_IDENTIFICATION,
+(void **) &inq_di, errp);
+if (inq_task == NULL) {
+ret = -EINVAL;
+goto out;
+}
+iscsi_save_designator(iscsilun, inq_di);
+scsi_free_scsi_task(inq_task);
+break;
 default:
 break;
 }
@@ -2003,6 +2039,8 @@ static void iscsi_close(BlockDriverState *bs)
 iscsi_logout_sync(iscsi);
 }
 iscsi_destroy_context(iscsi);
+g_free(iscsilun->dd->designator);
+g_free(iscsilun->dd);
 g_free(iscsilun->zeroblock);
 iscsi_allocmap_free(iscsilun);
 qemu_mutex_destroy(&iscsilun->mutex);
@@ -2184,6 +2222,230 @@ static void coroutine_fn iscsi_co_invalidate_cache(BlockDriverState *bs,
 iscsi_allocmap_invalidate(iscsilun);
 }
 
+static int coroutine_fn iscsi_co_copy_range_from(BlockDriverState *bs,
+ BdrvChild *src,
+ uint64_t src_offset,
+ BdrvChild *dst,
+ uint64_t dst_offset,
+ uint64_t bytes,
+ BdrvRequestFlags flags)
+{
+return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
+ flags);
+}
+
+static struct scsi_task *iscsi_xcopy_task(int param_len)
+{
+struct scsi_task *task;
+
+task = g_new0(struct scsi_task, 1);
+
+task->cdb[0] = EXTENDED_COPY;
+task->cdb[10]= (param_len >> 24) & 0xFF;
+task->cdb[11]= (param_len >> 16) & 0xFF;
+task->cdb[12]= (param_len >> 8) & 0xFF;
+task->cdb[13]= param_len & 0xFF;
+task->cdb_size   = 16;
+task->xfer_dir   = SCSI_XFER_WRITE;
+task->expxferlen = param_len;
+
+return task;
+}
+
+static int iscsi_populate_target_desc(unsigned char *desc, IscsiLun *lun)
+{
+struct scsi_inquiry_device_designator *dd = lun->dd;
+
+memset(desc, 0, 32);
+desc[0] = IDENT_DESCR_TGT_DESCR;
+desc[4] = dd->code_set;
+desc[5] = (dd->designator_type & 0xF)
+| 

[Qemu-block] [RFC PATCH v2 6/7] block-backend: Add blk_co_copy_range

2018-04-17 Thread Fam Zheng
It's a BlockBackend wrapper of the BDS interface.

Signed-off-by: Fam Zheng 
---
 block/block-backend.c  | 9 +++++++++
 include/sysemu/block-backend.h | 4 ++++
 2 files changed, 13 insertions(+)

diff --git a/block/block-backend.c b/block/block-backend.c
index 681b240b12..2a984b1864 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -2217,3 +2217,12 @@ void blk_unregister_buf(BlockBackend *blk, void *host)
 {
 bdrv_unregister_buf(blk_bs(blk), host);
 }
+
+int coroutine_fn blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
+   BlockBackend *blk_out, int64_t off_out,
+   int bytes, BdrvRequestFlags flags)
+{
+return bdrv_co_copy_range(blk_in->root, off_in,
+  blk_out->root, off_out,
+  bytes, flags);
+}
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 92ab624fac..6f043b6b51 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -232,4 +232,8 @@ void blk_set_force_allow_inactivate(BlockBackend *blk);
 void blk_register_buf(BlockBackend *blk, void *host, size_t size);
 void blk_unregister_buf(BlockBackend *blk, void *host);
 
+int blk_co_copy_range(BlockBackend *blk_in, int64_t off_in,
+  BlockBackend *blk_out, int64_t off_out,
+  int bytes, BdrvRequestFlags flags);
+
 #endif
-- 
2.14.3




[Qemu-block] [RFC PATCH v2 4/7] file-posix: Implement bdrv_co_copy_range

2018-04-17 Thread Fam Zheng
With copy_file_range(2), we can implement the bdrv_co_copy_range
semantics.
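
(To illustrate the semantics the patch relies on, here is a stand-alone
sketch; it assumes Linux >= 4.5 and invokes the raw syscall, since glibc
did not wrap copy_file_range(2) at the time. Not part of the patch:)

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(int argc, char **argv)
{
    off_t in_off = 0, out_off = 0;
    size_t len = 65536;
    int in, out;

    if (argc != 3) {
        fprintf(stderr, "usage: %s SRC DST\n", argv[0]);
        return 1;
    }
    in = open(argv[1], O_RDONLY);
    out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
    if (in < 0 || out < 0) {
        perror("open");
        return 1;
    }
    while (len > 0) {
        /* The kernel advances in_off/out_off by the bytes copied, so a
         * short copy just loops with the updated offsets. */
        ssize_t n = syscall(__NR_copy_file_range, in, &in_off,
                            out, &out_off, len, 0);
        if (n < 0 && errno == EINTR) {
            continue;
        }
        if (n < 0) {
            perror("copy_file_range"); /* e.g. EXDEV across filesystems */
            return 1;
        }
        if (n == 0) { /* input shorter than len */
            break;
        }
        len -= n;
    }
    return 0;
}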

Signed-off-by: Fam Zheng 
---
 block/file-posix.c  | 99 +++--
 include/block/raw-aio.h | 10 -
 2 files changed, 104 insertions(+), 5 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 3794c0007a..45ad543481 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -100,6 +100,7 @@
#ifdef CONFIG_XFS
#include <xfs/xfs.h>
#endif
+#include <sys/syscall.h>
 
 //#define DEBUG_BLOCK
 
@@ -185,6 +186,8 @@ typedef struct RawPosixAIOData {
 #define aio_ioctl_cmd   aio_nbytes /* for QEMU_AIO_IOCTL */
 off_t aio_offset;
 int aio_type;
+int fd2;
+off_t offset2;
 } RawPosixAIOData;
 
 #if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
@@ -1421,6 +1424,48 @@ static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb)
 return -ENOTSUP;
 }
 
+#ifdef __NR_copy_file_range
+#define HAS_COPY_FILE_RANGE
+#endif
+
+#ifdef HAS_COPY_FILE_RANGE
+static off_t copy_file_range(int in_fd, off_t *in_off, int out_fd,
+ off_t *out_off, size_t len, unsigned int flags)
+{
+return syscall(__NR_copy_file_range, in_fd, in_off, out_fd,
+   out_off, len, flags);
+}
+#endif
+
+static ssize_t handle_aiocb_copy_range(RawPosixAIOData *aiocb)
+{
+#ifndef HAS_COPY_FILE_RANGE
+return -ENOTSUP;
+#else
+uint64_t bytes = aiocb->aio_nbytes;
+off_t in_off = aiocb->aio_offset;
+off_t out_off = aiocb->offset2;
+
+while (bytes) {
+ssize_t ret = copy_file_range(aiocb->aio_fildes, &in_off,
+  aiocb->fd2, &out_off,
+  bytes, 0);
+if (ret < 0 && errno == EINTR) {
+continue;
+}
+if (ret < 0) {
+return -errno;
+}
+if (!ret) {
+/* No progress (e.g. when beyond EOF), fall back to buffer I/O. */
+return -ENOTSUP;
+}
+bytes -= ret;
+}
+return 0;
+#endif
+}
+
 static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb)
 {
 int ret = -EOPNOTSUPP;
@@ -1501,6 +1546,9 @@ static int aio_worker(void *arg)
 case QEMU_AIO_WRITE_ZEROES:
 ret = handle_aiocb_write_zeroes(aiocb);
 break;
+case QEMU_AIO_COPY_RANGE:
+ret = handle_aiocb_copy_range(aiocb);
+break;
 default:
 fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type);
 ret = -EINVAL;
@@ -1511,9 +1559,10 @@ static int aio_worker(void *arg)
 return ret;
 }
 
-static int paio_submit_co(BlockDriverState *bs, int fd,
-  int64_t offset, QEMUIOVector *qiov,
-  int bytes, int type)
+static int paio_submit_co_full(BlockDriverState *bs, int fd,
+   int64_t offset, int fd2, int64_t offset2,
+   QEMUIOVector *qiov,
+   int bytes, int type)
 {
 RawPosixAIOData *acb = g_new(RawPosixAIOData, 1);
 ThreadPool *pool;
@@ -1521,6 +1570,8 @@ static int paio_submit_co(BlockDriverState *bs, int fd,
 acb->bs = bs;
 acb->aio_type = type;
 acb->aio_fildes = fd;
+acb->fd2 = fd2;
+acb->offset2 = offset2;
 
 acb->aio_nbytes = bytes;
 acb->aio_offset = offset;
@@ -1536,6 +1587,13 @@ static int paio_submit_co(BlockDriverState *bs, int fd,
 return thread_pool_submit_co(pool, aio_worker, acb);
 }
 
+static inline int paio_submit_co(BlockDriverState *bs, int fd,
+ int64_t offset, QEMUIOVector *qiov,
+ int bytes, int type)
+{
+return paio_submit_co_full(bs, fd, offset, -1, 0, qiov, bytes, type);
+}
+
 static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd,
 int64_t offset, QEMUIOVector *qiov, int bytes,
 BlockCompletionFunc *cb, void *opaque, int type)
@@ -2312,6 +2370,35 @@ static void raw_abort_perm_update(BlockDriverState *bs)
 raw_handle_perm_lock(bs, RAW_PL_ABORT, 0, 0, NULL);
 }
 
+static int coroutine_fn raw_co_copy_range_from(BlockDriverState *bs,
+  BdrvChild *src, uint64_t src_offset,
+  BdrvChild *dst, uint64_t dst_offset,
+  uint64_t bytes, BdrvRequestFlags flags)
+{
+return bdrv_co_copy_range_to(src, src_offset, dst, dst_offset, bytes,
+ flags);
+}
+
+static int coroutine_fn raw_co_copy_range_to(BlockDriverState *bs,
+ BdrvChild *src, uint64_t src_offset,
+ BdrvChild *dst, uint64_t dst_offset,
+ uint64_t bytes, BdrvRequestFlags flags)
+{
+BDRVRawState *s = bs->opaque;
+BDRVRawState *src_s;
+
+assert(dst->bs == bs);
+if (src->bs->drv->bdrv_co_copy_range_to != raw_co_copy_range_to) {
+return -ENOTSUP;
+}
+
+src_s = src->bs->opaque;
+if 

[Qemu-block] [RFC PATCH v2 0/7] qemu-img convert with copy offloading

2018-04-17 Thread Fam Zheng
v2: - Add iscsi EXTENDED COPY.
- Design change: bdrv_co_copy_range_{from,to}. [Stefan]
- Retry upon EINTR. [Stefan]
- Drop the bounce buffer fallback. It is inefficient to attempt the
  offloaded copy over and over again if the error is returned from the host
  rather than checking our internal state. E.g.  trying copy_file_range
  between two filesystems results in EXDEV, in which case qemu-img.c should
  switch back to the copy path for the remaining sectors.

This series introduces a block layer API for copy offloading and makes use
of it in qemu-img convert.

For now we implemented the operation in the local file protocol with
copy_file_range(2). Besides that, it's possible to add similar support to
iscsi, nfs and potentially more.

As far as its usage goes, in addition to qemu-img convert, we can emulate
offloading in scsi-disk (handle EXTENDED COPY command), and use the API in
block jobs too.
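
To make the resulting call chain concrete, here is a sketch summarizing
patches 1, 6 and 7 (see the individual patches for the authoritative
signatures):

/* Sketch of the call chain this series establishes:
 *
 *   qemu-img convert
 *     -> blk_co_copy_range(blk_in, off_in, blk_out, off_out, bytes, flags)
 *        -> bdrv_co_copy_range(src, ..., dst, ...)          [block/io.c]
 *           -> drv->bdrv_co_copy_range_from(src->bs, ...)
 *              format layer maps guest offsets to host offsets
 *              (e.g. qcow2 forwards to bs->file or bs->backing)
 *           -> drv->bdrv_co_copy_range_to(dst->bs, ...)
 *              protocol layer issues the actual offloaded copy
 *              (copy_file_range(2) for file-posix, EXTENDED COPY for iscsi)
 *
 * If either driver lacks its hook, the chain fails with -ENOTSUP and
 * qemu-img falls back to the ordinary read/write path.
 */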

Fam Zheng (7):
  block: Introduce API for copy offloading
  raw: Implement copy offloading
  qcow2: Implement copy offloading
  file-posix: Implement bdrv_co_copy_range
  iscsi: Implement copy offloading
  block-backend: Add blk_co_copy_range
  qemu-img: Convert with copy offloading

 block/block-backend.c  |   9 ++
 block/file-posix.c |  99 ++-
 block/io.c |  91 ++
 block/iscsi.c  | 266 +
 block/qcow2.c  | 224 +-
 block/raw-format.c |  20 
 include/block/block.h  |   4 +
 include/block/block_int.h  |  30 +
 include/block/raw-aio.h|  10 +-
 include/scsi/constants.h   |   3 +
 include/sysemu/block-backend.h |   4 +
 qemu-img.c |  50 +++-
 12 files changed, 773 insertions(+), 37 deletions(-)

-- 
2.14.3




[Qemu-block] [RFC PATCH v2 3/7] qcow2: Implement copy offloading

2018-04-17 Thread Fam Zheng
The two callbacks are implemented quite similarly to the read/write
functions: bdrv_co_copy_range_from maps for read and calls into bs->file
or bs->backing depending on the allocation status; bdrv_co_copy_range_to
maps for write and calls into bs->file.

Signed-off-by: Fam Zheng 
---
 block/qcow2.c | 224 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 194 insertions(+), 30 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 486f3e83b7..9a0046220d 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1755,6 +1755,31 @@ static int coroutine_fn qcow2_co_block_status(BlockDriverState *bs,
 return status;
 }
 
+static int qcow2_handle_l2meta(BlockDriverState *bs, QCowL2Meta *l2meta)
+{
+int ret = 0;
+
+while (l2meta != NULL) {
+QCowL2Meta *next;
+
+if (!ret) {
+ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
+}
+
+/* Take the request off the list of running requests */
+if (l2meta->nb_clusters != 0) {
+QLIST_REMOVE(l2meta, next_in_flight);
+}
+
+qemu_co_queue_restart_all(&l2meta->dependent_requests);
+
+next = l2meta->next;
+g_free(l2meta);
+l2meta = next;
+}
+return ret;
+}
+
 static coroutine_fn int qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
 uint64_t bytes, QEMUIOVector *qiov,
 int flags)
@@ -2041,24 +2066,10 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
 }
 }
 
-while (l2meta != NULL) {
-QCowL2Meta *next;
-
-ret = qcow2_alloc_cluster_link_l2(bs, l2meta);
-if (ret < 0) {
-goto fail;
-}
-
-/* Take the request off the list of running requests */
-if (l2meta->nb_clusters != 0) {
-QLIST_REMOVE(l2meta, next_in_flight);
-}
-
-qemu_co_queue_restart_all(&l2meta->dependent_requests);
-
-next = l2meta->next;
-g_free(l2meta);
-l2meta = next;
+ret = qcow2_handle_l2meta(bs, l2meta);
+l2meta = NULL;
+if (ret) {
+goto fail;
 }
 
 bytes -= cur_bytes;
@@ -2069,18 +2080,7 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState *bs, uint64_t offset,
 ret = 0;
 
 fail:
-while (l2meta != NULL) {
-QCowL2Meta *next;
-
-if (l2meta->nb_clusters != 0) {
-QLIST_REMOVE(l2meta, next_in_flight);
-}
-qemu_co_queue_restart_all(&l2meta->dependent_requests);
-
-next = l2meta->next;
-g_free(l2meta);
-l2meta = next;
-}
+qcow2_handle_l2meta(bs, l2meta);
 
 qemu_co_mutex_unlock(&s->lock);
 
@@ -3267,6 +3267,168 @@ static coroutine_fn int qcow2_co_pdiscard(BlockDriverState *bs,
 return ret;
 }
 
+static int qcow2_co_copy_range_from(BlockDriverState *bs,
+BdrvChild *src, uint64_t src_offset,
+BdrvChild *dst, uint64_t dst_offset,
+uint64_t bytes, BdrvRequestFlags flags)
+{
+BDRVQcow2State *s = bs->opaque;
+int offset_in_cluster;
+int ret;
+unsigned int cur_bytes; /* number of bytes in current iteration */
+uint64_t cluster_offset = 0;
+BdrvChild *child = NULL;
+
+assert(!bs->encrypted);
+qemu_co_mutex_lock(&s->lock);
+
+while (bytes != 0) {
+
+/* prepare next request */
+cur_bytes = MIN(bytes, INT_MAX);
+
+ret = qcow2_get_cluster_offset(bs, src_offset, &cur_bytes,
+   &cluster_offset);
+if (ret < 0) {
+goto out;
+}
+
+offset_in_cluster = offset_into_cluster(s, src_offset);
+
+switch (ret) {
+case QCOW2_CLUSTER_UNALLOCATED:
+if (bs->backing) {
+child = bs->backing;
+} else {
+flags |= BDRV_REQ_ZERO_WRITE;
+}
+break;
+
+case QCOW2_CLUSTER_ZERO_PLAIN:
+case QCOW2_CLUSTER_ZERO_ALLOC:
+flags |= BDRV_REQ_ZERO_WRITE;
+break;
+
+case QCOW2_CLUSTER_COMPRESSED:
+ret = -ENOTSUP;
+goto out;
+break;
+
+case QCOW2_CLUSTER_NORMAL:
+child = bs->file;
+if ((cluster_offset & 511) != 0) {
+ret = -EIO;
+goto out;
+}
+break;
+
+default:
+abort();
+}
+qemu_co_mutex_unlock(&s->lock);
+ret = bdrv_co_copy_range_from(child,
+  cluster_offset + offset_in_cluster,
+  dst, dst_offset,
+  cur_bytes, flags);
+qemu_co_mutex_lock(&s->lock);
+if (ret < 0) {
+goto out;
+}
+
+bytes -= cur_bytes;
+src_offset += cur_bytes;
+}
+

[Qemu-block] [RFC PATCH v2 1/7] block: Introduce API for copy offloading

2018-04-17 Thread Fam Zheng
Signed-off-by: Fam Zheng 
---
 block/io.c| 91 +++
 include/block/block.h |  4 +++
 include/block/block_int.h | 30 
 3 files changed, 125 insertions(+)

diff --git a/block/io.c b/block/io.c
index bd9a19a9c4..d274e9525f 100644
--- a/block/io.c
+++ b/block/io.c
@@ -2826,3 +2826,94 @@ void bdrv_unregister_buf(BlockDriverState *bs, void *host)
 bdrv_unregister_buf(child->bs, host);
 }
 }
+
+static int bdrv_co_copy_range_internal(BdrvChild *src,
+   uint64_t src_offset,
+   BdrvChild *dst,
+   uint64_t dst_offset,
+   uint64_t bytes, BdrvRequestFlags flags,
+   bool recurse_src)
+{
+int ret;
+
+if (!src || !dst || !src->bs || !dst->bs) {
+return -ENOMEDIUM;
+}
+ret = bdrv_check_byte_request(src->bs, src_offset, bytes);
+if (ret) {
+return ret;
+}
+
+ret = bdrv_check_byte_request(dst->bs, dst_offset, bytes);
+if (ret) {
+return ret;
+}
+if (flags & BDRV_REQ_ZERO_WRITE) {
+return bdrv_co_pwrite_zeroes(dst, dst_offset, bytes, flags);
+}
+
+if (!src->bs->drv->bdrv_co_copy_range_from
+|| !dst->bs->drv->bdrv_co_copy_range_to
+|| src->bs->encrypted || dst->bs->encrypted) {
+return -ENOTSUP;
+}
+if (recurse_src) {
+return src->bs->drv->bdrv_co_copy_range_from(src->bs,
+ src, src_offset,
+ dst, dst_offset,
+ bytes, flags);
+} else {
+return dst->bs->drv->bdrv_co_copy_range_to(dst->bs,
+   src, src_offset,
+   dst, dst_offset,
+   bytes, flags);
+}
+}
+
+/* Copy range from @bs to @dst. */
+int bdrv_co_copy_range_from(BdrvChild *src, uint64_t src_offset,
+BdrvChild *dst, uint64_t dst_offset,
+uint64_t bytes, BdrvRequestFlags flags)
+{
+return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
+   bytes, flags, true);
+}
+
+/* Copy range from @src to @bs. Should only be called by block drivers when @bs
+ * is the leaf. */
+int bdrv_co_copy_range_to(BdrvChild *src, uint64_t src_offset,
+  BdrvChild *dst, uint64_t dst_offset,
+  uint64_t bytes, BdrvRequestFlags flags)
+{
+return bdrv_co_copy_range_internal(src, src_offset, dst, dst_offset,
+   bytes, flags, false);
+}
+
+int bdrv_co_copy_range(BdrvChild *src, uint64_t src_offset,
+   BdrvChild *dst, uint64_t dst_offset,
+   uint64_t bytes, BdrvRequestFlags flags)
+{
+BdrvTrackedRequest src_req, dst_req;
+BlockDriverState *src_bs = src->bs;
+BlockDriverState *dst_bs = dst->bs;
+int ret;
+
+bdrv_inc_in_flight(src_bs);
+bdrv_inc_in_flight(dst_bs);
+tracked_request_begin(&src_req, src_bs, src_offset,
+  bytes, BDRV_TRACKED_READ);
+tracked_request_begin(&dst_req, dst_bs, dst_offset,
+  bytes, BDRV_TRACKED_WRITE);
+
+wait_serialising_requests(&src_req);
+wait_serialising_requests(&dst_req);
+ret = bdrv_co_copy_range_from(src, src_offset,
+  dst, dst_offset,
+  bytes, flags);
+
+tracked_request_end(&src_req);
+tracked_request_end(&dst_req);
+bdrv_dec_in_flight(src_bs);
+bdrv_dec_in_flight(dst_bs);
+return ret;
+}
diff --git a/include/block/block.h b/include/block/block.h
index cdec3639a3..72ac011b2b 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -604,4 +604,8 @@ bool bdrv_can_store_new_dirty_bitmap(BlockDriverState *bs, const char *name,
  */
 void bdrv_register_buf(BlockDriverState *bs, void *host, size_t size);
 void bdrv_unregister_buf(BlockDriverState *bs, void *host);
+
+int bdrv_co_copy_range(BdrvChild *bs, uint64_t offset,
+   BdrvChild *src, uint64_t src_offset,
+   uint64_t bytes, BdrvRequestFlags flags);
 #endif
diff --git a/include/block/block_int.h b/include/block/block_int.h
index c4dd1d4bb8..305c29b75e 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -206,6 +206,29 @@ struct BlockDriver {
 int coroutine_fn (*bdrv_co_pdiscard)(BlockDriverState *bs,
 int64_t offset, int bytes);
 
+/* Map [offset, offset + nbytes) range onto a child of @bs to copy from,
+ * and invoke bdrv_co_copy_range_from(child, ...), or invoke
+ * bdrv_co_copy_range_to() if @bs is the leaf child to 

Re: [Qemu-block] [PATCH 3/3] qemu-iotests: Test new qemu-nbd --nolist option

2018-04-17 Thread Eric Blake
On 04/13/2018 02:26 PM, Nir Soffer wrote:
> Add new test module for tesing the --nolist option.
> 
> Signed-off-by: Nir Soffer 
> ---

> +iotests.log('Check that listing exports is allowed by default')
> +disk, nbd_sock = iotests.file_path('disk1', 'nbd-sock1')
> +iotests.qemu_img_create('-f', iotests.imgfmt, disk, '1m')
> +iotests.qemu_nbd('-k', nbd_sock, '-f', iotests.imgfmt, '-x', 'export', disk)
> +out = iotests.run('nbd-client', '-l', '--unix', nbd_sock)

Should we really be relying on the third-party nbd-client to be
installed?  Would it not be better to teach our own NBD client how
to do interesting things over NBD?  Your use case of listing the
output of NBD_OPT_LIST is one, but another that readily comes to mind is
listing the possibilities of NBD_OPT_LIST_META_CONTEXT that just went
into 2.12.  Maybe making 'qemu-img info' give more details when
connecting to an NBD server, compared to what is normally needed just
for connecting to an export on that server?

Additionally, once we merge in Vladimir's work to expose persistent
dirty bitmaps via NBD_OPT_SET_META_CONTEXT/NBD_CMD_BLOCK_STATUS, it
would be nice to have an in-tree tool for reading out the context
information of an NBD export, perhaps extending what 'qemu-img map' can
already do (Vladimir already mentioned that he only implemented the
server side, and left the client side for an out-of-tree solution [1],
although I'm wondering if that is still the wisest course of action).

[1] https://lists.gnu.org/archive/html/qemu-devel/2018-03/msg05701.html

> +
> +assert 'export' in out.splitlines(), 'Export not in %r' % out
> +
> +iotests.log('Check that listing exports is forbidden with --nolist')
> +disk, nbd_sock = iotests.file_path('disk2', 'nbd-sock2')
> +iotests.qemu_img_create('-f', iotests.imgfmt, disk, '1m')
> +iotests.qemu_nbd('-k', nbd_sock, '-f', iotests.imgfmt, '-x', 'secret',
> + '--nolist', disk)
> +
> +# nbd-client fails when listing is not allowed, but lets not depend on 3rd

s/lets/let's/

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org





Re: [Qemu-block] [Qemu-devel] [PATCH 1/3] nbd: Add option to disallow listing exports

2018-04-17 Thread Eric Blake
On 04/16/2018 06:00 AM, Daniel P. Berrangé wrote:
> On Mon, Apr 16, 2018 at 11:53:41AM +0100, Richard W.M. Jones wrote:
>> On Mon, Apr 16, 2018 at 11:31:18AM +0100, Daniel P. Berrangé wrote:
>>> Essentially this is abusing the export name as a crude authentication
>> token. There are NBD servers that expect NBD_OPT_LIST to always succeed
>>
>> I guess you mean "NBD clients" ...
> 
> Sigh, yes, of course.

qemu 2.10 and older tries to use NBD_OPT_LIST, but gracefully still
tries to connect even if the LIST fails (that is, its use of
NBD_OPT_LIST was for better error handling than what NBD_OPT_EXPORT_NAME
gives, and not because it actually needed the list).  The recent
introduction of NBD_OPT_GO support in qemu 2.11 means that modern
qemu no longer even attempts NBD_OPT_LIST when talking to a new
server.  But cross-implementation compatibility is still a concern, and
there may indeed be non-qemu clients that choke if LIST fails, even
though...

> 
>>> when they detect that the new style protocol is available. I really hate
>>> the idea of making it possible to break the NBD_OPT_LIST functionality
>>> via a command line arg like this.

...the NBD spec suggests that a client that requires LIST to work is not
fully compliant, since NBD_OPT_LIST is an optional feature.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org





Re: [Qemu-block] [PATCH 1/3] nbd: Add option to disallow listing exports

2018-04-17 Thread Eric Blake
On 04/13/2018 02:26 PM, Nir Soffer wrote:
> When a management application exposes images using qemu-nbd, it needs a
> secure way to allow temporary access to the disk. Using a random export
> name can solve this problem:
> 
> nbd://server:10809/22965f19-9ab5-4d18-94e1-cbeb321fa433

I share Dan's concerns that you are trying to protect information
without requiring TLS.  If you would just use TLS, then only clients
that can authenticate can list the export names; the fact that the name
leaks at all means you aren't using TLS, so you are just as vulnerable
to a man-in-the-middle attack as you are to the information leak.

> 
> Assuming that the url is passed to the user in a secure way, and the
> user is using TLS to access the image.
> 
> However, since qemu-nbd implements NBD_OPT_LIST, anyone can easily find
> the secret export:
> 
> $ nbd-client -l server 10809
> Negotiation: ..
> 22965f19-9ab5-4d18-94e1-cbeb321fa433

If the server requires TLS, then 'nbd-client -l' already cannot list
names without first negotiating TLS (all commands other than
NBD_OPT_STARTTLS are rejected with NBD_REP_ERR_TLS_REQD if the server
required TLS).  Your example is thus invalidating your above assumption
that the user is using TLS.

> 
> Add a new --nolist option, disabling listing, similar to the "allowlist"
> nbd-server configuration option.

This may still make sense to implement, but not necessarily for the
reasons you are giving.


> @@ -86,6 +88,7 @@ static void usage(const char *name)
>  "  -v, --verbose display extra debugging information\n"
>  "  -x, --export-name=NAMEexpose export by name\n"
>  "  -D, --description=TEXTwith -x, also export a human-readable 
> description\n"
> +"  --nolist  do not list export\n"

s/export/exports/

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org





Re: [Qemu-block] [PATCH v3 2/2] docs: Document the new default sizes of the qcow2 caches

2018-04-17 Thread Eric Blake
On 04/17/2018 07:37 AM, Alberto Garcia wrote:
> We have just reduced the refcount cache size to the minimum unless
> the user explicitly requests a larger one, so we have to update the
> documentation to reflect this change.
> 
> Signed-off-by: Alberto Garcia 
> ---
>  docs/qcow2-cache.txt | 33 -
>  1 file changed, 16 insertions(+), 17 deletions(-)

Reviewed-by: Eric Blake 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.   +1-919-301-3266
Virtualization:  qemu.org | libvirt.org





Re: [Qemu-block] [Qemu-devel] [PATCH v5 00/21] blockjobs: add explicit job management

2018-04-17 Thread John Snow


On 04/17/2018 09:44 AM, Markus Armbruster wrote:
> John Snow  writes:
> 
>> This series seeks to address two distinct but closely related issues
>> concerning the job management API.
>>
>> (1) For jobs that complete when a monitor is not attached and receiving
>> events or notifications, there's no way to discern the job's final
>> return code. Jobs must remain in the query list until dismissed
>> for reliable management.
>>
>> (2) Jobs that change the block graph structure at an indeterminate point
>> after the job starts compete with the management layer that relies
>> on that graph structure to issue meaningful commands.
>>
>> This structure should change only at the behest of the management
>> API, and not asynchronously at unknown points in time. Before a job
>> issues such changes, it must rely on explicit and synchronous
>> confirmation from the management API.
>>
>> These changes are implemented by formalizing a State Transition Machine
>> for the BlockJob subsystem.
>>
>> Job States:
>>
>> UNDEFINED   Default state. Internal state only.
>> CREATED Job has been created
>> RUNNING Job has been started and is running
>> PAUSED  Job is not ready and has been paused
>> READY   Job is ready and is running
>> STANDBY Job is ready and is paused
>>
>> WAITING Job is waiting on peers in transaction
>> PENDING Job is waiting on ACK from QMP
>> ABORTINGJob is aborting or has been cancelled
>> CONCLUDED   Job has finished and has a retcode available
>> NULLJob is being dismantled. Internal state only.
>>
>> Job Verbs:
>>

Backporting your quote up here:

> For each job verb and job state: what's the new job state?
>

That's not always 1:1, though I tried to address it in the commit messages.


>> CANCEL  Instructs a running job to terminate with error,
>> (Except when that job is READY, which produces no error.)

CANCEL will take a job to either NULL... (this is the early abort
pathway, prior to the job being fully realized.)

...or to ABORTING (from CREATED once it has fully realized the job, or
from RUNNING, READY, WAITING, or PENDING.)

>> PAUSE   Request a job to pause.

issued to RUNNING or READY, transitions to PAUSED or STANDBY respectively.

>> RESUME  Request a job to resume from a pause.

issued to PAUSED or STANDBY, transitions to RUNNING or READY respectively.

>> SET-SPEED   Change the speed limiting parameter of a job.

No run state change.

>> COMPLETEAsk a READY job to finish and exit.
>>

Issued to a READY job, transitions to WAITING.

>> FINALIZEAsk a PENDING job to perform its graph finalization.

Issued to a PENDING job, transitions to CONCLUDED.

>> DISMISS Finish cleaning up an empty job.
> 

Issued to a CONCLUDED job, transitions to NULL.
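
To read those transitions off in one place, they can be written as a
from/to table. This is an illustrative sketch of the STM as described
above, not the in-tree definition:

#include <stdbool.h>

typedef enum JobStatus {
    S_UNDEFINED, S_CREATED, S_RUNNING, S_PAUSED, S_READY, S_STANDBY,
    S_WAITING, S_PENDING, S_ABORTING, S_CONCLUDED, S_NULL, S__MAX
} JobStatus;

/* job_stm[from][to]: is the transition permitted? */
static const bool job_stm[S__MAX][S__MAX] = {
    [S_UNDEFINED][S_CREATED]  = true,
    [S_CREATED][S_RUNNING]    = true, /* job starts                  */
    [S_CREATED][S_NULL]       = true, /* early CANCEL                */
    [S_CREATED][S_ABORTING]   = true, /* CANCEL once fully realized  */
    [S_RUNNING][S_PAUSED]     = true, /* PAUSE                       */
    [S_PAUSED][S_RUNNING]     = true, /* RESUME                      */
    [S_RUNNING][S_READY]      = true, /* job reaches its ready point */
    [S_READY][S_STANDBY]      = true, /* PAUSE                       */
    [S_STANDBY][S_READY]      = true, /* RESUME                      */
    [S_RUNNING][S_WAITING]    = true, /* job finishes on its own     */
    [S_READY][S_WAITING]      = true, /* COMPLETE                    */
    [S_WAITING][S_PENDING]    = true, /* all transaction peers done  */
    [S_PENDING][S_CONCLUDED]  = true, /* FINALIZE                    */
    [S_RUNNING][S_ABORTING]   = true, /* error or CANCEL             */
    [S_READY][S_ABORTING]     = true,
    [S_WAITING][S_ABORTING]   = true,
    [S_PENDING][S_ABORTING]   = true,
    [S_ABORTING][S_CONCLUDED] = true,
    [S_CONCLUDED][S_NULL]     = true, /* DISMISS                     */
};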


>> And here's my stab at a diagram:
>>
>>  +-+
>>  |UNDEFINED|
>>  +--+--+
>> |
>>  +--v+
>>+-+CREATED+-+
>>| +--++ |
>>||  |
>>| +--++ +--+|
>>+-+RUNNING<->PAUSED||
>>| +--+-+--+ +--+|
>>|| ||
>>|| +--+ |
>>||| |
>>| +--v--+   +---+ | |
>>+-+READY<--->STANDBY| | |
>>| +--+--+   +---+ | |
>>||| |
>>| +--v+   | |
>>+-+WAITING<---+ |
>>| +--++ |
>>||  |
>>| +--v+ |
>>+-+PENDING| |
>>| +--++ |
>>||  |
>> +--v-+   +--v--+   |
>> |ABORTING+--->CONCLUDED|   |
>> ++   +--+--+   |
>> |  |
>>  +--v-+|
>>  |NULL<+
>>  ++
> 
> Is this diagram missing a few arrowheads?  E.g. on the edge between
> RUNNING and WAITING.
> 

Apparently yes. :\

(Secretly fixed up in my reply.)

> Might push the limits of ASCII art, but here goes anyway: can we label
> the arrows with job verbs?
> 

Can you recommend a tool to help me do that? I've been using asciiflow
infinity (http://asciiflow.com) and it's not very good, but I don't have
anything better.

> Can you briefly explain how this state machine addresses (1) and (2)?
> 

(1) The CONCLUDED state allows jobs to persist in the job query list

[Qemu-block] Why is there no qom_get in hmp.c?

2018-04-17 Thread QingFeng Hao
Hi all,
I did some investigation and found that "virsh qemu-monitor-command"
supports qom-get, but qemu hmp doesn't. However, in hmp.c there are
qom_list and qom_set. This confused me, and my question is: why is
that? And how can I get a property's value in hmp? e.g.
qemu-system-* -nodefaults -machine accel=qtest -no-shutdown -nographic -monitor 
stdio -serial none -hda /root/t.qcow2
"info qtree" can only get a few properties.

Thanks a lot!
-- 
Regards
QingFeng Hao
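
(For what it's worth, a qom-get HMP command can be layered on the QMP
handler in the same style as the existing hmp_qom_set(). The sketch below
is untested and assumes the 2.12-era helpers qmp_qom_get(),
qobject_to_json_pretty() and hmp_handle_error() behave as in that tree;
it would also need a matching "qom-get" entry in hmp-commands.hx:)

/* Untested sketch, modeled on hmp_qom_set() in hmp.c. */
void hmp_qom_get(Monitor *mon, const QDict *qdict)
{
    const char *path = qdict_get_str(qdict, "path");
    const char *property = qdict_get_str(qdict, "property");
    Error *err = NULL;
    QObject *obj = qmp_qom_get(path, property, &err);

    if (err == NULL) {
        QString *str = qobject_to_json_pretty(obj);
        monitor_printf(mon, "%s\n", qstring_get_str(str));
        QDECREF(str);
    }
    qobject_decref(obj);
    hmp_handle_error(mon, &err);
}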




[Qemu-block] [PATCH 6/6] ide: introduce ide_transfer_start_norecurse

2018-04-17 Thread Paolo Bonzini
For the case where the end_transfer_func is also the caller of
ide_transfer_start, the mutual recursion can lead to unlimited
stack usage.  Introduce a new version that can be used to change
tail recursion into a loop, and use it in trace_ide_atapi_cmd_reply_end.

Signed-off-by: Paolo Bonzini 
---
 hw/ide/atapi.c| 42 +--
 hw/ide/core.c | 16 +++
 include/hw/ide/internal.h |  2 ++
 3 files changed, 37 insertions(+), 23 deletions(-)

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index 7168ff55a7..39e473f9c2 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -245,15 +245,11 @@ static uint16_t atapi_byte_count_limit(IDEState *s)
 void ide_atapi_cmd_reply_end(IDEState *s)
 {
 int byte_count_limit, size, ret;
-trace_ide_atapi_cmd_reply_end(s, s->packet_transfer_size,
-  s->elementary_transfer_size,
-  s->io_buffer_index);
-if (s->packet_transfer_size <= 0) {
-/* end of transfer */
-ide_atapi_cmd_ok(s);
-ide_set_irq(s->bus);
-trace_ide_atapi_cmd_reply_end_eot(s, s->status);
-} else {
+while (s->packet_transfer_size > 0) {
+trace_ide_atapi_cmd_reply_end(s, s->packet_transfer_size,
+  s->elementary_transfer_size,
+  s->io_buffer_index);
+
 /* see if a new sector must be read */
 if (s->lba != -1 && s->io_buffer_index >= s->cd_sector_size) {
 if (!s->elementary_transfer_size) {
@@ -279,11 +275,6 @@ void ide_atapi_cmd_reply_end(IDEState *s)
 size = s->cd_sector_size - s->io_buffer_index;
 if (size > s->elementary_transfer_size)
 size = s->elementary_transfer_size;
-s->packet_transfer_size -= size;
-s->elementary_transfer_size -= size;
-s->io_buffer_index += size;
-ide_transfer_start(s, s->io_buffer + s->io_buffer_index - size,
-   size, ide_atapi_cmd_reply_end);
 } else {
 /* a new transfer is needed */
 s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO;
@@ -306,13 +297,26 @@ void ide_atapi_cmd_reply_end(IDEState *s)
 size = (s->cd_sector_size - s->io_buffer_index);
 }
 trace_ide_atapi_cmd_reply_end_new(s, s->status);
-s->packet_transfer_size -= size;
-s->elementary_transfer_size -= size;
-s->io_buffer_index += size;
-ide_transfer_start(s, s->io_buffer + s->io_buffer_index - size,
-   size, ide_atapi_cmd_reply_end);
+}
+s->packet_transfer_size -= size;
+s->elementary_transfer_size -= size;
+s->io_buffer_index += size;
+
+/* Some adapters process PIO data right away.  In that case, we need
+ * to avoid mutual recursion between ide_transfer_start
+ * and ide_atapi_cmd_reply_end.
+ */
+if (!ide_transfer_start_norecurse(s,
+  s->io_buffer + s->io_buffer_index - size,
+  size, ide_atapi_cmd_reply_end)) {
+return;
 }
 }
+
+/* end of transfer */
+trace_ide_atapi_cmd_reply_end_eot(s, s->status);
+ide_atapi_cmd_ok(s);
+ide_set_irq(s->bus);
 }
 
 /* send a reply of 'size' bytes in s->io_buffer to an ATAPI command */
diff --git a/hw/ide/core.c b/hw/ide/core.c
index bc3648eb13..9884df0e2e 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -524,8 +524,8 @@ static void ide_clear_retry(IDEState *s)
 }
 
 /* prepare data transfer and tell what to do after */
-void ide_transfer_start(IDEState *s, uint8_t *buf, int size,
-EndTransferFunc *end_transfer_func)
+bool ide_transfer_start_norecurse(IDEState *s, uint8_t *buf, int size,
+  EndTransferFunc *end_transfer_func)
 {
 s->data_ptr = buf;
 s->data_end = buf + size;
@@ -535,10 +535,18 @@ void ide_transfer_start(IDEState *s, uint8_t *buf, int size,
 }
 if (!s->bus->dma->ops->pio_transfer) {
 s->end_transfer_func = end_transfer_func;
-return;
+return false;
 }
 s->bus->dma->ops->pio_transfer(s->bus->dma);
-end_transfer_func(s);
+return true;
+}
+
+void ide_transfer_start(IDEState *s, uint8_t *buf, int size,
+EndTransferFunc *end_transfer_func)
+{
+if (ide_transfer_start_norecurse(s, buf, size, end_transfer_func)) {
+end_transfer_func(s);
+}
 }
 
 static void ide_cmd_done(IDEState *s)
diff --git a/include/hw/ide/internal.h b/include/hw/ide/internal.h
index f3de6f9b73..594081e57f 100644
--- a/include/hw/ide/internal.h
+++ b/include/hw/ide/internal.h
@@ -623,6 +623,8 @@ void ide_exec_cmd(IDEBus *bus, uint32_t val);
 
 void ide_transfer_start(IDEState *s, uint8_t *buf, int 

[Qemu-block] [PATCH 1/6] ahci: move PIO Setup FIS before transfer, fix it for ATAPI commands

2018-04-17 Thread Paolo Bonzini
The PIO Setup FIS is written in the PIO:Entry state, which comes before
the ATA and ATAPI data transfer states.  As a result, the PIO Setup FIS
interrupt is now raised before DMA ends for ATAPI commands, and tests have
to be adjusted.

This is also hinted by the description of the command header in the AHCI
specification, where the "A" bit is described as

When ‘1’, indicates that a PIO setup FIS shall be sent by the device
indicating a transfer for the ATAPI command.

and also by the description of the ACMD (ATAPI command region):

The ATAPI command must be either 12 or 16 bytes in length. The length
transmitted by the HBA is determined by the PIO setup FIS that is sent
by the device requesting the ATAPI command.

QEMU, which conflates the "generator" and the "receiver" of the FIS into
one device, always uses ATAPI_PACKET_SIZE, aka 12, for the length.

Signed-off-by: Paolo Bonzini 
---
 hw/ide/ahci.c   | 15 ++-
 tests/libqos/ahci.c | 30 ++
 tests/libqos/ahci.h |  2 ++
 3 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index e22d7be05f..45ce195fb8 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -1291,11 +1291,12 @@ static void ahci_start_transfer(IDEDMA *dma)
 int is_write = opts & AHCI_CMD_WRITE;
 int is_atapi = opts & AHCI_CMD_ATAPI;
 int has_sglist = 0;
+uint16_t fis_len;
 
 if (is_atapi && !ad->done_atapi_packet) {
 /* already prepopulated iobuffer */
 ad->done_atapi_packet = true;
-size = 0;
+fis_len = size;
 goto out;
 }
 
@@ -1315,19 +1316,15 @@ static void ahci_start_transfer(IDEDMA *dma)
 }
 }
 
+/* Update number of transferred bytes, destroy sglist */
+dma_buf_commit(s, size);
+fis_len = le32_to_cpu(ad->cur_cmd->status);
 out:
 /* declare that we processed everything */
 s->data_ptr = s->data_end;
-
-/* Update number of transferred bytes, destroy sglist */
-dma_buf_commit(s, size);
+ahci_write_fis_pio(ad, fis_len);
 
 s->end_transfer_func(s);
-
-if (!(s->status & DRQ_STAT)) {
-/* done with PIO send/receive */
-ahci_write_fis_pio(ad, le32_to_cpu(ad->cur_cmd->status));
-}
 }
 
 static void ahci_start_dma(IDEDMA *dma, IDEState *s,
diff --git a/tests/libqos/ahci.c b/tests/libqos/ahci.c
index bc201d762b..04f33e246c 100644
--- a/tests/libqos/ahci.c
+++ b/tests/libqos/ahci.c
@@ -477,6 +477,23 @@ void ahci_port_check_d2h_sanity(AHCIQState *ahci, uint8_t port, uint8_t slot)
 g_free(d2h);
 }
 
+void ahci_port_check_pio_sanity_atapi(AHCIQState *ahci, uint8_t port,
+  uint8_t slot)
+{
+PIOSetupFIS *pio = g_malloc0(0x20);
+
+/* We cannot check the Status or E_Status registers, because
+ * the status may have again changed between the PIO Setup FIS
+ * and the conclusion of the command with the D2H Register FIS. */
+qtest_memread(ahci->parent->qts, ahci->port[port].fb + 0x20, pio, 0x20);
+g_assert_cmphex(pio->fis_type, ==, 0x5f);
+
+g_assert(le16_to_cpu(pio->tx_count) == 12 ||
+ le16_to_cpu(pio->tx_count) == 16);
+
+g_free(pio);
+}
+
 void ahci_port_check_pio_sanity(AHCIQState *ahci, uint8_t port,
 uint8_t slot, size_t buffsize)
 {
@@ -831,9 +848,9 @@ void ahci_command_enable_atapi_dma(AHCICommand *cmd)
 RegH2DFIS *fis = &(cmd->fis);
 g_assert(cmd->props->atapi);
 fis->feature_low |= 0x01;
-cmd->interrupts &= ~AHCI_PX_IS_PSS;
+/* PIO is still used to transfer the ATAPI command */
+g_assert(cmd->props->pio);
 cmd->props->dma = true;
-cmd->props->pio = false;
 /* BUG: We expect the DMA Setup interrupt for DMA commands */
 /* cmd->interrupts |= AHCI_PX_IS_DSS; */
 }
@@ -845,7 +862,7 @@ AHCICommand *ahci_command_create(uint8_t command_name)
 
 g_assert(props);
 cmd = g_new0(AHCICommand, 1);
-g_assert(!(props->dma && props->pio));
+g_assert(!(props->dma && props->pio) || props->atapi);
 g_assert(!(props->lba28 && props->lba48));
 g_assert(!(props->read && props->write));
 g_assert(!props->size || props->data);
@@ -1216,7 +1233,12 @@ void ahci_command_verify(AHCIQState *ahci, AHCICommand *cmd)
 ahci_port_check_d2h_sanity(ahci, port, slot);
 }
 if (cmd->props->pio) {
-ahci_port_check_pio_sanity(ahci, port, slot, cmd->xbytes);
+/* For ATAPI, we might have only used PIO for the command.  */
+if (cmd->props->atapi && (!cmd->xbytes || cmd->props->dma)) {
+ahci_port_check_pio_sanity_atapi(ahci, port, slot);
+} else {
+ahci_port_check_pio_sanity(ahci, port, slot, cmd->xbytes);
+}
 }
 }
 
diff --git a/tests/libqos/ahci.h b/tests/libqos/ahci.h
index 715ca1e226..cdba8099dd 100644
--- a/tests/libqos/ahci.h
+++ b/tests/libqos/ahci.h
@@ -598,6 +598,8 @@ void ahci_port_check_nonbusy(AHCIQState *ahci, 

[Qemu-block] [PATCH 2/6] ide: push end_transfer_func out of start_transfer callback, rename callback

2018-04-17 Thread Paolo Bonzini
Now that end_transfer_func is a tail call in ahci_start_transfer,
formalize the fact that the callback (of which ahci_start_transfer is
the sole implementation) takes care of the transfer too: rename it to
pio_transfer and, if it is present, call the end_transfer_func as soon
as it returns.

Signed-off-by: Paolo Bonzini 
---
 hw/ide/ahci.c | 15 +++
 hw/ide/core.c |  8 +---
 hw/ide/trace-events   |  2 +-
 include/hw/ide/internal.h |  2 +-
 4 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index 45ce195fb8..10bcc65308 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -1280,8 +1280,8 @@ out:
 return 0;
 }
 
-/* DMA dev <-> ram */
-static void ahci_start_transfer(IDEDMA *dma)
+/* Transfer PIO data between RAM and device */
+static void ahci_pio_transfer(IDEDMA *dma)
 {
 AHCIDevice *ad = DO_UPCAST(AHCIDevice, dma, dma);
 IDEState *s = &ad->port.ifs[0];
@@ -1304,9 +1304,9 @@ static void ahci_start_transfer(IDEDMA *dma)
 has_sglist = 1;
 }
 
-trace_ahci_start_transfer(ad->hba, ad->port_no, is_write ? "writ" : "read",
-  size, is_atapi ? "atapi" : "ata",
-  has_sglist ? "" : "o");
+trace_ahci_pio_transfer(ad->hba, ad->port_no, is_write ? "writ" : "read",
+size, is_atapi ? "atapi" : "ata",
+has_sglist ? "" : "o");
 
 if (has_sglist && size) {
 if (is_write) {
@@ -1319,12 +1319,11 @@ static void ahci_start_transfer(IDEDMA *dma)
 /* Update number of transferred bytes, destroy sglist */
 dma_buf_commit(s, size);
 fis_len = le32_to_cpu(ad->cur_cmd->status);
+
 out:
 /* declare that we processed everything */
 s->data_ptr = s->data_end;
 ahci_write_fis_pio(ad, fis_len);
-
-s->end_transfer_func(s);
 }
 
 static void ahci_start_dma(IDEDMA *dma, IDEState *s,
@@ -1440,7 +1439,7 @@ static const IDEDMAOps ahci_dma_ops = {
 .start_dma = ahci_start_dma,
 .restart = ahci_restart,
 .restart_dma = ahci_restart_dma,
-.start_transfer = ahci_start_transfer,
+.pio_transfer = ahci_pio_transfer,
 .prepare_buf = ahci_dma_prepare_buf,
 .commit_buf = ahci_commit_buf,
 .rw_buf = ahci_dma_rw_buf,
diff --git a/hw/ide/core.c b/hw/ide/core.c
index 866c659498..7932b7c069 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -527,16 +527,18 @@ static void ide_clear_retry(IDEState *s)
 void ide_transfer_start(IDEState *s, uint8_t *buf, int size,
 EndTransferFunc *end_transfer_func)
 {
-s->end_transfer_func = end_transfer_func;
 s->data_ptr = buf;
 s->data_end = buf + size;
 ide_set_retry(s);
 if (!(s->status & ERR_STAT)) {
 s->status |= DRQ_STAT;
 }
-if (s->bus->dma->ops->start_transfer) {
-s->bus->dma->ops->start_transfer(s->bus->dma);
+if (!s->bus->dma->ops->pio_transfer) {
+s->end_transfer_func = end_transfer_func;
+return;
 }
+s->bus->dma->ops->pio_transfer(s->bus->dma);
+end_transfer_func(s);
 }
 
 static void ide_cmd_done(IDEState *s)
diff --git a/hw/ide/trace-events b/hw/ide/trace-events
index 0c39cabe72..77814618ee 100644
--- a/hw/ide/trace-events
+++ b/hw/ide/trace-events
@@ -101,7 +101,7 @@ handle_cmd_badport(void *s, int port) "ahci(%p)[%d]: guest accessed unused port"
 handle_cmd_badfis(void *s, int port) "ahci(%p)[%d]: guest provided an invalid cmd FIS"
 handle_cmd_badmap(void *s, int port, uint64_t len) "ahci(%p)[%d]: dma_memory_map failed, 0x%02"PRIx64" != 0x80"
 handle_cmd_unhandled_fis(void *s, int port, uint8_t b0, uint8_t b1, uint8_t b2) "ahci(%p)[%d]: unhandled FIS type. cmd_fis: 0x%02x-%02x-%02x"
-ahci_start_transfer(void *s, int port, const char *rw, uint32_t size, const char *tgt, const char *sgl) "ahci(%p)[%d]: %sing %d bytes on %s w/%s sglist"
+ahci_pio_transfer(void *s, int port, const char *rw, uint32_t size, const char *tgt, const char *sgl) "ahci(%p)[%d]: %sing %d bytes on %s w/%s sglist"
 ahci_start_dma(void *s, int port) "ahci(%p)[%d]: start dma"
 ahci_dma_prepare_buf(void *s, int port, int32_t io_buffer_size, int32_t limit) "ahci(%p)[%d]: prepare buf limit=%"PRId32" prepared=%"PRId32
 ahci_dma_prepare_buf_fail(void *s, int port) "ahci(%p)[%d]: sglist population failed"
diff --git a/include/hw/ide/internal.h b/include/hw/ide/internal.h
index 88212f59df..f3de6f9b73 100644
--- a/include/hw/ide/internal.h
+++ b/include/hw/ide/internal.h
@@ -444,7 +444,7 @@ struct IDEState {
 
 struct IDEDMAOps {
 DMAStartFunc *start_dma;
-DMAVoidFunc *start_transfer;
+DMAVoidFunc *pio_transfer;
 DMAInt32Func *prepare_buf;
 DMAu32Func *commit_buf;
 DMAIntFunc *rw_buf;
-- 
2.17.0





[Qemu-block] [PATCH 5/6] atapi: call ide_set_irq before ide_transfer_start

2018-04-17 Thread Paolo Bonzini
The ATAPI_INT_REASON_IO interrupt is raised when I/O starts, but in the
AHCI case ide_set_irq was actually called at the end of a mutual recursion.
Move it early, with the side effect that ide_transfer_start becomes a tail
call in ide_atapi_cmd_reply_end.

Signed-off-by: Paolo Bonzini 
---
 hw/ide/atapi.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index c0509c8bf5..7168ff55a7 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -287,6 +287,7 @@ void ide_atapi_cmd_reply_end(IDEState *s)
 } else {
 /* a new transfer is needed */
 s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO;
+ide_set_irq(s->bus);
 byte_count_limit = atapi_byte_count_limit(s);
 trace_ide_atapi_cmd_reply_end_bcl(s, byte_count_limit);
 size = s->packet_transfer_size;
@@ -304,13 +305,12 @@ void ide_atapi_cmd_reply_end(IDEState *s)
 if (size > (s->cd_sector_size - s->io_buffer_index))
 size = (s->cd_sector_size - s->io_buffer_index);
 }
+trace_ide_atapi_cmd_reply_end_new(s, s->status);
 s->packet_transfer_size -= size;
 s->elementary_transfer_size -= size;
 s->io_buffer_index += size;
 ide_transfer_start(s, s->io_buffer + s->io_buffer_index - size,
size, ide_atapi_cmd_reply_end);
-ide_set_irq(s->bus);
-trace_ide_atapi_cmd_reply_end_new(s, s->status);
 }
 }
 }
-- 
2.17.0





[Qemu-block] [PATCH 4/6] ide: make ide_transfer_stop idempotent

2018-04-17 Thread Paolo Bonzini
There is code checking s->end_transfer_func and it was not taught about
ide_transfer_cancel.  We can just use ide_transfer_stop because
s->end_transfer_func is only ever called in the DRQ phase.

ide_transfer_cancel can then be removed, since it would just be
calling ide_transfer_halt.

Signed-off-by: Paolo Bonzini 
---
 hw/ide/core.c | 14 --
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/hw/ide/core.c b/hw/ide/core.c
index edda171b47..bc3648eb13 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -548,10 +548,9 @@ static void ide_cmd_done(IDEState *s)
 }
 }
 
-static void ide_transfer_halt(IDEState *s,
-  void(*end_transfer_func)(IDEState *))
+static void ide_transfer_halt(IDEState *s)
 {
-s->end_transfer_func = end_transfer_func;
+s->end_transfer_func = ide_transfer_stop;
 s->data_ptr = s->io_buffer;
 s->data_end = s->io_buffer;
 s->status &= ~DRQ_STAT;
@@ -559,15 +558,10 @@ static void ide_transfer_halt(IDEState *s,
 
 void ide_transfer_stop(IDEState *s)
 {
-ide_transfer_halt(s, ide_transfer_stop);
+ide_transfer_halt(s);
 ide_cmd_done(s);
 }
 
-static void ide_transfer_cancel(IDEState *s)
-{
-ide_transfer_halt(s, ide_transfer_cancel);
-}
-
 int64_t ide_get_sector(IDEState *s)
 {
 int64_t sector_num;
@@ -1362,7 +1356,7 @@ static bool cmd_nop(IDEState *s, uint8_t cmd)
 static bool cmd_device_reset(IDEState *s, uint8_t cmd)
 {
 /* Halt PIO (in the DRQ phase), then DMA */
-ide_transfer_cancel(s);
+ide_transfer_halt(s);
 ide_cancel_dma_sync(s);
 
 /* Reset any PIO commands, reset signature, etc */
-- 
2.17.0





[Qemu-block] [PATCH 0/6] atapi: change unlimited recursion to while loop

2018-04-17 Thread Paolo Bonzini
Real hardware doesn't have an unlimited stack, so the unlimited
recursion in the ATAPI code smells a bit.  In fact, the call to
ide_transfer_start easily becomes a tail call with a small change
to the code (patch 5); however, we also need to turn the call back to
ide_atapi_cmd_reply_end into another tail call before turning the
(double) tail recursion into a while loop.

In particular, patch 1 ensures that the call to the end_transfer_func is
the last thing in ide_transfer_start.  To do so, it moves the write of
the PIO Setup FIS before the PIO transfer, which actually makes sense:
the FIS is sent by the device to inform the AHCI about the transfer,
so it cannot come after!  This is the main change from the RFC, and
it simplifies the rest of the series (the RFC had to introduce an
"end_transfer" callback just for writing the PIO Setup FIS).

I tested this manually with READ CD commands sent through sg_raw,
and the existing AHCI tests still pass.

Paolo Bonzini (6):
  ahci: move PIO Setup FIS before transfer, fix it for ATAPI commands
  ide: push end_transfer_func out of start_transfer callback, rename
callback
  ide: call ide_cmd_done from ide_transfer_stop
  ide: make ide_transfer_stop idempotent
  atapi: call ide_set_irq before ide_transfer_start
  ide: introduce ide_transfer_start_norecurse

 hw/ide/ahci.c | 30 --
 hw/ide/atapi.c| 44 +--
 hw/ide/core.c | 39 +-
 hw/ide/trace-events   |  2 +-
 include/hw/ide/internal.h |  4 +++-
 tests/libqos/ahci.c   | 30 ++
 tests/libqos/ahci.h   |  2 ++
 7 files changed, 89 insertions(+), 62 deletions(-)

-- 
2.17.0




[Qemu-block] [PATCH 3/6] ide: call ide_cmd_done from ide_transfer_stop

2018-04-17 Thread Paolo Bonzini
The code can simply be moved to the sole caller that has notify == true.

Signed-off-by: Paolo Bonzini 
---
 hw/ide/core.c | 11 ---
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/hw/ide/core.c b/hw/ide/core.c
index 7932b7c069..edda171b47 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -549,26 +549,23 @@ static void ide_cmd_done(IDEState *s)
 }
 
 static void ide_transfer_halt(IDEState *s,
-  void(*end_transfer_func)(IDEState *),
-  bool notify)
+  void(*end_transfer_func)(IDEState *))
 {
 s->end_transfer_func = end_transfer_func;
 s->data_ptr = s->io_buffer;
 s->data_end = s->io_buffer;
 s->status &= ~DRQ_STAT;
-if (notify) {
-ide_cmd_done(s);
-}
 }
 
 void ide_transfer_stop(IDEState *s)
 {
-ide_transfer_halt(s, ide_transfer_stop, true);
+ide_transfer_halt(s, ide_transfer_stop);
+ide_cmd_done(s);
 }
 
 static void ide_transfer_cancel(IDEState *s)
 {
-ide_transfer_halt(s, ide_transfer_cancel, false);
+ide_transfer_halt(s, ide_transfer_cancel);
 }
 
 int64_t ide_get_sector(IDEState *s)
-- 
2.17.0





Re: [Qemu-block] [Qemu-devel] [PATCH v5 00/21] blockjobs: add explicit job management

2018-04-17 Thread Markus Armbruster
Forgot to mention: yes, I know this has been merged already.  I'm merely
trying to catch up with recent block layer progress.



Re: [Qemu-block] [Qemu-devel] [PATCH v5 00/21] blockjobs: add explicit job management

2018-04-17 Thread Markus Armbruster
John Snow  writes:

> This series seeks to address two distinct but closely related issues
> concerning the job management API.
>
> (1) For jobs that complete when a monitor is not attached and receiving
> events or notifications, there's no way to discern the job's final
> return code. Jobs must remain in the query list until dismissed
> for reliable management.
>
> (2) Jobs that change the block graph structure at an indeterminate point
> after the job starts compete with the management layer that relies
> on that graph structure to issue meaningful commands.
>
> This structure should change only at the behest of the management
> API, and not asynchronously at unknown points in time. Before a job
> issues such changes, it must rely on explicit and synchronous
> confirmation from the management API.
>
> These changes are implemented by formalizing a State Transition Machine
> for the BlockJob subsystem.
>
> Job States:
>
> UNDEFINED   Default state. Internal state only.
> CREATED Job has been created
> RUNNING Job has been started and is running
> PAUSED  Job is not ready and has been paused
> READY   Job is ready and is running
> STANDBY Job is ready and is paused
>
> WAITING Job is waiting on peers in transaction
> PENDING Job is waiting on ACK from QMP
> ABORTINGJob is aborting or has been cancelled
> CONCLUDED   Job has finished and has a retcode available
> NULLJob is being dismantled. Internal state only.
>
> Job Verbs:
>
> CANCEL  Instructs a running job to terminate with error,
> (Except when that job is READY, which produces no error.)
> PAUSE   Request a job to pause.
> RESUME  Request a job to resume from a pause.
> SET-SPEED   Change the speed limiting parameter of a job.
> COMPLETEAsk a READY job to finish and exit.
>
> FINALIZEAsk a PENDING job to perform its graph finalization.
> DISMISS Finish cleaning up an empty job.

For each job verb and job state: what's the new job state?

> And here's my stab at a diagram:
>
>  +-+
>  |UNDEFINED|
>  +--+--+
> |
>  +--v+
>+-+CREATED+-+
>| +--++ |
>||  |
>| +--++ +--+|
>+-+RUNNING<->PAUSED||
>| +--+-+--+ +--+|
>|| ||
>|| +--+ |
>||| |
>| +--v--+   +---+ | |
>+-+READY<--->STANDBY| | |
>| +--+--+   +---+ | |
>||| |
>| +--v+   | |
>+-+WAITING+---+ |
>| +--++ |
>||  |
>| +--v+ |
>+-+PENDING| |
>| +--++ |
>||  |
> +--v-+   +--v--+   |
> |ABORTING+--->CONCLUDED|   |
> ++   +--+--+   |
> |  |
>  +--v-+|
>  |NULL++
>  ++

Is this diagram missing a few arrowheads?  E.g. on the edge between
RUNNING and WAITING.

Might push the limits of ASCII art, but here goes anyway: can we label
the arrows with job verbs?

Can you briefly explain how this state machine addresses (1) and (2)?
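
To pin down the verb/state question, here is the kind of table I have
in mind -- a sketch of my own reading of the series; the identifiers
and the exact permission bits are made up for illustration and are
not taken from the QEMU tree (the internal-only UNDEFINED/NULL states
are omitted):

#include <stdbool.h>

enum State { CREATED, RUNNING, PAUSED, READY, STANDBY, WAITING,
             PENDING, ABORTING, CONCLUDED, STATE__MAX };
enum Verb  { CANCEL, PAUSE, RESUME, SET_SPEED, COMPLETE, FINALIZE,
             DISMISS, VERB__MAX };

/* verb_allowed[v][s]: may verb v be applied while the job is in state s? */
static const bool verb_allowed[VERB__MAX][STATE__MAX] = {
    /*              CRE RUN PAU RDY STB WAI PEN ABO CON */
    [CANCEL]     = { 1,  1,  1,  1,  1,  1,  1,  0,  0 },
    [COMPLETE]   = { 0,  0,  0,  1,  0,  0,  0,  0,  0 },
    [FINALIZE]   = { 0,  0,  0,  0,  0,  0,  1,  0,  0 },
    [DISMISS]    = { 0,  0,  0,  0,  0,  0,  0,  0,  1 },
    /* PAUSE/RESUME/SET_SPEED rows omitted; unlisted rows default to 0 */
};

static bool verb_is_valid(enum Verb v, enum State s)
{
    return verb_allowed[v][s]; /* reject invalid requests up front */
}

A second table of the same shape (state x state) would then answer
"what's the new job state" for each permitted transition.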



[Qemu-block] [PATCH v3 1/2] qcow2: Give the refcount cache the minimum possible size by default

2018-04-17 Thread Alberto Garcia
The L2 and refcount caches have default sizes that can be overridden
using the l2-cache-size and refcount-cache-size parameters (an
additional parameter named cache-size sets the combined size of both
caches).

Unless forced by one of the aforementioned parameters, QEMU will set
the unspecified sizes so that the L2 cache is 4 times larger than the
refcount cache.

This is based on the premise that the refcount metadata needs to be
only a fourth of the L2 metadata to cover the same amount of disk
space. This is incorrect for two reasons:

 a) The amount of disk covered by an L2 table depends solely on the
cluster size, but in the case of a refcount block it depends on
the cluster size *and* the width of each refcount entry.
The 4/1 ratio is only valid with 16-bit entries (the default).

 b) When we talk about disk space and L2 tables we are talking about
guest space (L2 tables map guest clusters to host clusters),
whereas refcount blocks are used for host clusters (including
L1/L2 tables and the refcount blocks themselves). On a fully
populated (and uncompressed) qcow2 file, image size > virtual size
so there are more refcount entries than L2 entries (a worked
example follows).
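
To make the arithmetic behind (a) concrete, here it is as a small
standalone program (illustration only, not part of this patch; it
assumes the default 64k clusters, 8-byte L2 entries and the refcount
entry widths from the qcow2 spec):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Guest bytes mapped by one L2 table: 8-byte entries, one per cluster. */
static uint64_t l2_coverage(uint64_t cluster_size)
{
    return (cluster_size / 8) * cluster_size;
}

/* Host bytes covered by one refcount block: one entry per host
 * cluster, each refcount_bits wide. */
static uint64_t refblock_coverage(uint64_t cluster_size,
                                  unsigned refcount_bits)
{
    return (cluster_size * 8 / refcount_bits) * cluster_size;
}

int main(void)
{
    uint64_t cs = 65536; /* default cluster size */

    printf("L2 table:      %" PRIu64 " MB\n", l2_coverage(cs) >> 20);
    printf("refblock (16): %" PRIu64 " MB\n",
           refblock_coverage(cs, 16) >> 20);
    printf("refblock (64): %" PRIu64 " MB\n",
           refblock_coverage(cs, 64) >> 20);
    return 0;
}

This prints 512, 2048 and 512: with 16-bit entries one refcount block
covers four times what one L2 table maps (hence the 4/1 ratio), but
with 64-bit entries the coverages are equal, so the fixed ratio gives
the refcount cache only a quarter of the coverage it would need.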

Problem (a) could be fixed by adjusting the algorithm to take the
refcount entry width into account. Problem (b) could be fixed by
increasing the refcount cache size a bit to account for the clusters
used for qcow2 metadata.

However, this patch takes a completely different approach: instead of
keeping a ratio between the two cache sizes, it assigns as much as
possible to the L2 cache and the remainder to the refcount cache.

The reason is that L2 tables are used for every single I/O request
from the guest, and the effect of increasing the cache is significant
and clearly measurable. Refcount blocks, however, are only used for
cluster allocation and internal snapshots, and in practice are
accessed sequentially in most cases, so the effect of increasing the
cache is negligible (even when doing random writes from the guest).

So, make the refcount cache as small as possible unless the user
explicitly asks for a larger one.

Signed-off-by: Alberto Garcia 
Reviewed-by: Eric Blake 
Reviewed-by: Max Reitz 
---
 block/qcow2.c  | 31 +++
 block/qcow2.h  |  4 
 tests/qemu-iotests/137.out |  2 +-
 3 files changed, 20 insertions(+), 17 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index ef68772aca..091088e09e 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -802,23 +802,30 @@ static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts,
 } else if (refcount_cache_size_set) {
 *l2_cache_size = combined_cache_size - *refcount_cache_size;
 } else {
-*refcount_cache_size = combined_cache_size
- / (DEFAULT_L2_REFCOUNT_SIZE_RATIO + 1);
-*l2_cache_size = combined_cache_size - *refcount_cache_size;
+uint64_t virtual_disk_size = bs->total_sectors * BDRV_SECTOR_SIZE;
+uint64_t max_l2_cache = virtual_disk_size / (s->cluster_size / 8);
+uint64_t min_refcount_cache =
+(uint64_t) MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
+
+/* Assign as much memory as possible to the L2 cache, and
+ * use the remainder for the refcount cache */
+if (combined_cache_size >= max_l2_cache + min_refcount_cache) {
+*l2_cache_size = max_l2_cache;
+*refcount_cache_size = combined_cache_size - *l2_cache_size;
+} else {
+*refcount_cache_size =
+MIN(combined_cache_size, min_refcount_cache);
+*l2_cache_size = combined_cache_size - *refcount_cache_size;
+}
 }
 } else {
-if (!l2_cache_size_set && !refcount_cache_size_set) {
+if (!l2_cache_size_set) {
 *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE,
  (uint64_t)DEFAULT_L2_CACHE_CLUSTERS
  * s->cluster_size);
-*refcount_cache_size = *l2_cache_size
- / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
-} else if (!l2_cache_size_set) {
-*l2_cache_size = *refcount_cache_size
-   * DEFAULT_L2_REFCOUNT_SIZE_RATIO;
-} else if (!refcount_cache_size_set) {
-*refcount_cache_size = *l2_cache_size
- / DEFAULT_L2_REFCOUNT_SIZE_RATIO;
+}
+if (!refcount_cache_size_set) {
+*refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE * s->cluster_size;
 }
 }
 
diff --git a/block/qcow2.h b/block/qcow2.h
index adf5c3950f..01b5250415 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -77,10 +77,6 @@
 #define DEFAULT_L2_CACHE_CLUSTERS 8 /* clusters */
 #define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */
 
-/* The 

[Qemu-block] [PATCH v3 0/2] Give the refcount cache the minimum possible size by default

2018-04-17 Thread Alberto Garcia
Hi,

we talked about this the other day, so here are the patches to change
the default cache sizes in qcow2.

Without this patch:

 * refcount-cache-size = l2-cache-size / 4

unless otherwise specified by the user. This is wasteful: the
refcount cache is accessed sequentially during normal I/O, so there's
no point in caching more tables. I measured the effect of the
refcount cache size when populating an empty qcow2 image using random
writes, and there's no difference between the minimum and the maximum
sizes (*).

With this patch:

 * refcount-cache-size is always 4 clusters by default (the minimum)

 * If "cache-size" is set then l2-cache-size is set to the maximum
   if possible (disk_size * 8 / cluster_size) and the remainder is
   assigned to the refcount cache (see the example below).
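
A worked example of the new behaviour (my numbers, not from the
patches): with a 64 GB image and the default 64k clusters the maximum
useful L2 cache size is 64 GB * 8 / 64 KB = 8 MB, so

   -drive file=hd.qcow2,cache-size=10M

gives 8 MB to the L2 cache and the remaining 2 MB to the refcount
cache.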

Regards,

Berto

(*) there is, actually: having a very large cache can even make the
I/O slightly slower, because the larger the cache, the longer it
takes to find a cached entry. I only noticed this under tmpfs
anyway.

Changes:
v3:
- Mention that if you use internal snapshots you may want to increase
  the cache size [Max]

v2: https://lists.gnu.org/archive/html/qemu-block/2018-03/msg00822.html
- s/overriden/overridden/ (in both patches)

v1: https://lists.gnu.org/archive/html/qemu-block/2018-03/msg00709.html
- Initial release

Alberto Garcia (2):
  qcow2: Give the refcount cache the minimum possible size by default
  docs: Document the new default sizes of the qcow2 caches

 block/qcow2.c  | 31 +++
 block/qcow2.h  |  4 
 docs/qcow2-cache.txt   | 33 -
 tests/qemu-iotests/137.out |  2 +-
 4 files changed, 36 insertions(+), 34 deletions(-)

-- 
2.11.0




[Qemu-block] [PATCH v3 2/2] docs: Document the new default sizes of the qcow2 caches

2018-04-17 Thread Alberto Garcia
We have just reduced the refcount cache size to the minimum unless
the user explicitly requests a larger one, so we have to update the
documentation to reflect this change.

Signed-off-by: Alberto Garcia 
---
 docs/qcow2-cache.txt | 33 -
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/docs/qcow2-cache.txt b/docs/qcow2-cache.txt
index 170191a242..8a09a5cc5f 100644
--- a/docs/qcow2-cache.txt
+++ b/docs/qcow2-cache.txt
@@ -116,31 +116,30 @@ There are three options available, and all of them take bytes:
 "refcount-cache-size":   maximum size of the refcount block cache
 "cache-size":maximum size of both caches combined
 
-There are two things that need to be taken into account:
+There are a few things that need to be taken into account:
 
  - Both caches must have a size that is a multiple of the cluster size
(or the cache entry size: see "Using smaller cache sizes" below).
 
- - If you only set one of the options above, QEMU will automatically
-   adjust the others so that the L2 cache is 4 times bigger than the
-   refcount cache.
+ - The default L2 cache size is 8 clusters or 1MB (whichever is more),
+   and the minimum is 2 clusters (or 2 cache entries, see below).
 
-This means that these options are equivalent:
+ - The default (and minimum) refcount cache size is 4 clusters.
 
-   -drive file=hd.qcow2,l2-cache-size=2097152
-   -drive file=hd.qcow2,refcount-cache-size=524288
-   -drive file=hd.qcow2,cache-size=2621440
+ - If only "cache-size" is specified then QEMU will assign as much
+   memory as possible to the L2 cache before increasing the refcount
+   cache size.
 
-The reason for this 1/4 ratio is to ensure that both caches cover the
-same amount of disk space. Note however that this is only valid with
-the default value of refcount_bits (16). If you are using a different
-value you might want to calculate both cache sizes yourself since QEMU
-will always use the same 1/4 ratio.
+Unlike L2 tables, refcount blocks are not used during normal I/O but
+only during allocations and internal snapshots. In most cases they are
+accessed sequentially (even during random guest I/O) so increasing the
+refcount cache size won't have any measurable effect on performance
+(this can change if you are using internal snapshots, so you may want
+to think about increasing the cache size if you use them heavily).
 
-It's also worth mentioning that there's no strict need for both caches
-to cover the same amount of disk space. The refcount cache is used
-much less often than the L2 cache, so it's perfectly reasonable to
-keep it small.
+Before QEMU 2.12 the refcount cache had a default size of 1/4 of the
+L2 cache size. This resulted in unnecessarily large caches, so now the
+refcount cache is as small as possible unless overridden by the user.
 
 
 Using smaller cache entries
-- 
2.11.0