date:20181101

Re: [Qemu-block] [PATCH] block/nvme: optimize the performance of nvme driver based on vfio-pci

2018-11-01 Thread Fam Zheng

On Thu, 11/01 18:38, Li Feng wrote:
> When the IO size is larger than 2 pages, we move the the pointer one by
> one in the pagelist, this is inefficient.
> 
> This is a simple benchmark result:
> 
> Before:
> $ qemu-io -c 'write 0 1G' nvme://:00:04.0/1
> 
> wrote 1073741824/1073741824 bytes at offset 0
> 1 GiB, 1 ops; 0:00:02.41 (424.504 MiB/sec and 0.4146 ops/sec)
> 
>  $ qemu-io -c 'read 0 1G' nvme://:00:04.0/1
> 
> read 1073741824/1073741824 bytes at offset 0
> 1 GiB, 1 ops; 0:00:02.03 (503.055 MiB/sec and 0.4913 ops/sec)
> 
> After:
> $ qemu-io -c 'write 0 1G' nvme://:00:04.0/1
> 
> wrote 1073741824/1073741824 bytes at offset 0
> 1 GiB, 1 ops; 0:00:02.17 (471.517 MiB/sec and 0.4605 ops/sec)
> 
>  $ qemu-io -c 'read 0 1G' nvme://:00:04.0/1   
>   
>   
>1 ↵
> 
> read 1073741824/1073741824 bytes at offset 0
> 1 GiB, 1 ops; 0:00:01.94 (526.770 MiB/sec and 0.5144 ops/sec)
> 
> Signed-off-by: Li Feng 
> ---
>  block/nvme.c | 16 ++--
>  1 file changed, 6 insertions(+), 10 deletions(-)
> 
> diff --git a/block/nvme.c b/block/nvme.c
> index 29294038fc..982097b5b1 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c
> @@ -837,7 +837,7 @@ try_map:
>  }
>  
>  for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
> -pagelist[entries++] = iova + j * s->page_size;
> +pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
>  }
>  trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
>  qiov->iov[i].iov_len / s->page_size);
> @@ -850,20 +850,16 @@ try_map:
>  case 0:
>  abort();
>  case 1:
> -cmd->prp1 = cpu_to_le64(pagelist[0]);
> +cmd->prp1 = pagelist[0];
>  cmd->prp2 = 0;
>  break;
>  case 2:
> -cmd->prp1 = cpu_to_le64(pagelist[0]);
> -cmd->prp2 = cpu_to_le64(pagelist[1]);;
> +cmd->prp1 = pagelist[0];
> +cmd->prp2 = pagelist[1];
>  break;
>  default:
> -cmd->prp1 = cpu_to_le64(pagelist[0]);
> -cmd->prp2 = cpu_to_le64(req->prp_list_iova);
> -for (i = 0; i < entries - 1; ++i) {
> -pagelist[i] = cpu_to_le64(pagelist[i + 1]);
> -}
> -pagelist[entries - 1] = 0;
> +cmd->prp1 = pagelist[0];
> +cmd->prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
>  break;
>  }
>  trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
> -- 
> 2.11.0
> 

Nice! Thanks. I've queued the patch.

Fam

[Qemu-block] [PATCH] nvme: fix oob access issue(CVE-2018-16847)

2018-11-01 Thread Li Qiang

Currently, the nvme_cmb_ops mr doesn't check the addr and size.
This can lead an oob access issue. This is triggerable in the guest.
Add check to avoid this issue.

Fixes CVE-2018-16847.

Reported-by: Li Qiang 
Reviewed-by: Paolo Bonzini 
Signed-off-by: Li Qiang 
---
 hw/block/nvme.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index fc7dacb..d097add 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -1175,6 +1175,10 @@ static void nvme_cmb_write(void *opaque, hwaddr addr, 
uint64_t data,
 unsigned size)
 {
 NvmeCtrl *n = (NvmeCtrl *)opaque;
+
+if (addr + size > NVME_CMBSZ_GETSIZE(n->bar.cmbsz)) {
+return;
+}
 memcpy(&n->cmbuf[addr], &data, size);
 }
 
@@ -1183,6 +1187,9 @@ static uint64_t nvme_cmb_read(void *opaque, hwaddr addr, 
unsigned size)
 uint64_t val;
 NvmeCtrl *n = (NvmeCtrl *)opaque;
 
+if (addr + size > NVME_CMBSZ_GETSIZE(n->bar.cmbsz)) {
+return 0;
+}
 memcpy(&val, &n->cmbuf[addr], size);
 return val;
 }
-- 
1.8.3.1

[Qemu-block] [PATCH 4/7] qcow2: refactor decompress_buffer

2018-11-01 Thread Vladimir Sementsov-Ogievskiy

- make it look more like a pair of qcow2_compress - rename the function
  and its parameters
- drop extra out_len variable, check filling of output buffer by strm
  structure itself
- fix code style
- add some documentation

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2.c | 56 ---
 1 file changed, 35 insertions(+), 21 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index b45a6c4135..e9d24b801e 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3758,32 +3758,46 @@ static ssize_t qcow2_compress(void *dest, size_t 
dest_size,
 return ret;
 }
 
-static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
- const uint8_t *buf, int buf_size)
+/* qcow2_decompress()
+ *
+ * Decompress some data (not more than @src_size bytes) to produce exactly
+ * @dest_size bytes.
+ *
+ * @dest - destination buffer, @dest_size bytes
+ * @src - source buffer, @src_size bytes
+ *
+ * Returns: 0 on success
+ *  -1 on fail
+ */
+static ssize_t qcow2_decompress(void *dest, size_t dest_size,
+const void *src, size_t src_size)
 {
-z_stream strm1, *strm = &strm1;
-int ret, out_len;
-
-memset(strm, 0, sizeof(*strm));
+int ret = 0;
+z_stream strm;
 
-strm->next_in = (uint8_t *)buf;
-strm->avail_in = buf_size;
-strm->next_out = out_buf;
-strm->avail_out = out_buf_size;
+memset(&strm, 0, sizeof(strm));
+strm.avail_in = src_size;
+strm.next_in = (void *) src;
+strm.avail_out = dest_size;
+strm.next_out = dest;
 
-ret = inflateInit2(strm, -12);
+ret = inflateInit2(&strm, -12);
 if (ret != Z_OK) {
 return -1;
 }
-ret = inflate(strm, Z_FINISH);
-out_len = strm->next_out - out_buf;
-if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
-out_len != out_buf_size) {
-inflateEnd(strm);
-return -1;
+
+ret = inflate(&strm, Z_FINISH);
+if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || strm.avail_out != 0) {
+/* We approve Z_BUF_ERROR because we need @dest buffer to be filled, 
but
+ * @src buffer may be processed partly (because in qcow2 we know size 
of
+ * compressed data with precision of one sector)
+ */
+ret = -1;
 }
-inflateEnd(strm);
-return 0;
+
+inflateEnd(&strm);
+
+return ret;
 }
 
 #define MAX_COMPRESS_THREADS 4
@@ -3973,8 +3987,8 @@ int qcow2_decompress_cluster(BlockDriverState *bs, 
uint64_t cluster_offset)
 if (ret < 0) {
 return ret;
 }
-if (decompress_buffer(s->cluster_cache, s->cluster_size,
-  s->cluster_data + sector_offset, csize) < 0) {
+if (qcow2_decompress(s->cluster_cache, s->cluster_size,
+ s->cluster_data + sector_offset, csize) < 0) {
 return -EIO;
 }
 s->cluster_cache_offset = coffset;
-- 
2.18.0

[Qemu-block] [PATCH 5/7] qcow2: use byte-based read in qcow2_decompress_cluster

2018-11-01 Thread Vladimir Sementsov-Ogievskiy

We are gradually moving away from sector-based interfaces, towards
byte-based.  Get rid of it here too.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2.c | 16 ++--
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index e9d24b801e..950b9f7ec6 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3956,14 +3956,15 @@ fail:
 int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
 {
 BDRVQcow2State *s = bs->opaque;
-int ret, csize, nb_csectors, sector_offset;
+int ret, csize, nb_csectors;
 uint64_t coffset;
+struct iovec iov;
+QEMUIOVector local_qiov;
 
 coffset = cluster_offset & s->cluster_offset_mask;
 if (s->cluster_cache_offset != coffset) {
 nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
-sector_offset = coffset & 511;
-csize = nb_csectors * 512 - sector_offset;
+csize = nb_csectors * 512 - (coffset & 511);
 
 /* Allocate buffers on first decompress operation, most images are
  * uncompressed and the memory overhead can be avoided.  The buffers
@@ -3981,14 +3982,17 @@ int qcow2_decompress_cluster(BlockDriverState *bs, 
uint64_t cluster_offset)
 s->cluster_cache = g_malloc(s->cluster_size);
 }
 
+iov.iov_base = s->cluster_data;
+iov.iov_len = csize;
+qemu_iovec_init_external(&local_qiov, &iov, 1);
+
 BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
-ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data,
-nb_csectors);
+ret = bdrv_co_preadv(bs->file, coffset, csize, &local_qiov, 0);
 if (ret < 0) {
 return ret;
 }
 if (qcow2_decompress(s->cluster_cache, s->cluster_size,
- s->cluster_data + sector_offset, csize) < 0) {
+ s->cluster_data, csize) < 0) {
 return -EIO;
 }
 s->cluster_cache_offset = coffset;
-- 
2.18.0

[Qemu-block] [PATCH 3/7] qcow2: move decompression from qcow2-cluster.c to qcow2.c

2018-11-01 Thread Vladimir Sementsov-Ogievskiy

Compression is done in threads in qcow2.c. We want to do decompression
in the same way, so, firstly, move it to the same file.

The only change is braces around if-body in decompress_buffer, to
satisfy checkpatch.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2-cluster.c | 70 --
 block/qcow2.c | 71 +++
 2 files changed, 71 insertions(+), 70 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index d37fe08b3d..e2737429f5 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -1571,76 +1571,6 @@ again:
 return 0;
 }
 
-static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
- const uint8_t *buf, int buf_size)
-{
-z_stream strm1, *strm = &strm1;
-int ret, out_len;
-
-memset(strm, 0, sizeof(*strm));
-
-strm->next_in = (uint8_t *)buf;
-strm->avail_in = buf_size;
-strm->next_out = out_buf;
-strm->avail_out = out_buf_size;
-
-ret = inflateInit2(strm, -12);
-if (ret != Z_OK)
-return -1;
-ret = inflate(strm, Z_FINISH);
-out_len = strm->next_out - out_buf;
-if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
-out_len != out_buf_size) {
-inflateEnd(strm);
-return -1;
-}
-inflateEnd(strm);
-return 0;
-}
-
-int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
-{
-BDRVQcow2State *s = bs->opaque;
-int ret, csize, nb_csectors, sector_offset;
-uint64_t coffset;
-
-coffset = cluster_offset & s->cluster_offset_mask;
-if (s->cluster_cache_offset != coffset) {
-nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
-sector_offset = coffset & 511;
-csize = nb_csectors * 512 - sector_offset;
-
-/* Allocate buffers on first decompress operation, most images are
- * uncompressed and the memory overhead can be avoided.  The buffers
- * are freed in .bdrv_close().
- */
-if (!s->cluster_data) {
-/* one more sector for decompressed data alignment */
-s->cluster_data = qemu_try_blockalign(bs->file->bs,
-QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size + 512);
-if (!s->cluster_data) {
-return -ENOMEM;
-}
-}
-if (!s->cluster_cache) {
-s->cluster_cache = g_malloc(s->cluster_size);
-}
-
-BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED);
-ret = bdrv_read(bs->file, coffset >> 9, s->cluster_data,
-nb_csectors);
-if (ret < 0) {
-return ret;
-}
-if (decompress_buffer(s->cluster_cache, s->cluster_size,
-  s->cluster_data + sector_offset, csize) < 0) {
-return -EIO;
-}
-s->cluster_cache_offset = coffset;
-}
-return 0;
-}
-
 /*
  * This discards as many clusters of nb_clusters as possible at once (i.e.
  * all clusters in the same L2 slice) and returns the number of discarded
diff --git a/block/qcow2.c b/block/qcow2.c
index 9eab2dbfb6..b45a6c4135 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3758,6 +3758,34 @@ static ssize_t qcow2_compress(void *dest, size_t 
dest_size,
 return ret;
 }
 
+static int decompress_buffer(uint8_t *out_buf, int out_buf_size,
+ const uint8_t *buf, int buf_size)
+{
+z_stream strm1, *strm = &strm1;
+int ret, out_len;
+
+memset(strm, 0, sizeof(*strm));
+
+strm->next_in = (uint8_t *)buf;
+strm->avail_in = buf_size;
+strm->next_out = out_buf;
+strm->avail_out = out_buf_size;
+
+ret = inflateInit2(strm, -12);
+if (ret != Z_OK) {
+return -1;
+}
+ret = inflate(strm, Z_FINISH);
+out_len = strm->next_out - out_buf;
+if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) ||
+out_len != out_buf_size) {
+inflateEnd(strm);
+return -1;
+}
+inflateEnd(strm);
+return 0;
+}
+
 #define MAX_COMPRESS_THREADS 4
 
 typedef struct Qcow2CompressData {
@@ -3911,6 +3939,49 @@ fail:
 return ret;
 }
 
+int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
+{
+BDRVQcow2State *s = bs->opaque;
+int ret, csize, nb_csectors, sector_offset;
+uint64_t coffset;
+
+coffset = cluster_offset & s->cluster_offset_mask;
+if (s->cluster_cache_offset != coffset) {
+nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
+sector_offset = coffset & 511;
+csize = nb_csectors * 512 - sector_offset;
+
+/* Allocate buffers on first decompress operation, most images are
+ * uncompressed and the memory overhead can be avoided.  The buffers
+ * are freed in .bdrv_close().
+ */
+if (!s->cluster_data) {
+/* one more sector for decompressed data alignment */
+s->cluster_data =

[Qemu-block] [PATCH 6/7] qcow2: aio support for compressed cluster read

2018-11-01 Thread Vladimir Sementsov-Ogievskiy

Allocate buffers locally and release qcow2 lock. Than, reads inside
qcow2_co_preadv_compressed may be done in parallel, however all
decompression is still done synchronously. Let's improve it in the
following commit.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2.h |  4 ---
 block/qcow2.c | 93 ++-
 2 files changed, 47 insertions(+), 50 deletions(-)

diff --git a/block/qcow2.h b/block/qcow2.h
index 29c98d87a0..8495d2efbe 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -272,9 +272,6 @@ typedef struct BDRVQcow2State {
 QEMUTimer *cache_clean_timer;
 unsigned cache_clean_interval;
 
-uint8_t *cluster_cache;
-uint8_t *cluster_data;
-uint64_t cluster_cache_offset;
 QLIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs;
 
 uint64_t *refcount_table;
@@ -610,7 +607,6 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t 
min_size,
 bool exact_size);
 int qcow2_shrink_l1_table(BlockDriverState *bs, uint64_t max_size);
 int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
-int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
 int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num,
   uint8_t *buf, int nb_sectors, bool enc, Error 
**errp);
 
diff --git a/block/qcow2.c b/block/qcow2.c
index 950b9f7ec6..678d737044 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -74,6 +74,12 @@ typedef struct {
 #define  QCOW2_EXT_MAGIC_CRYPTO_HEADER 0x0537be77
 #define  QCOW2_EXT_MAGIC_BITMAPS 0x23852875
 
+static int qcow2_co_preadv_compressed(BlockDriverState *bs,
+  uint64_t file_cluster_offset,
+  uint64_t offset,
+  uint64_t bytes,
+  QEMUIOVector *qiov);
+
 static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename)
 {
 const QCowHeader *cow_header = (const void *)buf;
@@ -1410,7 +1416,6 @@ static int coroutine_fn qcow2_do_open(BlockDriverState 
*bs, QDict *options,
 goto fail;
 }
 
-s->cluster_cache_offset = -1;
 s->flags = flags;
 
 ret = qcow2_refcount_init(bs);
@@ -1910,15 +1915,15 @@ static coroutine_fn int 
qcow2_co_preadv(BlockDriverState *bs, uint64_t offset,
 break;
 
 case QCOW2_CLUSTER_COMPRESSED:
-/* add AIO support for compressed blocks ? */
-ret = qcow2_decompress_cluster(bs, cluster_offset);
+qemu_co_mutex_unlock(&s->lock);
+ret = qcow2_co_preadv_compressed(bs, cluster_offset,
+ offset, cur_bytes,
+ &hd_qiov);
+qemu_co_mutex_lock(&s->lock);
 if (ret < 0) {
 goto fail;
 }
 
-qemu_iovec_from_buf(&hd_qiov, 0,
-s->cluster_cache + offset_in_cluster,
-cur_bytes);
 break;
 
 case QCOW2_CLUSTER_NORMAL:
@@ -2054,8 +2059,6 @@ static coroutine_fn int qcow2_co_pwritev(BlockDriverState 
*bs, uint64_t offset,
 
 qemu_iovec_init(&hd_qiov, qiov->niov);
 
-s->cluster_cache_offset = -1; /* disable compressed cache */
-
 qemu_co_mutex_lock(&s->lock);
 
 while (bytes != 0) {
@@ -2219,8 +,6 @@ static void qcow2_close(BlockDriverState *bs)
 g_free(s->image_backing_file);
 g_free(s->image_backing_format);
 
-g_free(s->cluster_cache);
-qemu_vfree(s->cluster_data);
 qcow2_refcount_close(bs);
 qcow2_free_snapshots(bs);
 }
@@ -3397,7 +3398,6 @@ qcow2_co_copy_range_to(BlockDriverState *bs,
 QCowL2Meta *l2meta = NULL;
 
 assert(!bs->encrypted);
-s->cluster_cache_offset = -1; /* disable compressed cache */
 
 qemu_co_mutex_lock(&s->lock);
 
@@ -3953,51 +3953,52 @@ fail:
 return ret;
 }
 
-int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset)
+static int qcow2_co_preadv_compressed(BlockDriverState *bs,
+  uint64_t file_cluster_offset,
+  uint64_t offset,
+  uint64_t bytes,
+  QEMUIOVector *qiov)
 {
 BDRVQcow2State *s = bs->opaque;
-int ret, csize, nb_csectors;
+int ret = 0, csize, nb_csectors;
 uint64_t coffset;
+uint8_t *buf, *out_buf;
 struct iovec iov;
 QEMUIOVector local_qiov;
+int offset_in_cluster = offset_into_cluster(s, offset);
 
-coffset = cluster_offset & s->cluster_offset_mask;
-if (s->cluster_cache_offset != coffset) {
-nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1;
-csize = nb_csectors * 512 - (coffset & 511);
+coffset = file_cluster_offset & s->cluster_offset_mask;
+nb_csectors = ((file_cluster_offset >> s->csize_shift) & s->csize_mask) + 
1;
+

[Qemu-block] [PATCH 1/7] qcow2: use Z_OK instead of 0 for deflateInit2 return code check

2018-11-01 Thread Vladimir Sementsov-Ogievskiy

Use appropriate macro, corresponding to deflateInit2 spec.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 30689b7688..3e9367c449 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3734,7 +3734,7 @@ static ssize_t qcow2_compress(void *dest, const void 
*src, size_t size)
 memset(&strm, 0, sizeof(strm));
 ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, Z_DEFLATED,
-12, 9, Z_DEFAULT_STRATEGY);
-if (ret != 0) {
+if (ret != Z_OK) {
 return -2;
 }
 
-- 
2.18.0

[Qemu-block] [PATCH 7/7] qcow2: do decompression in threads

2018-11-01 Thread Vladimir Sementsov-Ogievskiy

Do decompression in threads, like it is already done for compression.
This improves asynchronous compressed reads performance.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2.c | 35 ---
 1 file changed, 28 insertions(+), 7 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 678d737044..3f1c773b13 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3802,20 +3802,24 @@ static ssize_t qcow2_decompress(void *dest, size_t 
dest_size,
 
 #define MAX_COMPRESS_THREADS 4
 
+typedef ssize_t (*Qcow2CompressFunc)(void *dest, size_t dest_size,
+ const void *src, size_t src_size);
 typedef struct Qcow2CompressData {
 void *dest;
 size_t dest_size;
 const void *src;
 size_t src_size;
 ssize_t ret;
+
+Qcow2CompressFunc func;
 } Qcow2CompressData;
 
 static int qcow2_compress_pool_func(void *opaque)
 {
 Qcow2CompressData *data = opaque;
 
-data->ret = qcow2_compress(data->dest, data->dest_size,
-   data->src, data->src_size);
+data->ret = data->func(data->dest, data->dest_size,
+   data->src, data->src_size);
 
 return 0;
 }
@@ -3825,10 +3829,10 @@ static void qcow2_compress_complete(void *opaque, int 
ret)
 qemu_coroutine_enter(opaque);
 }
 
-/* See qcow2_compress definition for parameters description */
-static ssize_t qcow2_co_compress(BlockDriverState *bs,
- void *dest, size_t dest_size,
- const void *src, size_t src_size)
+static ssize_t qcow2_co_do_compress(BlockDriverState *bs,
+void *dest, size_t dest_size,
+const void *src, size_t src_size,
+Qcow2CompressFunc func)
 {
 BDRVQcow2State *s = bs->opaque;
 BlockAIOCB *acb;
@@ -3838,6 +3842,7 @@ static ssize_t qcow2_co_compress(BlockDriverState *bs,
 .dest_size = dest_size,
 .src = src,
 .src_size = src_size,
+.func = func,
 };
 
 while (s->nb_compress_threads >= MAX_COMPRESS_THREADS) {
@@ -3860,6 +3865,22 @@ static ssize_t qcow2_co_compress(BlockDriverState *bs,
 return arg.ret;
 }
 
+static ssize_t qcow2_co_compress(BlockDriverState *bs,
+ void *dest, size_t dest_size,
+ const void *src, size_t src_size)
+{
+return qcow2_co_do_compress(bs, dest, dest_size, src, src_size,
+qcow2_compress);
+}
+
+static ssize_t qcow2_co_decompress(BlockDriverState *bs,
+   void *dest, size_t dest_size,
+   const void *src, size_t src_size)
+{
+return qcow2_co_do_compress(bs, dest, dest_size, src, src_size,
+qcow2_decompress);
+}
+
 /* XXX: put compressed sectors first, then all the cluster aligned
tables to avoid losing bytes in alignment */
 static coroutine_fn int
@@ -3987,7 +4008,7 @@ static int qcow2_co_preadv_compressed(BlockDriverState 
*bs,
 goto fail;
 }
 
-if (qcow2_decompress(out_buf, s->cluster_size, buf, csize) < 0) {
+if (qcow2_co_decompress(bs, out_buf, s->cluster_size, buf, csize) < 0) {
 ret = -EIO;
 goto fail;
 }
-- 
2.18.0

[Qemu-block] [PATCH 0/7] qcow2 decompress in threads

2018-11-01 Thread Vladimir Sementsov-Ogievskiy

Hi all!

The series brings threads to qcow2 decompression path, like it is
already done for compression.

Performance gain is illustrated by the following test:

[]# cat gen.sh
#!/bin/bash

echo 'create pattern-file /ssd/t_pat'

./qemu-img create -f raw /ssd/t_pat 10g
for i in {0..9}; do
./qemu-io -f raw -c "write -P 0xab ${i}g 1g" /ssd/t_pat
done

echo 'convert it to compressed /ssd/t_pat.compressed.qcow2'

./qemu-img convert -W -f raw -O qcow2 -c /ssd/t_pat /ssd/t_pat.compressed.qcow2
rm -f /ssd/t_pat

test:
[]# time ./qemu-img convert -f qcow2 --target-image-opts -n 
/ssd/t_pat.compressed.qcow2  'driver=null-co,size=10G'



result before the series:

real0m16.585s
user0m14.899s
sys 0m2.219s



result after the series:

real0m6.528s
user0m19.343s
sys 0m3.081s


Note: my cpu is 4-cores 8-threads i7-4790

Vladimir Sementsov-Ogievskiy (7):
  qcow2: use Z_OK instead of 0 for deflateInit2 return code check
  qcow2: make more generic interface for qcow2_compress
  qcow2: move decompression from qcow2-cluster.c to qcow2.c
  qcow2: refactor decompress_buffer
  qcow2: use byte-based read in qcow2_decompress_cluster
  qcow2: aio support for compressed cluster read
  qcow2: do decompression in threads

 block/qcow2.h |   4 -
 block/qcow2-cluster.c |  70 -
 block/qcow2.c | 169 +++---
 3 files changed, 143 insertions(+), 100 deletions(-)

-- 
2.18.0

[Qemu-block] [PATCH 2/7] qcow2: make more generic interface for qcow2_compress

2018-11-01 Thread Vladimir Sementsov-Ogievskiy

Give explicit size both for source and destination buffers, to make it
similar with decompression path and than cleanly reuse parameter
structure for decompression threads.

Signed-off-by: Vladimir Sementsov-Ogievskiy 
---
 block/qcow2.c | 30 ++
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 3e9367c449..9eab2dbfb6 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -3718,14 +3718,15 @@ fail:
 /*
  * qcow2_compress()
  *
- * @dest - destination buffer, at least of @size-1 bytes
- * @src - source buffer, @size bytes
+ * @dest - destination buffer, @dest_size bytes
+ * @src - source buffer, @src_size bytes
  *
  * Returns: compressed size on success
- *  -1 if compression is inefficient
+ *  -1 destination buffer is not enough to store compressed data
  *  -2 on any other error
  */
-static ssize_t qcow2_compress(void *dest, const void *src, size_t size)
+static ssize_t qcow2_compress(void *dest, size_t dest_size,
+  const void *src, size_t src_size)
 {
 ssize_t ret;
 z_stream strm;
@@ -3740,14 +3741,14 @@ static ssize_t qcow2_compress(void *dest, const void 
*src, size_t size)
 
 /* strm.next_in is not const in old zlib versions, such as those used on
  * OpenBSD/NetBSD, so cast the const away */
-strm.avail_in = size;
+strm.avail_in = src_size;
 strm.next_in = (void *) src;
-strm.avail_out = size - 1;
+strm.avail_out = dest_size;
 strm.next_out = dest;
 
 ret = deflate(&strm, Z_FINISH);
 if (ret == Z_STREAM_END) {
-ret = size - 1 - strm.avail_out;
+ret = dest_size - strm.avail_out;
 } else {
 ret = (ret == Z_OK ? -1 : -2);
 }
@@ -3761,8 +3762,9 @@ static ssize_t qcow2_compress(void *dest, const void 
*src, size_t size)
 
 typedef struct Qcow2CompressData {
 void *dest;
+size_t dest_size;
 const void *src;
-size_t size;
+size_t src_size;
 ssize_t ret;
 } Qcow2CompressData;
 
@@ -3770,7 +3772,8 @@ static int qcow2_compress_pool_func(void *opaque)
 {
 Qcow2CompressData *data = opaque;
 
-data->ret = qcow2_compress(data->dest, data->src, data->size);
+data->ret = qcow2_compress(data->dest, data->dest_size,
+   data->src, data->src_size);
 
 return 0;
 }
@@ -3782,15 +3785,17 @@ static void qcow2_compress_complete(void *opaque, int 
ret)
 
 /* See qcow2_compress definition for parameters description */
 static ssize_t qcow2_co_compress(BlockDriverState *bs,
- void *dest, const void *src, size_t size)
+ void *dest, size_t dest_size,
+ const void *src, size_t src_size)
 {
 BDRVQcow2State *s = bs->opaque;
 BlockAIOCB *acb;
 ThreadPool *pool = aio_get_thread_pool(bdrv_get_aio_context(bs));
 Qcow2CompressData arg = {
 .dest = dest,
+.dest_size = dest_size,
 .src = src,
-.size = size,
+.src_size = src_size,
 };
 
 while (s->nb_compress_threads >= MAX_COMPRESS_THREADS) {
@@ -3857,7 +3862,8 @@ qcow2_co_pwritev_compressed(BlockDriverState *bs, 
uint64_t offset,
 
 out_buf = g_malloc(s->cluster_size);
 
-out_len = qcow2_co_compress(bs, out_buf, buf, s->cluster_size);
+out_len = qcow2_co_compress(bs, out_buf, s->cluster_size - 1,
+buf, s->cluster_size);
 if (out_len == -2) {
 ret = -EINVAL;
 goto fail;
-- 
2.18.0

[Qemu-block] [PATCH] block/nvme: optimize the performance of nvme driver based on vfio-pci

2018-11-01 Thread Li Feng

When the IO size is larger than 2 pages, we move the the pointer one by
one in the pagelist, this is inefficient.

This is a simple benchmark result:

Before:
$ qemu-io -c 'write 0 1G' nvme://:00:04.0/1

wrote 1073741824/1073741824 bytes at offset 0
1 GiB, 1 ops; 0:00:02.41 (424.504 MiB/sec and 0.4146 ops/sec)

 $ qemu-io -c 'read 0 1G' nvme://:00:04.0/1

read 1073741824/1073741824 bytes at offset 0
1 GiB, 1 ops; 0:00:02.03 (503.055 MiB/sec and 0.4913 ops/sec)

After:
$ qemu-io -c 'write 0 1G' nvme://:00:04.0/1

wrote 1073741824/1073741824 bytes at offset 0
1 GiB, 1 ops; 0:00:02.17 (471.517 MiB/sec and 0.4605 ops/sec)

 $ qemu-io -c 'read 0 1G' nvme://:00:04.0/1 


 1 ↵

read 1073741824/1073741824 bytes at offset 0
1 GiB, 1 ops; 0:00:01.94 (526.770 MiB/sec and 0.5144 ops/sec)

Signed-off-by: Li Feng 
---
 block/nvme.c | 16 ++--
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/block/nvme.c b/block/nvme.c
index 29294038fc..982097b5b1 100644
--- a/block/nvme.c
+++ b/block/nvme.c
@@ -837,7 +837,7 @@ try_map:
 }
 
 for (j = 0; j < qiov->iov[i].iov_len / s->page_size; j++) {
-pagelist[entries++] = iova + j * s->page_size;
+pagelist[entries++] = cpu_to_le64(iova + j * s->page_size);
 }
 trace_nvme_cmd_map_qiov_iov(s, i, qiov->iov[i].iov_base,
 qiov->iov[i].iov_len / s->page_size);
@@ -850,20 +850,16 @@ try_map:
 case 0:
 abort();
 case 1:
-cmd->prp1 = cpu_to_le64(pagelist[0]);
+cmd->prp1 = pagelist[0];
 cmd->prp2 = 0;
 break;
 case 2:
-cmd->prp1 = cpu_to_le64(pagelist[0]);
-cmd->prp2 = cpu_to_le64(pagelist[1]);;
+cmd->prp1 = pagelist[0];
+cmd->prp2 = pagelist[1];
 break;
 default:
-cmd->prp1 = cpu_to_le64(pagelist[0]);
-cmd->prp2 = cpu_to_le64(req->prp_list_iova);
-for (i = 0; i < entries - 1; ++i) {
-pagelist[i] = cpu_to_le64(pagelist[i + 1]);
-}
-pagelist[entries - 1] = 0;
+cmd->prp1 = pagelist[0];
+cmd->prp2 = cpu_to_le64(req->prp_list_iova + sizeof(uint64_t));
 break;
 }
 trace_nvme_cmd_map_qiov(s, cmd, req, qiov, entries);
-- 
2.11.0

Re: [Qemu-block] [PATCH for-3.1] blockdev: Consistently use snapshot_node_name in external_snapshot_prepare()

2018-11-01 Thread Alberto Garcia

On Thu 01 Nov 2018 05:30:37 PM CET, Peter Maydell wrote:
> In the function external_snapshot_prepare() we have a
> BlockdevSnapshotSync struct, which has the usual combination
> of has_snapshot_node_name and snapshot_node_name fields for an
> optional field. We set up a local variable
> const char *snapshot_node_name =
> s->has_snapshot_node_name ? s->snapshot_node_name : NULL;
>
> and then mostly use "if (!snapshot_node_name)" for checking
> whether we have a snapshot node name. The exception is that in
> one place we check s->has_snapshot_node_name instead. This
> confuses Coverity (CID 1396473), which thinks it might be
> possible to get here with s->has_snapshot_node_name true but
> snapshot_node_name NULL, and warns that the call to
> qdict_put_str() will segfault in that case.
>
> Make the code consistent and unconfuse Coverity by using
> the same check for this conditional that we do in the rest
> of the surrounding code.
>
> Signed-off-by: Peter Maydell 

Reviewed-by: Alberto Garcia 

Berto

[Qemu-block] [PATCH for-3.1] blockdev: Consistently use snapshot_node_name in external_snapshot_prepare()

2018-11-01 Thread Peter Maydell

In the function external_snapshot_prepare() we have a
BlockdevSnapshotSync struct, which has the usual combination
of has_snapshot_node_name and snapshot_node_name fields for an
optional field. We set up a local variable
const char *snapshot_node_name =
s->has_snapshot_node_name ? s->snapshot_node_name : NULL;

and then mostly use "if (!snapshot_node_name)" for checking
whether we have a snapshot node name. The exception is that in
one place we check s->has_snapshot_node_name instead. This
confuses Coverity (CID 1396473), which thinks it might be
possible to get here with s->has_snapshot_node_name true but
snapshot_node_name NULL, and warns that the call to
qdict_put_str() will segfault in that case.

Make the code consistent and unconfuse Coverity by using
the same check for this conditional that we do in the rest
of the surrounding code.

Signed-off-by: Peter Maydell 
---
Disclaimer: tested only with "make check"...

 blockdev.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/blockdev.c b/blockdev.c
index 574adbcb7f5..b24610c606e 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -1639,7 +1639,7 @@ static void external_snapshot_prepare(BlkActionState 
*common,
 }
 
 options = qdict_new();
-if (s->has_snapshot_node_name) {
+if (snapshot_node_name) {
 qdict_put_str(options, "node-name", snapshot_node_name);
 }
 qdict_put_str(options, "driver", format);
-- 
2.19.1

Re: [Qemu-block] [PULL 00/15] Python queue, 2018-10-30

2018-11-01 Thread Peter Maydell

On 31 October 2018 at 00:31, Eduardo Habkost  wrote:
> Sorry for submitting this at the last minute.
>
> The following changes since commit a2e002ff7913ce93aa0f7dbedd2123dce5f1a9cd:
>
>   Merge remote-tracking branch 
> 'remotes/vivier2/tags/qemu-trivial-for-3.1-pull-request' into staging 
> (2018-10-30 15:49:55 +)
>
> are available in the Git repository at:
>
>   git://github.com/ehabkost/qemu.git tags/python-next-pull-request
>
> for you to fetch changes up to e301e65c97142c4d2a2adf35f5a4fa73d65d8a4f:
>
>   scripts/qemu.py: use a more consistent docstring style (2018-10-30 21:13:54 
> -0300)
>
> 
> Python queue, 2018-10-30
>
> * Makefile rule for running acceptance tests
>   (make check-acceptance) (Cleber Rosa)
> * Make iotests compatible with Python 3
>   (Max Reitz)
> * device-crash-test whitelist update (Thomas Huth)
> * Misc cleanups (Cleber Rosa)
>
> 

Applied, thanks.

-- PMM

Re: [Qemu-block] [PATCH 3/7] qcow2: split out reading normal clusters from qcow2_co_preadv

2018-11-01 Thread Vladimir Sementsov-Ogievskiy

27.09.2018 20:35, Max Reitz wrote:

On 07.08.18 19:43, Vladimir Sementsov-Ogievskiy wrote:


Memory allocation may become less efficient for encrypted case. It's a
payment for further asynchronous scheme.

Signed-off-by: Vladimir Sementsov-Ogievskiy 

---
 block/qcow2.c | 114 --
 1 file changed, 64 insertions(+), 50 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 65e3ca00e2..5e7f2ee318 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -1808,6 +1808,67 @@ out:
 return ret;
 }

+static coroutine_fn int qcow2_co_preadv_normal(BlockDriverState *bs,
+   uint64_t file_cluster_offset,
+   uint64_t offset,
+   uint64_t bytes,
+   QEMUIOVector *qiov,
+   uint64_t qiov_offset)
+{
+int ret;
+BDRVQcow2State *s = bs->opaque;
+void *crypt_buf = NULL;
+QEMUIOVector hd_qiov;
+int offset_in_cluster = offset_into_cluster(s, offset);
+
+if ((file_cluster_offset & 511) != 0) {
+return -EIO;
+}
+
+qemu_iovec_init(&hd_qiov, qiov->niov);


So you're not just re-allocating the bounce buffer for every single
call, but also this I/O vector.  Hm.



+if (bs->encrypted) {
+assert(s->crypto);
+assert(bytes <= QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size);
+
+crypt_buf = qemu_try_blockalign(bs->file->bs, bytes);


1. Why did you remove the comment?

2. The check for whether crypt_buf was successfully allocated is missing.

3. Do you have any benchmarks for encrypted images?  Having to allocate
the bounce buffer for potentially every single cluster sounds rather bad
to me.

Hmm, about this.

Now we have compress threads to do several compress operations in parallel. And 
I plan to do the same thing for decompression, encryption and decryption. So, 
we'll definitely need several cluster-size buffers to do all these operations. 
How many? The first thought is just as many as maximum number of threads (or 
twice as many, to have in and out buffers for compression). But it's wrong, 
consider for example encryption on write:

1. get free thread for encryption (may be yield if maximum thread number 
achieved, waiting until all threads are busy)
2. get buffer (per thread)
3. thread handles encryption
a. free thread here?
4. write encrypted data to underlying file
b. free thread here?

Of course, we want free thread as soon as possible, in "a." not in "b.". And 
this mean, that we should free buffer in "a." too, so we should copy data to 
locally allocated buffer, or, better, we just use local buffer from the 
beginning, and thread don't own it's own buffer..

So, what are the options we have?

1. the most simple thing: just allocate buffers for each request locally, like 
it is already done for compression.
pros: simple
cons: allocate/free every time may influence performance, as you noticed

2. keep a pool of such buffers, so when buffer freed, it's just queued to the 
list of free buffers
pros:
   reduce number of real alloc/free calls
   we can limit the pool maximum size
   we can allocate new buffers on demand, not the whole pool at start
cons:
   hmm, so, it looks like custom allocator. Is it really needed? Is it really 
faster than just use system allocator and call alloc/free every time we need a 
buffer?

3. should not we use g_slice_alloc instead of inventing it in "2."

So, I've decided to do some tests, and here are results (memcpy is for 32K, all 
other things allocates 64K in a loop, list_alloc is a simple realization of 
"2.":

nothing: 2 it / 0.301361 sec = 663655696.202532 it/s
memcpy: 200 it / 1.633015 sec = 1224728.554970 it/s
g_malloc: 2 it / 5.346707 sec = 37406202.540530 it/s
g_slice_alloc: 2 it / 7.812036 sec = 25601520.402792 it/s
list_alloc: 2 it / 5.432771 sec = 36813626.268630 it/s
posix_memalign: 2000 it / 1.025543 sec = 19501864.376084 it/s
aligned_alloc: 2000 it / 1.035639 sec = 19311752.149739 it/s

So, you see that yes, list_alloc is twice as fast as posix_memalign, but on the 
other hand, simple memcpy is a lot slower (16 times slower) (and I don't say 
about real disk IO which will be even more slower), so, should we care about 
allocation, what do you thing?
test is attached.





+qemu_iovec_add(&hd_qiov, crypt_buf, bytes);
+} else {
+qemu_iovec_concat(&hd_qiov, qiov, qiov_offset, bytes);


qcow2_co_preadv() continues to do this as well.  That's superfluous now.



+}
+
+BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO);
+ret = bdrv_co_preadv(bs->file,
+ file_cluster_offset + offset_in_cluster,
+ bytes, &hd_qiov, 0);
+if (ret < 0) {
+goto out;
+}
+
+if (bs->encrypted) {
+assert(s->cr

Re: [Qemu-block] [PATCH] block/nvme: optimize the performance of nvme driver based on vfio-pci

[Qemu-block] [PATCH] nvme: fix oob access issue(CVE-2018-16847)

[Qemu-block] [PATCH 4/7] qcow2: refactor decompress_buffer

[Qemu-block] [PATCH 5/7] qcow2: use byte-based read in qcow2_decompress_cluster

[Qemu-block] [PATCH 3/7] qcow2: move decompression from qcow2-cluster.c to qcow2.c

[Qemu-block] [PATCH 6/7] qcow2: aio support for compressed cluster read

[Qemu-block] [PATCH 1/7] qcow2: use Z_OK instead of 0 for deflateInit2 return code check

[Qemu-block] [PATCH 7/7] qcow2: do decompression in threads

[Qemu-block] [PATCH 0/7] qcow2 decompress in threads

[Qemu-block] [PATCH 2/7] qcow2: make more generic interface for qcow2_compress

[Qemu-block] [PATCH] block/nvme: optimize the performance of nvme driver based on vfio-pci

Re: [Qemu-block] [PATCH for-3.1] blockdev: Consistently use snapshot_node_name in external_snapshot_prepare()

[Qemu-block] [PATCH for-3.1] blockdev: Consistently use snapshot_node_name in external_snapshot_prepare()

Re: [Qemu-block] [PULL 00/15] Python queue, 2018-10-30

Re: [Qemu-block] [PATCH 3/7] qcow2: split out reading normal clusters from qcow2_co_preadv

15 matches

Site Navigation

Mail list logo

Footer information