Re: [Qemu-block] [Qemu-devel] [PATCH v9 0/8] Optimize VMDK I/O by allocating multiple clusters
On Fri, Oct 20, 2017 at 11:58 Fam Zheng wrote: > On Mon, 10/09 22:12, Fam Zheng wrote: > > On Mon, 10/09 18:29, Ashijeet Acharya wrote: > > > Optimization test results: > > > > > > This patch series improves 128 KB sequential write performance to an > > > empty VMDK file by 54% > > > > > > Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f > > > vmdk test.vmdk > > > > > > Changes in v9: > > > - rebase the series > > > > Thanks, looks good to me, applied: > > > > https://github.com/famz/qemu/tree/staging > > Ashijeet: I've been testing my branch and it seems installing > Fedora/CentOS to a > VMDK image is broken with your patches applied. Both guest and QEMU are > responsive, but the installing of packages stops making any progress at > some > point: > > Installing rootfiles.noarch (317/318) > Installing langpacks-en.noarch (318/318) > Performing post-installation setup tasks > Configuring fedora-release.noarch > Configuring filesystem.x86_64 > Configuring GeoIP-GeoLite-data.noarch > Configuring python3.x86_64 > Configuring fedora-logos.x86_64 > Configuring kernel-core.x86_64 > > # hang here > > Can you reproduce this on your machine? > > My command line is something like this: > > qemu-system-x86_64 -enable-kvm -cpu host -m 1G -qmp > unix:/home/fam/.q/qemu-8DOC9EF4/qmp,server,nowait -name 8DOC9EF4 -netdev > user,id=vnet,hostfwd=:0.0.0.0:10022-:22 -device > virtio-net-pci,netdev=vnet -drive > file=/var/tmp/test2.vmdk,if=none,id=drive-1,cache=none,aio=native -device > virtio-blk-pci,drive=drive-1 -cdrom /stor/iso/CentOS-6.9-x86_64-minimal.iso > -pidfile /home/fam/.q/qemu-8DOC9EF4/pid > > qemu.git master doesn't have this problem. So I'll drop this series from > the > pull request until it is resolved. Fam: Alright, I will look into this but I cannot give you a deadline unfortunately. I will try my best to resolve this as soon as possible, though. Ashijeet > > > Fam >
[Qemu-block] [PATCH v9 6/8] vmdk: New functions to assist allocating multiple clusters
Introduce two new helper functions handle_alloc() and vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple clusters at once starting from a given offset on disk and performs COW if necessary for first and last allocated clusters. vmdk_alloc_cluster_offset() helps to return the offset of the first of the many newly allocated clusters. Also, provide proper documentation for both. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 201 --- 1 file changed, 191 insertions(+), 10 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 11bc0f09c7..d5dfd21abe 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { unsigned int l2_offset; int valid; uint32_t *l2_cache_entry; +uint32_t nb_clusters; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1259,6 +1260,183 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, return VMDK_OK; } +/* + * vmdk_handle_alloc + * + * Allocate new clusters for an area that either is yet unallocated or needs a + * copy on write. + * + * Returns: + * VMDK_OK: if new clusters were allocated, *bytes may be decreased if + * the new allocation doesn't cover all of the requested area. + * *cluster_offset is updated to contain the offset of the + * first newly allocated cluster. + * + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is left + * unchanged. 
+ * + * VMDK_ERROR:in error cases + */ +static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, + uint64_t offset, uint64_t *cluster_offset, + int64_t *bytes, VmdkMetaData *m_data, + bool allocate, uint32_t *alloc_clusters_counter) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +uint32_t cluster_sector; +uint32_t nb_clusters; +bool zeroed = false; +uint64_t skip_start_bytes, skip_end_bytes; +int ret; + +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, +&l2_index, &l2_table); +if (ret < 0) { +return ret; +} + +cluster_sector = le32_to_cpu(l2_table[l2_index]); + +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); +/* Calculate the number of clusters to look for. Here we truncate the last + * cluster, i.e. 1 less than the actual value calculated as we may need to + * perform COW for the last one. */ +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, + extent->cluster_sectors << BDRV_SECTOR_BITS) - 1; + +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); +assert(nb_clusters <= INT_MAX); + +/* update bytes according to final nb_clusters value */ +if (nb_clusters != 0) { +*bytes = ((nb_clusters * extent->cluster_sectors) << BDRV_SECTOR_BITS) + - skip_start_bytes; +} else { +nb_clusters = 1; +} +*alloc_clusters_counter += nb_clusters; + +/* we need to use MIN() for basically 3 cases that arise : + * 1. alloc very first cluster : here skip_start_bytes >= 0 and + **bytes <= cluster_size. + * 2. alloc middle clusters : here *bytes is a perfect multiple of + *cluster_size and skip_start_bytes is 0. + * 3. alloc very last cluster : here *bytes <= cluster_size and + *skip_start_bytes is 0 + */ +skip_end_bytes = skip_start_bytes + MIN(*bytes, + extent->cluster_sectors * BDRV_SECTOR_SIZE +- skip_start_bytes); + +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { +zeroed = true; +} + +if (!cluster_sector || zeroed) { +if (!allocate) { +return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; +} + +cluster_sector = extent->next_cluster_sector; +extent->next_cluster_sector += extent->cluster_sectors +* nb_clusters; + +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, + offset, skip_start_bytes, + skip_end_bytes); +if (ret < 0) { +return ret; +} +if (m_data) { +m_data->valid = 1; +m_data->l1_index = l1_index; +m_data->l2_index = l2_index; +m_data->l2_offset = l2_offset; +m_data->l2_cache_entry = &l2_table[l2_index]; +m_data->nb_clusters = nb_clusters; +} +} +*cluster_offset = cluster_sector << BDRV_SECTOR_BITS; +return VMDK_OK; +} + +/* + * vmdk_alloc_clusters + * + * For a given offset on
[Qemu-block] [PATCH v9 7/8] vmdk: Update metadata for multiple clusters
Include a next pointer in VmdkMetaData struct to point to the previous allocated L2 table. Modify vmdk_L2update to start updating metadata for allocation of multiple clusters at once. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 128 ++- 1 file changed, 101 insertions(+), 27 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index d5dfd21abe..cbeffb1552 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { int valid; uint32_t *l2_cache_entry; uint32_t nb_clusters; +uint32_t offset; +struct VmdkMetaData *next; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1133,34 +1135,87 @@ exit: return ret; } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, - uint32_t offset) +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, + VmdkMetaData *m_data, bool zeroed) { -offset = cpu_to_le32(offset); +int i; +uint32_t offset, temp_offset; +int *l2_table_array; +int l2_array_size; + +if (zeroed) { +temp_offset = VMDK_GTE_ZEROED; +} else { +temp_offset = m_data->offset; +} + +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; +l2_table_array = qemu_try_blockalign(extent->file->bs, + QEMU_ALIGN_UP(l2_array_size, + BDRV_SECTOR_SIZE)); +if (l2_table_array == NULL) { +return VMDK_ERROR; +} +memset(l2_table_array, 0, QEMU_ALIGN_UP(l2_array_size, BDRV_SECTOR_SIZE)); /* update L2 table */ +offset = temp_offset; +for (i = 0; i < m_data->nb_clusters; i++) { +l2_table_array[i] = cpu_to_le32(offset); +if (!zeroed) { +offset += extent->cluster_sectors; +} +} if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; if 
(bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } } + +offset = temp_offset; if (m_data->l2_cache_entry) { -*m_data->l2_cache_entry = offset; +for (i = 0; i < m_data->nb_clusters; i++) { +*m_data->l2_cache_entry = cpu_to_le32(offset); +m_data->l2_cache_entry++; + +if (!zeroed) { +offset += extent->cluster_sectors; +} +} } +qemu_vfree(l2_table_array); return VMDK_OK; } +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + bool zeroed) +{ +int ret; + +while (m_data->next != NULL) { + +ret = vmdk_alloc_cluster_link_l2(extent, m_data, zeroed); +if (ret < 0) { +return ret; +} + +m_data = m_data->next; + } + + return VMDK_OK; +} + /* * vmdk_l2load * @@ -1277,9 +1332,10 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, * * VMDK_ERROR:in error cases */ + static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint64_t offset, uint64_t *cluster_offset, - int64_t *bytes, VmdkMetaData *m_data, + int64_t *bytes, VmdkMetaData **m_data, bool allocate, uint32_t *alloc_clusters_counter) { int l1_index, l2_offset, l2_index; @@ -1288,6 +1344,7 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint32_t nb_clusters; bool zeroed = false; uint64_t skip_start_bytes, skip_end_bytes; +VmdkMetaData *old_m_data; int ret; ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, @@ -1348,13 +1405,21 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, if (ret < 0) { return ret; } -if (m_data) { -m_data-&g
[Qemu-block] [PATCH v9 4/8] vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset()
Move the cluster tables loading code out of the existing vmdk_get_cluster_offset() function and implement it in separate get_cluster_table() and vmdk_l2load() functions. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 153 --- 1 file changed, 105 insertions(+), 48 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index d4ea92bdcf..07707779a0 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1160,6 +1160,105 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, return VMDK_OK; } +/* + * vmdk_l2load + * + * Load a new L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns: + * VMDK_OK: on success + * VMDK_ERROR:in error cases + */ +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int l2_offset, + uint32_t **new_l2_table, int *new_l2_index) +{ +int min_index, i, j; +uint32_t *l2_table; +uint32_t min_count; + +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (l2_offset == extent->l2_cache_offsets[i]) { +/* increment the hit count */ +if (++extent->l2_cache_counts[i] == UINT32_MAX) { +for (j = 0; j < L2_CACHE_SIZE; j++) { +extent->l2_cache_counts[j] >>= 1; +} +} +l2_table = extent->l2_cache + (i * extent->l2_size); +goto found; +} +} +/* not found: load a new entry in the least used one */ +min_index = 0; +min_count = UINT32_MAX; +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (extent->l2_cache_counts[i] < min_count) { +min_count = extent->l2_cache_counts[i]; +min_index = i; +} +} +l2_table = extent->l2_cache + (min_index * extent->l2_size); +if (bdrv_pread(extent->file, +(int64_t)l2_offset * 512, +l2_table, +extent->l2_size * sizeof(uint32_t) +) != extent->l2_size * sizeof(uint32_t)) { +return VMDK_ERROR; +} + +extent->l2_cache_offsets[min_index] = l2_offset; +extent->l2_cache_counts[min_index] = 1; +found: +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; +*new_l2_table = l2_table; + +return VMDK_OK; +} + +/* 
+ * get_cluster_table + * + * For a given offset, load (and allocate if needed) the l2 table. + * + * Returns: + * VMDK_OK:on success + * + * VMDK_UNALLOC: if cluster is not mapped + * + * VMDK_ERROR: in error cases + */ +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, + int *new_l1_index, int *new_l2_offset, + int *new_l2_index, uint32_t **new_l2_table) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +int ret; + +offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; +l1_index = (offset >> 9) / extent->l1_entry_sectors; +if (l1_index >= extent->l1_size) { +return VMDK_ERROR; +} +l2_offset = extent->l1_table[l1_index]; +if (!l2_offset) { +return VMDK_UNALLOC; +} + +ret = vmdk_l2load(extent, offset, l2_offset, &l2_table, &l2_index); +if (ret < 0) { +return ret; +} + +*new_l1_index = l1_index; +*new_l2_offset = l2_offset; +*new_l2_index = l2_index; +*new_l2_table = l2_table; + +return VMDK_OK; +} + /** * vmdk_get_cluster_offset * @@ -1189,66 +1288,24 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, uint64_t skip_start_bytes, uint64_t skip_end_bytes) { -unsigned int l1_index, l2_offset, l2_index; -int min_index, i, j; -uint32_t min_count, *l2_table; +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; bool zeroed = false; int64_t ret; int64_t cluster_sector; -if (m_data) { -m_data->valid = 0; -} if (extent->flat) { *cluster_offset = extent->flat_start_offset; return VMDK_OK; } -offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; -l1_index = (offset >> 9) / extent->l1_entry_sectors; -if (l1_index >= extent->l1_size) { -return VMDK_ERROR; -} -l2_offset = extent->l1_table[l1_index]; -if (!l2_offset) { -return VMDK_UNALLOC; -} -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (l2_offset == extent->l2_cache_offsets[i]) { -/* increment the hit count */ -if (++extent->l2_cache_counts[i] == 0x) { -for (j = 0; j < L2_CACHE_SIZE; j++) { -extent->l2_cache_counts[j] >>= 1; -
[Qemu-block] [PATCH v9 5/8] vmdk: Set maximum bytes allocated in one cycle
Set the maximum bytes allowed to get allocated at once to be not more than the extent size boundary to handle writes at two separate extents appropriately. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 07707779a0..11bc0f09c7 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1641,6 +1641,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t cluster_offset; uint64_t bytes_done = 0; VmdkMetaData m_data; +uint64_t extent_end; if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) { error_report("Wrong offset: offset=0x%" PRIx64 @@ -1654,9 +1655,17 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, if (!extent) { return -EIO; } +extent_end = extent->end_sector * BDRV_SECTOR_SIZE; + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - - offset_in_cluster); + +/* truncate n_bytes to first cluster because we need to perform COW */ +if (offset_in_cluster > 0) { +n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); +} else { +n_bytes = MIN(bytes, extent_end - offset); +} ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), -- 2.13.5
[Qemu-block] [PATCH v9 8/8] vmdk: Make vmdk_get_cluster_offset() return cluster offset only
vmdk_alloc_clusters() introduced earlier now handles the task of allocating clusters and performing COW when needed. Thus we can change vmdk_get_cluster_offset() to stick to the sole purpose of returning cluster offset using sector number. Update the changes at all call sites. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 56 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index cbeffb1552..497e30f6df 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1511,25 +1511,16 @@ static int vmdk_alloc_clusters(BlockDriverState *bs, * For flat extents, the start offset as parsed from the description file is * returned. * - * For sparse extents, look up in L1, L2 table. If allocate is true, return an - * offset for a new cluster and update L2 cache. If there is a backing file, - * COW is done before returning; otherwise, zeroes are written to the allocated - * cluster. Both COW and zero writing skips the sector range - * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller - * has new data to write there. + * For sparse extents, look up the L1, L2 table. * * Returns: VMDK_OK if cluster exists and mapped in the image. - * VMDK_UNALLOC if cluster is not mapped and @allocate is false. - * VMDK_ERROR if failed. + * VMDK_UNALLOC if cluster is not mapped. + * VMDK_ERROR if failed */ static int vmdk_get_cluster_offset(BlockDriverState *bs, VmdkExtent *extent, - VmdkMetaData *m_data, uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) + uint64_t *cluster_offset) { int l1_index, l2_offset, l2_index; uint32_t *l2_table; @@ -1554,31 +1545,9 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, } if (!cluster_sector || zeroed) { -if (!allocate) { -return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; -} - -cluster_sector = extent->next_cluster_sector; -extent->next_cluster_sector += extent->cluster_sectors; - -/* First of all we write grain itself, to avoid race condition - * that may to corrupt the image. - * This problem may occur because of insufficient space on host disk - * or inappropriate VM shutdown. - */ -ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, -offset, skip_start_bytes, skip_end_bytes); -if (ret) { -return ret; -} -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_data->l2_index = l2_index; -m_data->l2_offset = l2_offset; -m_data->l2_cache_entry = &l2_table[l2_index]; -} +return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; } @@ -1621,9 +1590,7 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = vmdk_get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, sector_num * 512, &offset); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1814,13 +1781,14 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); +ret = vmdk_get_cluster_offset(bs, extent, offset, &cluster_offset); + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ if (bs->backing && ret != VMDK_ZEROED) { @@ -2565,9 +2533,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, ret = -EINVAL; break; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, +ret = vmdk_get_cluster_offset(bs, extent, sector_num << BDRV_SECTOR_BITS, - false, 
&clu
[Qemu-block] [PATCH v9 3/8] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
Rename the existing get_cluster_offset() to vmdk_get_cluster_offset() and update name in all the callers accordingly. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 46 +++--- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 109c589b43..d4ea92bdcf 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1161,7 +1161,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, } /** - * get_cluster_offset + * vmdk_get_cluster_offset * * Look up cluster offset in extent file by sector number, and store in * @cluster_offset. @@ -1180,14 +1180,14 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, * VMDK_UNALLOC if cluster is not mapped and @allocate is false. * VMDK_ERROR if failed. */ -static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; @@ -1321,9 +1321,9 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num * 512, false, &offset, + 0, 0); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1514,8 +1514,8 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + offset, false, 
&cluster_offset, 0, 0); offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE @@ -1601,10 +1601,10 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); -ret = get_cluster_offset(bs, extent, &m_data, offset, - !(extent->compressed || zeroed), - &cluster_offset, offset_in_cluster, - offset_in_cluster + n_bytes); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + !(extent->compressed || zeroed), + &cluster_offset, offset_in_cluster, + offset_in_cluster + n_bytes); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1613,8 +1613,8 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, return -EIO; } else { /* allocate */ -ret = get_cluster_offset(bs, extent, &m_data, offset, - true, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + true, &cluster_offset, 0, 0); } } if (ret == VMDK_ERROR) { @@ -2244,9 +2244,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, ret = -EINVAL; break; } -ret = get_cluster_offset(bs, extent, NULL, - sector_num << BDRV_SECTOR_BITS, - false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, +
[Qemu-block] [PATCH v9 2/8] vmdk: Rename get_whole_cluster() to vmdk_perform_cow()
Rename the existing function get_whole_cluster() to vmdk_perform_cow() as its sole purpose is to perform COW for the first and the last allocated clusters if needed. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 23 ++- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index e86ca39ff2..109c589b43 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1045,8 +1045,8 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -/** - * get_whole_cluster +/* + * vmdk_perform_cow * * Copy backing file's cluster that covers @sector_num, otherwise write zero, * to the cluster at @cluster_sector_num. @@ -1054,13 +1054,18 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) * If @skip_start_sector < @skip_end_sector, the relative range * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave * it for call to write user data in the request. + * + * Returns: + * VMDK_OK: on success + * + * VMDK_ERROR:in error cases */ -static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_perform_cow(BlockDriverState *bs, +VmdkExtent *extent, +uint64_t cluster_offset, +uint64_t offset, +uint64_t skip_start_bytes, +uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; @@ -1261,7 +1266,7 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ -ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; -- 2.13.5
[Qemu-block] [PATCH v9 0/8] Optimize VMDK I/O by allocating multiple clusters
Previously posted series patches: v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html v3 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg00074.html v4 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg03851.html v5 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00929.html v6 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00947.html v7 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg06600.html v8 - http://lists.nongnu.org/archive/html/qemu-devel/2017-07/msg08623.html This series helps to optimize the I/O performance of VMDK driver. Patch 1 helps us to move vmdk_find_offset_in_cluster. Patch 2 & 3 perform a simple function re-naming tasks. Patch 4 is used to factor out metadata loading code and implement it in separate functions. This will help us to avoid code duplication in future patches of this series. Patch 5 helps to set the upper limit of the bytes handled in one cycle. Patch 6 adds new functions to help us allocate multiple clusters according to the size requested, perform COW if required and return the offset of the first newly allocated cluster. Patch 7 changes the metadata update code to update the L2 tables for multiple clusters at once. 
Patch 8 helps us to finally change vmdk_get_cluster_offset() to find cluster offset only as cluster allocation task is now handled by vmdk_alloc_clusters() Optimization test results: This patch series improves 128 KB sequential write performance to an empty VMDK file by 54% Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f vmdk test.vmdk Changes in v9: - rebase the series Changes in v8: - fix minor variable naming issue in patch 6 Changes in v7: - comment the use of MIN() in calculating skip_end_bytes - use extent->cluster_sectors instead of 128 - place check for m_data != NULL - use g_new0(VmdkMetaData, 1) instead of g_malloc0(sizeof(*m_data)) Changes in v6: - rename total_alloc_clusters as alloc_clusters_counter (fam) Changes in v5: - fix commit message and comment in patch 4 (fam) - add vmdk_ prefix to handle_alloc() (fam) - fix alignment issue in patch 6 (fam) - use BDRV_SECTOR_BITS (fam) - fix endianness calculation in patch 7 (fam) Changes in v4: - fix commit message in patch 1 (fam) - drop size_to_clusters() function (fam) - fix grammatical errors in function documentations (fam) - factor out metadata loading coding in a separate patch (patch 4) (fam) - rename vmdk_alloc_cluster_offset() to vmdk_alloc_clusters() (fam) - break patch 4(in v3) into separate patches (patch 3 and 8) (fam) - rename extent_size to extent_end (fam) - use QEMU_ALIGN_UP instead of vmdk_align_offset. 
(fam) - drop next and simply do m_data = m_data->next (fam) Changes in v3: - move size_to_clusters() from patch 1 to 3 (fam) - use DIV_ROUND_UP in size_to_clusters (fam) - make patch 2 compilable (fam) - rename vmdk_L2update as vmdk_l2update and use UINT32_MAX (fam) - combine patch 3 and patch 4 (as in v2) to make them compilable (fam) - call bdrv_pwrite_sync() for batches of atmost 512 clusters at once (fam) Changes in v2: - segregate the ugly Patch 1 in v1 into 6 readable and sensible patches - include benchmark test results in v2 Ashijeet Acharya (8): vmdk: Move vmdk_find_offset_in_cluster() to the top vmdk: Rename get_whole_cluster() to vmdk_perform_cow() vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset() vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset() vmdk: Set maximum bytes allocated in one cycle vmdk: New functions to assist allocating multiple clusters vmdk: Update metadata for multiple clusters vmdk: Make vmdk_get_cluster_offset() return cluster offset only block/vmdk.c | 538 +-- 1 file changed, 416 insertions(+), 122 deletions(-) -- 2.13.5
[Qemu-block] [PATCH v9 1/8] vmdk: Move vmdk_find_offset_in_cluster() to the top
Move the existing vmdk_find_offset_in_cluster() function to the top of the driver. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index c665bcc977..e86ca39ff2 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -242,6 +242,18 @@ static void vmdk_free_last_extent(BlockDriverState *bs) s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ +uint64_t extent_begin_offset, extent_relative_offset; +uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + +extent_begin_offset = +(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; +extent_relative_offset = offset - extent_begin_offset; +return extent_relative_offset % cluster_size; +} + /* Return -ve errno, or 0 on success and write CID into *pcid. */ static int vmdk_read_cid(BlockDriverState *bs, int parent, uint32_t *pcid) { @@ -1283,18 +1295,6 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, - int64_t offset) -{ -uint64_t extent_begin_offset, extent_relative_offset; -uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; - -extent_begin_offset = -(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; -extent_relative_offset = offset - extent_begin_offset; -return extent_relative_offset % cluster_size; -} - static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { -- 2.13.5
Re: [Qemu-block] [Qemu-devel] [PATCH v8 0/8] Optimize VMDK I/O by allocating multiple clusters
On Wed, Oct 4, 2017 at 1:58 PM, Fam Zheng wrote: > On Wed, 10/04 13:47, Ashijeet Acharya wrote: > > Fam: Ping? > > Hi Ashijeet, looks like this patch doesn't apply to current master, could > you > rebase and post another version? > Hello Fam, I will try to do it over the weekend then and you can merge it next week hopefully. Ashijeet > > Fam >
Re: [Qemu-block] [PATCH v8 0/8] Optimize VMDK I/O by allocating multiple clusters
On Thu, Aug 10, 2017 at 11:13 PM, Stefan Hajnoczi wrote: > On Thu, Aug 10, 2017 at 9:18 AM, Ashijeet Acharya > wrote: > > On Thu, Aug 10, 2017 at 1:41 PM, Stefan Hajnoczi > wrote: > >> > >> On Thu, Jul 27, 2017 at 3:33 PM, Ashijeet Acharya > >> wrote: > >> > Previously posted series patches: > >> > v1 - > >> > http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html > >> > v2 - > >> > http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html > >> > v3 - > >> > http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg00074.html > >> > v4 - > >> > http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg03851.html > >> > v5 - > >> > http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00929.html > >> > v6 - > >> > http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00947.html > >> > v7 - > >> > http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg06600.html > >> > > >> > This series helps to optimize the I/O performance of VMDK driver. > >> > > >> > Patch 1 helps us to move vmdk_find_offset_in_cluster. > >> > > >> > Patch 2 & 3 perform a simple function re-naming tasks. > >> > > >> > Patch 4 is used to factor out metadata loading code and implement it > in > >> > separate > >> > functions. This will help us to avoid code duplication in future > patches > >> > of this > >> > series. > >> > > >> > Patch 5 helps to set the upper limit of the bytes handled in one > cycle. > >> > > >> > Patch 6 adds new functions to help us allocate multiple clusters > >> > according to > >> > the size requested, perform COW if required and return the offset of > the > >> > first > >> > newly allocated cluster. > >> > > >> > Patch 7 changes the metadata update code to update the L2 tables for > >> > multiple > >> > clusters at once. 
> >> > > >> > Patch 8 helps us to finally change vmdk_get_cluster_offset() to find > >> > cluster > >> > offset only as cluster allocation task is now handled by > >> > vmdk_alloc_clusters() > >> > > >> > Optimization test results: > >> > > >> > This patch series improves 128 KB sequential write performance to an > >> > empty VMDK file by 54% > >> > > >> > Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f > >> > vmdk test.vmdk > >> > > >> > Changes in v8: > >> > - fix minor variable naming issue in patch 6 > >> > >> Fam: Ping? > >> > >> Ashijeet: Feel free to send a ping reply if no one reviews your > >> patches within a few days. > > > > > > Hi Stefan, > > > > I had a chat with Fam on #qemu-block before submitting this series and he > > said he will be merging it soon when the freeze is over (I am not sure > if it > > is yet) since all the patches are already reviewed :-) > > Good to hear :). > > QEMU 2.10 is scheduled to be released on 22nd or 29th of August. > > Stefan > Fam: Ping? Ashijeet
Re: [Qemu-block] [PATCH v2 0/7] Refactor DMG driver to have chunk size independence
On Tue, Sep 5, 2017 at 4:28 PM, Stefan Hajnoczi wrote: > On Wed, Aug 30, 2017 at 06:32:52PM +0530, Ashijeet Acharya wrote: > > On Tue, Aug 29, 2017 at 8:55 PM, Stefan Hajnoczi > wrote: > > > > > On Sun, Aug 20, 2017 at 1:47 PM, Ashijeet Acharya > > > wrote: > > > > On Fri, May 5, 2017 at 7:29 PM, Stefan Hajnoczi > > > wrote: > > > >> > > > >> On Thu, Apr 27, 2017 at 01:36:30PM +0530, Ashijeet Acharya wrote: > > > >> > This series helps to provide chunk size independence for DMG > driver to > > > >> > prevent > > > >> > denial-of-service in cases where untrusted files are being > accessed by > > > >> > the user. > > > >> > > > >> The core of the chunk size dependence problem are these lines: > > > >> > > > >> s->compressed_chunk = qemu_try_blockalign(bs->file->bs, > > > >> > ds.max_compressed_size + > > > 1); > > > >> s->uncompressed_chunk = qemu_try_blockalign(bs->file->bs, > > > >> 512 * > > > >> ds.max_sectors_per_chunk); > > > >> > > > >> The refactoring needs to eliminate these buffers because their size > is > > > >> controlled by the untrusted input file. > > > > > > > > > > > > Oh okay, I understand now. But wouldn't I still need to allocate some > > > memory > > > > for these buffers to be able to use them for the compressed chunks > case > > > you > > > > mentioned below. Instead of letting the DMG images control the size > of > > > these > > > > buffers, maybe I can hard-code the size of these buffers instead? > > > > > > > >> > > > >> > > > >> After applying your patches these lines remain unchanged and we > still > > > >> cannot use input files that have a 250 MB chunk size, for example. > So > > > >> I'm not sure how this series is supposed to work. > > > >> > > > >> Here is the approach I would take: > > > >> > > > >> In order to achieve this dmg_read_chunk() needs to be scrapped. It > is > > > >> designed to read a full chunk. The new model does not read full > chunks > > > >> anymore. 
> > > >> > > > >> Uncompressed reads or zeroes should operate directly on qiov, not > > > >> s->uncompressed_chunk. This code will be dropped: > > > >> > > > >> data = s->uncompressed_chunk + sector_offset_in_chunk * 512; > > > >> qemu_iovec_from_buf(qiov, i * 512, data, 512); > > > > > > > > > > > > I have never worked with qiov before, are there any places where I > can > > > refer > > > > to inside other drivers to get the idea of how to use it directly (I > am > > > > searching by myself in the meantime...)? > > > > > > A QEMUIOVector is a utility type for struct iovec iov[] processing. > > > See util/iov.c. This is called "vectored" or "scatter-gather" I/O. > > > > > > Instead of transferring data to/from a single tuple, > > > they take an array []. For example, the buffer "Hello > > > world" could be split into two elements: > > > [{"Hello ", strlen("Hello ")}, > > > {"world", strlen("world")}] > > > > > > Vectored I/O is often used because it eliminates memory copies. Say > > > you have a network packet header struct and also a data payload array. > > > Traditionally you would have to allocate a new buffer large enough for > > > both header and payload, copy the header and payload into the buffer, > > > and finally give this temporary buffer to the I/O function. This is > > > inefficient. With vectored I/O you can create a vector with two > > > elements, the header and the payload, and the I/O function can process > > > them without needing a temporary buffer copy. > > > > > > > Thanks for the detailed explanation, I think I understood the concept now > > and how to use qiov efficiently. > > Correct me if I am wrong here. In order to use qiov directly for > > uncompressed chunks: > > > > 1. Declare a new local_qiov inside dmg_co_preadv (let's say) > > No, the operation should use qiov directly if the chunk
Re: [Qemu-block] [PATCH v2 0/7] Refactor DMG driver to have chunk size independence
On Tue, Aug 29, 2017 at 8:55 PM, Stefan Hajnoczi wrote: > On Sun, Aug 20, 2017 at 1:47 PM, Ashijeet Acharya > wrote: > > On Fri, May 5, 2017 at 7:29 PM, Stefan Hajnoczi > wrote: > >> > >> On Thu, Apr 27, 2017 at 01:36:30PM +0530, Ashijeet Acharya wrote: > >> > This series helps to provide chunk size independence for DMG driver to > >> > prevent > >> > denial-of-service in cases where untrusted files are being accessed by > >> > the user. > >> > >> The core of the chunk size dependence problem are these lines: > >> > >> s->compressed_chunk = qemu_try_blockalign(bs->file->bs, > >> ds.max_compressed_size + > 1); > >> s->uncompressed_chunk = qemu_try_blockalign(bs->file->bs, > >> 512 * > >> ds.max_sectors_per_chunk); > >> > >> The refactoring needs to eliminate these buffers because their size is > >> controlled by the untrusted input file. > > > > > > Oh okay, I understand now. But wouldn't I still need to allocate some > memory > > for these buffers to be able to use them for the compressed chunks case > you > > mentioned below. Instead of letting the DMG images control the size of > these > > buffers, maybe I can hard-code the size of these buffers instead? > > > >> > >> > >> After applying your patches these lines remain unchanged and we still > >> cannot use input files that have a 250 MB chunk size, for example. So > >> I'm not sure how this series is supposed to work. > >> > >> Here is the approach I would take: > >> > >> In order to achieve this dmg_read_chunk() needs to be scrapped. It is > >> designed to read a full chunk. The new model does not read full chunks > >> anymore. > >> > >> Uncompressed reads or zeroes should operate directly on qiov, not > >> s->uncompressed_chunk. 
This code will be dropped: > >> > >> data = s->uncompressed_chunk + sector_offset_in_chunk * 512; > >> qemu_iovec_from_buf(qiov, i * 512, data, 512); > > > > > > I have never worked with qiov before, are there any places where I can > refer > > to inside other drivers to get the idea of how to use it directly (I am > > searching by myself in the meantime...)? > > A QEMUIOVector is a utility type for struct iovec iov[] processing. > See util/iov.c. This is called "vectored" or "scatter-gather" I/O. > > Instead of transferring data to/from a single tuple, > they take an array []. For example, the buffer "Hello > world" could be split into two elements: > [{"Hello ", strlen("Hello ")}, > {"world", strlen("world")}] > > Vectored I/O is often used because it eliminates memory copies. Say > you have a network packet header struct and also a data payload array. > Traditionally you would have to allocate a new buffer large enough for > both header and payload, copy the header and payload into the buffer, > and finally give this temporary buffer to the I/O function. This is > inefficient. With vectored I/O you can create a vector with two > elements, the header and the payload, and the I/O function can process > them without needing a temporary buffer copy. > Thanks for the detailed explanation, I think I understood the concept now and how to use qiov efficiently. Correct me if I am wrong here. In order to use qiov directly for uncompressed chunks: 1. Declare a new local_qiov inside dmg_co_preadv (let's say) 2. Initialize it with qemu_iovec_init() 3. Reset it with qemu_iovec_reset() (this is because we will perform this action in a loop and thus need to reset it before every loop?) 4. Declare a buffer "uncompressed_buf" and allocate it with qemu_try_blockalign() 5. Add this buffer to our local_qiov using qemu_iovec_add() 6. Read data from file directly into local_qiov using bdrv_co_preadv() 7. 
On success concatenate it with the qiov passed into the main dmg_co_preadv() function. I think this method only works for uncompressed chunks. For the compressed ones, I believe we will still need to do it in the existing way, i.e. read chunk from file -> decompress into output buffer -> use qemu_iovec_from_buf() because we cannot read directly since data is in compressed form. Makes sense? > > I got clearly what you are trying > > to say, but don't know how to implement it. I think, don't we already do > > that for the zeroed chunks in DMG in dmg_co_preadv()? > > Yes, dmg_co_preadv() directl
Re: [Qemu-block] [PATCH v2 0/7] Refactor DMG driver to have chunk size independence
On Fri, May 5, 2017 at 7:29 PM, Stefan Hajnoczi wrote: > On Thu, Apr 27, 2017 at 01:36:30PM +0530, Ashijeet Acharya wrote: > > This series helps to provide chunk size independence for DMG driver to > prevent > > denial-of-service in cases where untrusted files are being accessed by > the user. > > The core of the chunk size dependence problem are these lines: > > s->compressed_chunk = qemu_try_blockalign(bs->file->bs, > ds.max_compressed_size + 1); > s->uncompressed_chunk = qemu_try_blockalign(bs->file->bs, > 512 * > ds.max_sectors_per_chunk); > > The refactoring needs to eliminate these buffers because their size is > controlled by the untrusted input file. > Oh okay, I understand now. But wouldn't I still need to allocate some memory for these buffers to be able to use them for the compressed chunks case you mentioned below. Instead of letting the DMG images control the size of these buffers, maybe I can hard-code the size of these buffers instead? > > After applying your patches these lines remain unchanged and we still > cannot use input files that have a 250 MB chunk size, for example. So > I'm not sure how this series is supposed to work. > > Here is the approach I would take: > > In order to achieve this dmg_read_chunk() needs to be scrapped. It is > designed to read a full chunk. The new model does not read full chunks > anymore. > > Uncompressed reads or zeroes should operate directly on qiov, not > s->uncompressed_chunk. This code will be dropped: > > data = s->uncompressed_chunk + sector_offset_in_chunk * 512; > qemu_iovec_from_buf(qiov, i * 512, data, 512); > I have never worked with qiov before, are there any places where I can refer to inside other drivers to get the idea of how to use it directly (I am searching by myself in the meantime...)? I got clearly what you are trying to say, but don't know how to implement it. I think, don't we already do that for the zeroed chunks in DMG in dmg_co_preadv()? > > Compressed reads still need buffers. 
I suggest the following buffers: > > 1. compressed_buf - compressed data is read into this buffer from file > 2. uncompressed_buf - a place to discard decompressed data while > simulating a seek operation > Yes, these are the buffers whose size I can hard-code as discussed above? You can suggest the preferred size to me. > Data is read from compressed chunks by reading a reasonable amount > (64k?) into compressed_buf. If the user wishes to read at an offset > into this chunk then a loop decompresses data we are seeking over into > uncompressed_buf (and refills compressed_buf if it becomes empty) until > the desired offset is reached. Then decompression can continue > directly into the user's qiov and uncompressed_buf isn't used to > decompress the data requested by the user. > Yes, this series does exactly that but keeps using the "uncompressed" buffer once we reach the desired offset. Once I understand how to use qiov directly, we can do this. Also, Kevin did suggest to me (as I remember vaguely) that in reality we never actually get the read request at a particular offset because DMG driver is generally used with "qemu-img convert", which means all read requests are from the top. > > Sequential compressed reads can be optimized by keeping the compression > state across read calls. That means the zlib/bz2 state plus > compressed_buf and the current offset. That way we don't need to > re-seek into the current compressed chunk to handle sequential reads. > I guess that's what I implemented with this series so now I can reuse the "caching access point" part in the next series to implement this optimization. Thanks Ashijeet
Re: [Qemu-block] [PATCH v8 0/8] Optimize VMDK I/O by allocating multiple clusters
On Thu, Aug 10, 2017 at 1:41 PM, Stefan Hajnoczi wrote: > On Thu, Jul 27, 2017 at 3:33 PM, Ashijeet Acharya > wrote: > > Previously posted series patches: > > v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017- > 03/msg02044.html > > v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017- > 03/msg05080.html > > v3 - http://lists.nongnu.org/archive/html/qemu-devel/2017- > 04/msg00074.html > > v4 - http://lists.nongnu.org/archive/html/qemu-devel/2017- > 04/msg03851.html > > v5 - http://lists.nongnu.org/archive/html/qemu-devel/2017- > 06/msg00929.html > > v6 - http://lists.nongnu.org/archive/html/qemu-devel/2017- > 06/msg00947.html > > v7 - http://lists.nongnu.org/archive/html/qemu-devel/2017- > 06/msg06600.html > > > > This series helps to optimize the I/O performance of VMDK driver. > > > > Patch 1 helps us to move vmdk_find_offset_in_cluster. > > > > Patch 2 & 3 perform a simple function re-naming tasks. > > > > Patch 4 is used to factor out metadata loading code and implement it in > separate > > functions. This will help us to avoid code duplication in future patches > of this > > series. > > > > Patch 5 helps to set the upper limit of the bytes handled in one cycle. > > > > Patch 6 adds new functions to help us allocate multiple clusters > according to > > the size requested, perform COW if required and return the offset of the > first > > newly allocated cluster. > > > > Patch 7 changes the metadata update code to update the L2 tables for > multiple > > clusters at once. 
> > > > Patch 8 helps us to finally change vmdk_get_cluster_offset() to find > cluster > > offset only as cluster allocation task is now handled by > vmdk_alloc_clusters() > > > > Optimization test results: > > > > This patch series improves 128 KB sequential write performance to an > > empty VMDK file by 54% > > > > Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f > > vmdk test.vmdk > > > > Changes in v8: > > - fix minor variable naming issue in patch 6 > > Fam: Ping? > > Ashijeet: Feel free to send a ping reply if no one reviews your > patches within a few days. > Hi Stefan, I had a chat with Fam on #qemu-block before submitting this series and he said he will be merging it soon when the freeze is over (I am not sure if it is yet) since all the patches are already reviewed :-) Ashijeet
[Qemu-block] [PATCH v8 7/8] vmdk: Update metadata for multiple clusters
Include a next pointer in VmdkMetaData struct to point to the previous allocated L2 table. Modify vmdk_L2update to start updating metadata for allocation of multiple clusters at once. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 128 ++- 1 file changed, 101 insertions(+), 27 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 5f27dbb..4a59ca4 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { int valid; uint32_t *l2_cache_entry; uint32_t nb_clusters; +uint32_t offset; +struct VmdkMetaData *next; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1116,34 +1118,87 @@ exit: return ret; } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, - uint32_t offset) +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, + VmdkMetaData *m_data, bool zeroed) { -offset = cpu_to_le32(offset); +int i; +uint32_t offset, temp_offset; +int *l2_table_array; +int l2_array_size; + +if (zeroed) { +temp_offset = VMDK_GTE_ZEROED; +} else { +temp_offset = m_data->offset; +} + +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; +l2_table_array = qemu_try_blockalign(extent->file->bs, + QEMU_ALIGN_UP(l2_array_size, + BDRV_SECTOR_SIZE)); +if (l2_table_array == NULL) { +return VMDK_ERROR; +} +memset(l2_table_array, 0, QEMU_ALIGN_UP(l2_array_size, BDRV_SECTOR_SIZE)); /* update L2 table */ +offset = temp_offset; +for (i = 0; i < m_data->nb_clusters; i++) { +l2_table_array[i] = cpu_to_le32(offset); +if (!zeroed) { +offset += extent->cluster_sectors; +} +} if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; if 
(bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } } + +offset = temp_offset; if (m_data->l2_cache_entry) { -*m_data->l2_cache_entry = offset; +for (i = 0; i < m_data->nb_clusters; i++) { +*m_data->l2_cache_entry = cpu_to_le32(offset); +m_data->l2_cache_entry++; + +if (!zeroed) { +offset += extent->cluster_sectors; +} +} } +qemu_vfree(l2_table_array); return VMDK_OK; } +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + bool zeroed) +{ +int ret; + +while (m_data->next != NULL) { + +ret = vmdk_alloc_cluster_link_l2(extent, m_data, zeroed); +if (ret < 0) { +return ret; +} + +m_data = m_data->next; + } + + return VMDK_OK; +} + /* * vmdk_l2load * @@ -1260,9 +1315,10 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, * * VMDK_ERROR:in error cases */ + static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint64_t offset, uint64_t *cluster_offset, - int64_t *bytes, VmdkMetaData *m_data, + int64_t *bytes, VmdkMetaData **m_data, bool allocate, uint32_t *alloc_clusters_counter) { int l1_index, l2_offset, l2_index; @@ -1271,6 +1327,7 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint32_t nb_clusters; bool zeroed = false; uint64_t skip_start_bytes, skip_end_bytes; +VmdkMetaData *old_m_data; int ret; ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, @@ -1331,13 +1388,21 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, if (ret < 0) { return ret; } -if (m_data) { -m_data->valid = 1;
[Qemu-block] [PATCH v8 6/8] vmdk: New functions to assist allocating multiple clusters
Introduce two new helper functions handle_alloc() and vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple clusters at once starting from a given offset on disk and performs COW if necessary for first and last allocated clusters. vmdk_alloc_cluster_offset() helps to return the offset of the first of the many newly allocated clusters. Also, provide proper documentation for both. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 201 --- 1 file changed, 191 insertions(+), 10 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index fe2046b..5f27dbb 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { unsigned int l2_offset; int valid; uint32_t *l2_cache_entry; +uint32_t nb_clusters; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1242,6 +1243,183 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, return VMDK_OK; } +/* + * vmdk_handle_alloc + * + * Allocate new clusters for an area that either is yet unallocated or needs a + * copy on write. + * + * Returns: + * VMDK_OK: if new clusters were allocated, *bytes may be decreased if + * the new allocation doesn't cover all of the requested area. + * *cluster_offset is updated to contain the offset of the + * first newly allocated cluster. + * + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is left + * unchanged. 
+ * + * VMDK_ERROR:in error cases + */ +static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, + uint64_t offset, uint64_t *cluster_offset, + int64_t *bytes, VmdkMetaData *m_data, + bool allocate, uint32_t *alloc_clusters_counter) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +uint32_t cluster_sector; +uint32_t nb_clusters; +bool zeroed = false; +uint64_t skip_start_bytes, skip_end_bytes; +int ret; + +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, +&l2_index, &l2_table); +if (ret < 0) { +return ret; +} + +cluster_sector = le32_to_cpu(l2_table[l2_index]); + +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); +/* Calculate the number of clusters to look for. Here we truncate the last + * cluster, i.e. 1 less than the actual value calculated as we may need to + * perform COW for the last one. */ +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, + extent->cluster_sectors << BDRV_SECTOR_BITS) - 1; + +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); +assert(nb_clusters <= INT_MAX); + +/* update bytes according to final nb_clusters value */ +if (nb_clusters != 0) { +*bytes = ((nb_clusters * extent->cluster_sectors) << BDRV_SECTOR_BITS) + - skip_start_bytes; +} else { +nb_clusters = 1; +} +*alloc_clusters_counter += nb_clusters; + +/* we need to use MIN() for basically 3 cases that arise : + * 1. alloc very first cluster : here skip_start_bytes >= 0 and + **bytes <= cluster_size. + * 2. alloc middle clusters : here *bytes is a perfect multiple of + *cluster_size and skip_start_bytes is 0. + * 3. alloc very last cluster : here *bytes <= cluster_size and + *skip_start_bytes is 0 + */ +skip_end_bytes = skip_start_bytes + MIN(*bytes, + extent->cluster_sectors * BDRV_SECTOR_SIZE +- skip_start_bytes); + +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { +zeroed = true; +} + +if (!cluster_sector || zeroed) { +if (!allocate) { +return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; +} + +cluster_sector = extent->next_cluster_sector; +extent->next_cluster_sector += extent->cluster_sectors +* nb_clusters; + +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, + offset, skip_start_bytes, + skip_end_bytes); +if (ret < 0) { +return ret; +} +if (m_data) { +m_data->valid = 1; +m_data->l1_index = l1_index; +m_data->l2_index = l2_index; +m_data->l2_offset = l2_offset; +m_data->l2_cache_entry = &l2_table[l2_index]; +m_data->nb_clusters = nb_clusters; +} +} +*cluster_offset = cluster_sector << BDRV_SECTOR_BITS; +return VMDK_OK; +} + +/* + * vmdk_alloc_clusters + * + * For a given offset on
[Qemu-block] [PATCH v8 8/8] vmdk: Make vmdk_get_cluster_offset() return cluster offset only
vmdk_alloc_clusters() introduced earlier now handles the task of allocating clusters and performing COW when needed. Thus we can change vmdk_get_cluster_offset() to stick to the sole purpose of returning cluster offset using sector number. Update the changes at all call sites. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 56 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 4a59ca4..a84b26c 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1494,25 +1494,16 @@ static int vmdk_alloc_clusters(BlockDriverState *bs, * For flat extents, the start offset as parsed from the description file is * returned. * - * For sparse extents, look up in L1, L2 table. If allocate is true, return an - * offset for a new cluster and update L2 cache. If there is a backing file, - * COW is done before returning; otherwise, zeroes are written to the allocated - * cluster. Both COW and zero writing skips the sector range - * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller - * has new data to write there. + * For sparse extents, look up the L1, L2 table. * * Returns: VMDK_OK if cluster exists and mapped in the image. - * VMDK_UNALLOC if cluster is not mapped and @allocate is false. - * VMDK_ERROR if failed. + * VMDK_UNALLOC if cluster is not mapped. + * VMDK_ERROR if failed */ static int vmdk_get_cluster_offset(BlockDriverState *bs, VmdkExtent *extent, - VmdkMetaData *m_data, uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) + uint64_t *cluster_offset) { int l1_index, l2_offset, l2_index; uint32_t *l2_table; @@ -1537,31 +1528,9 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, } if (!cluster_sector || zeroed) { -if (!allocate) { -return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; -} - -cluster_sector = extent->next_cluster_sector; -extent->next_cluster_sector += extent->cluster_sectors; - -/* First of all we write grain itself, to avoid race condition - * that may to corrupt the image. - * This problem may occur because of insufficient space on host disk - * or inappropriate VM shutdown. - */ -ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, -offset, skip_start_bytes, skip_end_bytes); -if (ret) { -return ret; -} -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_data->l2_index = l2_index; -m_data->l2_offset = l2_offset; -m_data->l2_cache_entry = &l2_table[l2_index]; -} +return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; } @@ -1604,9 +1573,7 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = vmdk_get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, sector_num * 512, &offset); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1797,13 +1764,14 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); +ret = vmdk_get_cluster_offset(bs, extent, offset, &cluster_offset); + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ if (bs->backing && ret != VMDK_ZEROED) { @@ -2550,9 +2518,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, +ret = vmdk_get_cluster_offset(bs, extent, sector_num << BDRV_SECTOR_BITS, - false, 
&clu
[Qemu-block] [PATCH v8 5/8] vmdk: Set maximum bytes allocated in one cycle
Set the maximum bytes allowed to get allocated at once to be not more than the extent size boundary to handle writes at two separate extents appropriately. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 5647f53..fe2046b 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1624,6 +1624,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t cluster_offset; uint64_t bytes_done = 0; VmdkMetaData m_data; +uint64_t extent_end; if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) { error_report("Wrong offset: offset=0x%" PRIx64 @@ -1637,9 +1638,17 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, if (!extent) { return -EIO; } +extent_end = extent->end_sector * BDRV_SECTOR_SIZE; + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - - offset_in_cluster); + +/* truncate n_bytes to first cluster because we need to perform COW */ +if (offset_in_cluster > 0) { +n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); +} else { +n_bytes = MIN(bytes, extent_end - offset); +} ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), -- 2.6.2
[Qemu-block] [PATCH v8 3/8] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
Rename the existing get_cluster_offset() to vmdk_get_cluster_offset() and update name in all the callers accordingly. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 46 +++--- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 73ae786..f403981 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1144,7 +1144,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, } /** - * get_cluster_offset + * vmdk_get_cluster_offset * * Look up cluster offset in extent file by sector number, and store in * @cluster_offset. @@ -1163,14 +1163,14 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, * VMDK_UNALLOC if cluster is not mapped and @allocate is false. * VMDK_ERROR if failed. */ -static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; @@ -1304,9 +1304,9 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num * 512, false, &offset, + 0, 0); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1497,8 +1497,8 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + offset, false, &cluster_offset, 0, 
0); offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE @@ -1584,10 +1584,10 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); -ret = get_cluster_offset(bs, extent, &m_data, offset, - !(extent->compressed || zeroed), - &cluster_offset, offset_in_cluster, - offset_in_cluster + n_bytes); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + !(extent->compressed || zeroed), + &cluster_offset, offset_in_cluster, + offset_in_cluster + n_bytes); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1596,8 +1596,8 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, return -EIO; } else { /* allocate */ -ret = get_cluster_offset(bs, extent, &m_data, offset, - true, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + true, &cluster_offset, 0, 0); } } if (ret == VMDK_ERROR) { @@ -2229,9 +2229,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = get_cluster_offset(bs, extent, NULL, - sector_num << BDRV_SECTOR_BITS, - false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, +
[Qemu-block] [PATCH v8 4/8] vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset()
Move the cluster tables loading code out of the existing vmdk_get_cluster_offset() function and implement it in separate get_cluster_table() and vmdk_l2load() functions. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 153 --- 1 file changed, 105 insertions(+), 48 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index f403981..5647f53 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1143,6 +1143,105 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, return VMDK_OK; } +/* + * vmdk_l2load + * + * Load a new L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns: + * VMDK_OK: on success + * VMDK_ERROR:in error cases + */ +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int l2_offset, + uint32_t **new_l2_table, int *new_l2_index) +{ +int min_index, i, j; +uint32_t *l2_table; +uint32_t min_count; + +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (l2_offset == extent->l2_cache_offsets[i]) { +/* increment the hit count */ +if (++extent->l2_cache_counts[i] == UINT32_MAX) { +for (j = 0; j < L2_CACHE_SIZE; j++) { +extent->l2_cache_counts[j] >>= 1; +} +} +l2_table = extent->l2_cache + (i * extent->l2_size); +goto found; +} +} +/* not found: load a new entry in the least used one */ +min_index = 0; +min_count = UINT32_MAX; +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (extent->l2_cache_counts[i] < min_count) { +min_count = extent->l2_cache_counts[i]; +min_index = i; +} +} +l2_table = extent->l2_cache + (min_index * extent->l2_size); +if (bdrv_pread(extent->file, +(int64_t)l2_offset * 512, +l2_table, +extent->l2_size * sizeof(uint32_t) +) != extent->l2_size * sizeof(uint32_t)) { +return VMDK_ERROR; +} + +extent->l2_cache_offsets[min_index] = l2_offset; +extent->l2_cache_counts[min_index] = 1; +found: +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; +*new_l2_table = l2_table; + +return VMDK_OK; +} + +/* + * 
get_cluster_table + * + * For a given offset, load (and allocate if needed) the l2 table. + * + * Returns: + * VMDK_OK:on success + * + * VMDK_UNALLOC: if cluster is not mapped + * + * VMDK_ERROR: in error cases + */ +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, + int *new_l1_index, int *new_l2_offset, + int *new_l2_index, uint32_t **new_l2_table) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +int ret; + +offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; +l1_index = (offset >> 9) / extent->l1_entry_sectors; +if (l1_index >= extent->l1_size) { +return VMDK_ERROR; +} +l2_offset = extent->l1_table[l1_index]; +if (!l2_offset) { +return VMDK_UNALLOC; +} + +ret = vmdk_l2load(extent, offset, l2_offset, &l2_table, &l2_index); +if (ret < 0) { +return ret; +} + +*new_l1_index = l1_index; +*new_l2_offset = l2_offset; +*new_l2_index = l2_index; +*new_l2_table = l2_table; + +return VMDK_OK; +} + /** * vmdk_get_cluster_offset * @@ -1172,66 +1271,24 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, uint64_t skip_start_bytes, uint64_t skip_end_bytes) { -unsigned int l1_index, l2_offset, l2_index; -int min_index, i, j; -uint32_t min_count, *l2_table; +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; bool zeroed = false; int64_t ret; int64_t cluster_sector; -if (m_data) { -m_data->valid = 0; -} if (extent->flat) { *cluster_offset = extent->flat_start_offset; return VMDK_OK; } -offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; -l1_index = (offset >> 9) / extent->l1_entry_sectors; -if (l1_index >= extent->l1_size) { -return VMDK_ERROR; -} -l2_offset = extent->l1_table[l1_index]; -if (!l2_offset) { -return VMDK_UNALLOC; -} -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (l2_offset == extent->l2_cache_offsets[i]) { -/* increment the hit count */ -if (++extent->l2_cache_counts[i] == 0xffffffff) { -for (j = 0; j < L2_CACHE_SIZE; j++) { -extent->l2_cache_counts[j] >>= 1; -} -
[Qemu-block] [PATCH v8 2/8] vmdk: Rename get_whole_cluster() to vmdk_perform_cow()
Rename the existing function get_whole_cluster() to vmdk_perform_cow() as its sole purpose is to perform COW for the first and the last allocated clusters if needed. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 23 ++- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 22be887..73ae786 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1028,8 +1028,8 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -/** - * get_whole_cluster +/* + * vmdk_perform_cow * * Copy backing file's cluster that covers @sector_num, otherwise write zero, * to the cluster at @cluster_sector_num. @@ -1037,13 +1037,18 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) * If @skip_start_sector < @skip_end_sector, the relative range * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave * it for call to write user data in the request. + * + * Returns: + * VMDK_OK: on success + * + * VMDK_ERROR:in error cases */ -static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_perform_cow(BlockDriverState *bs, +VmdkExtent *extent, +uint64_t cluster_offset, +uint64_t offset, +uint64_t skip_start_bytes, +uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; @@ -1244,7 +1249,7 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ -ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; -- 2.6.2
[Qemu-block] [PATCH v8 1/8] vmdk: Move vmdk_find_offset_in_cluster() to the top
Move the existing vmdk_find_offset_in_cluster() function to the top of the driver. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a9bd22b..22be887 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -242,6 +242,18 @@ static void vmdk_free_last_extent(BlockDriverState *bs) s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ +uint64_t extent_begin_offset, extent_relative_offset; +uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + +extent_begin_offset = +(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; +extent_relative_offset = offset - extent_begin_offset; +return extent_relative_offset % cluster_size; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1266,18 +1278,6 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, - int64_t offset) -{ -uint64_t extent_begin_offset, extent_relative_offset; -uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; - -extent_begin_offset = -(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; -extent_relative_offset = offset - extent_begin_offset; -return extent_relative_offset % cluster_size; -} - static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { -- 2.6.2
[Qemu-block] [PATCH v8 0/8] Optimize VMDK I/O by allocating multiple clusters
Previously posted series patches: v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html v3 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg00074.html v4 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg03851.html v5 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00929.html v6 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00947.html v7 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg06600.html This series helps to optimize the I/O performance of the VMDK driver. Patch 1 helps us to move vmdk_find_offset_in_cluster. Patch 2 & 3 perform simple function re-naming tasks. Patch 4 is used to factor out metadata loading code and implement it in separate functions. This will help us to avoid code duplication in future patches of this series. Patch 5 helps to set the upper limit of the bytes handled in one cycle. Patch 6 adds new functions to help us allocate multiple clusters according to the size requested, perform COW if required and return the offset of the first newly allocated cluster. Patch 7 changes the metadata update code to update the L2 tables for multiple clusters at once. 
Patch 8 helps us to finally change vmdk_get_cluster_offset() to find cluster offset only as cluster allocation task is now handled by vmdk_alloc_clusters() Optimization test results: This patch series improves 128 KB sequential write performance to an empty VMDK file by 54% Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f vmdk test.vmdk Changes in v8: - fix minor variable naming issue in patch 6 Changes in v7: - comment the use of MIN() in calculating skip_end_bytes - use extent->cluster_sectors instead of 128 - place check for m_data != NULL - use g_new0(VmdkMetaData, 1) instead of g_malloc0(sizeof(*m_data)) Changes in v6: - rename total_alloc_clusters as alloc_clusters_counter (fam) Changes in v5: - fix commit message and comment in patch 4 (fam) - add vmdk_ prefix to handle_alloc() (fam) - fix alignment issue in patch 6 (fam) - use BDRV_SECTOR_BITS (fam) - fix endianness calculation in patch 7 (fam) Changes in v4: - fix commit message in patch 1 (fam) - drop size_to_clusters() function (fam) - fix grammatical errors in function documentations (fam) - factor out metadata loading coding in a separate patch (patch 4) (fam) - rename vmdk_alloc_cluster_offset() to vmdk_alloc_clusters() (fam) - break patch 4(in v3) into separate patches (patch 3 and 8) (fam) - rename extent_size to extent_end (fam) - use QEMU_ALIGN_UP instead of vmdk_align_offset. 
(fam) - drop next and simply do m_data = m_data->next (fam) Changes in v3: - move size_to_clusters() from patch 1 to 3 (fam) - use DIV_ROUND_UP in size_to_clusters (fam) - make patch 2 compilable (fam) - rename vmdk_L2update as vmdk_l2update and use UINT32_MAX (fam) - combine patch 3 and patch 4 (as in v2) to make them compilable (fam) - call bdrv_pwrite_sync() for batches of atmost 512 clusters at once (fam) Changes in v2: - segregate the ugly Patch 1 in v1 into 6 readable and sensible patches - include benchmark test results in v2 Ashijeet Acharya (8): vmdk: Move vmdk_find_offset_in_cluster() to the top vmdk: Rename get_whole_cluster() to vmdk_perform_cow() vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset() vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset() vmdk: Set maximum bytes allocated in one cycle vmdk: New functions to assist allocating multiple clusters vmdk: Update metadata for multiple clusters vmdk: Make vmdk_get_cluster_offset() return cluster offset only block/vmdk.c | 538 +-- 1 file changed, 416 insertions(+), 122 deletions(-) -- 2.6.2
[Qemu-block] [PATCH v7 8/8] vmdk: Make vmdk_get_cluster_offset() return cluster offset only
vmdk_alloc_clusters() introduced earlier now handles the task of allocating clusters and performing COW when needed. Thus we can change vmdk_get_cluster_offset() to stick to the sole purpose of returning cluster offset using sector number. Update the changes at all call sites. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 56 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 60b8adc..d41fde9 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1493,25 +1493,16 @@ static int vmdk_alloc_clusters(BlockDriverState *bs, * For flat extents, the start offset as parsed from the description file is * returned. * - * For sparse extents, look up in L1, L2 table. If allocate is true, return an - * offset for a new cluster and update L2 cache. If there is a backing file, - * COW is done before returning; otherwise, zeroes are written to the allocated - * cluster. Both COW and zero writing skips the sector range - * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller - * has new data to write there. + * For sparse extents, look up the L1, L2 table. * * Returns: VMDK_OK if cluster exists and mapped in the image. - * VMDK_UNALLOC if cluster is not mapped and @allocate is false. - * VMDK_ERROR if failed. + * VMDK_UNALLOC if cluster is not mapped. + * VMDK_ERROR if failed */ static int vmdk_get_cluster_offset(BlockDriverState *bs, VmdkExtent *extent, - VmdkMetaData *m_data, uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) + uint64_t *cluster_offset) { int l1_index, l2_offset, l2_index; uint32_t *l2_table; @@ -1536,31 +1527,9 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, } if (!cluster_sector || zeroed) { -if (!allocate) { -return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; -} - -cluster_sector = extent->next_cluster_sector; -extent->next_cluster_sector += extent->cluster_sectors; - -/* First of all we write grain itself, to avoid race condition - * that may to corrupt the image. - * This problem may occur because of insufficient space on host disk - * or inappropriate VM shutdown. - */ -ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, -offset, skip_start_bytes, skip_end_bytes); -if (ret) { -return ret; -} -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_data->l2_index = l2_index; -m_data->l2_offset = l2_offset; -m_data->l2_cache_entry = &l2_table[l2_index]; -} +return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; } @@ -1603,9 +1572,7 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = vmdk_get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, sector_num * 512, &offset); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1796,13 +1763,14 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); +ret = vmdk_get_cluster_offset(bs, extent, offset, &cluster_offset); + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ if (bs->backing && ret != VMDK_ZEROED) { @@ -2549,9 +2517,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, +ret = vmdk_get_cluster_offset(bs, extent, sector_num << BDRV_SECTOR_BITS, - false, 
&clu
[Qemu-block] [PATCH v7 7/8] vmdk: Update metadata for multiple clusters
Include a next pointer in VmdkMetaData struct to point to the previous allocated L2 table. Modify vmdk_L2update to start updating metadata for allocation of multiple clusters at once. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 128 ++- 1 file changed, 101 insertions(+), 27 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 277db16..60b8adc 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { int valid; uint32_t *l2_cache_entry; uint32_t nb_clusters; +uint32_t offset; +struct VmdkMetaData *next; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1116,34 +1118,87 @@ exit: return ret; } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, - uint32_t offset) +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, + VmdkMetaData *m_data, bool zeroed) { -offset = cpu_to_le32(offset); +int i; +uint32_t offset, temp_offset; +int *l2_table_array; +int l2_array_size; + +if (zeroed) { +temp_offset = VMDK_GTE_ZEROED; +} else { +temp_offset = m_data->offset; +} + +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; +l2_table_array = qemu_try_blockalign(extent->file->bs, + QEMU_ALIGN_UP(l2_array_size, + BDRV_SECTOR_SIZE)); +if (l2_table_array == NULL) { +return VMDK_ERROR; +} +memset(l2_table_array, 0, QEMU_ALIGN_UP(l2_array_size, BDRV_SECTOR_SIZE)); /* update L2 table */ +offset = temp_offset; +for (i = 0; i < m_data->nb_clusters; i++) { +l2_table_array[i] = cpu_to_le32(offset); +if (!zeroed) { +offset += extent->cluster_sectors; +} +} if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; if (bdrv_pwrite_sync(extent->file, 
-((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } } + +offset = temp_offset; if (m_data->l2_cache_entry) { -*m_data->l2_cache_entry = offset; +for (i = 0; i < m_data->nb_clusters; i++) { +*m_data->l2_cache_entry = cpu_to_le32(offset); +m_data->l2_cache_entry++; + +if (!zeroed) { +offset += extent->cluster_sectors; +} +} } +qemu_vfree(l2_table_array); return VMDK_OK; } +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + bool zeroed) +{ +int ret; + +while (m_data->next != NULL) { + +ret = vmdk_alloc_cluster_link_l2(extent, m_data, zeroed); +if (ret < 0) { +return ret; +} + +m_data = m_data->next; + } + + return VMDK_OK; +} + /* * vmdk_l2load * @@ -1260,9 +1315,10 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, * * VMDK_ERROR:in error cases */ + static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint64_t offset, uint64_t *cluster_offset, - int64_t *bytes, VmdkMetaData *m_data, + int64_t *bytes, VmdkMetaData **m_data, bool allocate, uint32_t *total_alloc_clusters) { int l1_index, l2_offset, l2_index; @@ -1271,6 +1327,7 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint32_t nb_clusters; bool zeroed = false; uint64_t skip_start_bytes, skip_end_bytes; +VmdkMetaData *old_m_data; int ret; ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, @@ -1331,13 +1388,21 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, if (ret < 0) { return ret; } -if (m_data) { -m_data->valid = 1; -m_data-
[Qemu-block] [PATCH v7 6/8] vmdk: New functions to assist allocating multiple clusters
Introduce two new helper functions handle_alloc() and vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple clusters at once starting from a given offset on disk and performs COW if necessary for first and last allocated clusters. vmdk_alloc_cluster_offset() helps to return the offset of the first of the many newly allocated clusters. Also, provide proper documentation for both. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 200 --- 1 file changed, 190 insertions(+), 10 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index fe2046b..277db16 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { unsigned int l2_offset; int valid; uint32_t *l2_cache_entry; +uint32_t nb_clusters; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1242,6 +1243,182 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, return VMDK_OK; } +/* + * vmdk_handle_alloc + * + * Allocate new clusters for an area that either is yet unallocated or needs a + * copy on write. + * + * Returns: + * VMDK_OK: if new clusters were allocated, *bytes may be decreased if + * the new allocation doesn't cover all of the requested area. + * *cluster_offset is updated to contain the offset of the + * first newly allocated cluster. + * + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is left + * unchanged. 
+ * + * VMDK_ERROR:in error cases + */ +static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, + uint64_t offset, uint64_t *cluster_offset, + int64_t *bytes, VmdkMetaData *m_data, + bool allocate, uint32_t *total_alloc_clusters) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +uint32_t cluster_sector; +uint32_t nb_clusters; +bool zeroed = false; +uint64_t skip_start_bytes, skip_end_bytes; +int ret; + +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, +&l2_index, &l2_table); +if (ret < 0) { +return ret; +} + +cluster_sector = le32_to_cpu(l2_table[l2_index]); + +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); +/* Calculate the number of clusters to look for. Here we truncate the last + * cluster, i.e. 1 less than the actual value calculated as we may need to + * perform COW for the last one. */ +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, + extent->cluster_sectors << BDRV_SECTOR_BITS) - 1; + +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); +assert(nb_clusters <= INT_MAX); + +/* update bytes according to final nb_clusters value */ +if (nb_clusters != 0) { +*bytes = ((nb_clusters * extent->cluster_sectors) << BDRV_SECTOR_BITS) + - skip_start_bytes; +} else { +nb_clusters = 1; +} +*total_alloc_clusters += nb_clusters; + +/* we need to use MIN() for basically 3 cases that arise : + * 1. alloc very first cluster : here skip_start_bytes >= 0 and + **bytes <= cluster_size. + * 2. alloc middle clusters : here *bytes is a perfect multiple of + *cluster_size and skip_start_bytes is 0. + * 3. alloc very last cluster : here *bytes <= cluster_size and + *skip_start_bytes is 0 + */ +skip_end_bytes = skip_start_bytes + MIN(*bytes, + extent->cluster_sectors * BDRV_SECTOR_SIZE +- skip_start_bytes); + +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { +zeroed = true; +} + +if (!cluster_sector || zeroed) { +if (!allocate) { +return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; +} + +cluster_sector = extent->next_cluster_sector; +extent->next_cluster_sector += extent->cluster_sectors +* nb_clusters; + +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, + offset, skip_start_bytes, + skip_end_bytes); +if (ret < 0) { +return ret; +} +if (m_data) { +m_data->valid = 1; +m_data->l1_index = l1_index; +m_data->l2_index = l2_index; +m_data->l2_offset = l2_offset; +m_data->l2_cache_entry = &l2_table[l2_index]; +m_data->nb_clusters = nb_clusters; +} +} +*cluster_offset = cluster_sector << BDRV_SECTOR_BITS; +return VMDK_OK; +} + +/* + * vmdk_alloc_clusters + * + * For a given offset on the virtual disk, find the clust
[Qemu-block] [PATCH v7 5/8] vmdk: Set maximum bytes allocated in one cycle
Set the maximum bytes allowed to get allocated at once to be not more than the extent size boundary to handle writes at two separate extents appropriately. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 5647f53..fe2046b 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1624,6 +1624,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t cluster_offset; uint64_t bytes_done = 0; VmdkMetaData m_data; +uint64_t extent_end; if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) { error_report("Wrong offset: offset=0x%" PRIx64 @@ -1637,9 +1638,17 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, if (!extent) { return -EIO; } +extent_end = extent->end_sector * BDRV_SECTOR_SIZE; + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - - offset_in_cluster); + +/* truncate n_bytes to first cluster because we need to perform COW */ +if (offset_in_cluster > 0) { +n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); +} else { +n_bytes = MIN(bytes, extent_end - offset); +} ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), -- 2.6.2
[Qemu-block] [PATCH v7 3/8] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
Rename the existing get_cluster_offset() to vmdk_get_cluster_offset() and update name in all the callers accordingly. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 46 +++--- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 73ae786..f403981 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1144,7 +1144,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, } /** - * get_cluster_offset + * vmdk_get_cluster_offset * * Look up cluster offset in extent file by sector number, and store in * @cluster_offset. @@ -1163,14 +1163,14 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, * VMDK_UNALLOC if cluster is not mapped and @allocate is false. * VMDK_ERROR if failed. */ -static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; @@ -1304,9 +1304,9 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num * 512, false, &offset, + 0, 0); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1497,8 +1497,8 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + offset, false, &cluster_offset, 0, 
0); offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE @@ -1584,10 +1584,10 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); -ret = get_cluster_offset(bs, extent, &m_data, offset, - !(extent->compressed || zeroed), - &cluster_offset, offset_in_cluster, - offset_in_cluster + n_bytes); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + !(extent->compressed || zeroed), + &cluster_offset, offset_in_cluster, + offset_in_cluster + n_bytes); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1596,8 +1596,8 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, return -EIO; } else { /* allocate */ -ret = get_cluster_offset(bs, extent, &m_data, offset, - true, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + true, &cluster_offset, 0, 0); } } if (ret == VMDK_ERROR) { @@ -2229,9 +2229,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = get_cluster_offset(bs, extent, NULL, - sector_num << BDRV_SECTOR_BITS, - false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, +
[Qemu-block] [PATCH v7 2/8] vmdk: Rename get_whole_cluster() to vmdk_perform_cow()
Rename the existing function get_whole_cluster() to vmdk_perform_cow() as its sole purpose is to perform COW for the first and the last allocated clusters if needed. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 23 ++- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 22be887..73ae786 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1028,8 +1028,8 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -/** - * get_whole_cluster +/* + * vmdk_perform_cow * * Copy backing file's cluster that covers @sector_num, otherwise write zero, * to the cluster at @cluster_sector_num. @@ -1037,13 +1037,18 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) * If @skip_start_sector < @skip_end_sector, the relative range * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave * it for call to write user data in the request. + * + * Returns: + * VMDK_OK: on success + * + * VMDK_ERROR:in error cases */ -static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_perform_cow(BlockDriverState *bs, +VmdkExtent *extent, +uint64_t cluster_offset, +uint64_t offset, +uint64_t skip_start_bytes, +uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; @@ -1244,7 +1249,7 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ -ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; -- 2.6.2
[Qemu-block] [PATCH v7 1/8] vmdk: Move vmdk_find_offset_in_cluster() to the top
Move the existing vmdk_find_offset_in_cluster() function to the top of the driver. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a9bd22b..22be887 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -242,6 +242,18 @@ static void vmdk_free_last_extent(BlockDriverState *bs) s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ +uint64_t extent_begin_offset, extent_relative_offset; +uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + +extent_begin_offset = +(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; +extent_relative_offset = offset - extent_begin_offset; +return extent_relative_offset % cluster_size; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1266,18 +1278,6 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, - int64_t offset) -{ -uint64_t extent_begin_offset, extent_relative_offset; -uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; - -extent_begin_offset = -(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; -extent_relative_offset = offset - extent_begin_offset; -return extent_relative_offset % cluster_size; -} - static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { -- 2.6.2
[Qemu-block] [PATCH v7 4/8] vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset()
Move the cluster tables loading code out of the existing vmdk_get_cluster_offset() function and implement it in separate get_cluster_table() and vmdk_l2load() functions. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 153 --- 1 file changed, 105 insertions(+), 48 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index f403981..5647f53 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1143,6 +1143,105 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, return VMDK_OK; } +/* + * vmdk_l2load + * + * Load a new L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns: + * VMDK_OK: on success + * VMDK_ERROR:in error cases + */ +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int l2_offset, + uint32_t **new_l2_table, int *new_l2_index) +{ +int min_index, i, j; +uint32_t *l2_table; +uint32_t min_count; + +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (l2_offset == extent->l2_cache_offsets[i]) { +/* increment the hit count */ +if (++extent->l2_cache_counts[i] == UINT32_MAX) { +for (j = 0; j < L2_CACHE_SIZE; j++) { +extent->l2_cache_counts[j] >>= 1; +} +} +l2_table = extent->l2_cache + (i * extent->l2_size); +goto found; +} +} +/* not found: load a new entry in the least used one */ +min_index = 0; +min_count = UINT32_MAX; +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (extent->l2_cache_counts[i] < min_count) { +min_count = extent->l2_cache_counts[i]; +min_index = i; +} +} +l2_table = extent->l2_cache + (min_index * extent->l2_size); +if (bdrv_pread(extent->file, +(int64_t)l2_offset * 512, +l2_table, +extent->l2_size * sizeof(uint32_t) +) != extent->l2_size * sizeof(uint32_t)) { +return VMDK_ERROR; +} + +extent->l2_cache_offsets[min_index] = l2_offset; +extent->l2_cache_counts[min_index] = 1; +found: +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; +*new_l2_table = l2_table; + +return VMDK_OK; +} + +/* + * 
get_cluster_table + * + * For a given offset, load (and allocate if needed) the l2 table. + * + * Returns: + * VMDK_OK:on success + * + * VMDK_UNALLOC: if cluster is not mapped + * + * VMDK_ERROR: in error cases + */ +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, + int *new_l1_index, int *new_l2_offset, + int *new_l2_index, uint32_t **new_l2_table) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +int ret; + +offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; +l1_index = (offset >> 9) / extent->l1_entry_sectors; +if (l1_index >= extent->l1_size) { +return VMDK_ERROR; +} +l2_offset = extent->l1_table[l1_index]; +if (!l2_offset) { +return VMDK_UNALLOC; +} + +ret = vmdk_l2load(extent, offset, l2_offset, &l2_table, &l2_index); +if (ret < 0) { +return ret; +} + +*new_l1_index = l1_index; +*new_l2_offset = l2_offset; +*new_l2_index = l2_index; +*new_l2_table = l2_table; + +return VMDK_OK; +} + /** * vmdk_get_cluster_offset * @@ -1172,66 +1271,24 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, uint64_t skip_start_bytes, uint64_t skip_end_bytes) { -unsigned int l1_index, l2_offset, l2_index; -int min_index, i, j; -uint32_t min_count, *l2_table; +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; bool zeroed = false; int64_t ret; int64_t cluster_sector; -if (m_data) { -m_data->valid = 0; -} if (extent->flat) { *cluster_offset = extent->flat_start_offset; return VMDK_OK; } -offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; -l1_index = (offset >> 9) / extent->l1_entry_sectors; -if (l1_index >= extent->l1_size) { -return VMDK_ERROR; -} -l2_offset = extent->l1_table[l1_index]; -if (!l2_offset) { -return VMDK_UNALLOC; -} -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (l2_offset == extent->l2_cache_offsets[i]) { -/* increment the hit count */ -if (++extent->l2_cache_counts[i] == 0x) { -for (j = 0; j < L2_CACHE_SIZE; j++) { -extent->l2_cache_counts[j] >>= 1; -} -
[Qemu-block] [PATCH v7 0/8] Optimize VMDK I/O by allocating multiple clusters
Previously posted series patches: v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html v3 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg00074.html v4 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg03851.html v5 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00929.html v6 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00947.html This series helps to optimize the I/O performance of the VMDK driver. Patch 1 helps us to move vmdk_find_offset_in_cluster. Patches 2 & 3 perform simple function re-naming tasks. Patch 4 is used to factor out metadata loading code and implement it in separate functions. This will help us to avoid code duplication in future patches of this series. Patch 5 helps to set the upper limit of the bytes handled in one cycle. Patch 6 adds new functions to help us allocate multiple clusters according to the size requested, perform COW if required and return the offset of the first newly allocated cluster. Patch 7 changes the metadata update code to update the L2 tables for multiple clusters at once. 
Patch 8 helps us to finally change vmdk_get_cluster_offset() to find cluster offset only as cluster allocation task is now handled by vmdk_alloc_clusters() Optimization test results: This patch series improves 128 KB sequential write performance to an empty VMDK file by 54% Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f vmdk test.vmdk Changes in v7: - comment the use of MIN() in calculating skip_end_bytes - use extent->cluster_sectors instead of 128 - place check for m_data != NULL - use g_new0(VmdkMetaData, 1) instead of g_malloc0(sizeof(*m_data)) Changes in v6: - rename total_alloc_clusters as alloc_clusters_counter (fam) Changes in v5: - fix commit message and comment in patch 4 (fam) - add vmdk_ prefix to handle_alloc() (fam) - fix alignment issue in patch 6 (fam) - use BDRV_SECTOR_BITS (fam) - fix endianness calculation in patch 7 (fam) Changes in v4: - fix commit message in patch 1 (fam) - drop size_to_clusters() function (fam) - fix grammatical errors in function documentations (fam) - factor out metadata loading coding in a separate patch (patch 4) (fam) - rename vmdk_alloc_cluster_offset() to vmdk_alloc_clusters() (fam) - break patch 4(in v3) into separate patches (patch 3 and 8) (fam) - rename extent_size to extent_end (fam) - use QEMU_ALIGN_UP instead of vmdk_align_offset. 
(fam) - drop next and simply do m_data = m_data->next (fam) Changes in v3: - move size_to_clusters() from patch 1 to 3 (fam) - use DIV_ROUND_UP in size_to_clusters (fam) - make patch 2 compilable (fam) - rename vmdk_L2update as vmdk_l2update and use UINT32_MAX (fam) - combine patch 3 and patch 4 (as in v2) to make them compilable (fam) - call bdrv_pwrite_sync() for batches of at most 512 clusters at once (fam) Changes in v2: - segregate the ugly Patch 1 in v1 into 6 readable and sensible patches - include benchmark test results in v2 Ashijeet Acharya (8): vmdk: Move vmdk_find_offset_in_cluster() to the top vmdk: Rename get_whole_cluster() to vmdk_perform_cow() vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset() vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset() vmdk: Set maximum bytes allocated in one cycle vmdk: New functions to assist allocating multiple clusters vmdk: Update metadata for multiple clusters vmdk: Make vmdk_get_cluster_offset() return cluster offset only block/vmdk.c | 537 +-- 1 file changed, 415 insertions(+), 122 deletions(-) -- 2.6.2
Re: [Qemu-block] [PATCH v6 7/8] vmdk: Update metadata for multiple clusters
On Tue, Jun 27, 2017 at 1:34 PM, Fam Zheng wrote: > On Mon, 06/05 13:22, Ashijeet Acharya wrote: >> @@ -1876,6 +1942,13 @@ static int vmdk_pwritev(BlockDriverState *bs, >> uint64_t offset, >> offset += n_bytes; >> bytes_done += n_bytes; >> >> +while (m_data->next != NULL) { > > If you do > >while (m_data) { > >> +VmdkMetaData *next; >> +next = m_data->next; >> +g_free(m_data); >> +m_data = next; >> +} >> + >> /* update CID on the first write every time the virtual disk is >> * opened */ >> if (!s->cid_updated) { >> @@ -1886,6 +1959,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t >> offset, >> s->cid_updated = true; >> } >> } >> +g_free(m_data); > > then you can remove this line. As I explained last time, I can't do this because I am reusing the first allocated m_data. If I am to do it the way you suggest, I will have to move the allocation of first m_data (m_data = g_new0(VmdkMetaData, 1)) inside the outer while loop, otherwise things will segfault. Ashijeet
Re: [Qemu-block] [PATCH v6 6/8] vmdk: New functions to assist allocating multiple clusters
On Tue, Jun 27, 2017 at 1:32 PM, Fam Zheng wrote: > On Mon, 06/05 13:22, Ashijeet Acharya wrote: >> +/* >> + * vmdk_handle_alloc >> + * >> + * Allocate new clusters for an area that either is yet unallocated or >> needs a >> + * copy on write. If *cluster_offset is non_zero, clusters are only >> allocated if >> + * the new allocation can match the specified host offset. > > I don't think this matches the function body, the passed in *cluster_offset > value is ignored. > >> + * >> + * Returns: >> + * VMDK_OK: if new clusters were allocated, *bytes may be decreased >> if >> + * the new allocation doesn't cover all of the requested >> area. >> + * *cluster_offset is updated to contain the offset of the >> + * first newly allocated cluster. >> + * >> + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is >> left >> + * unchanged. >> + * >> + * VMDK_ERROR:in error cases >> + */ >> +static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, >> + uint64_t offset, uint64_t *cluster_offset, >> + int64_t *bytes, VmdkMetaData *m_data, >> + bool allocate, uint32_t >> *alloc_clusters_counter) >> +{ >> +int l1_index, l2_offset, l2_index; >> +uint32_t *l2_table; >> +uint32_t cluster_sector; >> +uint32_t nb_clusters; >> +bool zeroed = false; >> +uint64_t skip_start_bytes, skip_end_bytes; >> +int ret; >> + >> +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, >> +&l2_index, &l2_table); >> +if (ret < 0) { >> +return ret; >> +} >> + >> +cluster_sector = le32_to_cpu(l2_table[l2_index]); >> + >> +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); >> +/* Calculate the number of clusters to look for. Here we truncate the >> last >> + * cluster, i.e. 1 less than the actual value calculated as we may need >> to >> + * perform COW for the last one. 
*/ >> +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, >> + extent->cluster_sectors << BDRV_SECTOR_BITS) >> - 1; >> + >> +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); >> +assert(nb_clusters <= INT_MAX); >> + >> +/* update bytes according to final nb_clusters value */ >> +if (nb_clusters != 0) { >> +*bytes = ((nb_clusters * extent->cluster_sectors) << >> BDRV_SECTOR_BITS) >> + - skip_start_bytes; >> +} else { >> +nb_clusters = 1; >> +} >> +*alloc_clusters_counter += nb_clusters; >> +skip_end_bytes = skip_start_bytes + MIN(*bytes, >> + extent->cluster_sectors * BDRV_SECTOR_SIZE >> +- skip_start_bytes); > > I don't understand the MIN part, shouldn't skip_end_bytes simply be > skip_start_bytes + *bytes? > >> + >> +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { >> +zeroed = true; >> +} >> + >> +if (!cluster_sector || zeroed) { >> +if (!allocate) { >> +return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; >> +} >> + >> +cluster_sector = extent->next_cluster_sector; >> +extent->next_cluster_sector += extent->cluster_sectors >> +* nb_clusters; >> + >> +ret = vmdk_perform_cow(bs, extent, cluster_sector * >> BDRV_SECTOR_SIZE, >> + offset, skip_start_bytes, >> + skip_end_bytes); >> +if (ret < 0) { >> +return ret; >> +} >> +if (m_data) { >> +m_data->valid = 1; >> +m_data->l1_index = l1_index; >> +m_data->l2_index = l2_index; >> +m_data->l2_offset = l2_offset; >> +m_data->l2_cache_entry = &l2_table[l2_index]; >> +m_data->nb_clusters = nb_clusters; >> +} >> +} >> +*cluster_offset = cluster_sector << BDRV_SECTOR_BITS; >> +return VMDK_OK; >> +} >> + >> +/* >> + * vmdk_alloc_clusters >> + * >> + * For a given offs
[Qemu-block] [PATCH v6 8/8] vmdk: Make vmdk_get_cluster_offset() return cluster offset only
vmdk_alloc_clusters() introduced earlier now handles the task of allocating clusters and performing COW when needed. Thus we can change vmdk_get_cluster_offset() to stick to the sole purpose of returning cluster offset using sector number. Update the changes at all call sites. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 56 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 9fa2414..accf1c3 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1485,25 +1485,16 @@ static int vmdk_alloc_clusters(BlockDriverState *bs, * For flat extents, the start offset as parsed from the description file is * returned. * - * For sparse extents, look up in L1, L2 table. If allocate is true, return an - * offset for a new cluster and update L2 cache. If there is a backing file, - * COW is done before returning; otherwise, zeroes are written to the allocated - * cluster. Both COW and zero writing skips the sector range - * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller - * has new data to write there. + * For sparse extents, look up the L1, L2 table. * * Returns: VMDK_OK if cluster exists and mapped in the image. - * VMDK_UNALLOC if cluster is not mapped and @allocate is false. - * VMDK_ERROR if failed. + * VMDK_UNALLOC if cluster is not mapped. + * VMDK_ERROR if failed */ static int vmdk_get_cluster_offset(BlockDriverState *bs, VmdkExtent *extent, - VmdkMetaData *m_data, uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) + uint64_t *cluster_offset) { int l1_index, l2_offset, l2_index; uint32_t *l2_table; @@ -1528,31 +1519,9 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, } if (!cluster_sector || zeroed) { -if (!allocate) { -return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; -} - -cluster_sector = extent->next_cluster_sector; -extent->next_cluster_sector += extent->cluster_sectors; - -/* First of all we write grain itself, to avoid race condition - * that may to corrupt the image. - * This problem may occur because of insufficient space on host disk - * or inappropriate VM shutdown. - */ -ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, -offset, skip_start_bytes, skip_end_bytes); -if (ret) { -return ret; -} -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_data->l2_index = l2_index; -m_data->l2_offset = l2_offset; -m_data->l2_cache_entry = &l2_table[l2_index]; -} +return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; } @@ -1595,9 +1564,7 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = vmdk_get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, sector_num * 512, &offset); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1788,13 +1755,14 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); +ret = vmdk_get_cluster_offset(bs, extent, offset, &cluster_offset); + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ if (bs->backing && ret != VMDK_ZEROED) { @@ -2541,9 +2509,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, +ret = vmdk_get_cluster_offset(bs, extent, sector_num << BDRV_SECTOR_BITS, - false, 
&cluster_offset, 0, 0);
[Qemu-block] [PATCH v6 7/8] vmdk: Update metadata for multiple clusters
Include a next pointer in VmdkMetaData struct to point to the previous allocated L2 table. Modify vmdk_L2update to start updating metadata for allocation of multiple clusters at once. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 128 ++- 1 file changed, 101 insertions(+), 27 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index b671dc9..9fa2414 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { int valid; uint32_t *l2_cache_entry; uint32_t nb_clusters; +uint32_t offset; +struct VmdkMetaData *next; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1116,34 +1118,87 @@ exit: return ret; } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, - uint32_t offset) +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, + VmdkMetaData *m_data, bool zeroed) { -offset = cpu_to_le32(offset); +int i; +uint32_t offset, temp_offset; +int *l2_table_array; +int l2_array_size; + +if (zeroed) { +temp_offset = VMDK_GTE_ZEROED; +} else { +temp_offset = m_data->offset; +} + +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; +l2_table_array = qemu_try_blockalign(extent->file->bs, + QEMU_ALIGN_UP(l2_array_size, + BDRV_SECTOR_SIZE)); +if (l2_table_array == NULL) { +return VMDK_ERROR; +} +memset(l2_table_array, 0, QEMU_ALIGN_UP(l2_array_size, BDRV_SECTOR_SIZE)); /* update L2 table */ +offset = temp_offset; +for (i = 0; i < m_data->nb_clusters; i++) { +l2_table_array[i] = cpu_to_le32(offset); +if (!zeroed) { +offset += 128; +} +} if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) 
-+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } } + +offset = temp_offset; if (m_data->l2_cache_entry) { -*m_data->l2_cache_entry = offset; +for (i = 0; i < m_data->nb_clusters; i++) { +*m_data->l2_cache_entry = cpu_to_le32(offset); +m_data->l2_cache_entry++; + +if (!zeroed) { +offset += 128; +} +} } +qemu_vfree(l2_table_array); return VMDK_OK; } +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + bool zeroed) +{ +int ret; + +while (m_data->next != NULL) { + +ret = vmdk_alloc_cluster_link_l2(extent, m_data, zeroed); +if (ret < 0) { +return ret; +} + +m_data = m_data->next; + } + + return VMDK_OK; +} + /* * vmdk_l2load * @@ -1261,9 +1316,10 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, * * VMDK_ERROR:in error cases */ + static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint64_t offset, uint64_t *cluster_offset, - int64_t *bytes, VmdkMetaData *m_data, + int64_t *bytes, VmdkMetaData **m_data, bool allocate, uint32_t *alloc_clusters_counter) { int l1_index, l2_offset, l2_index; @@ -1272,6 +1328,7 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint32_t nb_clusters; bool zeroed = false; uint64_t skip_start_bytes, skip_end_bytes; +VmdkMetaData *old_m_data; int ret; ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, @@ -1323,13 +1380,21 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, if (ret < 0) { return ret; } -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_dat
[Qemu-block] [PATCH v6 6/8] vmdk: New functions to assist allocating multiple clusters
Introduce two new helper functions handle_alloc() and vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple clusters at once starting from a given offset on disk and performs COW if necessary for first and last allocated clusters. vmdk_alloc_cluster_offset() helps to return the offset of the first of the many newly allocated clusters. Also, provide proper documentation for both. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 192 +++ 1 file changed, 182 insertions(+), 10 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index fe2046b..b671dc9 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { unsigned int l2_offset; int valid; uint32_t *l2_cache_entry; +uint32_t nb_clusters; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1242,6 +1243,174 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, return VMDK_OK; } +/* + * vmdk_handle_alloc + * + * Allocate new clusters for an area that either is yet unallocated or needs a + * copy on write. If *cluster_offset is non_zero, clusters are only allocated if + * the new allocation can match the specified host offset. + * + * Returns: + * VMDK_OK: if new clusters were allocated, *bytes may be decreased if + * the new allocation doesn't cover all of the requested area. + * *cluster_offset is updated to contain the offset of the + * first newly allocated cluster. + * + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is left + * unchanged. 
+ * + * VMDK_ERROR:in error cases + */ +static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, + uint64_t offset, uint64_t *cluster_offset, + int64_t *bytes, VmdkMetaData *m_data, + bool allocate, uint32_t *alloc_clusters_counter) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +uint32_t cluster_sector; +uint32_t nb_clusters; +bool zeroed = false; +uint64_t skip_start_bytes, skip_end_bytes; +int ret; + +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, +&l2_index, &l2_table); +if (ret < 0) { +return ret; +} + +cluster_sector = le32_to_cpu(l2_table[l2_index]); + +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); +/* Calculate the number of clusters to look for. Here we truncate the last + * cluster, i.e. 1 less than the actual value calculated as we may need to + * perform COW for the last one. */ +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, + extent->cluster_sectors << BDRV_SECTOR_BITS) - 1; + +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); +assert(nb_clusters <= INT_MAX); + +/* update bytes according to final nb_clusters value */ +if (nb_clusters != 0) { +*bytes = ((nb_clusters * extent->cluster_sectors) << BDRV_SECTOR_BITS) + - skip_start_bytes; +} else { +nb_clusters = 1; +} +*alloc_clusters_counter += nb_clusters; +skip_end_bytes = skip_start_bytes + MIN(*bytes, + extent->cluster_sectors * BDRV_SECTOR_SIZE +- skip_start_bytes); + +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { +zeroed = true; +} + +if (!cluster_sector || zeroed) { +if (!allocate) { +return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; +} + +cluster_sector = extent->next_cluster_sector; +extent->next_cluster_sector += extent->cluster_sectors +* nb_clusters; + +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, + offset, skip_start_bytes, + skip_end_bytes); +if (ret < 0) { +return ret; +} +if (m_data) { +m_data->valid = 1; +m_data->l1_index = l1_index; +m_data->l2_index = l2_index; +m_data->l2_offset = l2_offset; +m_data->l2_cache_entry = &l2_table[l2_index]; +m_data->nb_clusters = nb_clusters; +} +} +*cluster_offset = cluster_sector << BDRV_SECTOR_BITS; +return VMDK_OK; +} + +/* + * vmdk_alloc_clusters + * + * For a given offset on the virtual disk, find the cluster offset in vmdk + * file. If the offset is not found, allocate a new cluster. + * + * If the cluster is newly allocated, m_data->nb_clusters is set to the number + * of contiguous clusters that have been allocated. In this case, the other + * fields of m_data are valid and contain in
[Qemu-block] [PATCH v6 5/8] vmdk: Set maximum bytes allocated in one cycle
Set the maximum bytes allowed to get allocated at once to be not more than the extent size boundary to handle writes at two separate extents appropriately. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 5647f53..fe2046b 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1624,6 +1624,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t cluster_offset; uint64_t bytes_done = 0; VmdkMetaData m_data; +uint64_t extent_end; if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) { error_report("Wrong offset: offset=0x%" PRIx64 @@ -1637,9 +1638,17 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, if (!extent) { return -EIO; } +extent_end = extent->end_sector * BDRV_SECTOR_SIZE; + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - - offset_in_cluster); + +/* truncate n_bytes to first cluster because we need to perform COW */ +if (offset_in_cluster > 0) { +n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); +} else { +n_bytes = MIN(bytes, extent_end - offset); +} ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), -- 2.6.2
[Qemu-block] [PATCH v6 4/8] vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset()
Move the cluster tables loading code out of the existing vmdk_get_cluster_offset() function and implement it in separate get_cluster_table() and vmdk_l2load() functions. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 153 --- 1 file changed, 105 insertions(+), 48 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index f403981..5647f53 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1143,6 +1143,105 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, return VMDK_OK; } +/* + * vmdk_l2load + * + * Load a new L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns: + * VMDK_OK: on success + * VMDK_ERROR:in error cases + */ +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int l2_offset, + uint32_t **new_l2_table, int *new_l2_index) +{ +int min_index, i, j; +uint32_t *l2_table; +uint32_t min_count; + +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (l2_offset == extent->l2_cache_offsets[i]) { +/* increment the hit count */ +if (++extent->l2_cache_counts[i] == UINT32_MAX) { +for (j = 0; j < L2_CACHE_SIZE; j++) { +extent->l2_cache_counts[j] >>= 1; +} +} +l2_table = extent->l2_cache + (i * extent->l2_size); +goto found; +} +} +/* not found: load a new entry in the least used one */ +min_index = 0; +min_count = UINT32_MAX; +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (extent->l2_cache_counts[i] < min_count) { +min_count = extent->l2_cache_counts[i]; +min_index = i; +} +} +l2_table = extent->l2_cache + (min_index * extent->l2_size); +if (bdrv_pread(extent->file, +(int64_t)l2_offset * 512, +l2_table, +extent->l2_size * sizeof(uint32_t) +) != extent->l2_size * sizeof(uint32_t)) { +return VMDK_ERROR; +} + +extent->l2_cache_offsets[min_index] = l2_offset; +extent->l2_cache_counts[min_index] = 1; +found: +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; +*new_l2_table = l2_table; + +return VMDK_OK; +} + +/* + * 
get_cluster_table + * + * For a given offset, load (and allocate if needed) the l2 table. + * + * Returns: + * VMDK_OK:on success + * + * VMDK_UNALLOC: if cluster is not mapped + * + * VMDK_ERROR: in error cases + */ +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, + int *new_l1_index, int *new_l2_offset, + int *new_l2_index, uint32_t **new_l2_table) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +int ret; + +offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; +l1_index = (offset >> 9) / extent->l1_entry_sectors; +if (l1_index >= extent->l1_size) { +return VMDK_ERROR; +} +l2_offset = extent->l1_table[l1_index]; +if (!l2_offset) { +return VMDK_UNALLOC; +} + +ret = vmdk_l2load(extent, offset, l2_offset, &l2_table, &l2_index); +if (ret < 0) { +return ret; +} + +*new_l1_index = l1_index; +*new_l2_offset = l2_offset; +*new_l2_index = l2_index; +*new_l2_table = l2_table; + +return VMDK_OK; +} + /** * vmdk_get_cluster_offset * @@ -1172,66 +1271,24 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, uint64_t skip_start_bytes, uint64_t skip_end_bytes) { -unsigned int l1_index, l2_offset, l2_index; -int min_index, i, j; -uint32_t min_count, *l2_table; +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; bool zeroed = false; int64_t ret; int64_t cluster_sector; -if (m_data) { -m_data->valid = 0; -} if (extent->flat) { *cluster_offset = extent->flat_start_offset; return VMDK_OK; } -offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; -l1_index = (offset >> 9) / extent->l1_entry_sectors; -if (l1_index >= extent->l1_size) { -return VMDK_ERROR; -} -l2_offset = extent->l1_table[l1_index]; -if (!l2_offset) { -return VMDK_UNALLOC; -} -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (l2_offset == extent->l2_cache_offsets[i]) { -/* increment the hit count */ -if (++extent->l2_cache_counts[i] == 0x) { -for (j = 0; j < L2_CACHE_SIZE; j++) { -extent->l2_cache_counts[j] >>= 1; -} -
[Qemu-block] [PATCH v6 2/8] vmdk: Rename get_whole_cluster() to vmdk_perform_cow()
Rename the existing function get_whole_cluster() to vmdk_perform_cow() as its sole purpose is to perform COW for the first and the last allocated clusters if needed. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 23 ++- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 22be887..73ae786 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1028,8 +1028,8 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -/** - * get_whole_cluster +/* + * vmdk_perform_cow * * Copy backing file's cluster that covers @sector_num, otherwise write zero, * to the cluster at @cluster_sector_num. @@ -1037,13 +1037,18 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) * If @skip_start_sector < @skip_end_sector, the relative range * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave * it for call to write user data in the request. + * + * Returns: + * VMDK_OK: on success + * + * VMDK_ERROR:in error cases */ -static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_perform_cow(BlockDriverState *bs, +VmdkExtent *extent, +uint64_t cluster_offset, +uint64_t offset, +uint64_t skip_start_bytes, +uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; @@ -1244,7 +1249,7 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ -ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; -- 2.6.2
[Qemu-block] [PATCH v6 3/8] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
Rename the existing get_cluster_offset() to vmdk_get_cluster_offset() and update name in all the callers accordingly. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 46 +++--- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 73ae786..f403981 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1144,7 +1144,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, } /** - * get_cluster_offset + * vmdk_get_cluster_offset * * Look up cluster offset in extent file by sector number, and store in * @cluster_offset. @@ -1163,14 +1163,14 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, * VMDK_UNALLOC if cluster is not mapped and @allocate is false. * VMDK_ERROR if failed. */ -static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; @@ -1304,9 +1304,9 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num * 512, false, &offset, + 0, 0); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1497,8 +1497,8 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + offset, false, &cluster_offset, 0, 
0); offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE @@ -1584,10 +1584,10 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); -ret = get_cluster_offset(bs, extent, &m_data, offset, - !(extent->compressed || zeroed), - &cluster_offset, offset_in_cluster, - offset_in_cluster + n_bytes); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + !(extent->compressed || zeroed), + &cluster_offset, offset_in_cluster, + offset_in_cluster + n_bytes); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1596,8 +1596,8 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, return -EIO; } else { /* allocate */ -ret = get_cluster_offset(bs, extent, &m_data, offset, - true, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + true, &cluster_offset, 0, 0); } } if (ret == VMDK_ERROR) { @@ -2229,9 +2229,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = get_cluster_offset(bs, extent, NULL, - sector_num << BDRV_SECTOR_BITS, - false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, +
[Qemu-block] [PATCH v6 0/8] Optimize VMDK I/O by allocating multiple clusters
Previously posted series patches: v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html v3 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg00074.html v4 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg03851.html v5 - http://lists.nongnu.org/archive/html/qemu-devel/2017-06/msg00929.html This series helps to optimize the I/O performance of VMDK driver. Patch 1 helps us to move vmdk_find_offset_in_cluster. Patch 2 & 3 perform simple function re-naming tasks. Patch 4 is used to factor out metadata loading code and implement it in separate functions. This will help us to avoid code duplication in future patches of this series. Patch 5 helps to set the upper limit of the bytes handled in one cycle. Patch 6 adds new functions to help us allocate multiple clusters according to the size requested, perform COW if required and return the offset of the first newly allocated cluster. Patch 7 changes the metadata update code to update the L2 tables for multiple clusters at once. 
Patch 8 helps us to finally change vmdk_get_cluster_offset() to find cluster offset only as cluster allocation task is now handled by vmdk_alloc_clusters() Optimization test results: This patch series improves 128 KB sequential write performance to an empty VMDK file by 54% Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f vmdk test.vmdk Changes in v6: - rename total_alloc_clusters as alloc_clusters_counter (fam) Changes in v5: - fix commit message and comment in patch 4 (fam) - add vmdk_ prefix to handle_alloc() (fam) - fix alignment issue in patch 6 (fam) - use BDRV_SECTOR_BITS (fam) - fix endianness calculation in patch 7 (fam) Changes in v4: - fix commit message in patch 1 (fam) - drop size_to_clusters() function (fam) - fix grammatical errors in function documentations (fam) - factor out metadata loading coding in a separate patch (patch 4) (fam) - rename vmdk_alloc_cluster_offset() to vmdk_alloc_clusters() (fam) - break patch 4(in v3) into separate patches (patch 3 and 8) (fam) - rename extent_size to extent_end (fam) - use QEMU_ALIGN_UP instead of vmdk_align_offset. 
(fam) - drop next and simply do m_data = m_data->next (fam) Changes in v3: - move size_to_clusters() from patch 1 to 3 (fam) - use DIV_ROUND_UP in size_to_clusters (fam) - make patch 2 compilable (fam) - rename vmdk_L2update as vmdk_l2update and use UINT32_MAX (fam) - combine patch 3 and patch 4 (as in v2) to make them compilable (fam) - call bdrv_pwrite_sync() for batches of at most 512 clusters at once (fam) Changes in v2: - segregate the ugly Patch 1 in v1 into 6 readable and sensible patches - include benchmark test results in v2 Ashijeet Acharya (8): vmdk: Move vmdk_find_offset_in_cluster() to the top vmdk: Rename get_whole_cluster() to vmdk_perform_cow() vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset() vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset() vmdk: Set maximum bytes allocated in one cycle vmdk: New functions to assist allocating multiple clusters vmdk: Update metadata for multiple clusters vmdk: Make vmdk_get_cluster_offset() return cluster offset only block/vmdk.c | 529 +-- 1 file changed, 407 insertions(+), 122 deletions(-) -- 2.6.2
[Qemu-block] [PATCH v6 1/8] vmdk: Move vmdk_find_offset_in_cluster() to the top
Move the existing vmdk_find_offset_in_cluster() function to the top of the driver. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a9bd22b..22be887 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -242,6 +242,18 @@ static void vmdk_free_last_extent(BlockDriverState *bs) s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ +uint64_t extent_begin_offset, extent_relative_offset; +uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + +extent_begin_offset = +(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; +extent_relative_offset = offset - extent_begin_offset; +return extent_relative_offset % cluster_size; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1266,18 +1278,6 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, - int64_t offset) -{ -uint64_t extent_begin_offset, extent_relative_offset; -uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; - -extent_begin_offset = -(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; -extent_relative_offset = offset - extent_begin_offset; -return extent_relative_offset % cluster_size; -} - static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { -- 2.6.2
Re: [Qemu-block] [PATCH v5 6/8] vmdk: New functions to assist allocating multiple clusters
On Mon, Jun 5, 2017 at 11:23 AM, Ashijeet Acharya wrote: > Introduce two new helper functions handle_alloc() and > vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple > clusters at once starting from a given offset on disk and performs COW > if necessary for first and last allocated clusters. > vmdk_alloc_cluster_offset() helps to return the offset of the first of > the many newly allocated clusters. Also, provide proper documentation > for both. > > Signed-off-by: Ashijeet Acharya > --- > block/vmdk.c | 192 > +++ > 1 file changed, 182 insertions(+), 10 deletions(-) > > diff --git a/block/vmdk.c b/block/vmdk.c > index fe2046b..b671dc9 100644 > --- a/block/vmdk.c > +++ b/block/vmdk.c > @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { > unsigned int l2_offset; > int valid; > uint32_t *l2_cache_entry; > +uint32_t nb_clusters; > } VmdkMetaData; > > typedef struct VmdkGrainMarker { > @@ -1242,6 +1243,174 @@ static int get_cluster_table(VmdkExtent *extent, > uint64_t offset, > return VMDK_OK; > } > > +/* > + * vmdk_handle_alloc > + * > + * Allocate new clusters for an area that either is yet unallocated or needs > a > + * copy on write. If *cluster_offset is non_zero, clusters are only > allocated if > + * the new allocation can match the specified host offset. > + * > + * Returns: > + * VMDK_OK: if new clusters were allocated, *bytes may be decreased > if > + * the new allocation doesn't cover all of the requested > area. > + * *cluster_offset is updated to contain the offset of the > + * first newly allocated cluster. > + * > + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is > left > + * unchanged. 
> + * > + * VMDK_ERROR:in error cases > + */ > +static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, > + uint64_t offset, uint64_t *cluster_offset, > + int64_t *bytes, VmdkMetaData *m_data, > + bool allocate, uint32_t *total_alloc_clusters) > +{ > +int l1_index, l2_offset, l2_index; > +uint32_t *l2_table; > +uint32_t cluster_sector; > +uint32_t nb_clusters; > +bool zeroed = false; > +uint64_t skip_start_bytes, skip_end_bytes; > +int ret; > + > +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, > +&l2_index, &l2_table); > +if (ret < 0) { > +return ret; > +} > + > +cluster_sector = le32_to_cpu(l2_table[l2_index]); > + > +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); > +/* Calculate the number of clusters to look for. Here we truncate the > last > + * cluster, i.e. 1 less than the actual value calculated as we may need > to > + * perform COW for the last one. */ > +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, > + extent->cluster_sectors << BDRV_SECTOR_BITS) > - 1; > + > +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); > +assert(nb_clusters <= INT_MAX); > + > +/* update bytes according to final nb_clusters value */ > +if (nb_clusters != 0) { > +*bytes = ((nb_clusters * extent->cluster_sectors) << > BDRV_SECTOR_BITS) > + - skip_start_bytes; > +} else { > +nb_clusters = 1; > +} > +*total_alloc_clusters += nb_clusters; I have left it as it is for now as I wasn't sure about what you meant. You can check my query I posted on v4 of this thread for more so that we can solve this soon. :-) Ashijeet > +skip_end_bytes = skip_start_bytes + MIN(*bytes, > + extent->cluster_sectors * BDRV_SECTOR_SIZE > +- skip_start_bytes); > + > +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { > +zeroed = true; > +} > + > +if (!cluster_sector || zeroed) { > +if (!allocate) { > +return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; > +} > + > +cluster_sector = extent->next_cluster_sector; > +extent->next_cluster_sector += extent->cluster_sectors > +* nb_clusters; > + > +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, > + offset, skip_start_bytes, > + skip_end_bytes); > +if (ret &l
[Qemu-block] [PATCH v5 8/8] vmdk: Make vmdk_get_cluster_offset() return cluster offset only
vmdk_alloc_clusters() introduced earlier now handles the task of allocating clusters and performing COW when needed. Thus we can change vmdk_get_cluster_offset() to stick to the sole purpose of returning cluster offset using sector number. Update the changes at all call sites. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 56 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 9fa2414..accf1c3 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1485,25 +1485,16 @@ static int vmdk_alloc_clusters(BlockDriverState *bs, * For flat extents, the start offset as parsed from the description file is * returned. * - * For sparse extents, look up in L1, L2 table. If allocate is true, return an - * offset for a new cluster and update L2 cache. If there is a backing file, - * COW is done before returning; otherwise, zeroes are written to the allocated - * cluster. Both COW and zero writing skips the sector range - * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller - * has new data to write there. + * For sparse extents, look up the L1, L2 table. * * Returns: VMDK_OK if cluster exists and mapped in the image. - * VMDK_UNALLOC if cluster is not mapped and @allocate is false. - * VMDK_ERROR if failed. + * VMDK_UNALLOC if cluster is not mapped. + * VMDK_ERROR if failed */ static int vmdk_get_cluster_offset(BlockDriverState *bs, VmdkExtent *extent, - VmdkMetaData *m_data, uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) + uint64_t *cluster_offset) { int l1_index, l2_offset, l2_index; uint32_t *l2_table; @@ -1528,31 +1519,9 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, } if (!cluster_sector || zeroed) { -if (!allocate) { -return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; -} - -cluster_sector = extent->next_cluster_sector; -extent->next_cluster_sector += extent->cluster_sectors; - -/* First of all we write grain itself, to avoid race condition - * that may to corrupt the image. - * This problem may occur because of insufficient space on host disk - * or inappropriate VM shutdown. - */ -ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, -offset, skip_start_bytes, skip_end_bytes); -if (ret) { -return ret; -} -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_data->l2_index = l2_index; -m_data->l2_offset = l2_offset; -m_data->l2_cache_entry = &l2_table[l2_index]; -} +return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; } @@ -1595,9 +1564,7 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = vmdk_get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, sector_num * 512, &offset); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1788,13 +1755,14 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); +ret = vmdk_get_cluster_offset(bs, extent, offset, &cluster_offset); + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ if (bs->backing && ret != VMDK_ZEROED) { @@ -2541,9 +2509,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, +ret = vmdk_get_cluster_offset(bs, extent, sector_num << BDRV_SECTOR_BITS, - false, 
&cluster_offset, 0, 0);
[Qemu-block] [PATCH v5 4/8] vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset()
Move the cluster tables loading code out of the existing vmdk_get_cluster_offset() function and implement it in separate get_cluster_table() and vmdk_l2load() functions. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 153 --- 1 file changed, 105 insertions(+), 48 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index f403981..5647f53 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1143,6 +1143,105 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, return VMDK_OK; } +/* + * vmdk_l2load + * + * Load a new L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns: + * VMDK_OK: on success + * VMDK_ERROR:in error cases + */ +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int l2_offset, + uint32_t **new_l2_table, int *new_l2_index) +{ +int min_index, i, j; +uint32_t *l2_table; +uint32_t min_count; + +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (l2_offset == extent->l2_cache_offsets[i]) { +/* increment the hit count */ +if (++extent->l2_cache_counts[i] == UINT32_MAX) { +for (j = 0; j < L2_CACHE_SIZE; j++) { +extent->l2_cache_counts[j] >>= 1; +} +} +l2_table = extent->l2_cache + (i * extent->l2_size); +goto found; +} +} +/* not found: load a new entry in the least used one */ +min_index = 0; +min_count = UINT32_MAX; +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (extent->l2_cache_counts[i] < min_count) { +min_count = extent->l2_cache_counts[i]; +min_index = i; +} +} +l2_table = extent->l2_cache + (min_index * extent->l2_size); +if (bdrv_pread(extent->file, +(int64_t)l2_offset * 512, +l2_table, +extent->l2_size * sizeof(uint32_t) +) != extent->l2_size * sizeof(uint32_t)) { +return VMDK_ERROR; +} + +extent->l2_cache_offsets[min_index] = l2_offset; +extent->l2_cache_counts[min_index] = 1; +found: +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; +*new_l2_table = l2_table; + +return VMDK_OK; +} + +/* + * 
get_cluster_table + * + * For a given offset, load (and allocate if needed) the l2 table. + * + * Returns: + * VMDK_OK:on success + * + * VMDK_UNALLOC: if cluster is not mapped + * + * VMDK_ERROR: in error cases + */ +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, + int *new_l1_index, int *new_l2_offset, + int *new_l2_index, uint32_t **new_l2_table) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +int ret; + +offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; +l1_index = (offset >> 9) / extent->l1_entry_sectors; +if (l1_index >= extent->l1_size) { +return VMDK_ERROR; +} +l2_offset = extent->l1_table[l1_index]; +if (!l2_offset) { +return VMDK_UNALLOC; +} + +ret = vmdk_l2load(extent, offset, l2_offset, &l2_table, &l2_index); +if (ret < 0) { +return ret; +} + +*new_l1_index = l1_index; +*new_l2_offset = l2_offset; +*new_l2_index = l2_index; +*new_l2_table = l2_table; + +return VMDK_OK; +} + /** * vmdk_get_cluster_offset * @@ -1172,66 +1271,24 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, uint64_t skip_start_bytes, uint64_t skip_end_bytes) { -unsigned int l1_index, l2_offset, l2_index; -int min_index, i, j; -uint32_t min_count, *l2_table; +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; bool zeroed = false; int64_t ret; int64_t cluster_sector; -if (m_data) { -m_data->valid = 0; -} if (extent->flat) { *cluster_offset = extent->flat_start_offset; return VMDK_OK; } -offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; -l1_index = (offset >> 9) / extent->l1_entry_sectors; -if (l1_index >= extent->l1_size) { -return VMDK_ERROR; -} -l2_offset = extent->l1_table[l1_index]; -if (!l2_offset) { -return VMDK_UNALLOC; -} -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (l2_offset == extent->l2_cache_offsets[i]) { -/* increment the hit count */ -if (++extent->l2_cache_counts[i] == 0x) { -for (j = 0; j < L2_CACHE_SIZE; j++) { -extent->l2_cache_counts[j] >>= 1; -} -
[Qemu-block] [PATCH v5 5/8] vmdk: Set maximum bytes allocated in one cycle
Set the maximum bytes allowed to get allocated at once to be not more than the extent size boundary to handle writes at two separate extents appropriately. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 5647f53..fe2046b 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1624,6 +1624,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t cluster_offset; uint64_t bytes_done = 0; VmdkMetaData m_data; +uint64_t extent_end; if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) { error_report("Wrong offset: offset=0x%" PRIx64 @@ -1637,9 +1638,17 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, if (!extent) { return -EIO; } +extent_end = extent->end_sector * BDRV_SECTOR_SIZE; + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - - offset_in_cluster); + +/* truncate n_bytes to first cluster because we need to perform COW */ +if (offset_in_cluster > 0) { +n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); +} else { +n_bytes = MIN(bytes, extent_end - offset); +} ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), -- 2.6.2
[Qemu-block] [PATCH v5 3/8] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
Rename the existing get_cluster_offset() to vmdk_get_cluster_offset() and update name in all the callers accordingly. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 46 +++--- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 73ae786..f403981 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1144,7 +1144,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, } /** - * get_cluster_offset + * vmdk_get_cluster_offset * * Look up cluster offset in extent file by sector number, and store in * @cluster_offset. @@ -1163,14 +1163,14 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, * VMDK_UNALLOC if cluster is not mapped and @allocate is false. * VMDK_ERROR if failed. */ -static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; @@ -1304,9 +1304,9 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num * 512, false, &offset, + 0, 0); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1497,8 +1497,8 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + offset, false, &cluster_offset, 0, 
0); offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE @@ -1584,10 +1584,10 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); -ret = get_cluster_offset(bs, extent, &m_data, offset, - !(extent->compressed || zeroed), - &cluster_offset, offset_in_cluster, - offset_in_cluster + n_bytes); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + !(extent->compressed || zeroed), + &cluster_offset, offset_in_cluster, + offset_in_cluster + n_bytes); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1596,8 +1596,8 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, return -EIO; } else { /* allocate */ -ret = get_cluster_offset(bs, extent, &m_data, offset, - true, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + true, &cluster_offset, 0, 0); } } if (ret == VMDK_ERROR) { @@ -2229,9 +2229,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = get_cluster_offset(bs, extent, NULL, - sector_num << BDRV_SECTOR_BITS, - false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, +
[Qemu-block] [PATCH v5 6/8] vmdk: New functions to assist allocating multiple clusters
Introduce two new helper functions handle_alloc() and vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple clusters at once starting from a given offset on disk and performs COW if necessary for first and last allocated clusters. vmdk_alloc_cluster_offset() helps to return the offset of the first of the many newly allocated clusters. Also, provide proper documentation for both. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 192 +++ 1 file changed, 182 insertions(+), 10 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index fe2046b..b671dc9 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { unsigned int l2_offset; int valid; uint32_t *l2_cache_entry; +uint32_t nb_clusters; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1242,6 +1243,174 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, return VMDK_OK; } +/* + * vmdk_handle_alloc + * + * Allocate new clusters for an area that either is yet unallocated or needs a + * copy on write. If *cluster_offset is non_zero, clusters are only allocated if + * the new allocation can match the specified host offset. + * + * Returns: + * VMDK_OK: if new clusters were allocated, *bytes may be decreased if + * the new allocation doesn't cover all of the requested area. + * *cluster_offset is updated to contain the offset of the + * first newly allocated cluster. + * + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is left + * unchanged. 
+ * + * VMDK_ERROR:in error cases + */ +static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, + uint64_t offset, uint64_t *cluster_offset, + int64_t *bytes, VmdkMetaData *m_data, + bool allocate, uint32_t *total_alloc_clusters) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +uint32_t cluster_sector; +uint32_t nb_clusters; +bool zeroed = false; +uint64_t skip_start_bytes, skip_end_bytes; +int ret; + +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, +&l2_index, &l2_table); +if (ret < 0) { +return ret; +} + +cluster_sector = le32_to_cpu(l2_table[l2_index]); + +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); +/* Calculate the number of clusters to look for. Here we truncate the last + * cluster, i.e. 1 less than the actual value calculated as we may need to + * perform COW for the last one. */ +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, + extent->cluster_sectors << BDRV_SECTOR_BITS) - 1; + +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); +assert(nb_clusters <= INT_MAX); + +/* update bytes according to final nb_clusters value */ +if (nb_clusters != 0) { +*bytes = ((nb_clusters * extent->cluster_sectors) << BDRV_SECTOR_BITS) + - skip_start_bytes; +} else { +nb_clusters = 1; +} +*total_alloc_clusters += nb_clusters; +skip_end_bytes = skip_start_bytes + MIN(*bytes, + extent->cluster_sectors * BDRV_SECTOR_SIZE +- skip_start_bytes); + +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { +zeroed = true; +} + +if (!cluster_sector || zeroed) { +if (!allocate) { +return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; +} + +cluster_sector = extent->next_cluster_sector; +extent->next_cluster_sector += extent->cluster_sectors +* nb_clusters; + +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, + offset, skip_start_bytes, + skip_end_bytes); +if (ret < 0) { +return ret; +} +if (m_data) { +m_data->valid = 1; +m_data->l1_index = l1_index; +m_data->l2_index = l2_index; +m_data->l2_offset = l2_offset; +m_data->l2_cache_entry = &l2_table[l2_index]; +m_data->nb_clusters = nb_clusters; +} +} +*cluster_offset = cluster_sector << BDRV_SECTOR_BITS; +return VMDK_OK; +} + +/* + * vmdk_alloc_clusters + * + * For a given offset on the virtual disk, find the cluster offset in vmdk + * file. If the offset is not found, allocate a new cluster. + * + * If the cluster is newly allocated, m_data->nb_clusters is set to the number + * of contiguous clusters that have been allocated. In this case, the other + * fields of m_data are valid and contain inform
[Qemu-block] [PATCH v5 2/8] vmdk: Rename get_whole_cluster() to vmdk_perform_cow()
Rename the existing function get_whole_cluster() to vmdk_perform_cow() as its sole purpose is to perform COW for the first and the last allocated clusters if needed. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 23 ++- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 22be887..73ae786 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1028,8 +1028,8 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -/** - * get_whole_cluster +/* + * vmdk_perform_cow * * Copy backing file's cluster that covers @sector_num, otherwise write zero, * to the cluster at @cluster_sector_num. @@ -1037,13 +1037,18 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) * If @skip_start_sector < @skip_end_sector, the relative range * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave * it for call to write user data in the request. + * + * Returns: + * VMDK_OK: on success + * + * VMDK_ERROR:in error cases */ -static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_perform_cow(BlockDriverState *bs, +VmdkExtent *extent, +uint64_t cluster_offset, +uint64_t offset, +uint64_t skip_start_bytes, +uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; @@ -1244,7 +1249,7 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ -ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; -- 2.6.2
[Qemu-block] [PATCH v5 7/8] vmdk: Update metadata for multiple clusters
Include a next pointer in VmdkMetaData struct to point to the previous allocated L2 table. Modify vmdk_L2update to start updating metadata for allocation of multiple clusters at once. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 128 ++- 1 file changed, 101 insertions(+), 27 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index b671dc9..9fa2414 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { int valid; uint32_t *l2_cache_entry; uint32_t nb_clusters; +uint32_t offset; +struct VmdkMetaData *next; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1116,34 +1118,87 @@ exit: return ret; } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, - uint32_t offset) +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, + VmdkMetaData *m_data, bool zeroed) { -offset = cpu_to_le32(offset); +int i; +uint32_t offset, temp_offset; +int *l2_table_array; +int l2_array_size; + +if (zeroed) { +temp_offset = VMDK_GTE_ZEROED; +} else { +temp_offset = m_data->offset; +} + +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; +l2_table_array = qemu_try_blockalign(extent->file->bs, + QEMU_ALIGN_UP(l2_array_size, + BDRV_SECTOR_SIZE)); +if (l2_table_array == NULL) { +return VMDK_ERROR; +} +memset(l2_table_array, 0, QEMU_ALIGN_UP(l2_array_size, BDRV_SECTOR_SIZE)); /* update L2 table */ +offset = temp_offset; +for (i = 0; i < m_data->nb_clusters; i++) { +l2_table_array[i] = cpu_to_le32(offset); +if (!zeroed) { +offset += 128; +} +} if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) 
-+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } } + +offset = temp_offset; if (m_data->l2_cache_entry) { -*m_data->l2_cache_entry = offset; +for (i = 0; i < m_data->nb_clusters; i++) { +*m_data->l2_cache_entry = cpu_to_le32(offset); +m_data->l2_cache_entry++; + +if (!zeroed) { +offset += 128; +} +} } +qemu_vfree(l2_table_array); return VMDK_OK; } +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + bool zeroed) +{ +int ret; + +while (m_data->next != NULL) { + +ret = vmdk_alloc_cluster_link_l2(extent, m_data, zeroed); +if (ret < 0) { +return ret; +} + +m_data = m_data->next; + } + + return VMDK_OK; +} + /* * vmdk_l2load * @@ -1261,9 +1316,10 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, * * VMDK_ERROR:in error cases */ + static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint64_t offset, uint64_t *cluster_offset, - int64_t *bytes, VmdkMetaData *m_data, + int64_t *bytes, VmdkMetaData **m_data, bool allocate, uint32_t *total_alloc_clusters) { int l1_index, l2_offset, l2_index; @@ -1272,6 +1328,7 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint32_t nb_clusters; bool zeroed = false; uint64_t skip_start_bytes, skip_end_bytes; +VmdkMetaData *old_m_data; int ret; ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, @@ -1323,13 +1380,21 @@ static int vmdk_handle_alloc(BlockDriverState *bs, VmdkExtent *extent, if (ret < 0) { return ret; } -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_dat
[Qemu-block] [PATCH v5 1/8] vmdk: Move vmdk_find_offset_in_cluster() to the top
Move the existing vmdk_find_offset_in_cluster() function to the top of the driver. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a9bd22b..22be887 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -242,6 +242,18 @@ static void vmdk_free_last_extent(BlockDriverState *bs) s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ +uint64_t extent_begin_offset, extent_relative_offset; +uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + +extent_begin_offset = +(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; +extent_relative_offset = offset - extent_begin_offset; +return extent_relative_offset % cluster_size; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1266,18 +1278,6 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, - int64_t offset) -{ -uint64_t extent_begin_offset, extent_relative_offset; -uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; - -extent_begin_offset = -(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; -extent_relative_offset = offset - extent_begin_offset; -return extent_relative_offset % cluster_size; -} - static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { -- 2.6.2
[Qemu-block] [PATCH v5 0/8] Optimize VMDK I/O by allocating multiple clusters
Previously posted series patches: v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html v3 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg00074.html v4 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg03851.html This series helps to optimize the I/O performance of the VMDK driver. Patch 1 helps us to move vmdk_find_offset_in_cluster. Patches 2 & 3 perform simple function re-naming tasks. Patch 4 is used to factor out metadata loading code and implement it in separate functions. This will help us to avoid code duplication in future patches of this series. Patch 5 helps to set the upper limit of the bytes handled in one cycle. Patch 6 adds new functions to help us allocate multiple clusters according to the size requested, perform COW if required and return the offset of the first newly allocated cluster. Patch 7 changes the metadata update code to update the L2 tables for multiple clusters at once. 
Patch 8 helps us to finally change vmdk_get_cluster_offset() to find cluster offset only as cluster allocation task is now handled by vmdk_alloc_clusters() Optimization test results: This patch series improves 128 KB sequential write performance to an empty VMDK file by 54% Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f vmdk test.vmdk Changes in v5: - fix commit message and comment in patch 4 (fam) - add vmdk_ prefix to handle_alloc() (fam) - fix alignment issue in patch 6 (fam) - use BDRV_SECTOR_BITS (fam) - fix endianness calculation in patch 7 (fam) Changes in v4: - fix commit message in patch 1 (fam) - drop size_to_clusters() function (fam) - fix grammatical errors in function documentations (fam) - factor out metadata loading coding in a separate patch (patch 4) (fam) - rename vmdk_alloc_cluster_offset() to vmdk_alloc_clusters() (fam) - break patch 4(in v3) into separate patches (patch 3 and 8) (fam) - rename extent_size to extent_end (fam) - use QEMU_ALIGN_UP instead of vmdk_align_offset. 
(fam) - drop next and simply do m_data = m_data->next (fam) Changes in v3: - move size_to_clusters() from patch 1 to 3 (fam) - use DIV_ROUND_UP in size_to_clusters (fam) - make patch 2 compilable (fam) - rename vmdk_L2update as vmdk_l2update and use UINT32_MAX (fam) - combine patch 3 and patch 4 (as in v2) to make them compilable (fam) - call bdrv_pwrite_sync() for batches of at most 512 clusters at once (fam) Changes in v2: - segregate the ugly Patch 1 in v1 into 6 readable and sensible patches - include benchmark test results in v2 Ashijeet Acharya (8): vmdk: Move vmdk_find_offset_in_cluster() to the top vmdk: Rename get_whole_cluster() to vmdk_perform_cow() vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset() vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset() vmdk: Set maximum bytes allocated in one cycle vmdk: New functions to assist allocating multiple clusters vmdk: Update metadata for multiple clusters vmdk: Make vmdk_get_cluster_offset() return cluster offset only block/vmdk.c | 529 +-- 1 file changed, 407 insertions(+), 122 deletions(-) -- 2.6.2
Re: [Qemu-block] [Qemu-devel] [PATCH v4 6/8] vmdk: New functions to assist allocating multiple clusters
On Thu, Jun 1, 2017 at 7:27 PM, Fam Zheng wrote: > On Sat, 04/22 10:43, Ashijeet Acharya wrote: >> Introduce two new helper functions handle_alloc() and >> vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple >> clusters at once starting from a given offset on disk and performs COW >> if necessary for first and last allocated clusters. >> vmdk_alloc_cluster_offset() helps to return the offset of the first of >> the many newly allocated clusters. Also, provide proper documentation >> for both. >> >> Signed-off-by: Ashijeet Acharya >> --- >> block/vmdk.c | 192 >> +++ >> 1 file changed, 182 insertions(+), 10 deletions(-) >> >> diff --git a/block/vmdk.c b/block/vmdk.c >> index 7862791..8d34cd9 100644 >> --- a/block/vmdk.c >> +++ b/block/vmdk.c >> @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { >> unsigned int l2_offset; >> int valid; >> uint32_t *l2_cache_entry; >> +uint32_t nb_clusters; >> } VmdkMetaData; >> >> typedef struct VmdkGrainMarker { >> @@ -1242,6 +1243,174 @@ static int get_cluster_table(VmdkExtent *extent, >> uint64_t offset, >> return VMDK_OK; >> } >> >> +/* >> + * handle_alloc >> + * >> + * Allocate new clusters for an area that either is yet unallocated or >> needs a >> + * copy on write. If *cluster_offset is non_zero, clusters are only >> allocated if >> + * the new allocation can match the specified host offset. >> + * >> + * Returns: >> + * VMDK_OK: if new clusters were allocated, *bytes may be decreased >> if >> + * the new allocation doesn't cover all of the requested >> area. >> + * *cluster_offset is updated to contain the offset of the >> + * first newly allocated cluster. >> + * >> + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is >> left >> + * unchanged. 
>> + * >> + * VMDK_ERROR:in error cases >> + */ >> +static int handle_alloc(BlockDriverState *bs, VmdkExtent *extent, >> +uint64_t offset, uint64_t *cluster_offset, >> +int64_t *bytes, VmdkMetaData *m_data, >> +bool allocate, uint32_t *total_alloc_clusters) > > Not super important but personally I always prefer to stick to a "vmdk_" > prefix > when naming local identifiers, so that ctags and git grep can take it easy. Done. > >> +{ >> +int l1_index, l2_offset, l2_index; >> +uint32_t *l2_table; >> +uint32_t cluster_sector; >> +uint32_t nb_clusters; >> +bool zeroed = false; >> +uint64_t skip_start_bytes, skip_end_bytes; >> +int ret; >> + >> +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, >> +&l2_index, &l2_table); >> +if (ret < 0) { >> +return ret; >> +} >> + >> +cluster_sector = le32_to_cpu(l2_table[l2_index]); >> + >> +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); >> +/* Calculate the number of clusters to look for. Here we truncate the >> last >> + * cluster, i.e. 1 less than the actual value calculated as we may need >> to >> + * perform COW for the last one. */ >> +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, >> +extent->cluster_sectors << BDRV_SECTOR_BITS) - >> 1; > > Alignment could be improved: here ^ Done. > >> + >> +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); >> +assert(nb_clusters <= INT_MAX); >> + >> +/* update bytes according to final nb_clusters value */ >> +if (nb_clusters != 0) { >> +*bytes = ((nb_clusters * extent->cluster_sectors) << 9) > > Better use BDRV_SECTOR_BITS instead of 9. Done. > >> +- skip_start_bytes; >> +} else { >> +nb_clusters = 1; >> +} >> +*total_alloc_clusters += nb_clusters; > > It is weird that you increment *total_alloc_clusters instead of simply > assigning > to it, because it's not clear why before reading the caller code. 
> I am incrementing it because we allocate clusters in every iteration and *total_alloc_clusters contains the number of clusters allocated in total and not just in one iteration. So basically it is a sum of all the m_data->nb_clusters present in every element of the linked list I prepare. > It's better if you just return nb_clusters from this function (either as a > return value, or assign to *total_alloc_clusters), then do the accumulation in > vmdk_pwritev by adding m_data->nb_clusters, which is simpler. If I understood it correctly, you mean to say that I should add all the m_data->nb_clusters present in my linked list inside vmdk_pwritev() using a loop? If yes, I think that's what I have been doing earlier by incrementing *total_alloc_clusters there itself and is better, no? Also, please correct me if I am wrong :-) Ashijeet
Re: [Qemu-block] [PATCH v2 0/7] Refactor DMG driver to have chunk size independence
On Thu, Apr 27, 2017 at 1:36 PM, Ashijeet Acharya wrote: > Previously posted series patches: > v1: http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg04641.html > > This series helps to provide chunk size independence for DMG driver to prevent > denial-of-service in cases where untrusted files are being accessed by the > user. > > This task is mentioned on the public block ToDo > Here -> http://wiki.qemu.org/ToDo/Block/DmgChunkSizeIndependence > > Patch 1 introduces a new data structure to aid caching of random access points > within a compressed stream. > > Patch 2 is an extension of patch 1 and introduces a new function to > initialize/update/reset our cached random access point. > > Patch 3 limits the output buffer size to a max of 2MB to avoid QEMU allocate > huge amounts of memory. > > Patch 4 is a simple preparatory patch to aid handling of various types of > chunks. > > Patches 5 & 6 help to handle various types of chunks. > > Patch 7 simply refactors dmg_co_preadv() to read multiple sectors at once. > > Patch 8 finally removes the error messages QEMU used to throw when an image > with > chunk sizes above 64MB were accessed by the user. John, I have squashed patch 3 and 8 (as in v1) actually and that change is represented in patch 7 (as in v2). The cover letter here is quite misleading, as I forgot to update it and simply did a ctrl-c -- ctrl-v carelessly. Ashijeet > Ashijeet Acharya (7): > dmg: Introduce a new struct to cache random access points > dmg: New function to help us cache random access point > dmg: Refactor and prepare dmg_read_chunk() to cache random access > points > dmg: Handle zlib compressed chunks > dmg: Handle bz2 compressed/raw/zeroed chunks > dmg: Refactor dmg_co_preadv() to start reading multiple sectors > dmg: Limit the output buffer size to a max of 2MB > > block/dmg.c | 214 > +++- > block/dmg.h | 10 +++ > 2 files changed, 148 insertions(+), 76 deletions(-) > > -- > 2.6.2 >
[Qemu-block] [PATCH v2 5/7] dmg: Handle bz2 compressed/raw/zeroed chunks
We do not need to cache the access point for these chunks but need to update our various supporting variables like chunk, sectors_read etc. to keep maintaining our code structure. Call cache_access_point() after reading chunks of these types. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 18 ++ 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index 073e864..f9045f9 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -680,20 +680,30 @@ update: (char *)s->uncompressed_chunk, (unsigned int) (512 * s->sectorcounts[chunk])); + if (ret < 0) { return ret; } +cache_access_point(drs, NULL, -1, chunk, sectors_read, + sector_offset); break; case 1: /* copy */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->uncompressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; +if (drs->sectors_read == -1) { +ret = bdrv_pread(bs->file, s->offsets[chunk], + s->uncompressed_chunk, s->lengths[chunk]); +if (ret != s->lengths[chunk]) { +return -1; +} } +cache_access_point(drs, NULL, -1, chunk, sectors_read, + sector_offset); break; case 2: /* zero */ /* see dmg_read, it is treated specially. No buffer needs to be * pre-filled, the zeroes can be set directly. */ +cache_access_point(drs, NULL, -1, chunk, sectors_read, + sector_offset); + break; } s->current_chunk = chunk; -- 2.6.2
[Qemu-block] [PATCH v2 4/7] dmg: Handle zlib compressed chunks
Set the output buffer size to be equal to the size of number of sectors stored in @sectors_read. Start inflating to a max output buffer size of 2MB and cache our access point to aid random access later if required. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 48 ++-- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index 32623e2..073e864 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -621,27 +621,47 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num, switch (s->types[chunk]) { /* block entry type */ case 0x8005: { /* zlib compressed */ -/* we need to buffer, because only the chunk as whole can be - * inflated. */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->compressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; +/* check for cached random access point */ +if (drs->saved_next_in == NULL) { +/* we need to buffer, because only the chunk as whole can be + * inflated. */ +ret = bdrv_pread(bs->file, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); +if (ret != s->lengths[chunk]) { +return -1; +} + +s->zstream.next_in = s->compressed_chunk; +s->zstream.avail_in = s->lengths[chunk]; +} else { +s->zstream.next_in = drs->saved_next_in; +s->zstream.avail_in = drs->saved_avail_in; } -s->zstream.next_in = s->compressed_chunk; -s->zstream.avail_in = s->lengths[chunk]; s->zstream.next_out = s->uncompressed_chunk; -s->zstream.avail_out = 512 * s->sectorcounts[chunk]; -ret = inflateReset(&s->zstream); -if (ret != Z_OK) { -return -1; + +s->zstream.avail_out = sectors_read * BDRV_SECTOR_SIZE; + +if (drs->saved_next_in == NULL) { +ret = inflateReset(&s->zstream); +if (ret != Z_OK) { +return -1; +} +} +/* reset total_out for each successive call */ +s->zstream.total_out = 0; +ret = inflate(&s->zstream, Z_SYNC_FLUSH); +if (ret == Z_OK && +s->zstream.total_out == 512 * sectors_read) { +goto update; } -ret = inflate(&s->zstream, Z_FINISH); if (ret != Z_STREAM_END || 
-s->zstream.total_out != 512 * s->sectorcounts[chunk]) { +s->zstream.total_out != 512 * sectors_read) { return -1; } +update: +cache_access_point(drs, s->zstream.next_in, s->zstream.avail_in, + chunk, sectors_read, sector_offset); break; } case 0x8006: /* bzip2 compressed */ if (!dmg_uncompress_bz2) { -- 2.6.2
[Qemu-block] [PATCH v2 0/7] Refactor DMG driver to have chunk size independence
Previously posted series patches: v1: http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg04641.html This series helps to provide chunk size independence for DMG driver to prevent denial-of-service in cases where untrusted files are being accessed by the user. This task is mentioned on the public block ToDo Here -> http://wiki.qemu.org/ToDo/Block/DmgChunkSizeIndependence Patch 1 introduces a new data structure to aid caching of random access points within a compressed stream. Patch 2 is an extension of patch 1 and introduces a new function to initialize/update/reset our cached random access point. Patch 3 limits the output buffer size to a max of 2MB to avoid QEMU allocate huge amounts of memory. Patch 4 is a simple preparatory patch to aid handling of various types of chunks. Patches 5 & 6 help to handle various types of chunks. Patch 7 simply refactors dmg_co_preadv() to read multiple sectors at once. Patch 8 finally removes the error messages QEMU used to throw when an image with chunk sizes above 64MB were accessed by the user. ->Testing procedure: Convert a DMG file to raw format using the "qemu-img convert" tool present in v2.9.0 Next convert the same image again after applying these patches. Compare the two images using "qemu-img compare" tool to check if they are identical. You can pickup any DMG image from the collection present Here -> https://lists.gnu.org/archive/html/qemu-devel/2014-12/msg03606.html ->Important note: These patches assume that the terms "chunk" and "block" are synonyms of each other when we talk about bz2 compressed streams. Thus according to the bz2 docs[1], the max uncompressed size of a chunk/block can reach to 46MB which is less than the previously allowed size of 64MB, so we can continue decompressing the whole chunk/block at once instead of partial decompression just like we do now. 
This limitation was forced by the fact that bz2 compressed streams do not allow random access midway through a chunk/block as the BZ2_bzDecompress() API in bzlib seeks for the magic key "BZh" before starting decompression.[2] This magic key is present at the start of every chunk/block only and since our cached random access points need not necessarily point to the start of a chunk/block, BZ2_bzDecompress() fails with an error value BZ_DATA_ERROR_MAGIC[3] [1] https://en.wikipedia.org/wiki/Bzip2#File_format [2] https://blastedbio.blogspot.in/2011/11/random-access-to-bzip2.html [3] http://linux.math.tifr.res.in/manuals/html/manual_3.html#SEC17 Special thanks to Peter Wu for helping me understand and tackle the bz2 compressed chunks. Changes in v2: - limit the buffer size to 2MB after fixing the buffering problems (john/fam) Ashijeet Acharya (7): dmg: Introduce a new struct to cache random access points dmg: New function to help us cache random access point dmg: Refactor and prepare dmg_read_chunk() to cache random access points dmg: Handle zlib compressed chunks dmg: Handle bz2 compressed/raw/zeroed chunks dmg: Refactor dmg_co_preadv() to start reading multiple sectors dmg: Limit the output buffer size to a max of 2MB block/dmg.c | 214 +++- block/dmg.h | 10 +++ 2 files changed, 148 insertions(+), 76 deletions(-) -- 2.6.2
[Qemu-block] [PATCH v2 1/7] dmg: Introduce a new struct to cache random access points
We need to cache the random access point while performing partial decompression so that we can resume decompression from that point onwards in our next sequential read request. Introduce a new struct DMGReadState which will help us do this. Signed-off-by: Ashijeet Acharya --- block/dmg.h | 10 ++ 1 file changed, 10 insertions(+) diff --git a/block/dmg.h b/block/dmg.h index b592d6f..ee67ae1 100644 --- a/block/dmg.h +++ b/block/dmg.h @@ -31,6 +31,15 @@ #include "block/block_int.h" #include +/* used to cache current position in compressed input stream */ +typedef struct DMGReadState { +uint8_t *saved_next_in; +int64_t saved_avail_in; +int32_t saved_chunk_type; +int64_t sectors_read; /* possible sectors read in each cycle */ +int32_t sector_offset_in_chunk; +} DMGReadState; + typedef struct BDRVDMGState { CoMutex lock; /* each chunk contains a certain number of sectors, @@ -51,6 +60,7 @@ typedef struct BDRVDMGState { uint8_t *compressed_chunk; uint8_t *uncompressed_chunk; z_stream zstream; +DMGReadState *drs; } BDRVDMGState; extern int (*dmg_uncompress_bz2)(char *next_in, unsigned int avail_in, -- 2.6.2
[Qemu-block] [PATCH v2 6/7] dmg: Refactor dmg_co_preadv() to start reading multiple sectors
At the moment, dmg_co_preadv() reads one sector at a time. Make it read multiple sectors at a time depending on the number of sectors stored in "drs->sectors_read". This does not provide any significant optimization in the I/O process of DMG but is still a nicer way. Adjust the 'data' variable depending on our cached access point situation to align our read requests appropriately. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 23 +++ 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index f9045f9..8b7460c 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -718,7 +718,7 @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, BDRVDMGState *s = bs->opaque; uint64_t sector_num = offset >> BDRV_SECTOR_BITS; int nb_sectors = bytes >> BDRV_SECTOR_BITS; -int ret, i; +int ret, i = 0; DMGReadState *drs = s->drs; assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); @@ -726,8 +726,7 @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, qemu_co_mutex_lock(&s->lock); -for (i = 0; i < nb_sectors; i++) { -uint32_t sector_offset_in_chunk; +while (i < nb_sectors) { void *data; if (dmg_read_chunk(bs, sector_num + i, drs) != 0) { @@ -738,12 +737,20 @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, * s->uncompressed_chunk may be too small to cover the large all-zeroes * section. 
dmg_read_chunk is called to find s->current_chunk */ if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */ -qemu_iovec_memset(qiov, i * 512, 0, 512); -continue; +qemu_iovec_memset(qiov, i * 512, 0, +512 * drs->sectors_read); +goto increment; +} + +if (drs->saved_next_in == NULL) { +data = s->uncompressed_chunk + drs->sector_offset_in_chunk * 512; +} else { +data = s->uncompressed_chunk; } -sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk]; -data = s->uncompressed_chunk + sector_offset_in_chunk * 512; -qemu_iovec_from_buf(qiov, i * 512, data, 512); +qemu_iovec_from_buf(qiov, i * 512, data, drs->sectors_read * 512); + +increment: +i += drs->sectors_read; } ret = 0; -- 2.6.2
[Qemu-block] [PATCH v2 7/7] dmg: Limit the output buffer size to a max of 2MB
The size of the output buffer is limited to a maximum of 2MB so that QEMU doesn't end up allocating huge amounts of memory while decompressing compressed input streams. 2MB is an appropriate size because "qemu-img convert" has the same I/O buffer size and the most important use case for DMG files is to be compatible with qemu-img convert. We have refactored the DMG driver to accept and process images irrespective of their chunk sizes since we now have a limit of 2MB on our output buffer size. Thus QEMU will not allocate huge amounts of memory no matter what the chunk size is. Remove the error messages to prevent denial-of-service in cases where untrusted files are being accessed by the user. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 24 +--- 1 file changed, 1 insertion(+), 23 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index 8b7460c..ade2578 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -37,7 +37,7 @@ enum { /* Limit chunk sizes to prevent unreasonable amounts of memory being used * or truncating when converting to 32-bit types */ -DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */ +DMG_LENGTHS_MAX = 2 * 1024 * 1024, /* 2 MB */ DMG_SECTOR_MAX = DMG_LENGTHS_MAX / 512, }; @@ -209,7 +209,6 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, uint8_t *buffer, uint32_t count) { uint32_t type, i; -int ret; size_t new_size; uint32_t chunk_count; int64_t offset = 0; @@ -258,16 +257,6 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, /* sector count */ s->sectorcounts[i] = buff_read_uint64(buffer, offset + 0x10); -/* all-zeroes sector (type 2) does not need to be "uncompressed" and can - * therefore be unbounded. 
*/ -if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTOR_MAX) { -error_report("sector count %" PRIu64 " for chunk %" PRIu32 - " is larger than max (%u)", - s->sectorcounts[i], i, DMG_SECTOR_MAX); -ret = -EINVAL; -goto fail; -} - /* offset in (compressed) data fork */ s->offsets[i] = buff_read_uint64(buffer, offset + 0x18); s->offsets[i] += in_offset; @@ -275,23 +264,12 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, /* length in (compressed) data fork */ s->lengths[i] = buff_read_uint64(buffer, offset + 0x20); -if (s->lengths[i] > DMG_LENGTHS_MAX) { -error_report("length %" PRIu64 " for chunk %" PRIu32 - " is larger than max (%u)", - s->lengths[i], i, DMG_LENGTHS_MAX); -ret = -EINVAL; -goto fail; -} - update_max_chunk_size(s, i, &ds->max_compressed_size, &ds->max_sectors_per_chunk); offset += 40; } s->n_chunks += chunk_count; return 0; - -fail: -return ret; } static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds, -- 2.6.2
[Qemu-block] [PATCH v2 3/7] dmg: Refactor and prepare dmg_read_chunk() to cache random access points
Refactor dmg_read_chunk() to start making use of the new DMGReadState structure and do chunk and sector related calculations based on it. Add a new argument "DMGReadState *drs" to it. Also, rename DMG_SECTORCOUNTS_MAX to DMG_SECTOR_MAX to avoid indentaion problems at some places inside dmg_read_chunk() Signed-off-by: Ashijeet Acharya --- block/dmg.c | 159 +++- 1 file changed, 94 insertions(+), 65 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index c6fe8b0..32623e2 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -38,7 +38,7 @@ enum { * or truncating when converting to 32-bit types */ DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */ -DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512, +DMG_SECTOR_MAX = DMG_LENGTHS_MAX / 512, }; static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) @@ -260,10 +260,10 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, /* all-zeroes sector (type 2) does not need to be "uncompressed" and can * therefore be unbounded. */ -if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { +if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTOR_MAX) { error_report("sector count %" PRIu64 " for chunk %" PRIu32 " is larger than max (%u)", - s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); + s->sectorcounts[i], i, DMG_SECTOR_MAX); ret = -EINVAL; goto fail; } @@ -578,78 +578,106 @@ static inline uint32_t search_chunk(BDRVDMGState *s, uint64_t sector_num) return s->n_chunks; /* error */ } -static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) +static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num, + DMGReadState *drs) { BDRVDMGState *s = bs->opaque; +int ret; +uint32_t sector_offset; +uint64_t sectors_read; +uint32_t chunk; + if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) { -int ret; -uint32_t chunk = search_chunk(s, sector_num); +chunk = search_chunk(s, sector_num); +} else { +chunk = drs->saved_chunk_type; +} -if (chunk >= s->n_chunks) { +if (chunk >= 
s->n_chunks) { +return -1; +} + +/* reset our access point cache if we had a change in current chunk */ +if (chunk != drs->saved_chunk_type) { +cache_access_point(drs, NULL, -1, -1, -1, -1); +} + +sector_offset = sector_num - s->sectors[chunk]; + +if ((s->sectorcounts[chunk] - sector_offset) > DMG_SECTOR_MAX) { +sectors_read = DMG_SECTOR_MAX; +} else { +sectors_read = s->sectorcounts[chunk] - sector_offset; +} + +/* truncate sectors read if it exceeds the 2MB buffer of qemu-img + * convert */ +if ((sector_num % DMG_SECTOR_MAX) + sectors_read > DMG_SECTOR_MAX) { +sectors_read = DMG_SECTOR_MAX - (sector_num % DMG_SECTOR_MAX); +} + +s->current_chunk = s->n_chunks; + +switch (s->types[chunk]) { /* block entry type */ +case 0x8005: { /* zlib compressed */ +/* we need to buffer, because only the chunk as whole can be + * inflated. */ +ret = bdrv_pread(bs->file, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); +if (ret != s->lengths[chunk]) { return -1; } -s->current_chunk = s->n_chunks; -switch (s->types[chunk]) { /* block entry type */ -case 0x8005: { /* zlib compressed */ -/* we need to buffer, because only the chunk as whole can be - * inflated. */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->compressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; -} - -s->zstream.next_in = s->compressed_chunk; -s->zstream.avail_in = s->lengths[chunk]; -s->zstream.next_out = s->uncompressed_chunk; -s->zstream.avail_out = 512 * s->sectorcounts[chunk]; -ret = inflateReset(&s->zstream); -if (ret != Z_OK) { -return -1; -} -ret = inflate(&s->zstream, Z_FINISH); -if (ret != Z_STREAM_END || -s->zstream.total_out != 512 * s->sectorcounts[chunk]) { -return -1; -} -break; } -case 0x8006: /* bzip2 compressed */ -if (!dmg_uncompress_bz2) { -break; -} -/* we need to buffer, because only t
[Qemu-block] [PATCH v2 2/7] dmg: New function to help us cache random access point
Introduce a new cache_access_point() function which will help us first cache a random access point inside a compressed stream and then keep updating it according to our requirement at appropriate times. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/block/dmg.c b/block/dmg.c index a7d25fc..c6fe8b0 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -128,6 +128,18 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, } } +static void cache_access_point(DMGReadState *drs, uint8_t *next_in, +int64_t avail_in, int32_t chunk, +int64_t sectors_read, int32_t sector_offset) +{ +drs->saved_next_in = next_in; +drs->saved_avail_in = avail_in; +drs->saved_chunk_type = chunk; +drs->sectors_read = sectors_read; +drs->sector_offset_in_chunk = sector_offset; +return; +} + static int64_t dmg_find_koly_offset(BdrvChild *file, Error **errp) { BlockDriverState *file_bs = file->bs; @@ -507,6 +519,10 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } +s->drs = g_malloc(sizeof(DMGReadState)); +/* initialise our access point cache */ +cache_access_point(s->drs, NULL, -1, -1, -1, -1); + if (inflateInit(&s->zstream) != Z_OK) { ret = -EINVAL; goto fail; @@ -523,6 +539,7 @@ fail: g_free(s->lengths); g_free(s->sectors); g_free(s->sectorcounts); +g_free(s->drs); qemu_vfree(s->compressed_chunk); qemu_vfree(s->uncompressed_chunk); return ret; @@ -685,6 +702,7 @@ static void dmg_close(BlockDriverState *bs) g_free(s->lengths); g_free(s->sectors); g_free(s->sectorcounts); +g_free(s->drs); qemu_vfree(s->compressed_chunk); qemu_vfree(s->uncompressed_chunk); -- 2.6.2
Re: [Qemu-block] [Qemu-devel] [PATCH v1 3/8] dmg: Limit the output buffer size to a max of 2MB
On Thu, Apr 27, 2017 at 12:56 PM, Fam Zheng wrote: > On Wed, 04/26 17:30, John Snow wrote: >> Seems OK otherwise, but I would normally expect you to fix the buffering >> problems first, and then reduce the size of the buffer -- not the other >> way around. This version introduces new limitations that didn't exist >> previously (As of this commit, QEMU can't open DMG files with chunks >> larger than 2MB now, right?) > > Yes, each commit should _not_ introduce issues (compiling failures, functional > degeneration, etc.), and cannot rely on following commits to fix things > screwed > up in this one. > > This is important for bisectability - each commit can be built and tested in > the > whole git history. Yes, understood. That's why I am gonna squash it with the last patch (patch 8) which removes this limitation completely. Ashijeet
Re: [Qemu-block] [Qemu-devel] [PATCH v1 3/8] dmg: Limit the output buffer size to a max of 2MB
On Thu, Apr 27, 2017 at 3:00 AM, John Snow wrote: > > > On 04/25/2017 03:59 PM, Ashijeet Acharya wrote: >> The size of the output buffer is limited to a maximum of 2MB so that >> QEMU doesn't end up allocating huge amounts of memory while >> decompressing compressed input streams. >> >> 2MB is an appropriate size because "qemu-img convert" has the same I/O >> buffer size and the most important use case for DMG files is to be >> compatible with qemu-img convert. >> >> Signed-off-by: Ashijeet Acharya >> --- > > Patch 1 adds a new structure and patch 2 starts using it, but in a > store-only manner and only with placeholder variables that are difficult > to authenticate, so there's still "insufficient data" to review either > patch meaningfully. > > This patch seems unrelated to either of those, so the ordering is strange. Actually, I have tried to keep these patches very short so that it is easier to review them (mainly because of the time limitation I have). But seems like I over tried. If you have any suggestions for the first 2 patches, I am happy to change them in you preferred way. > >> block/dmg.c | 12 ++-- >> 1 file changed, 6 insertions(+), 6 deletions(-) >> >> diff --git a/block/dmg.c b/block/dmg.c >> index c6fe8b0..7ae30e3 100644 >> --- a/block/dmg.c >> +++ b/block/dmg.c >> @@ -37,8 +37,8 @@ enum { >> /* Limit chunk sizes to prevent unreasonable amounts of memory being >> used >> * or truncating when converting to 32-bit types >> */ >> -DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */ >> -DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512, >> +DMG_MAX_OUTPUT = 2 * 1024 * 1024, /* 2 MB */ > > why "MAX OUTPUT" ? Aren't we using this for buffering on reads? I just thought that this looked better, but I will revert back to the original one. 
> >> +DMG_SECTOR_MAX = DMG_MAX_OUTPUT / 512, >> }; >> >> static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) >> @@ -260,10 +260,10 @@ static int dmg_read_mish_block(BDRVDMGState *s, >> DmgHeaderState *ds, >> >> /* all-zeroes sector (type 2) does not need to be "uncompressed" >> and can >> * therefore be unbounded. */ >> -if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { >> +if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTOR_MAX) { >> error_report("sector count %" PRIu64 " for chunk %" PRIu32 >> " is larger than max (%u)", >> - s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); >> + s->sectorcounts[i], i, DMG_SECTOR_MAX); >> ret = -EINVAL; >> goto fail; >> } >> @@ -275,10 +275,10 @@ static int dmg_read_mish_block(BDRVDMGState *s, >> DmgHeaderState *ds, >> /* length in (compressed) data fork */ >> s->lengths[i] = buff_read_uint64(buffer, offset + 0x20); >> >> -if (s->lengths[i] > DMG_LENGTHS_MAX) { >> +if (s->lengths[i] > DMG_MAX_OUTPUT) { >> error_report("length %" PRIu64 " for chunk %" PRIu32 >> " is larger than max (%u)", >> - s->lengths[i], i, DMG_LENGTHS_MAX); >> + s->lengths[i], i, DMG_MAX_OUTPUT); >> ret = -EINVAL; >> goto fail; >> } >> > > Seems OK otherwise, but I would normally expect you to fix the buffering > problems first, and then reduce the size of the buffer -- not the other > way around. This version introduces new limitations that didn't exist > previously (As of this commit, QEMU can't open DMG files with chunks > larger than 2MB now, right?) I think I will squash it with the last one (patch 8) which removes this limitation completely and will also fix the problem of handling the buffering problems first and then reducing the buffer size? Ashijeet
[Qemu-block] [PATCH v1 7/8] dmg: Refactor dmg_co_preadv() to start reading multiple sectors
At the moment, dmg_co_preadv() reads one sector at a time. Make it read multiple sectors at a time depending on the number of sectors stored in "drs->sectors_read". This does not provide any significant optimization in the I/O process of DMG but is still a nicer way. Adjust the 'data' variable depending on our cached access point situation to align our read requests appropriately. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 23 +++ 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index f643e41..b0f3c84 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -718,7 +718,7 @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, BDRVDMGState *s = bs->opaque; uint64_t sector_num = offset >> BDRV_SECTOR_BITS; int nb_sectors = bytes >> BDRV_SECTOR_BITS; -int ret, i; +int ret, i = 0; DMGReadState *drs = s->drs; assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); @@ -726,8 +726,7 @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, qemu_co_mutex_lock(&s->lock); -for (i = 0; i < nb_sectors; i++) { -uint32_t sector_offset_in_chunk; +while (i < nb_sectors) { void *data; if (dmg_read_chunk(bs, sector_num + i, drs) != 0) { @@ -738,12 +737,20 @@ dmg_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, * s->uncompressed_chunk may be too small to cover the large all-zeroes * section. 
dmg_read_chunk is called to find s->current_chunk */ if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */ -qemu_iovec_memset(qiov, i * 512, 0, 512); -continue; +qemu_iovec_memset(qiov, i * 512, 0, +512 * drs->sectors_read); +goto increment; +} + +if (drs->saved_next_in == NULL) { +data = s->uncompressed_chunk + drs->sector_offset_in_chunk * 512; +} else { +data = s->uncompressed_chunk; } -sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk]; -data = s->uncompressed_chunk + sector_offset_in_chunk * 512; -qemu_iovec_from_buf(qiov, i * 512, data, 512); +qemu_iovec_from_buf(qiov, i * 512, data, drs->sectors_read * 512); + +increment: +i += drs->sectors_read; } ret = 0; -- 2.6.2
[Qemu-block] [PATCH v1 4/8] dmg: Refactor and prepare dmg_read_chunk() to cache random access points
Refactor dmg_read_chunk() to start making use of the new DMGReadState structure and do chunk and sector related calculations based on it. Add a new argument "DMGReadState *drs" to it. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 153 1 file changed, 91 insertions(+), 62 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index 7ae30e3..dc356b0 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -578,78 +578,106 @@ static inline uint32_t search_chunk(BDRVDMGState *s, uint64_t sector_num) return s->n_chunks; /* error */ } -static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) +static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num, + DMGReadState *drs) { BDRVDMGState *s = bs->opaque; +int ret; +uint32_t sector_offset; +uint64_t sectors_read; +uint32_t chunk; + if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) { -int ret; -uint32_t chunk = search_chunk(s, sector_num); +chunk = search_chunk(s, sector_num); +} else { +chunk = drs->saved_chunk_type; +} -if (chunk >= s->n_chunks) { +if (chunk >= s->n_chunks) { +return -1; +} + +/* reset our access point cache if we had a change in current chunk */ +if (chunk != drs->saved_chunk_type) { +cache_access_point(drs, NULL, -1, -1, -1, -1); +} + +sector_offset = sector_num - s->sectors[chunk]; + +if ((s->sectorcounts[chunk] - sector_offset) > DMG_SECTOR_MAX) { +sectors_read = DMG_SECTOR_MAX; +} else { +sectors_read = s->sectorcounts[chunk] - sector_offset; +} + +/* truncate sectors read if it exceeds the 2MB buffer of qemu-img + * convert */ +if ((sector_num % DMG_SECTOR_MAX) + sectors_read > DMG_SECTOR_MAX) { +sectors_read = DMG_SECTOR_MAX - (sector_num % DMG_SECTOR_MAX); +} + +s->current_chunk = s->n_chunks; + +switch (s->types[chunk]) { /* block entry type */ +case 0x8005: { /* zlib compressed */ +/* we need to buffer, because only the chunk as whole can be + * inflated. 
*/ +ret = bdrv_pread(bs->file, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); +if (ret != s->lengths[chunk]) { return -1; } -s->current_chunk = s->n_chunks; -switch (s->types[chunk]) { /* block entry type */ -case 0x8005: { /* zlib compressed */ -/* we need to buffer, because only the chunk as whole can be - * inflated. */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->compressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; -} - -s->zstream.next_in = s->compressed_chunk; -s->zstream.avail_in = s->lengths[chunk]; -s->zstream.next_out = s->uncompressed_chunk; -s->zstream.avail_out = 512 * s->sectorcounts[chunk]; -ret = inflateReset(&s->zstream); -if (ret != Z_OK) { -return -1; -} -ret = inflate(&s->zstream, Z_FINISH); -if (ret != Z_STREAM_END || -s->zstream.total_out != 512 * s->sectorcounts[chunk]) { -return -1; -} -break; } -case 0x8006: /* bzip2 compressed */ -if (!dmg_uncompress_bz2) { -break; -} -/* we need to buffer, because only the chunk as whole can be - * inflated. */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->compressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; -} - -ret = dmg_uncompress_bz2((char *)s->compressed_chunk, - (unsigned int) s->lengths[chunk], - (char *)s->uncompressed_chunk, - (unsigned int) - (512 * s->sectorcounts[chunk])); -if (ret < 0) { -return ret; -} -break; -case 1: /* copy */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->uncompressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; -} -break; -case 2: /* zero */ -/* see dmg_read, it is treated specially. No buffer needs to be - * pre-filled, the zeroes can be set directly. */ +s->zstream.ne
[Qemu-block] [PATCH v1 8/8] dmg: Remove the error messages to allow wild images
We have refactored the DMG driver to accept and process images irrespective of their chunk sizes since we now have limit of 2MB on our output buffer size. Thus QEMU will not allocate huge amounts of memory no matter what the chunk size is. Remove the error messages to prevent denial-of-service in cases where untrusted files are being accessed by the user. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 22 -- 1 file changed, 22 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index b0f3c84..01ec40e 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -209,7 +209,6 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, uint8_t *buffer, uint32_t count) { uint32_t type, i; -int ret; size_t new_size; uint32_t chunk_count; int64_t offset = 0; @@ -258,16 +257,6 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, /* sector count */ s->sectorcounts[i] = buff_read_uint64(buffer, offset + 0x10); -/* all-zeroes sector (type 2) does not need to be "uncompressed" and can - * therefore be unbounded. 
*/ -if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTOR_MAX) { -error_report("sector count %" PRIu64 " for chunk %" PRIu32 - " is larger than max (%u)", - s->sectorcounts[i], i, DMG_SECTOR_MAX); -ret = -EINVAL; -goto fail; -} - /* offset in (compressed) data fork */ s->offsets[i] = buff_read_uint64(buffer, offset + 0x18); s->offsets[i] += in_offset; @@ -275,23 +264,12 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, /* length in (compressed) data fork */ s->lengths[i] = buff_read_uint64(buffer, offset + 0x20); -if (s->lengths[i] > DMG_MAX_OUTPUT) { -error_report("length %" PRIu64 " for chunk %" PRIu32 - " is larger than max (%u)", - s->lengths[i], i, DMG_MAX_OUTPUT); -ret = -EINVAL; -goto fail; -} - update_max_chunk_size(s, i, &ds->max_compressed_size, &ds->max_sectors_per_chunk); offset += 40; } s->n_chunks += chunk_count; return 0; - -fail: -return ret; } static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds, -- 2.6.2
[Qemu-block] [PATCH v1 5/8] dmg: Handle zlib compressed chunks
Set the output buffer size to be equal to the size of number of sectors stored in @sectors_read. Start inflating to a max output buffer size of 2MB and cache our access point to aid random access later if required. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 48 ++-- 1 file changed, 34 insertions(+), 14 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index dc356b0..749c151 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -621,27 +621,47 @@ static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num, switch (s->types[chunk]) { /* block entry type */ case 0x8005: { /* zlib compressed */ -/* we need to buffer, because only the chunk as whole can be - * inflated. */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->compressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; +/* check for cached random access point */ +if (drs->saved_next_in == NULL) { +/* we need to buffer, because only the chunk as whole can be + * inflated. */ +ret = bdrv_pread(bs->file, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); +if (ret != s->lengths[chunk]) { +return -1; +} + +s->zstream.next_in = s->compressed_chunk; +s->zstream.avail_in = s->lengths[chunk]; +} else { +s->zstream.next_in = drs->saved_next_in; +s->zstream.avail_in = drs->saved_avail_in; } -s->zstream.next_in = s->compressed_chunk; -s->zstream.avail_in = s->lengths[chunk]; s->zstream.next_out = s->uncompressed_chunk; -s->zstream.avail_out = 512 * s->sectorcounts[chunk]; -ret = inflateReset(&s->zstream); -if (ret != Z_OK) { -return -1; + +s->zstream.avail_out = sectors_read * BDRV_SECTOR_SIZE; + +if (drs->saved_next_in == NULL) { +ret = inflateReset(&s->zstream); +if (ret != Z_OK) { +return -1; +} +} +/* reset total_out for each successive call */ +s->zstream.total_out = 0; +ret = inflate(&s->zstream, Z_SYNC_FLUSH); +if (ret == Z_OK && +s->zstream.total_out == 512 * sectors_read) { +goto update; } -ret = inflate(&s->zstream, Z_FINISH); if (ret != Z_STREAM_END || 
-s->zstream.total_out != 512 * s->sectorcounts[chunk]) { +s->zstream.total_out != 512 * sectors_read) { return -1; } +update: +cache_access_point(drs, s->zstream.next_in, s->zstream.avail_in, + chunk, sectors_read, sector_offset); break; } case 0x8006: /* bzip2 compressed */ if (!dmg_uncompress_bz2) { -- 2.6.2
[Qemu-block] [PATCH v1 3/8] dmg: Limit the output buffer size to a max of 2MB
The size of the output buffer is limited to a maximum of 2MB so that QEMU doesn't end up allocating huge amounts of memory while decompressing compressed input streams. 2MB is an appropriate size because "qemu-img convert" has the same I/O buffer size and the most important use case for DMG files is to be compatible with qemu-img convert. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index c6fe8b0..7ae30e3 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -37,8 +37,8 @@ enum { /* Limit chunk sizes to prevent unreasonable amounts of memory being used * or truncating when converting to 32-bit types */ -DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */ -DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512, +DMG_MAX_OUTPUT = 2 * 1024 * 1024, /* 2 MB */ +DMG_SECTOR_MAX = DMG_MAX_OUTPUT / 512, }; static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) @@ -260,10 +260,10 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, /* all-zeroes sector (type 2) does not need to be "uncompressed" and can * therefore be unbounded. */ -if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { +if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTOR_MAX) { error_report("sector count %" PRIu64 " for chunk %" PRIu32 " is larger than max (%u)", - s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); + s->sectorcounts[i], i, DMG_SECTOR_MAX); ret = -EINVAL; goto fail; } @@ -275,10 +275,10 @@ static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, /* length in (compressed) data fork */ s->lengths[i] = buff_read_uint64(buffer, offset + 0x20); -if (s->lengths[i] > DMG_LENGTHS_MAX) { +if (s->lengths[i] > DMG_MAX_OUTPUT) { error_report("length %" PRIu64 " for chunk %" PRIu32 " is larger than max (%u)", - s->lengths[i], i, DMG_LENGTHS_MAX); + s->lengths[i], i, DMG_MAX_OUTPUT); ret = -EINVAL; goto fail; } -- 2.6.2
[Qemu-block] [PATCH v1 6/8] dmg: Handle bz2 compressed/raw/zeroed chunks
We do not need to cache the access point for these chunks but need to update our various supporting variables like chunk, sectors_read etc. to keep maintaining our code structure. Call cache_access_point() after reading chunks of these types. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 18 ++ 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/block/dmg.c b/block/dmg.c index 749c151..f643e41 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -680,20 +680,30 @@ update: (char *)s->uncompressed_chunk, (unsigned int) (512 * s->sectorcounts[chunk])); + if (ret < 0) { return ret; } +cache_access_point(drs, NULL, -1, chunk, sectors_read, + sector_offset); break; case 1: /* copy */ -ret = bdrv_pread(bs->file, s->offsets[chunk], - s->uncompressed_chunk, s->lengths[chunk]); -if (ret != s->lengths[chunk]) { -return -1; +if (drs->sectors_read == -1) { +ret = bdrv_pread(bs->file, s->offsets[chunk], + s->uncompressed_chunk, s->lengths[chunk]); +if (ret != s->lengths[chunk]) { +return -1; +} } +cache_access_point(drs, NULL, -1, chunk, sectors_read, + sector_offset); break; case 2: /* zero */ /* see dmg_read, it is treated specially. No buffer needs to be * pre-filled, the zeroes can be set directly. */ +cache_access_point(drs, NULL, -1, chunk, sectors_read, + sector_offset); + break; } s->current_chunk = chunk; -- 2.6.2
[Qemu-block] [PATCH v1 0/8] Refactor DMG driver to have chunk size independence
This series helps to provide chunk size independence for DMG driver to prevent denial-of-service in cases where untrusted files are being accessed by the user. This task is mentioned on the public block ToDo Here -> http://wiki.qemu.org/ToDo/Block/DmgChunkSizeIndependence Patch 1 introduces a new data structure to aid caching of random access points within a compressed stream. Patch 2 is an extension of patch 1 and introduces a new function to initialize/update/reset our cached random access point. Patch 3 limits the output buffer size to a max of 2MB to avoid QEMU allocating huge amounts of memory. Patch 4 is a simple preparatory patch to aid handling of various types of chunks. Patches 5 & 6 help to handle various types of chunks. Patch 7 simply refactors dmg_co_preadv() to read multiple sectors at once. Patch 8 finally removes the error messages QEMU used to throw when an image with chunk sizes above 64MB was accessed by the user. ->Testing procedure: Convert a DMG file to raw format using the "qemu-img convert" tool present in v2.9.0 Next convert the same image again after applying these patches. Compare the two images using "qemu-img compare" tool to check if they are identical. You can pick up any DMG image from the collection present Here -> https://lists.gnu.org/archive/html/qemu-devel/2014-12/msg03606.html ->Important note: These patches assume that the terms "chunk" and "block" are synonyms of each other when we talk about bz2 compressed streams. Thus according to the bz2 docs[1], the max uncompressed size of a chunk/block can reach 46MB which is less than the previously allowed size of 64MB, so we can continue decompressing the whole chunk/block at once instead of partial decompression just like we do now. 
This limitation was forced by the fact that bz2 compressed streams do not allow random access midway through a chunk/block as the BZ2_bzDecompress() API in bzlib seeks for the magic key "BZh" before starting decompression.[2] This magic key is present at the start of every chunk/block only and since our cached random access points need not necessarily point to the start of a chunk/block, BZ2_bzDecompress() fails with an error value BZ_DATA_ERROR_MAGIC[3] [1] https://en.wikipedia.org/wiki/Bzip2#File_format [2] https://blastedbio.blogspot.in/2011/11/random-access-to-bzip2.html [3] http://linux.math.tifr.res.in/manuals/html/manual_3.html#SEC17 Special thanks to Peter Wu for helping me understand and tackle the bz2 compressed chunks. Ashijeet Acharya (8): dmg: Introduce a new struct to cache random access points dmg: New function to help us cache random access point dmg: Limit the output buffer size to a max of 2MB dmg: Refactor and prepare dmg_read_chunk() to cache random access points dmg: Handle zlib compressed chunks dmg: Handle bz2 compressed/raw/zeroed chunks dmg: Refactor dmg_co_preadv() to start reading multiple sectors dmg: Remove the error messages to allow wild images block/dmg.c | 214 +++- block/dmg.h | 10 +++ 2 files changed, 148 insertions(+), 76 deletions(-) -- 2.6.2
[Qemu-block] [PATCH v1 1/8] dmg: Introduce a new struct to cache random access points
We need to cache the random access point while performing partial decompression so that we can resume decompression from that point onwards in our next sequential read request. Introduce a new struct DMGReadState which will help us do this. Signed-off-by: Ashijeet Acharya --- block/dmg.h | 10 ++ 1 file changed, 10 insertions(+) diff --git a/block/dmg.h b/block/dmg.h index b592d6f..ee67ae1 100644 --- a/block/dmg.h +++ b/block/dmg.h @@ -31,6 +31,15 @@ #include "block/block_int.h" #include +/* used to cache current position in compressed input stream */ +typedef struct DMGReadState { +uint8_t *saved_next_in; +int64_t saved_avail_in; +int32_t saved_chunk_type; +int64_t sectors_read; /* possible sectors read in each cycle */ +int32_t sector_offset_in_chunk; +} DMGReadState; + typedef struct BDRVDMGState { CoMutex lock; /* each chunk contains a certain number of sectors, @@ -51,6 +60,7 @@ typedef struct BDRVDMGState { uint8_t *compressed_chunk; uint8_t *uncompressed_chunk; z_stream zstream; +DMGReadState *drs; } BDRVDMGState; extern int (*dmg_uncompress_bz2)(char *next_in, unsigned int avail_in, -- 2.6.2
[Qemu-block] [PATCH v1 2/8] dmg: New function to help us cache random access point
Introduce a new cache_access_point() function which will help us first cache a random access point inside a compressed stream and then keep updating it according to our requirement at appropriate times. Signed-off-by: Ashijeet Acharya --- block/dmg.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/block/dmg.c b/block/dmg.c index a7d25fc..c6fe8b0 100644 --- a/block/dmg.c +++ b/block/dmg.c @@ -128,6 +128,18 @@ static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, } } +static void cache_access_point(DMGReadState *drs, uint8_t *next_in, +int64_t avail_in, int32_t chunk, +int64_t sectors_read, int32_t sector_offset) +{ +drs->saved_next_in = next_in; +drs->saved_avail_in = avail_in; +drs->saved_chunk_type = chunk; +drs->sectors_read = sectors_read; +drs->sector_offset_in_chunk = sector_offset; +return; +} + static int64_t dmg_find_koly_offset(BdrvChild *file, Error **errp) { BlockDriverState *file_bs = file->bs; @@ -507,6 +519,10 @@ static int dmg_open(BlockDriverState *bs, QDict *options, int flags, goto fail; } +s->drs = g_malloc(sizeof(DMGReadState)); +/* initialise our access point cache */ +cache_access_point(s->drs, NULL, -1, -1, -1, -1); + if (inflateInit(&s->zstream) != Z_OK) { ret = -EINVAL; goto fail; @@ -523,6 +539,7 @@ fail: g_free(s->lengths); g_free(s->sectors); g_free(s->sectorcounts); +g_free(s->drs); qemu_vfree(s->compressed_chunk); qemu_vfree(s->uncompressed_chunk); return ret; @@ -685,6 +702,7 @@ static void dmg_close(BlockDriverState *bs) g_free(s->lengths); g_free(s->sectors); g_free(s->sectorcounts); +g_free(s->drs); qemu_vfree(s->compressed_chunk); qemu_vfree(s->uncompressed_chunk); -- 2.6.2
[Qemu-block] [PATCH v4 8/8] vmdk: Make vmdk_get_cluster_offset() return cluster offset only
vmdk_alloc_clusters() introduced earlier now handles the task of allocating clusters and performing COW when needed. Thus we can change vmdk_get_cluster_offset() to stick to the sole purpose of returning cluster offset using sector number. Update the changes at all call sites. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 56 1 file changed, 12 insertions(+), 44 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index e52c373..be08bde 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1486,25 +1486,16 @@ static int vmdk_alloc_clusters(BlockDriverState *bs, * For flat extents, the start offset as parsed from the description file is * returned. * - * For sparse extents, look up in L1, L2 table. If allocate is true, return an - * offset for a new cluster and update L2 cache. If there is a backing file, - * COW is done before returning; otherwise, zeroes are written to the allocated - * cluster. Both COW and zero writing skips the sector range - * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller - * has new data to write there. + * For sparse extents, look up the L1, L2 table. * * Returns: VMDK_OK if cluster exists and mapped in the image. - * VMDK_UNALLOC if cluster is not mapped and @allocate is false. - * VMDK_ERROR if failed. + * VMDK_UNALLOC if cluster is not mapped. + * VMDK_ERROR if failed */ static int vmdk_get_cluster_offset(BlockDriverState *bs, VmdkExtent *extent, - VmdkMetaData *m_data, uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) + uint64_t *cluster_offset) { int l1_index, l2_offset, l2_index; uint32_t *l2_table; @@ -1529,31 +1520,9 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, } if (!cluster_sector || zeroed) { -if (!allocate) { -return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; -} - -cluster_sector = extent->next_cluster_sector; -extent->next_cluster_sector += extent->cluster_sectors; - -/* First of all we write grain itself, to avoid race condition - * that may to corrupt the image. - * This problem may occur because of insufficient space on host disk - * or inappropriate VM shutdown. - */ -ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, -offset, skip_start_bytes, skip_end_bytes); -if (ret) { -return ret; -} -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_data->l2_index = l2_index; -m_data->l2_offset = l2_offset; -m_data->l2_cache_entry = &l2_table[l2_index]; -} +return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; } + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; return VMDK_OK; } @@ -1596,9 +1565,7 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = vmdk_get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, sector_num * 512, &offset); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1789,13 +1756,14 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); +ret = vmdk_get_cluster_offset(bs, extent, offset, &cluster_offset); + if (ret != VMDK_OK) { /* if not allocated, try to read from parent image, if exist */ if (bs->backing && ret != VMDK_ZEROED) { @@ -2542,9 +2510,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = vmdk_get_cluster_offset(bs, extent, NULL, +ret = vmdk_get_cluster_offset(bs, extent, sector_num << BDRV_SECTOR_BITS, - false, 
&cluster_offset, 0, 0);
[Qemu-block] [PATCH v4 3/8] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
Rename the existing get_cluster_offset() to vmdk_get_cluster_offset() and update name in all the callers accordingly. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 46 +++--- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 73ae786..f403981 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1144,7 +1144,7 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, } /** - * get_cluster_offset + * vmdk_get_cluster_offset * * Look up cluster offset in extent file by sector number, and store in * @cluster_offset. @@ -1163,14 +1163,14 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, * VMDK_UNALLOC if cluster is not mapped and @allocate is false. * VMDK_ERROR if failed. */ -static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_bytes, + uint64_t skip_end_bytes) { unsigned int l1_index, l2_offset, l2_index; int min_index, i, j; @@ -1304,9 +1304,9 @@ static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, return 0; } qemu_co_mutex_lock(&s->lock); -ret = get_cluster_offset(bs, extent, NULL, - sector_num * 512, false, &offset, - 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num * 512, false, &offset, + 0, 0); qemu_co_mutex_unlock(&s->lock); index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); @@ -1497,8 +1497,8 @@ vmdk_co_preadv(BlockDriverState *bs, uint64_t offset, uint64_t bytes, ret = -EIO; goto fail; } -ret = get_cluster_offset(bs, extent, NULL, - offset, false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + offset, false, &cluster_offset, 0, 0); offset_in_cluster 
= vmdk_find_offset_in_cluster(extent, offset); n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE @@ -1584,10 +1584,10 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - offset_in_cluster); -ret = get_cluster_offset(bs, extent, &m_data, offset, - !(extent->compressed || zeroed), - &cluster_offset, offset_in_cluster, - offset_in_cluster + n_bytes); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + !(extent->compressed || zeroed), + &cluster_offset, offset_in_cluster, + offset_in_cluster + n_bytes); if (extent->compressed) { if (ret == VMDK_OK) { /* Refuse write to allocated cluster for streamOptimized */ @@ -1596,8 +1596,8 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, return -EIO; } else { /* allocate */ -ret = get_cluster_offset(bs, extent, &m_data, offset, - true, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, + true, &cluster_offset, 0, 0); } } if (ret == VMDK_ERROR) { @@ -2229,9 +2229,9 @@ static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, sector_num); break; } -ret = get_cluster_offset(bs, extent, NULL, - sector_num << BDRV_SECTOR_BITS, - false, &cluster_offset, 0, 0); +ret = vmdk_get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, +
[Qemu-block] [PATCH v4 5/8] vmdk: Set maximum bytes allocated in one cycle
Set the maximum bytes allowed to get allocated at once to be not more than the extent size boundary to handle writes at two separate extents appropriately. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 4cee868..7862791 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1624,6 +1624,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, uint64_t cluster_offset; uint64_t bytes_done = 0; VmdkMetaData m_data; +uint64_t extent_end; if (DIV_ROUND_UP(offset, BDRV_SECTOR_SIZE) > bs->total_sectors) { error_report("Wrong offset: offset=0x%" PRIx64 @@ -1637,9 +1638,17 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, if (!extent) { return -EIO; } +extent_end = extent->end_sector * BDRV_SECTOR_SIZE; + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - - offset_in_cluster); + +/* truncate n_bytes to first cluster because we need to perform COW */ +if (offset_in_cluster > 0) { +n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); +} else { +n_bytes = MIN(bytes, extent_end - offset); +} ret = vmdk_get_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), -- 2.6.2
[Qemu-block] [PATCH v4 4/8] vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset()
Move the cluster tables loading code out of the existing vmdk_get_cluster_offset() function and implement it in separate get_cluster_table() and vmdk_L2load() functions. This patch will help us avoid code duplication in future patches of this series. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 153 --- 1 file changed, 105 insertions(+), 48 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index f403981..4cee868 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1143,6 +1143,105 @@ static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, return VMDK_OK; } +/* + * vmdk_l2load + * + * Load a new L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns: + * VMDK_OK: on success + * VMDK_ERROR:in error cases + */ +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int l2_offset, + uint32_t **new_l2_table, int *new_l2_index) +{ +int min_index, i, j; +uint32_t *l2_table; +uint32_t min_count; + +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (l2_offset == extent->l2_cache_offsets[i]) { +/* increment the hit count */ +if (++extent->l2_cache_counts[i] == UINT32_MAX) { +for (j = 0; j < L2_CACHE_SIZE; j++) { +extent->l2_cache_counts[j] >>= 1; +} +} +l2_table = extent->l2_cache + (i * extent->l2_size); +goto found; +} +} +/* not found: load a new entry in the least used one */ +min_index = 0; +min_count = UINT32_MAX; +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (extent->l2_cache_counts[i] < min_count) { +min_count = extent->l2_cache_counts[i]; +min_index = i; +} +} +l2_table = extent->l2_cache + (min_index * extent->l2_size); +if (bdrv_pread(extent->file, +(int64_t)l2_offset * 512, +l2_table, +extent->l2_size * sizeof(uint32_t) +) != extent->l2_size * sizeof(uint32_t)) { +return VMDK_ERROR; +} + +extent->l2_cache_offsets[min_index] = l2_offset; +extent->l2_cache_counts[min_index] = 1; +found: +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; 
+*new_l2_table = l2_table; + +return VMDK_OK; +} + +/* + * get_cluster_table + * + * for a given offset, load (and allocate if needed) the l2 table. + * + * Returns: + * VMDK_OK:on success + * + * VMDK_UNALLOC: if cluster is not mapped + * + * VMDK_ERROR: in error cases + */ +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, + int *new_l1_index, int *new_l2_offset, + int *new_l2_index, uint32_t **new_l2_table) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +int ret; + +offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; +l1_index = (offset >> 9) / extent->l1_entry_sectors; +if (l1_index >= extent->l1_size) { +return VMDK_ERROR; +} +l2_offset = extent->l1_table[l1_index]; +if (!l2_offset) { +return VMDK_UNALLOC; +} + +ret = vmdk_l2load(extent, offset, l2_offset, &l2_table, &l2_index); +if (ret < 0) { +return ret; +} + +*new_l1_index = l1_index; +*new_l2_offset = l2_offset; +*new_l2_index = l2_index; +*new_l2_table = l2_table; + +return VMDK_OK; +} + /** * vmdk_get_cluster_offset * @@ -1172,66 +1271,24 @@ static int vmdk_get_cluster_offset(BlockDriverState *bs, uint64_t skip_start_bytes, uint64_t skip_end_bytes) { -unsigned int l1_index, l2_offset, l2_index; -int min_index, i, j; -uint32_t min_count, *l2_table; +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; bool zeroed = false; int64_t ret; int64_t cluster_sector; -if (m_data) { -m_data->valid = 0; -} if (extent->flat) { *cluster_offset = extent->flat_start_offset; return VMDK_OK; } -offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; -l1_index = (offset >> 9) / extent->l1_entry_sectors; -if (l1_index >= extent->l1_size) { -return VMDK_ERROR; -} -l2_offset = extent->l1_table[l1_index]; -if (!l2_offset) { -return VMDK_UNALLOC; -} -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (l2_offset == extent->l2_cache_offsets[i]) { -/* increment the hit count */ -if (++extent->l2_cache_counts[i] == 0x) { -for (j = 0; j < L2_CACHE_SIZE; j++) { -exten
[Qemu-block] [PATCH v4 2/8] vmdk: Rename get_whole_cluster() to vmdk_perform_cow()
Rename the existing function get_whole_cluster() to vmdk_perform_cow() as its sole purpose is to perform COW for the first and the last allocated clusters if needed. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 23 ++- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 22be887..73ae786 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1028,8 +1028,8 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -/** - * get_whole_cluster +/* + * vmdk_perform_cow * * Copy backing file's cluster that covers @sector_num, otherwise write zero, * to the cluster at @cluster_sector_num. @@ -1037,13 +1037,18 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) * If @skip_start_sector < @skip_end_sector, the relative range * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave * it for call to write user data in the request. + * + * Returns: + * VMDK_OK: on success + * + * VMDK_ERROR:in error cases */ -static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_perform_cow(BlockDriverState *bs, +VmdkExtent *extent, +uint64_t cluster_offset, +uint64_t offset, +uint64_t skip_start_bytes, +uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; @@ -1244,7 +1249,7 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ -ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; -- 2.6.2
[Qemu-block] [PATCH v4 6/8] vmdk: New functions to assist allocating multiple clusters
Introduce two new helper functions handle_alloc() and vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple clusters at once starting from a given offset on disk and performs COW if necessary for first and last allocated clusters. vmdk_alloc_cluster_offset() helps to return the offset of the first of the many newly allocated clusters. Also, provide proper documentation for both. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 192 +++ 1 file changed, 182 insertions(+), 10 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 7862791..8d34cd9 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { unsigned int l2_offset; int valid; uint32_t *l2_cache_entry; +uint32_t nb_clusters; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1242,6 +1243,174 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, return VMDK_OK; } +/* + * handle_alloc + * + * Allocate new clusters for an area that either is yet unallocated or needs a + * copy on write. If *cluster_offset is non_zero, clusters are only allocated if + * the new allocation can match the specified host offset. + * + * Returns: + * VMDK_OK: if new clusters were allocated, *bytes may be decreased if + * the new allocation doesn't cover all of the requested area. + * *cluster_offset is updated to contain the offset of the + * first newly allocated cluster. + * + * VMDK_UNALLOC: if no clusters could be allocated. *cluster_offset is left + * unchanged. 
+ * + * VMDK_ERROR:in error cases + */ +static int handle_alloc(BlockDriverState *bs, VmdkExtent *extent, +uint64_t offset, uint64_t *cluster_offset, +int64_t *bytes, VmdkMetaData *m_data, +bool allocate, uint32_t *total_alloc_clusters) +{ +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; +uint32_t cluster_sector; +uint32_t nb_clusters; +bool zeroed = false; +uint64_t skip_start_bytes, skip_end_bytes; +int ret; + +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, +&l2_index, &l2_table); +if (ret < 0) { +return ret; +} + +cluster_sector = le32_to_cpu(l2_table[l2_index]); + +skip_start_bytes = vmdk_find_offset_in_cluster(extent, offset); +/* Calculate the number of clusters to look for. Here we truncate the last + * cluster, i.e. 1 less than the actual value calculated as we may need to + * perform COW for the last one. */ +nb_clusters = DIV_ROUND_UP(skip_start_bytes + *bytes, +extent->cluster_sectors << BDRV_SECTOR_BITS) - 1; + +nb_clusters = MIN(nb_clusters, extent->l2_size - l2_index); +assert(nb_clusters <= INT_MAX); + +/* update bytes according to final nb_clusters value */ +if (nb_clusters != 0) { +*bytes = ((nb_clusters * extent->cluster_sectors) << 9) +- skip_start_bytes; +} else { +nb_clusters = 1; +} +*total_alloc_clusters += nb_clusters; +skip_end_bytes = skip_start_bytes + MIN(*bytes, + extent->cluster_sectors * BDRV_SECTOR_SIZE +- skip_start_bytes); + +if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { +zeroed = true; +} + +if (!cluster_sector || zeroed) { +if (!allocate) { +return zeroed ? 
VMDK_ZEROED : VMDK_UNALLOC; +} + +cluster_sector = extent->next_cluster_sector; +extent->next_cluster_sector += extent->cluster_sectors +* nb_clusters; + +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, + offset, skip_start_bytes, + skip_end_bytes); +if (ret < 0) { +return ret; +} +if (m_data) { +m_data->valid = 1; +m_data->l1_index = l1_index; +m_data->l2_index = l2_index; +m_data->l2_offset = l2_offset; +m_data->l2_cache_entry = &l2_table[l2_index]; +m_data->nb_clusters = nb_clusters; +} +} +*cluster_offset = cluster_sector << BDRV_SECTOR_BITS; +return VMDK_OK; +} + +/* + * vmdk_alloc_clusters + * + * For a given offset on the virtual disk, find the cluster offset in vmdk + * file. If the offset is not found, allocate a new cluster. + * + * If the cluster is newly allocated, m_data->nb_clusters is set to the number + * of contiguous clusters that have been allocated. In this case, the other + * fields of m_data are valid and contain information about the
[Qemu-block] [PATCH v4 1/8] vmdk: Move vmdk_find_offset_in_cluster() to the top
Move the existing vmdk_find_offset_in_cluster() function to the top of the driver. Signed-off-by: Ashijeet Acharya Reviewed-by: Fam Zheng --- block/vmdk.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a9bd22b..22be887 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -242,6 +242,18 @@ static void vmdk_free_last_extent(BlockDriverState *bs) s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ +uint64_t extent_begin_offset, extent_relative_offset; +uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + +extent_begin_offset = +(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; +extent_relative_offset = offset - extent_begin_offset; +return extent_relative_offset % cluster_size; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1266,18 +1278,6 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, - int64_t offset) -{ -uint64_t extent_begin_offset, extent_relative_offset; -uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; - -extent_begin_offset = -(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; -extent_relative_offset = offset - extent_begin_offset; -return extent_relative_offset % cluster_size; -} - static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { -- 2.6.2
[Qemu-block] [PATCH v4 7/8] vmdk: Update metadata for multiple clusters
Include a next pointer in VmdkMetaData struct to point to the previous allocated L2 table. Modify vmdk_L2update to start updating metadata for allocation of multiple clusters at once. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 129 ++- 1 file changed, 102 insertions(+), 27 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 8d34cd9..e52c373 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { int valid; uint32_t *l2_cache_entry; uint32_t nb_clusters; +uint32_t offset; +struct VmdkMetaData *next; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -1116,34 +1118,89 @@ exit: return ret; } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, - uint32_t offset) +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, + VmdkMetaData *m_data, bool zeroed) { -offset = cpu_to_le32(offset); +int i; +uint32_t offset, temp_offset; +int *l2_table_array; +int l2_array_size; + +if (zeroed) { +temp_offset = VMDK_GTE_ZEROED; +} else { +temp_offset = m_data->offset; +} + +temp_offset = cpu_to_le32(temp_offset); + +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; +l2_table_array = qemu_try_blockalign(extent->file->bs, + QEMU_ALIGN_UP(l2_array_size, + BDRV_SECTOR_SIZE)); +if (l2_table_array == NULL) { +return VMDK_ERROR; +} +memset(l2_table_array, 0, QEMU_ALIGN_UP(l2_array_size, BDRV_SECTOR_SIZE)); /* update L2 table */ +offset = temp_offset; +for (i = 0; i < m_data->nb_clusters; i++) { +l2_table_array[i] = offset; +if (!zeroed) { +offset += 128; +} +} if (bdrv_pwrite_sync(extent->file, -((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; if (bdrv_pwrite_sync(extent->file, 
-((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { + ((int64_t)m_data->l2_offset * 512) + + ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } } + +offset = temp_offset; if (m_data->l2_cache_entry) { -*m_data->l2_cache_entry = offset; +for (i = 0; i < m_data->nb_clusters; i++) { +*m_data->l2_cache_entry = offset; +m_data->l2_cache_entry++; + +if (!zeroed) { +offset += 128; +} +} } +qemu_vfree(l2_table_array); return VMDK_OK; } +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + bool zeroed) +{ +int ret; + +while (m_data->next != NULL) { + +ret = vmdk_alloc_cluster_link_l2(extent, m_data, zeroed); +if (ret < 0) { +return ret; +} + +m_data = m_data->next; + } + + return VMDK_OK; +} + /* * vmdk_l2load * @@ -1263,7 +1320,7 @@ static int get_cluster_table(VmdkExtent *extent, uint64_t offset, */ static int handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint64_t offset, uint64_t *cluster_offset, -int64_t *bytes, VmdkMetaData *m_data, +int64_t *bytes, VmdkMetaData **m_data, bool allocate, uint32_t *total_alloc_clusters) { int l1_index, l2_offset, l2_index; @@ -1272,6 +1329,7 @@ static int handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint32_t nb_clusters; bool zeroed = false; uint64_t skip_start_bytes, skip_end_bytes; +VmdkMetaData *old_m_data; int ret; ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, @@ -1323,13 +1381,21 @@ static int handle_alloc(BlockDriverState *bs, VmdkExtent *extent, if (ret < 0) { return ret; } -if (m_data) { -m_data->valid = 1; -m_data->l1_index = l1_index; -m_data->l2_index = l2_index; -m_data->l2_offset = l
[Qemu-block] [PATCH v4 0/8] Optimize VMDK I/O by allocating multiple clusters
Previously posted series patches: v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html v3 - http://lists.nongnu.org/archive/html/qemu-devel/2017-04/msg00074.html This series helps to optimize the I/O performance of the VMDK driver. Patch 1 helps us to move vmdk_find_offset_in_cluster. Patches 2 & 3 perform simple function re-naming tasks. Patch 4 is used to factor out metadata loading code and implement it in separate functions. This will help us to avoid code duplication in future patches of this series. Patch 5 helps to set the upper limit of the bytes handled in one cycle. Patch 6 adds new functions to help us allocate multiple clusters according to the size requested, perform COW if required and return the offset of the first newly allocated cluster. Patch 7 changes the metadata update code to update the L2 tables for multiple clusters at once. Patch 8 helps us to finally change vmdk_get_cluster_offset() to find the cluster offset only, as the cluster allocation task is now handled by vmdk_alloc_clusters(). Note: v4 has the addition of a new optimization of calling bdrv_pwrite_sync() only once for at most 512 clusters; as a result, performance has increased to a great extent (earlier, till v2, it was 29%). Optimization test results: This patch series improves 128 KB sequential write performance to an empty VMDK file by 54%. Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f vmdk test.vmdk To show off, these patches now complete a 128M write request on an empty VMDK file on my slow laptop in just 2.7 secs compared to 3.5 mins as in v2.9.0. This is obviously using qemu-io with "--cache writeback" and not an official benchmark, but worth mentioning from a newbie's perspective. Note: These patches pass all 41/41 tests suitable for the VMDK driver. 
Changes in v4: - fix commit message in patch 1 (fam) - drop size_to_clusters() function (fam) - fix grammatical errors in function documentation (fam) - factor out metadata loading code in a separate patch (patch 4) (fam) - rename vmdk_alloc_cluster_offset() to vmdk_alloc_clusters() (fam) - break patch 4 (in v3) into separate patches (patch 3 and 8) (fam) - rename extent_size to extent_end (fam) - use QEMU_ALIGN_UP instead of vmdk_align_offset (fam) - drop next and simply do m_data = m_data->next (fam) Changes in v3: - move size_to_clusters() from patch 1 to 3 (fam) - use DIV_ROUND_UP in size_to_clusters (fam) - make patch 2 compilable (fam) - rename vmdk_L2update as vmdk_l2update and use UINT32_MAX (fam) - combine patch 3 and patch 4 (as in v2) to make them compilable (fam) - call bdrv_pwrite_sync() for batches of at most 512 clusters at once (fam) Changes in v2: - segregate the ugly Patch 1 in v1 into 6 readable and sensible patches - include benchmark test results in v2 Ashijeet Acharya (8): vmdk: Move vmdk_find_offset_in_cluster() to the top vmdk: Rename get_whole_cluster() to vmdk_perform_cow() vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset() vmdk: Factor out metadata loading code out of vmdk_get_cluster_offset() vmdk: Set maximum bytes allocated in one cycle vmdk: New functions to assist allocating multiple clusters vmdk: Update metadata for multiple clusters vmdk: Make vmdk_get_cluster_offset() return cluster offset only block/vmdk.c | 530 +-- 1 file changed, 408 insertions(+), 122 deletions(-) -- 2.6.2
Re: [Qemu-block] [PATCH v3 5/6] vmdk: Set maximum bytes allocated in one cycle
On Fri, Apr 21, 2017 at 8:23 PM, Ashijeet Acharya wrote: > On Wed, Apr 19, 2017 at 6:30 PM, Fam Zheng wrote: >> On Sat, 04/01 20:14, Ashijeet Acharya wrote: >>> Set the maximum bytes allowed to get allocated at once to be not more >>> than the extent size boundary to handle writes at two separate extents >>> appropriately. >>> >>> Signed-off-by: Ashijeet Acharya >>> --- >>> block/vmdk.c | 13 +++-- >>> 1 file changed, 11 insertions(+), 2 deletions(-) >>> >>> diff --git a/block/vmdk.c b/block/vmdk.c >>> index a8babd7..9456ddd 100644 >>> --- a/block/vmdk.c >>> +++ b/block/vmdk.c >>> @@ -1767,6 +1767,7 @@ static int vmdk_pwritev(BlockDriverState *bs, >>> uint64_t offset, >>> int64_t offset_in_cluster, n_bytes; >>> uint64_t cluster_offset; >>> uint64_t bytes_done = 0; >>> +uint64_t extent_size; >>> VmdkMetaData m_data; >>> uint32_t total_alloc_clusters = 0; >>> >>> @@ -1782,9 +1783,17 @@ static int vmdk_pwritev(BlockDriverState *bs, >>> uint64_t offset, >>> if (!extent) { >>> return -EIO; >>> } >>> +extent_size = extent->end_sector * BDRV_SECTOR_SIZE; >> >> Maybe extent_end to be more accurate? > > Done > >>> + >>> offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); >>> -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE >>> - - offset_in_cluster); >>> + >>> +/* truncate n_bytes to first cluster because we need to perform >>> COW */ >> >> Makes sense, but shouldn't this be squashed into patch patch 3? Because it >> looks >> like it is fixing an intermediate bug. > > Did you mean that I should merge this whole patch into patch 3? Maybe > moving it before patch 3 rather than squashing it make more sense? Instead I have moved it before patch 3 in v4 Ashijeet
Re: [Qemu-block] [PATCH v3 6/6] vmdk: Update metadata for multiple clusters
On Fri, Apr 21, 2017 at 1:45 PM, Fam Zheng wrote: > On Sat, 04/01 20:14, Ashijeet Acharya wrote: >> Include a next pointer in VmdkMetaData struct to point to the previous >> allocated L2 table. Modify vmdk_L2update to start updating metadata for >> allocation of multiple clusters at once. >> >> Signed-off-by: Ashijeet Acharya > > This is the metadata part of the coalesed allocation. I think patch 3 is > functionally incomplete without these changes, and is perhaps broken because > metadata is not handled correctly. > > Such an "intermediate functional regression" is not good in a series, which we > need to avoid. I have moved this patch right after patch 3 because merging both will result in an unnecessary huge patch. Will that work? > >> --- >> block/vmdk.c | 136 >> --- >> 1 file changed, 111 insertions(+), 25 deletions(-) >> >> diff --git a/block/vmdk.c b/block/vmdk.c >> index 9456ddd..c7675db 100644 >> --- a/block/vmdk.c >> +++ b/block/vmdk.c >> @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { >> int valid; >> uint32_t *l2_cache_entry; >> uint32_t nb_clusters; >> +uint32_t offset; >> +struct VmdkMetaData *next; >> } VmdkMetaData; >> >> typedef struct VmdkGrainMarker { >> @@ -263,6 +265,12 @@ static inline uint64_t size_to_clusters(VmdkExtent >> *extent, uint64_t size) >> return (DIV_ROUND_UP(size + round_off_size, BDRV_SECTOR_SIZE * 128) - >> 1); >> } >> >> +static inline int64_t vmdk_align_offset(int64_t offset, int n) >> +{ >> +offset = (offset + n - 1) & ~(n - 1); >> +return offset; >> +} >> + >> static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) >> { >> char *desc; >> @@ -1037,29 +1045,88 @@ static void vmdk_refresh_limits(BlockDriverState >> *bs, Error **errp) >> } >> } >> >> -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, >> - uint32_t offset) >> +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, >> + VmdkMetaData *m_data, bool zeroed) >> { >> -offset = cpu_to_le32(offset); >> +int i; >> +uint32_t offset, 
temp_offset; >> +int *l2_table_array; >> +int l2_array_size; >> + >> +if (zeroed) { >> +temp_offset = VMDK_GTE_ZEROED; >> +} else { >> +temp_offset = m_data->offset; >> +} >> + >> +temp_offset = cpu_to_le32(temp_offset); >> + >> +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; >> +l2_table_array = qemu_try_blockalign(extent->file->bs, >> +vmdk_align_offset(l2_array_size, 512)); > > Indentation is off. > > Use QEMU_ALIGN_UP, instead of vmdk_align_offset. > > 512 is a magic number, use BDRV_SECTOR_SIZE. Done > >> +if (l2_table_array == NULL) { >> +return VMDK_ERROR; >> +} >> +memset(l2_table_array, 0, vmdk_align_offset(l2_array_size, 512)); >> + >> /* update L2 table */ >> +offset = temp_offset; >> +for (i = 0; i < m_data->nb_clusters; i++) { >> +l2_table_array[i] = offset; >> +if (!zeroed) { >> +offset += 128; > > Something is going wrong here with endianness on BE host, I believe. I have changed temp_offset to LE above, wouldn't that be enough. I am not sure. > >> +} >> +} >> + >> if (bdrv_pwrite_sync(extent->file, >> -((int64_t)m_data->l2_offset * 512) >> -+ (m_data->l2_index * sizeof(offset)), >> -&offset, sizeof(offset)) < 0) { >> +((int64_t)m_data->l2_offset * 512) >> ++ ((m_data->l2_index) * sizeof(offset)), >> + l2_table_array, l2_array_size) < 0) { > > You can fix the indentation while changing these lines. If not, don't change > it, > or at least don't make it uglier. I have aligned it, if it still looks ugly in v4, I will revert. > >> return VMDK_ERROR; >> } >> + >> /* update backup L2 table */ >> if (extent->l1_backup_table_offset != 0) { >> m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; >> if (bdrv_pwrite_sync(extent->file, >> ((int64_t)m_data->l2_offset * 512) >> -
Re: [Qemu-block] [PATCH v3 5/6] vmdk: Set maximum bytes allocated in one cycle
On Wed, Apr 19, 2017 at 6:30 PM, Fam Zheng wrote: > On Sat, 04/01 20:14, Ashijeet Acharya wrote: >> Set the maximum bytes allowed to get allocated at once to be not more >> than the extent size boundary to handle writes at two separate extents >> appropriately. >> >> Signed-off-by: Ashijeet Acharya >> --- >> block/vmdk.c | 13 +++-- >> 1 file changed, 11 insertions(+), 2 deletions(-) >> >> diff --git a/block/vmdk.c b/block/vmdk.c >> index a8babd7..9456ddd 100644 >> --- a/block/vmdk.c >> +++ b/block/vmdk.c >> @@ -1767,6 +1767,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t >> offset, >> int64_t offset_in_cluster, n_bytes; >> uint64_t cluster_offset; >> uint64_t bytes_done = 0; >> +uint64_t extent_size; >> VmdkMetaData m_data; >> uint32_t total_alloc_clusters = 0; >> >> @@ -1782,9 +1783,17 @@ static int vmdk_pwritev(BlockDriverState *bs, >> uint64_t offset, >> if (!extent) { >> return -EIO; >> } >> +extent_size = extent->end_sector * BDRV_SECTOR_SIZE; > > Maybe extent_end to be more accurate? Done >> + >> offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); >> -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE >> - - offset_in_cluster); >> + >> +/* truncate n_bytes to first cluster because we need to perform COW >> */ > > Makes sense, but shouldn't this be squashed into patch patch 3? Because it > looks > like it is fixing an intermediate bug. Did you mean that I should merge this whole patch into patch 3? Maybe moving it before patch 3 rather than squashing it make more sense? Ashijeet
Re: [Qemu-block] [PATCH v3 4/6] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
On Wed, Apr 19, 2017 at 18:27 Fam Zheng wrote: > On Sat, 04/01 20:14, Ashijeet Acharya wrote: > > Rename the existing get_cluster_offset() function to > > vmdk_get_cluster_offset() and have it make use of the new > > get_cluster_table() to load the cluster tables. Also, it is no longer > > used to allocate new clusters and hence perform COW. Make the necessary > > renames at all the occurrences of get_cluster_offset(). > > > > Signed-off-by: Ashijeet Acharya > > --- > > block/vmdk.c | 117 > +++ > > 1 file changed, 21 insertions(+), 96 deletions(-) > > This is definitely more than a function rename, like I said in reply to > patch 3, > it could probably be split to smaller ones (rename, and others, for > example), > and reordered to make reviewing easier. Maybe, because I have also refactored it to have vmdk_get_cluster_offset() make use of the get_cluster_table() (and friends) to avoid duplication. I will try to split it as 1. Rename 2. Refactor it to make use of get_cluster_table() by moving that out of patch 3 as of now. Will that work? I think this will also keep the compiler happy while reviewing. Ashijeet
Re: [Qemu-block] [PATCH v3 3/6] vmdk: New functions to assist allocating multiple clusters
On Wed, Apr 19, 2017 at 18:26 Fam Zheng wrote: > On Sat, 04/01 20:14, Ashijeet Acharya wrote: > > Move the cluster tables loading code out of the existing > > get_cluster_offset() function to avoid code duplication and implement it > > in separate get_cluster_table() and vmdk_L2load() functions. > > > > Introduce two new helper functions handle_alloc() and > > vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple > > clusters at once starting from a given offset on disk and performs COW > > if necessary for first and last allocated clusters. > > vmdk_alloc_cluster_offset() helps to return the offset of the first of > > the many newly allocated clusters. Also, provide proper documentation > > for both. > > > > Signed-off-by: Ashijeet Acharya > > --- > > block/vmdk.c | 337 > ++- > > 1 file changed, 308 insertions(+), 29 deletions(-) > > > > diff --git a/block/vmdk.c b/block/vmdk.c > > index 73ae786..e5a289d 100644 > > --- a/block/vmdk.c > > +++ b/block/vmdk.c > > @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { > > unsigned int l2_offset; > > int valid; > > uint32_t *l2_cache_entry; > > +uint32_t nb_clusters; > > } VmdkMetaData; > > > > typedef struct VmdkGrainMarker { > > @@ -254,6 +255,14 @@ static inline uint64_t > vmdk_find_offset_in_cluster(VmdkExtent *extent, > > return extent_relative_offset % cluster_size; > > } > > > > +static inline uint64_t size_to_clusters(VmdkExtent *extent, uint64_t > size) > > +{ > > +uint64_t cluster_size, round_off_size; > > +cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; > > +round_off_size = cluster_size - (size % cluster_size); > > +return DIV_ROUND_UP(size + round_off_size, BDRV_SECTOR_SIZE * 128) > - 1; > > What is (BDRV_SECTOR_SIZE * 128)? Do you mean extent->cluster_size? And > the > function doesn't make sense up to me. > > Just un-inline this to > > DIV_ROUND_UP(size, > extent->cluster_sectors << BDRV_SECTOR_BITS) - 1 > > in the calling site and be done with it. 
> > > +} > > + > > static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) > > { > > char *desc; > > @@ -1028,6 +1037,133 @@ static void vmdk_refresh_limits(BlockDriverState > *bs, Error **errp) > > } > > } > > > > +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, > > + uint32_t offset) > > +{ > > +offset = cpu_to_le32(offset); > > +/* update L2 table */ > > +if (bdrv_pwrite_sync(extent->file, > > +((int64_t)m_data->l2_offset * 512) > > ++ (m_data->l2_index * sizeof(offset)), > > +&offset, sizeof(offset)) < 0) { > > +return VMDK_ERROR; > > +} > > +/* update backup L2 table */ > > +if (extent->l1_backup_table_offset != 0) { > > +m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; > > +if (bdrv_pwrite_sync(extent->file, > > +((int64_t)m_data->l2_offset * 512) > > ++ (m_data->l2_index * sizeof(offset)), > > +&offset, sizeof(offset)) < 0) { > > +return VMDK_ERROR; > > +} > > +} > > +if (m_data->l2_cache_entry) { > > +*m_data->l2_cache_entry = offset; > > +} > > + > > +return VMDK_OK; > > +} > > + > > +/* > > + * vmdk_l2load > > + * > > + * Loads a new L2 table into memory. If the table is in the cache, the > cache > > Not a native speaker, but s/Loads/Load/ feels more nature and consistent > with > other comments. > > > + * is used; otherwise the L2 table is loaded from the image file. > > + * > > + * Returns: > > + * VMDK_OK: on success > > + * VMDK_ERROR:in error cases > > + */ > > +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int > l2_offset, > > + uint32_t **new_l2_table, int *new_l2_index) > > +{ > > +int min_index, i, j; > > +uint32_t *l2_table; > > +uint32_t min_count; > > + > > +for (i = 0; i < L2_CACHE_SIZE; i++) { > > +if (l2_offset == extent->l2_cache_offsets[i]) { > > +/* increment the hit count */ > > +if (++extent->l2_cache_counts[i] == UINT32_MAX)
Re: [Qemu-block] [PATCH v3 1/6] vmdk: Move vmdk_find_offset_in_cluster() to the top
On Sat, Apr 1, 2017 at 8:14 PM, Ashijeet Acharya wrote: > Move the existing vmdk_find_offset_in_cluster() function to the top of > the driver. Also, introduce a new helper function size_to_clusters() > which returns the number of clusters for a given size in bytes. Here, > we leave the last cluster as we need to perform COW for that one. > I will remove the trailing part of the commit message in v4 as there is no size_to_clusters() in this patch anymore, I forgot to update it! Ashijeet
[Qemu-block] [PATCH v3 5/6] vmdk: Set maximum bytes allocated in one cycle
Set the maximum bytes allowed to get allocated at once to be not more than the extent size boundary to handle writes at two separate extents appropriately. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a8babd7..9456ddd 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1767,6 +1767,7 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, int64_t offset_in_cluster, n_bytes; uint64_t cluster_offset; uint64_t bytes_done = 0; +uint64_t extent_size; VmdkMetaData m_data; uint32_t total_alloc_clusters = 0; @@ -1782,9 +1783,17 @@ static int vmdk_pwritev(BlockDriverState *bs, uint64_t offset, if (!extent) { return -EIO; } +extent_size = extent->end_sector * BDRV_SECTOR_SIZE; + offset_in_cluster = vmdk_find_offset_in_cluster(extent, offset); -n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE - - offset_in_cluster); + +/* truncate n_bytes to first cluster because we need to perform COW */ +if (offset_in_cluster > 0) { +n_bytes = MIN(bytes, extent->cluster_sectors * BDRV_SECTOR_SIZE + - offset_in_cluster); +} else { +n_bytes = MIN(bytes, extent_size - offset); +} ret = vmdk_alloc_cluster_offset(bs, extent, &m_data, offset, !(extent->compressed || zeroed), -- 2.6.2
[Qemu-block] [PATCH v3 6/6] vmdk: Update metadata for multiple clusters
Include a next pointer in VmdkMetaData struct to point to the previous allocated L2 table. Modify vmdk_L2update to start updating metadata for allocation of multiple clusters at once. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 136 --- 1 file changed, 111 insertions(+), 25 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 9456ddd..c7675db 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { int valid; uint32_t *l2_cache_entry; uint32_t nb_clusters; +uint32_t offset; +struct VmdkMetaData *next; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -263,6 +265,12 @@ static inline uint64_t size_to_clusters(VmdkExtent *extent, uint64_t size) return (DIV_ROUND_UP(size + round_off_size, BDRV_SECTOR_SIZE * 128) - 1); } +static inline int64_t vmdk_align_offset(int64_t offset, int n) +{ +offset = (offset + n - 1) & ~(n - 1); +return offset; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1037,29 +1045,88 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, - uint32_t offset) +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, + VmdkMetaData *m_data, bool zeroed) { -offset = cpu_to_le32(offset); +int i; +uint32_t offset, temp_offset; +int *l2_table_array; +int l2_array_size; + +if (zeroed) { +temp_offset = VMDK_GTE_ZEROED; +} else { +temp_offset = m_data->offset; +} + +temp_offset = cpu_to_le32(temp_offset); + +l2_array_size = sizeof(uint32_t) * m_data->nb_clusters; +l2_table_array = qemu_try_blockalign(extent->file->bs, +vmdk_align_offset(l2_array_size, 512)); +if (l2_table_array == NULL) { +return VMDK_ERROR; +} +memset(l2_table_array, 0, vmdk_align_offset(l2_array_size, 512)); + /* update L2 table */ +offset = temp_offset; +for (i = 0; i < m_data->nb_clusters; i++) { +l2_table_array[i] = offset; +if (!zeroed) { +offset += 128; +} +} + if (bdrv_pwrite_sync(extent->file, 
-((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { +((int64_t)m_data->l2_offset * 512) ++ ((m_data->l2_index) * sizeof(offset)), + l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } + /* update backup L2 table */ if (extent->l1_backup_table_offset != 0) { m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; if (bdrv_pwrite_sync(extent->file, ((int64_t)m_data->l2_offset * 512) -+ (m_data->l2_index * sizeof(offset)), -&offset, sizeof(offset)) < 0) { ++ ((m_data->l2_index) * sizeof(offset)), +l2_table_array, l2_array_size) < 0) { return VMDK_ERROR; } } + +offset = temp_offset; if (m_data->l2_cache_entry) { -*m_data->l2_cache_entry = offset; +for (i = 0; i < m_data->nb_clusters; i++) { +*m_data->l2_cache_entry = offset; +m_data->l2_cache_entry++; + +if (!zeroed) { +offset += 128; +} +} +} + +qemu_vfree(l2_table_array); +return VMDK_OK; +} + +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + bool zeroed) +{ +int ret; + +while (m_data->next != NULL) { +VmdkMetaData *next; + +ret = vmdk_alloc_cluster_link_l2(extent, m_data, zeroed); +if (ret < 0) { +return ret; +} + +next = m_data->next; +m_data = next; } return VMDK_OK; @@ -1271,7 +1338,7 @@ exit: */ static int handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint64_t offset, uint64_t *cluster_offset, -int64_t *bytes, VmdkMetaData *m_data, +int64_t *bytes, VmdkMetaData **m_data, bool allocate, uint32_t *total_alloc_clusters) { int l1_index, l2_offset, l2_index; @@ -1280,6 +1347,7 @@ static int handle_alloc(BlockDriverState *bs, VmdkExtent *extent, uint32_t nb_clusters; bool zeroed = false; uint64_t skip_start_bytes, skip_end_bytes; +VmdkMetaData *old_m_data; int ret; ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, @@ -1330,13 +1398,21 @@ static int handle_alloc(BlockDriverState *bs, VmdkExtent *exten
[Qemu-block] [PATCH v3 4/6] vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset()
Rename the existing get_cluster_offset() function to vmdk_get_cluster_offset() and have it make use of the new get_cluster_table() to load the cluster tables. Also, it is no longer used to allocate new clusters and hence perform COW. Make the necessary renames at all the occurrences of get_cluster_offset(). Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 117 +++ 1 file changed, 21 insertions(+), 96 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index e5a289d..a8babd7 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1419,7 +1419,7 @@ static int vmdk_alloc_cluster_offset(BlockDriverState *bs, } /** - * get_cluster_offset + * vmdk_get_cluster_offset * * Look up cluster offset in extent file by sector number, and store in * @cluster_offset. @@ -1427,84 +1427,34 @@ static int vmdk_alloc_cluster_offset(BlockDriverState *bs, * For flat extents, the start offset as parsed from the description file is * returned. * - * For sparse extents, look up in L1, L2 table. If allocate is true, return an - * offset for a new cluster and update L2 cache. If there is a backing file, - * COW is done before returning; otherwise, zeroes are written to the allocated - * cluster. Both COW and zero writing skips the sector range - * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller - * has new data to write there. + * For sparse extents, look up in L1, L2 table. * * Returns: VMDK_OK if cluster exists and mapped in the image. - * VMDK_UNALLOC if cluster is not mapped and @allocate is false. - * VMDK_ERROR if failed. + * VMDK_UNALLOC if cluster is not mapped. 
+ * VMDK_ERROR if failed */ -static int get_cluster_offset(BlockDriverState *bs, - VmdkExtent *extent, - VmdkMetaData *m_data, - uint64_t offset, - bool allocate, - uint64_t *cluster_offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + uint64_t offset, + uint64_t *cluster_offset) { -unsigned int l1_index, l2_offset, l2_index; -int min_index, i, j; -uint32_t min_count, *l2_table; +int l1_index, l2_offset, l2_index; +uint32_t *l2_table; bool zeroed = false; int64_t ret; int64_t cluster_sector; -if (m_data) { -m_data->valid = 0; -} if (extent->flat) { *cluster_offset = extent->flat_start_offset; return VMDK_OK; } -offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; -l1_index = (offset >> 9) / extent->l1_entry_sectors; -if (l1_index >= extent->l1_size) { -return VMDK_ERROR; -} -l2_offset = extent->l1_table[l1_index]; -if (!l2_offset) { -return VMDK_UNALLOC; -} -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (l2_offset == extent->l2_cache_offsets[i]) { -/* increment the hit count */ -if (++extent->l2_cache_counts[i] == 0x) { -for (j = 0; j < L2_CACHE_SIZE; j++) { -extent->l2_cache_counts[j] >>= 1; -} -} -l2_table = extent->l2_cache + (i * extent->l2_size); -goto found; -} -} -/* not found: load a new entry in the least used one */ -min_index = 0; -min_count = 0x; -for (i = 0; i < L2_CACHE_SIZE; i++) { -if (extent->l2_cache_counts[i] < min_count) { -min_count = extent->l2_cache_counts[i]; -min_index = i; -} -} -l2_table = extent->l2_cache + (min_index * extent->l2_size); -if (bdrv_pread(extent->file, -(int64_t)l2_offset * 512, -l2_table, -extent->l2_size * sizeof(uint32_t) -) != extent->l2_size * sizeof(uint32_t)) { -return VMDK_ERROR; +ret = get_cluster_table(extent, offset, &l1_index, &l2_offset, +&l2_index, &l2_table); +if (ret < 0) { +return ret; } -extent->l2_cache_offsets[min_index] = l2_offset; -extent->l2_cache_counts[min_index] = 1; - found: -l2_index = ((offset 
>> 9) / extent->cluster_sectors) % extent->l2_size; cluster_sector = le32_to_cpu(l2_table[l2_index]); if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { @@ -1512,31 +1462,9 @@ static int get_cluster_offset(BlockDriverState *bs, } if (!cluster_sector || zeroed) { -if (!allocate) { -return zeroed ? VMDK_ZE
[Qemu-block] [PATCH v3 3/6] vmdk: New functions to assist allocating multiple clusters
Move the cluster tables loading code out of the existing get_cluster_offset() function to avoid code duplication and implement it in separate get_cluster_table() and vmdk_L2load() functions. Introduce two new helper functions handle_alloc() and vmdk_alloc_cluster_offset(). handle_alloc() helps to allocate multiple clusters at once starting from a given offset on disk and performs COW if necessary for first and last allocated clusters. vmdk_alloc_cluster_offset() helps to return the offset of the first of the many newly allocated clusters. Also, provide proper documentation for both. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 337 ++- 1 file changed, 308 insertions(+), 29 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 73ae786..e5a289d 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -136,6 +136,7 @@ typedef struct VmdkMetaData { unsigned int l2_offset; int valid; uint32_t *l2_cache_entry; +uint32_t nb_clusters; } VmdkMetaData; typedef struct VmdkGrainMarker { @@ -254,6 +255,14 @@ static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, return extent_relative_offset % cluster_size; } +static inline uint64_t size_to_clusters(VmdkExtent *extent, uint64_t size) +{ +uint64_t cluster_size, round_off_size; +cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; +round_off_size = cluster_size - (size % cluster_size); +return DIV_ROUND_UP(size + round_off_size, BDRV_SECTOR_SIZE * 128) - 1; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1028,6 +1037,133 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + uint32_t offset) +{ +offset = cpu_to_le32(offset); +/* update L2 table */ +if (bdrv_pwrite_sync(extent->file, +((int64_t)m_data->l2_offset * 512) ++ (m_data->l2_index * sizeof(offset)), +&offset, sizeof(offset)) < 0) { +return VMDK_ERROR; +} +/* update backup L2 table */ +if (extent->l1_backup_table_offset != 
0) { +m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; +if (bdrv_pwrite_sync(extent->file, +((int64_t)m_data->l2_offset * 512) ++ (m_data->l2_index * sizeof(offset)), +&offset, sizeof(offset)) < 0) { +return VMDK_ERROR; +} +} +if (m_data->l2_cache_entry) { +*m_data->l2_cache_entry = offset; +} + +return VMDK_OK; +} + +/* + * vmdk_l2load + * + * Loads a new L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns: + * VMDK_OK: on success + * VMDK_ERROR:in error cases + */ +static int vmdk_l2load(VmdkExtent *extent, uint64_t offset, int l2_offset, + uint32_t **new_l2_table, int *new_l2_index) +{ +int min_index, i, j; +uint32_t *l2_table; +uint32_t min_count; + +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (l2_offset == extent->l2_cache_offsets[i]) { +/* increment the hit count */ +if (++extent->l2_cache_counts[i] == UINT32_MAX) { +for (j = 0; j < L2_CACHE_SIZE; j++) { +extent->l2_cache_counts[j] >>= 1; +} +} +l2_table = extent->l2_cache + (i * extent->l2_size); +goto found; +} +} +/* not found: load a new entry in the least used one */ +min_index = 0; +min_count = UINT32_MAX; +for (i = 0; i < L2_CACHE_SIZE; i++) { +if (extent->l2_cache_counts[i] < min_count) { +min_count = extent->l2_cache_counts[i]; +min_index = i; +} +} +l2_table = extent->l2_cache + (min_index * extent->l2_size); +if (bdrv_pread(extent->file, +(int64_t)l2_offset * 512, +l2_table, +extent->l2_size * sizeof(uint32_t) +) != extent->l2_size * sizeof(uint32_t)) { +return VMDK_ERROR; +} + +extent->l2_cache_offsets[min_index] = l2_offset; +extent->l2_cache_counts[min_index] = 1; +found: +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; +*new_l2_table = l2_table; + +return VMDK_OK; +} + +/* + * get_cluster_table + * + * for a given offset, load (and allocate if needed) the l2 table. 
+ * + * Returns: + * VMDK_OK:on success + * + * VMDK_UNALLOC: if cluster is not mapped + * + * VMDK_ERROR: in error cases + */ +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, + int *new_l1_index,
[Qemu-block] [PATCH v3 2/6] vmdk: Rename get_whole_cluster() to vmdk_perform_cow()
Rename the existing function get_whole_cluster() to vmdk_perform_cow() as its sole purpose is to perform COW for the first and the last allocated clusters if needed. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 23 ++- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index 22be887..73ae786 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -1028,8 +1028,8 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) } } -/** - * get_whole_cluster +/* + * vmdk_perform_cow * * Copy backing file's cluster that covers @sector_num, otherwise write zero, * to the cluster at @cluster_sector_num. @@ -1037,13 +1037,18 @@ static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) * If @skip_start_sector < @skip_end_sector, the relative range * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave * it for call to write user data in the request. + * + * Returns: + * VMDK_OK: on success + * + * VMDK_ERROR:in error cases */ -static int get_whole_cluster(BlockDriverState *bs, - VmdkExtent *extent, - uint64_t cluster_offset, - uint64_t offset, - uint64_t skip_start_bytes, - uint64_t skip_end_bytes) +static int vmdk_perform_cow(BlockDriverState *bs, +VmdkExtent *extent, +uint64_t cluster_offset, +uint64_t offset, +uint64_t skip_start_bytes, +uint64_t skip_end_bytes) { int ret = VMDK_OK; int64_t cluster_bytes; @@ -1244,7 +1249,7 @@ static int get_cluster_offset(BlockDriverState *bs, * This problem may occur because of insufficient space on host disk * or inappropriate VM shutdown. */ -ret = get_whole_cluster(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, +ret = vmdk_perform_cow(bs, extent, cluster_sector * BDRV_SECTOR_SIZE, offset, skip_start_bytes, skip_end_bytes); if (ret) { return ret; -- 2.6.2
[Qemu-block] [PATCH v3 0/6] Optimize VMDK I/O by allocating multiple clusters
Previously posted series patches: v1 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg02044.html v2 - http://lists.nongnu.org/archive/html/qemu-devel/2017-03/msg05080.html This series helps to optimize the I/O performance of VMDK driver. Patch 1 helps us to move vmdk_find_offset_in_cluster. Patch 2 performs a simple function re-naming task. Patch 3 adds new functions to help us allocate multiple clusters according to the size requested, perform COW if required and return the offset of the first newly allocated cluster. Also make loading of metadata tables easier and avoid code duplication. Patch 4 performs a simple function re-naming task and re-factors it to make use of new metadata functions to avoid code duplication. Patch 5 helps to set the upper limit of the bytes handled in one cycle. Patch 6 changes the metadata update code to update the L2 tables for multiple clusters at once. Note: v3 has an addition of new optimization of calling bdrv_pwrite_sync() only once for atmost 512 clusters, as a result performance has increased to a great extent (earlier till v2 it was 29%). Optimization test results: This patch series improves 128 KB sequential write performance to an empty VMDK file by 54% Benchmark command: ./qemu-img bench -w -c 1024 -s 128K -d 1 -t none -f vmdk test.vmdk Note: These patches pass all 41/41 tests suitable for the VMDK driver. 
Changes in v3: - move size_to_clusters() from patch 1 to 3 (fam) - use DIV_ROUND_UP in size_to_clusters (fam) - make patch 2 compilable (fam) - rename vmdk_L2update as vmdk_l2update and use UINT32_MAX (fam) - combine patch 3 and patch 4 (as in v2) to make them compilable (fam) - call bdrv_pwrite_sync() for batches of atmost 512 clusters at once (fam) Changes in v2: - segregate the ugly Patch 1 in v1 into 6 readable and sensible patches - include benchmark test results in v2 Ashijeet Acharya (6): vmdk: Move vmdk_find_offset_in_cluster() to the top vmdk: Rename get_whole_cluster() to vmdk_perform_cow() vmdk: New functions to assist allocating multiple clusters vmdk: Rename get_cluster_offset() to vmdk_get_cluster_offset() vmdk: Set maximum bytes allocated in one cycle vmdk: Update metadata for multiple clusters block/vmdk.c | 608 --- 1 file changed, 456 insertions(+), 152 deletions(-) -- 2.6.2
[Qemu-block] [PATCH v3 1/6] vmdk: Move vmdk_find_offset_in_cluster() to the top
Move the existing vmdk_find_offset_in_cluster() function to the top of the driver. Also, introduce a new helper function size_to_clusters() which returns the number of clusters for a given size in bytes. Here, we leave the last cluster as we need to perform COW for that one. Signed-off-by: Ashijeet Acharya --- block/vmdk.c | 24 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/block/vmdk.c b/block/vmdk.c index a9bd22b..22be887 100644 --- a/block/vmdk.c +++ b/block/vmdk.c @@ -242,6 +242,18 @@ static void vmdk_free_last_extent(BlockDriverState *bs) s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); } +static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, + int64_t offset) +{ +uint64_t extent_begin_offset, extent_relative_offset; +uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; + +extent_begin_offset = +(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; +extent_relative_offset = offset - extent_begin_offset; +return extent_relative_offset % cluster_size; +} + static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) { char *desc; @@ -1266,18 +1278,6 @@ static VmdkExtent *find_extent(BDRVVmdkState *s, return NULL; } -static inline uint64_t vmdk_find_offset_in_cluster(VmdkExtent *extent, - int64_t offset) -{ -uint64_t extent_begin_offset, extent_relative_offset; -uint64_t cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE; - -extent_begin_offset = -(extent->end_sector - extent->sectors) * BDRV_SECTOR_SIZE; -extent_relative_offset = offset - extent_begin_offset; -return extent_relative_offset % cluster_size; -} - static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, int64_t sector_num) { -- 2.6.2
Re: [Qemu-block] [Qemu-devel] [PATCH v2 3/7] vmdk: Factor out metadata loading code out of get_cluster_offset()
On Fri, Mar 31, 2017 at 11:33 AM, Fam Zheng wrote: > On Sat, 03/25 16:48, Ashijeet Acharya wrote: >> Move the cluster tables loading code out of the existing >> get_cluster_offset() function and implement it in separate > > Now it's renamed to vmdk_perform_cow() in previous patch, the commit message > following it should be updated. No, I think you confused get_cluster_offset() with get_whole_cluster() here :-) I have a separate patch 5/7 which renames get_cluster_offset(), so the commit message is fine till this stage. > >> get_cluster_table() and vmdk_L2load() functions. This patch will help >> us avoid code duplication in future patches of this series. >> >> Signed-off-by: Ashijeet Acharya >> --- >> block/vmdk.c | 99 >> >> 1 file changed, 99 insertions(+) >> >> diff --git a/block/vmdk.c b/block/vmdk.c >> index f5fda2c..a42322e 100644 >> --- a/block/vmdk.c >> +++ b/block/vmdk.c >> @@ -1037,6 +1037,105 @@ static void vmdk_refresh_limits(BlockDriverState >> *bs, Error **errp) >> } >> >> /* >> + * vmdk_L2load > > Personally, I think vmdk_l2load is good enough, the upper case doesn't add > readability but makes it slightly harder to type. Done. > >> + * >> + * Loads a new L2 table into memory. If the table is in the cache, the cache >> + * is used; otherwise the L2 table is loaded from the image file. >> + * >> + * Returns: >> + * VMDK_OK: on success >> + * VMDK_ERROR:in error cases >> + */ >> +static int vmdk_L2load(VmdkExtent *extent, uint64_t offset, int l2_offset, >> + uint32_t **new_l2_table, int *new_l2_index) >> +{ >> +int min_index, i, j; >> +uint32_t *l2_table; >> +uint32_t min_count; >> + >> +for (i = 0; i < L2_CACHE_SIZE; i++) { >> +if (l2_offset == extent->l2_cache_offsets[i]) { >> +/* increment the hit count */ >> +if (++extent->l2_cache_counts[i] == 0x) { > > Please use UINT32_MAX. Done. 
> >> +for (j = 0; j < L2_CACHE_SIZE; j++) { >> +extent->l2_cache_counts[j] >>= 1; >> +} >> +} >> +l2_table = extent->l2_cache + (i * extent->l2_size); >> +goto found; >> +} >> +} >> +/* not found: load a new entry in the least used one */ >> +min_index = 0; >> +min_count = 0x; > > Please use UINT32_MAX. Done. > >> +for (i = 0; i < L2_CACHE_SIZE; i++) { >> +if (extent->l2_cache_counts[i] < min_count) { >> +min_count = extent->l2_cache_counts[i]; >> +min_index = i; >> +} >> +} >> +l2_table = extent->l2_cache + (min_index * extent->l2_size); >> +if (bdrv_pread(extent->file, >> +(int64_t)l2_offset * 512, >> +l2_table, >> +extent->l2_size * sizeof(uint32_t) >> +) != extent->l2_size * sizeof(uint32_t)) { >> +return VMDK_ERROR; >> +} >> + >> +extent->l2_cache_offsets[min_index] = l2_offset; >> +extent->l2_cache_counts[min_index] = 1; >> +found: >> +*new_l2_index = ((offset >> 9) / extent->cluster_sectors) % >> extent->l2_size; >> +*new_l2_table = l2_table; >> + >> +return VMDK_OK; >> +} >> + >> +/* >> + * get_cluster_table >> + * >> + * for a given offset, load (and allocate if needed) the l2 table. >> + * >> + * Returns: >> + * VMDK_OK:on success >> + * >> + * VMDK_UNALLOC: if cluster is not mapped >> + * >> + * VMDK_ERROR: in error cases >> + */ >> +static int get_cluster_table(VmdkExtent *extent, uint64_t offset, >> + int *new_l1_index, int *new_l2_offset, >> + int *new_l2_index, uint32_t **new_l2_table) > > Again, a static function must be introduced with the code change where it is > used, at least for once. It keeps the compiler happy (-Wunused-function) and > makes reviewing easy. Done. Ashijeet
Re: [Qemu-block] [Qemu-devel] [PATCH v2 7/7] vmdk: Update metadata for multiple clusters
On Fri, Mar 31, 2017 at 2:38 PM, Fam Zheng wrote: > On Fri, 03/31 14:17, Ashijeet Acharya wrote: >> On Fri, Mar 31, 2017 at 12:56 PM, Fam Zheng wrote: >> > On Sat, 03/25 16:48, Ashijeet Acharya wrote: >> >> Include a next pointer in VmdkMetaData struct to point to the previous >> >> allocated L2 table. Modify vmdk_L2update to start updating metadata for >> >> allocation of multiple clusters at once. >> >> >> >> Signed-off-by: Ashijeet Acharya >> >> --- >> >> block/vmdk.c | 131 >> >> ++- >> >> 1 file changed, 102 insertions(+), 29 deletions(-) >> >> >> >> diff --git a/block/vmdk.c b/block/vmdk.c >> >> index 3de8b8f..4517409 100644 >> >> --- a/block/vmdk.c >> >> +++ b/block/vmdk.c >> >> @@ -137,6 +137,8 @@ typedef struct VmdkMetaData { >> >> int valid; >> >> uint32_t *l2_cache_entry; >> >> uint32_t nb_clusters; >> >> +uint32_t offset; >> >> +struct VmdkMetaData *next; >> >> } VmdkMetaData; >> >> >> >> typedef struct VmdkGrainMarker { >> >> @@ -1037,29 +1039,81 @@ static void vmdk_refresh_limits(BlockDriverState >> >> *bs, Error **errp) >> >> } >> >> } >> >> >> >> -static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, >> >> - uint32_t offset) >> >> +static int vmdk_alloc_cluster_link_l2(VmdkExtent *extent, >> >> + VmdkMetaData *m_data, bool zeroed) >> >> { >> >> -offset = cpu_to_le32(offset); >> >> +int i; >> >> +uint32_t offset, temp_offset; >> >> + >> >> +if (zeroed) { >> >> +temp_offset = VMDK_GTE_ZEROED; >> >> +} else { >> >> +temp_offset = m_data->offset; >> >> +} >> >> + >> >> +temp_offset = cpu_to_le32(temp_offset); >> >> + >> >> /* update L2 table */ >> >> -if (bdrv_pwrite_sync(extent->file, >> >> +offset = temp_offset; >> >> +for (i = 0; i < m_data->nb_clusters; i++) { >> >> +if (bdrv_pwrite_sync(extent->file, >> >> ((int64_t)m_data->l2_offset * 512) >> >> -+ (m_data->l2_index * sizeof(offset)), >> >> -&offset, sizeof(offset)) < 0) { >> >> -return VMDK_ERROR; >> >> ++ ((m_data->l2_index + i) * sizeof(offset)), >> >> +&(offset), sizeof(offset)) < 
0) { >> >> +return VMDK_ERROR; >> >> +} >> >> +if (!zeroed) { >> >> +offset += 128; >> >> +} >> >> } >> >> + >> >> /* update backup L2 table */ >> >> +offset = temp_offset; >> >> if (extent->l1_backup_table_offset != 0) { >> >> m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; >> >> -if (bdrv_pwrite_sync(extent->file, >> >> -((int64_t)m_data->l2_offset * 512) >> >> -+ (m_data->l2_index * sizeof(offset)), >> >> -&offset, sizeof(offset)) < 0) { >> >> -return VMDK_ERROR; >> >> +for (i = 0; i < m_data->nb_clusters; i++) { >> >> +if (bdrv_pwrite_sync(extent->file, >> >> +((int64_t)m_data->l2_offset * 512) >> >> ++ ((m_data->l2_index + i) * sizeof(offset)), >> >> +&(offset), sizeof(offset)) < 0) { >> >> +return VMDK_ERROR; >> >> +} >> >> +if (!zeroed) { >> >> +offset += 128; >> >> +} >> >> } >> >> } >> >> + >> >> +offset = temp_offset; >> >> if (m_data->l2_cache_entry) { >> >> -*m_data->l2_cache_entry = offset; >> >> +for (i = 0; i < m_data->nb_clusters; i++) { >> >> +*m_data->l2_cache_entry = offset; >> >> +m_data->l2_cache_entry++; >> >>